// src/synthetic/Utf8.zig

/// Generates random, valid UTF-8 into a caller-provided buffer.
///
/// This doesn't yet generate multi-codepoint graphemes, but it
/// has the ability to generate a custom distribution of UTF-8
/// encoding lengths (1, 2, 3, or 4 bytes) via `p_length`, and to
/// bound the number of bytes produced per call via `min_len` and
/// `max_len`.
const Utf8 = @This();

const std = @import("std");
const assert = std.debug.assert;
const Generator = @import("Generator.zig");

/// Possible UTF-8 encoding lengths.
///
/// The integer value of each tag is the number of bytes in the encoded
/// sequence, so `@intFromEnum` yields a byte count that can be compared
/// directly against the space remaining in a buffer.
pub const Utf8Len = enum(u3) {
    /// One-byte sequence; `next` draws these from U+0000..U+007F.
    one = 1,
    /// Two-byte sequence; `next` draws these from U+0080..U+07FF.
    two = 2,
    /// Three-byte sequence; `next` draws these from U+0800..U+FFFF
    /// (surrogate code points are never emitted).
    three = 3,
    /// Four-byte sequence; `next` draws these from U+10000..U+10FFFF.
    four = 4,
};

/// Random number generator. `next` uses it for every random decision:
/// the total output length, the encoding length of each code point,
/// and the code point values themselves.
rand: std.Random,

/// The minimum length, in bytes, of the generated output (inclusive).
/// The length actually produced is capped to the length of the buffer
/// passed in, so the output can be shorter than this when the buffer
/// is smaller.
/// NOTE(review): presumably must not exceed `max_len`; confirm against
/// `std.Random.intRangeAtMostBiased`.
min_len: usize = 1,
/// The maximum length, in bytes, of the generated output (inclusive).
/// The maximum length will be capped to the length of the buffer passed
/// in if the buffer length is smaller. With the default value the buffer
/// length is therefore the effective upper bound.
max_len: usize = std.math.maxInt(usize),

/// Probability of a specific UTF-8 encoding length being generated.
/// The probabilities are weighted relative to each other, so they
/// can sum greater than 1.0. A length of weight 1.0 and a length
/// of weight 2.0 will have a 2:1 chance of the latter being
/// selected. By default every length has the same weight (1.0).
///
/// If a UTF-8 encoding of a chosen length can't fit into the remaining
/// buffer, a smaller length will be chosen. For small buffers this may
/// skew the distribution of lengths.
p_length: std.enums.EnumArray(Utf8Len, f64) = .initFill(1.0),

/// Returns a `Generator` constructed from this instance and its `next`
/// function.
///
/// NOTE(review): `self` is handed to `Generator.init` as a pointer, so
/// this value presumably has to outlive the returned generator; confirm
/// against Generator.zig.
pub fn generator(self: *Utf8) Generator {
    return Generator.init(self, next);
}

/// Fills a prefix of `buf` with randomly generated, valid UTF-8 and
/// returns that prefix.
///
/// The number of bytes produced is drawn from `min_len` to `max_len`
/// (inclusive) and then capped to `buf.len`. An empty buffer therefore
/// yields an empty slice, and the result can be shorter than `min_len`.
/// The chosen length is filled exactly: each code point's encoding
/// length is drawn from `p_length` and reduced when it would not fit in
/// the bytes that remain.
///
/// Surrogate code points (U+D800..U+DFFF) are never produced. The
/// returned slice aliases `buf`. No path in this function returns an
/// error; the error union only satisfies the `Generator` callback
/// signature.
pub fn next(self: *Utf8, buf: []u8) Generator.Error![]const u8 {
    // First and one-past-last three-byte code points bracketing the
    // surrogate block, which UTF-8 is not allowed to encode.
    const surrogate_first: u21 = 0xD800;
    const surrogate_count: u21 = 0x800;

    const len = @min(
        self.rand.intRangeAtMostBiased(usize, self.min_len, self.max_len),
        buf.len,
    );

    const result = buf[0..len];
    var rem: usize = len;
    while (rem > 0) {
        // Pick a utf8 byte count to generate.
        const utf8_len: Utf8Len = pick: {
            const Indexer = @TypeOf(self.p_length).Indexer;
            const idx = self.rand.weightedIndex(f64, &self.p_length.values);
            var utf8_len = Indexer.keyForIndex(idx);

            // `rem` is at least 1 here, so the shrinking loop below always
            // stops at `.one` or earlier and never underflows the tag.
            assert(rem > 0);
            while (@intFromEnum(utf8_len) > rem) {
                // If the chosen length can't fit into the remaining buffer,
                // choose a smaller length.
                utf8_len = @enumFromInt(@intFromEnum(utf8_len) - 1);
            }
            break :pick utf8_len;
        };

        // Generate a code point that encodes to exactly this length.
        const cp: u21 = switch (utf8_len) {
            .one => self.rand.intRangeAtMostBiased(u21, 0x00, 0x7F),
            .two => self.rand.intRangeAtMostBiased(u21, 0x80, 0x7FF),
            .three => three: {
                // Draw from the three-byte range with the surrogate block
                // cut out, then shift anything at or above the gap past it.
                // Drawing a surrogate and retrying would also re-roll the
                // encoding length, which makes three-byte sequences rarer
                // than `p_length` asks for.
                const v = self.rand.intRangeAtMostBiased(
                    u21,
                    0x800,
                    0xFFFF - surrogate_count,
                );
                break :three if (v >= surrogate_first) v + surrogate_count else v;
            },
            .four => self.rand.intRangeAtMostBiased(u21, 0x10000, 0x10FFFF),
        };

        assert(std.unicode.utf8CodepointSequenceLength(
            cp,
        ) catch unreachable == @intFromEnum(utf8_len));

        // Append the encoding at the current write position; the encoder
        // reports how many bytes it wrote.
        rem -= std.unicode.utf8Encode(
            cp,
            result[result.len - rem ..],
        ) catch |err| switch (err) {
            // Impossible because our generation above is hardcoded to
            // produce a valid range. If not, a bug.
            error.CodepointTooLarge => unreachable,

            // Impossible because the three-byte branch above skips the
            // surrogate block and no other branch overlaps it.
            error.Utf8CannotEncodeSurrogateHalf => unreachable,
        };
    }

    return result;
}

test "utf8" {
    // Fixed seed so the generated bytes are the same on every run.
    var prng = std.Random.DefaultPrng.init(0);
    var subject: Utf8 = .{ .rand = prng.random() };
    const gen = subject.generator();

    // Default settings: the output length is bounded by this buffer.
    var storage: [256]u8 = undefined;
    const bytes = try gen.next(&storage);

    // The generator must produce something, and it must be valid UTF-8.
    try std.testing.expect(bytes.len > 0);
    try std.testing.expect(std.unicode.utf8ValidateSlice(bytes));
}