src/unicode/props_uucode.zig


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120

const props = @This();
const std = @import("std");
const assert = std.debug.assert;
const uucode = @import("uucode");
const lut = @import("lut.zig");
const Properties = @import("Properties.zig");
const GraphemeBoundaryClass = Properties.GraphemeBoundaryClass;

/// Maps a codepoint to the grapheme boundary class we store in our tables.
/// This is only meant to be called while generating the lookup tables.
fn graphemeBoundaryClass(cp: u21) GraphemeBoundaryClass {
    // Codepoints beyond what uucode has data for get no class at all.
    if (cp > uucode.config.max_code_point) {
        return .invalid;
    }

    // Emoji modifiers and modifier bases are special-cased ahead of the
    // generic grapheme break property, because we should not break
    // if a modifier isn't next to a base.
    if (uucode.get(.is_emoji_modifier, cp)) {
        return .emoji_modifier;
    } else if (uucode.get(.is_emoji_modifier_base, cp)) {
        return .extended_pictographic_base;
    }

    const break_prop = uucode.get(.grapheme_break, cp);
    const class: GraphemeBoundaryClass = switch (break_prop) {
        // Everything that behaves like an extender collapses to `.extend`.
        .zwnj, .indic_conjunct_break_extend, .indic_conjunct_break_linker => .extend,

        // Classes that map one-to-one onto our own enum.
        .prepend => .prepend,
        .zwj => .zwj,
        .spacing_mark => .spacing_mark,
        .regional_indicator => .regional_indicator,
        .extended_pictographic => .extended_pictographic,

        // Hangul syllable types.
        .l => .L,
        .v => .V,
        .t => .T,
        .lv => .LV,
        .lvt => .LVT,

        // These are not truly "invalid": every codepoint has SOME grapheme
        // boundary class. We simply don't care about anything outside the
        // categories handled above. Note that
        // `indic_conjunct_break_consonant` is `other` in
        // 'GraphemeBreakProperty.txt' (it's missing there).
        .cr,
        .lf,
        .control,
        .other,
        .indic_conjunct_break_consonant,
        => .invalid,
    };
    return class;
}

/// Builds the `Properties` value for a single codepoint.
///
/// Codepoints above `uucode.config.max_code_point` are given a width of 1;
/// all others take their width from uucode. The grapheme boundary class
/// comes from `graphemeBoundaryClass`.
pub fn get(cp: u21) Properties {
    const has_data = cp <= uucode.config.max_code_point;
    return .{
        .width = if (has_data) uucode.get(.width, cp) else 1,
        .grapheme_boundary_class = graphemeBoundaryClass(cp),
    };
}

/// Entry point of the generator binary: builds the lookup tables for
/// `Properties` and writes them to stdout via `writeZig`.
pub fn main() !void {
    // Everything allocated during generation lives in one arena that is
    // released when we return.
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const alloc = arena.allocator();

    // Stateless adapter handed to the generator; it provides the
    // per-codepoint value and the equality check for two values.
    const LutContext = struct {
        pub fn get(_: @This(), cp: u21) !Properties {
            return props.get(cp);
        }

        pub fn eql(_: @This(), a: Properties, b: Properties) bool {
            return a.eql(b);
        }
    };
    const generator: lut.Generator(Properties, LutContext) = .{};

    const tables = try generator.generate(alloc);
    // Freed in reverse order of the stages.
    defer {
        alloc.free(tables.stage3);
        alloc.free(tables.stage2);
        alloc.free(tables.stage1);
    }

    // Buffered stdout writer; `end` finishes the output once the table
    // has been written.
    var out_buf: [4096]u8 = undefined;
    var out = std.fs.File.stdout().writer(&out_buf);
    try tables.writeZig(&out.interface);
    try out.end();

    // Uncomment when manually debugging to see our table sizes.
    // std.log.warn("stage1={} stage2={} stage3={}", .{
    //     tables.stage1.len,
    //     tables.stage2.len,
    //     tables.stage3.len,
    // });
}

test "unicode props: tables match uucode" {
    // Skipped under valgrind (presumably because sweeping the whole
    // codepoint space is too slow there).
    if (std.valgrind.runningOnValgrind() > 0) return error.SkipZigTest;

    const table = @import("props_table.zig").table;

    // Sweep every u21 value from U+0100 upward (U+0000..U+00FF are
    // skipped) and compare the pregenerated table's width with what
    // uucode reports right now.
    const first: usize = 0xFF + 1;
    const end: usize = std.math.maxInt(u21) + 1;
    for (first..end) |cp| {
        const actual = table.get(@intCast(cp)).width;

        // Values beyond uucode's range are expected to have width 1.
        const expected = if (cp > uucode.config.max_code_point)
            1
        else
            uucode.get(.width, @intCast(cp));

        if (actual == expected) continue;

        // Report the first mismatch and fail the test.
        std.log.warn("mismatch cp=U+{x} t={} uu={}", .{ cp, actual, expected });
        return error.TestUnexpectedResult;
    }
}