diff options
| author | Mitchell Hashimoto <mitchell.hashimoto@gmail.com> | 2024-02-09 20:50:13 -0800 |
|---|---|---|
| committer | Mitchell Hashimoto <mitchell.hashimoto@gmail.com> | 2024-02-09 20:50:13 -0800 |
| commit | 5275d44e7dc4c7f86978b5bdf285b5d4f45eb0e9 (patch) | |
| tree | a99e295a2ad9be2cb954766056f138c9c3b3480c /src/unicode/grapheme.zig | |
| parent | 6f8b4204b99463a264b1d2311bce46db7634023e (diff) | |
unicode: precompute grapheme break data
Diffstat (limited to 'src/unicode/grapheme.zig')
| -rw-r--r-- | src/unicode/grapheme.zig | 66 |
1 files changed, 55 insertions, 11 deletions
diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig index d4c146e49..09f452114 100644 --- a/src/unicode/grapheme.zig +++ b/src/unicode/grapheme.zig @@ -3,14 +3,6 @@ const props = @import("props.zig"); const GraphemeBoundaryClass = props.GraphemeBoundaryClass; const table = props.table; -// The algorithm in this file is based on the Ziglyph and utf8proc algorithm, -// only modified to use our own lookup tables. -// -// I'll note I also tried a fully precomputed table approach where all -// combinations of state and boundary classes were precomputed. It was -// marginally faster (about 2%) but the table is a few KB and I'm not -// sure it's worth it. - /// Determines if there is a grapheme break between two codepoints. This /// must be called sequentially maintaining the state between calls. /// @@ -19,9 +11,15 @@ const table = props.table; /// calling this function. This is because this function is tuned for /// Ghostty. pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool { - const gbc1 = table.get(cp1).grapheme_boundary_class; - const gbc2 = table.get(cp2).grapheme_boundary_class; - return graphemeBreakClass(gbc1, gbc2, state); + const value = Precompute.data[ + (Precompute.Key{ + .gbc1 = table.get(cp1).grapheme_boundary_class, + .gbc2 = table.get(cp2).grapheme_boundary_class, + .state = state.*, + }).index() + ]; + state.* = value.state; + return value.result; } /// The state that must be maintained between calls to `graphemeBreak`. @@ -30,6 +28,52 @@ pub const BreakState = packed struct(u2) { regional_indicator: bool = false, }; +/// This is all the structures and data for the precomputed lookup table +/// for all possible permutations of state and grapheme boundary classes. +/// Precomputation only requires 2^10 keys of 3 bit values so the whole +/// table is less than 1KB. +const Precompute = struct { + const Key = packed struct(u10) { + state: BreakState, + gbc1: GraphemeBoundaryClass, + gbc2: GraphemeBoundaryClass, + + fn index(self: Key) usize { + return @intCast(@as(u10, @bitCast(self))); + } + }; + + const Value = packed struct(u3) { + result: bool, + state: BreakState, + }; + + const data = precompute: { + var result: [std.math.maxInt(u10)]Value = undefined; + + @setEvalBranchQuota(2_000); + const info = @typeInfo(GraphemeBoundaryClass).Enum; + for (0..std.math.maxInt(u2) + 1) |state_init| { + for (info.fields) |field1| { + for (info.fields) |field2| { + var state: BreakState = @bitCast(@as(u2, @intCast(state_init))); + const key: Key = .{ + .gbc1 = @field(GraphemeBoundaryClass, field1.name), + .gbc2 = @field(GraphemeBoundaryClass, field2.name), + .state = state, + }; + const v = graphemeBreakClass(key.gbc1, key.gbc2, &state); + result[key.index()] = .{ .result = v, .state = state }; + } + } + } + + break :precompute result; + }; +}; + +/// This is the algorithm from utf8proc. We only use this offline for +/// precomputing the lookup table. fn graphemeBreakClass( gbc1: GraphemeBoundaryClass, gbc2: GraphemeBoundaryClass, |
