unicode: use packed struct for break state

author: Mitchell Hashimoto <mitchell.hashimoto@gmail.com> 2024-02-09 20:29:36 -0800
committer: Mitchell Hashimoto <mitchell.hashimoto@gmail.com> 2024-02-09 20:29:36 -0800
commit: 132fbb3a4695b09d8674914e8d68a660fb28df6d (patch)
tree: ca2b57d7e429d84953dc962ae16d815157b483ce /src
parent: c47ad97f62ca1f5e6132d46839b7cda999af461b (diff)
4 files changed, 19 insertions, 70 deletions
diff --git a/src/bench/grapheme-break.sh b/src/bench/grapheme-break.sh
index c395c3799..24f475caa 100755
--- a/src/bench/grapheme-break.sh
+++ b/src/bench/grapheme-break.sh
@@ -27,8 +27,6 @@ hyperfine \
   "./zig-out/bin/bench-grapheme-break --mode=noop${ARGS} </tmp/ghostty_bench_data" \
   -n ziglyph \
   "./zig-out/bin/bench-grapheme-break --mode=ziglyph${ARGS} </tmp/ghostty_bench_data" \
-  -n utf8proc \
-  "./zig-out/bin/bench-grapheme-break --mode=utf8proc${ARGS} </tmp/ghostty_bench_data" \
   -n table \
   "./zig-out/bin/bench-grapheme-break --mode=table${ARGS} </tmp/ghostty_bench_data"
 
diff --git a/src/bench/grapheme-break.zig b/src/bench/grapheme-break.zig
index 55caca313..7decd525d 100644
--- a/src/bench/grapheme-break.zig
+++ b/src/bench/grapheme-break.zig
@@ -46,8 +46,6 @@ const Mode = enum {
 
     /// Ghostty's table-based approach.
     table,
-
-    utf8proc,
 };
 
 pub const std_options = struct {
@@ -75,7 +73,6 @@ pub fn main() !void {
         .noop => try benchNoop(reader, buf),
         .ziglyph => try benchZiglyph(reader, buf),
         .table => try benchTable(reader, buf),
-        .utf8proc => try benchUtf8proc(reader, buf),
     }
 }
 
@@ -101,7 +98,7 @@ noinline fn benchTable(
     buf: []u8,
 ) !void {
     var d: UTF8Decoder = .{};
-    var state: u3 = 0;
+    var state: unicode.GraphemeBreakState = .{};
     var cp1: u21 = 0;
     while (true) {
         const n = try reader.read(buf);
@@ -145,29 +142,3 @@ noinline fn benchZiglyph(
         }
     }
 }
-
-noinline fn benchUtf8proc(
-    reader: anytype,
-    buf: []u8,
-) !void {
-    const utf8proc = @import("utf8proc");
-    var d: UTF8Decoder = .{};
-    var state: i32 = 0;
-    var cp1: u21 = 0;
-    while (true) {
-        const n = try reader.read(buf);
-        if (n == 0) break;
-
-        // Using stream.next directly with a for loop applies a naive
-        // scalar approach.
-        for (buf[0..n]) |c| {
-            const cp_, const consumed = d.next(c);
-            assert(consumed);
-            if (cp_) |cp2| {
-                const v = utf8proc.graphemeBreakStateful(cp1, @intCast(cp2), &state);
-                buf[0] = @intCast(@intFromBool(v));
-                cp1 = cp2;
-            }
-        }
-    }
-}
diff --git a/src/unicode/grapheme.zig b/src/unicode/grapheme.zig
index 19437844c..d4c146e49 100644
--- a/src/unicode/grapheme.zig
+++ b/src/unicode/grapheme.zig
@@ -18,19 +18,27 @@ const table = props.table;
 /// line feeds, and carriage returns are expected to be filtered out before
 /// calling this function. This is because this function is tuned for
 /// Ghostty.
-pub fn graphemeBreak(cp1: u21, cp2: u21, state: *u3) bool {
+pub fn graphemeBreak(cp1: u21, cp2: u21, state: *BreakState) bool {
     const gbc1 = table.get(cp1).grapheme_boundary_class;
     const gbc2 = table.get(cp2).grapheme_boundary_class;
     return graphemeBreakClass(gbc1, gbc2, state);
 }
 
+/// The state that must be maintained between calls to `graphemeBreak`.
+pub const BreakState = packed struct(u2) {
+    extended_pictographic: bool = false,
+    regional_indicator: bool = false,
+};
+
 fn graphemeBreakClass(
     gbc1: GraphemeBoundaryClass,
     gbc2: GraphemeBoundaryClass,
-    state: *u3,
+    state: *BreakState,
 ) bool {
     // GB11: Emoji Extend* ZWJ x Emoji
-    if (!hasXpic(state) and gbc1 == .extended_pictographic) setXpic(state);
+    if (!state.extended_pictographic and gbc1 == .extended_pictographic) {
+        state.extended_pictographic = true;
+    }
 
     // These two properties are ignored because they're not relevant to
     // Ghostty -- they're filtered out before checking grapheme boundaries.
@@ -67,56 +75,27 @@ fn graphemeBreakClass(
 
     // GB12, GB13: RI x RI
     if (gbc1 == .regional_indicator and gbc2 == .regional_indicator) {
-        if (hasRegional(state)) {
-            unsetRegional(state);
+        if (state.regional_indicator) {
+            state.regional_indicator = false;
             return true;
         } else {
-            setRegional(state);
+            state.regional_indicator = true;
             return false;
         }
     }
 
     // GB11: Emoji Extend* ZWJ x Emoji
-    if (hasXpic(state) and
+    if (state.extended_pictographic and
         gbc1 == .zwj and
         gbc2 == .extended_pictographic)
     {
-        unsetXpic(state);
+        state.extended_pictographic = false;
         return false;
     }
 
     return true;
 }
 
-const State = packed struct(u2) {
-    extended_pictographic: bool = false,
-    regional_indicator: bool = false,
-};
-
-fn hasXpic(state: *const u3) bool {
-    return state.* & 1 == 1;
-}
-
-fn setXpic(state: *u3) void {
-    state.* |= 1;
-}
-
-fn unsetXpic(state: *u3) void {
-    state.* ^= 1;
-}
-
-fn hasRegional(state: *const u3) bool {
-    return state.* & 2 == 2;
-}
-
-fn setRegional(state: *u3) void {
-    state.* |= 2;
-}
-
-fn unsetRegional(state: *u3) void {
-    state.* ^= 2;
-}
-
 /// If you build this file as a binary, we will verify the grapheme break
 /// implementation. This iterates over billions of codepoints so it is
 /// SLOW. It's not meant to be run in CI, but it's useful for debugging.
@@ -127,7 +106,7 @@ pub fn main() !void {
     const min = 0;
     const max = std.math.maxInt(u21) + 1;
 
-    var state: u3 = 0;
+    var state: BreakState = .{};
     var zg_state: u3 = 0;
     for (min..max) |cp1| {
         if (cp1 % 1000 == 0) std.log.warn("progress cp1={}", .{cp1});
diff --git a/src/unicode/main.zig b/src/unicode/main.zig
index 3cc4779ed..e8ba05b72 100644
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@@ -5,6 +5,7 @@ const props = @import("props.zig");
 pub const table = props.table;
 pub const Properties = props.Properties;
 pub const graphemeBreak = grapheme.graphemeBreak;
+pub const GraphemeBreakState = grapheme.BreakState;
 
 test {
     @import("std").testing.refAllDecls(@This());
author	Mitchell Hashimoto <mitchell.hashimoto@gmail.com>	2024-02-09 20:29:36 -0800
committer	Mitchell Hashimoto <mitchell.hashimoto@gmail.com>	2024-02-09 20:29:36 -0800
commit	132fbb3a4695b09d8674914e8d68a660fb28df6d (patch)
tree	ca2b57d7e429d84953dc962ae16d815157b483ce /src
parent	c47ad97f62ca1f5e6132d46839b7cda999af461b (diff)