summaryrefslogtreecommitdiff
path: root/src/unicode
diff options
context:
space:
mode:
authorJeffrey C. Ollie <jeff@ocjtech.us>2025-09-04 23:04:08 -0500
committerJeffrey C. Ollie <jeff@ocjtech.us>2025-09-05 07:58:01 -0500
commita7da96faeea305d5ff1758a98557a42afe0fed32 (patch)
treeae6ab8b6b6d7f7bf81897dc21f3d0615ec5bd0f0 /src/unicode
parent968b9d536dd1a14c23024dd1d10eccabd60fb796 (diff)
add two LUT-based implementations of isSymbol
Diffstat (limited to 'src/unicode')
-rw-r--r--src/unicode/lut.zig26
-rw-r--r--src/unicode/lut2.zig183
-rw-r--r--src/unicode/main.zig2
-rw-r--r--src/unicode/props.zig2
-rw-r--r--src/unicode/symbols1.zig93
-rw-r--r--src/unicode/symbols2.zig85
6 files changed, 390 insertions, 1 deletions
diff --git a/src/unicode/lut.zig b/src/unicode/lut.zig
index 95c6a3688..e709bf1fe 100644
--- a/src/unicode/lut.zig
+++ b/src/unicode/lut.zig
@@ -142,6 +142,32 @@ pub fn Tables(comptime Elem: type) type {
return self.stage3[self.stage2[self.stage1[high] + low]];
}
+ pub inline fn getInline(self: *const Self, cp: u21) Elem {
+ const high = cp >> 8;
+ const low = cp & 0xFF;
+ return self.stage3[self.stage2[self.stage1[high] + low]];
+ }
+
+ pub fn getBool(self: *const Self, cp: u21) bool {
+ assert(Elem == bool);
+ assert(self.stage3.len == 2);
+ assert(self.stage3[0] == false);
+ assert(self.stage3[1] == true);
+ const high = cp >> 8;
+ const low = cp & 0xFF;
+ return self.stage2[self.stage1[high] + low] != 0;
+ }
+
+ pub inline fn getBoolInline(self: *const Self, cp: u21) bool {
+ assert(Elem == bool);
+ assert(self.stage3.len == 2);
+ assert(self.stage3[0] == false);
+ assert(self.stage3[1] == true);
+ const high = cp >> 8;
+ const low = cp & 0xFF;
+ return self.stage2[self.stage1[high] + low] != 0;
+ }
+
/// Writes the lookup table as Zig to the given writer. The
/// written file exports three constants: stage1, stage2, and
/// stage3. These can be used to rebuild the lookup table in Zig.
diff --git a/src/unicode/lut2.zig b/src/unicode/lut2.zig
new file mode 100644
index 000000000..ef5c886a2
--- /dev/null
+++ b/src/unicode/lut2.zig
@@ -0,0 +1,183 @@
+const std = @import("std");
+const assert = std.debug.assert;
+const Allocator = std.mem.Allocator;
+
+// This whole file is based on the algorithm described here:
+// https://here-be-braces.com/fast-lookup-of-unicode-properties/
+
+const set_size = @typeInfo(usize).int.bits;
+// const Set = std.bit_set.ArrayBitSet(usize, set_size);
+const Set = std.bit_set.IntegerBitSet(set_size);
+const cp_shift = std.math.log2_int(u21, set_size);
+const cp_mask = set_size - 1;
+
+/// Creates a type that is able to generate a 2-level lookup table
+/// from a Unicode codepoint to a mapping of type bool. The lookup table
+/// generally is expected to be codegen'd and then reloaded, although it
+/// can in theory be generated at runtime.
+///
+/// Context must have one function:
+/// - `get(Context, u21) bool`: returns the mapping for a given codepoint
+///
+pub fn Generator(
+ comptime Context: type,
+) type {
+ return struct {
+ const Self = @This();
+
+ /// Mapping of a block to its index in the stage2 array.
+ const SetMap = std.HashMap(
+ Set,
+ u16,
+ struct {
+ pub fn hash(ctx: @This(), k: Set) u64 {
+ _ = ctx;
+ var hasher = std.hash.Wyhash.init(0);
+ std.hash.autoHashStrat(&hasher, k, .DeepRecursive);
+ return hasher.final();
+ }
+
+ pub fn eql(ctx: @This(), a: Set, b: Set) bool {
+ _ = ctx;
+ return a.eql(b);
+ }
+ },
+ std.hash_map.default_max_load_percentage,
+ );
+
+ ctx: Context = undefined,
+
+ /// Generate the lookup tables. The arrays in the return value
+ /// are owned by the caller and must be freed.
+ pub fn generate(self: *const Self, alloc: Allocator) !Tables {
+ var min: u21 = std.math.maxInt(u21);
+ var max: u21 = std.math.minInt(u21);
+
+ // Maps block => stage2 index
+ var set_map = SetMap.init(alloc);
+ defer set_map.deinit();
+
+ // Our stages
+ var stage1 = std.ArrayList(u16).init(alloc);
+ defer stage1.deinit();
+ var stage2 = std.ArrayList(Set).init(alloc);
+ defer stage2.deinit();
+
+ var set: Set = .initEmpty();
+
+ // ensure that the 1st entry is always all false
+ try stage2.append(set);
+ try set_map.putNoClobber(set, 0);
+
+ for (0..std.math.maxInt(u21) + 1) |cp_| {
+ const cp: u21 = @intCast(cp_);
+ const high = cp >> cp_shift;
+ const low = cp & cp_mask;
+
+ if (self.ctx.get(cp)) {
+ if (cp < min) min = cp;
+ if (cp > max) max = cp;
+ set.set(low);
+ }
+
+ // If we still have space and we're not done with codepoints,
+ // we keep building up the block. Conversely: we finalize this
+ // block if we've filled it or are out of codepoints.
+ if (low + 1 < set_size and cp != std.math.maxInt(u21)) continue;
+
+ // Look for the stage2 index for this block. If it doesn't exist
+ // we add it to stage2 and update the mapping.
+ const gop = try set_map.getOrPut(set);
+ if (!gop.found_existing) {
+ gop.value_ptr.* = std.math.cast(
+ u16,
+ stage2.items.len,
+ ) orelse return error.Stage2TooLarge;
+ try stage2.append(set);
+ }
+
+ // Map stage1 => stage2 and reset our block
+ try stage1.append(gop.value_ptr.*);
+ set = .initEmpty();
+ assert(stage1.items.len - 1 == high);
+ }
+
+ // All of our lengths must fit in a u16 for this to work
+ assert(stage1.items.len <= std.math.maxInt(u16));
+ assert(stage2.items.len <= std.math.maxInt(u16));
+
+ const stage1_owned = try stage1.toOwnedSlice();
+ errdefer alloc.free(stage1_owned);
+ const stage2_owned = try stage2.toOwnedSlice();
+ errdefer alloc.free(stage2_owned);
+
+ return .{
+ .min = min,
+ .max = max,
+ .stage1 = stage1_owned,
+ .stage2 = stage2_owned,
+ };
+ }
+ };
+}
+
+/// Creates a type that given a 3-level lookup table, can be used to
+/// look up a mapping for a given codepoint, encode it out to Zig, etc.
+pub const Tables = struct {
+ const Self = @This();
+
+ min: u21,
+ max: u21,
+ stage1: []const u16,
+ stage2: []const Set,
+
+ /// Given a codepoint, returns the mapping for that codepoint.
+ pub fn get(self: *const Self, cp: u21) bool {
+ if (cp < self.min) return false;
+ if (cp > self.max) return false;
+ const high = cp >> cp_shift;
+ const stage2 = self.stage1[high];
+ // take advantage of the fact that the first entry is always all false
+ if (stage2 == 0) return false;
+ const low = cp & cp_mask;
+ return self.stage2[stage2].isSet(low);
+ }
+
+ /// Writes the lookup table as Zig to the given writer. The
+ /// written file exports three constants: stage1, stage2, and
+ /// stage3. These can be used to rebuild the lookup table in Zig.
+ pub fn writeZig(self: *const Self, writer: anytype) !void {
+ try writer.print(
+ \\//! This file is auto-generated. Do not edit.
+ \\const std = @import("std");
+ \\
+ \\pub const min: u21 = {};
+ \\pub const max: u21 = {};
+ \\
+ \\pub const stage1: [{}]u16 = .{{
+ , .{ self.min, self.max, self.stage1.len });
+ for (self.stage1) |entry| try writer.print("{},", .{entry});
+
+ try writer.print(
+ \\
+ \\}};
+ \\
+ \\pub const Set = std.bit_set.IntegerBitSet({d});
+ \\pub const stage2: [{d}]Set = .{{
+ \\
+ , .{ set_size, self.stage2.len });
+ // for (self.stage2) |entry| {
+ // try writer.print(" .{{\n", .{});
+ // try writer.print(" .masks = [{d}]{s}{{\n", .{ entry.masks.len, @typeName(Set.MaskInt) });
+ // for (entry.masks) |mask| {
+ // try writer.print(" {d},\n", .{mask});
+ // }
+ // try writer.print(" }},\n", .{});
+ // try writer.print(" }},\n", .{});
+ // }
+ for (self.stage2) |entry| {
+ try writer.print(" .{{ .mask = {d} }},\n", .{entry.mask});
+ }
+ try writer.writeAll("};\n");
+ }
+};
diff --git a/src/unicode/main.zig b/src/unicode/main.zig
index f5b911948..91dfd482c 100644
--- a/src/unicode/main.zig
+++ b/src/unicode/main.zig
@@ -9,5 +9,7 @@ pub const graphemeBreak = grapheme.graphemeBreak;
pub const GraphemeBreakState = grapheme.BreakState;
test {
+ _ = @import("symbols1.zig");
+ _ = @import("symbols2.zig");
@import("std").testing.refAllDecls(@This());
}
diff --git a/src/unicode/props.zig b/src/unicode/props.zig
index 99c57aa0a..7edb3761c 100644
--- a/src/unicode/props.zig
+++ b/src/unicode/props.zig
@@ -166,7 +166,7 @@ pub fn main() !void {
// This is not very fast in debug modes, so its commented by default.
// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES.
-// test "tables match ziglyph" {
+// test "unicode props: tables match ziglyph" {
// const testing = std.testing;
//
// const min = 0xFF + 1; // start outside ascii
diff --git a/src/unicode/symbols1.zig b/src/unicode/symbols1.zig
new file mode 100644
index 000000000..e5b8cc22a
--- /dev/null
+++ b/src/unicode/symbols1.zig
@@ -0,0 +1,93 @@
+const props = @This();
+const std = @import("std");
+const assert = std.debug.assert;
+const ziglyph = @import("ziglyph");
+const lut = @import("lut.zig");
+
+/// The lookup tables for Ghostty.
+pub const table = table: {
+ // This is only available after running main() below as part of the Ghostty
+ // build.zig, but due to Zig's lazy analysis we can still reference it here.
+ const generated = @import("symbols1_tables").Tables(bool);
+ const Tables = lut.Tables(bool);
+ break :table Tables{
+ .stage1 = &generated.stage1,
+ .stage2 = &generated.stage2,
+ .stage3 = &generated.stage3,
+ };
+};
+
+/// Returns true of the codepoint is a "symbol-like" character, which
+/// for now we define as anything in a private use area and anything
+/// in several unicode blocks:
+/// - Dingbats
+/// - Emoticons
+/// - Miscellaneous Symbols
+/// - Enclosed Alphanumerics
+/// - Enclosed Alphanumeric Supplement
+/// - Miscellaneous Symbols and Pictographs
+/// - Transport and Map Symbols
+///
+/// In the future it may be prudent to expand this to encompass more
+/// symbol-like characters, and/or exclude some PUA sections.
+pub fn isSymbol(cp: u21) bool {
+ return ziglyph.general_category.isPrivateUse(cp) or
+ ziglyph.blocks.isDingbats(cp) or
+ ziglyph.blocks.isEmoticons(cp) or
+ ziglyph.blocks.isMiscellaneousSymbols(cp) or
+ ziglyph.blocks.isEnclosedAlphanumerics(cp) or
+ ziglyph.blocks.isEnclosedAlphanumericSupplement(cp) or
+ ziglyph.blocks.isMiscellaneousSymbolsAndPictographs(cp) or
+ ziglyph.blocks.isTransportAndMapSymbols(cp);
+}
+
+/// Runnable binary to generate the lookup tables and output to stdout.
+pub fn main() !void {
+ var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
+ defer arena_state.deinit();
+ const alloc = arena_state.allocator();
+
+ const gen: lut.Generator(
+ bool,
+ struct {
+ pub fn get(ctx: @This(), cp: u21) !bool {
+ _ = ctx;
+ return isSymbol(cp);
+ }
+
+ pub fn eql(ctx: @This(), a: bool, b: bool) bool {
+ _ = ctx;
+ return a == b;
+ }
+ },
+ ) = .{};
+
+ const t = try gen.generate(alloc);
+ defer alloc.free(t.stage1);
+ defer alloc.free(t.stage2);
+ defer alloc.free(t.stage3);
+ try t.writeZig(std.io.getStdOut().writer());
+
+ // Uncomment when manually debugging to see our table sizes.
+ // std.log.warn("stage1={} stage2={} stage3={}", .{
+ // t.stage1.len,
+ // t.stage2.len,
+ // t.stage3.len,
+ // });
+}
+
+// This is not very fast in debug modes, so its commented by default.
+// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CHANGES.
+test "unicode symbols1: tables match ziglyph" {
+ const testing = std.testing;
+
+ for (0..std.math.maxInt(u21)) |cp| {
+ const t = table.get(@intCast(cp));
+ const zg = isSymbol(@intCast(cp));
+
+ if (t != zg) {
+ std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg });
+ try testing.expect(false);
+ }
+ }
+}
diff --git a/src/unicode/symbols2.zig b/src/unicode/symbols2.zig
new file mode 100644
index 000000000..1d23c51be
--- /dev/null
+++ b/src/unicode/symbols2.zig
@@ -0,0 +1,85 @@
+const props = @This();
+const std = @import("std");
+const assert = std.debug.assert;
+const ziglyph = @import("ziglyph");
+const lut2 = @import("lut2.zig");
+
+/// The lookup tables for Ghostty.
+pub const table = table: {
+ // This is only available after running main() below as part of the Ghostty
+ // build.zig, but due to Zig's lazy analysis we can still reference it here.
+ const generated = @import("symbols2_tables");
+ break :table lut2.Tables{
+ .min = generated.min,
+ .max = generated.max,
+ .stage1 = &generated.stage1,
+ .stage2 = &generated.stage2,
+ };
+};
+
+/// Returns true of the codepoint is a "symbol-like" character, which
+/// for now we define as anything in a private use area and anything
+/// in several unicode blocks:
+/// - Dingbats
+/// - Emoticons
+/// - Miscellaneous Symbols
+/// - Enclosed Alphanumerics
+/// - Enclosed Alphanumeric Supplement
+/// - Miscellaneous Symbols and Pictographs
+/// - Transport and Map Symbols
+///
+/// In the future it may be prudent to expand this to encompass more
+/// symbol-like characters, and/or exclude some PUA sections.
+pub fn isSymbol(cp: u21) bool {
+ return ziglyph.general_category.isPrivateUse(cp) or
+ ziglyph.blocks.isDingbats(cp) or
+ ziglyph.blocks.isEmoticons(cp) or
+ ziglyph.blocks.isMiscellaneousSymbols(cp) or
+ ziglyph.blocks.isEnclosedAlphanumerics(cp) or
+ ziglyph.blocks.isEnclosedAlphanumericSupplement(cp) or
+ ziglyph.blocks.isMiscellaneousSymbolsAndPictographs(cp) or
+ ziglyph.blocks.isTransportAndMapSymbols(cp);
+}
+
+/// Runnable binary to generate the lookup tables and output to stdout.
+pub fn main() !void {
+ var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
+ defer arena_state.deinit();
+ const alloc = arena_state.allocator();
+
+ const gen: lut2.Generator(
+ struct {
+ pub fn get(ctx: @This(), cp: u21) bool {
+ _ = ctx;
+ return isSymbol(cp);
+ }
+ },
+ ) = .{};
+
+ const t = try gen.generate(alloc);
+ defer alloc.free(t.stage1);
+ defer alloc.free(t.stage2);
+ try t.writeZig(std.io.getStdOut().writer());
+
+ // Uncomment when manually debugging to see our table sizes.
+ // std.log.warn("stage1={} stage2={}", .{
+ // t.stage1.len,
+ // t.stage2.len,
+ // });
+}
+
+// This is not very fast in debug modes, so its commented by default.
+// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CHANGES.
+test "unicode symbols2: tables match ziglyph" {
+ const testing = std.testing;
+
+ for (0..std.math.maxInt(u21)) |cp| {
+ const t1 = table.get(@intCast(cp));
+ const zg = isSymbol(@intCast(cp));
+
+ if (t1 != zg) {
+ std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t1, zg });
+ try testing.expect(false);
+ }
+ }
+}