diff options
| author | Jeffrey C. Ollie <jeff@ocjtech.us> | 2025-09-04 23:04:08 -0500 |
|---|---|---|
| committer | Jeffrey C. Ollie <jeff@ocjtech.us> | 2025-09-05 07:58:01 -0500 |
| commit | a7da96faeea305d5ff1758a98557a42afe0fed32 (patch) | |
| tree | ae6ab8b6b6d7f7bf81897dc21f3d0615ec5bd0f0 /src/unicode | |
| parent | 968b9d536dd1a14c23024dd1d10eccabd60fb796 (diff) | |
add two LUT-based implementations of isSymbol
Diffstat (limited to 'src/unicode')
| -rw-r--r-- | src/unicode/lut.zig | 26 | ||||
| -rw-r--r-- | src/unicode/lut2.zig | 183 | ||||
| -rw-r--r-- | src/unicode/main.zig | 2 | ||||
| -rw-r--r-- | src/unicode/props.zig | 2 | ||||
| -rw-r--r-- | src/unicode/symbols1.zig | 93 | ||||
| -rw-r--r-- | src/unicode/symbols2.zig | 85 |
6 files changed, 390 insertions, 1 deletions
diff --git a/src/unicode/lut.zig b/src/unicode/lut.zig index 95c6a3688..e709bf1fe 100644 --- a/src/unicode/lut.zig +++ b/src/unicode/lut.zig @@ -142,6 +142,32 @@ pub fn Tables(comptime Elem: type) type { return self.stage3[self.stage2[self.stage1[high] + low]]; } + pub inline fn getInline(self: *const Self, cp: u21) Elem { + const high = cp >> 8; + const low = cp & 0xFF; + return self.stage3[self.stage2[self.stage1[high] + low]]; + } + + pub fn getBool(self: *const Self, cp: u21) bool { + assert(Elem == bool); + assert(self.stage3.len == 2); + assert(self.stage3[0] == false); + assert(self.stage3[1] == true); + const high = cp >> 8; + const low = cp & 0xFF; + return self.stage2[self.stage1[high] + low] != 0; + } + + pub inline fn getBoolInline(self: *const Self, cp: u21) bool { + assert(Elem == bool); + assert(self.stage3.len == 2); + assert(self.stage3[0] == false); + assert(self.stage3[1] == true); + const high = cp >> 8; + const low = cp & 0xFF; + return self.stage2[self.stage1[high] + low] != 0; + } + /// Writes the lookup table as Zig to the given writer. The /// written file exports three constants: stage1, stage2, and /// stage3. These can be used to rebuild the lookup table in Zig. diff --git a/src/unicode/lut2.zig b/src/unicode/lut2.zig new file mode 100644 index 000000000..ef5c886a2 --- /dev/null +++ b/src/unicode/lut2.zig @@ -0,0 +1,183 @@ +const std = @import("std"); +const assert = std.debug.assert; +const Allocator = std.mem.Allocator; + +// This whole file is based on the algorithm described here: +// https://here-be-braces.com/fast-lookup-of-unicode-properties/ + +const set_size = @typeInfo(usize).int.bits; +// const Set = std.bit_set.ArrayBitSet(usize, set_size); +const Set = std.bit_set.IntegerBitSet(set_size); +const cp_shift = std.math.log2_int(u21, set_size); +const cp_mask = set_size - 1; + +/// Creates a type that is able to generate a 2-level lookup table +/// from a Unicode codepoint to a mapping of type bool. The lookup table +/// generally is expected to be codegen'd and then reloaded, although it +/// can in theory be generated at runtime. +/// +/// Context must have one function: +/// - `get(Context, u21) bool`: returns the mapping for a given codepoint +/// +pub fn Generator( + comptime Context: type, +) type { + return struct { + const Self = @This(); + + /// Mapping of a block to its index in the stage2 array. + const SetMap = std.HashMap( + Set, + u16, + struct { + pub fn hash(ctx: @This(), k: Set) u64 { + _ = ctx; + var hasher = std.hash.Wyhash.init(0); + std.hash.autoHashStrat(&hasher, k, .DeepRecursive); + return hasher.final(); + } + + pub fn eql(ctx: @This(), a: Set, b: Set) bool { + _ = ctx; + return a.eql(b); + } + }, + std.hash_map.default_max_load_percentage, + ); + + ctx: Context = undefined, + + /// Generate the lookup tables. The arrays in the return value + /// are owned by the caller and must be freed. + pub fn generate(self: *const Self, alloc: Allocator) !Tables { + var min: u21 = std.math.maxInt(u21); + var max: u21 = std.math.minInt(u21); + + // Maps block => stage2 index + var set_map = SetMap.init(alloc); + defer set_map.deinit(); + + // Our stages + var stage1 = std.ArrayList(u16).init(alloc); + defer stage1.deinit(); + var stage2 = std.ArrayList(Set).init(alloc); + defer stage2.deinit(); + + var set: Set = .initEmpty(); + + // ensure that the 1st entry is always all false + try stage2.append(set); + try set_map.putNoClobber(set, 0); + + for (0..std.math.maxInt(u21) + 1) |cp_| { + const cp: u21 = @intCast(cp_); + const high = cp >> cp_shift; + const low = cp & cp_mask; + + if (self.ctx.get(cp)) { + if (cp < min) min = cp; + if (cp > max) max = cp; + set.set(low); + } + + // If we still have space and we're not done with codepoints, + // we keep building up the block. Conversely: we finalize this + // block if we've filled it or are out of codepoints. + if (low + 1 < set_size and cp != std.math.maxInt(u21)) continue; + + // Look for the stage2 index for this block. If it doesn't exist + // we add it to stage2 and update the mapping. + const gop = try set_map.getOrPut(set); + if (!gop.found_existing) { + gop.value_ptr.* = std.math.cast( + u16, + stage2.items.len, + ) orelse return error.Stage2TooLarge; + try stage2.append(set); + } + + // Map stage1 => stage2 and reset our block + try stage1.append(gop.value_ptr.*); + set = .initEmpty(); + assert(stage1.items.len - 1 == high); + } + + // All of our lengths must fit in a u16 for this to work + assert(stage1.items.len <= std.math.maxInt(u16)); + assert(stage2.items.len <= std.math.maxInt(u16)); + + const stage1_owned = try stage1.toOwnedSlice(); + errdefer alloc.free(stage1_owned); + const stage2_owned = try stage2.toOwnedSlice(); + errdefer alloc.free(stage2_owned); + + return .{ + .min = min, + .max = max, + .stage1 = stage1_owned, + .stage2 = stage2_owned, + }; + } + }; +} + +/// Creates a type that given a 3-level lookup table, can be used to +/// look up a mapping for a given codepoint, encode it out to Zig, etc. +pub const Tables = struct { + const Self = @This(); + + min: u21, + max: u21, + stage1: []const u16, + stage2: []const Set, + + /// Given a codepoint, returns the mapping for that codepoint. + pub fn get(self: *const Self, cp: u21) bool { + if (cp < self.min) return false; + if (cp > self.max) return false; + const high = cp >> cp_shift; + const stage2 = self.stage1[high]; + // take advantage of the fact that the first entry is always all false + if (stage2 == 0) return false; + const low = cp & cp_mask; + return self.stage2[stage2].isSet(low); + } + + /// Writes the lookup table as Zig to the given writer. The + /// written file exports three constants: stage1, stage2, and + /// stage3. These can be used to rebuild the lookup table in Zig. + pub fn writeZig(self: *const Self, writer: anytype) !void { + try writer.print( + \\//! This file is auto-generated. Do not edit. + \\const std = @import("std"); + \\ + \\pub const min: u21 = {}; + \\pub const max: u21 = {}; + \\ + \\pub const stage1: [{}]u16 = .{{ + , .{ self.min, self.max, self.stage1.len }); + for (self.stage1) |entry| try writer.print("{},", .{entry}); + + try writer.print( + \\ + \\}}; + \\ + \\pub const Set = std.bit_set.IntegerBitSet({d}); + \\pub const stage2: [{d}]Set = .{{ + \\ + , .{ set_size, self.stage2.len }); + // for (self.stage2) |entry| { + // try writer.print(" .{{\n", .{}); + // try writer.print(" .masks = [{d}]{s}{{\n", .{ entry.masks.len, @typeName(Set.MaskInt) }); + // for (entry.masks) |mask| { + // try writer.print(" {d},\n", .{mask}); + // } + // try writer.print(" }},\n", .{}); + // try writer.print(" }},\n", .{}); + // } + for (self.stage2) |entry| { + try writer.print(" .{{ .mask = {d} }},\n", .{entry.mask}); + } + try writer.writeAll("};\n"); + } +}; diff --git a/src/unicode/main.zig b/src/unicode/main.zig index f5b911948..91dfd482c 100644 --- a/src/unicode/main.zig +++ b/src/unicode/main.zig @@ -9,5 +9,7 @@ pub const graphemeBreak = grapheme.graphemeBreak; pub const GraphemeBreakState = grapheme.BreakState; test { + _ = @import("symbols1.zig"); + _ = @import("symbols2.zig"); @import("std").testing.refAllDecls(@This()); } diff --git a/src/unicode/props.zig b/src/unicode/props.zig index 99c57aa0a..7edb3761c 100644 --- a/src/unicode/props.zig +++ b/src/unicode/props.zig @@ -166,7 +166,7 @@ pub fn main() !void { // This is not very fast in debug modes, so its commented by default. // IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CODEPOINTWIDTH CHANGES. -// test "tables match ziglyph" { +// test "unicode props: tables match ziglyph" { // const testing = std.testing; // // const min = 0xFF + 1; // start outside ascii diff --git a/src/unicode/symbols1.zig b/src/unicode/symbols1.zig new file mode 100644 index 000000000..e5b8cc22a --- /dev/null +++ b/src/unicode/symbols1.zig @@ -0,0 +1,93 @@ +const props = @This(); +const std = @import("std"); +const assert = std.debug.assert; +const ziglyph = @import("ziglyph"); +const lut = @import("lut.zig"); + +/// The lookup tables for Ghostty. +pub const table = table: { + // This is only available after running main() below as part of the Ghostty + // build.zig, but due to Zig's lazy analysis we can still reference it here. + const generated = @import("symbols1_tables").Tables(bool); + const Tables = lut.Tables(bool); + break :table Tables{ + .stage1 = &generated.stage1, + .stage2 = &generated.stage2, + .stage3 = &generated.stage3, + }; +}; + +/// Returns true of the codepoint is a "symbol-like" character, which +/// for now we define as anything in a private use area and anything +/// in several unicode blocks: +/// - Dingbats +/// - Emoticons +/// - Miscellaneous Symbols +/// - Enclosed Alphanumerics +/// - Enclosed Alphanumeric Supplement +/// - Miscellaneous Symbols and Pictographs +/// - Transport and Map Symbols +/// +/// In the future it may be prudent to expand this to encompass more +/// symbol-like characters, and/or exclude some PUA sections. +pub fn isSymbol(cp: u21) bool { + return ziglyph.general_category.isPrivateUse(cp) or + ziglyph.blocks.isDingbats(cp) or + ziglyph.blocks.isEmoticons(cp) or + ziglyph.blocks.isMiscellaneousSymbols(cp) or + ziglyph.blocks.isEnclosedAlphanumerics(cp) or + ziglyph.blocks.isEnclosedAlphanumericSupplement(cp) or + ziglyph.blocks.isMiscellaneousSymbolsAndPictographs(cp) or + ziglyph.blocks.isTransportAndMapSymbols(cp); +} + +/// Runnable binary to generate the lookup tables and output to stdout. +pub fn main() !void { + var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator); + defer arena_state.deinit(); + const alloc = arena_state.allocator(); + + const gen: lut.Generator( + bool, + struct { + pub fn get(ctx: @This(), cp: u21) !bool { + _ = ctx; + return isSymbol(cp); + } + + pub fn eql(ctx: @This(), a: bool, b: bool) bool { + _ = ctx; + return a == b; + } + }, + ) = .{}; + + const t = try gen.generate(alloc); + defer alloc.free(t.stage1); + defer alloc.free(t.stage2); + defer alloc.free(t.stage3); + try t.writeZig(std.io.getStdOut().writer()); + + // Uncomment when manually debugging to see our table sizes. + // std.log.warn("stage1={} stage2={} stage3={}", .{ + // t.stage1.len, + // t.stage2.len, + // t.stage3.len, + // }); +} + +// This is not very fast in debug modes, so its commented by default. +// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CHANGES. +test "unicode symbols1: tables match ziglyph" { + const testing = std.testing; + + for (0..std.math.maxInt(u21)) |cp| { + const t = table.get(@intCast(cp)); + const zg = isSymbol(@intCast(cp)); + + if (t != zg) { + std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t, zg }); + try testing.expect(false); + } + } +} diff --git a/src/unicode/symbols2.zig b/src/unicode/symbols2.zig new file mode 100644 index 000000000..1d23c51be --- /dev/null +++ b/src/unicode/symbols2.zig @@ -0,0 +1,85 @@ +const props = @This(); +const std = @import("std"); +const assert = std.debug.assert; +const ziglyph = @import("ziglyph"); +const lut2 = @import("lut2.zig"); + +/// The lookup tables for Ghostty. +pub const table = table: { + // This is only available after running main() below as part of the Ghostty + // build.zig, but due to Zig's lazy analysis we can still reference it here. + const generated = @import("symbols2_tables"); + break :table lut2.Tables{ + .min = generated.min, + .max = generated.max, + .stage1 = &generated.stage1, + .stage2 = &generated.stage2, + }; +}; + +/// Returns true of the codepoint is a "symbol-like" character, which +/// for now we define as anything in a private use area and anything +/// in several unicode blocks: +/// - Dingbats +/// - Emoticons +/// - Miscellaneous Symbols +/// - Enclosed Alphanumerics +/// - Enclosed Alphanumeric Supplement +/// - Miscellaneous Symbols and Pictographs +/// - Transport and Map Symbols +/// +/// In the future it may be prudent to expand this to encompass more +/// symbol-like characters, and/or exclude some PUA sections. +pub fn isSymbol(cp: u21) bool { + return ziglyph.general_category.isPrivateUse(cp) or + ziglyph.blocks.isDingbats(cp) or + ziglyph.blocks.isEmoticons(cp) or + ziglyph.blocks.isMiscellaneousSymbols(cp) or + ziglyph.blocks.isEnclosedAlphanumerics(cp) or + ziglyph.blocks.isEnclosedAlphanumericSupplement(cp) or + ziglyph.blocks.isMiscellaneousSymbolsAndPictographs(cp) or + ziglyph.blocks.isTransportAndMapSymbols(cp); +} + +/// Runnable binary to generate the lookup tables and output to stdout. +pub fn main() !void { + var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator); + defer arena_state.deinit(); + const alloc = arena_state.allocator(); + + const gen: lut2.Generator( + struct { + pub fn get(ctx: @This(), cp: u21) bool { + _ = ctx; + return isSymbol(cp); + } + }, + ) = .{}; + + const t = try gen.generate(alloc); + defer alloc.free(t.stage1); + defer alloc.free(t.stage2); + try t.writeZig(std.io.getStdOut().writer()); + + // Uncomment when manually debugging to see our table sizes. + // std.log.warn("stage1={} stage2={}", .{ + // t.stage1.len, + // t.stage2.len, + // }); +} + +// This is not very fast in debug modes, so its commented by default. +// IMPORTANT: UNCOMMENT THIS WHENEVER MAKING CHANGES. +test "unicode symbols2: tables match ziglyph" { + const testing = std.testing; + + for (0..std.math.maxInt(u21)) |cp| { + const t1 = table.get(@intCast(cp)); + const zg = isSymbol(@intCast(cp)); + + if (t1 != zg) { + std.log.warn("mismatch cp=U+{x} t={} zg={}", .{ cp, t1, zg }); + try testing.expect(false); + } + } +} |
