summaryrefslogtreecommitdiff
path: root/src/simd
diff options
context:
space:
mode:
Diffstat (limited to 'src/simd')
-rw-r--r--src/simd/vt.zig51
1 files changed, 39 insertions, 12 deletions
diff --git a/src/simd/vt.zig b/src/simd/vt.zig
index 2a19c52c7..8e974ad7e 100644
--- a/src/simd/vt.zig
+++ b/src/simd/vt.zig
@@ -43,19 +43,44 @@ fn utf8DecodeUntilControlSeqScalar(
) DecodeResult {
// Find our escape
const idx = indexOf(input, 0x1B) orelse input.len;
-
- // Copy up to the escape
- const view = std.unicode.Utf8View.init(input[0..idx]) catch unreachable;
- var it = view.iterator();
- var i: usize = 0;
- while (it.nextCodepoint()) |cp| {
- output[i] = @intCast(cp);
- i += 1;
+ const decode = input[0..idx];
+
+ // Go through and decode one item at a time.
+ var decode_offset: usize = 0;
+ var decode_count: usize = 0;
+ while (decode_offset < decode.len) {
+ const decode_rem = decode[decode_offset..];
+ const cp_len = std.unicode.utf8ByteSequenceLength(decode_rem[0]) catch {
+ // Note, this is matching our SIMD behavior, but it is admittedly
+ // a bit weird. See our "decode invalid leading byte" test too.
+ // SIMD should be our source of truth then we copy behavior here.
+ break;
+ };
+
+ // If we don't have that number of bytes available. we finish. We
+ // assume this is a partial input and we defer to the future.
+ if (decode_rem.len < cp_len) break;
+
+ // We have the bytes available, so move forward
+ const cp_bytes = decode_rem[0..cp_len];
+ decode_offset += cp_len;
+ if (std.unicode.utf8Decode(cp_bytes)) |cp| {
+ output[decode_count] = @intCast(cp);
+ decode_count += 1;
+ } else |_| {
+ // If decoding failed, we replace the leading byte with the
+ // replacement char and then continue decoding after that
+ // byte. This matches the SIMD behavior and is tested by the
+ // "invalid UTF-8" tests.
+ output[decode_count] = 0xFFFD;
+ decode_count += 1;
+ decode_offset -= cp_len - 1;
+ }
}
return .{
- .consumed = idx,
- .decoded = i,
+ .consumed = decode_offset,
+ .decoded = decode_count,
};
}
@@ -139,16 +164,18 @@ test "decode invalid UTF-8" {
var output: [64]u32 = undefined;
- // Invalid leading 1s
+ // Invalid leading 2-byte sequence
{
- const str = "hello\xc2\x00";
+ const str = "hello\xc2\x01";
try testing.expectEqual(DecodeResult{
.consumed = 7,
.decoded = 7,
}, utf8DecodeUntilControlSeq(str, &output));
}
+ // Replacement will only replace the invalid leading byte.
try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
+ try testing.expectEqual(@as(u32, 0x01), output[6]);
}
// This is testing our current behavior so that we know we have to handle