1 files changed, 39 insertions, 12 deletions
diff --git a/src/simd/vt.zig b/src/simd/vt.zig
index 2a19c52c7..8e974ad7e 100644
--- a/src/simd/vt.zig
+++ b/src/simd/vt.zig
@@ -43,19 +43,44 @@ fn utf8DecodeUntilControlSeqScalar(
 ) DecodeResult {
     // Find our escape
     const idx = indexOf(input, 0x1B) orelse input.len;
-
-    // Copy up to the escape
-    const view = std.unicode.Utf8View.init(input[0..idx]) catch unreachable;
-    var it = view.iterator();
-    var i: usize = 0;
-    while (it.nextCodepoint()) |cp| {
-        output[i] = @intCast(cp);
-        i += 1;
+    const decode = input[0..idx];
+
+    // Go through and decode one item at a time.
+    var decode_offset: usize = 0;
+    var decode_count: usize = 0;
+    while (decode_offset < decode.len) {
+        const decode_rem = decode[decode_offset..];
+        const cp_len = std.unicode.utf8ByteSequenceLength(decode_rem[0]) catch {
+            // Note, this is matching our SIMD behavior, but it is admittedly
+            // a bit weird. See our "decode invalid leading byte" test too.
+            // SIMD should be our source of truth then we copy behavior here.
+            break;
+        };
+
+        // If we don't have that number of bytes available. we finish. We
+        // assume this is a partial input and we defer to the future.
+        if (decode_rem.len < cp_len) break;
+
+        // We have the bytes available, so move forward
+        const cp_bytes = decode_rem[0..cp_len];
+        decode_offset += cp_len;
+        if (std.unicode.utf8Decode(cp_bytes)) |cp| {
+            output[decode_count] = @intCast(cp);
+            decode_count += 1;
+        } else |_| {
+            // If decoding failed, we replace the leading byte with the
+            // replacement char and then continue decoding after that
+            // byte. This matches the SIMD behavior and is tested by the
+            // "invalid UTF-8" tests.
+            output[decode_count] = 0xFFFD;
+            decode_count += 1;
+            decode_offset -= cp_len - 1;
+        }
     }
 
     return .{
-        .consumed = idx,
-        .decoded = i,
+        .consumed = decode_offset,
+        .decoded = decode_count,
     };
 }
 
@@ -139,16 +164,18 @@ test "decode invalid UTF-8" {
 
     var output: [64]u32 = undefined;
 
-    // Invalid leading 1s
+    // Invalid leading 2-byte sequence
     {
-        const str = "hello\xc2\x00";
+        const str = "hello\xc2\x01";
         try testing.expectEqual(DecodeResult{
             .consumed = 7,
             .decoded = 7,
         }, utf8DecodeUntilControlSeq(str, &output));
     }
 
+    // Replacement will only replace the invalid leading byte.
     try testing.expectEqual(@as(u32, 0xFFFD), output[5]);
+    try testing.expectEqual(@as(u32, 0x01), output[6]);
 }
 
 // This is testing our current behavior so that we know we have to handle