Optimize TextEncoderStream, part 1 (#13222)

2026-02-11 19:38:58 +00:00 · 2024-08-10 02:13:36 -07:00
parent 1eb5ecb563
commit d861347dc5
8 changed files with 190 additions and 112 deletions
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -1614,13 +1614,18 @@ pub fn toUTF16AllocNoTrim(allocator: std.mem.Allocator, bytes: []const u8, compt
 }

 pub fn utf16CodepointWithFFFD(comptime Type: type, input: Type) UTF16Replacement {
-    const c0 = @as(u21, input[0]);
+    return utf16CodepointWithFFFDAndFirstInputChar(Type, input[0], input);
+}
+
+fn utf16CodepointWithFFFDAndFirstInputChar(comptime Type: type, char: std.meta.Elem(Type), input: Type) UTF16Replacement {
+    const c0 = @as(u21, char);

    if (c0 & ~@as(u21, 0x03ff) == 0xd800) {
        // surrogate pair
        if (input.len == 1)
            return .{
                .len = 1,
+                .is_lead = true,
            };
        //error.DanglingSurrogateHalf;
        const c1 = @as(u21, input[1]);
@@ -1634,6 +1639,7 @@ pub fn utf16CodepointWithFFFD(comptime Type: type, input: Type) UTF16Replacement
                    .fail = true,
                    .len = 1,
                    .code_point = unicode_replacement,
+                    .is_lead = true,
                };
            };
        // return error.ExpectedSecondSurrogateHalf;
@@ -1862,7 +1868,7 @@ pub fn convertUTF16ToUTF8(list_: std.ArrayList(u8), comptime Type: type, utf16:
    );
    if (result.status == .surrogate) {
        // Slow path: there was invalid UTF-16, so we need to convert it without simdutf.
-        return toUTF8ListWithTypeBun(&list, Type, utf16);
+        return toUTF8ListWithTypeBun(&list, Type, utf16, false);
    }

    list.items.len = result.count;
@@ -1877,7 +1883,7 @@ pub fn convertUTF16ToUTF8Append(list: *std.ArrayList(u8), utf16: []const u16) !v

    if (result.status == .surrogate) {
        // Slow path: there was invalid UTF-16, so we need to convert it without simdutf.
-        _ = try toUTF8ListWithTypeBun(list, []const u16, utf16);
+        _ = try toUTF8ListWithTypeBun(list, []const u16, utf16, false);
        return;
    }

@@ -1951,14 +1957,15 @@ pub fn toUTF8FromLatin1Z(allocator: std.mem.Allocator, latin1: []const u8) !?std
    return list1;
 }

-pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf16: Type) !std.ArrayList(u8) {
+pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf16: Type, comptime skip_trailing_replacement: bool) !(if (skip_trailing_replacement) ?u16 else std.ArrayList(u8)) {
    var utf16_remaining = utf16;

    while (firstNonASCII16(Type, utf16_remaining)) |i| {
        const to_copy = utf16_remaining[0..i];
        utf16_remaining = utf16_remaining[i..];
+        const token = utf16_remaining[0];

-        const replacement = utf16CodepointWithFFFD(Type, utf16_remaining);
+        const replacement = utf16CodepointWithFFFDAndFirstInputChar(Type, token, utf16_remaining);
        utf16_remaining = utf16_remaining[replacement.len..];

        const count: usize = replacement.utf8Width();
@@ -1975,8 +1982,13 @@ pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf1
            to_copy,
        );

-        list.items.len += count;
+        if (comptime skip_trailing_replacement) {
+            if (replacement.is_lead and utf16_remaining.len == 0) {
+                return token;
+            }
+        }

+        list.items.len += count;
        _ = encodeWTF8RuneT(
            list.items.ptr[list.items.len - count .. list.items.len - count + 4][0..4],
            u32,
@@ -1993,6 +2005,9 @@ pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf1

    log("UTF16 {d} -> {d} UTF8", .{ utf16.len, list.items.len });

+    if (comptime skip_trailing_replacement) {
+        return null;
+    }
    return list.*;
 }

@@ -2140,6 +2155,7 @@ pub const UTF16Replacement = struct {
    fail: bool = false,

    can_buffer: bool = true,
+    is_lead: bool = false,

    pub inline fn utf8Width(replacement: UTF16Replacement) u3 {
        return switch (replacement.code_point) {