revert 84c91bf7e1 (#13214)

2026-02-09 18:38:55 +00:00 · 2024-08-09 19:28:08 -07:00
parent b9ead441c1
commit 9302b42919
31 changed files with 2027 additions and 263 deletions
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -1340,15 +1340,7 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa
        var remaining = bytes[i..];

        {
-            const sequence: [4]u8 = switch (remaining.len) {
-                0 => unreachable,
-                1 => [_]u8{ remaining[0], 0, 0, 0 },
-                2 => [_]u8{ remaining[0], remaining[1], 0, 0 },
-                3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 },
-                else => remaining[0..4].*,
-            };
-
-            const replacement = strings.convertUTF8BytesIntoUTF16(&sequence);
+            const replacement = strings.convertUTF8BytesIntoUTF16(remaining);
            if (comptime fail_if_invalid) {
                if (replacement.fail) {
                    if (comptime Environment.allow_assert) assert(replacement.code_point == unicode_replacement);
@@ -1375,15 +1367,7 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa
            strings.copyU8IntoU16(output.items[end..][0..j], remaining[0..j]);
            remaining = remaining[j..];

-            const sequence: [4]u8 = switch (remaining.len) {
-                0 => unreachable,
-                1 => [_]u8{ remaining[0], 0, 0, 0 },
-                2 => [_]u8{ remaining[0], remaining[1], 0, 0 },
-                3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 },
-                else => remaining[0..4].*,
-            };
-
-            const replacement = strings.convertUTF8BytesIntoUTF16(&sequence);
+            const replacement = strings.convertUTF8BytesIntoUTF16(remaining);
            if (comptime fail_if_invalid) {
                if (replacement.fail) {
                    if (comptime Environment.allow_assert) assert(replacement.code_point == unicode_replacement);
@@ -1436,6 +1420,101 @@ pub fn toUTF16AllocForReal(allocator: std.mem.Allocator, bytes: []const u8, comp
    };
 }

+/// Converts UTF-8 `bytes` to UTF-16, optionally holding back an incomplete trailing sequence.
+/// Returns null when `strings.firstNonASCII` finds nothing to convert. Otherwise returns a tuple
+/// of the UTF-16 output (allocated with `allocator`; capacity may exceed its length), up to three
+/// held-back input bytes, and the count of held-back bytes. Bytes are only held back when `flush`
+/// is false and the decoder reports a bufferable failure on a sequence longer than the remaining
+/// input. With `fail_if_invalid`, any other failed sequence returns `error.InvalidByteSequence`.
+pub fn toUTF16AllocMaybeBuffered(
+    allocator: std.mem.Allocator,
+    bytes: []const u8,
+    comptime fail_if_invalid: bool,
+    comptime flush: bool,
+) error{ OutOfMemory, InvalidByteSequence }!?struct { []u16, [3]u8, u2 } {
+    const first_non_ascii = strings.firstNonASCII(bytes) orelse return null;
+
+    var output: std.ArrayListUnmanaged(u16) = if (comptime bun.FeatureFlags.use_simdutf) output: {
+        const out_length = bun.simdutf.length.utf16.from.utf8(bytes);
+        if (out_length == 0) break :output .{};
+
+        const out = try allocator.alloc(u16, out_length);
+        const res = bun.simdutf.convert.utf8.to.utf16.with_errors.le(bytes, out);
+        if (res.status == .success) {
+            log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, out_length });
+            return .{ out, .{0} ** 3, 0 };
+        }
+
+        // Conversion failed: reuse the allocation and keep out[0..first_non_ascii] (NOTE(review): assumes simdutf already wrote that prefix — confirm).
+        var list = std.ArrayListUnmanaged(u16).fromOwnedSlice(out[0..first_non_ascii]);
+        list.capacity = out.len;
+
+        break :output list;
+    } else .{};
+    errdefer output.deinit(allocator);
+
+    const start = if (output.items.len > 0) first_non_ascii else 0; // skip the prefix already held in `output`
+    var remaining = bytes[start..];
+
+    // The first non-ASCII byte is `first_non_ascii - start` bytes into `remaining`; when nothing was pre-converted the ASCII prefix is copied by the loop.
+    var non_ascii: ?u32 = first_non_ascii - start;
+    while (non_ascii) |i| : (non_ascii = strings.firstNonASCII(remaining)) {
+        const end = output.items.len;
+        try output.ensureUnusedCapacity(allocator, i + 2); // +2 for UTF16 codepoint
+        output.items.len += i;
+        strings.copyU8IntoU16(output.items[end..][0..i], remaining[0..i]);
+        remaining = remaining[i..];
+
+        const sequence: [4]u8 = switch (remaining.len) {
+            0 => unreachable,
+            1 => .{ remaining[0], 0, 0, 0 },
+            2 => .{ remaining[0], remaining[1], 0, 0 },
+            3 => .{ remaining[0], remaining[1], remaining[2], 0 },
+            else => remaining[0..4].*,
+        };
+
+        const converted_length = strings.nonASCIISequenceLength(sequence[0]);
+        const converted = strings.convertUTF8BytesIntoUTF16WithLength(&sequence, converted_length, remaining.len);
+
+        if (comptime !flush) {
+            // Input ended mid-sequence: hand the undecoded tail back to the caller instead of decoding it now.
+            if (converted.fail and converted.can_buffer and converted_length > remaining.len) {
+                const buffered: [3]u8 = switch (remaining.len) {
+                    else => unreachable,
+                    1 => .{ remaining[0], 0, 0 },
+                    2 => .{ remaining[0], remaining[1], 0 },
+                    3 => .{ remaining[0], remaining[1], remaining[2] },
+                };
+                return .{ output.items, buffered, @intCast(remaining.len) };
+            }
+        }
+
+        if (comptime fail_if_invalid) {
+            if (converted.fail) {
+                if (comptime Environment.allow_assert) bun.assert(converted.code_point == unicode_replacement);
+                return error.InvalidByteSequence;
+            }
+        }
+
+        remaining = remaining[@max(converted.len, 1)..]; // always consume at least one byte so the loop advances
+
+        // #define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
+        switch (converted.code_point) {
+            0...0xffff => |c| output.appendAssumeCapacity(@intCast(c)),
+            else => |c| output.appendSliceAssumeCapacity(&.{ strings.u16Lead(c), strings.u16Trail(c) }),
+        }
+    }
+
+    if (remaining.len > 0) {
+        try output.ensureTotalCapacityPrecise(allocator, output.items.len + remaining.len);
+        output.items.len += remaining.len;
+        strings.copyU8IntoU16(output.items[output.items.len - remaining.len ..], remaining);
+    }
+
+    log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, output.items.len });
+    return .{ output.items, .{0} ** 3, 0 };
+}
+
 pub fn toUTF16AllocNoTrim(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool, comptime _: bool) !?[]u16 {
    if (strings.firstNonASCII(bytes)) |i| {
        const output_: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: {
@@ -1474,15 +1553,7 @@ pub fn toUTF16AllocNoTrim(allocator: std.mem.Allocator, bytes: []const u8, compt
        var remaining = bytes[i..];

        {
-            const sequence: [4]u8 = switch (remaining.len) {
-                0 => unreachable,
-                1 => [_]u8{ remaining[0], 0, 0, 0 },
-                2 => [_]u8{ remaining[0], remaining[1], 0, 0 },
-                3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 },
-                else => remaining[0..4].*,
-            };
-
-            const replacement = strings.convertUTF8BytesIntoUTF16(&sequence);
+            const replacement = strings.convertUTF8BytesIntoUTF16(remaining);
            if (comptime fail_if_invalid) {
                if (replacement.fail) {
                    if (comptime Environment.allow_assert) assert(replacement.code_point == unicode_replacement);
@@ -1509,15 +1580,7 @@ pub fn toUTF16AllocNoTrim(allocator: std.mem.Allocator, bytes: []const u8, compt
            strings.copyU8IntoU16(output.items[end..][0..j], remaining[0..j]);
            remaining = remaining[j..];

-            const sequence: [4]u8 = switch (remaining.len) {
-                0 => unreachable,
-                1 => [_]u8{ remaining[0], 0, 0, 0 },
-                2 => [_]u8{ remaining[0], remaining[1], 0, 0 },
-                3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 },
-                else => remaining[0..4].*,
-            };
-
-            const replacement = strings.convertUTF8BytesIntoUTF16(&sequence);
+            const replacement = strings.convertUTF8BytesIntoUTF16(remaining);
            if (comptime fail_if_invalid) {
                if (replacement.fail) {
                    if (comptime Environment.allow_assert) assert(replacement.code_point == unicode_replacement);
@@ -2076,7 +2139,9 @@ pub const UTF16Replacement = struct {
    /// and a genuine error.
    fail: bool = false,

-    pub inline fn utf8Width(replacement: UTF16Replacement) usize {
+    can_buffer: bool = true,
+
+    pub inline fn utf8Width(replacement: UTF16Replacement) u3 {
        return switch (replacement.code_point) {
            0...0x7F => 1,
            (0x7F + 1)...0x7FF => 2,
@@ -2086,10 +2151,8 @@ pub const UTF16Replacement = struct {
    }
 };

-// This variation matches WebKit behavior.
-fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
+fn convertUTF8BytesIntoUTF16WithLength(sequence: *const [4]u8, len: u3, remaining_len: usize) UTF16Replacement {
    if (comptime Environment.allow_assert) assert(sequence[0] > 127);
-    const len = nonASCIISequenceLength(sequence[0]);
    switch (len) {
        2 => {
            if (comptime Environment.allow_assert) {
@@ -2097,7 +2160,7 @@ fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
                bun.assert(sequence[0] <= 0xDF);
            }
            if (sequence[1] < 0x80 or sequence[1] > 0xBF) {
-                return .{ .len = 1, .fail = true };
+                return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
            }
            return .{ .len = len, .code_point = ((@as(u32, sequence[0]) << 6) + @as(u32, sequence[1])) - 0x00003080 };
        },
@@ -2109,22 +2172,22 @@ fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
            switch (sequence[0]) {
                0xE0 => {
                    if (sequence[1] < 0xA0 or sequence[1] > 0xBF) {
-                        return .{ .len = 1, .fail = true };
+                        return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
                    }
                },
                0xED => {
                    if (sequence[1] < 0x80 or sequence[1] > 0x9F) {
-                        return .{ .len = 1, .fail = true };
+                        return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
                    }
                },
                else => {
                    if (sequence[1] < 0x80 or sequence[1] > 0xBF) {
-                        return .{ .len = 1, .fail = true };
+                        return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
                    }
                },
            }
            if (sequence[2] < 0x80 or sequence[2] > 0xBF) {
-                return .{ .len = 2, .fail = true };
+                return .{ .len = 2, .fail = true, .can_buffer = remaining_len < 3 };
            }
            return .{
                .len = len,
@@ -2135,36 +2198,36 @@ fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
            switch (sequence[0]) {
                0xF0 => {
                    if (sequence[1] < 0x90 or sequence[1] > 0xBF) {
-                        return .{ .len = 1, .fail = true };
+                        return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
                    }
                },
                0xF4 => {
                    if (sequence[1] < 0x80 or sequence[1] > 0x8F) {
-                        return .{ .len = 1, .fail = true };
+                        return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
                    }
                },

                // invalid code point
                // this used to be an assertion
                0...(0xF0 - 1), 0xF4 + 1...std.math.maxInt(@TypeOf(sequence[0])) => {
-                    return UTF16Replacement{ .len = 1, .fail = true };
+                    return .{ .len = 1, .fail = true, .can_buffer = false };
                },

                else => {
                    if (sequence[1] < 0x80 or sequence[1] > 0xBF) {
-                        return .{ .len = 1, .fail = true };
+                        return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
                    }
                },
            }

            if (sequence[2] < 0x80 or sequence[2] > 0xBF) {
-                return .{ .len = 2, .fail = true };
+                return .{ .len = 2, .fail = true, .can_buffer = remaining_len < 3 };
            }
            if (sequence[3] < 0x80 or sequence[3] > 0xBF) {
-                return .{ .len = 3, .fail = true };
+                return .{ .len = 3, .fail = true, .can_buffer = remaining_len < 4 };
            }
            return .{
-                .len = 4,
+                .len = len,
                .code_point = ((@as(u32, sequence[0]) << 18) +
                    (@as(u32, sequence[1]) << 12) +
                    (@as(u32, sequence[2]) << 6) + @as(u32, sequence[3])) - 0x03C82080,
@@ -2176,6 +2239,21 @@ fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
    }
 }

+/// Decodes the UTF-8 sequence at the start of `bytes`. This variation matches WebKit behavior.
+/// `bytes` must be non-empty; inputs shorter than four bytes are zero-padded before decoding.
+fn convertUTF8BytesIntoUTF16(bytes: []const u8) UTF16Replacement {
+    const padded: [4]u8 = switch (bytes.len) {
+        0 => unreachable,
+        1 => .{ bytes[0], 0, 0, 0 },
+        2 => .{ bytes[0], bytes[1], 0, 0 },
+        3 => .{ bytes[0], bytes[1], bytes[2], 0 },
+        else => bytes[0..4].*,
+    };
+    const lead = padded[0];
+    if (comptime Environment.allow_assert) assert(lead > 127);
+    return convertUTF8BytesIntoUTF16WithLength(&padded, nonASCIISequenceLength(lead), bytes.len);
+}
+
 pub fn copyLatin1IntoUTF8(buf_: []u8, comptime Type: type, latin1_: Type) EncodeIntoResult {
    return copyLatin1IntoUTF8StopOnNonASCII(buf_, Type, latin1_, false);
 }
@@ -3337,6 +3415,35 @@ pub fn encodeWTF8RuneT(p: *[4]u8, comptime R: type, r: R) u3 {
    }
 }

+/// Encodes `code_point` as a WTF-8 byte sequence, left-aligned in four bytes.
+/// Bytes past the encoded length are zero.
+pub fn wtf8Sequence(code_point: u32) [4]u8 {
+    const cp = code_point;
+    if (cp <= 0x7f) return .{ @intCast(cp), 0, 0, 0 };
+
+    if (cp <= 0x7ff) {
+        return .{ @truncate(0xc0 | (cp >> 6)), @truncate(0x80 | (cp & 0x3f)), 0, 0 };
+    }
+
+    if (cp <= 0xffff) {
+        return .{
+            @truncate(0xe0 | (cp >> 12)),
+            @truncate(0x80 | ((cp >> 6) & 0x3f)),
+            @truncate(0x80 | (cp & 0x3f)),
+            0,
+        };
+    }
+
+    // Everything above 0xffff takes the four-byte form.
+    return .{
+        @truncate(0xf0 | (cp >> 18)),
+        @truncate(0x80 | ((cp >> 12) & 0x3f)),
+        @truncate(0x80 | ((cp >> 6) & 0x3f)),
+        @truncate(0x80 | (cp & 0x3f)),
+    };
+}
+
+
 pub inline fn wtf8ByteSequenceLength(first_byte: u8) u3 {
    return switch (first_byte) {
        0 => 0,
@@ -3521,16 +3628,36 @@ pub fn isAllASCII(slice: []const u8) bool {
    return true;
 }

-//#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
+// #define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
 pub inline fn u16Lead(supplementary: anytype) u16 {
-    return @as(u16, @intCast((supplementary >> 10) + 0xd7c0));
+    return @intCast((supplementary >> 10) + 0xd7c0); // 0xd7c0 == 0xd800 - (0x10000 >> 10)
 }

-//#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
+// #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
 pub inline fn u16Trail(supplementary: anytype) u16 {
-    return @as(u16, @intCast((supplementary & 0x3ff) | 0xdc00));
+    return @intCast((supplementary & 0x3ff) | 0xdc00); // low 10 bits, tagged with the 0xdc00 base
 }

+/// True when the code unit is in 0xdc00...0xdfff; same test as the U16_IS_TRAIL(c) macro.
+pub inline fn u16IsTrail(supplementary: u16) bool {
+    return (supplementary & 0xfc00) == 0xdc00;
+}
+
+/// True when the code unit is in 0xd800...0xdbff; same test as the U16_IS_LEAD(c) macro.
+pub inline fn u16IsLead(supplementary: u16) bool {
+    return (supplementary & 0xfc00) == 0xd800;
+}
+
+// #define U16_GET_SUPPLEMENTARY(lead, trail) \
+//     (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
+pub inline fn u16GetSupplementary(lead: u32, trail: u32) u32 {
+    const combined = (lead << 10) + trail;
+    return combined - u16_surrogate_offset;
+}
+
+// #define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
+pub const u16_surrogate_offset = (0xd800 << 10) + 0xdc00 - 0x10000;
+
 pub fn firstNonASCII(slice: []const u8) ?u32 {
    return firstNonASCIIWithType([]const u8, slice);
 }