fix(string): use simdutf for Latin1-to-UTF8 to prevent OOM from buffer under-allocation

The old `allocateLatin1IntoUTF8WithList` path reserved only `latin1.len` bytes initially, but each non-ASCII Latin1 character (128-255) needs 2 bytes in UTF-8. For large strings with many non-ASCII characters, the buffer therefore had to grow repeatedly during conversion, and that growth could trigger OOM. The conversion now uses `simdutf` to compute the exact UTF-8 length upfront, reserves that capacity once, and converts in a single call, eliminating the unnecessary reallocations.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-09 08:50:42 +01:00 · 2026-03-01 05:56:49 +00:00
2 changed files with 86 additions and 113 deletions
--- a/src/string/immutable/unicode.zig
+++ b/src/string/immutable/unicode.zig
@@ -355,7 +355,8 @@ pub fn toUTF8FromLatin1(allocator: std.mem.Allocator, latin1: []const u8) !?std.
    if (isAllASCII(latin1))
        return null;

-    const list = try std.array_list.Managed(u8).initCapacity(allocator, latin1.len);
+    const utf8_len = bun.simdutf.length.utf8.from.latin1(latin1);
+    const list = try std.array_list.Managed(u8).initCapacity(allocator, utf8_len);
    return try allocateLatin1IntoUTF8WithList(list, 0, latin1);
 }

@@ -363,7 +364,8 @@ pub fn toUTF8FromLatin1Z(allocator: std.mem.Allocator, latin1: []const u8) !?std
    if (isAllASCII(latin1))
        return null;

-    const list = try std.array_list.Managed(u8).initCapacity(allocator, latin1.len + 1);
+    const utf8_len = bun.simdutf.length.utf8.from.latin1(latin1);
+    const list = try std.array_list.Managed(u8).initCapacity(allocator, utf8_len + 1);
    var list1 = try allocateLatin1IntoUTF8WithList(list, 0, latin1);
    try list1.append(0);
    return list1;
@@ -432,125 +434,24 @@ pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, latin1_: []const u8)
        return out;
    }

-    const list = try std.array_list.Managed(u8).initCapacity(allocator, latin1_.len);
+    const utf8_len = bun.simdutf.length.utf8.from.latin1(latin1_);
+    const list = try std.array_list.Managed(u8).initCapacity(allocator, utf8_len);
    var foo = try allocateLatin1IntoUTF8WithList(list, 0, latin1_);
    return try foo.toOwnedSlice();
 }

 pub fn allocateLatin1IntoUTF8WithList(list_: std.array_list.Managed(u8), offset_into_list: usize, latin1_: []const u8) OOM!std.array_list.Managed(u8) {
-    var latin1 = latin1_;
-    var i: usize = offset_into_list;
    var list = list_;
-    try list.ensureUnusedCapacity(latin1.len);

-    while (latin1.len > 0) {
-        if (comptime Environment.allow_assert) assert(i < list.capacity);
-        var buf = list.items.ptr[i..list.capacity];
+    // Ask simdutf for the exact UTF-8 length first, reserve that much room after
+    // `offset_into_list`, then transcode with one conversion call. Growing on demand
+    // could OOM on large inputs: each Latin1 byte 128-255 becomes 2 UTF-8 bytes.
+    const utf8_len = bun.simdutf.length.utf8.from.latin1(latin1_);
+    try list.ensureTotalCapacity(offset_into_list + utf8_len);
+    const written = bun.simdutf.convert.latin1.to.utf8(latin1_, list.items.ptr[offset_into_list .. offset_into_list + utf8_len]);
+    list.items.len = offset_into_list + written;

-        inner: {
-            var count = latin1.len / ascii_vector_size;
-            while (count > 0) : (count -= 1) {
-                const vec: AsciiVector = latin1[0..ascii_vector_size].*;
-
-                if (@reduce(.Max, vec) > 127) {
-                    const Int = u64;
-                    const size = @sizeOf(Int);
-
-                    // zig or LLVM doesn't do @ctz nicely with SIMD
-                    if (comptime ascii_vector_size >= 8) {
-                        {
-                            const bytes = @as(Int, @bitCast(latin1[0..size].*));
-                            // https://dotat.at/@/2022-06-27-tolower-swar.html
-                            const mask = bytes & 0x8080808080808080;
-
-                            if (mask > 0) {
-                                const first_set_byte = @ctz(mask) / 8;
-                                if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127);
-
-                                buf[0..size].* = @as([size]u8, @bitCast(bytes));
-                                buf = buf[first_set_byte..];
-                                latin1 = latin1[first_set_byte..];
-                                break :inner;
-                            }
-
-                            buf[0..size].* = @as([size]u8, @bitCast(bytes));
-                            latin1 = latin1[size..];
-                            buf = buf[size..];
-                        }
-
-                        if (comptime ascii_vector_size >= 16) {
-                            const bytes = @as(Int, @bitCast(latin1[0..size].*));
-                            // https://dotat.at/@/2022-06-27-tolower-swar.html
-                            const mask = bytes & 0x8080808080808080;
-
-                            if (mask > 0) {
-                                const first_set_byte = @ctz(mask) / 8;
-                                if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127);
-
-                                buf[0..size].* = @as([size]u8, @bitCast(bytes));
-                                buf = buf[first_set_byte..];
-                                latin1 = latin1[first_set_byte..];
-                                break :inner;
-                            }
-                        }
-                    }
-                    unreachable;
-                }
-
-                buf[0..ascii_vector_size].* = @as([ascii_vector_size]u8, @bitCast(vec))[0..ascii_vector_size].*;
-                latin1 = latin1[ascii_vector_size..];
-                buf = buf[ascii_vector_size..];
-            }
-
-            while (latin1.len >= 8) {
-                const Int = u64;
-                const size = @sizeOf(Int);
-
-                const bytes = @as(Int, @bitCast(latin1[0..size].*));
-                // https://dotat.at/@/2022-06-27-tolower-swar.html
-                const mask = bytes & 0x8080808080808080;
-
-                if (mask > 0) {
-                    const first_set_byte = @ctz(mask) / 8;
-                    if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127);
-
-                    buf[0..size].* = @as([size]u8, @bitCast(bytes));
-                    latin1 = latin1[first_set_byte..];
-                    buf = buf[first_set_byte..];
-                    break :inner;
-                }
-
-                buf[0..size].* = @as([size]u8, @bitCast(bytes));
-                latin1 = latin1[size..];
-                buf = buf[size..];
-            }
-
-            {
-                if (comptime Environment.allow_assert) assert(latin1.len < 8);
-                const end = latin1.ptr + latin1.len;
-                while (latin1.ptr != end and latin1[0] < 128) {
-                    buf[0] = latin1[0];
-                    buf = buf[1..];
-                    latin1 = latin1[1..];
-                }
-            }
-        }
-
-        while (latin1.len > 0 and latin1[0] > 127) {
-            i = @intFromPtr(buf.ptr) - @intFromPtr(list.items.ptr);
-            list.items.len = i;
-            try list.ensureUnusedCapacity(2 + latin1.len);
-            buf = list.items.ptr[i..list.capacity];
-            buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]);
-            latin1 = latin1[1..];
-            buf = buf[2..];
-        }
-
-        i = @intFromPtr(buf.ptr) - @intFromPtr(list.items.ptr);
-        list.items.len = i;
-    }
-
-    log("Latin1 {d} -> UTF8 {d}", .{ latin1_.len, i });
+    log("Latin1 {d} -> UTF8 {d}", .{ latin1_.len, written });

    return list;
 }
--- a/test/js/node/fs/fs-latin1-utf8.test.ts
+++ b/test/js/node/fs/fs-latin1-utf8.test.ts
@@ -0,0 +1,72 @@
+import { expect, test } from "bun:test";
+import { tempDir } from "harness";
+import { promises, readFileSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+
+// Regression test: Latin1 strings with non-ASCII characters (128-255) need
+// 2 bytes per character in UTF-8. The initial buffer allocation must account
+// for this expansion to avoid OOM from repeated buffer growth.
+
+test("writeFile with non-ASCII Latin1 string", async () => {
+  using dir = tempDir("latin1-utf8", {});
+  const target = join(String(dir), "test.txt");
+
+  // One copy of every code point in the upper Latin1 range (128-255). JSC keeps
+  // such a string in its 8-bit Latin1 representation, and each of these
+  // characters encodes to 2 bytes in UTF-8.
+  const upperRange = Array.from({ length: 128 }, (_, k) =>
+    String.fromCharCode(128 + k),
+  ).join("");
+
+  // 1000 repetitions -> 128,000 characters, so the written file should hold
+  // 256,000 bytes.
+  const payload = upperRange.repeat(1000);
+
+  // Go through the async promises API: it matches the crash scenario
+  // (runAsync path).
+  await promises.writeFile(target, payload);
+
+  // Read the raw bytes back so both size and content can be checked.
+  const bytes = readFileSync(target);
+
+  // Size check: exactly 2 UTF-8 bytes per input character.
+  expect(bytes.length).toBe(payload.length * 2);
+
+  // Content check: decoding the bytes must reproduce the input string.
+  const roundTripped = bytes.toString("utf8");
+  expect(roundTripped).toBe(payload);
+});
+
+test("writeFileSync with non-ASCII Latin1 string", () => {
+  using dir = tempDir("latin1-utf8-sync", {});
+  const target = join(String(dir), "test.txt");
+
+  // Code points 128-255 once each, repeated 500 times: every character in the
+  // payload takes 2 bytes once encoded as UTF-8.
+  const upperRange = Array.from({ length: 128 }, (_, k) => String.fromCharCode(128 + k)).join("");
+  const payload = upperRange.repeat(500);
+
+  // Write through the synchronous API.
+  writeFileSync(target, payload);
+
+  // The file must be twice as long as the string and decode back to it.
+  const bytes = readFileSync(target);
+  expect(bytes.length).toBe(payload.length * 2);
+  expect(bytes.toString("utf8")).toBe(payload);
+});
+
+test("writeFile with mixed ASCII and Latin1 string", async () => {
+  using dir = tempDir("latin1-utf8-mixed", {});
+  const target = join(String(dir), "test.txt");
+
+  // ASCII words interleaved with upper-range Latin1 characters, so the encoded
+  // output mixes 1-byte and 2-byte UTF-8 sequences.
+  const segment = "hello\xE9\xE8\xFC\xF1world\xC0\xDF";
+  let payload = "";
+  for (let round = 0; round < 10000; round += 1) payload += segment;
+
+  await promises.writeFile(target, payload);
+
+  // Reading the file back as UTF-8 must return the original string.
+  expect(readFileSync(target, "utf8")).toBe(payload);
+});