Update string_immutable.zig

windows
Slightly reduce memory usage of IPC
2026-02-02 23:18:47 +00:00 · 2025-01-11 18:31:42 -08:00 · 2025-01-11 18:28:15 -08:00 · 2025-01-11 18:23:20 -08:00
10 changed files with 141 additions and 160 deletions
--- a/src/baby_list.zig
+++ b/src/baby_list.zig
@@ -338,9 +338,13 @@ pub fn BabyList(comptime Type: type) type {
            if (comptime Type != u8)
                @compileError("Unsupported for type " ++ @typeName(Type));
            const initial = this.len;
-            const old = this.listManaged(allocator);
-            const new = try strings.allocateLatin1IntoUTF8WithList(old, old.items.len, []const u8, str);
-            this.update(new);
+            var list_ = this.listManaged(allocator);
+
+            {
+                defer this.update(list_);
+                try strings.allocateLatin1IntoUTF8WithList(&list_, list_.items.len, []const u8, str);
+            }
+
            return this.len - initial;
        }

--- a/src/bun.js/bindings/bindings.zig
+++ b/src/bun.js/bindings/bindings.zig
@@ -484,10 +484,12 @@ pub const ZigString = extern struct {
            return try allocator.dupeZ(u8, this.slice());

        var list = std.ArrayList(u8).init(allocator);
-        list = if (this.is16Bit())
-            try strings.toUTF8ListWithType(list, []const u16, this.utf16SliceAligned())
+        errdefer list.deinit();
+
+        if (this.is16Bit())
+            try strings.toUTF8ListWithType(&list, []const u16, this.utf16SliceAligned())
        else
-            try strings.allocateLatin1IntoUTF8WithList(list, 0, []const u8, this.slice());
+            try strings.allocateLatin1IntoUTF8WithList(&list, 0, []const u8, this.slice());

        if (list.capacity > list.items.len) {
            list.items.ptr[list.items.len] = 0;
@@ -501,10 +503,12 @@ pub const ZigString = extern struct {
            return allocator.dupeZ(u8, this.slice());

        var list = std.ArrayList(u8).init(allocator);
-        list = if (this.is16Bit())
-            try strings.toUTF8ListWithType(list, []const u16, this.utf16SliceAligned())
+        errdefer list.deinit();
+
+        if (this.is16Bit())
+            try strings.toUTF8ListWithType(&list, []const u16, this.utf16SliceAligned())
        else
-            try strings.allocateLatin1IntoUTF8WithList(list, 0, []const u8, this.slice());
+            try strings.allocateLatin1IntoUTF8WithList(&list, 0, []const u8, this.slice());

        try list.append(0);
        return list.items[0 .. list.items.len - 1 :0];
--- a/src/bun.js/ipc.zig
+++ b/src/bun.js/ipc.zig
@@ -161,7 +161,7 @@ const advanced = struct {
        return payload_length;
    }

-    pub fn serializeInternal(_: *IPCData, writer: anytype, global: *JSC.JSGlobalObject, value: JSValue) !usize {
+    pub fn serializeInternal(_: *IPCData, writer: *bun.io.StreamBuffer, global: *JSC.JSGlobalObject, value: JSValue) !usize {
        const serialized = value.serialize(global) orelse
            return IPCSerializationError.SerializationFailed;
        defer serialized.deinit();
@@ -246,48 +246,33 @@ const json = struct {
        return IPCDecodeError.NotEnoughBytes;
    }

-    pub fn serialize(_: *IPCData, writer: anytype, global: *JSC.JSGlobalObject, value: JSValue) !usize {
+    pub fn serialize(_: *IPCData, writer: *bun.io.StreamBuffer, global: *JSC.JSGlobalObject, value: JSValue) !usize {
        var out: bun.String = undefined;
        value.jsonStringify(global, 0, &out);
        defer out.deref();

        if (out.tag == .Dead) return IPCSerializationError.SerializationFailed;

-        // TODO: it would be cool to have a 'toUTF8Into' which can write directly into 'ipc_data.outgoing.list'
-        const str = out.toUTF8(bun.default_allocator);
-        defer str.deinit();
+        const initial = writer.list.items.len;
+        try writer.write(&.{1});
+        try writer.writeString(out);
+        try writer.write("\n");

-        const slice = str.slice();
-
-        try writer.ensureUnusedCapacity(1 + slice.len + 1);
-
-        writer.writeAssumeCapacity(&.{1});
-        writer.writeAssumeCapacity(slice);
-        writer.writeAssumeCapacity("\n");
-
-        return 1 + slice.len + 1;
+        return writer.list.items.len - initial;
    }

-    pub fn serializeInternal(_: *IPCData, writer: anytype, global: *JSC.JSGlobalObject, value: JSValue) !usize {
+    pub fn serializeInternal(_: *IPCData, writer: *bun.io.StreamBuffer, global: *JSC.JSGlobalObject, value: JSValue) !usize {
        var out: bun.String = undefined;
        value.jsonStringify(global, 0, &out);
        defer out.deref();

        if (out.tag == .Dead) return IPCSerializationError.SerializationFailed;
+        const initial = writer.list.items.len;
+        try writer.write(&.{2});
+        try writer.writeString(out);
+        try writer.write("\n");

-        // TODO: it would be cool to have a 'toUTF8Into' which can write directly into 'ipc_data.outgoing.list'
-        const str = out.toUTF8(bun.default_allocator);
-        defer str.deinit();
-
-        const slice = str.slice();
-
-        try writer.ensureUnusedCapacity(1 + slice.len + 1);
-
-        writer.writeAssumeCapacity(&.{2});
-        writer.writeAssumeCapacity(slice);
-        writer.writeAssumeCapacity("\n");
-
-        return 1 + slice.len + 1;
+        return writer.list.items.len - initial;
    }
 };

@@ -307,7 +292,7 @@ pub fn getVersionPacket(mode: Mode) []const u8 {

 /// Given a writer interface, serialize and write a value.
 /// Returns true if the value was written, false if it was not.
-pub fn serialize(data: *IPCData, writer: anytype, global: *JSC.JSGlobalObject, value: JSValue) !usize {
+pub fn serialize(data: *IPCData, writer: *bun.io.StreamBuffer, global: *JSC.JSGlobalObject, value: JSValue) !usize {
    return switch (data.mode) {
        inline else => |t| @field(@This(), @tagName(t)).serialize(data, writer, global, value),
    };
@@ -315,7 +300,7 @@ pub fn serialize(data: *IPCData, writer: anytype, global: *JSC.JSGlobalObject, v

 /// Given a writer interface, serialize and write a value.
 /// Returns true if the value was written, false if it was not.
-pub fn serializeInternal(data: *IPCData, writer: anytype, global: *JSC.JSGlobalObject, value: JSValue) !usize {
+pub fn serializeInternal(data: *IPCData, writer: *bun.io.StreamBuffer, global: *JSC.JSGlobalObject, value: JSValue) !usize {
    return switch (data.mode) {
        inline else => |t| @field(@This(), @tagName(t)).serializeInternal(data, writer, global, value),
    };
--- a/src/io/PipeWriter.zig
+++ b/src/io/PipeWriter.zig
@@ -1053,6 +1053,10 @@ pub const StreamBuffer = struct {
        return this.size() > 0;
    }

+    pub fn writeString(this: *StreamBuffer, str: bun.String) !void {
+        try str.writeUTF8Into(&this.list);
+    }
+
    pub fn write(this: *StreamBuffer, buffer: []const u8) !void {
        _ = try this.list.appendSlice(buffer);
    }
--- a/src/libarchive/libarchive.zig
+++ b/src/libarchive/libarchive.zig
@@ -358,11 +358,12 @@ pub const Archiver = struct {
                    if (comptime ContextType != void and @hasDecl(std.meta.Child(ContextType), "onFirstDirectoryName")) {
                        if (appender.needs_first_dirname) {
                            if (comptime Environment.isWindows) {
-                                const list = std.ArrayList(u8).init(default_allocator);
-                                var result = try strings.toUTF8ListWithType(list, []const u16, pathname[0..pathname.len]);
+                                var list = std.ArrayList(u8).init(default_allocator);
+                                defer list.deinit();
+                                try strings.toUTF8ListWithType(&list, []const u16, pathname[0..pathname.len]);
                                // onFirstDirectoryName copies the contents of pathname to another buffer, safe to free
-                                defer result.deinit();
-                                appender.onFirstDirectoryName(strings.withoutTrailingSlash(result.items));
+
+                                appender.onFirstDirectoryName(strings.withoutTrailingSlash(list.items));
                            } else {
                                appender.onFirstDirectoryName(strings.withoutTrailingSlash(bun.asByteSlice(pathname)));
                            }
--- a/src/shell/shell.zig
+++ b/src/shell/shell.zig
@@ -3061,26 +3061,7 @@ pub fn NewLexer(comptime encoding: StringEncoding) type {

        fn appendStringToStrPool(self: *@This(), bunstr: bun.String) !void {
            const start = self.strpool.items.len;
-            if (bunstr.isUTF16()) {
-                const utf16 = bunstr.utf16();
-                const additional = bun.simdutf.simdutf__utf8_length_from_utf16le(utf16.ptr, utf16.len);
-                try self.strpool.ensureUnusedCapacity(additional);
-                try bun.strings.convertUTF16ToUTF8Append(&self.strpool, bunstr.utf16());
-            } else if (bunstr.isUTF8()) {
-                try self.strpool.appendSlice(bunstr.byteSlice());
-            } else if (bunstr.is8Bit()) {
-                if (isAllAscii(bunstr.byteSlice())) {
-                    try self.strpool.appendSlice(bunstr.byteSlice());
-                } else {
-                    const bytes = bunstr.byteSlice();
-                    const non_ascii_idx = bun.strings.firstNonASCII(bytes) orelse 0;
-
-                    if (non_ascii_idx > 0) {
-                        try self.strpool.appendSlice(bytes[0..non_ascii_idx]);
-                    }
-                    self.strpool = try bun.strings.allocateLatin1IntoUTF8WithList(self.strpool, self.strpool.items.len, []const u8, bytes[non_ascii_idx..]);
-                }
-            }
+            try bunstr.writeUTF8Into(&self.strpool);
            const end = self.strpool.items.len;
            self.j += @intCast(end - start);
        }
@@ -3926,15 +3907,8 @@ pub const ShellSrcBuilder = struct {
                return true;
            }
        }
-        if (bunstr.isUTF16()) {
-            try this.appendUTF16Impl(bunstr.utf16());
-            return true;
-        }
-        if (bunstr.isUTF8() or bun.strings.isAllASCII(bunstr.byteSlice())) {
-            try this.appendUTF8Impl(bunstr.byteSlice());
-            return true;
-        }
-        try this.appendLatin1Impl(bunstr.byteSlice());
+
+        try bunstr.writeUTF8Into(this.outbuf);
        return true;
    }

@@ -3971,7 +3945,7 @@ pub const ShellSrcBuilder = struct {
            try this.appendUTF8Impl(latin1[0..non_ascii_idx]);
        }

-        this.outbuf.* = try bun.strings.allocateLatin1IntoUTF8WithList(this.outbuf.*, this.outbuf.items.len, []const u8, latin1);
+        try bun.strings.allocateLatin1IntoUTF8WithList(this.outbuf, this.outbuf.items.len, []const u8, latin1);
    }

    pub fn appendJSStrRef(this: *ShellSrcBuilder, bunstr: bun.String) bun.OOM!void {
--- a/src/string.zig
+++ b/src/string.zig
@@ -961,6 +961,23 @@ pub const String = extern struct {
        }
    }

+    pub fn writeUTF8Into(this: String, out: *std.ArrayList(u8)) !void {
+        if (this.isEmpty())
+            return;
+
+        if (this.isUTF8()) {
+            try out.appendSlice(this.utf8());
+            return;
+        }
+
+        if (this.is8Bit()) {
+            try bun.strings.allocateLatin1IntoUTF8WithList(out, out.items.len, []const u8, this.latin1());
+            return;
+        }
+
+        try bun.strings.toUTF8AppendToList(out, this.utf16());
+    }
+
    pub fn toUTF8(this: String, allocator: std.mem.Allocator) ZigString.Slice {
        if (this.tag == .WTFStringImpl) {
            return this.value.WTFStringImpl.toUTF8(allocator);
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -2079,33 +2079,18 @@ pub fn toPathMaybeDir(buf: []u8, utf8: []const u8, comptime add_trailing_lash: b
    return buf[0..len :0];
 }

-pub fn convertUTF16ToUTF8(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) !std.ArrayList(u8) {
-    var list = list_;
+pub fn convertUTF16ToUTF8(list: *std.ArrayList(u8), comptime Type: type, utf16: Type) !void {
    const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(
        utf16,
-        list.items.ptr[0..list.capacity],
+        list.items.ptr[list.items.len..list.capacity],
    );
    if (result.status == .surrogate) {
        // Slow path: there was invalid UTF-16, so we need to convert it without simdutf.
-        return toUTF8ListWithTypeBun(&list, Type, utf16, false);
+        list.* = try toUTF8ListWithTypeBun(list, Type, utf16, false);
+        return;
    }

-    list.items.len = result.count;
-    return list;
-}
-
-pub fn convertUTF16ToUTF8WithoutInvalidSurrogatePairs(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) !std.ArrayList(u8) {
-    var list = list_;
-    const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(
-        utf16,
-        list.items.ptr[0..list.capacity],
-    );
-    if (result.status == .surrogate) {
-        return error.SurrogatePair;
-    }
-
-    list.items.len = result.count;
-    return list;
+    list.items.len += result.count;
 }

 pub fn convertUTF16ToUTF8Append(list: *std.ArrayList(u8), utf16: []const u16) !void {
@@ -2123,40 +2108,30 @@ pub fn convertUTF16ToUTF8Append(list: *std.ArrayList(u8), utf16: []const u16) !v
    list.items.len += result.count;
 }

-pub fn toUTF8AllocWithTypeWithoutInvalidSurrogatePairs(allocator: std.mem.Allocator, comptime Type: type, utf16: Type) ![]u8 {
-    if (bun.FeatureFlags.use_simdutf and comptime Type == []const u16) {
-        const length = bun.simdutf.length.utf8.from.utf16.le(utf16);
-        // add 16 bytes of padding for SIMDUTF
-        var list = try std.ArrayList(u8).initCapacity(allocator, length + 16);
-        list = try convertUTF16ToUTF8(list, Type, utf16);
-        return list.items;
-    }
-
-    var list = try std.ArrayList(u8).initCapacity(allocator, utf16.len);
-    list = try toUTF8ListWithType(list, Type, utf16);
-    return list.items;
-}
+// These do the same thing.
+pub const toUTF8AllocWithTypeWithoutInvalidSurrogatePairs = toUTF8AllocWithType;

 pub fn toUTF8AllocWithType(allocator: std.mem.Allocator, comptime Type: type, utf16: Type) ![]u8 {
    if (bun.FeatureFlags.use_simdutf and comptime Type == []const u16) {
        const length = bun.simdutf.length.utf8.from.utf16.le(utf16);
        // add 16 bytes of padding for SIMDUTF
        var list = try std.ArrayList(u8).initCapacity(allocator, length + 16);
-        list = try convertUTF16ToUTF8(list, Type, utf16);
+        errdefer list.deinit();
+        try convertUTF16ToUTF8(&list, Type, utf16);
        return list.items;
    }

-    var list = try std.ArrayList(u8).initCapacity(allocator, utf16.len);
-    list = try toUTF8ListWithType(list, Type, utf16);
+    var list = std.ArrayList(u8).init(allocator);
+    errdefer list.deinit();
+    try toUTF8ListWithType(&list, Type, utf16);
    return list.items;
 }

-pub fn toUTF8ListWithType(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) !std.ArrayList(u8) {
+pub fn toUTF8ListWithType(list: *std.ArrayList(u8), comptime Type: type, utf16: Type) !void {
    if (bun.FeatureFlags.use_simdutf and comptime Type == []const u16) {
-        var list = list_;
        const length = bun.simdutf.length.utf8.from.utf16.le(utf16);
-        try list.ensureTotalCapacityPrecise(length + 16);
-        const buf = try convertUTF16ToUTF8(list, Type, utf16);
+        try list.ensureTotalCapacityPrecise(list.items.len + length + 16);
+        try convertUTF16ToUTF8(list, Type, utf16);

        // Commenting out because `convertUTF16ToUTF8` may convert to WTF-8
        // which uses 3 bytes for invalid surrogates, causing the length to not
@@ -2164,8 +2139,7 @@ pub fn toUTF8ListWithType(list_: std.ArrayList(u8), comptime Type: type, utf16:
        // if (Environment.allow_assert) {
        //     bun.unsafeAssert(buf.items.len == length);
        // }
-
-        return buf;
+        return;
    }

    @compileError("not implemented");
@@ -2187,8 +2161,10 @@ pub fn toUTF8FromLatin1(allocator: std.mem.Allocator, latin1: []const u8) !?std.
    if (isAllASCII(latin1))
        return null;

-    const list = try std.ArrayList(u8).initCapacity(allocator, latin1.len);
-    return try allocateLatin1IntoUTF8WithList(list, 0, []const u8, latin1);
+    var list = try std.ArrayList(u8).initCapacity(allocator, latin1.len);
+    errdefer list.deinit();
+    try allocateLatin1IntoUTF8WithList(&list, 0, []const u8, latin1);
+    return list;
 }

 pub fn toUTF8FromLatin1Z(allocator: std.mem.Allocator, latin1: []const u8) !?std.ArrayList(u8) {
@@ -2198,10 +2174,11 @@ pub fn toUTF8FromLatin1Z(allocator: std.mem.Allocator, latin1: []const u8) !?std
    if (isAllASCII(latin1))
        return null;

-    const list = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 1);
-    var list1 = try allocateLatin1IntoUTF8WithList(list, 0, []const u8, latin1);
-    try list1.append(0);
-    return list1;
+    var list = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 1);
+    errdefer list.deinit();
+    try allocateLatin1IntoUTF8WithList(&list, 0, []const u8, latin1);
+    try list.append(0);
+    return list;
 }

 pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf16: Type, comptime skip_trailing_replacement: bool) !(if (skip_trailing_replacement) ?u16 else std.ArrayList(u8)) {
@@ -2269,15 +2246,21 @@ pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, comptime Type: type,
        return out;
    }

-    const list = try std.ArrayList(u8).initCapacity(allocator, latin1_.len);
-    var foo = try allocateLatin1IntoUTF8WithList(list, 0, Type, latin1_);
-    return try foo.toOwnedSlice();
+    var list = try std.ArrayList(u8).initCapacity(allocator, latin1_.len);
+    errdefer list.deinit();
+    try allocateLatin1IntoUTF8WithList(&list, 0, Type, latin1_);
+
+    // Large reallocations are expensive and may cause more heap fragmentation.
+    if (list.items.len > 64 and list.items.len + 64 > list.capacity) {
+        return list.items;
+    }
+
+    return try list.toOwnedSlice();
 }

-pub fn allocateLatin1IntoUTF8WithList(list_: std.ArrayList(u8), offset_into_list: usize, comptime Type: type, latin1_: Type) !std.ArrayList(u8) {
+pub fn allocateLatin1IntoUTF8WithList(list: *std.ArrayList(u8), offset_into_list: usize, comptime Type: type, latin1_: Type) !void {
    var latin1 = latin1_;
    var i: usize = offset_into_list;
-    var list = list_;
    try list.ensureUnusedCapacity(latin1.len);

    while (latin1.len > 0) {
@@ -2388,8 +2371,6 @@ pub fn allocateLatin1IntoUTF8WithList(list_: std.ArrayList(u8), offset_into_list
    }

    log("Latin1 {d} -> UTF8 {d}", .{ latin1_.len, i });
-
-    return list;
 }

 pub const UTF16Replacement = struct {
--- a/test/js/bun/spawn/bun-ipc-child.js
+++ b/test/js/bun/spawn/bun-ipc-child.js
@@ -1 +1 @@
-process.send("hello");
+process.send(process.argv.at(-1));
--- a/test/js/bun/spawn/spawn.ipc.test.ts
+++ b/test/js/bun/spawn/spawn.ipc.test.ts
@@ -3,38 +3,49 @@ import { describe, expect, it } from "bun:test";
 import { bunExe, gcTick } from "harness";
 import path from "path";

-describe.each(["advanced", "json"])("ipc mode %s", mode => {
-  it("the subprocess should be defined and the child should send", done => {
-    gcTick();
-    const returned_subprocess = spawn([bunExe(), path.join(__dirname, "bun-ipc-child.js")], {
-      ipc: (message, subProcess) => {
-        expect(subProcess).toBe(returned_subprocess);
-        expect(message).toBe("hello");
-        subProcess.kill();
-        done();
+const messages = [
+  "ASCII",
+  // latin1
+  String.fromCharCode(...("Copyright " + String.fromCharCode(0x00a9) + " 2025").split("").map(a => a.charCodeAt(0))),
+  // UTF-16
+  "🌟 Hello from the emoji! ✨",
+];
+
+for (const message of messages) {
+  describe(JSON.stringify(message), () => {
+    describe.each(["advanced", "json"])("ipc mode %s", mode => {
+      it("the subprocess should be defined and the child should send", done => {
        gcTick();
-      },
-      stdio: ["inherit", "inherit", "inherit"],
-      serialization: mode,
+        const returned_subprocess = spawn([bunExe(), path.join(__dirname, "bun-ipc-child.js"), message], {
+          ipc: (reply, subProcess) => {
+            expect(subProcess).toBe(returned_subprocess);
+            expect(reply).toBe(message);
+            subProcess.kill();
+            done();
+            gcTick();
+          },
+          stdio: ["inherit", "inherit", "inherit"],
+          serialization: mode,
+        });
+      });
+
+      it("the subprocess should receive the parent message and respond back", done => {
+        gcTick();
+
+        const childProc = spawn([bunExe(), path.join(__dirname, "bun-ipc-child-respond.js")], {
+          ipc: (reply, subProcess) => {
+            expect(reply).toBe(`pong:${message}`);
+            subProcess.kill();
+            done();
+            gcTick();
+          },
+          stdio: ["inherit", "inherit", "inherit"],
+          serialization: mode,
+        });
+
+        childProc.send(message);
+        gcTick();
+      });
    });
  });
-
-  it("the subprocess should receive the parent message and respond back", done => {
-    gcTick();
-
-    const parentMessage = "I am your father";
-    const childProc = spawn([bunExe(), path.join(__dirname, "bun-ipc-child-respond.js")], {
-      ipc: (message, subProcess) => {
-        expect(message).toBe(`pong:${parentMessage}`);
-        subProcess.kill();
-        done();
-        gcTick();
-      },
-      stdio: ["inherit", "inherit", "inherit"],
-      serialization: mode,
-    });
-
-    childProc.send(parentMessage);
-    gcTick();
-  });
-});
+}
Author	SHA1	Message	Date
Jarred Sumner	534eeb85e7	Update string_immutable.zig	2025-01-11 18:31:42 -08:00
Jarred Sumner	8b3cf6f777	windows	2025-01-11 18:28:15 -08:00
Jarred Sumner	0f2309c507	Slightly reduce memory usage of IPC	2025-01-11 18:23:20 -08:00