diff --git a/src/baby_list.zig b/src/baby_list.zig index 4e95e7d9f2..f7a5272e5a 100644 --- a/src/baby_list.zig +++ b/src/baby_list.zig @@ -389,7 +389,18 @@ pub fn BabyList(comptime Type: type) type { const orig_len = list_.items.len; const slice_ = list_.items.ptr[orig_len..list_.capacity]; - const result = strings.copyUTF16IntoUTF8WithBuffer(slice_, []const u16, remain, trimmed, out_len, true); + const result = strings.copyUTF16IntoUTF8WithBufferImpl( + slice_, + []const u16, + remain, + trimmed, + out_len, + // FIXME: Unclear whether or not we should allow + // incomplete UTF-8 sequences. If you are solving a bug + // with invalid UTF-8 sequences, this may be the + // culprit... + true, + ); remain = remain[result.read..]; list_.items.len += @as(usize, result.written); if (result.read == 0 or result.written == 0) break; diff --git a/src/bun.js/webcore/Sink.zig b/src/bun.js/webcore/Sink.zig index e46bed8c48..c6081017f8 100644 --- a/src/bun.js/webcore/Sink.zig +++ b/src/bun.js/webcore/Sink.zig @@ -79,7 +79,7 @@ pub const UTF8Fallback = struct { if (stack_size >= str.len * 2) { var buf: [stack_size]u8 = undefined; - const copied = bun.strings.copyUTF16IntoUTF8(&buf, []const u16, str, true); + const copied = bun.strings.copyUTF16IntoUTF8Impl(&buf, []const u16, str, true); bun.assert(copied.written <= stack_size); bun.assert(copied.read <= stack_size); if (input.isDone()) { diff --git a/src/bun.js/webcore/TextEncoder.zig b/src/bun.js/webcore/TextEncoder.zig index e6845f3b88..caf57f14e8 100644 --- a/src/bun.js/webcore/TextEncoder.zig +++ b/src/bun.js/webcore/TextEncoder.zig @@ -51,7 +51,7 @@ pub export fn TextEncoder__encode16( // max utf16 -> utf8 length if (slice.len <= buf.len / 4) { - const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice, true); + const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice); if (result.read == 0 or result.written == 0) { const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3); const array_buffer = uint8array.asArrayBuffer(globalThis).?; @@ -97,7 +97,7 @@ pub export fn c( // max utf16 -> utf8 length if (slice.len <= buf.len / 4) { - const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice, true); + const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice); if (result.read == 0 or result.written == 0) { const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3); const array_buffer = uint8array.asArrayBuffer(globalThis).?; @@ -220,7 +220,7 @@ pub export fn TextEncoder__encodeInto16( ) u64 { const output = buf_ptr[0..buf_len]; const input = input_ptr[0..input_len]; - var result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, []const u16, input, false); + var result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, []const u16, input); if (output.len >= 3 and (result.read == 0 or result.written == 0)) { const replacement_char = [_]u8{ 239, 191, 189 }; @memcpy(buf_ptr[0..replacement_char.len], &replacement_char); diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig index a1fdca0158..8eda33a62f 100644 --- a/src/bun.js/webcore/encoding.zig +++ b/src/bun.js/webcore/encoding.zig @@ -78,7 +78,7 @@ export fn Bun__encoding__toString(input: [*]const u8, len: usize, globalObject: } // pub fn writeUTF16AsUTF8(utf16: [*]const u16, len: usize, to: [*]u8, to_len: usize) callconv(.C) i32 { -// return @intCast(i32, strings.copyUTF16IntoUTF8(to[0..to_len], []const u16, utf16[0..len], true).written); +// return @intCast(i32, strings.copyUTF16IntoUTF8(to[0..to_len], []const u16, utf16[0..len]).written); // } pub fn toString(input: []const u8, globalObject: *JSGlobalObject, encoding: Encoding) JSValue { return switch (encoding) { @@ -357,7 +357,12 @@ pub fn writeU16(input: [*]const u16, len: usize, to: [*]u8, to_len: usize, compt switch (comptime encoding) { .utf8 => { - return strings.copyUTF16IntoUTF8(to[0..to_len], []const u16, input[0..len], allow_partial_write).written; + return strings.copyUTF16IntoUTF8Impl( + to[0..to_len], + []const u16, + input[0..len], + allow_partial_write, + ).written; }, .latin1, .ascii, .buffer => { const out = @min(len, to_len); diff --git a/src/bun.zig b/src/bun.zig index 56d1213325..dc357b262a 100644 --- a/src/bun.zig +++ b/src/bun.zig @@ -1274,7 +1274,7 @@ pub fn getFdPath(fd: FileDescriptor, buf: *bun.PathBuffer) ![]u8 { if (comptime Environment.isWindows) { var wide_buf: WPathBuffer = undefined; const wide_slice = try windows.GetFinalPathNameByHandle(fd.native(), .{}, wide_buf[0..]); - const res = strings.copyUTF16IntoUTF8(buf[0..], @TypeOf(wide_slice), wide_slice, true); + const res = strings.copyUTF16IntoUTF8(buf[0..], @TypeOf(wide_slice), wide_slice); return buf[0..res.written]; } diff --git a/src/fmt.zig b/src/fmt.zig index a3540ad842..2ecd2ce4d0 100644 --- a/src/fmt.zig +++ b/src/fmt.zig @@ -282,7 +282,7 @@ pub fn formatUTF16Type(comptime Slice: type, slice_: Slice, writer: anytype) !vo var slice = slice_; while (slice.len > 0) { - const result = strings.copyUTF16IntoUTF8(chunk, Slice, slice, true); + const result = strings.copyUTF16IntoUTF8(chunk, Slice, slice); if (result.read == 0 or result.written == 0) break; try writer.writeAll(chunk[0..result.written]); @@ -308,7 +308,7 @@ pub fn formatUTF16TypeWithPathOptions(comptime Slice: type, slice_: Slice, write var slice = slice_; while (slice.len > 0) { - const result = strings.copyUTF16IntoUTF8(chunk, Slice, slice, true); + const result = strings.copyUTF16IntoUTF8(chunk, Slice, slice); if (result.read == 0 or result.written == 0) break; diff --git a/src/http/websocket_client.zig b/src/http/websocket_client.zig index 87c99ee22b..a9bb5cc2f8 100644 --- a/src/http/websocket_client.zig +++ b/src/http/websocket_client.zig @@ -1228,7 +1228,7 @@ const Copy = union(enum) { switch (this) { .utf16 => |utf16| { header.len = WebsocketHeader.packLength(content_byte_len); - const encode_into_result = strings.copyUTF16IntoUTF8(to_mask, []const u16, utf16, true); + const encode_into_result = strings.copyUTF16IntoUTF8Impl(to_mask, []const u16, utf16, true); bun.assert(@as(usize, encode_into_result.written) == content_byte_len); bun.assert(@as(usize, encode_into_result.read) == utf16.len); header.len = WebsocketHeader.packLength(encode_into_result.written); diff --git a/src/install/PackageInstall.zig b/src/install/PackageInstall.zig index 589205a197..9736414a83 100644 --- a/src/install/PackageInstall.zig +++ b/src/install/PackageInstall.zig @@ -1317,7 +1317,7 @@ pub const PackageInstall = struct { _ = node_fs_for_package_installer.mkdirRecursiveOSPathImpl(void, {}, fullpath, 0, false); } - const res = strings.copyUTF16IntoUTF8(dest_buf[0..], []const u16, wbuf[0..i], true); + const res = strings.copyUTF16IntoUTF8(dest_buf[0..], []const u16, wbuf[0..i]); var offset: usize = res.written; if (dest_buf[offset - 1] != std.fs.path.sep_windows) { dest_buf[offset] = std.fs.path.sep_windows; diff --git a/src/meta.zig b/src/meta.zig index 49d5b794dc..607124e246 100644 --- a/src/meta.zig +++ b/src/meta.zig @@ -330,3 +330,11 @@ pub fn Tagged(comptime U: type, comptime T: type) type { info.decls = &.{}; return @Type(.{ .@"union" = info }); } + +pub fn SliceChild(comptime T: type) type { + const tyinfo = @typeInfo(T); + if (tyinfo == .pointer and tyinfo.pointer.size == .slice) { + return tyinfo.pointer.child; + } + return T; +} diff --git a/src/string/MutableString.zig b/src/string/MutableString.zig index 0cdb26f374..e98c675349 100644 --- a/src/string/MutableString.zig +++ b/src/string/MutableString.zig @@ -379,7 +379,6 @@ pub const BufferedWriter = struct { this.remain()[0 .. bytes.len * 2], []const u16, bytes, - true, ); this.context.list.items.len += @as(usize, decoded.written); return pending.len; @@ -393,7 +392,6 @@ pub const BufferedWriter = struct { this.remain()[0 .. bytes.len * 2], []const u16, bytes, - true, ); this.pos += @as(usize, decoded.written); } diff --git a/src/string/paths.zig b/src/string/paths.zig index da6073f39b..07813f037e 100644 --- a/src/string/paths.zig +++ b/src/string/paths.zig @@ -39,7 +39,7 @@ pub fn isWindowsAbsolutePathMissingDriveLetter(comptime T: type, chars: []const pub fn fromWPath(buf: []u8, utf16: []const u16) [:0]const u8 { bun.unsafeAssert(buf.len > 0); const to_copy = trimPrefixComptime(u16, utf16, bun.windows.long_path_prefix); - const encode_into_result = copyUTF16IntoUTF8(buf[0 .. buf.len - 1], []const u16, to_copy, false); + const encode_into_result = copyUTF16IntoUTF8(buf[0 .. buf.len - 1], []const u16, to_copy); bun.unsafeAssert(encode_into_result.written < buf.len); buf[encode_into_result.written] = 0; return buf[0..encode_into_result.written :0]; diff --git a/src/string/unicode.zig b/src/string/unicode.zig index 0404ff020d..42baa91591 100644 --- a/src/string/unicode.zig +++ b/src/string/unicode.zig @@ -428,7 +428,9 @@ pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf1 } pub const EncodeIntoResult = struct { + /// The number of u16s we read from the utf-16 buffer read: u32 = 0, + /// The number of u8s we wrote to the utf-8 buffer written: u32 = 0, }; pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, comptime Type: type, latin1_: Type) ![]u8 { @@ -1679,7 +1681,15 @@ pub fn latin1ToCodepointBytesAssumeNotASCII16(char: u32) u16 { return latin1_to_utf16_conversion_table[@as(u8, @truncate(char))]; } -pub fn copyUTF16IntoUTF8(buf: []u8, comptime Type: type, utf16: Type, comptime allow_partial_write: bool) EncodeIntoResult { +/// Copy a UTF-16 string as UTF-8 into `buf` +/// +/// This may not encode everything if `buf` is not big enough. +pub fn copyUTF16IntoUTF8(buf: []u8, comptime Type: type, utf16: Type) EncodeIntoResult { + return copyUTF16IntoUTF8Impl(buf, Type, utf16, false); +} + +/// See comment on `copyUTF16IntoUTF8WithBufferImpl` on what `allow_truncated_utf8_sequence` should do +pub fn copyUTF16IntoUTF8Impl(buf: []u8, comptime Type: type, utf16: Type, comptime allow_truncated_utf8_sequence: bool) EncodeIntoResult { if (comptime Type == []const u16) { if (bun.FeatureFlags.use_simdutf) { if (utf16.len == 0) @@ -1693,14 +1703,33 @@ pub fn copyUTF16IntoUTF8(buf: []u8, comptime Type: type, utf16: Type, comptime a else buf.len; - return copyUTF16IntoUTF8WithBuffer(buf, Type, utf16, trimmed, out_len, allow_partial_write); + return copyUTF16IntoUTF8WithBufferImpl(buf, Type, utf16, trimmed, out_len, allow_truncated_utf8_sequence); } } - return copyUTF16IntoUTF8WithBuffer(buf, Type, utf16, utf16, utf16.len, allow_partial_write); + return copyUTF16IntoUTF8WithBufferImpl(buf, Type, utf16, utf16, utf16.len, allow_truncated_utf8_sequence); } -pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type, trimmed: Type, out_len: usize, comptime allow_partial_write: bool) EncodeIntoResult { +pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type, trimmed: Type, out_len: usize) EncodeIntoResult { + return copyUTF16IntoUTF8WithBufferImpl(buf, Type, utf16, trimmed, out_len, false); +} + +/// Q: What does the `allow_truncated_utf8_sequence` parameter do? +/// A: If the output buffer can't fit everything, this function will write +/// incomplete utf-8 byte sequences if `allow_truncated_utf8_sequence` is +/// enabled. +/// +/// Q: Doesn't that mean this function would output invalid utf-8? Why would you +/// ever want to do that? +/// A: Yes. This is needed for writing a UTF-16 string to a node Buffer that +/// doesn't have enough space for all the bytes: +/// +/// ```js +/// let buffer = Buffer.allocUnsafe(1); +/// buffer.fill("\u0222"); +/// expect(buffer[0]).toBe(0xc8); +/// ``` +pub fn copyUTF16IntoUTF8WithBufferImpl(buf: []u8, comptime Type: type, utf16: Type, trimmed: Type, out_len: usize, comptime allow_truncated_utf8_sequence: bool) EncodeIntoResult { var remaining = buf; var utf16_remaining = utf16; var ended_on_non_ascii = false; @@ -1734,9 +1763,10 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type, const replacement = utf16CodepointWithFFFD(Type, utf16_remaining); const width: usize = replacement.utf8Width(); + bun.assert(width > 1); if (width > remaining.len) { ended_on_non_ascii = width > 1; - if (comptime allow_partial_write) switch (width) { + if (comptime allow_truncated_utf8_sequence) switch (width) { 2 => { if (remaining.len > 0) { //only first will be written diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 63a1d8991a..7c4f6637f3 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -2247,7 +2247,9 @@ pub const copyU16IntoU8 = unicode.copyU16IntoU8; pub const copyU8IntoU16 = unicode.copyU8IntoU16; pub const copyU8IntoU16WithAlignment = unicode.copyU8IntoU16WithAlignment; pub const copyUTF16IntoUTF8 = unicode.copyUTF16IntoUTF8; +pub const copyUTF16IntoUTF8Impl = unicode.copyUTF16IntoUTF8Impl; pub const copyUTF16IntoUTF8WithBuffer = unicode.copyUTF16IntoUTF8WithBuffer; +pub const copyUTF16IntoUTF8WithBufferImpl = unicode.copyUTF16IntoUTF8WithBufferImpl; pub const decodeCheck = unicode.decodeCheck; pub const decodeWTF8RuneT = unicode.decodeWTF8RuneT; pub const decodeWTF8RuneTMultibyte = unicode.decodeWTF8RuneTMultibyte; diff --git a/src/watcher/WindowsWatcher.zig b/src/watcher/WindowsWatcher.zig index c53666468c..e9e88ec19e 100644 --- a/src/watcher/WindowsWatcher.zig +++ b/src/watcher/WindowsWatcher.zig @@ -216,7 +216,7 @@ pub fn watchLoopCycle(this: *bun.Watcher) bun.JSC.Maybe(void) { const item_paths = this.watchlist.items(.file_path); log("number of watched items: {d}", .{item_paths.len}); while (iter.next()) |event| { - const convert_res = bun.strings.copyUTF16IntoUTF8(buf[base_idx..], []const u16, event.filename, false); + const convert_res = bun.strings.copyUTF16IntoUTF8(buf[base_idx..], []const u16, event.filename); const eventpath = buf[0 .. base_idx + convert_res.written]; log("watcher update event: (filename: {s}, action: {s}", .{ eventpath, @tagName(event.action) }); diff --git a/test/js/web/console/console-log-utf16.fixture.js b/test/js/web/console/console-log-utf16.fixture.js new file mode 100644 index 0000000000..5a4b2a99ff --- /dev/null +++ b/test/js/web/console/console-log-utf16.fixture.js @@ -0,0 +1,2 @@ +const text = Array(10000).fill("肉醬意大利粉").join("\n"); +console.log(text); diff --git a/test/js/web/console/console-log-utf16.test.ts b/test/js/web/console/console-log-utf16.test.ts new file mode 100644 index 0000000000..cf1d42f0a6 --- /dev/null +++ b/test/js/web/console/console-log-utf16.test.ts @@ -0,0 +1,22 @@ +import { expect, it } from "bun:test"; +import { bunEnv, bunExe } from "harness"; +import { join } from "node:path"; + +it("works with large utf-16 strings", async () => { + const filepath = join(import.meta.dir, "console-log-utf16.fixture.js").replaceAll("\\", "/"); + const proc = Bun.spawn({ + cmd: [bunExe(), filepath], + env: { ...bunEnv }, + stdio: ["inherit", "pipe", "pipe"], + }); + + const exitCode = await proc.exited; + const stdout = await new Response(proc.stdout).text(); + const stderr = await new Response(proc.stderr).text(); + expect(stderr).toBeEmpty(); + expect(exitCode).toBe(0); + + const expected = Array(10000).fill("肉醬意大利粉").join("\n"); + // Add the \n because `console.log` adds a newline + expect(stdout).toBe(expected + "\n"); +});