mirror of
https://github.com/oven-sh/bun
synced 2026-02-14 12:51:54 +00:00
Fix copying UTF-16 -> UTF-8 sometimes causing invalid UTF-8 bytes (#20601)
This commit is contained in:
@@ -389,7 +389,18 @@ pub fn BabyList(comptime Type: type) type {
|
||||
const orig_len = list_.items.len;
|
||||
|
||||
const slice_ = list_.items.ptr[orig_len..list_.capacity];
|
||||
const result = strings.copyUTF16IntoUTF8WithBuffer(slice_, []const u16, remain, trimmed, out_len, true);
|
||||
const result = strings.copyUTF16IntoUTF8WithBufferImpl(
|
||||
slice_,
|
||||
[]const u16,
|
||||
remain,
|
||||
trimmed,
|
||||
out_len,
|
||||
// FIXME: Unclear whether or not we should allow
|
||||
// incomplete UTF-8 sequences. If you are solving a bug
|
||||
// with invalid UTF-8 sequences, this may be the
|
||||
// culprit...
|
||||
true,
|
||||
);
|
||||
remain = remain[result.read..];
|
||||
list_.items.len += @as(usize, result.written);
|
||||
if (result.read == 0 or result.written == 0) break;
|
||||
|
||||
@@ -79,7 +79,7 @@ pub const UTF8Fallback = struct {
|
||||
|
||||
if (stack_size >= str.len * 2) {
|
||||
var buf: [stack_size]u8 = undefined;
|
||||
const copied = bun.strings.copyUTF16IntoUTF8(&buf, []const u16, str, true);
|
||||
const copied = bun.strings.copyUTF16IntoUTF8Impl(&buf, []const u16, str, true);
|
||||
bun.assert(copied.written <= stack_size);
|
||||
bun.assert(copied.read <= stack_size);
|
||||
if (input.isDone()) {
|
||||
|
||||
@@ -51,7 +51,7 @@ pub export fn TextEncoder__encode16(
|
||||
|
||||
// max utf16 -> utf8 length
|
||||
if (slice.len <= buf.len / 4) {
|
||||
const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice, true);
|
||||
const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice);
|
||||
if (result.read == 0 or result.written == 0) {
|
||||
const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3);
|
||||
const array_buffer = uint8array.asArrayBuffer(globalThis).?;
|
||||
@@ -97,7 +97,7 @@ pub export fn c(
|
||||
|
||||
// max utf16 -> utf8 length
|
||||
if (slice.len <= buf.len / 4) {
|
||||
const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice, true);
|
||||
const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice);
|
||||
if (result.read == 0 or result.written == 0) {
|
||||
const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3);
|
||||
const array_buffer = uint8array.asArrayBuffer(globalThis).?;
|
||||
@@ -220,7 +220,7 @@ pub export fn TextEncoder__encodeInto16(
|
||||
) u64 {
|
||||
const output = buf_ptr[0..buf_len];
|
||||
const input = input_ptr[0..input_len];
|
||||
var result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, []const u16, input, false);
|
||||
var result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, []const u16, input);
|
||||
if (output.len >= 3 and (result.read == 0 or result.written == 0)) {
|
||||
const replacement_char = [_]u8{ 239, 191, 189 };
|
||||
@memcpy(buf_ptr[0..replacement_char.len], &replacement_char);
|
||||
|
||||
@@ -78,7 +78,7 @@ export fn Bun__encoding__toString(input: [*]const u8, len: usize, globalObject:
|
||||
}
|
||||
|
||||
// pub fn writeUTF16AsUTF8(utf16: [*]const u16, len: usize, to: [*]u8, to_len: usize) callconv(.C) i32 {
|
||||
// return @intCast(i32, strings.copyUTF16IntoUTF8(to[0..to_len], []const u16, utf16[0..len], true).written);
|
||||
// return @intCast(i32, strings.copyUTF16IntoUTF8(to[0..to_len], []const u16, utf16[0..len]).written);
|
||||
// }
|
||||
pub fn toString(input: []const u8, globalObject: *JSGlobalObject, encoding: Encoding) JSValue {
|
||||
return switch (encoding) {
|
||||
@@ -357,7 +357,12 @@ pub fn writeU16(input: [*]const u16, len: usize, to: [*]u8, to_len: usize, compt
|
||||
|
||||
switch (comptime encoding) {
|
||||
.utf8 => {
|
||||
return strings.copyUTF16IntoUTF8(to[0..to_len], []const u16, input[0..len], allow_partial_write).written;
|
||||
return strings.copyUTF16IntoUTF8Impl(
|
||||
to[0..to_len],
|
||||
[]const u16,
|
||||
input[0..len],
|
||||
allow_partial_write,
|
||||
).written;
|
||||
},
|
||||
.latin1, .ascii, .buffer => {
|
||||
const out = @min(len, to_len);
|
||||
|
||||
@@ -1274,7 +1274,7 @@ pub fn getFdPath(fd: FileDescriptor, buf: *bun.PathBuffer) ![]u8 {
|
||||
if (comptime Environment.isWindows) {
|
||||
var wide_buf: WPathBuffer = undefined;
|
||||
const wide_slice = try windows.GetFinalPathNameByHandle(fd.native(), .{}, wide_buf[0..]);
|
||||
const res = strings.copyUTF16IntoUTF8(buf[0..], @TypeOf(wide_slice), wide_slice, true);
|
||||
const res = strings.copyUTF16IntoUTF8(buf[0..], @TypeOf(wide_slice), wide_slice);
|
||||
return buf[0..res.written];
|
||||
}
|
||||
|
||||
|
||||
@@ -282,7 +282,7 @@ pub fn formatUTF16Type(comptime Slice: type, slice_: Slice, writer: anytype) !vo
|
||||
var slice = slice_;
|
||||
|
||||
while (slice.len > 0) {
|
||||
const result = strings.copyUTF16IntoUTF8(chunk, Slice, slice, true);
|
||||
const result = strings.copyUTF16IntoUTF8(chunk, Slice, slice);
|
||||
if (result.read == 0 or result.written == 0)
|
||||
break;
|
||||
try writer.writeAll(chunk[0..result.written]);
|
||||
@@ -308,7 +308,7 @@ pub fn formatUTF16TypeWithPathOptions(comptime Slice: type, slice_: Slice, write
|
||||
var slice = slice_;
|
||||
|
||||
while (slice.len > 0) {
|
||||
const result = strings.copyUTF16IntoUTF8(chunk, Slice, slice, true);
|
||||
const result = strings.copyUTF16IntoUTF8(chunk, Slice, slice);
|
||||
if (result.read == 0 or result.written == 0)
|
||||
break;
|
||||
|
||||
|
||||
@@ -1228,7 +1228,7 @@ const Copy = union(enum) {
|
||||
switch (this) {
|
||||
.utf16 => |utf16| {
|
||||
header.len = WebsocketHeader.packLength(content_byte_len);
|
||||
const encode_into_result = strings.copyUTF16IntoUTF8(to_mask, []const u16, utf16, true);
|
||||
const encode_into_result = strings.copyUTF16IntoUTF8Impl(to_mask, []const u16, utf16, true);
|
||||
bun.assert(@as(usize, encode_into_result.written) == content_byte_len);
|
||||
bun.assert(@as(usize, encode_into_result.read) == utf16.len);
|
||||
header.len = WebsocketHeader.packLength(encode_into_result.written);
|
||||
|
||||
@@ -1317,7 +1317,7 @@ pub const PackageInstall = struct {
|
||||
_ = node_fs_for_package_installer.mkdirRecursiveOSPathImpl(void, {}, fullpath, 0, false);
|
||||
}
|
||||
|
||||
const res = strings.copyUTF16IntoUTF8(dest_buf[0..], []const u16, wbuf[0..i], true);
|
||||
const res = strings.copyUTF16IntoUTF8(dest_buf[0..], []const u16, wbuf[0..i]);
|
||||
var offset: usize = res.written;
|
||||
if (dest_buf[offset - 1] != std.fs.path.sep_windows) {
|
||||
dest_buf[offset] = std.fs.path.sep_windows;
|
||||
|
||||
@@ -330,3 +330,11 @@ pub fn Tagged(comptime U: type, comptime T: type) type {
|
||||
info.decls = &.{};
|
||||
return @Type(.{ .@"union" = info });
|
||||
}
|
||||
|
||||
/// Returns the element type of `T` when `T` is a slice type; otherwise
/// returns `T` itself.
///
/// Only pointers whose size is `.slice` are unwrapped. Every other type,
/// including non-slice pointers, is returned unchanged.
pub fn SliceChild(comptime T: type) type {
    return switch (@typeInfo(T)) {
        .pointer => |ptr_info| if (ptr_info.size == .slice) ptr_info.child else T,
        else => T,
    };
}
|
||||
|
||||
@@ -379,7 +379,6 @@ pub const BufferedWriter = struct {
|
||||
this.remain()[0 .. bytes.len * 2],
|
||||
[]const u16,
|
||||
bytes,
|
||||
true,
|
||||
);
|
||||
this.context.list.items.len += @as(usize, decoded.written);
|
||||
return pending.len;
|
||||
@@ -393,7 +392,6 @@ pub const BufferedWriter = struct {
|
||||
this.remain()[0 .. bytes.len * 2],
|
||||
[]const u16,
|
||||
bytes,
|
||||
true,
|
||||
);
|
||||
this.pos += @as(usize, decoded.written);
|
||||
}
|
||||
|
||||
@@ -39,7 +39,7 @@ pub fn isWindowsAbsolutePathMissingDriveLetter(comptime T: type, chars: []const
|
||||
pub fn fromWPath(buf: []u8, utf16: []const u16) [:0]const u8 {
|
||||
bun.unsafeAssert(buf.len > 0);
|
||||
const to_copy = trimPrefixComptime(u16, utf16, bun.windows.long_path_prefix);
|
||||
const encode_into_result = copyUTF16IntoUTF8(buf[0 .. buf.len - 1], []const u16, to_copy, false);
|
||||
const encode_into_result = copyUTF16IntoUTF8(buf[0 .. buf.len - 1], []const u16, to_copy);
|
||||
bun.unsafeAssert(encode_into_result.written < buf.len);
|
||||
buf[encode_into_result.written] = 0;
|
||||
return buf[0..encode_into_result.written :0];
|
||||
|
||||
@@ -428,7 +428,9 @@ pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf1
|
||||
}
|
||||
|
||||
/// Progress counters for a UTF-16 -> UTF-8 copy: how much input was read
/// and how much output was written. Both counters default to zero.
pub const EncodeIntoResult = struct {
    /// Count of `u16` code units read from the UTF-16 input.
    read: u32 = 0,
    /// Count of `u8` bytes written to the UTF-8 output.
    written: u32 = 0,
};
|
||||
pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, comptime Type: type, latin1_: Type) ![]u8 {
|
||||
@@ -1679,7 +1681,15 @@ pub fn latin1ToCodepointBytesAssumeNotASCII16(char: u32) u16 {
|
||||
return latin1_to_utf16_conversion_table[@as(u8, @truncate(char))];
|
||||
}
|
||||
|
||||
pub fn copyUTF16IntoUTF8(buf: []u8, comptime Type: type, utf16: Type, comptime allow_partial_write: bool) EncodeIntoResult {
|
||||
/// Encode the UTF-16 input `utf16` as UTF-8 into `buf`.
///
/// Forwards to `copyUTF16IntoUTF8Impl` with truncated UTF-8 sequences
/// disabled. If `buf` is not big enough, not everything may be encoded.
pub fn copyUTF16IntoUTF8(buf: []u8, comptime Type: type, utf16: Type) EncodeIntoResult {
    const allow_truncated_utf8_sequence = false;
    return copyUTF16IntoUTF8Impl(buf, Type, utf16, allow_truncated_utf8_sequence);
}
|
||||
|
||||
/// See the comment on `copyUTF16IntoUTF8WithBufferImpl` for what `allow_truncated_utf8_sequence` does.
|
||||
pub fn copyUTF16IntoUTF8Impl(buf: []u8, comptime Type: type, utf16: Type, comptime allow_truncated_utf8_sequence: bool) EncodeIntoResult {
|
||||
if (comptime Type == []const u16) {
|
||||
if (bun.FeatureFlags.use_simdutf) {
|
||||
if (utf16.len == 0)
|
||||
@@ -1693,14 +1703,33 @@ pub fn copyUTF16IntoUTF8(buf: []u8, comptime Type: type, utf16: Type, comptime a
|
||||
else
|
||||
buf.len;
|
||||
|
||||
return copyUTF16IntoUTF8WithBuffer(buf, Type, utf16, trimmed, out_len, allow_partial_write);
|
||||
return copyUTF16IntoUTF8WithBufferImpl(buf, Type, utf16, trimmed, out_len, allow_truncated_utf8_sequence);
|
||||
}
|
||||
}
|
||||
|
||||
return copyUTF16IntoUTF8WithBuffer(buf, Type, utf16, utf16, utf16.len, allow_partial_write);
|
||||
return copyUTF16IntoUTF8WithBufferImpl(buf, Type, utf16, utf16, utf16.len, allow_truncated_utf8_sequence);
|
||||
}
|
||||
|
||||
pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type, trimmed: Type, out_len: usize, comptime allow_partial_write: bool) EncodeIntoResult {
|
||||
/// Forwards to `copyUTF16IntoUTF8WithBufferImpl` with truncated UTF-8
/// sequences disabled. `buf`, `utf16`, `trimmed` and `out_len` are passed
/// through unchanged.
pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type, trimmed: Type, out_len: usize) EncodeIntoResult {
    const allow_truncated_utf8_sequence = false;
    return copyUTF16IntoUTF8WithBufferImpl(
        buf,
        Type,
        utf16,
        trimmed,
        out_len,
        allow_truncated_utf8_sequence,
    );
}
|
||||
|
||||
/// Q: What does the `allow_truncated_utf8_sequence` parameter do?
|
||||
/// A: If the output buffer can't fit everything, this function will write
|
||||
/// incomplete utf-8 byte sequences if `allow_truncated_utf8_sequence` is
|
||||
/// enabled.
|
||||
///
|
||||
/// Q: Doesn't that mean this function would output invalid utf-8? Why would you
|
||||
/// ever want to do that?
|
||||
/// A: Yes. This is needed for writing a UTF-16 string to a node Buffer that
|
||||
/// doesn't have enough space for all the bytes:
|
||||
///
|
||||
/// ```js
|
||||
/// let buffer = Buffer.allocUnsafe(1);
|
||||
/// buffer.fill("\u0222");
|
||||
/// expect(buffer[0]).toBe(0xc8);
|
||||
/// ```
|
||||
pub fn copyUTF16IntoUTF8WithBufferImpl(buf: []u8, comptime Type: type, utf16: Type, trimmed: Type, out_len: usize, comptime allow_truncated_utf8_sequence: bool) EncodeIntoResult {
|
||||
var remaining = buf;
|
||||
var utf16_remaining = utf16;
|
||||
var ended_on_non_ascii = false;
|
||||
@@ -1734,9 +1763,10 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,
|
||||
const replacement = utf16CodepointWithFFFD(Type, utf16_remaining);
|
||||
|
||||
const width: usize = replacement.utf8Width();
|
||||
bun.assert(width > 1);
|
||||
if (width > remaining.len) {
|
||||
ended_on_non_ascii = width > 1;
|
||||
if (comptime allow_partial_write) switch (width) {
|
||||
if (comptime allow_truncated_utf8_sequence) switch (width) {
|
||||
2 => {
|
||||
if (remaining.len > 0) {
|
||||
//only first will be written
|
||||
|
||||
@@ -2247,7 +2247,9 @@ pub const copyU16IntoU8 = unicode.copyU16IntoU8;
|
||||
pub const copyU8IntoU16 = unicode.copyU8IntoU16;
|
||||
pub const copyU8IntoU16WithAlignment = unicode.copyU8IntoU16WithAlignment;
|
||||
pub const copyUTF16IntoUTF8 = unicode.copyUTF16IntoUTF8;
|
||||
pub const copyUTF16IntoUTF8Impl = unicode.copyUTF16IntoUTF8Impl;
|
||||
pub const copyUTF16IntoUTF8WithBuffer = unicode.copyUTF16IntoUTF8WithBuffer;
|
||||
pub const copyUTF16IntoUTF8WithBufferImpl = unicode.copyUTF16IntoUTF8WithBufferImpl;
|
||||
pub const decodeCheck = unicode.decodeCheck;
|
||||
pub const decodeWTF8RuneT = unicode.decodeWTF8RuneT;
|
||||
pub const decodeWTF8RuneTMultibyte = unicode.decodeWTF8RuneTMultibyte;
|
||||
|
||||
@@ -216,7 +216,7 @@ pub fn watchLoopCycle(this: *bun.Watcher) bun.JSC.Maybe(void) {
|
||||
const item_paths = this.watchlist.items(.file_path);
|
||||
log("number of watched items: {d}", .{item_paths.len});
|
||||
while (iter.next()) |event| {
|
||||
const convert_res = bun.strings.copyUTF16IntoUTF8(buf[base_idx..], []const u16, event.filename, false);
|
||||
const convert_res = bun.strings.copyUTF16IntoUTF8(buf[base_idx..], []const u16, event.filename);
|
||||
const eventpath = buf[0 .. base_idx + convert_res.written];
|
||||
|
||||
log("watcher update event: (filename: {s}, action: {s}", .{ eventpath, @tagName(event.action) });
|
||||
|
||||
2
test/js/web/console/console-log-utf16.fixture.js
Normal file
2
test/js/web/console/console-log-utf16.fixture.js
Normal file
@@ -0,0 +1,2 @@
|
||||
// Fixture: logs 10,000 copies of a non-ASCII phrase, one per line, in a
// single console.log call.
const phrase = "肉醬意大利粉";
const text = new Array(10000).fill(phrase).join("\n");
console.log(text);
|
||||
22
test/js/web/console/console-log-utf16.test.ts
Normal file
22
test/js/web/console/console-log-utf16.test.ts
Normal file
@@ -0,0 +1,22 @@
|
||||
import { expect, it } from "bun:test";
|
||||
import { bunEnv, bunExe } from "harness";
|
||||
import { join } from "node:path";
|
||||
|
||||
it("works with large utf-16 strings", async () => {
  // Use forward slashes so the fixture path has the same form on every platform.
  const fixturePath = join(import.meta.dir, "console-log-utf16.fixture.js").replaceAll("\\", "/");

  const child = Bun.spawn({
    cmd: [bunExe(), fixturePath],
    env: { ...bunEnv },
    stdio: ["inherit", "pipe", "pipe"],
  });

  const exitCode = await child.exited;
  const stdout = await new Response(child.stdout).text();
  const stderr = await new Response(child.stderr).text();

  // Nothing should be reported on stderr, and the child must exit cleanly.
  expect(stderr).toBeEmpty();
  expect(exitCode).toBe(0);

  // Mirrors what the fixture prints; `console.log` appends the trailing newline.
  const expected = Array(10000).fill("肉醬意大利粉").join("\n");
  expect(stdout).toBe(expected + "\n");
});
|
||||
Reference in New Issue
Block a user