diff --git a/src/bun.js/webcore/TextDecoder.zig b/src/bun.js/webcore/TextDecoder.zig
index 397d15a7b9..715b3d246a 100644
--- a/src/bun.js/webcore/TextDecoder.zig
+++ b/src/bun.js/webcore/TextDecoder.zig
@@ -201,11 +201,11 @@ fn decodeSlice(this: *TextDecoder, globalThis: *jsc.JSGlobalObject, buffer_slice
             // It's unintuitive that we encode Latin1 as UTF16 even though the engine natively supports Latin1 strings...
             // However, this is also what WebKit seems to do.
             //
-            // It's not clear why we couldn't jusst use Latin1 here, but tests failures proved it necessary.
-            const out_length = strings.elementLengthLatin1IntoUTF16([]const u8, buffer_slice);
+            // => This conversion is needed because TextDecoder's "latin1" label actually means CP1252 (windows-1252), whereas a WebKit Latin1 string stores each byte as the UTF-16 code unit of the same value.
+            const out_length = strings.elementLengthCP1252IntoUTF16([]const u8, buffer_slice);
             const bytes = try bun.default_allocator.alloc(u16, out_length);
 
-            const out = strings.copyLatin1IntoUTF16([]u16, bytes, []const u8, buffer_slice);
+            const out = strings.copyCP1252IntoUTF16([]u16, bytes, []const u8, buffer_slice);
             return ZigString.toExternalU16(bytes.ptr, out.written, globalThis);
         },
         EncodingLabel.@"UTF-8" => {
diff --git a/src/js_printer.zig b/src/js_printer.zig
index afdf531890..0b6faa2198 100644
--- a/src/js_printer.zig
+++ b/src/js_printer.zig
@@ -175,10 +175,7 @@ pub fn writePreQuotedString(text_in: []const u8, comptime Writer: type, writer:
                 std.debug.assert(text[i] <= 0x7F);
                 break :brk text[i];
             },
-            .latin1 => brk: {
-                if (text[i] <= 0x7F) break :brk text[i];
-                break :brk strings.latin1ToCodepointAssumeNotASCII(text[i], i32);
-            },
+            .latin1 => text[i],
             .utf16 => brk: {
                 // TODO: if this is a part of a surrogate pair, we could parse the whole codepoint in order
                 // to emit it as a single \u{result} rather than two paired \uLOW\uHIGH.
diff --git a/src/string/immutable.zig b/src/string/immutable.zig
index c96db67bf1..475e490446 100644
--- a/src/string/immutable.zig
+++ b/src/string/immutable.zig
@@ -2239,6 +2239,7 @@ pub const convertUTF8toUTF16InBuffer = unicode.convertUTF8toUTF16InBuffer;
 pub const convertUTF8toUTF16InBufferZ = unicode.convertUTF8toUTF16InBufferZ;
 pub const copyLatin1IntoASCII = unicode.copyLatin1IntoASCII;
 pub const copyLatin1IntoUTF16 = unicode.copyLatin1IntoUTF16;
+pub const copyCP1252IntoUTF16 = unicode.copyCP1252IntoUTF16;
 pub const copyLatin1IntoUTF8 = unicode.copyLatin1IntoUTF8;
 pub const copyLatin1IntoUTF8StopOnNonASCII = unicode.copyLatin1IntoUTF8StopOnNonASCII;
 pub const copyU16IntoU8 = unicode.copyU16IntoU8;
@@ -2251,7 +2252,7 @@ pub const copyUTF16IntoUTF8WithBufferImpl = unicode.copyUTF16IntoUTF8WithBufferI
 pub const decodeCheck = unicode.decodeCheck;
 pub const decodeWTF8RuneT = unicode.decodeWTF8RuneT;
 pub const decodeWTF8RuneTMultibyte = unicode.decodeWTF8RuneTMultibyte;
-pub const elementLengthLatin1IntoUTF16 = unicode.elementLengthLatin1IntoUTF16;
+pub const elementLengthCP1252IntoUTF16 = unicode.elementLengthCP1252IntoUTF16;
 pub const elementLengthLatin1IntoUTF8 = unicode.elementLengthLatin1IntoUTF8;
 pub const elementLengthUTF16IntoUTF8 = unicode.elementLengthUTF16IntoUTF8;
 pub const elementLengthUTF8IntoUTF16 = unicode.elementLengthUTF8IntoUTF16;
@@ -2262,9 +2263,9 @@ pub const eqlUtf16 = unicode.eqlUtf16;
 pub const isAllASCII = unicode.isAllASCII;
 pub const isValidUTF8 = unicode.isValidUTF8;
 pub const isValidUTF8WithoutSIMD = unicode.isValidUTF8WithoutSIMD;
-pub const latin1ToCodepointAssumeNotASCII = unicode.latin1ToCodepointAssumeNotASCII;
+pub const cp1252ToCodepointAssumeNotASCII = unicode.cp1252ToCodepointAssumeNotASCII;
 pub const latin1ToCodepointBytesAssumeNotASCII = unicode.latin1ToCodepointBytesAssumeNotASCII;
-pub const latin1ToCodepointBytesAssumeNotASCII16 = unicode.latin1ToCodepointBytesAssumeNotASCII16;
+pub const cp1252ToCodepointBytesAssumeNotASCII16 = unicode.cp1252ToCodepointBytesAssumeNotASCII16;
 pub const literal = unicode.literal;
 pub const nonASCIISequenceLength = unicode.nonASCIISequenceLength;
 pub const replaceLatin1WithUTF8 = unicode.replaceLatin1WithUTF8;
diff --git a/src/string/immutable/unicode.zig b/src/string/immutable/unicode.zig
index ea8492b0e1..06a92236f7 100644
--- a/src/string/immutable/unicode.zig
+++ b/src/string/immutable/unicode.zig
@@ -839,7 +839,7 @@ pub fn elementLengthLatin1IntoUTF8(slice: []const u8) usize {
     return bun.simdutf.length.utf8.from.latin1(slice);
 }
 
-pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult {
+pub fn copyCP1252IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult {
     var buf = buf_;
     var latin1 = latin1_;
     while (buf.len > 0 and latin1.len > 0) {
@@ -853,7 +853,7 @@ pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: t
         latin1 = latin1[to_write..];
         buf = buf[to_write..];
         if (latin1.len > 0 and buf.len >= 1) {
-            buf[0] = latin1ToCodepointBytesAssumeNotASCII16(latin1[0]);
+            buf[0] = cp1252ToCodepointBytesAssumeNotASCII16(latin1[0]);
             latin1 = latin1[1..];
             buf = buf[1..];
         }
@@ -865,13 +865,16 @@ pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: t
     };
 }
 
-pub fn elementLengthLatin1IntoUTF16(comptime Type: type, latin1_: Type) usize {
-    // latin1 is always at most 1 UTF-16 code unit long
-    if (comptime std.meta.Child([]const u16) == Type) {
-        return latin1_.len;
-    }
+/// Widens each Latin-1 byte into one UTF-16 code unit of the same value, stopping at the shorter of the two slices.
+pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult {
+    const len = @min(buf_.len, latin1_.len);
+    for (buf_[0..len], latin1_[0..len]) |*out, in| out.* = in;
+    return .{ .read = @as(u32, @truncate(len)), .written = @as(u32, @truncate(len)) };
+}
 
-    return bun.simdutf.length.utf16.from.latin1(latin1_);
+pub fn elementLengthCP1252IntoUTF16(comptime Type: type, cp1252_: Type) usize {
+    // Every CP1252 byte converts to exactly one UTF-16 code unit, so the lengths are equal.
+    return cp1252_.len;
 }
 
 pub fn eqlUtf16(comptime self: string, other: []const u16) bool {
@@ -1629,14 +1632,14 @@ pub fn convertUTF16toUTF8InBuffer(
     return buf[0..result];
 }
 
-pub fn latin1ToCodepointAssumeNotASCII(char: u8, comptime CodePointType: type) CodePointType {
+pub fn cp1252ToCodepointAssumeNotASCII(char: u8, comptime CodePointType: type) CodePointType {
     return @as(
         CodePointType,
-        @intCast(latin1ToCodepointBytesAssumeNotASCII16(char)),
+        @intCast(cp1252ToCodepointBytesAssumeNotASCII16(char)),
     );
 }
 
-const latin1_to_utf16_conversion_table = [256]u16{
+const cp1252_to_utf16_conversion_table = [256]u16{
     0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, // 00-07
     0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F
     0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, // 10-17
@@ -1677,8 +1680,8 @@ pub fn latin1ToCodepointBytesAssumeNotASCII(char: u32) [2]u8 {
     return bytes[0..2].*;
 }
 
-pub fn latin1ToCodepointBytesAssumeNotASCII16(char: u32) u16 {
-    return latin1_to_utf16_conversion_table[@as(u8, @truncate(char))];
+pub fn cp1252ToCodepointBytesAssumeNotASCII16(char: u32) u16 {
+    return cp1252_to_utf16_conversion_table[@as(u8, @truncate(char))];
 }
 
 /// Copy a UTF-16 string as UTF-8 into `buf`
diff --git a/test/js/node/buffer-utf16.test.ts b/test/js/node/buffer-utf16.test.ts
new file mode 100644
index 0000000000..2504d052e4
--- /dev/null
+++ b/test/js/node/buffer-utf16.test.ts
@@ -0,0 +1,34 @@
+import { expect, test } from "bun:test";
+
+test("utf16-le buffer", () => {
+  const twoByteString = new Array(16)
+    .fill(0)
+    .map((_, i) =>
+      Buffer.from(
+        new Array(16)
+          .fill(0)
+          .map((_, j) => String.fromCharCode(i * 16 + j))
+          .join(""),
+        "utf-16le",
+      ).toString("hex"),
+    )
+    .join("\n");
+  expect(twoByteString).toEqual(
+    `00000100020003000400050006000700080009000a000b000c000d000e000f00
+10001100120013001400150016001700180019001a001b001c001d001e001f00
+20002100220023002400250026002700280029002a002b002c002d002e002f00
+30003100320033003400350036003700380039003a003b003c003d003e003f00
+40004100420043004400450046004700480049004a004b004c004d004e004f00
+50005100520053005400550056005700580059005a005b005c005d005e005f00
+60006100620063006400650066006700680069006a006b006c006d006e006f00
+70007100720073007400750076007700780079007a007b007c007d007e007f00
+80008100820083008400850086008700880089008a008b008c008d008e008f00
+90009100920093009400950096009700980099009a009b009c009d009e009f00
+a000a100a200a300a400a500a600a700a800a900aa00ab00ac00ad00ae00af00
+b000b100b200b300b400b500b600b700b800b900ba00bb00bc00bd00be00bf00
+c000c100c200c300c400c500c600c700c800c900ca00cb00cc00cd00ce00cf00
+d000d100d200d300d400d500d600d700d800d900da00db00dc00dd00de00df00
+e000e100e200e300e400e500e600e700e800e900ea00eb00ec00ed00ee00ef00
+f000f100f200f300f400f500f600f700f800f900fa00fb00fc00fd00fe00ff00`,
+  );
+});