From 6303af3ce0534499c8d6923a5c1869f76b1b0892 Mon Sep 17 00:00:00 2001 From: Dylan Conway <35280289+dylan-conway@users.noreply.github.com> Date: Fri, 2 Aug 2024 23:01:34 -0700 Subject: [PATCH] fix(TextDecoder): decoding sequences starting with 192 or 193 (#13043) --- src/bun.js/webcore/encoding.zig | 3 --- src/string_immutable.zig | 15 +++++++++++++-- test/js/web/encoding/text-decoder.test.js | 9 +++++++++ 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig index b1bb409f9d..fa2cca8d98 100644 --- a/src/bun.js/webcore/encoding.zig +++ b/src/bun.js/webcore/encoding.zig @@ -40,8 +40,6 @@ const picohttp = bun.picohttp; pub const TextEncoder = struct { filler: u32 = 0, - const utf8_string: string = "utf-8"; - pub export fn TextEncoder__encode8( globalThis: *JSGlobalObject, ptr: [*]const u8, @@ -451,7 +449,6 @@ pub const TextDecoder = struct { return JSC.JSValue.jsBoolean(this.fatal); } - const utf8_string: string = "utf-8"; pub fn getEncoding( this: *TextDecoder, globalThis: *JSC.JSGlobalObject, diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 46bff268e9..5552cae6a7 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -1279,6 +1279,17 @@ pub fn withoutUTF8BOM(bytes: []const u8) []const u8 { } } +// https://github.com/WebKit/WebKit/blob/443e796d1538654c34f2690e39600c70c8052b63/Source/WebCore/PAL/pal/text/TextCodecUTF8.cpp#L69 +pub fn nonASCIISequenceLength(first_byte: u8) u3 { + return switch (first_byte) { + 0...193 => 0, + 194...223 => 2, + 224...239 => 3, + 240...244 => 4, + 245...255 => 0, + }; +} + /// Convert a UTF-8 string to a UTF-16 string IF there are any non-ascii characters /// If there are no non-ascii characters, this returns null /// This is intended to be used for strings that go to JavaScript @@ -2076,9 +2087,9 @@ pub const UTF16Replacement = struct { }; // This variation matches WebKit behavior. -pub fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement { +fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement { if (comptime Environment.allow_assert) assert(sequence[0] > 127); - const len = wtf8ByteSequenceLengthWithInvalid(sequence[0]); + const len = nonASCIISequenceLength(sequence[0]); switch (len) { 2 => { if (comptime Environment.allow_assert) { diff --git a/test/js/web/encoding/text-decoder.test.js b/test/js/web/encoding/text-decoder.test.js index 1113d46dd1..10b29557cf 100644 --- a/test/js/web/encoding/text-decoder.test.js +++ b/test/js/web/encoding/text-decoder.test.js @@ -310,3 +310,12 @@ it("truncated sequences", () => { assert_equals(new TextDecoder().decode(new Uint8Array([0xf0, 0x41, 0xf0])), "\uFFFDA\uFFFD"); assert_equals(new TextDecoder().decode(new Uint8Array([0xf0, 0x8f, 0x92])), "\uFFFD\uFFFD\uFFFD"); }); + +it.each([ + [0xc0, 0x80], // 192 + [0xc1, 0x80], // 193 +])(`should handle %d`, (...input) => { + const decoder = new TextDecoder(); + const output = decoder.decode(Uint8Array.from(input)); + expect(output).toBe("\uFFFD\uFFFD"); +});