From 6303af3ce0534499c8d6923a5c1869f76b1b0892 Mon Sep 17 00:00:00 2001
From: Dylan Conway <35280289+dylan-conway@users.noreply.github.com>
Date: Fri, 2 Aug 2024 23:01:34 -0700
Subject: [PATCH] fix(TextDecoder): decoding sequences starting with 192 or 193
 (#13043)

---
 src/bun.js/webcore/encoding.zig           |  3 ---
 src/string_immutable.zig                  | 15 +++++++++++++--
 test/js/web/encoding/text-decoder.test.js |  9 +++++++++
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig
index b1bb409f9d..fa2cca8d98 100644
--- a/src/bun.js/webcore/encoding.zig
+++ b/src/bun.js/webcore/encoding.zig
@@ -40,8 +40,6 @@ const picohttp = bun.picohttp;
 pub const TextEncoder = struct {
     filler: u32 = 0,
 
-    const utf8_string: string = "utf-8";
-
     pub export fn TextEncoder__encode8(
         globalThis: *JSGlobalObject,
         ptr: [*]const u8,
@@ -451,7 +449,6 @@ pub const TextDecoder = struct {
         return JSC.JSValue.jsBoolean(this.fatal);
     }
 
-    const utf8_string: string = "utf-8";
     pub fn getEncoding(
         this: *TextDecoder,
         globalThis: *JSC.JSGlobalObject,
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 46bff268e9..5552cae6a7 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -1279,6 +1279,17 @@ pub fn withoutUTF8BOM(bytes: []const u8) []const u8 {
     }
 }
 
+// https://github.com/WebKit/WebKit/blob/443e796d1538654c34f2690e39600c70c8052b63/Source/WebCore/PAL/pal/text/TextCodecUTF8.cpp#L69
+pub fn nonASCIISequenceLength(first_byte: u8) u3 {
+    // Only lead bytes 0xC2...0xF4 map to a sequence length; every other byte (ASCII included) yields 0.
+    return switch (first_byte) {
+        0xC2...0xDF => 2,
+        0xE0...0xEF => 3,
+        0xF0...0xF4 => 4,
+        else => 0,
+    };
+}
+
 /// Convert a UTF-8 string to a UTF-16 string IF there are any non-ascii characters
 /// If there are no non-ascii characters, this returns null
 /// This is intended to be used for strings that go to JavaScript
@@ -2076,9 +2087,9 @@ pub const UTF16Replacement = struct {
 };
 
 // This variation matches WebKit behavior.
-pub fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
+fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
     if (comptime Environment.allow_assert) assert(sequence[0] > 127);
-    const len = wtf8ByteSequenceLengthWithInvalid(sequence[0]);
+    const len = nonASCIISequenceLength(sequence[0]);
     switch (len) {
         2 => {
             if (comptime Environment.allow_assert) {
diff --git a/test/js/web/encoding/text-decoder.test.js b/test/js/web/encoding/text-decoder.test.js
index 1113d46dd1..10b29557cf 100644
--- a/test/js/web/encoding/text-decoder.test.js
+++ b/test/js/web/encoding/text-decoder.test.js
@@ -310,3 +310,12 @@ it("truncated sequences", () => {
   assert_equals(new TextDecoder().decode(new Uint8Array([0xf0, 0x41, 0xf0])), "\uFFFDA\uFFFD");
   assert_equals(new TextDecoder().decode(new Uint8Array([0xf0, 0x8f, 0x92])), "\uFFFD\uFFFD\uFFFD");
 });
+
+it.each([
+  [0xc0, 0x80], // lead byte 192
+  [0xc1, 0x80], // lead byte 193
+])(`should handle %d`, (...bytes) => {
+  // Expected result is one U+FFFD replacement character per input byte.
+  const decoded = new TextDecoder().decode(Uint8Array.from(bytes));
+  expect(decoded).toBe("\uFFFD\uFFFD");
+});