diff --git a/src/bun.js/webcore/blob.zig b/src/bun.js/webcore/blob.zig
index 455a515008..130079118d 100644
--- a/src/bun.js/webcore/blob.zig
+++ b/src/bun.js/webcore/blob.zig
@@ -3620,7 +3620,7 @@ pub const Blob = struct {
         if (could_be_all_ascii == null or !could_be_all_ascii.?) {
             // if toUTF16Alloc returns null, it means there are no non-ASCII characters
             // instead of erroring, invalid characters will become a U+FFFD replacement character
-            if (strings.toUTF16Alloc(bun.default_allocator, buf, false) catch unreachable) |external| {
+            if (strings.toUTF16AllocAllowBOM(bun.default_allocator, buf, false, true) catch unreachable) |external| {
                 if (lifetime != .temporary)
                     this.setIsASCIIFlag(false);
 
@@ -3702,7 +3702,7 @@ pub const Blob = struct {
             var stack_fallback = std.heap.stackFallback(4096, bun.default_allocator);
             const allocator = stack_fallback.get();
             // if toUTF16Alloc returns null, it means there are no non-ASCII characters
-            if (strings.toUTF16Alloc(allocator, buf, false) catch null) |external| {
+            if (strings.toUTF16AllocAllowBOM(allocator, buf, false, true) catch null) |external| {
                 if (comptime lifetime != .temporary) this.setIsASCIIFlag(false);
                 const result = ZigString.init16(external).toJSONObject(global);
                 allocator.free(external);
@@ -4331,7 +4331,7 @@ pub const InternalBlob = struct {
     was_string: bool = false,
 
     pub fn toStringOwned(this: *@This(), globalThis: *JSC.JSGlobalObject) JSValue {
-        if (strings.toUTF16Alloc(globalThis.allocator(), this.bytes.items, false) catch &[_]u16{}) |out| {
+        if (strings.toUTF16AllocAllowBOM(globalThis.allocator(), this.bytes.items, false, true) catch &[_]u16{}) |out| {
             const return_value = ZigString.toExternalU16(out.ptr, out.len, globalThis);
             return_value.ensureStillAlive();
             this.deinit();
@@ -4344,7 +4344,7 @@ pub const InternalBlob = struct {
     }
 
     pub fn toJSON(this: *@This(), globalThis: *JSC.JSGlobalObject) JSValue {
-        const str_bytes = ZigString.init(this.bytes.items).withEncoding();
+        const str_bytes = ZigString.init(strings.withoutUTF8BOM(this.bytes.items)).withEncoding();
         const json = str_bytes.toJSONObject(globalThis);
         this.deinit();
         return json;
diff --git a/src/js/builtins/ReadableStreamInternals.ts b/src/js/builtins/ReadableStreamInternals.ts
index eee2c41864..82c60f15e0 100644
--- a/src/js/builtins/ReadableStreamInternals.ts
+++ b/src/js/builtins/ReadableStreamInternals.ts
@@ -1070,11 +1070,15 @@ export function createTextStream(highWaterMark) {
       }
 
       if (hasString && !hasBuffer) {
+        if (rope.charCodeAt(0) === 0xfeff) {
+          rope = rope.slice(1);
+        }
+
         return rope;
       }
 
       if (hasBuffer && !hasString) {
-        return new globalThis.TextDecoder().decode(Bun.concatArrayBuffers(array));
+        return new globalThis.TextDecoder("utf-8", { ignoreBOM: true }).decode(Bun.concatArrayBuffers(array));
       }
 
       // worst case: mixed content
@@ -1089,12 +1093,16 @@ export function createTextStream(highWaterMark) {
       }
       array.length = 0;
       if (rope.length > 0) {
+        if (rope.charCodeAt(0) === 0xfeff) {
+          rope = rope.slice(1);
+        }
+
         arrayBufferSink.write(rope);
         rope = "";
       }
 
       // TODO: use builtin
-      return new globalThis.TextDecoder().decode(arrayBufferSink.end());
+      return new globalThis.TextDecoder("utf-8", { ignoreBOM: true }).decode(arrayBufferSink.end());
     },
 
     close() {
@@ -1678,13 +1686,25 @@ export function readableStreamIntoArray(stream) {
   return processManyResult(manyResult);
 }
 
+// Drops a leading U+FEFF (byte order mark) from an already-decoded string.
+// A string that does not start with U+FEFF is returned unchanged.
+export function withoutUTF8BOM(result) {
+  if (result.charCodeAt(0) === 0xfeff) {
+    return result.slice(1);
+  }
+
+  return result;
+}
+
 export function readableStreamIntoText(stream) {
   const [textStream, closer] = $createTextStream($getByIdDirectPrivate(stream, "highWaterMark"));
   const prom = $readStreamIntoSink(stream, textStream, false);
+
   if (prom && $isPromise(prom)) {
-    return Promise.$resolve(prom).$then(closer.promise);
+    return Promise.$resolve(prom).$then(closer.promise).$then($withoutUTF8BOM);
   }
-  return closer.promise;
+
+  return closer.promise.$then($withoutUTF8BOM);
 }
 
 export function readableStreamToArrayBufferDirect(stream, underlyingSource) {
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index ab45364bbb..5600055955 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -1311,11 +1311,52 @@ pub fn copyLatin1IntoASCII(dest: []u8, src: []const u8) void {
     }
 }
 
+/// The UTF-8 encoding of the byte order mark (U+FEFF).
+const utf8_bom = [_]u8{ 0xef, 0xbb, 0xbf };
+
+/// Same as `toUTF16AllocAllowBOM` with `allow_bom` set to false, so a leading
+/// UTF-8 BOM is not skipped.
+pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 {
+    return toUTF16AllocAllowBOM(allocator, bytes, fail_if_invalid, false);
+}
+
+/// Returns `bytes` without a leading UTF-8 byte order mark (EF BB BF).
+/// Input that is exactly the BOM yields an empty slice. The result is a
+/// sub-slice of `bytes`; nothing is allocated or copied.
+pub fn withoutUTF8BOM(bytes: []const u8) []const u8 {
+    // `>=` rather than `>`: a buffer that consists of nothing but the BOM
+    // has to come back empty as well.
+    if (bytes.len >= 3 and strings.eqlComptime(bytes[0..3], utf8_bom)) {
+        return bytes[3..];
+    } else {
+        return bytes;
+    }
+}
+
 /// Convert a UTF-8 string to a UTF-16 string IF there are any non-ascii characters
 /// If there are no non-ascii characters, this returns null
 /// This is intended to be used for strings that go to JavaScript
-pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 {
+///
+/// When `allow_bom` is true and the input is longer than the BOM itself, a
+/// leading UTF-8 BOM is skipped before conversion.
+pub fn toUTF16AllocAllowBOM(allocator: std.mem.Allocator, bytes_: []const u8, comptime fail_if_invalid: bool, comptime allow_bom: bool) !?[]u16 {
+    var bytes = bytes_;
     if (strings.firstNonASCII(bytes)) |i| {
+        if (comptime allow_bom) {
+            // we could avoid the allocation here when it's otherwise ASCII. But
+            // it gets really complicated because most memory allocators need
+            // the head pointer to be the allocated one so if we instead return
+            // a non-head pointer and try to free that the allocator might not
+            // be able to free it, and we would have a big problem.
+            //
+            // NOTE(review): a BOM-only input (len == 3) is left untouched
+            // here; confirm the conversion below accepts an empty slice
+            // before relaxing this to `>=`.
+            if (i == 0 and bytes.len > 3 and strings.eqlComptime(bytes[0..3], utf8_bom)) {
+                bytes = bytes[3..];
+            }
+        }
+
         const output_: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: {
             const trimmed = bun.simdutf.trim.utf8(bytes);
 
diff --git a/src/url.zig b/src/url.zig
index 4704aec87a..4d8ffc4f44 100644
--- a/src/url.zig
+++ b/src/url.zig
@@ -973,7 +973,7 @@ pub const FormData = struct {
     pub fn toJS(globalThis: *JSC.JSGlobalObject, input: []const u8, encoding: Encoding) !JSC.JSValue {
         switch (encoding) {
             .URLEncoded => {
-                var str = JSC.ZigString.fromUTF8(input);
+                var str = JSC.ZigString.fromUTF8(strings.withoutUTF8BOM(input));
                 return JSC.DOMFormData.createFromURLQuery(globalThis, &str);
             },
             .Multipart => |boundary| return toJSFromMultipartData(globalThis, input, boundary),
@@ -1104,7 +1104,16 @@ pub const FormData = struct {
 
                     wrap.form.appendBlob(wrap.globalThis, &key, &blob, &filename);
                 } else {
-                    var value = JSC.ZigString.initUTF8(value_str);
+                    var value = JSC.ZigString.initUTF8(
+                        // > Each part whose `Content-Disposition` header does not
+                        // > contain a `filename` parameter must be parsed into an
+                        // > entry whose value is the UTF-8 decoded without BOM
+                        // > content of the part. This is done regardless of the
+                        // > presence or the value of a `Content-Type` header and
+                        // > regardless of the presence or the value of a
+                        // > `charset` parameter.
+                        strings.withoutUTF8BOM(value_str),
+                    );
                     wrap.form.append(&key, &value);
                 }
             }
diff --git a/test/js/web/fetch/utf8-bom.test.ts b/test/js/web/fetch/utf8-bom.test.ts
new file mode 100644
index 0000000000..2a0003fb3e
--- /dev/null
+++ b/test/js/web/fetch/utf8-bom.test.ts
@@ -0,0 +1,114 @@
+import { describe, expect, it, test } from "bun:test";
+
+describe("UTF-8 BOM should be ignored", () => {
+  describe("Blob", () => {
+    it("in text()", async () => {
+      const blob = new Blob(["\uFEFFHello, World!"], { type: "text/plain" });
+      expect(await blob.text()).toBe("Hello, World!");
+    });
+
+    it("in json()", async () => {
+      const blob = new Blob(['\uFEFF{"hello":"World"}'], { type: "application/json" });
+      expect(await blob.json()).toEqual({ "hello": "World" } as any);
+    });
+
+    it("in formData()", async () => {
+      const blob = new Blob(["\uFEFFhello=world"], { type: "application/x-www-form-urlencoded" });
+      const formData = await blob.formData();
+      expect(formData.get("hello")).toBe("world");
+    });
+  });
+
+  describe("Response", () => {
+    it("in text()", async () => {
+      const response = new Response(Buffer.from("\uFEFFHello, World!"), { headers: { "content-type": "text/plain" } });
+      expect(await response.text()).toBe("Hello, World!");
+    });
+
+    it("in json()", async () => {
+      const response = new Response(Buffer.from('\uFEFF{"hello":"World"}'), {
+        headers: { "content-type": "application/json" },
+      });
+      expect(await response.json()).toEqual({ "hello": "World" } as any);
+    });
+
+    it("in formData()", async () => {
+      const response = new Response(Buffer.from("\uFEFFhello=world"), {
+        headers: { "content-type": "application/x-www-form-urlencoded" },
+      });
+      const formData = await response.formData();
+      expect(formData.get("hello")).toBe("world");
+    });
+  });
+
+  describe("Request", () => {
+    it("in text()", async () => {
+      const request = new Request("https://example.com", {
+        body: Buffer.from("\uFEFFHello, World!"),
+        headers: { "content-type": "text/plain" },
+      });
+      expect(await request.text()).toBe("Hello, World!");
+    });
+
+    it("in json()", async () => {
+      const request = new Request("https://example.com", {
+        body: Buffer.from('\uFEFF{"hello":"World"}'),
+        headers: { "content-type": "application/json" },
+      });
+      expect(await request.json()).toEqual({ "hello": "World" } as any);
+    });
+
+    it("in formData()", async () => {
+      const request = new Request("https://example.com", {
+        body: Buffer.from("\uFEFFhello=world"),
+        headers: { "content-type": "application/x-www-form-urlencoded" },
+      });
+      const formData = await request.formData();
+      expect(formData.get("hello")).toBe("world");
+    });
+  });
+
+  describe("readable stream", () => {
+    it("in Bun.readableStreamToText()", async () => {
+      const stream = new ReadableStream({
+        start(controller) {
+          controller.enqueue(Buffer.from("\uFEFFHello, World!"));
+          controller.close();
+        },
+      });
+      expect(await Bun.readableStreamToText(stream)).toBe("Hello, World!");
+    });
+
+    it("in Bun.readableStreamToJSON()", async () => {
+      const stream = new ReadableStream({
+        start(controller) {
+          controller.enqueue(Buffer.from('\uFEFF{"hello":"World"}'));
+          controller.close();
+        },
+      });
+      expect(await Bun.readableStreamToJSON(stream)).toEqual({ "hello": "World" } as any);
+    });
+
+    it("in Bun.readableStreamToFormData()", async () => {
+      const stream = new ReadableStream({
+        start(controller) {
+          controller.enqueue(Buffer.from("\uFEFFhello=world"));
+          controller.close();
+        },
+      });
+      const formData = await Bun.readableStreamToFormData(stream);
+      expect(formData.get("hello")).toBe("world");
+    });
+
+    it("in Bun.readableStreamToBlob()", async () => {
+      const stream = new ReadableStream({
+        start(controller) {
+          controller.enqueue(Buffer.from("\uFEFFHello, World!"));
+          controller.close();
+        },
+      });
+      const blob = await Bun.readableStreamToBlob(stream);
+      expect(await blob.text()).toBe("Hello, World!");
+    });
+  });
+});