diff --git a/src/bun.js/webcore/blob.zig b/src/bun.js/webcore/blob.zig index a3e175057e..a32001a1f2 100644 --- a/src/bun.js/webcore/blob.zig +++ b/src/bun.js/webcore/blob.zig @@ -3815,15 +3815,17 @@ pub const Blob = struct { return this.store != null and this.store.?.data == .file; } - pub fn toStringWithBytes(this: *Blob, global: *JSGlobalObject, buf: []const u8, comptime lifetime: Lifetime) JSValue { + pub fn toStringWithBytes(this: *Blob, global: *JSGlobalObject, buf_: []const u8, comptime lifetime: Lifetime) JSValue { // null == unknown // false == can't be const could_be_all_ascii = this.is_all_ascii orelse this.store.?.is_all_ascii; + const buf = strings.withoutUTF8BOM(buf_); + if (could_be_all_ascii == null or !could_be_all_ascii.?) { // if toUTF16Alloc returns null, it means there are no non-ASCII characters // instead of erroring, invalid characters will become a U+FFFD replacement character - if (strings.toUTF16AllocAllowBOM(bun.default_allocator, buf, false, true) catch unreachable) |external| { + if (strings.toUTF16Alloc(bun.default_allocator, buf, false) catch unreachable) |external| { if (lifetime != .temporary) this.setIsASCIIFlag(false); @@ -3850,21 +3852,36 @@ pub const Blob = struct { // we don't need to clone .clone => { this.store.?.ref(); + // we don't need to worry about UTF-8 BOM in this case because the store owns the memory. return ZigString.init(buf).external(global, this.store.?, Store.external); }, .transfer => { var store = this.store.?; std.debug.assert(store.data == .bytes); this.transfer(); + // we don't need to worry about UTF-8 BOM in this case because the store owns the memory. 
return ZigString.init(buf).external(global, store, Store.external); }, // strings are immutable // sharing isn't really a thing .share => { this.store.?.ref(); + // we don't need to worry about UTF-8 BOM in this case because the store owns the memory. return ZigString.init(buf).external(global, this.store.?, Store.external); }, .temporary => { + // if there was a UTF-8 BOM, we need to clone the buffer because + // external doesn't support this case here yet. + if (buf.len != buf_.len) { + var out = bun.String.createLatin1(buf); + defer { + bun.default_allocator.free(buf_); + out.deref(); + } + + return out.toJS(global); + } + return ZigString.init(buf).toExternalValue(global); }, } @@ -3894,7 +3911,8 @@ pub const Blob = struct { return toJSONWithBytes(this, global, view_, lifetime); } - pub fn toJSONWithBytes(this: *Blob, global: *JSGlobalObject, buf: []const u8, comptime lifetime: Lifetime) JSValue { + pub fn toJSONWithBytes(this: *Blob, global: *JSGlobalObject, buf_: []const u8, comptime lifetime: Lifetime) JSValue { + const buf = strings.withoutUTF8BOM(buf_); if (buf.len == 0) return global.createSyntaxErrorInstance("Unexpected end of JSON input", .{}); // null == unknown // false == can't be @@ -3905,7 +3923,7 @@ pub const Blob = struct { var stack_fallback = std.heap.stackFallback(4096, bun.default_allocator); const allocator = stack_fallback.get(); // if toUTF16Alloc returns null, it means there are no non-ASCII characters - if (strings.toUTF16AllocAllowBOM(allocator, buf, false, true) catch null) |external| { + if (strings.toUTF16Alloc(allocator, buf, false) catch null) |external| { if (comptime lifetime != .temporary) this.setIsASCIIFlag(false); const result = ZigString.init16(external).toJSONObject(global); allocator.free(external); @@ -4534,11 +4552,19 @@ pub const InternalBlob = struct { was_string: bool = false, pub fn toStringOwned(this: *@This(), globalThis: *JSC.JSGlobalObject) JSValue { - if (strings.toUTF16AllocAllowBOM(globalThis.allocator(),
this.bytes.items, false, true) catch &[_]u16{}) |out| { + const bytes_without_bom = strings.withoutUTF8BOM(this.bytes.items); + if (strings.toUTF16Alloc(globalThis.allocator(), bytes_without_bom, false) catch &[_]u16{}) |out| { const return_value = ZigString.toExternalU16(out.ptr, out.len, globalThis); return_value.ensureStillAlive(); this.deinit(); return return_value; + } else if + // If there was a UTF8 BOM, we clone it + (bytes_without_bom.len != this.bytes.items.len) { + defer this.deinit(); + var out = bun.String.createLatin1(this.bytes.items[3..]); + defer out.deref(); + return out.toJS(globalThis); } else { var str = ZigString.init(this.toOwnedSlice()); str.mark(); diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 47812c3484..f878614801 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -1313,13 +1313,9 @@ pub fn copyLatin1IntoASCII(dest: []u8, src: []const u8) void { const utf8_bom = [_]u8{ 0xef, 0xbb, 0xbf }; -pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 { - return toUTF16AllocAllowBOM(allocator, bytes, fail_if_invalid, false); -} - pub fn withoutUTF8BOM(bytes: []const u8) []const u8 { - if (bytes.len > 3 and strings.eqlComptime(bytes[0..3], utf8_bom)) { - return bytes[3..]; + if (strings.hasPrefixComptime(bytes, utf8_bom)) { + return bytes[utf8_bom.len..]; } else { return bytes; } @@ -1328,20 +1324,8 @@ pub fn withoutUTF8BOM(bytes: []const u8) []const u8 { /// Convert a UTF-8 string to a UTF-16 string IF there are any non-ascii characters /// If there are no non-ascii characters, this returns null /// This is intended to be used for strings that go to JavaScript -pub fn toUTF16AllocAllowBOM(allocator: std.mem.Allocator, bytes_: []const u8, comptime fail_if_invalid: bool, comptime allow_bom: bool) !?[]u16 { - var bytes = bytes_; +pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 { if 
(strings.firstNonASCII(bytes)) |i| { - if (comptime allow_bom) { - // we could avoid the allocation here when it's otherwise ASCII. But - // it gets really complicated because most memory allocators need - // the head pointer to be the allocated one so if we instead return - // a non-head pointer and try to free that the allocator might not - // be able to free it, and we would have a big problem. - if (i == 0 and bytes.len > 3 and strings.eqlComptime(bytes[0..3], utf8_bom)) { - bytes = bytes[3..]; - } - } - const output_: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: { const trimmed = bun.simdutf.trim.utf8(bytes); diff --git a/test/js/web/fetch/utf8-bom.test.ts b/test/js/web/fetch/utf8-bom.test.ts index 2a0003fb3e..5dd57fd399 100644 --- a/test/js/web/fetch/utf8-bom.test.ts +++ b/test/js/web/fetch/utf8-bom.test.ts @@ -1,7 +1,39 @@ import { describe, expect, it, test } from "bun:test"; describe("UTF-8 BOM should be ignored", () => { + test("handles empty strings", async () => { + const blob = new Response(new Blob([Buffer.from([0xef, 0xbb, 0xbf])])); + + expect(await blob.text()).toHaveLength(0); + expect(async () => await blob.json()).toThrow(); + }); + + test("handles UTF8 BOM + emoji", async () => { + const blob = new Response(new Blob([Buffer.from([0xef, 0xbb, 0xbf]), Buffer.from("🌎")])); + + expect(await blob.text()).toHaveLength(2); + expect(async () => await blob.json()).toThrow(); + }); + describe("Blob", () => { + describe("with emoji", () => { + it("in text()", async () => { + const blob = new Blob(["\uFEFFHello, World! 🌎"], { type: "text/plain" }); + expect(await blob.text()).toBe("Hello, World! 
🌎"); + }); + + it("in json()", async () => { + const blob = new Blob(['\uFEFF{"hello":"World 🌎"}'], { type: "application/json" }); + expect(await blob.json()).toStrictEqual({ "hello": "World 🌎" } as any); + }); + + it("in formData()", async () => { + const blob = new Blob(["\uFEFFhello=world 🌎"], { type: "application/x-www-form-urlencoded" }); + const formData = await blob.formData(); + expect(formData.get("hello")).toBe("world 🌎"); + }); + }); + it("in text()", async () => { const blob = new Blob(["\uFEFFHello, World!"], { type: "text/plain" }); expect(await blob.text()).toBe("Hello, World!");