diff --git a/src/bun.js/WebKit b/src/bun.js/WebKit index 9e975e808a..8522750525 160000 --- a/src/bun.js/WebKit +++ b/src/bun.js/WebKit @@ -1 +1 @@ -Subproject commit 9e975e808ab32043ae6c7927cdb51de4501b9f73 +Subproject commit 8522750525f55198095a0ac3e70e4ccdf9a240b5 diff --git a/src/bun.js/api/bun.zig b/src/bun.js/api/bun.zig index a1e6a23818..b33c852805 100644 --- a/src/bun.js/api/bun.zig +++ b/src/bun.js/api/bun.zig @@ -3146,7 +3146,7 @@ const TOMLObject = struct { var out = bun.String.fromUTF8(slice); defer out.deref(); - return out.toJSForParseJSON(globalThis); + return out.toJSByParseJSON(globalThis); } }; diff --git a/src/bun.js/webcore/blob.zig b/src/bun.js/webcore/blob.zig index 58d024776c..26db6dcb5c 100644 --- a/src/bun.js/webcore/blob.zig +++ b/src/bun.js/webcore/blob.zig @@ -4445,13 +4445,23 @@ pub const Blob = struct { return this.store != null and this.store.?.data == .file; } - pub fn toStringWithBytes(this: *Blob, global: *JSGlobalObject, buf_: []const u8, comptime lifetime: Lifetime) JSValue { + pub fn toStringWithBytes(this: *Blob, global: *JSGlobalObject, raw_bytes: []const u8, comptime lifetime: Lifetime) JSValue { + const bom, const buf = strings.BOM.detectAndSplit(raw_bytes); + + if (buf.len == 0) { + return ZigString.Empty.toValue(global); + } + + if (bom == .utf16_le) { + var out = bun.String.createUTF16(bun.reinterpretSlice(u16, buf)); + defer out.deref(); + return out.toJS(global); + } + // null == unknown // false == can't be const could_be_all_ascii = this.is_all_ascii orelse this.store.?.is_all_ascii; - const buf = strings.withoutUTF8BOM(buf_); - if (could_be_all_ascii == null or !could_be_all_ascii.?) { // if toUTF16Alloc returns null, it means there are no non-ASCII characters // instead of erroring, invalid characters will become a U+FFFD replacement character @@ -4473,10 +4483,6 @@ pub const Blob = struct { if (lifetime != .temporary) this.setIsASCIIFlag(true); } - if (buf.len == 0) { - return ZigString.Empty.toValue(global); - } - switch (comptime lifetime) { // strings are immutable // we don't need to clone @@ -4502,10 +4508,10 @@ pub const Blob = struct { .temporary => { // if there was a UTF-8 BOM, we need to clone the buffer because // external doesn't support this case here yet. - if (buf.len != buf_.len) { + if (buf.len != raw_bytes.len) { var out = bun.String.createLatin1(buf); defer { - bun.default_allocator.free(buf_); + bun.default_allocator.free(raw_bytes); out.deref(); } @@ -4541,9 +4547,15 @@ pub const Blob = struct { return toJSONWithBytes(this, global, view_, lifetime); } - pub fn toJSONWithBytes(this: *Blob, global: *JSGlobalObject, buf_: []const u8, comptime lifetime: Lifetime) JSValue { - const buf = strings.withoutUTF8BOM(buf_); + pub fn toJSONWithBytes(this: *Blob, global: *JSGlobalObject, raw_bytes: []const u8, comptime lifetime: Lifetime) JSValue { + const bom, const buf = strings.BOM.detectAndSplit(raw_bytes); if (buf.len == 0) return global.createSyntaxErrorInstance("Unexpected end of JSON input", .{}); + + if (bom == .utf16_le) { + var out = bun.String.createUTF16(bun.reinterpretSlice(u16, buf)); + defer out.deref(); + return out.toJSByParseJSON(global); + } // null == unknown // false == can't be const could_be_all_ascii = this.is_all_ascii orelse this.store.?.is_all_ascii; @@ -5028,7 +5040,7 @@ pub const AnyBlob = union(enum) { return JSValue.jsNull(); } - return str.toJSForParseJSON(global); + return str.toJSByParseJSON(global); }, } } diff --git a/src/bun.zig b/src/bun.zig index 05f75c8bcb..1224489cd9 100644 --- a/src/bun.zig +++ b/src/bun.zig @@ -2434,3 +2434,16 @@ pub const brotli = @import("./brotli.zig"); pub fn iterateDir(dir: std.fs.Dir) DirIterator.Iterator { return DirIterator.iterate(dir, .u8).iter; } + +fn ReinterpretSliceType(comptime T: type, comptime slice: type) type { + const is_const = @typeInfo(slice).Pointer.is_const; + return if (is_const) []const T else []T; +} + +/// Zig has a todo for @ptrCast changing the `.len`. This is the workaround +pub fn reinterpretSlice(comptime T: type, slice: anytype) ReinterpretSliceType(T, @TypeOf(slice)) { + const is_const = @typeInfo(@TypeOf(slice)).Pointer.is_const; + const bytes = std.mem.sliceAsBytes(slice); + const new_ptr = @as(if (is_const) [*]const T else [*]T, @ptrCast(@alignCast(bytes.ptr))); + return new_ptr[0..@divTrunc(bytes.len, @sizeOf(T))]; +} diff --git a/src/fs.zig b/src/fs.zig index d5305d9526..6cfc55bacf 100644 --- a/src/fs.zig +++ b/src/fs.zig @@ -1174,6 +1174,11 @@ pub const FileSystem = struct { if (shared_buffer.list.capacity > file_contents.len) { file_contents.ptr[file_contents.len] = 0; } + + if (strings.BOM.detect(file_contents)) |bom| { + debug("Convert {s} BOM", .{@tagName(bom)}); + file_contents = try bom.removeAndConvertToUTF8WithoutDealloc(allocator, &shared_buffer.list); + } } else { // We use pread to ensure if the file handle was open, it doesn't seek from the last position var buf = try allocator.alloc(u8, size + 1); @@ -1187,6 +1192,11 @@ pub const FileSystem = struct { }; file_contents = buf[0..read_count]; debug("pread({d}, {d}) = {d}", .{ file.handle, size, read_count }); + + if (strings.BOM.detect(file_contents)) |bom| { + debug("Convert {s} BOM", .{@tagName(bom)}); + file_contents = try bom.removeAndConvertToUTF8AndFree(allocator, file_contents); + } } return PathContentsPair{ .path = Path.init(path), .contents = file_contents }; diff --git a/src/string.zig b/src/string.zig index f12f5e25d0..65f56b9347 100644 --- a/src/string.zig +++ b/src/string.zig @@ -711,7 +711,7 @@ pub const String = extern struct { this: *String, ) JSC.JSValue; - pub fn toJSForParseJSON(self: *String, globalObject: *JSC.JSGlobalObject) JSC.JSValue { + pub fn toJSByParseJSON(self: *String, globalObject: *JSC.JSGlobalObject) JSC.JSValue { JSC.markBinding(@src()); return BunString__toJSON(globalObject, self); } diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 0169c7b1cf..5824765d72 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -1027,7 +1027,7 @@ pub fn eqlUtf16(comptime self: string, other: []const u16) bool { return bun.C.memcmp(bun.cast([*]const u8, self.ptr), bun.cast([*]const u8, other.ptr), self.len * @sizeOf(u16)) == 0; } -pub fn toUTF8Alloc(allocator: std.mem.Allocator, js: []const u16) !string { +pub fn toUTF8Alloc(allocator: std.mem.Allocator, js: []const u16) ![]u8 { return try toUTF8AllocWithType(allocator, []const u16, js); } @@ -1197,11 +1197,116 @@ pub fn copyLatin1IntoASCII(dest: []u8, src: []const u8) void { } } -const utf8_bom = [_]u8{ 0xef, 0xbb, 0xbf }; +/// It is common on Windows to find files that are not encoded in UTF8. Most of these include +/// a 'byte-order mark' codepoint at the start of the file. The layout of this codepoint can +/// determine the encoding. +/// +/// https://en.wikipedia.org/wiki/Byte_order_mark +pub const BOM = enum { + utf8, + utf16_le, + utf16_be, + utf32_le, + utf32_be, + pub const utf8_bytes = [_]u8{ 0xef, 0xbb, 0xbf }; + pub const utf16_le_bytes = [_]u8{ 0xff, 0xfe }; + pub const utf16_be_bytes = [_]u8{ 0xfe, 0xff }; + pub const utf32_le_bytes = [_]u8{ 0xff, 0xfe, 0x00, 0x00 }; + pub const utf32_be_bytes = [_]u8{ 0x00, 0x00, 0xfe, 0xff }; + + pub fn detect(bytes: []const u8) ?BOM { + if (bytes.len < 3) return null; + if (eqlComptimeIgnoreLen(bytes, utf8_bytes)) return .utf8; + if (eqlComptimeIgnoreLen(bytes, utf16_le_bytes)) { + // if (bytes.len > 4 and eqlComptimeIgnoreLen(bytes[2..], utf32_le_bytes[2..])) + // return .utf32_le; + return .utf16_le; + } + // if (eqlComptimeIgnoreLen(bytes, utf16_be_bytes)) return .utf16_be; + // if (bytes.len > 4 and eqlComptimeIgnoreLen(bytes, utf32_le_bytes)) return .utf32_le; + return null; + } + + pub fn detectAndSplit(bytes: []const u8) struct { ?BOM, []const u8 } { + const bom = detect(bytes); + if (bom == null) return .{ null, bytes }; + return .{ bom, bytes[bom.?.length()..] }; + } + + pub fn getHeader(bom: BOM) []const u8 { + return switch (bom) { + inline else => |t| comptime &@field(BOM, @tagName(t) ++ "_bytes"), + }; + } + + pub fn length(bom: BOM) usize { + return switch (bom) { + inline else => |t| comptime (&@field(BOM, @tagName(t) ++ "_bytes")).len, + }; + } + + /// If an allocation is needed, free the input and the caller will + /// replace it with the new return + pub fn removeAndConvertToUTF8AndFree(bom: BOM, allocator: std.mem.Allocator, bytes: []u8) ![]u8 { + switch (bom) { + .utf8 => { + bun.C.memmove(bytes.ptr, bytes.ptr + utf8_bytes.len, bytes.len - utf8_bytes.len); + return bytes[0 .. bytes.len - utf8_bytes.len]; + }, + .utf16_le => { + const trimmed_bytes = bytes[utf16_le_bytes.len..]; + const trimmed_bytes_u16: []const u16 = @alignCast(std.mem.bytesAsSlice(u16, trimmed_bytes)); + const out = try toUTF8Alloc(allocator, trimmed_bytes_u16); + allocator.free(bytes); + return out; + }, + else => { + // TODO: this needs to re-encode, for now we just remove the BOM + const bom_bytes = bom.getHeader(); + bun.C.memmove(bytes.ptr, bytes.ptr + bom_bytes.len, bytes.len - bom_bytes.len); + return bytes[0 .. bytes.len - bom_bytes.len]; + }, + } + } + + /// This is required for fs.zig's `use_shared_buffer` flag. we cannot free that pointer. + /// The returned slice will always point to the base of the input. + /// + /// Requires an arraylist in case it must be grown. + pub fn removeAndConvertToUTF8WithoutDealloc(bom: BOM, allocator: std.mem.Allocator, list: *std.ArrayListUnmanaged(u8)) ![]u8 { + const bytes = list.items; + switch (bom) { + .utf8 => { + bun.C.memmove(bytes.ptr, bytes.ptr + utf8_bytes.len, bytes.len - utf8_bytes.len); + return bytes[0 .. bytes.len - utf8_bytes.len]; + }, + .utf16_le => { + const trimmed_bytes = bytes[utf16_le_bytes.len..]; + const trimmed_bytes_u16: []const u16 = @alignCast(std.mem.bytesAsSlice(u16, trimmed_bytes)); + const out = try toUTF8Alloc(allocator, trimmed_bytes_u16); + if (list.capacity < out.len) { + try list.ensureTotalCapacity(allocator, out.len); + } + list.items.len = out.len; + @memcpy(list.items, out); + return out; + }, + else => { + // TODO: this needs to re-encode, for now we just remove the BOM + const bom_bytes = bom.getHeader(); + bun.C.memmove(bytes.ptr, bytes.ptr + bom_bytes.len, bytes.len - bom_bytes.len); + return bytes[0 .. bytes.len - bom_bytes.len]; + }, + } + } +}; + +/// @deprecated. If you are using this, you likely will need to remove other BOMs and handle encoding. +/// Use the BOM struct's `detect` and conversion functions instead. pub fn withoutUTF8BOM(bytes: []const u8) []const u8 { - if (strings.hasPrefixComptime(bytes, utf8_bom)) { - return bytes[utf8_bom.len..]; + if (strings.hasPrefixComptime(bytes, BOM.utf8_bytes)) { + return bytes[BOM.utf8_bytes.len..]; } else { return bytes; } diff --git a/test/bundler/bundler_edgecase.test.ts b/test/bundler/bundler_edgecase.test.ts index 14e10fd898..ddf2637b91 100644 --- a/test/bundler/bundler_edgecase.test.ts +++ b/test/bundler/bundler_edgecase.test.ts @@ -1040,6 +1040,7 @@ describe("bundler", () => { }, }); + // TODO(@paperdave): test every case of this. I had already tested it manually, but it may break later const requireTranspilationListESM = [ // input, output:bun, output:node ["require", "import.meta.require", "__require"], diff --git a/test/bundler/cli.test.ts b/test/bundler/cli.test.ts index d574ceab42..2fea1a070c 100644 --- a/test/bundler/cli.test.ts +++ b/test/bundler/cli.test.ts @@ -53,4 +53,15 @@ describe("bun build", () => { fs.rmSync(baseDir, { recursive: true, force: true }); } }); + + test("works with utf8 bom", () => { + const tmp = fs.mkdtempSync(path.join(tmpdir(), "bun-build-utf8-bom-")); + const src = path.join(tmp, "index.js"); + fs.writeFileSync(src, '\ufeffconsole.log("hello world");', { encoding: "utf8" }); + const { exitCode } = Bun.spawnSync({ + cmd: [bunExe(), "build", src], + env: bunEnv, + }); + expect(exitCode).toBe(0); + }); });