mirror of
https://github.com/oven-sh/bun
synced 2026-02-09 10:28:47 +00:00
feat(encoding): support BOM detection with test passed (#6074)
This commit is contained in:
18
src/bun.js/bindings/ZigGeneratedClasses.cpp
generated
18
src/bun.js/bindings/ZigGeneratedClasses.cpp
generated
@@ -26705,12 +26705,16 @@ JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__encodingGetterWrap);
|
||||
extern "C" JSC::EncodedJSValue TextDecoderPrototype__getFatal(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
|
||||
JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__fatalGetterWrap);
|
||||
|
||||
extern "C" JSC::EncodedJSValue TextDecoderPrototype__getIgnoreBOM(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
|
||||
JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__ignoreBOMGetterWrap);
|
||||
|
||||
STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(JSTextDecoderPrototype, JSTextDecoderPrototype::Base);
|
||||
|
||||
static const HashTableValue JSTextDecoderPrototypeTableValues[] = {
|
||||
{ "decode"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | JSC::PropertyAttribute::DOMJITFunction | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::DOMJITFunctionType, TextDecoderPrototype__decodeCallback, &DOMJITSignatureForTextDecoderPrototype__decode } },
|
||||
{ "encoding"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__encodingGetterWrap, 0 } },
|
||||
{ "fatal"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__fatalGetterWrap, 0 } }
|
||||
{ "fatal"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__fatalGetterWrap, 0 } },
|
||||
{ "ignoreBOM"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__ignoreBOMGetterWrap, 0 } }
|
||||
};
|
||||
|
||||
const ClassInfo JSTextDecoderPrototype::s_info = { "TextDecoder"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSTextDecoderPrototype) };
|
||||
@@ -26798,6 +26802,18 @@ JSC_DEFINE_CUSTOM_GETTER(TextDecoderPrototype__fatalGetterWrap, (JSGlobalObject
|
||||
RELEASE_AND_RETURN(throwScope, result);
|
||||
}
|
||||
|
||||
// Custom getter wrapper for the `ignoreBOM` accessor: decodes `thisValue`, forwards the
// wrapped native pointer to the extern "C" TextDecoderPrototype__getIgnoreBOM, and returns
// whatever encoded value that call produces.
JSC_DEFINE_CUSTOM_GETTER(TextDecoderPrototype__ignoreBOMGetterWrap, (JSGlobalObject * lexicalGlobalObject, EncodedJSValue thisValue, PropertyName attributeName))
|
||||
{
|
||||
auto& vm = lexicalGlobalObject->vm();
|
||||
// NOTE(review): unchecked cast; presumably every global object reaching this getter is a Zig::GlobalObject - confirm.
Zig::GlobalObject* globalObject = reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject);
|
||||
auto throwScope = DECLARE_THROW_SCOPE(vm);
|
||||
// NOTE(review): no type check is visible before this cast; assumes `thisValue` is a JSTextDecoder - confirm.
JSTextDecoder* thisObject = jsCast<JSTextDecoder*>(JSValue::decode(thisValue));
|
||||
// Presumably keeps `thisObject` alive across the native call below - confirm against JSC docs.
JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
|
||||
JSC::EncodedJSValue result = TextDecoderPrototype__getIgnoreBOM(thisObject->wrapped(), globalObject);
|
||||
// If the native getter left an exception pending, return an empty value instead of `result`.
RETURN_IF_EXCEPTION(throwScope, {});
|
||||
RELEASE_AND_RETURN(throwScope, result);
|
||||
}
|
||||
|
||||
void JSTextDecoderPrototype::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
|
||||
{
|
||||
Base::finishCreation(vm);
|
||||
|
||||
@@ -6874,6 +6874,9 @@ pub const JSTextDecoder = struct {
|
||||
if (@TypeOf(TextDecoder.getFatal) != GetterType)
|
||||
@compileLog("Expected TextDecoder.getFatal to be a getter");
|
||||
|
||||
if (@TypeOf(TextDecoder.getIgnoreBOM) != GetterType)
|
||||
@compileLog("Expected TextDecoder.getIgnoreBOM to be a getter");
|
||||
|
||||
if (!JSC.is_bindgen) {
|
||||
@export(TextDecoder.constructor, .{ .name = "TextDecoderClass__construct" });
|
||||
@export(TextDecoder.decode, .{ .name = "TextDecoderPrototype__decode" });
|
||||
@@ -6881,6 +6884,7 @@ pub const JSTextDecoder = struct {
|
||||
@export(TextDecoder.finalize, .{ .name = "TextDecoderClass__finalize" });
|
||||
@export(TextDecoder.getEncoding, .{ .name = "TextDecoderPrototype__getEncoding" });
|
||||
@export(TextDecoder.getFatal, .{ .name = "TextDecoderPrototype__getFatal" });
|
||||
@export(TextDecoder.getIgnoreBOM, .{ .name = "TextDecoderPrototype__getIgnoreBOM" });
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -16,6 +16,9 @@ export default [
|
||||
fatal: {
|
||||
getter: "getFatal",
|
||||
},
|
||||
ignoreBOM: {
|
||||
getter: "getIgnoreBOM",
|
||||
},
|
||||
|
||||
decode: {
|
||||
fn: "decode",
|
||||
|
||||
@@ -559,6 +559,13 @@ pub const TextDecoder = struct {
|
||||
remainder = remainder[1..];
|
||||
continue;
|
||||
},
|
||||
// BOM handling
|
||||
0xFEFF => {
|
||||
buffer.ensureTotalCapacity(allocator, 1) catch unreachable;
|
||||
buffer.items.ptr[buffer.items.len] = remainder[0];
|
||||
buffer.items.len += 1;
|
||||
remainder = remainder[1..];
|
||||
},
|
||||
|
||||
// Is this an unpaired low surrogate or four-digit hex escape?
|
||||
else => {
|
||||
@@ -629,8 +636,13 @@ pub const TextDecoder = struct {
|
||||
},
|
||||
EncodingLabel.@"UTF-8" => {
|
||||
const toUTF16 = if (stream) strings.toUTF16Alloc else strings.toUTF16AllocNoTrim;
|
||||
// Skip a leading UTF-8 byte order mark (EF BB BF) unless the decoder was created with
// ignoreBOM set. The length check is `>= 3` (not `> 3`) so that an input consisting of
// exactly the three BOM bytes is stripped too; `buffer_slice[0..3]` is in bounds whenever
// len >= 3, and `buffer_slice[3..]` is then simply an empty slice.
const moved_buffer_slice_8 = if (!this.ignore_bom and buffer_slice.len >= 3 and std.mem.eql(u8, &[_]u8{ '\xEF', '\xBB', '\xBF' }, buffer_slice[0..3]))
    buffer_slice[3..]
else
    buffer_slice;
|
||||
|
||||
if (this.fatal) {
|
||||
if (toUTF16(default_allocator, buffer_slice, true)) |result_| {
|
||||
if (toUTF16(default_allocator, moved_buffer_slice_8, true)) |result_| {
|
||||
if (result_) |result| {
|
||||
return ZigString.toExternalU16(result.ptr, result.len, globalThis);
|
||||
}
|
||||
@@ -649,7 +661,7 @@ pub const TextDecoder = struct {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (toUTF16(default_allocator, buffer_slice, false)) |result_| {
|
||||
if (toUTF16(default_allocator, moved_buffer_slice_8, false)) |result_| {
|
||||
if (result_) |result| {
|
||||
return ZigString.toExternalU16(result.ptr, result.len, globalThis);
|
||||
}
|
||||
@@ -664,15 +676,20 @@ pub const TextDecoder = struct {
|
||||
}
|
||||
|
||||
// Experiment: using mimalloc directly is slightly slower
|
||||
return ZigString.init(buffer_slice).toValueGC(globalThis);
|
||||
return ZigString.init(moved_buffer_slice_8).toValueGC(globalThis);
|
||||
},
|
||||
|
||||
EncodingLabel.@"UTF-16LE" => {
|
||||
if (std.mem.isAligned(@intFromPtr(buffer_slice.ptr), @alignOf([*]const u16))) {
|
||||
return this.decodeUTF16WithAlignment([]align(2) const u16, @as([]align(2) const u16, @alignCast(std.mem.bytesAsSlice(u16, buffer_slice))), globalThis);
|
||||
// Skip a leading UTF-16LE byte order mark (FF FE) unless the decoder was created with
// ignoreBOM set. The length check is `>= 2` (not `> 2`) so that an input consisting of
// exactly the two BOM bytes is stripped too; `buffer_slice[0..2]` is in bounds whenever
// len >= 2, and `buffer_slice[2..]` is then an empty slice.
// NOTE(review): confirm decodeUTF16WithAlignment accepts an empty slice for the BOM-only case.
const moved_buffer_slice_16 = if (!this.ignore_bom and buffer_slice.len >= 2 and std.mem.eql(u8, &[_]u8{ '\xFF', '\xFE' }, buffer_slice[0..2]))
    buffer_slice[2..]
else
    buffer_slice;
|
||||
|
||||
if (std.mem.isAligned(@intFromPtr(moved_buffer_slice_16.ptr), @alignOf([*]const u16))) {
|
||||
return this.decodeUTF16WithAlignment([]align(2) const u16, @as([]align(2) const u16, @alignCast(std.mem.bytesAsSlice(u16, moved_buffer_slice_16))), globalThis);
|
||||
}
|
||||
|
||||
return this.decodeUTF16WithAlignment([]align(1) const u16, std.mem.bytesAsSlice(u16, buffer_slice), globalThis);
|
||||
return this.decodeUTF16WithAlignment([]align(1) const u16, std.mem.bytesAsSlice(u16, moved_buffer_slice_16), globalThis);
|
||||
},
|
||||
else => {
|
||||
globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{});
|
||||
|
||||
@@ -250,7 +250,7 @@ describe("TextDecoder", () => {
|
||||
it("constructor should set values", () => {
|
||||
const decoder = new TextDecoder("utf-8", { fatal: true, ignoreBOM: false });
|
||||
expect(decoder.fatal).toBe(true);
|
||||
// expect(decoder.ignoreBOM).toBe(false); // currently the getter for ignoreBOM doesn't work and always returns undefined
|
||||
expect(decoder.ignoreBOM).toBe(false);
|
||||
});
|
||||
|
||||
it("should throw on invalid input", () => {
|
||||
@@ -265,6 +265,28 @@ describe("TextDecoder", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Table-driven check of the `ignoreBOM` constructor option: for each encoding, the same
// BOM-prefixed bytes are decoded once with ignoreBOM: true and once with ignoreBOM: false.
// NOTE(review): both cases carry payload after the BOM; no case decodes an input that is only the BOM.
describe("TextDecoder ignoreBOM", () => {
|
||||
it.each([
|
||||
{
|
||||
encoding: "utf-8",
|
||||
// UTF-8 BOM (EF BB BF) followed by the bytes for "abc".
bytes: [0xef, 0xbb, 0xbf, 0x61, 0x62, 0x63],
|
||||
},
|
||||
{
|
||||
encoding: "utf-16le",
|
||||
// UTF-16LE BOM (FF FE) followed by "abc" as little-endian 16-bit code units.
bytes: [0xff, 0xfe, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00],
|
||||
},
|
||||
])("should ignoreBOM for: %o", ({ encoding, bytes }) => {
|
||||
const BOM = "\uFEFF";
|
||||
const array = new Uint8Array(bytes);
|
||||
|
||||
// ignoreBOM: true -> the decoded string is expected to keep the leading U+FEFF.
const decoder_ignore_bom = new TextDecoder(encoding, { ignoreBOM: true });
|
||||
expect(decoder_ignore_bom.decode(array)).toStrictEqual(`${BOM}abc`);
|
||||
|
||||
// ignoreBOM: false -> the BOM is expected to be removed from the decoded string.
const decoder_not_ignore_bom = new TextDecoder(encoding, { ignoreBOM: false });
|
||||
expect(decoder_not_ignore_bom.decode(array)).toStrictEqual("abc");
|
||||
});
|
||||
});
|
||||
|
||||
it("truncated sequences", () => {
|
||||
const assert_equals = (a, b) => expect(a).toBe(b);
|
||||
|
||||
|
||||
@@ -111,7 +111,7 @@ describe("TextEncoder", () => {
|
||||
const fixture = new Uint8Array(await Bun.file(import.meta.dir + "/utf8-encoding-fixture.bin").arrayBuffer());
|
||||
const length = 0x110000;
|
||||
let textEncoder = new TextEncoder();
|
||||
let textDecoder = new TextDecoder();
|
||||
let textDecoder = new TextDecoder("utf-8", { ignoreBOM: true });
|
||||
let encodeOut = new Uint8Array(length * 4);
|
||||
let encodeIntoOut = new Uint8Array(length * 4);
|
||||
let encodeIntoBuffer = new Uint8Array(4);
|
||||
|
||||
Reference in New Issue
Block a user