mirror of
https://github.com/oven-sh/bun
synced 2026-02-17 14:22:01 +00:00
Move TextDecoder, TextEncoderStreamEncoder, TextEncoder, EncodingLabel into separate files (#18430)
This commit is contained in:
350
src/bun.js/webcore/TextDecoder.zig
Normal file
350
src/bun.js/webcore/TextDecoder.zig
Normal file
@@ -0,0 +1,350 @@
|
||||
/// UTF-8 streaming state: bytes left over from a non-flushing decode, which
/// are prepended to the input of the next decode call.
buffered: struct {
    /// Backing storage; only the first `len` bytes are meaningful.
    buf: [3]u8 = .{0} ** 3,
    /// Number of valid bytes in `buf` (a `u2`, so at most 3).
    len: u2 = 0,

    /// Returns a read-only view of the bytes currently held.
    /// Takes a const pointer because nothing is mutated, so it can also be
    /// called through a `*const` reference to the decoder.
    pub fn slice(this: *const @This()) []const u8 {
        return this.buf[0..this.len];
    }
} = .{},
|
||||
|
||||
/// UTF-16 streaming state: the first byte of a code unit whose second byte
/// has not arrived yet (the previous input had an odd number of bytes).
lead_byte: ?u8 = null,
/// UTF-16 streaming state: a lead surrogate still waiting for its trail
/// surrogate.
lead_surrogate: ?u16 = null,

/// When false, a byte-order mark at the start of the input is skipped.
ignore_bom: bool = false,
/// When true, invalid input makes decoding throw
/// ERR_ENCODING_INVALID_ENCODED_DATA.
fatal: bool = false,
/// Encoding used by this decoder; UTF-8 unless the constructor is given a label.
encoding: EncodingLabel = .@"UTF-8",

// Mixins. `new` and `destroy` are used in this file but not declared in it,
// so they presumably come from one of these — confirm against `bun.New`.
pub usingnamespace bun.New(TextDecoder);
pub usingnamespace JSC.Codegen.JSTextDecoder;
|
||||
|
||||
/// Releases this decoder by handing it to `destroy`.
/// NOTE(review): presumably called by the generated JS bindings when the
/// wrapper object is collected — confirm.
pub fn finalize(this: *TextDecoder) void {
    TextDecoder.destroy(this);
}
|
||||
|
||||
/// Returns the decoder's `ignore_bom` flag as a JS boolean.
/// The global object argument is not used.
pub fn getIgnoreBOM(
    this: *TextDecoder,
    _: *JSC.JSGlobalObject,
) JSC.JSValue {
    const ignore_bom = this.ignore_bom;
    return JSValue.jsBoolean(ignore_bom);
}
|
||||
|
||||
/// Returns the decoder's `fatal` flag as a JS boolean.
/// The global object argument is not used.
pub fn getFatal(
    this: *TextDecoder,
    _: *JSC.JSGlobalObject,
) JSC.JSValue {
    const is_fatal = this.fatal;
    return JSValue.jsBoolean(is_fatal);
}
|
||||
|
||||
/// Returns the label of the decoder's encoding as a JS string.
pub fn getEncoding(
    this: *TextDecoder,
    globalThis: *JSC.JSGlobalObject,
) JSC.JSValue {
    // `.?` asserts that the label table has an entry for this encoding.
    const label_name = EncodingLabel.label.get(this.encoding).?;
    const label_string = ZigString.init(label_name);
    return label_string.toJS(globalThis);
}
|
||||
// 16-lane vector of u16 values. Uses the `@Vector` builtin; the
// `std.meta.Vector` helper it replaces is no longer part of the standard
// library in Zig versions that support the result-typed `@splat` used below.
// Not referenced in the visible part of this file.
const Vector16 = @Vector(16, u16);
// Every lane holds 127, the highest ASCII code point.
const max_16_ascii: Vector16 = @splat(@as(u16, 127));
|
||||
|
||||
/// Feeds one UTF-16 code unit through the surrogate-pairing state machine.
///
/// - A pending lead surrogate followed by a trail surrogate is emitted as a pair.
/// - A pending lead surrogate followed by anything else is replaced with
///   `strings.unicode_replacement`, and the new unit is then handled normally.
/// - A lead surrogate is held back in `this.lead_surrogate` until the next unit.
/// - A trail surrogate with no pending lead is replaced.
/// - Any other unit is appended unchanged.
///
/// `saw_error.*` is set to true whenever a replacement is emitted; it is never
/// reset here. Appends go through `bun.default_allocator` and may fail with
/// `error.OutOfMemory`.
fn processCodeUnitUTF16(
    this: *TextDecoder,
    output: *std.ArrayListUnmanaged(u16),
    saw_error: *bool,
    code_unit: u16,
) error{OutOfMemory}!void {
    const allocator = bun.default_allocator;

    if (this.lead_surrogate) |pending_lead| {
        this.lead_surrogate = null;

        if (strings.u16IsTrail(code_unit)) {
            // Well-formed pair: emit both halves as-is.
            try output.appendSlice(allocator, &.{ pending_lead, code_unit });
            return;
        }

        // Unpaired lead: replace it, then fall through to classify `code_unit`.
        try output.append(allocator, strings.unicode_replacement);
        saw_error.* = true;
    }

    if (strings.u16IsLead(code_unit)) {
        this.lead_surrogate = code_unit;
    } else if (strings.u16IsTrail(code_unit)) {
        try output.append(allocator, strings.unicode_replacement);
        saw_error.* = true;
    } else {
        try output.append(allocator, code_unit);
    }
}
|
||||
|
||||
/// Combines two byte values (already widened to `u16`) into one UTF-16 code
/// unit. With `big_endian` the first byte is the high half; otherwise the
/// second byte is.
pub fn codeUnitFromBytesUTF16(
    first: u16,
    second: u16,
    comptime big_endian: bool,
) u16 {
    const high = if (comptime big_endian) first else second;
    const low = if (comptime big_endian) second else first;
    return (high << 8) | low;
}
|
||||
|
||||
/// Decodes `bytes` as UTF-16 (byte order chosen by `big_endian`) into a list
/// of code units, carrying partial state across calls in `this.lead_byte` and
/// `this.lead_surrogate`.
///
/// Returns the output list and a flag that is true if any
/// `strings.unicode_replacement` was emitted. The list is allocated with
/// `bun.default_allocator` and is owned by the caller on success; on
/// `error.OutOfMemory` it is released before returning.
///
/// When `flush` is true, any byte or lead surrogate still pending after the
/// input is consumed is dropped and reported as one replacement character.
pub fn decodeUTF16(
    this: *TextDecoder,
    bytes: []const u8,
    comptime big_endian: bool,
    comptime flush: bool,
) error{OutOfMemory}!struct { std.ArrayListUnmanaged(u16), bool } {
    var output: std.ArrayListUnmanaged(u16) = .{};
    // Appends below can fail after memory was acquired; do not leak it.
    errdefer output.deinit(bun.default_allocator);
    try output.ensureTotalCapacity(bun.default_allocator, @divFloor(bytes.len, 2));

    var remain = bytes;
    var saw_error = false;

    // Complete the code unit that was split across the previous call.
    if (this.lead_byte) |lead_byte| {
        if (remain.len > 0) {
            this.lead_byte = null;

            try this.processCodeUnitUTF16(
                &output,
                &saw_error,
                codeUnitFromBytesUTF16(@intCast(lead_byte), @intCast(remain[0]), big_endian),
            );
            remain = remain[1..];
        }
    }

    // Consume whole two-byte units; the saturating subtraction keeps the
    // bound at zero for empty input.
    var i: usize = 0;
    while (i < remain.len -| 1) : (i += 2) {
        try this.processCodeUnitUTF16(
            &output,
            &saw_error,
            codeUnitFromBytesUTF16(@intCast(remain[i]), @intCast(remain[i + 1]), big_endian),
        );
    }

    if (remain.len != 0 and i == remain.len - 1) {
        // Odd trailing byte: keep it for the next call.
        this.lead_byte = remain[i];
    } else {
        bun.assertWithLocation(i == remain.len, @src());
    }

    if (comptime flush) {
        if (this.lead_byte != null or this.lead_surrogate != null) {
            // End of stream with an incomplete unit or pair.
            this.lead_byte = null;
            this.lead_surrogate = null;
            try output.append(bun.default_allocator, strings.unicode_replacement);
            saw_error = true;
        }
    }

    return .{ output, saw_error };
}
|
||||
|
||||
/// JS entry point for `decode(input, options)`.
///
/// - With no first argument, or `undefined`, an empty input is decoded.
/// - Otherwise the first argument must yield an array buffer view, or an
///   invalid-arguments error is thrown.
/// - If the second argument is an object with a `stream` property, that value
///   is coerced to a boolean; otherwise `stream` is false.
///
/// The input is decoded with `flush = !stream`.
pub fn decode(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) bun.JSError!JSValue {
    const arguments = callframe.arguments_old(2).slice();

    const input_slice: []const u8 = bytes: {
        if (arguments.len == 0 or arguments[0].isUndefined()) break :bytes "";

        const array_buffer = arguments[0].asArrayBuffer(globalThis) orelse {
            return globalThis.throwInvalidArguments("TextDecoder.decode expects an ArrayBuffer or TypedArray", .{});
        };
        break :bytes array_buffer.slice();
    };

    const stream: bool = flag: {
        if (arguments.len < 2 or !arguments[1].isObject()) break :flag false;

        const stream_value = arguments[1].fastGet(globalThis, .stream) orelse break :flag false;
        const coerced = stream_value.coerce(bool, globalThis);
        // Coercion may have raised a JS exception.
        if (globalThis.hasException()) return .zero;
        break :flag coerced;
    };

    // `flush` is a comptime parameter, so each value needs its own call site.
    return if (stream)
        this.decodeSlice(globalThis, input_slice, false)
    else
        this.decodeSlice(globalThis, input_slice, true);
}
|
||||
|
||||
/// Variant of `decode` that receives an already-typed `JSUint8Array` and
/// skips argument validation. Always decodes with `flush = false`.
///
/// NOTE(review): `decode` called with a single argument resolves `stream` to
/// false and therefore flushes, while this path never flushes — confirm the
/// difference is intended.
pub fn decodeWithoutTypeChecks(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, uint8array: *JSC.JSUint8Array) bun.JSError!JSValue {
    const input_bytes = uint8array.slice();
    return this.decodeSlice(globalThis, input_bytes, false);
}
|
||||
|
||||
/// Decodes `buffer_slice` according to `this.encoding` and returns the result
/// as a JS value.
///
/// - latin1: ASCII-only input is passed through; otherwise it is widened to UTF-16.
/// - UTF-8: bytes buffered by an earlier non-flushing call are prepended, the
///   result is converted to UTF-16, and (when `flush` is false) any leftover
///   bytes are stored in `this.buffered`.
/// - UTF-16LE / UTF-16BE: delegated to `decodeUTF16`.
/// - Any other encoding throws an invalid-arguments error.
///
/// When `this.fatal` is set, invalid input throws
/// ERR_ENCODING_INVALID_ENCODED_DATA.
///
/// NOTE(review): the BOM check runs on every call, not only on the first
/// chunk of a stream — confirm that is intended for `stream: true` usage.
fn decodeSlice(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, buffer_slice: []const u8, comptime flush: bool) bun.JSError!JSValue {
    switch (this.encoding) {
        .latin1 => {
            // ASCII-only input needs no widening.
            if (strings.isAllASCII(buffer_slice)) {
                return ZigString.init(buffer_slice).toJS(globalThis);
            }

            // It's unintuitive that we encode Latin1 as UTF16 even though the engine natively supports Latin1 strings...
            // However, this is also what WebKit seems to do.
            //
            // It's not clear why we couldn't just use Latin1 here, but test failures proved it necessary.
            const utf16_len = strings.elementLengthLatin1IntoUTF16([]const u8, buffer_slice);
            const utf16_buf = try globalThis.allocator().alloc(u16, utf16_len);

            const copied = strings.copyLatin1IntoUTF16([]u16, utf16_buf, []const u8, buffer_slice);
            return ZigString.toExternalU16(utf16_buf.ptr, copied.written, globalThis);
        },
        .@"UTF-8" => {
            const strip_bom = !this.ignore_bom and strings.hasPrefixComptime(buffer_slice, "\xef\xbb\xbf");
            const payload = if (strip_bom) buffer_slice[3..] else buffer_slice;

            // If an earlier call left bytes behind, decode them together with
            // this chunk from a temporary joined buffer that we own.
            const input_is_owned = this.buffered.len > 0;
            const input: []const u8 = if (input_is_owned) merged: {
                defer this.buffered.len = 0;
                const pending = this.buffered.slice();
                const joined = try bun.default_allocator.alloc(u8, payload.len + pending.len);
                @memcpy(joined[0..pending.len], pending);
                @memcpy(joined[pending.len..][0..payload.len], payload);
                break :merged joined;
            } else payload;

            // `fatal` is lifted to comptime so the helper is instantiated per mode.
            const maybe_decode_result = switch (this.fatal) {
                inline else => |fail_if_invalid| strings.toUTF16AllocMaybeBuffered(bun.default_allocator, input, fail_if_invalid, flush) catch |err| {
                    if (input_is_owned) bun.default_allocator.free(input);
                    if (comptime fail_if_invalid) {
                        if (err == error.InvalidByteSequence) {
                            return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("Invalid byte sequence", .{}).throw();
                        }
                    }

                    bun.assert(err == error.OutOfMemory);
                    return globalThis.throwOutOfMemory();
                },
            };

            const decode_result = maybe_decode_result orelse {
                // No UTF-16 conversion was produced; hand the input bytes over directly.
                bun.debugAssert(input.len == 0 or !input_is_owned);

                // Experiment: using mimalloc directly is slightly slower
                return ZigString.init(input).toJS(globalThis);
            };

            if (input_is_owned) bun.default_allocator.free(input);
            const decoded, const leftover, const leftover_len = decode_result;
            bun.assert(this.buffered.len == 0);
            if (comptime !flush) {
                if (leftover_len != 0) {
                    // Keep the leftover bytes for the next call.
                    this.buffered.buf = leftover;
                    this.buffered.len = leftover_len;
                }
            }
            return ZigString.toExternalU16(decoded.ptr, decoded.len, globalThis);
        },

        inline .@"UTF-16LE", .@"UTF-16BE" => |utf16_encoding| {
            const is_big_endian = comptime (utf16_encoding == .@"UTF-16BE");
            const bom = if (comptime is_big_endian) "\xfe\xff" else "\xff\xfe";
            const strip_bom = !this.ignore_bom and strings.hasPrefixComptime(buffer_slice, bom);
            const input = if (strip_bom) buffer_slice[2..] else buffer_slice;

            var decoded, const saw_error = try this.decodeUTF16(input, is_big_endian, flush);

            if (saw_error and this.fatal) {
                decoded.deinit(bun.default_allocator);
                return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("The encoded data was not valid {s} data", .{@tagName(utf16_encoding)}).throw();
            }

            // NOTE(review): `decoded` is not released on this path; whether
            // that is correct depends on the ownership semantics of
            // `bun.String.fromUTF16` / `toJS` — confirm.
            var output = bun.String.fromUTF16(decoded.items);
            return output.toJS(globalThis);
        },
        else => {
            return globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{});
        },
    }
}
|
||||
|
||||
/// JS constructor: `new TextDecoder(label, options)`.
///
/// - `label`: a string resolved through `EncodingLabel.which`, or `undefined`
///   for UTF-8. An unknown label or any other type throws invalid-arguments.
/// - `options`: when a second argument is passed it must be an object. Its
///   `fatal` and `ignoreBOM` properties, if present, must be booleans.
///
/// Returns a new decoder created with `TextDecoder.new`.
pub fn constructor(globalThis: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) bun.JSError!*TextDecoder {
    const call_args = callframe.arguments_old(2);
    const arguments: []const JSC.JSValue = call_args.ptr[0..call_args.len];

    var decoder = TextDecoder{};
    if (arguments.len == 0) return TextDecoder.new(decoder);

    // encoding
    const label_arg = arguments[0];
    if (label_arg.isString()) {
        var str = try label_arg.toSlice(globalThis, bun.default_allocator);
        defer if (str.isAllocated()) str.deinit();

        decoder.encoding = EncodingLabel.which(str.slice()) orelse {
            return globalThis.throwInvalidArguments("Unsupported encoding label \"{s}\"", .{str.slice()});
        };
    } else if (label_arg.isUndefined()) {
        // default to utf-8
        decoder.encoding = EncodingLabel.@"UTF-8";
    } else {
        return globalThis.throwInvalidArguments("TextDecoder(encoding) label is invalid", .{});
    }

    if (arguments.len < 2) return TextDecoder.new(decoder);

    // options
    const options = arguments[1];
    if (!options.isObject()) {
        return globalThis.throwInvalidArguments("TextDecoder(options) is invalid", .{});
    }

    if (try options.get(globalThis, "fatal")) |fatal| {
        if (!fatal.isBoolean()) {
            return globalThis.throwInvalidArguments("TextDecoder(options) fatal is invalid. Expected boolean value", .{});
        }
        decoder.fatal = fatal.asBoolean();
    }

    if (try options.get(globalThis, "ignoreBOM")) |ignoreBOM| {
        if (!ignoreBOM.isBoolean()) {
            return globalThis.throwInvalidArguments("TextDecoder(options) ignoreBOM is invalid. Expected boolean value", .{});
        }
        decoder.ignore_bom = ignoreBOM.asBoolean();
    }

    return TextDecoder.new(decoder);
}
|
||||
|
||||
const TextDecoder = @This();
|
||||
|
||||
const std = @import("std");
|
||||
const bun = @import("root").bun;
|
||||
const JSC = bun.JSC;
|
||||
const Output = bun.Output;
|
||||
const MutableString = bun.MutableString;
|
||||
const strings = bun.strings;
|
||||
const string = bun.string;
|
||||
const FeatureFlags = bun.FeatureFlags;
|
||||
const ArrayBuffer = JSC.ArrayBuffer;
|
||||
const JSUint8Array = JSC.JSUint8Array;
|
||||
const ZigString = JSC.ZigString;
|
||||
const JSInternalPromise = JSC.JSInternalPromise;
|
||||
const JSPromise = JSC.JSPromise;
|
||||
const JSValue = JSC.JSValue;
|
||||
const JSGlobalObject = JSC.JSGlobalObject;
|
||||
const EncodingLabel = JSC.WebCore.EncodingLabel;
|
||||
Reference in New Issue
Block a user