mirror of
https://github.com/oven-sh/bun
synced 2026-02-02 15:08:46 +00:00
Move TextDecoder, TextEncoderStreamEncoder, TextEncoder, EncodingLabel into separate files (#18430)
This commit is contained in:
160
src/bun.js/webcore/EncodingLabel.zig
Normal file
160
src/bun.js/webcore/EncodingLabel.zig
Normal file
@@ -0,0 +1,160 @@
|
||||
/// Encoding names from the WHATWG Encoding Standard.
/// https://encoding.spec.whatwg.org/encodings.json
///
/// Only UTF-8, UTF-16LE, UTF-16BE and windows-1252 are resolvable through
/// `which`; the remaining tags exist so the enum mirrors the standard's list.
pub const EncodingLabel = enum {
    @"UTF-8",
    IBM866,
    @"ISO-8859-2",
    @"ISO-8859-3",
    @"ISO-8859-4",
    @"ISO-8859-5",
    @"ISO-8859-6",
    @"ISO-8859-7",
    @"ISO-8859-8",
    @"ISO-8859-8-I",
    @"ISO-8859-10",
    @"ISO-8859-13",
    @"ISO-8859-14",
    @"ISO-8859-15",
    @"ISO-8859-16",
    @"KOI8-R",
    @"KOI8-U",
    macintosh,
    @"windows-874",
    @"windows-1250",
    @"windows-1251",
    /// Also known as
    /// - ASCII
    /// - latin1
    @"windows-1252",
    @"windows-1253",
    @"windows-1254",
    @"windows-1255",
    @"windows-1256",
    @"windows-1257",
    @"windows-1258",
    @"x-mac-cyrillic",
    Big5,
    @"EUC-JP",
    @"ISO-2022-JP",
    Shift_JIS,
    @"EUC-KR",
    @"UTF-16BE",
    @"UTF-16LE",
    @"x-user-defined",

    pub const Map = std.enums.EnumMap(EncodingLabel, string);

    /// Lowercased encoding name for each encoding `which` can return.
    /// Every other tag maps to the empty string.
    pub const label: Map = brk: {
        var map = Map.initFull("");
        map.put(EncodingLabel.@"UTF-8", "utf-8");
        map.put(EncodingLabel.@"UTF-16LE", "utf-16le");
        // `which` resolves "utf-16be", so it needs a name too; without this
        // entry a UTF-16BE decoder would report "" as its encoding.
        map.put(EncodingLabel.@"UTF-16BE", "utf-16be");
        map.put(EncodingLabel.@"windows-1252", "windows-1252");
        break :brk map;
    };

    const utf16_names = [_]string{
        "ucs-2",
        "utf-16",
        "unicode",
        "utf-16le",
        "csunicode",
        "unicodefeff",
        "iso-10646-ucs-2",
    };

    const utf8_names = [_]string{
        "utf8",
        "utf-8",
        "unicode11utf8",
        "unicode20utf8",
        "x-unicode20utf8",
        "unicode-1-1-utf-8",
    };

    const latin1_names = [_]string{
        "l1",
        "ascii",
        "cp819",
        "cp1252",
        "ibm819",
        "latin1",
        "iso88591",
        "us-ascii",
        "x-cp1252",
        "iso8859-1",
        "iso_8859-1",
        "iso-8859-1",
        "iso-ir-100",
        "csisolatin1",
        "windows-1252",
        "ansi_x3.4-1968",
        "iso_8859-1:1987",
    };

    pub const latin1 = EncodingLabel.@"windows-1252";

    /// Resolves a user-supplied label to an encoding, or `null` when the
    /// label is not one of the supported ones.
    ///
    /// Matching is ASCII case-insensitive and ignores leading/trailing ASCII
    /// whitespace (TAB, LF, FF, CR, SPACE), as the Encoding Standard's
    /// "get an encoding" algorithm requires.
    pub fn which(input_: string) ?EncodingLabel {
        // "\x0C" is form feed, which the standard also counts as ASCII whitespace.
        const input = strings.trim(input_, " \t\r\n\x0C");
        const ExactMatcher = strings.ExactSizeMatcher;
        const Eight = ExactMatcher(8);
        const Sixteen = ExactMatcher(16);
        return switch (input.len) {
            1, 0 => null,
            2...8 => switch (Eight.matchLower(input)) {
                Eight.case("l1"),
                Eight.case("ascii"),
                Eight.case("cp819"),
                Eight.case("cp1252"),
                Eight.case("ibm819"),
                Eight.case("latin1"),
                Eight.case("iso88591"),
                Eight.case("us-ascii"),
                Eight.case("x-cp1252"),
                => EncodingLabel.latin1,

                Eight.case("ucs-2"),
                Eight.case("utf-16"),
                Eight.case("unicode"),
                Eight.case("utf-16le"),
                => EncodingLabel.@"UTF-16LE",

                Eight.case("utf-16be"),
                => EncodingLabel.@"UTF-16BE",

                Eight.case("utf8"), Eight.case("utf-8") => EncodingLabel.@"UTF-8",
                else => null,
            },

            9...16 => switch (Sixteen.matchLower(input)) {
                Sixteen.case("iso8859-1"),
                Sixteen.case("iso_8859-1"),
                Sixteen.case("iso-8859-1"),
                Sixteen.case("iso-ir-100"),
                Sixteen.case("csisolatin1"),
                Sixteen.case("windows-1252"),
                Sixteen.case("ansi_x3.4-1968"),
                Sixteen.case("iso_8859-1:1987"),
                => EncodingLabel.latin1,

                Sixteen.case("unicode11utf8"),
                Sixteen.case("unicode20utf8"),
                Sixteen.case("x-unicode20utf8"),
                => EncodingLabel.@"UTF-8",

                Sixteen.case("csunicode"),
                Sixteen.case("unicodefeff"),
                Sixteen.case("iso-10646-ucs-2"),
                => EncodingLabel.@"UTF-16LE",

                // The standard's second UTF-16BE label.
                Sixteen.case("unicodefffe"),
                => EncodingLabel.@"UTF-16BE",

                else => null,
            },
            // The only supported label longer than 16 bytes.
            else => if (strings.eqlCaseInsensitiveASCII(input, "unicode-1-1-utf-8", true))
                EncodingLabel.@"UTF-8"
            else
                null,
        };
    }
};
|
||||
const std = @import("std");
|
||||
const bun = @import("root").bun;
|
||||
const encoding = @import("encoding.zig");
|
||||
const string = []const u8;
|
||||
const strings = bun.strings;
|
||||
350
src/bun.js/webcore/TextDecoder.zig
Normal file
350
src/bun.js/webcore/TextDecoder.zig
Normal file
@@ -0,0 +1,350 @@
|
||||
/// UTF-8 decoding state: bytes of an incomplete trailing sequence kept between
/// `decode` calls (at most 3, since `len` is a `u2`).
buffered: struct {
    buf: [3]u8 = .{ 0, 0, 0 },
    len: u2 = 0,

    /// The currently buffered bytes.
    pub fn slice(self: *@This()) []const u8 {
        const count = self.len;
        return self.buf[0..count];
    }
} = .{},

/// UTF-16 decoding state: an odd trailing byte waiting for its second half.
lead_byte: ?u8 = null,
/// UTF-16 decoding state: a lead surrogate waiting for its trail surrogate.
lead_surrogate: ?u16 = null,

/// When true, a leading byte-order mark is left in the decoded output.
ignore_bom: bool = false,
/// When true, invalid input makes `decode` throw instead of emitting U+FFFD.
fatal: bool = false,
encoding: EncodingLabel = EncodingLabel.@"UTF-8",
|
||||
|
||||
pub usingnamespace bun.New(TextDecoder);
|
||||
pub usingnamespace JSC.Codegen.JSTextDecoder;
|
||||
|
||||
/// Finalizer hook: destroys this decoder.
pub fn finalize(this: *TextDecoder) void {
    TextDecoder.destroy(this);
}
|
||||
|
||||
/// Getter for the `ignoreBOM` flag, as a JS boolean.
pub fn getIgnoreBOM(this: *TextDecoder, _: *JSC.JSGlobalObject) JSC.JSValue {
    const ignore_bom = this.ignore_bom;
    return JSC.JSValue.jsBoolean(ignore_bom);
}
|
||||
|
||||
/// Getter for the `fatal` flag, as a JS boolean.
pub fn getFatal(this: *TextDecoder, _: *JSC.JSGlobalObject) JSC.JSValue {
    const is_fatal = this.fatal;
    return JSC.JSValue.jsBoolean(is_fatal);
}
|
||||
|
||||
/// Getter for the decoder's encoding name, looked up in `EncodingLabel.label`.
pub fn getEncoding(this: *TextDecoder, globalThis: *JSC.JSGlobalObject) JSC.JSValue {
    // `label` is built with `initFull`, so the lookup is never null.
    const name = EncodingLabel.label.get(this.encoding).?;
    return ZigString.init(name).toJS(globalThis);
}
|
||||
// `@Vector` is the builtin replacement for the removed `std.meta.Vector`.
const Vector16 = @Vector(16, u16);
/// 16 lanes of 127, the largest ASCII code unit.
const max_16_ascii: Vector16 = @splat(@as(u16, 127));
|
||||
|
||||
/// Feeds one UTF-16 code unit through the surrogate-pairing state machine.
///
/// Well-formed units and pairs are appended to `output`. An unpaired
/// surrogate is replaced with `strings.unicode_replacement` and sets
/// `saw_error`. A lead surrogate is held in `this.lead_surrogate` until the
/// next unit arrives.
fn processCodeUnitUTF16(
    this: *TextDecoder,
    output: *std.ArrayListUnmanaged(u16),
    saw_error: *bool,
    code_unit: u16,
) error{OutOfMemory}!void {
    const allocator = bun.default_allocator;
    const is_trail = strings.u16IsTrail(code_unit);

    if (this.lead_surrogate) |pending_lead| {
        this.lead_surrogate = null;

        if (is_trail) {
            // Valid pair: emit both halves unchanged.
            try output.appendSlice(allocator, &.{ pending_lead, code_unit });
            return;
        }

        // The pending lead was never completed; replace it, then fall through
        // so the current unit is still handled on its own.
        try output.append(allocator, strings.unicode_replacement);
        saw_error.* = true;
    }

    if (strings.u16IsLead(code_unit)) {
        this.lead_surrogate = code_unit;
    } else if (is_trail) {
        // Trail surrogate with no lead before it.
        try output.append(allocator, strings.unicode_replacement);
        saw_error.* = true;
    } else {
        try output.append(allocator, code_unit);
    }
}
|
||||
|
||||
/// Combines two bytes (already widened to `u16`) into one UTF-16 code unit.
/// With `big_endian`, `first` is the high byte; otherwise `second` is.
pub fn codeUnitFromBytesUTF16(
    first: u16,
    second: u16,
    comptime big_endian: bool,
) u16 {
    const high = if (comptime big_endian) first else second;
    const low = if (comptime big_endian) second else first;
    return (high << 8) | low;
}
|
||||
|
||||
/// Decodes `bytes` as UTF-16 (byte order chosen by `big_endian`) into a list
/// of code units, continuing from any state left by a previous call.
///
/// Returns the decoded list and whether any malformed input was replaced with
/// U+FFFD. On success the caller owns the list, which is allocated with
/// `bun.default_allocator`. On error the partial list is freed here.
///
/// An odd trailing byte is kept in `this.lead_byte` and an unfinished lead
/// surrogate in `this.lead_surrogate`. When `flush` is true, such leftovers
/// are cleared and reported as a single replacement character.
pub fn decodeUTF16(
    this: *TextDecoder,
    bytes: []const u8,
    comptime big_endian: bool,
    comptime flush: bool,
) error{OutOfMemory}!struct { std.ArrayListUnmanaged(u16), bool } {
    var output: std.ArrayListUnmanaged(u16) = .{};
    // Every later `try` can fail with OutOfMemory; do not leak what was built.
    errdefer output.deinit(bun.default_allocator);
    try output.ensureTotalCapacity(bun.default_allocator, @divFloor(bytes.len, 2));

    var remain = bytes;
    var saw_error = false;

    // Complete the code unit whose first byte arrived in the previous chunk.
    if (this.lead_byte) |lead_byte| {
        if (remain.len > 0) {
            this.lead_byte = null;

            try this.processCodeUnitUTF16(
                &output,
                &saw_error,
                codeUnitFromBytesUTF16(@intCast(lead_byte), @intCast(remain[0]), big_endian),
            );
            remain = remain[1..];
        }
    }

    var i: usize = 0;

    // Saturating subtraction keeps the bound at 0 when `remain` is empty.
    while (i < remain.len -| 1) {
        try this.processCodeUnitUTF16(
            &output,
            &saw_error,
            codeUnitFromBytesUTF16(@intCast(remain[i]), @intCast(remain[i + 1]), big_endian),
        );
        i += 2;
    }

    if (remain.len != 0 and i == remain.len - 1) {
        // Odd byte count: keep the last byte for the next call.
        this.lead_byte = remain[i];
    } else {
        bun.assertWithLocation(i == remain.len, @src());
    }

    if (comptime flush) {
        if (this.lead_byte != null or this.lead_surrogate != null) {
            // End of stream with an unfinished unit or pair.
            this.lead_byte = null;
            this.lead_surrogate = null;
            try output.append(bun.default_allocator, strings.unicode_replacement);
            saw_error = true;
        }
    }

    return .{ output, saw_error };
}
|
||||
|
||||
/// JS `decode(input?, options?)`.
///
/// `input` must be an ArrayBuffer or typed array; missing or `undefined`
/// decodes an empty input. When `options.stream` coerces to true, incomplete
/// trailing input is kept for the next call instead of being flushed.
pub fn decode(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) bun.JSError!JSValue {
    const arguments = callframe.arguments_old(2).slice();

    const input_slice: []const u8 = bytes: {
        if (arguments.len == 0 or arguments[0].isUndefined()) break :bytes "";

        const array_buffer = arguments[0].asArrayBuffer(globalThis) orelse {
            return globalThis.throwInvalidArguments("TextDecoder.decode expects an ArrayBuffer or TypedArray", .{});
        };
        break :bytes array_buffer.slice();
    };

    var stream = false;
    if (arguments.len > 1 and arguments[1].isObject()) {
        if (arguments[1].fastGet(globalThis, .stream)) |stream_value| {
            stream = stream_value.coerce(bool, globalThis);
            if (globalThis.hasException()) {
                return .zero;
            }
        }
    }

    // `flush` is a comptime parameter, so each value needs its own call site.
    if (stream) {
        return this.decodeSlice(globalThis, input_slice, false);
    }
    return this.decodeSlice(globalThis, input_slice, true);
}
|
||||
|
||||
/// Variant of `decode` that receives an already-validated `Uint8Array`.
///
/// It takes no options argument, so `stream` cannot have been requested. The
/// call therefore flushes, matching `decode` when `options.stream` is absent:
/// an incomplete trailing sequence is reported now, not buffered for a next call.
pub fn decodeWithoutTypeChecks(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, uint8array: *JSC.JSUint8Array) bun.JSError!JSValue {
    return this.decodeSlice(globalThis, uint8array.slice(), true);
}
|
||||
|
||||
/// Decodes `buffer_slice` according to `this.encoding` and returns a JS string.
///
/// `flush` is true for a non-streaming call. When false, incomplete trailing
/// input is kept on `this` for the next call. Encodings other than
/// windows-1252, UTF-8 and UTF-16LE/BE throw an invalid-arguments error.
fn decodeSlice(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, buffer_slice: []const u8, comptime flush: bool) bun.JSError!JSValue {
    switch (this.encoding) {
        EncodingLabel.latin1 => {
            if (strings.isAllASCII(buffer_slice)) {
                return ZigString.init(buffer_slice).toJS(globalThis);
            }

            // It's unintuitive that we encode Latin1 as UTF16 even though the engine natively supports Latin1 strings...
            // However, this is also what WebKit seems to do.
            //
            // It's not clear why we couldn't just use Latin1 here, but test failures proved it necessary.
            const utf16_len = strings.elementLengthLatin1IntoUTF16([]const u8, buffer_slice);
            const utf16_buf = try globalThis.allocator().alloc(u16, utf16_len);

            const copied = strings.copyLatin1IntoUTF16([]u16, utf16_buf, []const u8, buffer_slice);
            return ZigString.toExternalU16(utf16_buf.ptr, copied.written, globalThis);
        },
        EncodingLabel.@"UTF-8" => {
            // `owns_input` is true when `utf8_input` was heap-allocated here
            // (leftover bytes from the previous call joined with this chunk).
            const utf8_input, const owns_input = joined_input: {
                const has_bom = !this.ignore_bom and strings.hasPrefixComptime(buffer_slice, "\xef\xbb\xbf");
                const without_bom = if (has_bom) buffer_slice[3..] else buffer_slice;

                if (this.buffered.len > 0) {
                    // Runs when this scope exits, including on allocation failure.
                    defer this.buffered.len = 0;
                    const carried = this.buffered.slice();
                    const joined = try bun.default_allocator.alloc(u8, without_bom.len + carried.len);
                    @memcpy(joined[0..carried.len], carried);
                    @memcpy(joined[carried.len..][0..without_bom.len], without_bom);
                    break :joined_input .{ joined, true };
                }

                break :joined_input .{ without_bom, false };
            };

            const maybe_decode_result = switch (this.fatal) {
                inline else => |fail_if_invalid| strings.toUTF16AllocMaybeBuffered(bun.default_allocator, utf8_input, fail_if_invalid, flush) catch |err| {
                    if (owns_input) bun.default_allocator.free(utf8_input);
                    if (comptime fail_if_invalid) {
                        if (err == error.InvalidByteSequence) {
                            return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("Invalid byte sequence", .{}).throw();
                        }
                    }

                    bun.assert(err == error.OutOfMemory);
                    return globalThis.throwOutOfMemory();
                },
            };

            const decode_result = maybe_decode_result orelse {
                // No conversion result: hand the bytes to the engine directly.
                bun.debugAssert(utf8_input.len == 0 or !owns_input);

                // Experiment: using mimalloc directly is slightly slower
                return ZigString.init(utf8_input).toJS(globalThis);
            };

            if (owns_input) bun.default_allocator.free(utf8_input);
            const decoded, const leftover, const leftover_len = decode_result;
            bun.assert(this.buffered.len == 0);
            if (comptime !flush) {
                // Streaming: remember the incomplete tail for the next call.
                if (leftover_len != 0) {
                    this.buffered.buf = leftover;
                    this.buffered.len = leftover_len;
                }
            }
            return ZigString.toExternalU16(decoded.ptr, decoded.len, globalThis);
        },

        inline .@"UTF-16LE", .@"UTF-16BE" => |utf16_encoding| {
            const is_big_endian = comptime utf16_encoding == .@"UTF-16BE";
            const bom = if (comptime utf16_encoding == .@"UTF-16LE") "\xff\xfe" else "\xfe\xff";
            const has_bom = !this.ignore_bom and strings.hasPrefixComptime(buffer_slice, bom);
            const input = if (has_bom) buffer_slice[2..] else buffer_slice;

            var decoded, const saw_error = try this.decodeUTF16(input, is_big_endian, flush);

            if (saw_error and this.fatal) {
                decoded.deinit(bun.default_allocator);
                return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("The encoded data was not valid {s} data", .{@tagName(utf16_encoding)}).throw();
            }

            // NOTE(review): `decoded` is not freed on this path. If `toJS`
            // copies the code units, this leaks the list; confirm ownership.
            var output = bun.String.fromUTF16(decoded.items);
            return output.toJS(globalThis);
        },
        else => {
            return globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{});
        },
    }
}
|
||||
|
||||
/// JS `new TextDecoder(label?, options?)`.
///
/// `label` must be a string naming a supported encoding, or `undefined` for
/// UTF-8. `options`, when passed, must be an object; its `fatal` and
/// `ignoreBOM` properties, when present, must be booleans. Anything else
/// throws an invalid-arguments error.
pub fn constructor(globalThis: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) bun.JSError!*TextDecoder {
    const args = callframe.arguments_old(2);
    const arguments: []const JSC.JSValue = args.ptr[0..args.len];

    var decoder = TextDecoder{};

    if (arguments.len == 0) {
        return TextDecoder.new(decoder);
    }

    // encoding
    const encoding_arg = arguments[0];
    if (encoding_arg.isString()) {
        var str = try encoding_arg.toSlice(globalThis, bun.default_allocator);
        defer if (str.isAllocated()) str.deinit();

        decoder.encoding = EncodingLabel.which(str.slice()) orelse {
            return globalThis.throwInvalidArguments("Unsupported encoding label \"{s}\"", .{str.slice()});
        };
    } else if (encoding_arg.isUndefined()) {
        // default to utf-8
        decoder.encoding = EncodingLabel.@"UTF-8";
    } else {
        return globalThis.throwInvalidArguments("TextDecoder(encoding) label is invalid", .{});
    }

    if (arguments.len >= 2) {
        const options = arguments[1];

        if (!options.isObject()) {
            return globalThis.throwInvalidArguments("TextDecoder(options) is invalid", .{});
        }

        if (try options.get(globalThis, "fatal")) |fatal| {
            if (!fatal.isBoolean()) {
                return globalThis.throwInvalidArguments("TextDecoder(options) fatal is invalid. Expected boolean value", .{});
            }
            decoder.fatal = fatal.asBoolean();
        }

        if (try options.get(globalThis, "ignoreBOM")) |ignoreBOM| {
            if (!ignoreBOM.isBoolean()) {
                return globalThis.throwInvalidArguments("TextDecoder(options) ignoreBOM is invalid. Expected boolean value", .{});
            }
            decoder.ignore_bom = ignoreBOM.asBoolean();
        }
    }

    return TextDecoder.new(decoder);
}
|
||||
|
||||
const TextDecoder = @This();
|
||||
|
||||
const std = @import("std");
|
||||
const bun = @import("root").bun;
|
||||
const JSC = bun.JSC;
|
||||
const Output = bun.Output;
|
||||
const MutableString = bun.MutableString;
|
||||
const strings = bun.strings;
|
||||
const string = bun.string;
|
||||
const FeatureFlags = bun.FeatureFlags;
|
||||
const ArrayBuffer = JSC.ArrayBuffer;
|
||||
const JSUint8Array = JSC.JSUint8Array;
|
||||
const ZigString = JSC.ZigString;
|
||||
const JSInternalPromise = JSC.JSInternalPromise;
|
||||
const JSPromise = JSC.JSPromise;
|
||||
const JSValue = JSC.JSValue;
|
||||
const JSGlobalObject = JSC.JSGlobalObject;
|
||||
const EncodingLabel = JSC.WebCore.EncodingLabel;
|
||||
255
src/bun.js/webcore/TextEncoder.zig
Normal file
255
src/bun.js/webcore/TextEncoder.zig
Normal file
@@ -0,0 +1,255 @@
|
||||
/// Encodes a Latin-1 (8-bit) string as UTF-8 into a new `Uint8Array`.
///
/// Inputs up to half the stack buffer are converted on the stack and copied
/// into an array allocated by JSC; larger inputs are converted into a heap
/// allocation that is handed to the array buffer.
pub export fn TextEncoder__encode8(
    globalThis: *JSGlobalObject,
    ptr: [*]const u8,
    len: usize,
) JSValue {
    // as much as possible, rely on JSC to own the memory
    // their code is more battle-tested than bun's code
    // so we do a stack allocation here
    // and then copy into JSC memory
    // unless it's huge
    // JSC will GC Uint8Array that occupy less than 512 bytes
    // so it's extra good for that case
    // this also means there won't be reallocations for small strings
    var buf: [2048]u8 = undefined;
    const latin1 = ptr[0..len];

    // Large input: convert straight into heap memory.
    if (latin1.len > buf.len / 2) {
        const bytes = strings.allocateLatin1IntoUTF8(globalThis.bunVM().allocator, []const u8, latin1) catch {
            return globalThis.throwOutOfMemoryValue();
        };
        bun.assert(bytes.len >= latin1.len);
        return ArrayBuffer.fromBytes(bytes, .Uint8Array).toJSUnchecked(globalThis, null);
    }

    const result = strings.copyLatin1IntoUTF8(&buf, []const u8, latin1);
    const written = result.written;
    const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, written);
    bun.assert(written <= buf.len);
    bun.assert(result.read == latin1.len);
    const array_buffer = uint8array.asArrayBuffer(globalThis) orelse return .zero;
    bun.assert(written == array_buffer.len);
    @memcpy(array_buffer.byteSlice()[0..written], buf[0..written]);
    return uint8array;
}
|
||||
/// Encodes a UTF-16 string as UTF-8 into a new `Uint8Array`.
///
/// Inputs up to a quarter of the stack buffer are converted on the stack and
/// copied into an array allocated by JSC; larger inputs are converted into a
/// heap allocation that is handed to the array buffer. Throws an
/// out-of-memory error if that allocation fails.
pub export fn TextEncoder__encode16(
    globalThis: *JSGlobalObject,
    ptr: [*]const u16,
    len: usize,
) JSValue {
    // as much as possible, rely on JSC to own the memory
    // their code is more battle-tested than bun's code
    // so we do a stack allocation here
    // and then copy into JSC memory
    // unless it's huge
    // JSC will GC Uint8Array that occupy less than 512 bytes
    // so it's extra good for that case
    // this also means there won't be reallocations for small strings
    var buf: [2048]u8 = undefined;

    const slice = ptr[0..len];

    // max utf16 -> utf8 length
    if (slice.len <= buf.len / 4) {
        const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice, true);
        if (result.read == 0 or result.written == 0) {
            // Nothing was converted: the result is a lone U+FFFD.
            // NOTE(review): an empty `slice` also lands here; presumably the
            // caller never passes one. Verify against the call site.
            const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3);
            const array_buffer = uint8array.asArrayBuffer(globalThis).?;
            const replacement_char = [_]u8{ 239, 191, 189 };
            @memcpy(array_buffer.slice()[0..replacement_char.len], &replacement_char);
            return uint8array;
        }
        const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written);
        bun.assert(result.written <= buf.len);
        bun.assert(result.read == slice.len);
        const array_buffer = uint8array.asArrayBuffer(globalThis).?;
        bun.assert(result.written == array_buffer.len);
        @memcpy(array_buffer.slice()[0..result.written], buf[0..result.written]);
        return uint8array;
    } else {
        const bytes = strings.toUTF8AllocWithType(
            bun.default_allocator,
            @TypeOf(slice),
            slice,
        ) catch {
            // Throw like the other encode entry points in this file, rather
            // than returning an error object as if it were the encoded result.
            return globalThis.throwOutOfMemoryValue();
        };
        return ArrayBuffer.fromBytes(bytes, .Uint8Array).toJSUnchecked(globalThis, null);
    }
}
|
||||
|
||||
/// Encodes a UTF-16 string as UTF-8 into a new `Uint8Array`.
///
/// Small inputs are converted on the stack and copied into an array allocated
/// by JSC; larger inputs are converted into a heap allocation that is handed
/// to the array buffer.
pub export fn c(
    globalThis: *JSGlobalObject,
    ptr: [*]const u16,
    len: usize,
) JSValue {
    // as much as possible, rely on JSC to own the memory
    // their code is more battle-tested than bun's code
    // so we do a stack allocation here
    // and then copy into JSC memory
    // unless it's huge
    // JSC will GC Uint8Array that occupy less than 512 bytes
    // so it's extra good for that case
    // this also means there won't be reallocations for small strings
    var buf: [2048]u8 = undefined;

    const utf16 = ptr[0..len];

    // max utf16 -> utf8 length
    if (utf16.len > buf.len / 4) {
        const bytes = strings.toUTF8AllocWithType(
            bun.default_allocator,
            @TypeOf(utf16),
            utf16,
        ) catch {
            return globalThis.throwOutOfMemoryValue();
        };
        return ArrayBuffer.fromBytes(bytes, .Uint8Array).toJSUnchecked(globalThis, null);
    }

    const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(utf16), utf16, true);

    if (result.read == 0 or result.written == 0) {
        // Nothing was converted: the result is a lone U+FFFD.
        const fallback = JSC.JSValue.createUninitializedUint8Array(globalThis, 3);
        const fallback_buffer = fallback.asArrayBuffer(globalThis).?;
        const replacement_char = [_]u8{ 239, 191, 189 };
        @memcpy(fallback_buffer.slice()[0..replacement_char.len], &replacement_char);
        return fallback;
    }

    const written = result.written;
    const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, written);
    bun.assert(written <= buf.len);
    bun.assert(result.read == utf16.len);
    const array_buffer = uint8array.asArrayBuffer(globalThis).?;
    bun.assert(written == array_buffer.len);
    @memcpy(array_buffer.slice()[0..written], buf[0..written]);
    return uint8array;
}
|
||||
|
||||
/// Fast path for copying a rope string into a `Uint8Array` without first
/// building a temporary string.
///
/// The callbacks copy ASCII bytes into `buf`. On the first non-ASCII byte or
/// any 16-bit segment they set `any_non_ascii` and stop the iteration.
const RopeStringEncoder = struct {
    globalThis: *JSGlobalObject,
    buf: []u8,
    tail: usize = 0,
    any_non_ascii: bool = false,

    /// Recovers the encoder stored in the iterator's `data` pointer.
    fn fromIterator(it: *JSC.JSString.Iterator) *RopeStringEncoder {
        return bun.cast(*RopeStringEncoder, it.data.?);
    }

    /// Marks the input unsuitable for this fast path and stops iterating.
    fn bail(it: *JSC.JSString.Iterator) void {
        fromIterator(it).any_non_ascii = true;
        it.stop = 1;
    }

    pub fn append8(it: *JSC.JSString.Iterator, ptr: [*]const u8, len: u32) callconv(.C) void {
        const self = fromIterator(it);
        const result = strings.copyLatin1IntoUTF8StopOnNonASCII(self.buf[self.tail..], []const u8, ptr[0..len], true);
        const sentinel = std.math.maxInt(u32);
        // Both counters at maxInt(u32) is the helper's "hit non-ASCII" signal.
        if (result.read == sentinel and result.written == sentinel) {
            bail(it);
            return;
        }
        self.tail += result.written;
    }

    pub fn append16(it: *JSC.JSString.Iterator, _: [*]const u16, _: u32) callconv(.C) void {
        bail(it);
    }

    pub fn write8(it: *JSC.JSString.Iterator, ptr: [*]const u8, len: u32, offset: u32) callconv(.C) void {
        const self = fromIterator(it);
        const result = strings.copyLatin1IntoUTF8StopOnNonASCII(self.buf[offset..], []const u8, ptr[0..len], true);
        const sentinel = std.math.maxInt(u32);
        if (result.read == sentinel and result.written == sentinel) {
            bail(it);
        }
    }

    pub fn write16(it: *JSC.JSString.Iterator, _: [*]const u16, _: u32, _: u32) callconv(.C) void {
        bail(it);
    }

    /// Builds the callback table that drives this encoder.
    pub fn iter(this: *RopeStringEncoder) JSC.JSString.Iterator {
        return .{
            .data = this,
            .stop = 0,
            .append8 = append8,
            .append16 = append16,
            .write8 = write8,
            .write16 = write16,
        };
    }
};
|
||||
|
||||
/// Encodes an 8-bit rope string into a `Uint8Array`, or returns `undefined`
/// when the string holds anything other than ASCII so the caller can fall back.
///
/// This fast path is only suitable for ASCII strings.
/// It's not suitable for UTF-16 strings, because getting the byteLength is unpredictable.
/// It also isn't usable for latin1 strings which contain non-ascii characters.
pub export fn TextEncoder__encodeRopeString(
    globalThis: *JSGlobalObject,
    rope_str: *JSC.JSString,
) JSValue {
    if (comptime Environment.allow_assert) bun.assert(rope_str.is8Bit());

    var stack_buf: [2048]u8 = undefined;
    const length = rope_str.length();
    const needs_heap = length > stack_buf.len / 2;

    // Long strings are written directly into the final array; short ones go
    // through the stack buffer and are copied out at the end.
    var array: JSValue = .zero;
    var destination: []u8 = &stack_buf;
    if (needs_heap) {
        array = JSC.JSValue.createUninitializedUint8Array(globalThis, length);
        array.ensureStillAlive();
        destination = array.asArrayBuffer(globalThis).?.slice();
    }

    var encoder = RopeStringEncoder{
        .globalThis = globalThis,
        .buf = destination,
    };
    var iter = encoder.iter();
    array.ensureStillAlive();
    rope_str.iterator(globalThis, &iter);
    array.ensureStillAlive();

    if (encoder.any_non_ascii) return .undefined;

    if (array == .zero) {
        array = JSC.JSValue.createUninitializedUint8Array(globalThis, length);
        array.ensureStillAlive();
        @memcpy(array.asArrayBuffer(globalThis).?.ptr[0..length], destination[0..length]);
    }

    return array;
}
|
||||
|
||||
/// Encodes UTF-16 input as UTF-8 into the caller's buffer.
///
/// Returns `{ read, written }` as two `u32`s bit-cast into a `u64`. If nothing
/// was read or written and the buffer has room, a U+FFFD is written and
/// reported as one unit read, three bytes written.
pub export fn TextEncoder__encodeInto16(
    input_ptr: [*]const u16,
    input_len: usize,
    buf_ptr: [*]u8,
    buf_len: usize,
) u64 {
    const source = input_ptr[0..input_len];
    const destination = buf_ptr[0..buf_len];

    var result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(destination, []const u16, source, false);

    const no_progress = result.read == 0 or result.written == 0;
    if (destination.len >= 3 and no_progress) {
        const replacement_char = [_]u8{ 239, 191, 189 };
        @memcpy(buf_ptr[0..replacement_char.len], &replacement_char);
        result.read = 1;
        result.written = 3;
    }

    const packed_counts: [2]u32 = .{ result.read, result.written };
    return @bitCast(packed_counts);
}
|
||||
|
||||
/// Encodes Latin-1 input as UTF-8 into the caller's buffer.
///
/// Returns `{ read, written }` as two `u32`s bit-cast into a `u64`.
pub export fn TextEncoder__encodeInto8(
    input_ptr: [*]const u8,
    input_len: usize,
    buf_ptr: [*]u8,
    buf_len: usize,
) u64 {
    const source = input_ptr[0..input_len];
    const destination = buf_ptr[0..buf_len];

    const result: strings.EncodeIntoResult = strings.copyLatin1IntoUTF8(destination, []const u8, source);

    const packed_counts: [2]u32 = .{ result.read, result.written };
    return @bitCast(packed_counts);
}
|
||||
|
||||
const std = @import("std");
|
||||
const bun = @import("root").bun;
|
||||
const strings = bun.strings;
|
||||
const JSC = bun.JSC;
|
||||
const Environment = bun.Environment;
|
||||
const JSGlobalObject = JSC.JSGlobalObject;
|
||||
const JSValue = JSC.JSValue;
|
||||
const ArrayBuffer = JSC.ArrayBuffer;
|
||||
const TextEncoder = @This();
|
||||
213
src/bun.js/webcore/TextEncoderStreamEncoder.zig
Normal file
213
src/bun.js/webcore/TextEncoderStreamEncoder.zig
Normal file
@@ -0,0 +1,213 @@
|
||||
pending_lead_surrogate: ?u16 = null,
|
||||
|
||||
const log = Output.scoped(.TextEncoderStreamEncoder, false);
|
||||
|
||||
pub usingnamespace JSC.Codegen.JSTextEncoderStreamEncoder;
|
||||
pub usingnamespace bun.New(TextEncoderStreamEncoder);
|
||||
|
||||
/// Finalizer hook: destroys this encoder.
pub fn finalize(this: *TextEncoderStreamEncoder) void {
    TextEncoderStreamEncoder.destroy(this);
}
|
||||
|
||||
/// JS constructor: creates an encoder with no pending lead surrogate.
/// Both arguments are ignored.
pub fn constructor(_: *JSGlobalObject, _: *JSC.CallFrame) bun.JSError!*TextEncoderStreamEncoder {
    const initial: TextEncoderStreamEncoder = .{};
    return TextEncoderStreamEncoder.new(initial);
}
|
||||
|
||||
/// JS `encode(chunk)`: encodes one string chunk to UTF-8 bytes.
///
/// Throws when called without arguments. 16-bit strings go to `encodeUTF16`,
/// all others to `encodeLatin1`.
pub fn encode(this: *TextEncoderStreamEncoder, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) bun.JSError!JSValue {
    const arguments = callFrame.arguments_old(1).slice();
    if (arguments.len == 0) {
        return globalObject.throwNotEnoughArguments("TextEncoderStreamEncoder.encode", 1, arguments.len);
    }

    const str = try arguments[0].getZigString(globalObject);

    return if (str.is16Bit())
        this.encodeUTF16(globalObject, str.utf16SliceAligned())
    else
        this.encodeLatin1(globalObject, str.slice());
}
|
||||
|
||||
/// Variant of `encode` that receives an already-validated `JSString`.
pub fn encodeWithoutTypeChecks(this: *TextEncoderStreamEncoder, globalObject: *JSC.JSGlobalObject, input: *JSC.JSString) JSValue {
    const str = input.getZigString(globalObject);

    return if (str.is16Bit())
        this.encodeUTF16(globalObject, str.utf16SliceAligned())
    else
        this.encodeLatin1(globalObject, str.slice());
}
|
||||
|
||||
/// Encodes a Latin-1 chunk to UTF-8 and returns it as a `Uint8Array`.
///
/// An empty chunk returns an empty array and leaves any pending lead
/// surrogate untouched. Otherwise a pending lead surrogate from an earlier
/// UTF-16 chunk cannot be completed, so it is emitted as U+FFFD ahead of
/// this chunk's bytes.
fn encodeLatin1(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, input: []const u8) JSValue {
    log("encodeLatin1: \"{s}\"", .{input});

    if (input.len == 0) return JSUint8Array.createEmpty(globalObject);

    // no latin1 surrogate pairs
    const had_pending_lead = this.pending_lead_surrogate != null;
    this.pending_lead_surrogate = null;
    const prepend_replacement_len: usize = if (had_pending_lead) 3 else 0;

    // In a previous benchmark, counting the length took about as much time as allocating the buffer.
    //
    // Benchmark    Time %    CPU (ns)    Iterations    Ratio
    // 288.00 ms   13.5%    288.00 ms           simdutf::arm64::implementation::convert_latin1_to_utf8(char const*, unsigned long, char*) const
    // 278.00 ms   13.0%    278.00 ms           simdutf::arm64::implementation::utf8_length_from_latin1(char const*, unsigned long) const
    //
    //
    var buffer = std.ArrayList(u8).initCapacity(bun.default_allocator, input.len + prepend_replacement_len) catch {
        return globalObject.throwOutOfMemoryValue();
    };
    if (had_pending_lead) {
        buffer.appendSliceAssumeCapacity(&[3]u8{ 0xef, 0xbf, 0xbd });
    }

    // Convert into the spare capacity, growing whenever the converter stalls
    // or the buffer fills up with input still left.
    var remain = input;
    while (remain.len > 0) {
        const result = strings.copyLatin1IntoUTF8(buffer.unusedCapacitySlice(), []const u8, remain);

        buffer.items.len += result.written;
        remain = remain[result.read..];

        const stalled = result.written == 0 and result.read == 0;
        if (stalled) {
            buffer.ensureUnusedCapacity(2) catch {
                buffer.deinit();
                return globalObject.throwOutOfMemoryValue();
            };
            continue;
        }

        const full_with_input_left = buffer.items.len == buffer.capacity and remain.len > 0;
        if (full_with_input_left) {
            buffer.ensureTotalCapacity(buffer.items.len + remain.len + 1) catch {
                buffer.deinit();
                return globalObject.throwOutOfMemoryValue();
            };
        }
    }

    if (comptime Environment.isDebug) {
        // wrap in comptime if so simdutf isn't called in a release build here.
        bun.debugAssert(buffer.items.len == (bun.simdutf.length.utf8.from.latin1(input) + prepend_replacement_len));
    }

    return JSC.JSUint8Array.fromBytes(globalObject, buffer.items);
}
|
||||
|
||||
/// Encodes one UTF-16 chunk of the stream as UTF-8 and returns it as a `Uint8Array`.
///
/// A lead surrogate left pending by the previous call is resolved first: when
/// `input` starts with a trail surrogate the two are combined into one code
/// point, otherwise U+FFFD (`EF BF BD`) is emitted in its place. If the
/// slow-path conversion returns a lead surrogate, it is stored in
/// `pending_lead_surrogate` for the next call.
///
/// Returns an empty array for empty input, and throws an out-of-memory error
/// on `globalObject` when an allocation fails.
fn encodeUTF16(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, input: []const u16) JSValue {
    log("encodeUTF16: \"{}\"", .{bun.fmt.utf16(input)});

    if (input.len == 0) return JSUint8Array.createEmpty(globalObject);

    // Bytes that must be written ahead of the converted chunk.
    const Prepend = struct {
        bytes: [4]u8,
        len: u3,

        /// UTF-8 encoding of U+FFFD, padded to four bytes.
        pub const replacement: @This() = .{ .bytes = .{ 0xef, 0xbf, 0xbd, 0 }, .len = 3 };

        pub fn fromSequence(seq: [4]u8, length: u3) @This() {
            return .{ .bytes = seq, .len = length };
        }
    };

    var remain = input;

    const prepend: ?Prepend = prepend: {
        if (this.pending_lead_surrogate) |lead| {
            this.pending_lead_surrogate = null;
            const maybe_trail = remain[0];
            if (strings.u16IsTrail(maybe_trail)) {
                const converted = strings.utf16CodepointWithFFFD([]const u16, &.{ lead, maybe_trail });
                // shouldn't fail because `u16IsTrail` is true and `pending_lead_surrogate` is always
                // a valid lead.
                bun.debugAssert(!converted.fail);

                const sequence = strings.wtf8Sequence(converted.code_point);

                remain = remain[1..];
                if (remain.len == 0) {
                    // The chunk was only the missing trail surrogate; no buffer needed.
                    return JSUint8Array.fromBytesCopy(
                        globalObject,
                        sequence[0..converted.utf8Width()],
                    );
                }

                break :prepend Prepend.fromSequence(sequence, converted.utf8Width());
            }

            // The pending lead was not followed by a trail: it becomes U+FFFD.
            break :prepend Prepend.replacement;
        }
        break :prepend null;
    };

    const length = bun.simdutf.length.utf8.from.utf16.le(remain);

    var buf = std.ArrayList(u8).initCapacity(
        bun.default_allocator,
        length + @as(usize, if (prepend) |pre| pre.len else 0),
    ) catch {
        return globalObject.throwOutOfMemoryValue();
    };

    if (prepend) |*pre| {
        buf.appendSliceAssumeCapacity(pre.bytes[0..pre.len]);
    }

    const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(remain, buf.unusedCapacitySlice());

    switch (result.status) {
        else => {
            // Slow path: there was invalid UTF-16, so we need to convert it without simdutf.
            const lead_surrogate = strings.toUTF8ListWithTypeBun(&buf, []const u16, remain, true) catch {
                buf.deinit();
                return globalObject.throwOutOfMemoryValue();
            };

            if (lead_surrogate) |pending_lead| {
                this.pending_lead_surrogate = pending_lead;
                if (buf.items.len == 0) {
                    // Nothing is handed to JS on this path, so the list is never
                    // passed to `fromBytes`; release its capacity instead of leaking it.
                    buf.deinit();
                    return JSUint8Array.createEmpty(globalObject);
                }
            }

            // NOTE(review): `fromBytes` presumably takes ownership of `buf.items`
            // (no `deinit` follows on this path) — confirm.
            return JSC.JSUint8Array.fromBytes(globalObject, buf.items);
        },
        .success => {
            buf.items.len += result.count;
            return JSC.JSUint8Array.fromBytes(globalObject, buf.items);
        },
    }
}
|
||||
|
||||
/// JS-facing `flush`: ignores the call frame and defers to `flushBody`.
pub fn flush(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, _: *JSC.CallFrame) bun.JSError!JSValue {
    return this.flushBody(globalObject);
}
|
||||
|
||||
/// Variant of `flush` that takes no call frame; defers to `flushBody`.
pub fn flushWithoutTypeChecks(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject) JSValue {
    return this.flushBody(globalObject);
}
|
||||
|
||||
/// Produces the bytes to emit when the stream is flushed.
///
/// Returns the UTF-8 encoding of U+FFFD (`EF BF BD`) when a lead surrogate is
/// still pending, and an empty `Uint8Array` otherwise. The pending surrogate
/// is only inspected here, not cleared.
fn flushBody(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject) JSValue {
    if (this.pending_lead_surrogate != null) {
        return JSUint8Array.fromBytesCopy(globalObject, &.{ 0xef, 0xbf, 0xbd });
    }

    return JSUint8Array.createEmpty(globalObject);
}
|
||||
|
||||
const TextEncoderStreamEncoder = @This();
|
||||
|
||||
const std = @import("std");
|
||||
const bun = @import("root").bun;
|
||||
const JSC = bun.JSC;
|
||||
const Output = bun.Output;
|
||||
const MutableString = bun.MutableString;
|
||||
const strings = bun.strings;
|
||||
const string = bun.string;
|
||||
const FeatureFlags = bun.FeatureFlags;
|
||||
const ArrayBuffer = JSC.ArrayBuffer;
|
||||
const JSUint8Array = JSC.JSUint8Array;
|
||||
const ZigString = JSC.ZigString;
|
||||
const JSInternalPromise = JSC.JSInternalPromise;
|
||||
const JSPromise = JSC.JSPromise;
|
||||
const JSValue = JSC.JSValue;
|
||||
const JSGlobalObject = JSC.JSGlobalObject;
|
||||
const EncodingLabel = JSC.WebCore.EncodingLabel;
|
||||
const Environment = bun.Environment;
|
||||
@@ -35,949 +35,10 @@ const Task = JSC.Task;
|
||||
|
||||
const picohttp = bun.picohttp;
|
||||
|
||||
/// Exported entry points that encode Latin-1 or UTF-16 string data as UTF-8,
/// either into a freshly created `Uint8Array` (`encode*`) or into a
/// caller-provided buffer (`encodeInto*`).
pub const TextEncoder = struct {
    /// Encodes `len` Latin-1 bytes at `ptr` as UTF-8 and returns a `Uint8Array`.
    ///
    /// Inputs of at most half the stack buffer (1024 bytes) are converted on the
    /// stack and copied into a new array; larger inputs are converted into a
    /// heap allocation. Throws out-of-memory on `globalThis` if that allocation fails.
    pub export fn TextEncoder__encode8(
        globalThis: *JSGlobalObject,
        ptr: [*]const u8,
        len: usize,
    ) JSValue {
        // as much as possible, rely on JSC to own the memory
        // their code is more battle-tested than bun's code
        // so we do a stack allocation here
        // and then copy into JSC memory
        // unless it's huge
        // JSC will GC Uint8Array that occupy less than 512 bytes
        // so it's extra good for that case
        // this also means there won't be reallocations for small strings
        var buf: [2048]u8 = undefined;
        const slice = ptr[0..len];

        if (slice.len <= buf.len / 2) {
            const result = strings.copyLatin1IntoUTF8(&buf, []const u8, slice);
            const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written);
            bun.assert(result.written <= buf.len);
            bun.assert(result.read == slice.len);
            const array_buffer = uint8array.asArrayBuffer(globalThis) orelse return .zero;
            bun.assert(result.written == array_buffer.len);
            @memcpy(array_buffer.byteSlice()[0..result.written], buf[0..result.written]);
            return uint8array;
        } else {
            const bytes = strings.allocateLatin1IntoUTF8(globalThis.bunVM().allocator, []const u8, slice) catch {
                return globalThis.throwOutOfMemoryValue();
            };
            bun.assert(bytes.len >= slice.len);
            return ArrayBuffer.fromBytes(bytes, .Uint8Array).toJSUnchecked(globalThis, null);
        }
    }

    /// Encodes `len` UTF-16 code units at `ptr` as UTF-8 and returns a `Uint8Array`.
    ///
    /// Inputs of at most a quarter of the stack buffer (512 code units) are
    /// converted on the stack; when that conversion reads or writes nothing the
    /// result is the three bytes of U+FFFD. Larger inputs are converted into a
    /// heap allocation. Throws out-of-memory on `globalThis` if that allocation
    /// fails, matching `TextEncoder__encode8`.
    pub export fn TextEncoder__encode16(
        globalThis: *JSGlobalObject,
        ptr: [*]const u16,
        len: usize,
    ) JSValue {
        // as much as possible, rely on JSC to own the memory
        // their code is more battle-tested than bun's code
        // so we do a stack allocation here
        // and then copy into JSC memory
        // unless it's huge
        // JSC will GC Uint8Array that occupy less than 512 bytes
        // so it's extra good for that case
        // this also means there won't be reallocations for small strings
        var buf: [2048]u8 = undefined;

        const slice = ptr[0..len];

        // max utf16 -> utf8 length
        if (slice.len <= buf.len / 4) {
            const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice, true);
            if (result.read == 0 or result.written == 0) {
                const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3);
                const array_buffer = uint8array.asArrayBuffer(globalThis).?;
                const replacement_char = [_]u8{ 239, 191, 189 };
                @memcpy(array_buffer.slice()[0..replacement_char.len], &replacement_char);
                return uint8array;
            }
            const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written);
            bun.assert(result.written <= buf.len);
            bun.assert(result.read == slice.len);
            const array_buffer = uint8array.asArrayBuffer(globalThis).?;
            bun.assert(result.written == array_buffer.len);
            @memcpy(array_buffer.slice()[0..result.written], buf[0..result.written]);
            return uint8array;
        } else {
            const bytes = strings.toUTF8AllocWithType(
                bun.default_allocator,
                @TypeOf(slice),
                slice,
            ) catch {
                // Allocation failure is an out-of-memory condition, not an
                // invalid-arguments one; report it like the sibling encoders do.
                return globalThis.throwOutOfMemoryValue();
            };
            return ArrayBuffer.fromBytes(bytes, .Uint8Array).toJSUnchecked(globalThis, null);
        }
    }

    /// Performs the same UTF-16 to UTF-8 conversion as `TextEncoder__encode16`.
    ///
    /// NOTE(review): the exported symbol name `c` looks accidental — confirm
    /// whether anything links against it before renaming or removing it.
    pub export fn c(
        globalThis: *JSGlobalObject,
        ptr: [*]const u16,
        len: usize,
    ) JSValue {
        // as much as possible, rely on JSC to own the memory
        // their code is more battle-tested than bun's code
        // so we do a stack allocation here
        // and then copy into JSC memory
        // unless it's huge
        // JSC will GC Uint8Array that occupy less than 512 bytes
        // so it's extra good for that case
        // this also means there won't be reallocations for small strings
        var buf: [2048]u8 = undefined;

        const slice = ptr[0..len];

        // max utf16 -> utf8 length
        if (slice.len <= buf.len / 4) {
            const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice, true);
            if (result.read == 0 or result.written == 0) {
                const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3);
                const array_buffer = uint8array.asArrayBuffer(globalThis).?;
                const replacement_char = [_]u8{ 239, 191, 189 };
                @memcpy(array_buffer.slice()[0..replacement_char.len], &replacement_char);
                return uint8array;
            }
            const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written);
            bun.assert(result.written <= buf.len);
            bun.assert(result.read == slice.len);
            const array_buffer = uint8array.asArrayBuffer(globalThis).?;
            bun.assert(result.written == array_buffer.len);
            @memcpy(array_buffer.slice()[0..result.written], buf[0..result.written]);
            return uint8array;
        } else {
            const bytes = strings.toUTF8AllocWithType(
                bun.default_allocator,
                @TypeOf(slice),
                slice,
            ) catch {
                return globalThis.throwOutOfMemoryValue();
            };
            return ArrayBuffer.fromBytes(bytes, .Uint8Array).toJSUnchecked(globalThis, null);
        }
    }

    // This is a fast path for copying a Rope string into a Uint8Array.
    // This keeps us from an extra string temporary allocation
    const RopeStringEncoder = struct {
        globalThis: *JSGlobalObject,
        /// Destination the rope's pieces are copied into.
        buf: []u8,
        /// Write position in `buf` used by `append8`.
        tail: usize = 0,
        /// Set once a 16-bit piece, or a piece the ASCII-only copy rejected, is seen.
        any_non_ascii: bool = false,

        /// Copies an 8-bit piece at the current `tail`. When the copy helper
        /// returns `maxInt(u32)` for both counts, iteration is stopped and
        /// `any_non_ascii` is set instead of advancing `tail`.
        pub fn append8(it: *JSC.JSString.Iterator, ptr: [*]const u8, len: u32) callconv(.C) void {
            const this = bun.cast(*RopeStringEncoder, it.data.?);
            const result = strings.copyLatin1IntoUTF8StopOnNonASCII(this.buf[this.tail..], []const u8, ptr[0..len], true);
            if (result.read == std.math.maxInt(u32) and result.written == std.math.maxInt(u32)) {
                it.stop = 1;
                this.any_non_ascii = true;
            } else {
                this.tail += result.written;
            }
        }

        /// A 16-bit piece cannot use this fast path: flag it and stop iterating.
        pub fn append16(it: *JSC.JSString.Iterator, _: [*]const u16, _: u32) callconv(.C) void {
            const this = bun.cast(*RopeStringEncoder, it.data.?);
            this.any_non_ascii = true;
            it.stop = 1;
        }

        /// Like `append8`, but writes at the given `offset` and leaves `tail` untouched.
        pub fn write8(it: *JSC.JSString.Iterator, ptr: [*]const u8, len: u32, offset: u32) callconv(.C) void {
            const this = bun.cast(*RopeStringEncoder, it.data.?);
            const result = strings.copyLatin1IntoUTF8StopOnNonASCII(this.buf[offset..], []const u8, ptr[0..len], true);
            if (result.read == std.math.maxInt(u32) and result.written == std.math.maxInt(u32)) {
                it.stop = 1;
                this.any_non_ascii = true;
            }
        }

        /// A 16-bit piece cannot use this fast path: flag it and stop iterating.
        pub fn write16(it: *JSC.JSString.Iterator, _: [*]const u16, _: u32, _: u32) callconv(.C) void {
            const this = bun.cast(*RopeStringEncoder, it.data.?);
            this.any_non_ascii = true;
            it.stop = 1;
        }

        /// Builds the iterator whose callbacks feed this encoder.
        pub fn iter(this: *RopeStringEncoder) JSC.JSString.Iterator {
            return .{
                .data = this,
                .stop = 0,
                .append8 = append8,
                .append16 = append16,
                .write8 = write8,
                .write16 = write16,
            };
        }
    };

    // This fast path is only suitable for ASCII strings
    // It's not suitable for UTF-16 strings, because getting the byteLength is unpredictable
    // It also isn't usable for latin1 strings which contain non-ascii characters
    /// Copies an 8-bit rope string into a `Uint8Array` of `rope_str.length()` bytes.
    ///
    /// Returns `.undefined` when the encoder flags non-ASCII content. Strings
    /// longer than half the stack buffer are written straight into the result
    /// array; shorter ones go through the stack buffer and are copied afterwards.
    pub export fn TextEncoder__encodeRopeString(
        globalThis: *JSGlobalObject,
        rope_str: *JSC.JSString,
    ) JSValue {
        if (comptime Environment.allow_assert) bun.assert(rope_str.is8Bit());
        var stack_buf: [2048]u8 = undefined;
        var buf_to_use: []u8 = &stack_buf;
        const length = rope_str.length();
        var array: JSValue = .zero;
        if (length > stack_buf.len / 2) {
            array = JSC.JSValue.createUninitializedUint8Array(globalThis, length);
            array.ensureStillAlive();
            buf_to_use = array.asArrayBuffer(globalThis).?.slice();
        }
        var encoder = RopeStringEncoder{
            .globalThis = globalThis,
            .buf = buf_to_use,
        };
        var iter = encoder.iter();
        array.ensureStillAlive();
        rope_str.iterator(globalThis, &iter);
        array.ensureStillAlive();

        if (encoder.any_non_ascii) {
            return .undefined;
        }

        // Stack path: the result array has not been created yet.
        if (array == .zero) {
            array = JSC.JSValue.createUninitializedUint8Array(globalThis, length);
            array.ensureStillAlive();
            @memcpy(array.asArrayBuffer(globalThis).?.ptr[0..length], buf_to_use[0..length]);
        }

        return array;
    }

    /// Encodes UTF-16 input into the caller's buffer.
    ///
    /// Returns `{ read, written }` as two `u32`s bit-cast into a `u64`. When the
    /// copy reads or writes nothing and the buffer holds at least three bytes,
    /// U+FFFD is written and the result is reported as one unit read, three
    /// bytes written.
    pub export fn TextEncoder__encodeInto16(
        input_ptr: [*]const u16,
        input_len: usize,
        buf_ptr: [*]u8,
        buf_len: usize,
    ) u64 {
        const output = buf_ptr[0..buf_len];
        const input = input_ptr[0..input_len];
        var result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, []const u16, input, false);
        if (output.len >= 3 and (result.read == 0 or result.written == 0)) {
            const replacement_char = [_]u8{ 239, 191, 189 };
            @memcpy(buf_ptr[0..replacement_char.len], &replacement_char);
            result.read = 1;
            result.written = 3;
        }
        const sized: [2]u32 = .{ result.read, result.written };
        return @bitCast(sized);
    }

    /// Encodes Latin-1 input into the caller's buffer.
    ///
    /// Returns `{ read, written }` as two `u32`s bit-cast into a `u64`.
    pub export fn TextEncoder__encodeInto8(
        input_ptr: [*]const u8,
        input_len: usize,
        buf_ptr: [*]u8,
        buf_len: usize,
    ) u64 {
        const output = buf_ptr[0..buf_len];
        const input = input_ptr[0..input_len];
        const result: strings.EncodeIntoResult =
            strings.copyLatin1IntoUTF8(output, []const u8, input);
        const sized: [2]u32 = .{ result.read, result.written };
        return @bitCast(sized);
    }
};
|
||||
|
||||
// Reference each exported TextEncoder entry point at comptime
// (presumably so the declarations are analyzed and their exports emitted).
comptime {
    // Allocating encoders.
    _ = TextEncoder.TextEncoder__encode16;
    _ = TextEncoder.TextEncoder__encode8;
    _ = TextEncoder.TextEncoder__encodeRopeString;
    // Encoders that write into a caller-provided buffer.
    _ = TextEncoder.TextEncoder__encodeInto16;
    _ = TextEncoder.TextEncoder__encodeInto8;
}
|
||||
|
||||
/// https://encoding.spec.whatwg.org/encodings.json
///
/// Encodings named by the WHATWG Encoding Standard. `which` resolves a label
/// to one of these; it currently recognises only the UTF-8, UTF-16LE,
/// UTF-16BE and windows-1252 label families.
pub const EncodingLabel = enum {
    @"UTF-8",
    IBM866,
    @"ISO-8859-2",
    @"ISO-8859-3",
    @"ISO-8859-4",
    @"ISO-8859-5",
    @"ISO-8859-6",
    @"ISO-8859-7",
    @"ISO-8859-8",
    @"ISO-8859-8-I",
    @"ISO-8859-10",
    @"ISO-8859-13",
    @"ISO-8859-14",
    @"ISO-8859-15",
    @"ISO-8859-16",
    @"KOI8-R",
    @"KOI8-U",
    macintosh,
    @"windows-874",
    @"windows-1250",
    @"windows-1251",
    /// Also known as
    /// - ASCII
    /// - latin1
    @"windows-1252",
    @"windows-1253",
    @"windows-1254",
    @"windows-1255",
    @"windows-1256",
    @"windows-1257",
    @"windows-1258",
    @"x-mac-cyrillic",
    Big5,
    @"EUC-JP",
    @"ISO-2022-JP",
    Shift_JIS,
    @"EUC-KR",
    @"UTF-16BE",
    @"UTF-16LE",
    @"x-user-defined",

    pub const Map = std.enums.EnumMap(EncodingLabel, string);

    /// Lowercase name for each encoding; encodings without an explicit entry
    /// map to the empty string. Every encoding `which` can return has an entry.
    pub const label: Map = brk: {
        var map = Map.initFull("");
        map.put(EncodingLabel.@"UTF-8", "utf-8");
        map.put(EncodingLabel.@"UTF-16LE", "utf-16le");
        // `which` resolves "utf-16be", so it needs a non-empty name as well.
        map.put(EncodingLabel.@"UTF-16BE", "utf-16be");
        map.put(EncodingLabel.@"windows-1252", "windows-1252");
        break :brk map;
    };

    const utf16_names = [_]string{
        "ucs-2",
        "utf-16",
        "unicode",
        "utf-16le",
        "csunicode",
        "unicodefeff",
        "iso-10646-ucs-2",
    };

    const utf8_names = [_]string{
        "utf8",
        "utf-8",
        "unicode11utf8",
        "unicode20utf8",
        "x-unicode20utf8",
        "unicode-1-1-utf-8",
    };

    const latin1_names = [_]string{
        "l1",
        "ascii",
        "cp819",
        "cp1252",
        "ibm819",
        "latin1",
        "iso88591",
        "us-ascii",
        "x-cp1252",
        "iso8859-1",
        "iso_8859-1",
        "iso-8859-1",
        "iso-ir-100",
        "csisolatin1",
        "windows-1252",
        "ansi_x3.4-1968",
        "iso_8859-1:1987",
    };

    pub const latin1 = EncodingLabel.@"windows-1252";

    /// Resolves an encoding label to its encoding, or `null` when the label is
    /// not recognised.
    ///
    /// Leading and trailing ASCII whitespace (space, tab, CR, LF, form feed) is
    /// ignored and matching is case-insensitive. Labels are dispatched by
    /// length: up to 8 bytes, up to 16 bytes, and a single longer label.
    pub fn which(input_: string) ?EncodingLabel {
        // The Encoding Standard's ASCII whitespace set includes form feed (U+000C).
        const input = strings.trim(input_, " \t\r\n\x0C");
        const ExactMatcher = strings.ExactSizeMatcher;
        const Eight = ExactMatcher(8);
        const Sixteen = ExactMatcher(16);
        return switch (input.len) {
            1, 0 => null,
            2...8 => switch (Eight.matchLower(input)) {
                Eight.case("l1"),
                Eight.case("ascii"),
                Eight.case("cp819"),
                Eight.case("cp1252"),
                Eight.case("ibm819"),
                Eight.case("latin1"),
                Eight.case("iso88591"),
                Eight.case("us-ascii"),
                Eight.case("x-cp1252"),
                => EncodingLabel.latin1,

                Eight.case("ucs-2"),
                Eight.case("utf-16"),
                Eight.case("unicode"),
                Eight.case("utf-16le"),
                => EncodingLabel.@"UTF-16LE",

                Eight.case("utf-16be"),
                => EncodingLabel.@"UTF-16BE",

                Eight.case("utf8"), Eight.case("utf-8") => EncodingLabel.@"UTF-8",
                else => null,
            },

            9...16 => switch (Sixteen.matchLower(input)) {
                Sixteen.case("iso8859-1"),
                Sixteen.case("iso_8859-1"),
                Sixteen.case("iso-8859-1"),
                Sixteen.case("iso-ir-100"),
                Sixteen.case("csisolatin1"),
                Sixteen.case("windows-1252"),
                Sixteen.case("ansi_x3.4-1968"),
                Sixteen.case("iso_8859-1:1987"),
                => EncodingLabel.latin1,

                Sixteen.case("unicode11utf8"),
                Sixteen.case("unicode20utf8"),
                Sixteen.case("x-unicode20utf8"),
                => EncodingLabel.@"UTF-8",

                Sixteen.case("csunicode"),
                Sixteen.case("unicodefeff"),
                Sixteen.case("iso-10646-ucs-2"),
                => EncodingLabel.@"UTF-16LE",

                // The other UTF-16BE label listed by the Encoding Standard.
                Sixteen.case("unicodefffe"),
                => EncodingLabel.@"UTF-16BE",

                else => null,
            },
            else => if (strings.eqlCaseInsensitiveASCII(input, "unicode-1-1-utf-8", true))
                EncodingLabel.@"UTF-8"
            else
                null,
        };
    }
};
|
||||
|
||||
/// Chunk-by-chunk UTF-8 encoder.
///
/// The only state carried between chunks is a lead surrogate that ended the
/// previous UTF-16 chunk and is still waiting for its trail surrogate.
pub const TextEncoderStreamEncoder = struct {
    /// Lead surrogate carried over from the previous `encode` call, if any.
    pending_lead_surrogate: ?u16 = null,

    const log = Output.scoped(.TextEncoderStreamEncoder, false);

    pub usingnamespace JSC.Codegen.JSTextEncoderStreamEncoder;
    pub usingnamespace bun.New(TextEncoderStreamEncoder);

    /// Destroys the encoder via `destroy`.
    pub fn finalize(this: *TextEncoderStreamEncoder) void {
        this.destroy();
    }

    /// Creates an encoder with no pending surrogate.
    pub fn constructor(_: *JSGlobalObject, _: *JSC.CallFrame) bun.JSError!*TextEncoderStreamEncoder {
        return TextEncoderStreamEncoder.new(.{});
    }

    /// JS-facing `encode(chunk)`: converts the first argument to a string and
    /// encodes it, dispatching on whether the string is 16-bit or 8-bit.
    /// Throws when called without arguments.
    pub fn encode(this: *TextEncoderStreamEncoder, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) bun.JSError!JSValue {
        const arguments = callFrame.arguments_old(1).slice();
        if (arguments.len == 0) {
            return globalObject.throwNotEnoughArguments("TextEncoderStreamEncoder.encode", 1, arguments.len);
        }

        const str = try arguments[0].getZigString(globalObject);

        if (str.is16Bit()) {
            return this.encodeUTF16(globalObject, str.utf16SliceAligned());
        }

        return this.encodeLatin1(globalObject, str.slice());
    }

    /// Variant of `encode` that receives the chunk as a `JSString` directly.
    pub fn encodeWithoutTypeChecks(this: *TextEncoderStreamEncoder, globalObject: *JSC.JSGlobalObject, input: *JSC.JSString) JSValue {
        const str = input.getZigString(globalObject);

        if (str.is16Bit()) {
            return this.encodeUTF16(globalObject, str.utf16SliceAligned());
        }

        return this.encodeLatin1(globalObject, str.slice());
    }

    /// Encodes a Latin-1 chunk as UTF-8 and returns it as a `Uint8Array`.
    ///
    /// A pending lead surrogate cannot be completed by Latin-1 input, so it is
    /// cleared and emitted as U+FFFD (`EF BF BD`) ahead of the chunk. Returns an
    /// empty array for empty input (leaving any pending surrogate in place) and
    /// throws out-of-memory on `globalObject` when an allocation fails.
    fn encodeLatin1(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, input: []const u8) JSValue {
        log("encodeLatin1: \"{s}\"", .{input});

        if (input.len == 0) return JSUint8Array.createEmpty(globalObject);

        const prepend_replacement_len: usize = prepend_replacement: {
            if (this.pending_lead_surrogate != null) {
                this.pending_lead_surrogate = null;
                // no latin1 surrogate pairs
                break :prepend_replacement 3;
            }

            break :prepend_replacement 0;
        };
        // In a previous benchmark, counting the length took about as much time as allocating the buffer.
        //
        // Benchmark    Time %    CPU (ns)    Iterations    Ratio
        // 288.00 ms   13.5%    288.00 ms           simdutf::arm64::implementation::convert_latin1_to_utf8(char const*, unsigned long, char*) const
        // 278.00 ms   13.0%    278.00 ms           simdutf::arm64::implementation::utf8_length_from_latin1(char const*, unsigned long) const
        //
        //
        var buffer = std.ArrayList(u8).initCapacity(bun.default_allocator, input.len + prepend_replacement_len) catch {
            return globalObject.throwOutOfMemoryValue();
        };
        if (prepend_replacement_len > 0) {
            buffer.appendSliceAssumeCapacity(&[3]u8{ 0xef, 0xbf, 0xbd });
        }

        var remain = input;
        while (remain.len > 0) {
            const result = strings.copyLatin1IntoUTF8(buffer.unusedCapacitySlice(), []const u8, remain);

            buffer.items.len += result.written;
            remain = remain[result.read..];

            if (result.written == 0 and result.read == 0) {
                // No progress was made: make room for at least two more bytes and retry.
                buffer.ensureUnusedCapacity(2) catch {
                    buffer.deinit();
                    return globalObject.throwOutOfMemoryValue();
                };
            } else if (buffer.items.len == buffer.capacity and remain.len > 0) {
                // Buffer is full with input left over: grow for the rest of the input.
                buffer.ensureTotalCapacity(buffer.items.len + remain.len + 1) catch {
                    buffer.deinit();
                    return globalObject.throwOutOfMemoryValue();
                };
            }
        }

        if (comptime Environment.isDebug) {
            // wrap in comptime if so simdutf isn't called in a release build here.
            bun.debugAssert(buffer.items.len == (bun.simdutf.length.utf8.from.latin1(input) + prepend_replacement_len));
        }

        return JSC.JSUint8Array.fromBytes(globalObject, buffer.items);
    }

    /// Encodes a UTF-16 chunk as UTF-8 and returns it as a `Uint8Array`.
    ///
    /// A lead surrogate left pending by the previous call is resolved first:
    /// when `input` starts with a trail surrogate the two are combined into one
    /// code point, otherwise U+FFFD is emitted in its place. If the slow-path
    /// conversion returns a lead surrogate, it is stored in
    /// `pending_lead_surrogate` for the next call.
    ///
    /// Returns an empty array for empty input, and throws out-of-memory on
    /// `globalObject` when an allocation fails.
    fn encodeUTF16(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, input: []const u16) JSValue {
        log("encodeUTF16: \"{}\"", .{bun.fmt.utf16(input)});

        if (input.len == 0) return JSUint8Array.createEmpty(globalObject);

        // Bytes that must be written ahead of the converted chunk.
        const Prepend = struct {
            bytes: [4]u8,
            len: u3,

            /// UTF-8 encoding of U+FFFD, padded to four bytes.
            pub const replacement: @This() = .{ .bytes = .{ 0xef, 0xbf, 0xbd, 0 }, .len = 3 };

            pub fn fromSequence(seq: [4]u8, length: u3) @This() {
                return .{ .bytes = seq, .len = length };
            }
        };

        var remain = input;

        const prepend: ?Prepend = prepend: {
            if (this.pending_lead_surrogate) |lead| {
                this.pending_lead_surrogate = null;
                const maybe_trail = remain[0];
                if (strings.u16IsTrail(maybe_trail)) {
                    const converted = strings.utf16CodepointWithFFFD([]const u16, &.{ lead, maybe_trail });
                    // shouldn't fail because `u16IsTrail` is true and `pending_lead_surrogate` is always
                    // a valid lead.
                    bun.debugAssert(!converted.fail);

                    const sequence = strings.wtf8Sequence(converted.code_point);

                    remain = remain[1..];
                    if (remain.len == 0) {
                        // The chunk was only the missing trail surrogate; no buffer needed.
                        return JSUint8Array.fromBytesCopy(
                            globalObject,
                            sequence[0..converted.utf8Width()],
                        );
                    }

                    break :prepend Prepend.fromSequence(sequence, converted.utf8Width());
                }

                // The pending lead was not followed by a trail: it becomes U+FFFD.
                break :prepend Prepend.replacement;
            }
            break :prepend null;
        };

        const length = bun.simdutf.length.utf8.from.utf16.le(remain);

        var buf = std.ArrayList(u8).initCapacity(
            bun.default_allocator,
            length + @as(usize, if (prepend) |pre| pre.len else 0),
        ) catch {
            return globalObject.throwOutOfMemoryValue();
        };

        if (prepend) |*pre| {
            buf.appendSliceAssumeCapacity(pre.bytes[0..pre.len]);
        }

        const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(remain, buf.unusedCapacitySlice());

        switch (result.status) {
            else => {
                // Slow path: there was invalid UTF-16, so we need to convert it without simdutf.
                const lead_surrogate = strings.toUTF8ListWithTypeBun(&buf, []const u16, remain, true) catch {
                    buf.deinit();
                    return globalObject.throwOutOfMemoryValue();
                };

                if (lead_surrogate) |pending_lead| {
                    this.pending_lead_surrogate = pending_lead;
                    if (buf.items.len == 0) {
                        // Nothing is handed to JS on this path, so the list is never
                        // passed to `fromBytes`; release its capacity instead of leaking it.
                        buf.deinit();
                        return JSUint8Array.createEmpty(globalObject);
                    }
                }

                // NOTE(review): `fromBytes` presumably takes ownership of `buf.items`
                // (no `deinit` follows on this path) — confirm.
                return JSC.JSUint8Array.fromBytes(globalObject, buf.items);
            },
            .success => {
                buf.items.len += result.count;
                return JSC.JSUint8Array.fromBytes(globalObject, buf.items);
            },
        }
    }

    /// JS-facing `flush`: ignores the call frame and defers to `flushBody`.
    pub fn flush(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, _: *JSC.CallFrame) bun.JSError!JSValue {
        return flushBody(this, globalObject);
    }

    /// Variant of `flush` that takes no call frame; defers to `flushBody`.
    pub fn flushWithoutTypeChecks(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject) JSValue {
        return flushBody(this, globalObject);
    }

    /// Returns U+FFFD (`EF BF BD`) when a lead surrogate is still pending, and
    /// an empty `Uint8Array` otherwise. The pending surrogate is not cleared.
    fn flushBody(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject) JSValue {
        return if (this.pending_lead_surrogate == null)
            JSUint8Array.createEmpty(globalObject)
        else
            JSUint8Array.fromBytesCopy(globalObject, &.{ 0xef, 0xbf, 0xbd });
    }
};
|
||||
|
||||
pub const TextDecoder = struct {
|
||||
|
||||
// used for utf8 decoding
|
||||
buffered: struct {
|
||||
buf: [3]u8 = .{0} ** 3,
|
||||
len: u2 = 0,
|
||||
|
||||
pub fn slice(this: *@This()) []const u8 {
|
||||
return this.buf[0..this.len];
|
||||
}
|
||||
} = .{},
|
||||
|
||||
// used for utf16 decoding
|
||||
lead_byte: ?u8 = null,
|
||||
lead_surrogate: ?u16 = null,
|
||||
|
||||
ignore_bom: bool = false,
|
||||
fatal: bool = false,
|
||||
encoding: EncodingLabel = EncodingLabel.@"UTF-8",
|
||||
|
||||
pub usingnamespace bun.New(TextDecoder);
|
||||
|
||||
pub fn finalize(this: *TextDecoder) void {
|
||||
this.destroy();
|
||||
}
|
||||
|
||||
pub usingnamespace JSC.Codegen.JSTextDecoder;
|
||||
|
||||
pub fn getIgnoreBOM(
|
||||
this: *TextDecoder,
|
||||
_: *JSC.JSGlobalObject,
|
||||
) JSC.JSValue {
|
||||
return JSC.JSValue.jsBoolean(this.ignore_bom);
|
||||
}
|
||||
|
||||
pub fn getFatal(
|
||||
this: *TextDecoder,
|
||||
_: *JSC.JSGlobalObject,
|
||||
) JSC.JSValue {
|
||||
return JSC.JSValue.jsBoolean(this.fatal);
|
||||
}
|
||||
|
||||
pub fn getEncoding(
|
||||
this: *TextDecoder,
|
||||
globalThis: *JSC.JSGlobalObject,
|
||||
) JSC.JSValue {
|
||||
return ZigString.init(EncodingLabel.label.get(this.encoding).?).toJS(globalThis);
|
||||
}
|
||||
const Vector16 = std.meta.Vector(16, u16);
|
||||
const max_16_ascii: Vector16 = @splat(@as(u16, 127));
|
||||
|
||||
fn processCodeUnitUTF16(
|
||||
this: *TextDecoder,
|
||||
output: *std.ArrayListUnmanaged(u16),
|
||||
saw_error: *bool,
|
||||
code_unit: u16,
|
||||
) error{OutOfMemory}!void {
|
||||
if (this.lead_surrogate) |lead_surrogate| {
|
||||
this.lead_surrogate = null;
|
||||
|
||||
if (strings.u16IsTrail(code_unit)) {
|
||||
// TODO: why is this here?
|
||||
// const code_point = strings.u16GetSupplementary(lead_surrogate, code_unit);
|
||||
try output.appendSlice(
|
||||
bun.default_allocator,
|
||||
&.{ lead_surrogate, code_unit },
|
||||
);
|
||||
return;
|
||||
}
|
||||
try output.append(bun.default_allocator, strings.unicode_replacement);
|
||||
saw_error.* = true;
|
||||
}
|
||||
|
||||
if (strings.u16IsLead(code_unit)) {
|
||||
this.lead_surrogate = code_unit;
|
||||
return;
|
||||
}
|
||||
|
||||
if (strings.u16IsTrail(code_unit)) {
|
||||
try output.append(bun.default_allocator, strings.unicode_replacement);
|
||||
saw_error.* = true;
|
||||
return;
|
||||
}
|
||||
|
||||
try output.append(bun.default_allocator, code_unit);
|
||||
return;
|
||||
}
|
||||
|
||||
pub fn codeUnitFromBytesUTF16(
|
||||
first: u16,
|
||||
second: u16,
|
||||
comptime big_endian: bool,
|
||||
) u16 {
|
||||
return if (comptime big_endian)
|
||||
(first << 8) | second
|
||||
else
|
||||
first | (second << 8);
|
||||
}
|
||||
|
||||
pub fn decodeUTF16(
|
||||
this: *TextDecoder,
|
||||
bytes: []const u8,
|
||||
comptime big_endian: bool,
|
||||
comptime flush: bool,
|
||||
) error{OutOfMemory}!struct { std.ArrayListUnmanaged(u16), bool } {
|
||||
var output: std.ArrayListUnmanaged(u16) = .{};
|
||||
try output.ensureTotalCapacity(bun.default_allocator, @divFloor(bytes.len, 2));
|
||||
|
||||
var remain = bytes;
|
||||
var saw_error = false;
|
||||
|
||||
if (this.lead_byte) |lead_byte| {
|
||||
if (remain.len > 0) {
|
||||
this.lead_byte = null;
|
||||
|
||||
try this.processCodeUnitUTF16(
|
||||
&output,
|
||||
&saw_error,
|
||||
codeUnitFromBytesUTF16(@intCast(lead_byte), @intCast(remain[0]), big_endian),
|
||||
);
|
||||
remain = remain[1..];
|
||||
}
|
||||
}
|
||||
|
||||
var i: usize = 0;
|
||||
|
||||
while (i < remain.len -| 1) {
|
||||
try this.processCodeUnitUTF16(
|
||||
&output,
|
||||
&saw_error,
|
||||
codeUnitFromBytesUTF16(@intCast(remain[i]), @intCast(remain[i + 1]), big_endian),
|
||||
);
|
||||
i += 2;
|
||||
}
|
||||
|
||||
if (remain.len != 0 and i == remain.len - 1) {
|
||||
this.lead_byte = remain[i];
|
||||
} else {
|
||||
bun.assertWithLocation(i == remain.len, @src());
|
||||
}
|
||||
|
||||
if (comptime flush) {
|
||||
if (this.lead_byte != null or this.lead_surrogate != null) {
|
||||
this.lead_byte = null;
|
||||
this.lead_surrogate = null;
|
||||
try output.append(bun.default_allocator, strings.unicode_replacement);
|
||||
saw_error = true;
|
||||
return .{ output, saw_error };
|
||||
}
|
||||
}
|
||||
|
||||
return .{ output, saw_error };
|
||||
}
|
||||
|
||||
pub fn decode(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) bun.JSError!JSValue {
|
||||
const arguments = callframe.arguments_old(2).slice();
|
||||
|
||||
const input_slice = input_slice: {
|
||||
if (arguments.len == 0 or arguments[0].isUndefined()) {
|
||||
break :input_slice "";
|
||||
}
|
||||
|
||||
if (arguments[0].asArrayBuffer(globalThis)) |array_buffer| {
|
||||
break :input_slice array_buffer.slice();
|
||||
}
|
||||
|
||||
return globalThis.throwInvalidArguments("TextDecoder.decode expects an ArrayBuffer or TypedArray", .{});
|
||||
};
|
||||
|
||||
const stream = stream: {
|
||||
if (arguments.len > 1 and arguments[1].isObject()) {
|
||||
if (arguments[1].fastGet(globalThis, .stream)) |stream_value| {
|
||||
const stream_bool = stream_value.coerce(bool, globalThis);
|
||||
if (globalThis.hasException()) {
|
||||
return .zero;
|
||||
}
|
||||
break :stream stream_bool;
|
||||
}
|
||||
}
|
||||
|
||||
break :stream false;
|
||||
};
|
||||
|
||||
return switch (!stream) {
|
||||
inline else => |flush| this.decodeSlice(globalThis, input_slice, flush),
|
||||
};
|
||||
}
|
||||
|
||||
pub fn decodeWithoutTypeChecks(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, uint8array: *JSC.JSUint8Array) bun.JSError!JSValue {
|
||||
return this.decodeSlice(globalThis, uint8array.slice(), false);
|
||||
}
|
||||
|
||||
/// Decodes `buffer_slice` according to `this.encoding` and returns a JS string.
///
/// - `flush`: comptime flag. When `false`, an incomplete trailing UTF-8
///   sequence reported by the decoder is kept in `this.buffered` for the next
///   call instead of being finalized; it is also forwarded to the UTF-8 and
///   UTF-16 helpers.
///
/// Throws `ERR_ENCODING_INVALID_ENCODED_DATA` when `this.fatal` is set and the
/// input is invalid, an out-of-memory error when allocation fails, and an
/// invalid-arguments error for any encoding other than latin1, UTF-8,
/// UTF-16LE and UTF-16BE.
fn decodeSlice(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, buffer_slice: []const u8, comptime flush: bool) bun.JSError!JSValue {
    switch (this.encoding) {
        EncodingLabel.latin1 => {
            // Pure ASCII needs no widening; hand the bytes over directly.
            if (strings.isAllASCII(buffer_slice)) return ZigString.init(buffer_slice).toJS(globalThis);

            // It's unintuitive that we encode Latin1 as UTF16 even though the engine natively supports Latin1 strings...
            // However, this is also what WebKit seems to do.
            //
            // It's not clear why we couldn't just use Latin1 here, but test failures proved it necessary.
            const utf16_len = strings.elementLengthLatin1IntoUTF16([]const u8, buffer_slice);
            const utf16_buf = try globalThis.allocator().alloc(u16, utf16_len);
            const copied = strings.copyLatin1IntoUTF16([]u16, utf16_buf, []const u8, buffer_slice);

            return ZigString.toExternalU16(utf16_buf.ptr, copied.written, globalThis);
        },

        .@"UTF-8" => {
            // `owns_input` is true when `utf8_input` is a heap copy that this
            // function has to free (bytes carried over from a previous call
            // were joined with the new chunk).
            const utf8_input, const owns_input = joined_input: {
                const strip_bom = !this.ignore_bom and strings.hasPrefixComptime(buffer_slice, "\xef\xbb\xbf");
                const payload = if (strip_bom) buffer_slice[3..] else buffer_slice;

                if (this.buffered.len > 0) {
                    // The carried-over bytes are consumed here whether or not
                    // the allocation below succeeds.
                    defer this.buffered.len = 0;

                    const combined = try bun.default_allocator.alloc(u8, payload.len + this.buffered.len);
                    @memcpy(combined[0..this.buffered.len], this.buffered.slice());
                    @memcpy(combined[this.buffered.len..][0..payload.len], payload);
                    break :joined_input .{ combined, true };
                }

                break :joined_input .{ payload, false };
            };

            // Dispatch on `fatal` at comptime so the strict and lenient
            // decoders are separate instantiations.
            const maybe_result = switch (this.fatal) {
                inline else => |fail_if_invalid| strings.toUTF16AllocMaybeBuffered(bun.default_allocator, utf8_input, fail_if_invalid, flush) catch |err| {
                    if (owns_input) bun.default_allocator.free(utf8_input);

                    if (comptime fail_if_invalid) {
                        if (err == error.InvalidByteSequence) {
                            return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("Invalid byte sequence", .{}).throw();
                        }
                    }

                    bun.assert(err == error.OutOfMemory);
                    return globalThis.throwOutOfMemory();
                },
            };

            if (maybe_result) |result| {
                if (owns_input) bun.default_allocator.free(utf8_input);

                const utf16, const pending, const pending_len = result;
                bun.assert(this.buffered.len == 0);

                // Only a streaming (non-flushing) call keeps the incomplete tail.
                if (comptime !flush) {
                    if (pending_len != 0) {
                        this.buffered.buf = pending;
                        this.buffered.len = pending_len;
                    }
                }

                return ZigString.toExternalU16(utf16.ptr, utf16.len, globalThis);
            }

            // No UTF-16 buffer was produced: the input bytes themselves are
            // turned into the string, so they must not be a temporary copy.
            bun.debugAssert(utf8_input.len == 0 or !owns_input);

            // Experiment: using mimalloc directly is slightly slower
            return ZigString.init(utf8_input).toJS(globalThis);
        },

        inline .@"UTF-16LE", .@"UTF-16BE" => |utf16_encoding| {
            const bom = if (comptime utf16_encoding == .@"UTF-16BE") "\xfe\xff" else "\xff\xfe";
            const has_bom = !this.ignore_bom and strings.hasPrefixComptime(buffer_slice, bom);
            const payload = if (has_bom) buffer_slice[2..] else buffer_slice;

            var decoded, const saw_error = try this.decodeUTF16(payload, utf16_encoding == .@"UTF-16BE", flush);

            if (saw_error and this.fatal) {
                decoded.deinit(bun.default_allocator);
                return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("The encoded data was not valid {s} data", .{@tagName(utf16_encoding)}).throw();
            }

            // NOTE(review): `decoded` is not deinitialized on this path. Confirm
            // whether `bun.String.fromUTF16(...).toJS` takes ownership of the
            // buffer or copies it; if it copies, this leaks.
            var js_string = bun.String.fromUTF16(decoded.items);
            return js_string.toJS(globalThis);
        },

        else => return globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{}),
    }
}
|
||||
|
||||
/// Implements `new TextDecoder(label?, options?)`.
///
/// - Argument 0 (`label`): a string resolved through `EncodingLabel.which`, or
///   `undefined` for UTF-8. Any other value, or an unrecognized label, throws
///   an invalid-arguments error.
/// - Argument 1 (`options`): must be an object when present. Its `fatal` and
///   `ignoreBOM` properties, if present, must be booleans and are copied to
///   `decoder.fatal` / `decoder.ignore_bom`.
///
/// Returns the result of `TextDecoder.new` for the configured decoder, or
/// propagates the thrown `bun.JSError`.
pub fn constructor(globalThis: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) bun.JSError!*TextDecoder {
    // Neither local is ever reassigned, so both are `const`; `.slice()` is the
    // same accessor `decode` uses on the call frame arguments.
    const args = callframe.arguments_old(2);
    const arguments = args.slice();

    var decoder = TextDecoder{};

    if (arguments.len > 0) {
        // encoding
        const label_value = arguments[0];
        if (label_value.isString()) {
            var str = try label_value.toSlice(globalThis, bun.default_allocator);
            defer if (str.isAllocated()) str.deinit();

            decoder.encoding = EncodingLabel.which(str.slice()) orelse {
                return globalThis.throwInvalidArguments("Unsupported encoding label \"{s}\"", .{str.slice()});
            };
        } else if (label_value.isUndefined()) {
            // default to utf-8
            decoder.encoding = EncodingLabel.@"UTF-8";
        } else {
            return globalThis.throwInvalidArguments("TextDecoder(encoding) label is invalid", .{});
        }

        if (arguments.len >= 2) {
            const options = arguments[1];

            if (!options.isObject()) {
                return globalThis.throwInvalidArguments("TextDecoder(options) is invalid", .{});
            }

            if (try options.get(globalThis, "fatal")) |fatal| {
                if (!fatal.isBoolean()) {
                    return globalThis.throwInvalidArguments("TextDecoder(options) fatal is invalid. Expected boolean value", .{});
                }
                decoder.fatal = fatal.asBoolean();
            }

            if (try options.get(globalThis, "ignoreBOM")) |ignoreBOM| {
                if (!ignoreBOM.isBoolean()) {
                    return globalThis.throwInvalidArguments("TextDecoder(options) ignoreBOM is invalid. Expected boolean value", .{});
                }
                decoder.ignore_bom = ignoreBOM.asBoolean();
            }
        }
    }

    return TextDecoder.new(decoder);
}
|
||||
};
|
||||
pub const TextEncoder = @import("./TextEncoder.zig");
|
||||
pub const EncodingLabel = @import("./EncodingLabel.zig").EncodingLabel;
|
||||
pub const TextEncoderStreamEncoder = @import("./TextEncoderStreamEncoder.zig");
|
||||
pub const TextDecoder = @import("./TextDecoder.zig");
|
||||
|
||||
pub const Encoder = struct {
|
||||
export fn Bun__encoding__writeLatin1(input: [*]const u8, len: usize, to: [*]u8, to_len: usize, encoding: u8) usize {
|
||||
@@ -1468,22 +529,23 @@ pub const Encoder = struct {
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
comptime {
|
||||
_ = Bun__encoding__writeLatin1;
|
||||
_ = Bun__encoding__writeUTF16;
|
||||
|
||||
_ = Bun__encoding__byteLengthLatin1AsUTF8;
|
||||
_ = Bun__encoding__byteLengthUTF16AsUTF8;
|
||||
|
||||
_ = Bun__encoding__toString;
|
||||
_ = Bun__encoding__toStringUTF8;
|
||||
|
||||
_ = Bun__encoding__constructFromLatin1;
|
||||
_ = Bun__encoding__constructFromUTF16;
|
||||
}
|
||||
};
|
||||
|
||||
// Reference all declarations of `Encoder` and the `TextEncoder__*` entry
// points at comptime.
// NOTE(review): presumably needed so these symbols are analyzed and emitted
// even though no Zig code calls them — confirm against the build.
comptime {
    std.testing.refAllDecls(Encoder);
    _ = .{
        &TextEncoder.TextEncoder__encode8,
        &TextEncoder.TextEncoder__encode16,
        &TextEncoder.TextEncoder__encodeInto8,
        &TextEncoder.TextEncoder__encodeInto16,
        &TextEncoder.TextEncoder__encodeRopeString,
    };
}
|
||||
|
||||
// Reference each `Encoder.Bun__encoding__*` entry point at comptime.
// NOTE(review): presumably needed so these symbols are analyzed and emitted
// even though no Zig code calls them — confirm against the build.
comptime {
    _ = .{
        &Encoder.Bun__encoding__writeLatin1,
        &Encoder.Bun__encoding__writeUTF16,
        &Encoder.Bun__encoding__byteLengthLatin1AsUTF8,
        &Encoder.Bun__encoding__byteLengthUTF16AsUTF8,
        &Encoder.Bun__encoding__toString,
        &Encoder.Bun__encoding__toStringUTF8,
        &Encoder.Bun__encoding__constructFromLatin1,
        &Encoder.Bun__encoding__constructFromUTF16,
    };
}
|
||||
|
||||
Reference in New Issue
Block a user