From 50eaea19cb71ac2ef5092ebecbc8e917dcfbde61 Mon Sep 17 00:00:00 2001 From: Jarred Sumner Date: Mon, 24 Mar 2025 17:10:48 -0700 Subject: [PATCH] Move TextDecoder, TextEncoderStreamEncoder, TextEncoder, EncodingLabel into separate files (#18430) --- src/bun.js/webcore/EncodingLabel.zig | 160 +++ src/bun.js/webcore/TextDecoder.zig | 350 +++++++ src/bun.js/webcore/TextEncoder.zig | 255 +++++ .../webcore/TextEncoderStreamEncoder.zig | 213 ++++ src/bun.js/webcore/encoding.zig | 978 +----------------- 5 files changed, 998 insertions(+), 958 deletions(-) create mode 100644 src/bun.js/webcore/EncodingLabel.zig create mode 100644 src/bun.js/webcore/TextDecoder.zig create mode 100644 src/bun.js/webcore/TextEncoder.zig create mode 100644 src/bun.js/webcore/TextEncoderStreamEncoder.zig diff --git a/src/bun.js/webcore/EncodingLabel.zig b/src/bun.js/webcore/EncodingLabel.zig new file mode 100644 index 0000000000..7a8c43cd43 --- /dev/null +++ b/src/bun.js/webcore/EncodingLabel.zig @@ -0,0 +1,160 @@ +/// https://encoding.spec.whatwg.org/encodings.json +pub const EncodingLabel = enum { + @"UTF-8", + IBM866, + @"ISO-8859-2", + @"ISO-8859-3", + @"ISO-8859-4", + @"ISO-8859-5", + @"ISO-8859-6", + @"ISO-8859-7", + @"ISO-8859-8", + @"ISO-8859-8-I", + @"ISO-8859-10", + @"ISO-8859-13", + @"ISO-8859-14", + @"ISO-8859-15", + @"ISO-8859-16", + @"KOI8-R", + @"KOI8-U", + macintosh, + @"windows-874", + @"windows-1250", + @"windows-1251", + /// Also known as + /// - ASCII + /// - latin1 + @"windows-1252", + @"windows-1253", + @"windows-1254", + @"windows-1255", + @"windows-1256", + @"windows-1257", + @"windows-1258", + @"x-mac-cyrillic", + Big5, + @"EUC-JP", + @"ISO-2022-JP", + Shift_JIS, + @"EUC-KR", + @"UTF-16BE", + @"UTF-16LE", + @"x-user-defined", + + pub const Map = std.enums.EnumMap(EncodingLabel, string); + pub const label: Map = brk: { + var map = Map.initFull(""); + map.put(EncodingLabel.@"UTF-8", "utf-8"); + map.put(EncodingLabel.@"UTF-16LE", "utf-16le"); + map.put(EncodingLabel.@"windows-1252", "windows-1252"); + break :brk map; + }; + + const utf16_names = [_]string{ + "ucs-2", + "utf-16", + "unicode", + "utf-16le", + "csunicode", + "unicodefeff", + "iso-10646-ucs-2", + }; + + const utf8_names = [_]string{ + "utf8", + "utf-8", + "unicode11utf8", + "unicode20utf8", + "x-unicode20utf8", + "unicode-1-1-utf-8", + }; + + const latin1_names = [_]string{ + "l1", + "ascii", + "cp819", + "cp1252", + "ibm819", + "latin1", + "iso88591", + "us-ascii", + "x-cp1252", + "iso8859-1", + "iso_8859-1", + "iso-8859-1", + "iso-ir-100", + "csisolatin1", + "windows-1252", + "ansi_x3.4-1968", + "iso_8859-1:1987", + }; + + pub const latin1 = EncodingLabel.@"windows-1252"; + + pub fn which(input_: string) ?EncodingLabel { + const input = strings.trim(input_, " \t\r\n"); + const ExactMatcher = strings.ExactSizeMatcher; + const Eight = ExactMatcher(8); + const Sixteen = ExactMatcher(16); + return switch (input.len) { + 1, 0 => null, + 2...8 => switch (Eight.matchLower(input)) { + Eight.case("l1"), + Eight.case("ascii"), + Eight.case("cp819"), + Eight.case("cp1252"), + Eight.case("ibm819"), + Eight.case("latin1"), + Eight.case("iso88591"), + Eight.case("us-ascii"), + Eight.case("x-cp1252"), + => EncodingLabel.latin1, + + Eight.case("ucs-2"), + Eight.case("utf-16"), + Eight.case("unicode"), + Eight.case("utf-16le"), + => EncodingLabel.@"UTF-16LE", + + Eight.case("utf-16be"), + => EncodingLabel.@"UTF-16BE", + + Eight.case("utf8"), Eight.case("utf-8") => EncodingLabel.@"UTF-8", + else => null, + }, + + 9...16 => switch (Sixteen.matchLower(input)) { + Sixteen.case("iso8859-1"), + Sixteen.case("iso_8859-1"), + Sixteen.case("iso-8859-1"), + Sixteen.case("iso-ir-100"), + Sixteen.case("csisolatin1"), + Sixteen.case("windows-1252"), + Sixteen.case("ansi_x3.4-1968"), + Sixteen.case("iso_8859-1:1987"), + => EncodingLabel.latin1, + + Sixteen.case("unicode11utf8"), + Sixteen.case("unicode20utf8"), + Sixteen.case("x-unicode20utf8"), + => EncodingLabel.@"UTF-8", + + Sixteen.case("csunicode"), + Sixteen.case("unicodefeff"), + Sixteen.case("iso-10646-ucs-2"), + => EncodingLabel.@"UTF-16LE", + + else => null, + }, + else => if (strings.eqlCaseInsensitiveASCII(input, "unicode-1-1-utf-8", true)) + EncodingLabel.@"UTF-8" + else + null, + }; + } +}; +const std = @import("std"); +const bun = @import("root").bun; +const encoding = @import("encoding.zig"); +const string = []const u8; +const strings = bun.strings; diff --git a/src/bun.js/webcore/TextDecoder.zig b/src/bun.js/webcore/TextDecoder.zig new file mode 100644 index 0000000000..5d6da1bcc0 --- /dev/null +++ b/src/bun.js/webcore/TextDecoder.zig @@ -0,0 +1,350 @@ +// used for utf8 decoding +buffered: struct { + buf: [3]u8 = .{0} ** 3, + len: u2 = 0, + + pub fn slice(this: *@This()) []const u8 { + return this.buf[0..this.len]; + } +} = .{}, + +// used for utf16 decoding +lead_byte: ?u8 = null, +lead_surrogate: ?u16 = null, + +ignore_bom: bool = false, +fatal: bool = false, +encoding: EncodingLabel = EncodingLabel.@"UTF-8", + +pub usingnamespace bun.New(TextDecoder); +pub usingnamespace JSC.Codegen.JSTextDecoder; + +pub fn finalize(this: *TextDecoder) void { + this.destroy(); +} + +pub fn getIgnoreBOM( + this: *TextDecoder, + _: *JSC.JSGlobalObject, +) JSC.JSValue { + return JSC.JSValue.jsBoolean(this.ignore_bom); +} + +pub fn getFatal( + this: *TextDecoder, + _: *JSC.JSGlobalObject, +) JSC.JSValue { + return JSC.JSValue.jsBoolean(this.fatal); +} + +pub fn getEncoding( + this: *TextDecoder, + globalThis: *JSC.JSGlobalObject, +) JSC.JSValue { + return ZigString.init(EncodingLabel.label.get(this.encoding).?).toJS(globalThis); +} +const Vector16 = std.meta.Vector(16, u16); +const max_16_ascii: Vector16 = @splat(@as(u16, 127)); + +fn processCodeUnitUTF16( + this: *TextDecoder, + output: *std.ArrayListUnmanaged(u16), + saw_error: *bool, + code_unit: u16, +) error{OutOfMemory}!void { + if (this.lead_surrogate) |lead_surrogate| { + this.lead_surrogate = null; + + if (strings.u16IsTrail(code_unit)) { + // TODO: why is this here? + // const code_point = strings.u16GetSupplementary(lead_surrogate, code_unit); + try output.appendSlice( + bun.default_allocator, + &.{ lead_surrogate, code_unit }, + ); + return; + } + try output.append(bun.default_allocator, strings.unicode_replacement); + saw_error.* = true; + } + + if (strings.u16IsLead(code_unit)) { + this.lead_surrogate = code_unit; + return; + } + + if (strings.u16IsTrail(code_unit)) { + try output.append(bun.default_allocator, strings.unicode_replacement); + saw_error.* = true; + return; + } + + try output.append(bun.default_allocator, code_unit); + return; +} + +pub fn codeUnitFromBytesUTF16( + first: u16, + second: u16, + comptime big_endian: bool, +) u16 { + return if (comptime big_endian) + (first << 8) | second + else + first | (second << 8); +} + +pub fn decodeUTF16( + this: *TextDecoder, + bytes: []const u8, + comptime big_endian: bool, + comptime flush: bool, +) error{OutOfMemory}!struct { std.ArrayListUnmanaged(u16), bool } { + var output: std.ArrayListUnmanaged(u16) = .{}; + try output.ensureTotalCapacity(bun.default_allocator, @divFloor(bytes.len, 2)); + + var remain = bytes; + var saw_error = false; + + if (this.lead_byte) |lead_byte| { + if (remain.len > 0) { + this.lead_byte = null; + + try this.processCodeUnitUTF16( + &output, + &saw_error, + codeUnitFromBytesUTF16(@intCast(lead_byte), @intCast(remain[0]), big_endian), + ); + remain = remain[1..]; + } + } + + var i: usize = 0; + + while (i < remain.len -| 1) { + try this.processCodeUnitUTF16( + &output, + &saw_error, + codeUnitFromBytesUTF16(@intCast(remain[i]), @intCast(remain[i + 1]), big_endian), + ); + i += 2; + } + + if (remain.len != 0 and i == remain.len - 1) { + this.lead_byte = remain[i]; + } else { + bun.assertWithLocation(i == remain.len, @src()); + } + + if (comptime flush) { + if (this.lead_byte != null or this.lead_surrogate != null) { + this.lead_byte = null; + this.lead_surrogate = null; + try output.append(bun.default_allocator, strings.unicode_replacement); + saw_error = true; + return .{ output, saw_error }; + } + } + + return .{ output, saw_error }; +} + +pub fn decode(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) bun.JSError!JSValue { + const arguments = callframe.arguments_old(2).slice(); + + const input_slice = input_slice: { + if (arguments.len == 0 or arguments[0].isUndefined()) { + break :input_slice ""; + } + + if (arguments[0].asArrayBuffer(globalThis)) |array_buffer| { + break :input_slice array_buffer.slice(); + } + + return globalThis.throwInvalidArguments("TextDecoder.decode expects an ArrayBuffer or TypedArray", .{}); + }; + + const stream = stream: { + if (arguments.len > 1 and arguments[1].isObject()) { + if (arguments[1].fastGet(globalThis, .stream)) |stream_value| { + const stream_bool = stream_value.coerce(bool, globalThis); + if (globalThis.hasException()) { + return .zero; + } + break :stream stream_bool; + } + } + + break :stream false; + }; + + return switch (!stream) { + inline else => |flush| this.decodeSlice(globalThis, input_slice, flush), + }; +} + +pub fn decodeWithoutTypeChecks(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, uint8array: *JSC.JSUint8Array) bun.JSError!JSValue { + return this.decodeSlice(globalThis, uint8array.slice(), false); +} + +fn decodeSlice(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, buffer_slice: []const u8, comptime flush: bool) bun.JSError!JSValue { + switch (this.encoding) { + EncodingLabel.latin1 => { + if (strings.isAllASCII(buffer_slice)) { + return ZigString.init(buffer_slice).toJS(globalThis); + } + + // It's unintuitive that we encode Latin1 as UTF16 even though the engine natively supports Latin1 strings... + // However, this is also what WebKit seems to do. + // + // It's not clear why we couldn't jusst use Latin1 here, but tests failures proved it necessary. + const out_length = strings.elementLengthLatin1IntoUTF16([]const u8, buffer_slice); + const bytes = try globalThis.allocator().alloc(u16, out_length); + + const out = strings.copyLatin1IntoUTF16([]u16, bytes, []const u8, buffer_slice); + return ZigString.toExternalU16(bytes.ptr, out.written, globalThis); + }, + EncodingLabel.@"UTF-8" => { + const input, const deinit = input: { + const maybe_without_bom = if (!this.ignore_bom and strings.hasPrefixComptime(buffer_slice, "\xef\xbb\xbf")) + buffer_slice[3..] + else + buffer_slice; + + if (this.buffered.len > 0) { + defer this.buffered.len = 0; + const joined = try bun.default_allocator.alloc(u8, maybe_without_bom.len + this.buffered.len); + @memcpy(joined[0..this.buffered.len], this.buffered.slice()); + @memcpy(joined[this.buffered.len..][0..maybe_without_bom.len], maybe_without_bom); + break :input .{ joined, true }; + } + + break :input .{ maybe_without_bom, false }; + }; + + const maybe_decode_result = switch (this.fatal) { + inline else => |fail_if_invalid| strings.toUTF16AllocMaybeBuffered(bun.default_allocator, input, fail_if_invalid, flush) catch |err| { + if (deinit) bun.default_allocator.free(input); + if (comptime fail_if_invalid) { + if (err == error.InvalidByteSequence) { + return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("Invalid byte sequence", .{}).throw(); + } + } + + bun.assert(err == error.OutOfMemory); + return globalThis.throwOutOfMemory(); + }, + }; + + if (maybe_decode_result) |decode_result| { + if (deinit) bun.default_allocator.free(input); + const decoded, const leftover, const leftover_len = decode_result; + bun.assert(this.buffered.len == 0); + if (comptime !flush) { + if (leftover_len != 0) { + this.buffered.buf = leftover; + this.buffered.len = leftover_len; + } + } + return ZigString.toExternalU16(decoded.ptr, decoded.len, globalThis); + } + + bun.debugAssert(input.len == 0 or !deinit); + + // Experiment: using mimalloc directly is slightly slower + return ZigString.init(input).toJS(globalThis); + }, + + inline .@"UTF-16LE", .@"UTF-16BE" => |utf16_encoding| { + const bom = if (comptime utf16_encoding == .@"UTF-16LE") "\xff\xfe" else "\xfe\xff"; + const input = if (!this.ignore_bom and strings.hasPrefixComptime(buffer_slice, bom)) + buffer_slice[2..] + else + buffer_slice; + + var decoded, const saw_error = try this.decodeUTF16(input, utf16_encoding == .@"UTF-16BE", flush); + + if (saw_error and this.fatal) { + decoded.deinit(bun.default_allocator); + return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("The encoded data was not valid {s} data", .{@tagName(utf16_encoding)}).throw(); + } + + var output = bun.String.fromUTF16(decoded.items); + return output.toJS(globalThis); + }, + else => { + return globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{}); + }, + } +} + +pub fn constructor(globalThis: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) bun.JSError!*TextDecoder { + var args_ = callframe.arguments_old(2); + var arguments: []const JSC.JSValue = args_.ptr[0..args_.len]; + + var decoder = TextDecoder{}; + + if (arguments.len > 0) { + // encoding + if (arguments[0].isString()) { + var str = try arguments[0].toSlice(globalThis, bun.default_allocator); + defer if (str.isAllocated()) str.deinit(); + + if (EncodingLabel.which(str.slice())) |label| { + decoder.encoding = label; + } else { + return globalThis.throwInvalidArguments("Unsupported encoding label \"{s}\"", .{str.slice()}); + } + } else if (arguments[0].isUndefined()) { + // default to utf-8 + decoder.encoding = EncodingLabel.@"UTF-8"; + } else { + return globalThis.throwInvalidArguments("TextDecoder(encoding) label is invalid", .{}); + } + + if (arguments.len >= 2) { + const options = arguments[1]; + + if (!options.isObject()) { + return globalThis.throwInvalidArguments("TextDecoder(options) is invalid", .{}); + } + + if (try options.get(globalThis, "fatal")) |fatal| { + if (fatal.isBoolean()) { + decoder.fatal = fatal.asBoolean(); + } else { + return globalThis.throwInvalidArguments("TextDecoder(options) fatal is invalid. Expected boolean value", .{}); + } + } + + if (try options.get(globalThis, "ignoreBOM")) |ignoreBOM| { + if (ignoreBOM.isBoolean()) { + decoder.ignore_bom = ignoreBOM.asBoolean(); + } else { + return globalThis.throwInvalidArguments("TextDecoder(options) ignoreBOM is invalid. Expected boolean value", .{}); + } + } + } + } + + return TextDecoder.new(decoder); +} + +const TextDecoder = @This(); + +const std = @import("std"); +const bun = @import("root").bun; +const JSC = bun.JSC; +const Output = bun.Output; +const MutableString = bun.MutableString; +const strings = bun.strings; +const string = bun.string; +const FeatureFlags = bun.FeatureFlags; +const ArrayBuffer = JSC.ArrayBuffer; +const JSUint8Array = JSC.JSUint8Array; +const ZigString = JSC.ZigString; +const JSInternalPromise = JSC.JSInternalPromise; +const JSPromise = JSC.JSPromise; +const JSValue = JSC.JSValue; +const JSGlobalObject = JSC.JSGlobalObject; +const EncodingLabel = JSC.WebCore.EncodingLabel; diff --git a/src/bun.js/webcore/TextEncoder.zig b/src/bun.js/webcore/TextEncoder.zig new file mode 100644 index 0000000000..6f5ca0bbb0 --- /dev/null +++ b/src/bun.js/webcore/TextEncoder.zig @@ -0,0 +1,255 @@ +pub export fn TextEncoder__encode8( + globalThis: *JSGlobalObject, + ptr: [*]const u8, + len: usize, +) JSValue { + // as much as possible, rely on JSC to own the memory + // their code is more battle-tested than bun's code + // so we do a stack allocation here + // and then copy into JSC memory + // unless it's huge + // JSC will GC Uint8Array that occupy less than 512 bytes + // so it's extra good for that case + // this also means there won't be reallocations for small strings + var buf: [2048]u8 = undefined; + const slice = ptr[0..len]; + + if (slice.len <= buf.len / 2) { + const result = strings.copyLatin1IntoUTF8(&buf, []const u8, slice); + const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written); + bun.assert(result.written <= buf.len); + bun.assert(result.read == slice.len); + const array_buffer = uint8array.asArrayBuffer(globalThis) orelse return .zero; + bun.assert(result.written == array_buffer.len); + @memcpy(array_buffer.byteSlice()[0..result.written], buf[0..result.written]); + return uint8array; + } else { + const bytes = strings.allocateLatin1IntoUTF8(globalThis.bunVM().allocator, []const u8, slice) catch { + return globalThis.throwOutOfMemoryValue(); + }; + bun.assert(bytes.len >= slice.len); + return ArrayBuffer.fromBytes(bytes, .Uint8Array).toJSUnchecked(globalThis, null); + } +} +pub export fn TextEncoder__encode16( + globalThis: *JSGlobalObject, + ptr: [*]const u16, + len: usize, +) JSValue { + // as much as possible, rely on JSC to own the memory + // their code is more battle-tested than bun's code + // so we do a stack allocation here + // and then copy into JSC memory + // unless it's huge + // JSC will GC Uint8Array that occupy less than 512 bytes + // so it's extra good for that case + // this also means there won't be reallocations for small strings + var buf: [2048]u8 = undefined; + + const slice = ptr[0..len]; + + // max utf16 -> utf8 length + if (slice.len <= buf.len / 4) { + const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice, true); + if (result.read == 0 or result.written == 0) { + const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3); + const array_buffer = uint8array.asArrayBuffer(globalThis).?; + const replacement_char = [_]u8{ 239, 191, 189 }; + @memcpy(array_buffer.slice()[0..replacement_char.len], &replacement_char); + return uint8array; + } + const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written); + bun.assert(result.written <= buf.len); + bun.assert(result.read == slice.len); + const array_buffer = uint8array.asArrayBuffer(globalThis).?; + bun.assert(result.written == array_buffer.len); + @memcpy(array_buffer.slice()[0..result.written], buf[0..result.written]); + return uint8array; + } else { + const bytes = strings.toUTF8AllocWithType( + bun.default_allocator, + @TypeOf(slice), + slice, + ) catch { + return JSC.toInvalidArguments("Out of memory", .{}, globalThis); + }; + return ArrayBuffer.fromBytes(bytes, .Uint8Array).toJSUnchecked(globalThis, null); + } +} + +pub export fn c( + globalThis: *JSGlobalObject, + ptr: [*]const u16, + len: usize, +) JSValue { + // as much as possible, rely on JSC to own the memory + // their code is more battle-tested than bun's code + // so we do a stack allocation here + // and then copy into JSC memory + // unless it's huge + // JSC will GC Uint8Array that occupy less than 512 bytes + // so it's extra good for that case + // this also means there won't be reallocations for small strings + var buf: [2048]u8 = undefined; + + const slice = ptr[0..len]; + + // max utf16 -> utf8 length + if (slice.len <= buf.len / 4) { + const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice, true); + if (result.read == 0 or result.written == 0) { + const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3); + const array_buffer = uint8array.asArrayBuffer(globalThis).?; + const replacement_char = [_]u8{ 239, 191, 189 }; + @memcpy(array_buffer.slice()[0..replacement_char.len], &replacement_char); + return uint8array; + } + const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written); + bun.assert(result.written <= buf.len); + bun.assert(result.read == slice.len); + const array_buffer = uint8array.asArrayBuffer(globalThis).?; + bun.assert(result.written == array_buffer.len); + @memcpy(array_buffer.slice()[0..result.written], buf[0..result.written]); + return uint8array; + } else { + const bytes = strings.toUTF8AllocWithType( + bun.default_allocator, + @TypeOf(slice), + slice, + ) catch { + return globalThis.throwOutOfMemoryValue(); + }; + return ArrayBuffer.fromBytes(bytes, .Uint8Array).toJSUnchecked(globalThis, null); + } +} + +// This is a fast path for copying a Rope string into a Uint8Array. +// This keeps us from an extra string temporary allocation +const RopeStringEncoder = struct { + globalThis: *JSGlobalObject, + buf: []u8, + tail: usize = 0, + any_non_ascii: bool = false, + + pub fn append8(it: *JSC.JSString.Iterator, ptr: [*]const u8, len: u32) callconv(.C) void { + var this = bun.cast(*RopeStringEncoder, it.data.?); + const result = strings.copyLatin1IntoUTF8StopOnNonASCII(this.buf[this.tail..], []const u8, ptr[0..len], true); + if (result.read == std.math.maxInt(u32) and result.written == std.math.maxInt(u32)) { + it.stop = 1; + this.any_non_ascii = true; + } else { + this.tail += result.written; + } + } + pub fn append16(it: *JSC.JSString.Iterator, _: [*]const u16, _: u32) callconv(.C) void { + var this = bun.cast(*RopeStringEncoder, it.data.?); + this.any_non_ascii = true; + it.stop = 1; + } + pub fn write8(it: *JSC.JSString.Iterator, ptr: [*]const u8, len: u32, offset: u32) callconv(.C) void { + var this = bun.cast(*RopeStringEncoder, it.data.?); + const result = strings.copyLatin1IntoUTF8StopOnNonASCII(this.buf[offset..], []const u8, ptr[0..len], true); + if (result.read == std.math.maxInt(u32) and result.written == std.math.maxInt(u32)) { + it.stop = 1; + this.any_non_ascii = true; + } + } + pub fn write16(it: *JSC.JSString.Iterator, _: [*]const u16, _: u32, _: u32) callconv(.C) void { + var this = bun.cast(*RopeStringEncoder, it.data.?); + this.any_non_ascii = true; + it.stop = 1; + } + + pub fn iter(this: *RopeStringEncoder) JSC.JSString.Iterator { + return .{ + .data = this, + .stop = 0, + .append8 = append8, + .append16 = append16, + .write8 = write8, + .write16 = write16, + }; + } +}; + +// This fast path is only suitable for ASCII strings +// It's not suitable for UTF-16 strings, because getting the byteLength is unpredictable +// It also isn't usable for latin1 strings which contain non-ascii characters +pub export fn TextEncoder__encodeRopeString( + globalThis: *JSGlobalObject, + rope_str: *JSC.JSString, +) JSValue { + if (comptime Environment.allow_assert) bun.assert(rope_str.is8Bit()); + var stack_buf: [2048]u8 = undefined; + var buf_to_use: []u8 = &stack_buf; + const length = rope_str.length(); + var array: JSValue = .zero; + if (length > stack_buf.len / 2) { + array = JSC.JSValue.createUninitializedUint8Array(globalThis, length); + array.ensureStillAlive(); + buf_to_use = array.asArrayBuffer(globalThis).?.slice(); + } + var encoder = RopeStringEncoder{ + .globalThis = globalThis, + .buf = buf_to_use, + }; + var iter = encoder.iter(); + array.ensureStillAlive(); + rope_str.iterator(globalThis, &iter); + array.ensureStillAlive(); + + if (encoder.any_non_ascii) { + return .undefined; + } + + if (array == .zero) { + array = JSC.JSValue.createUninitializedUint8Array(globalThis, length); + array.ensureStillAlive(); + @memcpy(array.asArrayBuffer(globalThis).?.ptr[0..length], buf_to_use[0..length]); + } + + return array; +} + +pub export fn TextEncoder__encodeInto16( + input_ptr: [*]const u16, + input_len: usize, + buf_ptr: [*]u8, + buf_len: usize, +) u64 { + const output = buf_ptr[0..buf_len]; + const input = input_ptr[0..input_len]; + var result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, []const u16, input, false); + if (output.len >= 3 and (result.read == 0 or result.written == 0)) { + const replacement_char = [_]u8{ 239, 191, 189 }; + @memcpy(buf_ptr[0..replacement_char.len], &replacement_char); + result.read = 1; + result.written = 3; + } + const sized: [2]u32 = .{ result.read, result.written }; + return @bitCast(sized); +} + +pub export fn TextEncoder__encodeInto8( + input_ptr: [*]const u8, + input_len: usize, + buf_ptr: [*]u8, + buf_len: usize, +) u64 { + const output = buf_ptr[0..buf_len]; + const input = input_ptr[0..input_len]; + const result: strings.EncodeIntoResult = + strings.copyLatin1IntoUTF8(output, []const u8, input); + const sized: [2]u32 = .{ result.read, result.written }; + return @bitCast(sized); +} + +const std = @import("std"); +const bun = @import("root").bun; +const strings = bun.strings; +const JSC = bun.JSC; +const Environment = bun.Environment; +const JSGlobalObject = JSC.JSGlobalObject; +const JSValue = JSC.JSValue; +const ArrayBuffer = JSC.ArrayBuffer; +const TextEncoder = @This(); diff --git a/src/bun.js/webcore/TextEncoderStreamEncoder.zig b/src/bun.js/webcore/TextEncoderStreamEncoder.zig new file mode 100644 index 0000000000..4269577aae --- /dev/null +++ b/src/bun.js/webcore/TextEncoderStreamEncoder.zig @@ -0,0 +1,213 @@ +pending_lead_surrogate: ?u16 = null, + +const log = Output.scoped(.TextEncoderStreamEncoder, false); + +pub usingnamespace JSC.Codegen.JSTextEncoderStreamEncoder; +pub usingnamespace bun.New(TextEncoderStreamEncoder); + +pub fn finalize(this: *TextEncoderStreamEncoder) void { + this.destroy(); +} + +pub fn constructor(_: *JSGlobalObject, _: *JSC.CallFrame) bun.JSError!*TextEncoderStreamEncoder { + return TextEncoderStreamEncoder.new(.{}); +} + +pub fn encode(this: *TextEncoderStreamEncoder, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) bun.JSError!JSValue { + const arguments = callFrame.arguments_old(1).slice(); + if (arguments.len == 0) { + return globalObject.throwNotEnoughArguments("TextEncoderStreamEncoder.encode", 1, arguments.len); + } + + const str = try arguments[0].getZigString(globalObject); + + if (str.is16Bit()) { + return this.encodeUTF16(globalObject, str.utf16SliceAligned()); + } + + return this.encodeLatin1(globalObject, str.slice()); +} + +pub fn encodeWithoutTypeChecks(this: *TextEncoderStreamEncoder, globalObject: *JSC.JSGlobalObject, input: *JSC.JSString) JSValue { + const str = input.getZigString(globalObject); + + if (str.is16Bit()) { + return this.encodeUTF16(globalObject, str.utf16SliceAligned()); + } + + return this.encodeLatin1(globalObject, str.slice()); +} + +fn encodeLatin1(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, input: []const u8) JSValue { + log("encodeLatin1: \"{s}\"", .{input}); + + if (input.len == 0) return JSUint8Array.createEmpty(globalObject); + + const prepend_replacement_len: usize = prepend_replacement: { + if (this.pending_lead_surrogate != null) { + this.pending_lead_surrogate = null; + // no latin1 surrogate pairs + break :prepend_replacement 3; + } + + break :prepend_replacement 0; + }; + // In a previous benchmark, counting the length took about as much time as allocating the buffer. + // + // Benchmark Time % CPU (ns) Iterations Ratio + // 288.00 ms 13.5% 288.00 ms simdutf::arm64::implementation::convert_latin1_to_utf8(char const*, unsigned long, char*) const + // 278.00 ms 13.0% 278.00 ms simdutf::arm64::implementation::utf8_length_from_latin1(char const*, unsigned long) const + // + // + var buffer = std.ArrayList(u8).initCapacity(bun.default_allocator, input.len + prepend_replacement_len) catch { + return globalObject.throwOutOfMemoryValue(); + }; + if (prepend_replacement_len > 0) { + buffer.appendSliceAssumeCapacity(&[3]u8{ 0xef, 0xbf, 0xbd }); + } + + var remain = input; + while (remain.len > 0) { + const result = strings.copyLatin1IntoUTF8(buffer.unusedCapacitySlice(), []const u8, remain); + + buffer.items.len += result.written; + remain = remain[result.read..]; + + if (result.written == 0 and result.read == 0) { + buffer.ensureUnusedCapacity(2) catch { + buffer.deinit(); + return globalObject.throwOutOfMemoryValue(); + }; + } else if (buffer.items.len == buffer.capacity and remain.len > 0) { + buffer.ensureTotalCapacity(buffer.items.len + remain.len + 1) catch { + buffer.deinit(); + return globalObject.throwOutOfMemoryValue(); + }; + } + } + + if (comptime Environment.isDebug) { + // wrap in comptime if so simdutf isn't called in a release build here. + bun.debugAssert(buffer.items.len == (bun.simdutf.length.utf8.from.latin1(input) + prepend_replacement_len)); + } + + return JSC.JSUint8Array.fromBytes(globalObject, buffer.items); +} + +fn encodeUTF16(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, input: []const u16) JSValue { + log("encodeUTF16: \"{}\"", .{bun.fmt.utf16(input)}); + + if (input.len == 0) return JSUint8Array.createEmpty(globalObject); + + const Prepend = struct { + bytes: [4]u8, + len: u3, + + pub const replacement: @This() = .{ .bytes = .{ 0xef, 0xbf, 0xbd, 0 }, .len = 3 }; + + pub fn fromSequence(seq: [4]u8, length: u3) @This() { + return .{ .bytes = seq, .len = length }; + } + }; + + var remain = input; + + const prepend: ?Prepend = prepend: { + if (this.pending_lead_surrogate) |lead| { + this.pending_lead_surrogate = null; + const maybe_trail = remain[0]; + if (strings.u16IsTrail(maybe_trail)) { + const converted = strings.utf16CodepointWithFFFD([]const u16, &.{ lead, maybe_trail }); + // shouldn't fail because `u16IsTrail` is true and `pending_lead_surrogate` is always + // a valid lead. + bun.debugAssert(!converted.fail); + + const sequence = strings.wtf8Sequence(converted.code_point); + + remain = remain[1..]; + if (remain.len == 0) { + return JSUint8Array.fromBytesCopy( + globalObject, + sequence[0..converted.utf8Width()], + ); + } + + break :prepend Prepend.fromSequence(sequence, converted.utf8Width()); + } + + break :prepend Prepend.replacement; + } + break :prepend null; + }; + + const length = bun.simdutf.length.utf8.from.utf16.le(remain); + + var buf = std.ArrayList(u8).initCapacity( + bun.default_allocator, + length + @as(usize, if (prepend) |pre| pre.len else 0), + ) catch { + return globalObject.throwOutOfMemoryValue(); + }; + + if (prepend) |*pre| { + buf.appendSliceAssumeCapacity(pre.bytes[0..pre.len]); + } + + const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(remain, buf.unusedCapacitySlice()); + + switch (result.status) { + else => { + // Slow path: there was invalid UTF-16, so we need to convert it without simdutf. + const lead_surrogate = strings.toUTF8ListWithTypeBun(&buf, []const u16, remain, true) catch { + buf.deinit(); + return globalObject.throwOutOfMemoryValue(); + }; + + if (lead_surrogate) |pending_lead| { + this.pending_lead_surrogate = pending_lead; + if (buf.items.len == 0) return JSUint8Array.createEmpty(globalObject); + } + + return JSC.JSUint8Array.fromBytes(globalObject, buf.items); + }, + .success => { + buf.items.len += result.count; + return JSC.JSUint8Array.fromBytes(globalObject, buf.items); + }, + } +} + +pub fn flush(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, _: *JSC.CallFrame) bun.JSError!JSValue { + return flushBody(this, globalObject); +} + +pub fn flushWithoutTypeChecks(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject) JSValue { + return flushBody(this, globalObject); +} + +fn flushBody(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject) JSValue { + return if (this.pending_lead_surrogate == null) + JSUint8Array.createEmpty(globalObject) + else + JSUint8Array.fromBytesCopy(globalObject, &.{ 0xef, 0xbf, 0xbd }); +} + +const TextEncoderStreamEncoder = @This(); + +const std = @import("std"); +const bun = @import("root").bun; +const JSC = bun.JSC; +const Output = bun.Output; +const MutableString = bun.MutableString; +const strings = bun.strings; +const string = bun.string; +const FeatureFlags = bun.FeatureFlags; +const ArrayBuffer = JSC.ArrayBuffer; +const JSUint8Array = JSC.JSUint8Array; +const ZigString = JSC.ZigString; +const JSInternalPromise = JSC.JSInternalPromise; +const JSPromise = JSC.JSPromise; +const JSValue = JSC.JSValue; +const JSGlobalObject = JSC.JSGlobalObject; +const EncodingLabel = JSC.WebCore.EncodingLabel; +const Environment = bun.Environment; diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig index 89e01415cf..600b704fc4 100644 --- a/src/bun.js/webcore/encoding.zig +++ b/src/bun.js/webcore/encoding.zig @@ -35,949 +35,10 @@ const Task = JSC.Task; const picohttp = bun.picohttp; -pub const TextEncoder = struct { - pub export fn TextEncoder__encode8( - globalThis: *JSGlobalObject, - ptr: [*]const u8, - len: usize, - ) JSValue { - // as much as possible, rely on JSC to own the memory - // their code is more battle-tested than bun's code - // so we do a stack allocation here - // and then copy into JSC memory - // unless it's huge - // JSC will GC Uint8Array that occupy less than 512 bytes - // so it's extra good for that case - // this also means there won't be reallocations for small strings - var buf: [2048]u8 = undefined; - const slice = ptr[0..len]; - - if (slice.len <= buf.len / 2) { - const result = strings.copyLatin1IntoUTF8(&buf, []const u8, slice); - const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written); - bun.assert(result.written <= buf.len); - bun.assert(result.read == slice.len); - const array_buffer = uint8array.asArrayBuffer(globalThis) orelse return .zero; - bun.assert(result.written == array_buffer.len); - @memcpy(array_buffer.byteSlice()[0..result.written], buf[0..result.written]); - return uint8array; - } else { - const bytes = strings.allocateLatin1IntoUTF8(globalThis.bunVM().allocator, []const u8, slice) catch { - return globalThis.throwOutOfMemoryValue(); - }; - bun.assert(bytes.len >= slice.len); - return ArrayBuffer.fromBytes(bytes, .Uint8Array).toJSUnchecked(globalThis, null); - } - } - pub export fn TextEncoder__encode16( - globalThis: *JSGlobalObject, - ptr: [*]const u16, - len: usize, - ) JSValue { - // as much as possible, rely on JSC to own the memory - // their code is more battle-tested than bun's code - // so we do a stack allocation here - // and then copy into JSC memory - // unless it's huge - // JSC will GC Uint8Array that occupy less than 512 bytes - // so it's extra good for that case - // this also means there won't be reallocations for small strings - var buf: [2048]u8 = undefined; - - const slice = ptr[0..len]; - - // max utf16 -> utf8 length - if (slice.len <= buf.len / 4) { - const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice, true); - if (result.read == 0 or result.written == 0) { - const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3); - const array_buffer = uint8array.asArrayBuffer(globalThis).?; - const replacement_char = [_]u8{ 239, 191, 189 }; - @memcpy(array_buffer.slice()[0..replacement_char.len], &replacement_char); - return uint8array; - } - const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written); - bun.assert(result.written <= buf.len); - bun.assert(result.read == slice.len); - const array_buffer = uint8array.asArrayBuffer(globalThis).?; - bun.assert(result.written == array_buffer.len); - @memcpy(array_buffer.slice()[0..result.written], buf[0..result.written]); - return uint8array; - } else { - const bytes = strings.toUTF8AllocWithType( - bun.default_allocator, - @TypeOf(slice), - slice, - ) catch { - return JSC.toInvalidArguments("Out of memory", .{}, globalThis); - }; - return ArrayBuffer.fromBytes(bytes, .Uint8Array).toJSUnchecked(globalThis, null); - } - } - - pub export fn c( - globalThis: *JSGlobalObject, - ptr: [*]const u16, - len: usize, - ) JSValue { - // as much as possible, rely on JSC to own the memory - // their code is more battle-tested than bun's code - // so we do a stack allocation here - // and then copy into JSC memory - // unless it's huge - // JSC will GC Uint8Array that occupy less than 512 bytes - // so it's extra good for that case - // this also means there won't be reallocations for small strings - var buf: [2048]u8 = undefined; - - const slice = ptr[0..len]; - - // max utf16 -> utf8 length - if (slice.len <= buf.len / 4) { - const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice, true); - if (result.read == 0 or result.written == 0) { - const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3); - const array_buffer = uint8array.asArrayBuffer(globalThis).?; - const replacement_char = [_]u8{ 239, 191, 189 }; - @memcpy(array_buffer.slice()[0..replacement_char.len], &replacement_char); - return uint8array; - } - const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written); - bun.assert(result.written <= buf.len); - bun.assert(result.read == slice.len); - const array_buffer = uint8array.asArrayBuffer(globalThis).?; - bun.assert(result.written == array_buffer.len); - @memcpy(array_buffer.slice()[0..result.written], buf[0..result.written]); - return uint8array; - } else { - const bytes = strings.toUTF8AllocWithType( - bun.default_allocator, - @TypeOf(slice), - slice, - ) catch { - return globalThis.throwOutOfMemoryValue(); - }; - return ArrayBuffer.fromBytes(bytes, .Uint8Array).toJSUnchecked(globalThis, null); - } - } - - // This is a fast path for copying a Rope string into a Uint8Array. - // This keeps us from an extra string temporary allocation - const RopeStringEncoder = struct { - globalThis: *JSGlobalObject, - buf: []u8, - tail: usize = 0, - any_non_ascii: bool = false, - - pub fn append8(it: *JSC.JSString.Iterator, ptr: [*]const u8, len: u32) callconv(.C) void { - var this = bun.cast(*RopeStringEncoder, it.data.?); - const result = strings.copyLatin1IntoUTF8StopOnNonASCII(this.buf[this.tail..], []const u8, ptr[0..len], true); - if (result.read == std.math.maxInt(u32) and result.written == std.math.maxInt(u32)) { - it.stop = 1; - this.any_non_ascii = true; - } else { - this.tail += result.written; - } - } - pub fn append16(it: *JSC.JSString.Iterator, _: [*]const u16, _: u32) callconv(.C) void { - var this = bun.cast(*RopeStringEncoder, it.data.?); - this.any_non_ascii = true; - it.stop = 1; - } - pub fn write8(it: *JSC.JSString.Iterator, ptr: [*]const u8, len: u32, offset: u32) callconv(.C) void { - var this = bun.cast(*RopeStringEncoder, it.data.?); - const result = strings.copyLatin1IntoUTF8StopOnNonASCII(this.buf[offset..], []const u8, ptr[0..len], true); - if (result.read == std.math.maxInt(u32) and result.written == std.math.maxInt(u32)) { - it.stop = 1; - this.any_non_ascii = true; - } - } - pub fn write16(it: *JSC.JSString.Iterator, _: [*]const u16, _: u32, _: u32) callconv(.C) void { - var this = bun.cast(*RopeStringEncoder, it.data.?); - this.any_non_ascii = true; - it.stop = 1; - } - - pub fn iter(this: *RopeStringEncoder) JSC.JSString.Iterator { - return .{ - .data = this, - .stop = 0, - .append8 = append8, - .append16 = append16, - .write8 = write8, - .write16 = write16, - }; - } - }; - - // This fast path is only suitable for ASCII strings - // It's not suitable for UTF-16 strings, because getting the byteLength is unpredictable - // It also isn't usable for latin1 strings which contain non-ascii characters - pub export fn TextEncoder__encodeRopeString( - globalThis: *JSGlobalObject, - rope_str: *JSC.JSString, - ) JSValue { - if (comptime Environment.allow_assert) bun.assert(rope_str.is8Bit()); - var stack_buf: [2048]u8 = undefined; - var buf_to_use: []u8 = &stack_buf; - const length = rope_str.length(); - var array: JSValue = .zero; - if (length > stack_buf.len / 2) { - array = JSC.JSValue.createUninitializedUint8Array(globalThis, length); - array.ensureStillAlive(); - buf_to_use = array.asArrayBuffer(globalThis).?.slice(); - } - var encoder = RopeStringEncoder{ - .globalThis = globalThis, - .buf = buf_to_use, - }; - var iter = encoder.iter(); - array.ensureStillAlive(); - rope_str.iterator(globalThis, &iter); - array.ensureStillAlive(); - - if (encoder.any_non_ascii) { - return .undefined; - } - - if (array == .zero) { - array = JSC.JSValue.createUninitializedUint8Array(globalThis, length); - array.ensureStillAlive(); - @memcpy(array.asArrayBuffer(globalThis).?.ptr[0..length], buf_to_use[0..length]); - } - - return array; - } - - pub export fn TextEncoder__encodeInto16( - input_ptr: [*]const u16, - input_len: usize, - buf_ptr: [*]u8, - buf_len: usize, - ) u64 { - const output = buf_ptr[0..buf_len]; - const input = input_ptr[0..input_len]; - var result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, []const u16, input, false); - if (output.len >= 3 and (result.read == 0 or result.written == 0)) { - const replacement_char = [_]u8{ 239, 191, 189 }; - @memcpy(buf_ptr[0..replacement_char.len], &replacement_char); - result.read = 1; - result.written = 3; - } - const sized: [2]u32 = .{ result.read, result.written }; - return @bitCast(sized); - } - - pub export fn TextEncoder__encodeInto8( - input_ptr: [*]const u8, - input_len: usize, - buf_ptr: [*]u8, - buf_len: usize, - ) u64 { - const output = buf_ptr[0..buf_len]; - const input = input_ptr[0..input_len]; - const result: strings.EncodeIntoResult = - strings.copyLatin1IntoUTF8(output, []const u8, input); - const sized: [2]u32 = .{ result.read, result.written }; - return @bitCast(sized); - } -}; - -comptime { - _ = TextEncoder.TextEncoder__encode8; - _ = TextEncoder.TextEncoder__encode16; - _ = TextEncoder.TextEncoder__encodeInto8; - _ = TextEncoder.TextEncoder__encodeInto16; - _ = TextEncoder.TextEncoder__encodeRopeString; -} - -/// https://encoding.spec.whatwg.org/encodings.json -pub const EncodingLabel = enum { - @"UTF-8", - IBM866, - @"ISO-8859-2", - @"ISO-8859-3", - @"ISO-8859-4", - @"ISO-8859-5", - @"ISO-8859-6", - @"ISO-8859-7", - @"ISO-8859-8", - @"ISO-8859-8-I", - @"ISO-8859-10", - @"ISO-8859-13", - @"ISO-8859-14", - @"ISO-8859-15", - @"ISO-8859-16", - @"KOI8-R", - @"KOI8-U", - macintosh, - @"windows-874", - @"windows-1250", - @"windows-1251", - /// Also known as - /// - ASCII - /// - latin1 - @"windows-1252", - @"windows-1253", - @"windows-1254", - @"windows-1255", - @"windows-1256", - @"windows-1257", - @"windows-1258", - @"x-mac-cyrillic", - Big5, - @"EUC-JP", - @"ISO-2022-JP", - Shift_JIS, - @"EUC-KR", - @"UTF-16BE", - @"UTF-16LE", - @"x-user-defined", - - pub const Map = std.enums.EnumMap(EncodingLabel, string); - pub const label: Map = brk: { - var map = Map.initFull(""); - map.put(EncodingLabel.@"UTF-8", "utf-8"); - map.put(EncodingLabel.@"UTF-16LE", "utf-16le"); - map.put(EncodingLabel.@"windows-1252", "windows-1252"); - break :brk map; - }; - - const utf16_names = [_]string{ - "ucs-2", - "utf-16", - "unicode", - "utf-16le", - "csunicode", - "unicodefeff", - "iso-10646-ucs-2", - }; - - const utf8_names = [_]string{ - "utf8", - "utf-8", - "unicode11utf8", - "unicode20utf8", - "x-unicode20utf8", - "unicode-1-1-utf-8", - }; - - const latin1_names = [_]string{ - "l1", - "ascii", - "cp819", - "cp1252", - "ibm819", - "latin1", - "iso88591", - "us-ascii", - "x-cp1252", - "iso8859-1", - "iso_8859-1", - "iso-8859-1", - "iso-ir-100", - "csisolatin1", - "windows-1252", - "ansi_x3.4-1968", - "iso_8859-1:1987", - }; - - pub const latin1 = EncodingLabel.@"windows-1252"; - - pub fn which(input_: string) ?EncodingLabel { - const input = strings.trim(input_, " \t\r\n"); - const ExactMatcher = strings.ExactSizeMatcher; - const Eight = ExactMatcher(8); - const Sixteen = ExactMatcher(16); - return switch (input.len) { - 1, 0 => null, - 2...8 => switch (Eight.matchLower(input)) { - Eight.case("l1"), - Eight.case("ascii"), - Eight.case("cp819"), - Eight.case("cp1252"), - Eight.case("ibm819"), - Eight.case("latin1"), - Eight.case("iso88591"), - Eight.case("us-ascii"), - Eight.case("x-cp1252"), - => EncodingLabel.latin1, - - Eight.case("ucs-2"), - Eight.case("utf-16"), - Eight.case("unicode"), - Eight.case("utf-16le"), - => EncodingLabel.@"UTF-16LE", - - Eight.case("utf-16be"), - => EncodingLabel.@"UTF-16BE", - - Eight.case("utf8"), Eight.case("utf-8") => EncodingLabel.@"UTF-8", - else => null, - }, - - 9...16 => switch (Sixteen.matchLower(input)) { - Sixteen.case("iso8859-1"), - Sixteen.case("iso_8859-1"), - Sixteen.case("iso-8859-1"), - Sixteen.case("iso-ir-100"), - Sixteen.case("csisolatin1"), - Sixteen.case("windows-1252"), - Sixteen.case("ansi_x3.4-1968"), - Sixteen.case("iso_8859-1:1987"), - => EncodingLabel.latin1, - - Sixteen.case("unicode11utf8"), - Sixteen.case("unicode20utf8"), - Sixteen.case("x-unicode20utf8"), - => EncodingLabel.@"UTF-8", - - Sixteen.case("csunicode"), - Sixteen.case("unicodefeff"), - Sixteen.case("iso-10646-ucs-2"), - => EncodingLabel.@"UTF-16LE", - - else => null, - }, - else => if (strings.eqlCaseInsensitiveASCII(input, "unicode-1-1-utf-8", true)) - EncodingLabel.@"UTF-8" - else - null, - }; - } -}; - -pub const TextEncoderStreamEncoder = struct { - pending_lead_surrogate: ?u16 = null, - - const log = Output.scoped(.TextEncoderStreamEncoder, false); - - pub usingnamespace JSC.Codegen.JSTextEncoderStreamEncoder; - pub usingnamespace bun.New(TextEncoderStreamEncoder); - - pub fn finalize(this: *TextEncoderStreamEncoder) void { - this.destroy(); - } - - pub fn constructor(_: *JSGlobalObject, _: *JSC.CallFrame) bun.JSError!*TextEncoderStreamEncoder { - return TextEncoderStreamEncoder.new(.{}); - } - - pub fn encode(this: *TextEncoderStreamEncoder, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) bun.JSError!JSValue { - const arguments = callFrame.arguments_old(1).slice(); - if (arguments.len == 0) { - return globalObject.throwNotEnoughArguments("TextEncoderStreamEncoder.encode", 1, arguments.len); - } - - const str = try arguments[0].getZigString(globalObject); - - if (str.is16Bit()) { - return this.encodeUTF16(globalObject, str.utf16SliceAligned()); - } - - return this.encodeLatin1(globalObject, str.slice()); - } - - pub fn encodeWithoutTypeChecks(this: *TextEncoderStreamEncoder, globalObject: *JSC.JSGlobalObject, input: *JSC.JSString) JSValue { - const str = input.getZigString(globalObject); - - if (str.is16Bit()) { - return this.encodeUTF16(globalObject, str.utf16SliceAligned()); - } - - return this.encodeLatin1(globalObject, str.slice()); - } - - fn encodeLatin1(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, input: []const u8) JSValue { - log("encodeLatin1: \"{s}\"", .{input}); - - if (input.len == 0) return JSUint8Array.createEmpty(globalObject); - - const prepend_replacement_len: usize = prepend_replacement: { - if (this.pending_lead_surrogate != null) { - this.pending_lead_surrogate = null; - // no latin1 surrogate pairs - break :prepend_replacement 3; - } - - break :prepend_replacement 0; - }; - // In a previous benchmark, counting the length took about as much time as allocating the buffer. - // - // Benchmark Time % CPU (ns) Iterations Ratio - // 288.00 ms 13.5% 288.00 ms simdutf::arm64::implementation::convert_latin1_to_utf8(char const*, unsigned long, char*) const - // 278.00 ms 13.0% 278.00 ms simdutf::arm64::implementation::utf8_length_from_latin1(char const*, unsigned long) const - // - // - var buffer = std.ArrayList(u8).initCapacity(bun.default_allocator, input.len + prepend_replacement_len) catch { - return globalObject.throwOutOfMemoryValue(); - }; - if (prepend_replacement_len > 0) { - buffer.appendSliceAssumeCapacity(&[3]u8{ 0xef, 0xbf, 0xbd }); - } - - var remain = input; - while (remain.len > 0) { - const result = strings.copyLatin1IntoUTF8(buffer.unusedCapacitySlice(), []const u8, remain); - - buffer.items.len += result.written; - remain = remain[result.read..]; - - if (result.written == 0 and result.read == 0) { - buffer.ensureUnusedCapacity(2) catch { - buffer.deinit(); - return globalObject.throwOutOfMemoryValue(); - }; - } else if (buffer.items.len == buffer.capacity and remain.len > 0) { - buffer.ensureTotalCapacity(buffer.items.len + remain.len + 1) catch { - buffer.deinit(); - return globalObject.throwOutOfMemoryValue(); - }; - } - } - - if (comptime Environment.isDebug) { - // wrap in comptime if so simdutf isn't called in a release build here. - bun.debugAssert(buffer.items.len == (bun.simdutf.length.utf8.from.latin1(input) + prepend_replacement_len)); - } - - return JSC.JSUint8Array.fromBytes(globalObject, buffer.items); - } - - fn encodeUTF16(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, input: []const u16) JSValue { - log("encodeUTF16: \"{}\"", .{bun.fmt.utf16(input)}); - - if (input.len == 0) return JSUint8Array.createEmpty(globalObject); - - const Prepend = struct { - bytes: [4]u8, - len: u3, - - pub const replacement: @This() = .{ .bytes = .{ 0xef, 0xbf, 0xbd, 0 }, .len = 3 }; - - pub fn fromSequence(seq: [4]u8, length: u3) @This() { - return .{ .bytes = seq, .len = length }; - } - }; - - var remain = input; - - const prepend: ?Prepend = prepend: { - if (this.pending_lead_surrogate) |lead| { - this.pending_lead_surrogate = null; - const maybe_trail = remain[0]; - if (strings.u16IsTrail(maybe_trail)) { - const converted = strings.utf16CodepointWithFFFD([]const u16, &.{ lead, maybe_trail }); - // shouldn't fail because `u16IsTrail` is true and `pending_lead_surrogate` is always - // a valid lead. - bun.debugAssert(!converted.fail); - - const sequence = strings.wtf8Sequence(converted.code_point); - - remain = remain[1..]; - if (remain.len == 0) { - return JSUint8Array.fromBytesCopy( - globalObject, - sequence[0..converted.utf8Width()], - ); - } - - break :prepend Prepend.fromSequence(sequence, converted.utf8Width()); - } - - break :prepend Prepend.replacement; - } - break :prepend null; - }; - - const length = bun.simdutf.length.utf8.from.utf16.le(remain); - - var buf = std.ArrayList(u8).initCapacity( - bun.default_allocator, - length + @as(usize, if (prepend) |pre| pre.len else 0), - ) catch { - return globalObject.throwOutOfMemoryValue(); - }; - - if (prepend) |*pre| { - buf.appendSliceAssumeCapacity(pre.bytes[0..pre.len]); - } - - const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(remain, buf.unusedCapacitySlice()); - - switch (result.status) { - else => { - // Slow path: there was invalid UTF-16, so we need to convert it without simdutf. - const lead_surrogate = strings.toUTF8ListWithTypeBun(&buf, []const u16, remain, true) catch { - buf.deinit(); - return globalObject.throwOutOfMemoryValue(); - }; - - if (lead_surrogate) |pending_lead| { - this.pending_lead_surrogate = pending_lead; - if (buf.items.len == 0) return JSUint8Array.createEmpty(globalObject); - } - - return JSC.JSUint8Array.fromBytes(globalObject, buf.items); - }, - .success => { - buf.items.len += result.count; - return JSC.JSUint8Array.fromBytes(globalObject, buf.items); - }, - } - } - - pub fn flush(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, _: *JSC.CallFrame) bun.JSError!JSValue { - return flushBody(this, globalObject); - } - - pub fn flushWithoutTypeChecks(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject) JSValue { - return flushBody(this, globalObject); - } - - fn flushBody(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject) JSValue { - return if (this.pending_lead_surrogate == null) - JSUint8Array.createEmpty(globalObject) - else - JSUint8Array.fromBytesCopy(globalObject, &.{ 0xef, 0xbf, 0xbd }); - } -}; - -pub const TextDecoder = struct { - - // used for utf8 decoding - buffered: struct { - buf: [3]u8 = .{0} ** 3, - len: u2 = 0, - - pub fn slice(this: *@This()) []const u8 { - return this.buf[0..this.len]; - } - } = .{}, - - // used for utf16 decoding - lead_byte: ?u8 = null, - lead_surrogate: ?u16 = null, - - ignore_bom: bool = false, - fatal: bool = false, - encoding: EncodingLabel = EncodingLabel.@"UTF-8", - - pub usingnamespace bun.New(TextDecoder); - - pub fn finalize(this: *TextDecoder) void { - this.destroy(); - } - - pub usingnamespace JSC.Codegen.JSTextDecoder; - - pub fn getIgnoreBOM( - this: *TextDecoder, - _: *JSC.JSGlobalObject, - ) JSC.JSValue { - return JSC.JSValue.jsBoolean(this.ignore_bom); - } - - pub fn getFatal( - this: *TextDecoder, - _: *JSC.JSGlobalObject, - ) JSC.JSValue { - return JSC.JSValue.jsBoolean(this.fatal); - } - - pub fn getEncoding( - this: *TextDecoder, - globalThis: *JSC.JSGlobalObject, - ) JSC.JSValue { - return ZigString.init(EncodingLabel.label.get(this.encoding).?).toJS(globalThis); - } - const Vector16 = std.meta.Vector(16, u16); - const max_16_ascii: Vector16 = @splat(@as(u16, 127)); - - fn processCodeUnitUTF16( - this: *TextDecoder, - output: *std.ArrayListUnmanaged(u16), - saw_error: *bool, - code_unit: u16, - ) error{OutOfMemory}!void { - if (this.lead_surrogate) |lead_surrogate| { - this.lead_surrogate = null; - - if (strings.u16IsTrail(code_unit)) { - // TODO: why is this here? - // const code_point = strings.u16GetSupplementary(lead_surrogate, code_unit); - try output.appendSlice( - bun.default_allocator, - &.{ lead_surrogate, code_unit }, - ); - return; - } - try output.append(bun.default_allocator, strings.unicode_replacement); - saw_error.* = true; - } - - if (strings.u16IsLead(code_unit)) { - this.lead_surrogate = code_unit; - return; - } - - if (strings.u16IsTrail(code_unit)) { - try output.append(bun.default_allocator, strings.unicode_replacement); - saw_error.* = true; - return; - } - - try output.append(bun.default_allocator, code_unit); - return; - } - - pub fn codeUnitFromBytesUTF16( - first: u16, - second: u16, - comptime big_endian: bool, - ) u16 { - return if (comptime big_endian) - (first << 8) | second - else - first | (second << 8); - } - - pub fn decodeUTF16( - this: *TextDecoder, - bytes: []const u8, - comptime big_endian: bool, - comptime flush: bool, - ) error{OutOfMemory}!struct { std.ArrayListUnmanaged(u16), bool } { - var output: std.ArrayListUnmanaged(u16) = .{}; - try output.ensureTotalCapacity(bun.default_allocator, @divFloor(bytes.len, 2)); - - var remain = bytes; - var saw_error = false; - - if (this.lead_byte) |lead_byte| { - if (remain.len > 0) { - this.lead_byte = null; - - try this.processCodeUnitUTF16( - &output, - &saw_error, - codeUnitFromBytesUTF16(@intCast(lead_byte), @intCast(remain[0]), big_endian), - ); - remain = remain[1..]; - } - } - - var i: usize = 0; - - while (i < remain.len -| 1) { - try this.processCodeUnitUTF16( - &output, - &saw_error, - codeUnitFromBytesUTF16(@intCast(remain[i]), @intCast(remain[i + 1]), big_endian), - ); - i += 2; - } - - if (remain.len != 0 and i == remain.len - 1) { - this.lead_byte = remain[i]; - } else { - bun.assertWithLocation(i == remain.len, @src()); - } - - if (comptime flush) { - if (this.lead_byte != null or this.lead_surrogate != null) { - this.lead_byte = null; - this.lead_surrogate = null; - try output.append(bun.default_allocator, strings.unicode_replacement); - saw_error = true; - return .{ output, saw_error }; - } - } - - return .{ output, saw_error }; - } - - pub fn decode(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) bun.JSError!JSValue { - const arguments = callframe.arguments_old(2).slice(); - - const input_slice = input_slice: { - if (arguments.len == 0 or arguments[0].isUndefined()) { - break :input_slice ""; - } - - if (arguments[0].asArrayBuffer(globalThis)) |array_buffer| { - break :input_slice array_buffer.slice(); - } - - return globalThis.throwInvalidArguments("TextDecoder.decode expects an ArrayBuffer or TypedArray", .{}); - }; - - const stream = stream: { - if (arguments.len > 1 and arguments[1].isObject()) { - if (arguments[1].fastGet(globalThis, .stream)) |stream_value| { - const stream_bool = stream_value.coerce(bool, globalThis); - if (globalThis.hasException()) { - return .zero; - } - break :stream stream_bool; - } - } - - break :stream false; - }; - - return switch (!stream) { - inline else => |flush| this.decodeSlice(globalThis, input_slice, flush), - }; - } - - pub fn decodeWithoutTypeChecks(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, uint8array: *JSC.JSUint8Array) bun.JSError!JSValue { - return this.decodeSlice(globalThis, uint8array.slice(), false); - } - - fn decodeSlice(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, buffer_slice: []const u8, comptime flush: bool) bun.JSError!JSValue { - switch (this.encoding) { - EncodingLabel.latin1 => { - if (strings.isAllASCII(buffer_slice)) { - return ZigString.init(buffer_slice).toJS(globalThis); - } - - // It's unintuitive that we encode Latin1 as UTF16 even though the engine natively supports Latin1 strings... - // However, this is also what WebKit seems to do. - // - // It's not clear why we couldn't jusst use Latin1 here, but tests failures proved it necessary. - const out_length = strings.elementLengthLatin1IntoUTF16([]const u8, buffer_slice); - const bytes = try globalThis.allocator().alloc(u16, out_length); - - const out = strings.copyLatin1IntoUTF16([]u16, bytes, []const u8, buffer_slice); - return ZigString.toExternalU16(bytes.ptr, out.written, globalThis); - }, - EncodingLabel.@"UTF-8" => { - const input, const deinit = input: { - const maybe_without_bom = if (!this.ignore_bom and strings.hasPrefixComptime(buffer_slice, "\xef\xbb\xbf")) - buffer_slice[3..] - else - buffer_slice; - - if (this.buffered.len > 0) { - defer this.buffered.len = 0; - const joined = try bun.default_allocator.alloc(u8, maybe_without_bom.len + this.buffered.len); - @memcpy(joined[0..this.buffered.len], this.buffered.slice()); - @memcpy(joined[this.buffered.len..][0..maybe_without_bom.len], maybe_without_bom); - break :input .{ joined, true }; - } - - break :input .{ maybe_without_bom, false }; - }; - - const maybe_decode_result = switch (this.fatal) { - inline else => |fail_if_invalid| strings.toUTF16AllocMaybeBuffered(bun.default_allocator, input, fail_if_invalid, flush) catch |err| { - if (deinit) bun.default_allocator.free(input); - if (comptime fail_if_invalid) { - if (err == error.InvalidByteSequence) { - return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("Invalid byte sequence", .{}).throw(); - } - } - - bun.assert(err == error.OutOfMemory); - return globalThis.throwOutOfMemory(); - }, - }; - - if (maybe_decode_result) |decode_result| { - if (deinit) bun.default_allocator.free(input); - const decoded, const leftover, const leftover_len = decode_result; - bun.assert(this.buffered.len == 0); - if (comptime !flush) { - if (leftover_len != 0) { - this.buffered.buf = leftover; - this.buffered.len = leftover_len; - } - } - return ZigString.toExternalU16(decoded.ptr, decoded.len, globalThis); - } - - bun.debugAssert(input.len == 0 or !deinit); - - // Experiment: using mimalloc directly is slightly slower - return ZigString.init(input).toJS(globalThis); - }, - - inline .@"UTF-16LE", .@"UTF-16BE" => |utf16_encoding| { - const bom = if (comptime utf16_encoding == .@"UTF-16LE") "\xff\xfe" else "\xfe\xff"; - const input = if (!this.ignore_bom and strings.hasPrefixComptime(buffer_slice, bom)) - buffer_slice[2..] - else - buffer_slice; - - var decoded, const saw_error = try this.decodeUTF16(input, utf16_encoding == .@"UTF-16BE", flush); - - if (saw_error and this.fatal) { - decoded.deinit(bun.default_allocator); - return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("The encoded data was not valid {s} data", .{@tagName(utf16_encoding)}).throw(); - } - - var output = bun.String.fromUTF16(decoded.items); - return output.toJS(globalThis); - }, - else => { - return globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{}); - }, - } - } - - pub fn constructor(globalThis: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) bun.JSError!*TextDecoder { - var args_ = callframe.arguments_old(2); - var arguments: []const JSC.JSValue = args_.ptr[0..args_.len]; - - var decoder = TextDecoder{}; - - if (arguments.len > 0) { - // encoding - if (arguments[0].isString()) { - var str = try arguments[0].toSlice(globalThis, bun.default_allocator); - defer if (str.isAllocated()) str.deinit(); - - if (EncodingLabel.which(str.slice())) |label| { - decoder.encoding = label; - } else { - return globalThis.throwInvalidArguments("Unsupported encoding label \"{s}\"", .{str.slice()}); - } - } else if (arguments[0].isUndefined()) { - // default to utf-8 - decoder.encoding = EncodingLabel.@"UTF-8"; - } else { - return globalThis.throwInvalidArguments("TextDecoder(encoding) label is invalid", .{}); - } - - if (arguments.len >= 2) { - const options = arguments[1]; - - if (!options.isObject()) { - return globalThis.throwInvalidArguments("TextDecoder(options) is invalid", .{}); - } - - if (try options.get(globalThis, "fatal")) |fatal| { - if (fatal.isBoolean()) { - decoder.fatal = fatal.asBoolean(); - } else { - return globalThis.throwInvalidArguments("TextDecoder(options) fatal is invalid. Expected boolean value", .{}); - } - } - - if (try options.get(globalThis, "ignoreBOM")) |ignoreBOM| { - if (ignoreBOM.isBoolean()) { - decoder.ignore_bom = ignoreBOM.asBoolean(); - } else { - return globalThis.throwInvalidArguments("TextDecoder(options) ignoreBOM is invalid. Expected boolean value", .{}); - } - } - } - } - - return TextDecoder.new(decoder); - } -}; +pub const TextEncoder = @import("./TextEncoder.zig"); +pub const EncodingLabel = @import("./EncodingLabel.zig").EncodingLabel; +pub const TextEncoderStreamEncoder = @import("./TextEncoderStreamEncoder.zig"); +pub const TextDecoder = @import("./TextDecoder.zig"); pub const Encoder = struct { export fn Bun__encoding__writeLatin1(input: [*]const u8, len: usize, to: [*]u8, to_len: usize, encoding: u8) usize { @@ -1468,22 +529,23 @@ pub const Encoder = struct { }, } } - - comptime { - _ = Bun__encoding__writeLatin1; - _ = Bun__encoding__writeUTF16; - - _ = Bun__encoding__byteLengthLatin1AsUTF8; - _ = Bun__encoding__byteLengthUTF16AsUTF8; - - _ = Bun__encoding__toString; - _ = Bun__encoding__toStringUTF8; - - _ = Bun__encoding__constructFromLatin1; - _ = Bun__encoding__constructFromUTF16; - } }; comptime { - std.testing.refAllDecls(Encoder); + _ = &TextEncoder.TextEncoder__encode8; + _ = &TextEncoder.TextEncoder__encode16; + _ = &TextEncoder.TextEncoder__encodeInto8; + _ = &TextEncoder.TextEncoder__encodeInto16; + _ = &TextEncoder.TextEncoder__encodeRopeString; +} + +comptime { + _ = &Encoder.Bun__encoding__writeLatin1; + _ = &Encoder.Bun__encoding__writeUTF16; + _ = &Encoder.Bun__encoding__byteLengthLatin1AsUTF8; + _ = &Encoder.Bun__encoding__byteLengthUTF16AsUTF8; + _ = &Encoder.Bun__encoding__toString; + _ = &Encoder.Bun__encoding__toStringUTF8; + _ = &Encoder.Bun__encoding__constructFromLatin1; + _ = &Encoder.Bun__encoding__constructFromUTF16; }