Files
bun.sh/src/javascript/jsc/webcore/encoding.zig
Jarred Sumner 6b82b67065 var -> const
2022-03-18 20:04:24 -07:00

655 lines
22 KiB
Zig

const std = @import("std");
const Api = @import("../../../api/schema.zig").Api;
const RequestContext = @import("../../../http.zig").RequestContext;
const MimeType = @import("../../../http.zig").MimeType;
const ZigURL = @import("../../../url.zig").URL;
const HTTPClient = @import("http");
const NetworkThread = HTTPClient.NetworkThread;
const JSC = @import("../../../jsc.zig");
const js = JSC.C;
const Method = @import("../../../http/method.zig").Method;
const ObjectPool = @import("../../../pool.zig").ObjectPool;
const Output = @import("../../../global.zig").Output;
const MutableString = @import("../../../global.zig").MutableString;
const strings = @import("../../../global.zig").strings;
const string = @import("../../../global.zig").string;
const default_allocator = @import("../../../global.zig").default_allocator;
const FeatureFlags = @import("../../../global.zig").FeatureFlags;
const ArrayBuffer = @import("../base.zig").ArrayBuffer;
const Properties = @import("../base.zig").Properties;
const NewClass = @import("../base.zig").NewClass;
const d = @import("../base.zig").d;
const castObj = @import("../base.zig").castObj;
const getAllocator = @import("../base.zig").getAllocator;
const JSPrivateDataPtr = @import("../base.zig").JSPrivateDataPtr;
const GetJSPrivateData = @import("../base.zig").GetJSPrivateData;
const Environment = @import("../../../env.zig");
const ZigString = JSC.ZigString;
const JSInternalPromise = JSC.JSInternalPromise;
const JSPromise = JSC.JSPromise;
const JSValue = JSC.JSValue;
const JSError = JSC.JSError;
const JSGlobalObject = JSC.JSGlobalObject;
const VirtualMachine = @import("../javascript.zig").VirtualMachine;
const Task = @import("../javascript.zig").Task;
const picohttp = @import("picohttp");
/// JavaScript-facing `TextEncoder`: produces UTF-8 bytes from a JS string.
///
/// The struct carries no meaningful per-instance state; every object built by
/// `constructor` is backed by the same `text_encoder` value.
pub const TextEncoder = struct {
    filler: u32 = 0,

    /// Shared backing value passed to `Class.make` for every constructed object.
    var text_encoder: TextEncoder = TextEncoder{};

    const utf8_string: string = "utf-8";

    /// Property names of the object returned by `encodeInto`.
    const read_key = ZigString.init("read");
    const written_key = ZigString.init("written");

    pub const Constructor = JSC.NewConstructor(
        TextEncoder,
        .{
            .constructor = .{ .rfn = constructor },
        },
        .{},
    );

    pub const Class = NewClass(
        TextEncoder,
        .{
            .name = "TextEncoder",
        },
        .{
            .encode = .{
                .rfn = encode,
            },
            .encodeInto = .{
                .rfn = encodeInto,
            },
        },
        .{
            .encoding = .{
                .get = getEncoding,
                .readOnly = true,
            },
        },
    );

    /// Getter for the read-only `encoding` property; always yields "utf-8".
    pub fn getEncoding(
        _: *TextEncoder,
        ctx: js.JSContextRef,
        _: js.JSValueRef,
        _: js.JSStringRef,
        _: js.ExceptionRef,
    ) js.JSValueRef {
        const name = ZigString.init(utf8_string);
        return name.toValue(ctx.ptr()).asObjectRef();
    }

    /// `encode(input)`: returns a Uint8Array holding `input` converted to UTF-8.
    ///
    /// With no arguments an empty Uint8Array is returned. If the conversion
    /// buffer cannot be allocated, an exception with the message
    /// "Out of memory" is set and `null` is returned.
    pub fn encode(
        _: *TextEncoder,
        ctx: js.JSContextRef,
        _: js.JSObjectRef,
        _: js.JSObjectRef,
        args: []const js.JSValueRef,
        exception: js.ExceptionRef,
    ) js.JSValueRef {
        const js_args: []const JSValue = @ptrCast([*]const JSValue, args.ptr)[0..args.len];
        if (js_args.len == 0) {
            return JSC.C.JSObjectMakeTypedArray(ctx, JSC.C.JSTypedArrayType.kJSTypedArrayTypeUint8Array, 0, exception);
        }

        const source = js_args[0].getZigString(ctx.ptr());

        // The string is either 16-bit (UTF-16) or 8-bit (treated as Latin-1);
        // each representation has its own conversion routine.
        // Left as `var`: the receiver type of `toJS` is not visible from this block.
        var array_buffer: ArrayBuffer = if (source.is16Bit()) blk: {
            const utf16 = source.utf16Slice();
            const bytes = strings.toUTF8AllocWithType(default_allocator, @TypeOf(utf16), utf16) catch {
                JSC.throwInvalidArguments("Out of memory", .{}, ctx, exception);
                return null;
            };
            break :blk ArrayBuffer.fromBytes(bytes, .Uint8Array);
        } else blk: {
            const bytes = strings.allocateLatin1IntoUTF8(default_allocator, []const u8, source.slice()) catch {
                JSC.throwInvalidArguments("Out of memory", .{}, ctx, exception);
                return null;
            };
            break :blk ArrayBuffer.fromBytes(bytes, .Uint8Array);
        };

        return array_buffer.toJS(ctx, exception).asObjectRef();
    }

    /// `encodeInto(input, destination)`: writes the UTF-8 form of `input` into
    /// the bytes of `destination` and returns `{ read, written }`.
    ///
    /// Sets an exception and returns `null` when fewer than two arguments are
    /// given or when the second argument is not backed by an array buffer.
    pub fn encodeInto(
        _: *TextEncoder,
        ctx: js.JSContextRef,
        _: js.JSObjectRef,
        _: js.JSObjectRef,
        args: []const js.JSValueRef,
        exception: js.ExceptionRef,
    ) js.JSValueRef {
        const js_args: []const JSValue = @ptrCast([*]const JSValue, args.ptr)[0..args.len];
        if (js_args.len < 2) {
            JSC.throwInvalidArguments("TextEncoder.encodeInto expects (string, Uint8Array)", .{}, ctx, exception);
            return null;
        }

        const destination = js_args[1].asArrayBuffer(ctx.ptr()) orelse {
            JSC.throwInvalidArguments("TextEncoder.encodeInto expects a Uint8Array", .{}, ctx, exception);
            return null;
        };
        const out_bytes = destination.slice();
        const source = js_args[0].getZigString(ctx.ptr());

        const result: strings.EncodeIntoResult = if (source.is16Bit()) blk: {
            const utf16 = source.utf16Slice();
            break :blk strings.copyUTF16IntoUTF8(out_bytes, @TypeOf(utf16), utf16);
        } else blk: {
            const latin1_bytes = source.slice();
            break :blk strings.copyLatin1IntoUTF8(out_bytes, @TypeOf(latin1_bytes), latin1_bytes);
        };

        return JSValue.createObject2(
            ctx.ptr(),
            &read_key,
            &written_key,
            JSValue.jsNumber(result.read),
            JSValue.jsNumber(result.written),
        ).asObjectRef();
    }

    /// `new TextEncoder()`: wraps the shared `text_encoder` value in a JS object.
    pub fn constructor(
        ctx: js.JSContextRef,
        _: js.JSObjectRef,
        _: []const js.JSValueRef,
        _: js.ExceptionRef,
    ) js.JSObjectRef {
        return TextEncoder.Class.make(ctx, &text_encoder);
    }
};
/// Encoding names from the WHATWG Encoding Standard.
/// https://encoding.spec.whatwg.org/encodings.json
///
/// Every encoding has a tag here, but `which` only resolves labels for
/// UTF-8, UTF-16LE and windows-1252.
pub const EncodingLabel = enum {
    @"UTF-8",
    @"IBM866",
    @"ISO-8859-2",
    @"ISO-8859-3",
    @"ISO-8859-4",
    @"ISO-8859-5",
    @"ISO-8859-6",
    @"ISO-8859-7",
    @"ISO-8859-8",
    @"ISO-8859-8-I",
    @"ISO-8859-10",
    @"ISO-8859-13",
    @"ISO-8859-14",
    @"ISO-8859-15",
    @"ISO-8859-16",
    @"KOI8-R",
    @"KOI8-U",
    @"macintosh",
    @"windows-874",
    @"windows-1250",
    @"windows-1251",
    /// Also known as
    /// - ASCII
    /// - latin1
    @"windows-1252",
    @"windows-1253",
    @"windows-1254",
    @"windows-1255",
    @"windows-1256",
    @"windows-1257",
    @"windows-1258",
    @"x-mac-cyrillic",
    @"Big5",
    @"EUC-JP",
    @"ISO-2022-JP",
    @"Shift_JIS",
    @"EUC-KR",
    @"UTF-16BE",
    @"UTF-16LE",
    @"x-user-defined",

    pub const Map = std.enums.EnumMap(EncodingLabel, string);

    /// Lowercase display name per encoding. Only the three encodings that
    /// `which` can return have a name; every other entry is the empty string.
    pub const label: Map = brk: {
        var map = Map.initFull("");
        map.put(EncodingLabel.@"UTF-8", "utf-8");
        map.put(EncodingLabel.@"UTF-16LE", "utf-16le");
        map.put(EncodingLabel.@"windows-1252", "windows-1252");
        break :brk map;
    };

    // The three tables below list the labels accepted by `which`. They are
    // not referenced by the code in this enum; `which` spells the labels out
    // again in its switch prongs.
    const utf16_names = [_]string{
        "ucs-2",
        "utf-16",
        "unicode",
        "utf-16le",
        "csunicode",
        "unicodefeff",
        "iso-10646-ucs-2",
    };

    const utf8_names = [_]string{
        "utf8",
        "utf-8",
        "unicode11utf8",
        "unicode20utf8",
        "x-unicode20utf8",
        "unicode-1-1-utf-8",
    };

    const latin1_names = [_]string{
        "l1",
        "ascii",
        "cp819",
        "cp1252",
        "ibm819",
        "latin1",
        "iso88591",
        "us-ascii",
        "x-cp1252",
        "iso8859-1",
        "iso_8859-1",
        "iso-8859-1",
        "iso-ir-100",
        "csisolatin1",
        "windows-1252",
        "ansi_x3.4-1968",
        "iso_8859-1:1987",
    };

    /// Alias for the windows-1252 tag.
    pub const latin1 = EncodingLabel.@"windows-1252";

    /// Resolves an encoding label to its `EncodingLabel`, or `null` when the
    /// label is not one of the supported UTF-8 / UTF-16LE / windows-1252 names.
    ///
    /// Leading and trailing ASCII whitespace is ignored. The Encoding
    /// Standard defines ASCII whitespace as TAB, LF, FF, CR and SPACE, so
    /// form feed (0x0C) is stripped along with the other four.
    pub fn which(input_: string) ?EncodingLabel {
        const input = strings.trim(input_, " \t\r\n\x0C");
        const ExactMatcher = strings.ExactSizeMatcher;
        const Eight = ExactMatcher(8);
        const Sixteen = ExactMatcher(16);

        // Dispatch on length first so each label is compared with a matcher
        // sized for it. `matchLower` is relied on for the case-insensitive
        // comparison (its implementation is not visible here).
        return switch (input.len) {
            0, 1 => null,
            2...8 => switch (Eight.matchLower(input)) {
                Eight.case("l1"),
                Eight.case("ascii"),
                Eight.case("cp819"),
                Eight.case("cp1252"),
                Eight.case("ibm819"),
                Eight.case("latin1"),
                Eight.case("iso88591"),
                Eight.case("us-ascii"),
                Eight.case("x-cp1252"),
                => EncodingLabel.latin1,
                Eight.case("ucs-2"),
                Eight.case("utf-16"),
                Eight.case("unicode"),
                Eight.case("utf-16le"),
                => EncodingLabel.@"UTF-16LE",
                Eight.case("utf8"), Eight.case("utf-8") => EncodingLabel.@"UTF-8",
                else => null,
            },
            9...16 => switch (Sixteen.matchLower(input)) {
                Sixteen.case("iso8859-1"),
                Sixteen.case("iso_8859-1"),
                Sixteen.case("iso-8859-1"),
                Sixteen.case("iso-ir-100"),
                Sixteen.case("csisolatin1"),
                Sixteen.case("windows-1252"),
                Sixteen.case("ansi_x3.4-1968"),
                Sixteen.case("iso_8859-1:1987"),
                => EncodingLabel.latin1,
                Sixteen.case("unicode11utf8"),
                Sixteen.case("unicode20utf8"),
                Sixteen.case("x-unicode20utf8"),
                => EncodingLabel.@"UTF-8",
                Sixteen.case("csunicode"),
                Sixteen.case("unicodefeff"),
                Sixteen.case("iso-10646-ucs-2"),
                => EncodingLabel.@"UTF-16LE",
                else => null,
            },
            // The only supported label longer than 16 bytes.
            else => if (strings.eqlCaseInsensitiveASCII(input, "unicode-1-1-utf-8", true))
                EncodingLabel.@"UTF-8"
            else
                null,
        };
    }
};
/// JavaScript-facing `TextDecoder`: turns the bytes of an ArrayBuffer or
/// TypedArray into a JS string.
///
/// `decode` handles three encodings: UTF-8, UTF-16LE and latin1 (the
/// windows-1252 tag). Any other `encoding` value makes `decode` throw.
///
/// NOTE(review): `ignore_bom` is stored and exposed as a property, but
/// `decode` in this struct never reads it.
pub const TextDecoder = struct {
    scratch_memory: []u8 = &[_]u8{},
    ignore_bom: bool = false,
    fatal: bool = false,
    // `EncodingLabel` has no `utf8` member; the UTF-8 tag is spelled `@"UTF-8"`.
    encoding: EncodingLabel = EncodingLabel.@"UTF-8",

    pub const Class = NewClass(
        TextDecoder,
        .{
            .name = "TextDecoder",
        },
        .{
            .decode = .{
                .rfn = decode,
            },
        },
        .{
            .encoding = .{
                .get = getEncoding,
                .readOnly = true,
            },
            .ignoreBOM = .{
                .get = getIgnoreBOM,
                .set = setIgnoreBOM,
            },
            .fatal = .{
                .get = getFatal,
                .set = setFatal,
            },
        },
    );

    /// Getter for the `ignoreBOM` property.
    pub fn getIgnoreBOM(
        this: *TextDecoder,
        _: js.JSContextRef,
        _: js.JSValueRef,
        _: js.JSStringRef,
        _: js.ExceptionRef,
    ) js.JSValueRef {
        return JSC.JSValue.jsBoolean(this.ignore_bom).asObjectRef();
    }

    /// Setter for the `ignoreBOM` property; the value is coerced to a boolean.
    pub fn setIgnoreBOM(
        this: *TextDecoder,
        _: js.JSContextRef,
        _: js.JSValueRef,
        _: js.JSStringRef,
        value: JSC.C.JSValueRef,
        _: js.ExceptionRef,
    ) bool {
        this.ignore_bom = JSValue.fromRef(value).toBoolean();
        return true;
    }

    /// Setter for the `fatal` property; the value is coerced to a boolean.
    pub fn setFatal(
        this: *TextDecoder,
        _: js.JSContextRef,
        _: js.JSValueRef,
        _: js.JSStringRef,
        value: JSC.C.JSValueRef,
        _: js.ExceptionRef,
    ) bool {
        this.fatal = JSValue.fromRef(value).toBoolean();
        return true;
    }

    /// Getter for the `fatal` property.
    pub fn getFatal(
        this: *TextDecoder,
        _: js.JSContextRef,
        _: js.JSValueRef,
        _: js.JSStringRef,
        _: js.ExceptionRef,
    ) js.JSValueRef {
        return JSC.JSValue.jsBoolean(this.fatal).asObjectRef();
    }

    // Not referenced by the code in this struct.
    const utf8_string: string = "utf-8";

    /// Getter for the read-only `encoding` property. Returns the name that
    /// `EncodingLabel.label` holds for this decoder's encoding.
    pub fn getEncoding(
        this: *TextDecoder,
        ctx: js.JSContextRef,
        _: js.JSValueRef,
        _: js.JSStringRef,
        _: js.ExceptionRef,
    ) js.JSValueRef {
        return ZigString.init(EncodingLabel.label.get(this.encoding).?).toValue(ctx.ptr()).asObjectRef();
    }

    // Not referenced by the code in this struct.
    const Vector16 = std.meta.Vector(16, u16);
    const max_16_ascii: Vector16 = @splat(16, @as(u16, 127));

    /// Converts UTF-16 code units into a JS string, replacing every unpaired
    /// surrogate with `strings.unicode_replacement`. All other code units,
    /// including well-formed surrogate pairs, are copied through unchanged.
    ///
    /// `Slice` is either `[]u16` or `[]align(1) u16`, which lets the caller
    /// pass the data with or without an alignment guarantee.
    fn decodeUTF16WithAlignment(
        _: *TextDecoder,
        comptime Slice: type,
        slice: Slice,
        ctx: js.JSContextRef,
    ) JSC.C.JSValueRef {
        // Locate the first code unit above 127. Whole vectors are skipped
        // only while every lane passes the `strings` range check; the scalar
        // loop then finishes the scan one unit at a time. The scan must stop
        // AT a chunk containing an out-of-range unit, never step over it.
        // NOTE(review): assumes `strings.max_u16_ascii` is at most 127 in
        // every lane -- confirm against its definition.
        var i: usize = 0;
        while (i + strings.ascii_u16_vector_size <= slice.len) {
            const vec: strings.AsciiU16Vector = slice[i..][0..strings.ascii_u16_vector_size].*;
            const outside_range = @reduce(
                .Or,
                @bitCast(strings.AsciiVectorU16U1, vec > strings.max_u16_ascii) |
                    @bitCast(strings.AsciiVectorU16U1, vec < strings.min_u16_ascii),
            );
            if (outside_range != 0) break;
            i += strings.ascii_u16_vector_size;
        }
        while (i < slice.len and slice[i] <= 127) {
            i += 1;
        }

        // is this actually a UTF-16 string that is just ascii?
        // we can still allocate as UTF-16 and just copy the bytes
        if (i == slice.len) {
            if (comptime Slice == []u16) {
                // JSStringCreateWithCharacters follows the Create Rule: the
                // returned JSStringRef is owned here and must be released
                // once the JS value has been made from it.
                const js_string = JSC.C.JSStringCreateWithCharacters(slice.ptr, slice.len);
                defer JSC.C.JSStringRelease(js_string);
                return JSC.C.JSValueMakeString(ctx, js_string);
            } else {
                var str = ZigString.init("");
                str.ptr = @ptrCast([*]u8, slice.ptr);
                str.len = slice.len;
                str.markUTF16();
                return str.toValueGC(ctx.ptr()).asObjectRef();
            }
        }

        // Slow path: build a sanitized copy. Every input code unit produces
        // exactly one output code unit, so reserving `slice.len` up front is
        // enough for all of the `appendAssumeCapacity` calls below.
        var buffer = std.ArrayListAlignedUnmanaged(u16, @alignOf(@TypeOf(slice.ptr))){};
        defer buffer.deinit(default_allocator);
        buffer.ensureTotalCapacity(default_allocator, slice.len) catch unreachable;

        // The first `i` units are known to be <= 127; copy them as-is.
        buffer.items.len = i;
        if (comptime Slice == []u16) {
            @memcpy(
                std.mem.sliceAsBytes(buffer.items).ptr,
                std.mem.sliceAsBytes(slice).ptr,
                std.mem.sliceAsBytes(slice[0..i]).len,
            );
        } else {
            for (slice[0..i]) |ch, j| {
                buffer.items[j] = ch;
            }
        }

        const first_high_surrogate = 0xD800;
        const last_high_surrogate = 0xDBFF;
        const first_low_surrogate = 0xDC00;
        const last_low_surrogate = 0xDFFF;

        var remainder = slice[i..];
        while (remainder.len > 0) {
            switch (remainder[0]) {
                first_high_surrogate...last_high_surrogate => |high| {
                    if (remainder.len > 1 and
                        remainder[1] >= first_low_surrogate and
                        remainder[1] <= last_low_surrogate)
                    {
                        // Well-formed pair: keep both halves.
                        buffer.appendAssumeCapacity(high);
                        buffer.appendAssumeCapacity(remainder[1]);
                        remainder = remainder[2..];
                    } else {
                        // High surrogate with no low surrogate after it.
                        buffer.appendAssumeCapacity(strings.unicode_replacement);
                        remainder = remainder[1..];
                    }
                },
                // A low surrogate reached here was not preceded by a high one.
                first_low_surrogate...last_low_surrogate => {
                    buffer.appendAssumeCapacity(strings.unicode_replacement);
                    remainder = remainder[1..];
                },
                // Any non-surrogate code unit is a complete character by
                // itself; copy the whole run up to the next surrogate.
                else => {
                    var count: usize = 1;
                    while (count < remainder.len) : (count += 1) {
                        const next = remainder[count];
                        if (next >= first_high_surrogate and next <= last_low_surrogate) break;
                    }
                    for (remainder[0..count]) |unit| {
                        buffer.appendAssumeCapacity(unit);
                    }
                    remainder = remainder[count..];
                },
            }
        }

        // `buffer` is freed by the `defer` above after the JS value has been
        // created from it.
        var out = ZigString.init("");
        out.ptr = @ptrCast([*]u8, buffer.items.ptr);
        out.len = buffer.items.len;
        out.markUTF16();
        return out.toValueGC(ctx.ptr()).asObjectRef();
    }

    /// `decode(input)`: returns the JS string for the bytes of `input`,
    /// interpreted according to this decoder's `encoding`.
    ///
    /// A missing or `undefined` argument, or a zero-length buffer, yields the
    /// empty string. An argument that is not array-buffer backed, or an
    /// unsupported encoding, sets an exception and returns `null`. For UTF-8
    /// with `fatal` set, an invalid byte sequence also sets an exception.
    pub fn decode(
        this: *TextDecoder,
        ctx: js.JSContextRef,
        _: js.JSObjectRef,
        _: js.JSObjectRef,
        args: []const js.JSValueRef,
        exception: js.ExceptionRef,
    ) js.JSValueRef {
        const arguments: []const JSC.JSValue = @ptrCast([*]const JSC.JSValue, args.ptr)[0..args.len];
        if (arguments.len < 1 or arguments[0].isUndefined()) {
            return ZigString.Empty.toValue(ctx.ptr()).asObjectRef();
        }

        const array_buffer = arguments[0].asArrayBuffer(ctx.ptr()) orelse {
            JSC.throwInvalidArguments("TextDecoder.decode expects an ArrayBuffer or TypedArray", .{}, ctx, exception);
            return null;
        };
        if (array_buffer.len == 0) {
            return ZigString.Empty.toValue(ctx.ptr()).asObjectRef();
        }

        // Keep the input value protected for as long as its bytes are read.
        JSC.C.JSValueProtect(ctx, args[0]);
        defer JSC.C.JSValueUnprotect(ctx, args[0]);

        switch (this.encoding) {
            EncodingLabel.latin1 => {
                return ZigString.init(array_buffer.slice()).toValueGC(ctx.ptr()).asObjectRef();
            },
            EncodingLabel.@"UTF-8" => {
                const buffer_slice = array_buffer.slice();
                // The two branches differ only in the literal `true`/`false`
                // passed to `toUTF16Alloc` and in the errors they report.
                if (this.fatal) {
                    if (strings.toUTF16Alloc(default_allocator, buffer_slice, true)) |result_| {
                        if (result_) |result| {
                            return ZigString.toExternalU16(result.ptr, result.len, ctx.ptr()).asObjectRef();
                        }
                    } else |err| {
                        switch (err) {
                            error.InvalidByteSequence => {
                                JSC.JSError(default_allocator, "Invalid byte sequence", .{}, ctx, exception);
                                return null;
                            },
                            error.OutOfMemory => {
                                JSC.JSError(default_allocator, "Out of memory", .{}, ctx, exception);
                                return null;
                            },
                            else => {
                                JSC.JSError(default_allocator, "Unknown error", .{}, ctx, exception);
                                return null;
                            },
                        }
                    }
                } else {
                    if (strings.toUTF16Alloc(default_allocator, buffer_slice, false)) |result_| {
                        if (result_) |result| {
                            return ZigString.toExternalU16(result.ptr, result.len, ctx.ptr()).asObjectRef();
                        }
                    } else |err| {
                        switch (err) {
                            error.OutOfMemory => {
                                JSC.JSError(default_allocator, "Out of memory", .{}, ctx, exception);
                                return null;
                            },
                            else => {
                                JSC.JSError(default_allocator, "Unknown error", .{}, ctx, exception);
                                return null;
                            },
                        }
                    }
                }

                // Reached when `toUTF16Alloc` returned null, i.e. it produced
                // no UTF-16 buffer; the original bytes are used directly.
                // Experiment: using mimalloc directly is slightly slower
                return ZigString.init(buffer_slice).toValueGC(ctx.ptr()).asObjectRef();
            },
            EncodingLabel.@"UTF-16LE" => {
                // NOTE(review): this tests against the alignment of the
                // pointer type `[*]u16`, which is stricter than `u16` itself
                // needs. Left unchanged because the alignment `asU16`
                // requires is not visible here.
                if (std.mem.isAligned(@ptrToInt(array_buffer.ptr) + @as(usize, array_buffer.offset), @alignOf([*]u16))) {
                    return this.decodeUTF16WithAlignment([]u16, array_buffer.asU16(), ctx);
                }
                return this.decodeUTF16WithAlignment([]align(1) u16, array_buffer.asU16Unaligned(), ctx);
            },
            else => {
                JSC.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{}, ctx, exception);
                return null;
            },
        }
    }

    pub const Constructor = JSC.NewConstructor(TextDecoder, .{
        .constructor = .{ .rfn = constructor },
    }, .{});

    /// `new TextDecoder(label?)`: allocates a decoder for `label`, defaulting
    /// to UTF-8 when no argument is given.
    ///
    /// Sets an exception and returns `null` when `label` is not a string or
    /// is not recognised by `EncodingLabel.which`.
    pub fn constructor(
        ctx: js.JSContextRef,
        _: js.JSObjectRef,
        args_: []const js.JSValueRef,
        exception: js.ExceptionRef,
    ) js.JSObjectRef {
        const arguments: []const JSC.JSValue = @ptrCast([*]const JSC.JSValue, args_.ptr)[0..args_.len];
        var encoding = EncodingLabel.@"UTF-8";

        if (arguments.len > 0) {
            if (!arguments[0].isString()) {
                JSC.throwInvalidArguments("TextDecoder(encoding) label is invalid", .{}, ctx, exception);
                return null;
            }

            var str = arguments[0].toSlice(ctx.ptr(), default_allocator);
            defer if (str.allocated) str.deinit();
            encoding = EncodingLabel.which(str.slice()) orelse {
                JSC.throwInvalidArguments("Unsupported encoding label \"{s}\"", .{str.slice()}, ctx, exception);
                return null;
            };
        }

        var decoder = getAllocator(ctx).create(TextDecoder) catch unreachable;
        decoder.* = TextDecoder{ .encoding = encoding };
        return TextDecoder.Class.make(ctx, decoder);
    }
};
// Placeholder test block: its body is empty, so it asserts nothing.
test "Vec" {}