Add a fast path to TextDecoder UTF-16 for when you pass a Uint16Array

This commit is contained in:
Jarred Sumner
2022-03-14 23:47:01 -07:00
parent 8f93cc37d2
commit ec14a2ef5d

View File

@@ -415,6 +415,125 @@ pub const TextDecoder = struct {
}
const Vector16 = std.meta.Vector(16, u16);
const max_16_ascii: Vector16 = @splat(16, @as(u16, 127));
fn decodeUTF16WithAlignment(
_: *TextDecoder,
comptime Slice: type,
slice: Slice,
ctx: js.JSContextRef,
) JSC.C.JSValueRef {
var i: usize = 0;
while (i < slice.len) {
while (i + strings.ascii_u16_vector_size <= slice.len) {
const vec: strings.AsciiU16Vector = slice[i..][0..strings.ascii_u16_vector_size].*;
if ((@reduce(
.Or,
@bitCast(
strings.AsciiVectorU16U1,
vec > strings.max_u16_ascii,
) | @bitCast(
strings.AsciiVectorU16U1,
vec < strings.min_u16_ascii,
),
) == 0)) {
break;
}
i += strings.ascii_u16_vector_size;
}
while (i < slice.len and slice[i] <= 127) {
i += 1;
}
break;
}
// is this actually a UTF-16 string that is just ascii?
// we can still allocate as UTF-16 and just copy the bytes
if (i == slice.len) {
if (comptime Slice == []u16) {
return JSC.C.JSValueMakeString(ctx, JSC.C.JSStringCreateWithCharacters(slice.ptr, slice.len));
} else {
var str = ZigString.init("");
str.ptr = @ptrCast([*]u8, slice.ptr);
str.len = slice.len;
str.markUTF16();
return str.toValueGC(ctx.ptr()).asObjectRef();
}
}
var buffer = std.ArrayListAlignedUnmanaged(u16, @alignOf(@TypeOf(slice.ptr))){};
buffer.ensureTotalCapacity(default_allocator, slice.len) catch unreachable;
buffer.items.len = i;
defer buffer.deinit(
default_allocator,
);
if (comptime Slice == []u16) {
@memcpy(
std.mem.sliceAsBytes(buffer.items).ptr,
std.mem.sliceAsBytes(slice).ptr,
std.mem.sliceAsBytes(slice[0..i]).len,
);
} else {
for (slice[0..i]) |ch, j| {
buffer.items[j] = ch;
}
}
const first_high_surrogate = 0xD800;
const last_high_surrogate = 0xDBFF;
const first_low_surrogate = 0xDC00;
const last_low_surrogate = 0xDFFF;
var remainder = slice[i..];
while (remainder.len > 0) {
switch (remainder[0]) {
0...127 => {
var count: usize = 1;
while (remainder.len > count and remainder[count] <= 127) : (count += 1) {}
buffer.ensureUnusedCapacity(default_allocator, count) catch unreachable;
const prev = buffer.items.len;
buffer.items.len += count;
for (remainder[0..count]) |char, j| {
buffer.items[prev + j] = char;
}
remainder = remainder[count..];
},
first_high_surrogate...last_high_surrogate => |first| {
if (remainder.len > 1) {
if (remainder[1] >= first_low_surrogate and remainder[1] <= last_low_surrogate) {
buffer.ensureUnusedCapacity(default_allocator, 2) catch unreachable;
buffer.items.ptr[buffer.items.len] = first;
buffer.items.ptr[buffer.items.len + 1] = remainder[1];
buffer.items.len += 2;
remainder = remainder[2..];
continue;
}
}
buffer.ensureUnusedCapacity(default_allocator, 1) catch unreachable;
buffer.items.ptr[buffer.items.len] = strings.unicode_replacement;
buffer.items.len += 1;
remainder = remainder[1..];
continue;
},
// Is this an unpaired low surrogate or four-digit hex escape?
else => {
buffer.ensureUnusedCapacity(default_allocator, 1) catch unreachable;
buffer.items.ptr[buffer.items.len] = strings.unicode_replacement;
buffer.items.len += 1;
remainder = remainder[1..];
},
}
}
var out = ZigString.init("");
out.ptr = @ptrCast([*]u8, buffer.items.ptr);
out.len = buffer.items.len;
out.markUTF16();
return out.toValueGC(ctx.ptr()).asObjectRef();
}
pub fn decode(
this: *TextDecoder,
ctx: js.JSContextRef,
@@ -438,6 +557,9 @@ pub const TextDecoder = struct {
return ZigString.Empty.toValue(ctx.ptr()).asObjectRef();
}
JSC.C.JSValueProtect(ctx, args[0]);
defer JSC.C.JSValueUnprotect(ctx, args[0]);
switch (this.encoding) {
EncodingLabel.@"latin1" => {
var str = ZigString.init(array_buffer.slice());
@@ -486,96 +608,16 @@ pub const TextDecoder = struct {
}
}
}
return str.toValue(ctx.ptr()).asObjectRef();
return str.toValueGC(ctx.ptr()).asObjectRef();
},
EncodingLabel.@"UTF-16LE" => {
var slice = array_buffer.asU16();
var i: usize = 0;
while (i < slice.len) {
while (i + 16 <= slice.len) {
const vec: Vector16 = slice[i..][0..16].*;
if ((@reduce(.Or, vec > max_16_ascii))) {
break;
}
i += 16;
}
while (i < slice.len and slice[i] <= 127) {
i += 1;
}
break;
if (std.mem.isAligned(@ptrToInt(array_buffer.ptr) + @as(usize, array_buffer.offset), @alignOf([*]u16))) {
return this.decodeUTF16WithAlignment([]u16, array_buffer.asU16(), ctx);
}
// is this actually a UTF-16 string that is just ascii?
// we can still allocate as UTF-16 and just copy the bytes
if (i == slice.len) {
return JSC.C.JSValueMakeString(ctx, JSC.C.JSStringCreateWithCharacters(slice.ptr, slice.len));
}
var buffer = std.ArrayListAlignedUnmanaged(u16, 1){};
buffer.ensureTotalCapacity(default_allocator, slice.len) catch unreachable;
buffer.items.len = i;
defer buffer.deinit(
default_allocator,
);
for (slice[0..i]) |char, j| {
buffer.items[j] = char;
}
const first_high_surrogate = 0xD800;
const last_high_surrogate = 0xDBFF;
const first_low_surrogate = 0xDC00;
const last_low_surrogate = 0xDFFF;
var remainder = slice[i..];
while (remainder.len > 0) {
switch (remainder[0]) {
0...127 => {
var count: usize = 1;
while (remainder.len > count and remainder[count] <= 127) : (count += 1) {}
buffer.ensureUnusedCapacity(default_allocator, count) catch unreachable;
const prev = buffer.items.len;
buffer.items.len += count;
for (remainder[0..count]) |char, j| {
buffer.items[prev + j] = char;
}
remainder = remainder[count..];
},
first_high_surrogate...last_high_surrogate => |first| {
if (remainder.len > 1) {
if (remainder[1] >= first_low_surrogate and remainder[1] <= last_low_surrogate) {
buffer.ensureUnusedCapacity(default_allocator, 2) catch unreachable;
buffer.items.ptr[buffer.items.len] = first;
buffer.items.ptr[buffer.items.len + 1] = remainder[1];
buffer.items.len += 2;
remainder = remainder[2..];
continue;
}
}
buffer.ensureUnusedCapacity(default_allocator, 1) catch unreachable;
buffer.items.ptr[buffer.items.len] = strings.unicode_replacement;
buffer.items.len += 1;
remainder = remainder[1..];
continue;
},
// Is this an unpaired low surrogate or four-digit hex escape?
else => {
buffer.ensureUnusedCapacity(default_allocator, 1) catch unreachable;
buffer.items.ptr[buffer.items.len] = strings.unicode_replacement;
buffer.items.len += 1;
remainder = remainder[1..];
},
}
}
var out = ZigString.init("");
out.ptr = @ptrCast([*]u8, buffer.items.ptr);
out.len = buffer.items.len;
out.markUTF16();
return out.toValueGC(ctx.ptr()).asObjectRef();
return this.decodeUTF16WithAlignment([]align(1) u16, array_buffer.asU16Unaligned(), ctx);
},
else => {
JSC.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{}, ctx, exception);