[strings] Optimize TextEncoder a little more

This commit is contained in:
Jarred Sumner
2022-07-02 01:33:00 -07:00
parent 476b3c7404
commit f9f573ebb4
2 changed files with 109 additions and 143 deletions

View File

@@ -2155,7 +2155,7 @@ pub fn JSError(
if (comptime std.meta.fields(@TypeOf(args)).len == 0) {
var zig_str = JSC.ZigString.init(fmt);
if (comptime !strings.isAllASCII(fmt)) {
if (comptime !strings.isAllASCIISimple(fmt)) {
zig_str.markUTF16();
}
@@ -2167,7 +2167,6 @@ pub fn JSError(
var buf = std.fmt.allocPrint(allocator, fmt, args) catch unreachable;
var zig_str = JSC.ZigString.init(buf);
zig_str.detectEncoding();
zig_str.mark();
// it alwayas clones
exception.* = zig_str.toErrorInstance(ctx).asObjectRef();
allocator.free(buf);

View File

@@ -1017,144 +1017,110 @@ pub fn allocateLatin1IntoUTF8WithList(list_: std.ArrayList(u8), offset_into_list
var latin1 = latin1_;
var i: usize = offset_into_list;
var list = list_;
try list.ensureUnusedCapacity(latin1.len);
while (latin1.len > 0) {
try list.ensureUnusedCapacity(latin1.len);
assert(i < list.capacity);
var buf = list.items.ptr[i..list.capacity];
inner: {
if (latin1.len >= ascii_vector_size) {
const start_ptr = @ptrToInt(buf.ptr);
const start_ptr_latin1 = @ptrToInt(latin1.ptr);
const end_ptr = @ptrToInt(latin1.ptr + latin1.len - (latin1.len % ascii_vector_size));
var count = latin1.len / ascii_vector_size;
while (count > 0) : (count -= 1) {
const vec: AsciiVector = latin1[0..ascii_vector_size].*;
while (@ptrToInt(latin1.ptr) < end_ptr) {
const vec: AsciiVector = latin1[0..ascii_vector_size].*;
if (@reduce(.Max, vec) > 127) {
const Int = u64;
const size = @sizeOf(Int);
if (@reduce(.Max, vec) > 127) {
const Int = u64;
const size = @sizeOf(Int);
buf.len -= @ptrToInt(buf.ptr) - start_ptr;
latin1.len -= @ptrToInt(latin1.ptr) - start_ptr_latin1;
// zig or LLVM doesn't do @ctz nicely with SIMD
if (comptime ascii_vector_size >= 8) {
{
const bytes = @bitCast(Int, latin1[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
// zig or LLVM doesn't do @ctz nicely with SIMD
if (comptime ascii_vector_size >= 8) {
{
const bytes = @bitCast(Int, latin1[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(latin1[j] < 127);
}
}
buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
}
buf[0..size].* = @bitCast([size]u8, bytes);
latin1 = latin1[size..];
buf = buf[size..];
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
}
if (comptime ascii_vector_size >= 16) {
const bytes = @bitCast(Int, latin1[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
buf[0..size].* = @bitCast([size]u8, bytes);
latin1 = latin1[size..];
buf = buf[size..];
}
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(latin1[j] < 127);
}
}
if (comptime ascii_vector_size >= 16) {
const bytes = @bitCast(Int, latin1[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
}
buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
}
}
unreachable;
}
buf[0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[0..ascii_vector_size].*;
latin1.ptr += ascii_vector_size;
buf.ptr += ascii_vector_size;
unreachable;
}
buf.len -= @ptrToInt(buf.ptr) - start_ptr;
latin1.len -= @ptrToInt(latin1.ptr) - start_ptr_latin1;
buf[0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[0..ascii_vector_size].*;
latin1 = latin1[ascii_vector_size..];
buf = buf[ascii_vector_size..];
}
{
while (latin1.len >= 8) {
const Int = u64;
const size = @sizeOf(Int);
const latin1_end_ptr = latin1.ptr + latin1.len - (latin1.len % size);
const start_ptr = @ptrToInt(buf.ptr);
const start_ptr_latin1 = @ptrToInt(latin1.ptr);
while (latin1.ptr != latin1_end_ptr) {
const bytes = @bitCast(Int, latin1[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(latin1[j] < 127);
}
}
const bytes = @bitCast(Int, latin1[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
buf[0..size].* = @bitCast([size]u8, bytes);
buf.ptr += first_set_byte;
latin1.ptr += first_set_byte;
buf.len -= @ptrToInt(buf.ptr) - start_ptr;
latin1.len -= @ptrToInt(latin1.ptr) - start_ptr_latin1;
break :inner;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
}
buf[0..size].* = @bitCast([size]u8, bytes);
latin1.ptr += size;
buf.ptr += size;
latin1 = latin1[first_set_byte..];
buf = buf[first_set_byte..];
break :inner;
}
buf.len -= @ptrToInt(buf.ptr) - start_ptr;
latin1.len -= @ptrToInt(latin1.ptr) - start_ptr_latin1;
buf[0..size].* = @bitCast([size]u8, bytes);
latin1 = latin1[size..];
buf = buf[size..];
}
{
assert(latin1.len < 8);
const end = latin1.ptr + latin1.len;
const start_ptr = @ptrToInt(buf.ptr);
const start_ptr_latin1 = @ptrToInt(latin1.ptr);
while (latin1.ptr != end and latin1.ptr[0] <= 127) {
buf.ptr[0] = latin1.ptr[0];
buf.ptr += 1;
latin1.ptr += 1;
while (latin1.ptr != end and latin1[0] < 128) {
buf[0] = latin1[0];
buf = buf[1..];
latin1 = latin1[1..];
}
buf.len -= @ptrToInt(buf.ptr) - start_ptr;
latin1.len -= @ptrToInt(latin1.ptr) - start_ptr_latin1;
}
}
while (latin1.len > 0 and latin1[0] > 127) {
i = @ptrToInt(buf.ptr) - @ptrToInt(list.items.ptr);
list.items.len = i;
try list.ensureUnusedCapacity(2 + latin1.len);
buf = list.items.ptr[i..list.capacity];
buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]);
@@ -1286,7 +1252,8 @@ pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_
var latin1 = latin1_;
while (buf.len > 0 and latin1.len > 0) {
inner: {
while (@minimum(buf.len, latin1.len) >= ascii_vector_size) {
var remaining_runs = @minimum(buf.len, latin1.len) / ascii_vector_size;
while (remaining_runs > 0) : (remaining_runs -= 1) {
const vec: AsciiVector = latin1[0..ascii_vector_size].*;
if (@reduce(.Max, vec) > 127) {
@@ -1302,23 +1269,19 @@ pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
buf[0..size].* = @bitCast([size]u8, bytes);
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(latin1[j] < 127);
}
}
buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
}
buf[0..size].* = @bitCast([size]u8, bytes);
latin1 = latin1[size..];
buf = buf[size..];
}
@@ -1328,24 +1291,20 @@ pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(latin1[j] < 127);
}
}
buf[0..size].* = @bitCast([size]u8, bytes);
buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
assert(mask > 0);
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
}
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
}
}
break;
unreachable;
}
buf[0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[0..ascii_vector_size].*;
@@ -1358,7 +1317,10 @@ pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_
const size = @sizeOf(Int);
while (@minimum(buf.len, latin1.len) >= size) {
const bytes = @bitCast(Int, latin1[0..size].*);
buf[0..size].* = @bitCast([size]u8, bytes);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
@@ -1367,20 +1329,14 @@ pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(latin1[j] < 127);
}
}
buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
}
buf[0..size].* = @bitCast([size]u8, bytes);
latin1 = latin1[size..];
buf = buf[size..];
}
@@ -1422,6 +1378,7 @@ pub fn replaceLatin1WithUTF8(buf_: []u8) void {
var latin1 = buf_;
while (strings.firstNonASCII(latin1)) |i| {
latin1[i..][0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[i]);
latin1 = latin1[i + 2 ..];
}
}
@@ -2464,35 +2421,45 @@ pub fn isAllASCII(slice: []const u8) bool {
// The NEON SIMD unit is 128-bit wide and includes 16 128-bit registers that can be used as 32 64-bit registers
if (comptime Environment.isAarch64 or Environment.isX64) {
while (remaining.len >= 128) {
comptime var count: usize = 0;
inline while (count < 8) : (count += 1) {
const vec: AsciiVector = remaining[(comptime count * ascii_vector_size)..][0..ascii_vector_size].*;
if (@reduce(.Max, vec) > 127) {
return false;
}
}
remaining = remaining[comptime ascii_vector_size * count..];
}
while (remaining.len >= ascii_vector_size) {
const remaining_end_ptr = remaining.ptr + remaining.len - (remaining.len % ascii_vector_size);
while (remaining.ptr != remaining_end_ptr) : (remaining.ptr += ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
if (@reduce(.Max, vec) > 127) {
return false;
}
remaining = remaining[ascii_vector_size..];
}
}
for (remaining) |char| {
if (char > 127) {
const Int = u64;
const size = @sizeOf(Int);
const remaining_last8 = slice.ptr + slice.len - (slice.len % size);
while (remaining.ptr != remaining_last8) : (remaining.ptr += size) {
const bytes = @bitCast(Int, remaining[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
return false;
}
}
const final = slice.ptr + slice.len;
while (remaining.ptr != final) : (remaining.ptr += 1) {
if (remaining[0] > 127) {
return false;
}
}
return true;
}
pub fn isAllASCIISimple(comptime slice: []const u8) bool {
for (slice) |char| {
if (char > 127) {
return false;
}
}
return true;
}