[encoder] Fix non-ascii latin1 characters

This commit is contained in:
Jarred Sumner
2022-06-29 04:29:01 -07:00
parent 95c1785271
commit 7d5464adbe
2 changed files with 399 additions and 79 deletions

View File

@@ -1010,77 +1010,168 @@ pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, comptime Type: type,
}
var list = try std.ArrayList(u8).initCapacity(allocator, latin1_.len);
var latin1 = latin1_;
while (latin1.len > 0) {
const read = @as(usize, firstNonASCII(latin1) orelse @intCast(u32, latin1.len));
try list.ensureTotalCapacityPrecise(
list.items.len + read + if (read != latin1.len) @as(usize, 2) else @as(usize, 0),
);
const before = list.items.len;
list.items.len += read;
@memcpy(list.items[before..].ptr, latin1.ptr, read);
latin1 = latin1[read..];
if (latin1.len > 0) {
try list.ensureUnusedCapacity(2);
var buf = list.items.ptr[list.items.len .. list.items.len + 2][0..2];
list.items.len += 2;
buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]);
latin1 = latin1[1..];
}
}
return list.toOwnedSlice();
return (try allocateLatin1IntoUTF8WithList(list, 0, Type, latin1_)).toOwnedSlice();
}
pub fn allocateLatin1IntoUTF8ForArrayBuffer(allocator: std.mem.Allocator, globalThis: *JSC.JSGlobalObject, comptime Type: type, latin1_: Type) !JSC.JSValue {
if (comptime bun.FeatureFlags.latin1_is_now_ascii) {
var out = try allocator.alloc(u8, latin1_.len);
@memcpy(out.ptr, latin1_.ptr, latin1_.len);
return out;
}
pub fn allocateLatin1IntoUTF8WithList(list_: std.ArrayList(u8), offset_into_list: usize, comptime Type: type, latin1_: Type) !std.ArrayList(u8) {
var latin1 = latin1_;
var i: usize = offset_into_list;
var list = list_;
while (latin1.len > 0) {
try list.ensureUnusedCapacity(latin1.len);
// assert our starting capacity is at least latin1.len
var buf = list.items.ptr[i..list.capacity];
if (firstNonASCII(latin1)) |start_i| {
var list = try std.ArrayList(u8).initCapacity(allocator, latin1_.len + 2);
list.items.len = start_i;
@memcpy(list.items.ptr, latin1.ptr, start_i);
{
var buf = list.items.ptr[list.items.len .. list.items.len + 2][0..2];
list.items.len += 2;
buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]);
latin1 = latin1[1..];
}
inner: {
while (latin1.len >= ascii_vector_size) {
const vec: AsciiVector = latin1[0..ascii_vector_size].*;
while (latin1.len > 0) {
const read = @as(usize, firstNonASCII(latin1) orelse @intCast(u32, latin1.len));
try list.ensureTotalCapacityPrecise(
list.items.len + read + if (read != latin1.len) @as(usize, 2) else @as(usize, 0),
);
const before = list.items.len;
list.items.len += read;
@memcpy(list.items[before..].ptr, latin1.ptr, read);
latin1 = latin1[read..];
if (@reduce(.Max, vec) > 127) {
const Int = u64;
const size = @sizeOf(Int);
if (latin1.len > 0) {
try list.ensureUnusedCapacity(2);
var buf = list.items.ptr[list.items.len .. list.items.len + 2][0..2];
list.items.len += 2;
buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]);
// zig or LLVM doesn't do @ctz nicely with SIMD
if (comptime ascii_vector_size >= 8) {
{
const bytes = @bitCast(Int, latin1[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(latin1[j] < 127);
}
}
buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
}
buf[0..size].* = @bitCast([size]u8, bytes);
latin1 = latin1[size..];
buf = buf[size..];
}
if (comptime ascii_vector_size >= 16) {
const bytes = @bitCast(Int, latin1[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(latin1[j] < 127);
}
}
buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
}
}
unreachable;
}
}
buf[0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[0..ascii_vector_size].*;
latin1 = latin1[ascii_vector_size..];
buf = buf[ascii_vector_size..];
}
{
const Int = u64;
const size = @sizeOf(Int);
while (latin1.len >= size) {
const bytes = @bitCast(Int, latin1[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(latin1[j] < 127);
}
}
buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
}
buf[0..size].* = @bitCast([size]u8, bytes);
latin1 = latin1[size..];
buf = buf[size..];
}
}
{
const Int = u32;
const size = @sizeOf(Int);
while (latin1.len >= size) {
const bytes = @bitCast(Int, latin1[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x80808080;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(latin1[j] < 127);
}
}
buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
}
buf[0..size].* = @bitCast([size]u8, bytes);
latin1 = latin1[size..];
buf = buf[size..];
}
}
while (latin1.len >= 1 and latin1[0] < 127) {
buf[0] = latin1[0];
latin1 = latin1[1..];
buf = buf[1..];
}
}
return JSC.ArrayBuffer.fromBytes(list.toOwnedSlice(), .Uint8Array).toJS(globalThis, null);
i = @ptrToInt(buf.ptr) - @ptrToInt(list.items.ptr);
list.items.len = i;
while (latin1.len > 0 and latin1[0] >= 127) {
try list.ensureUnusedCapacity(2 + latin1.len);
buf = list.items.ptr[i..list.capacity];
buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]);
latin1 = latin1[1..];
buf = buf[2..];
i = @ptrToInt(buf.ptr) - @ptrToInt(list.items.ptr);
list.items.len = i;
}
}
{
const array_buffer = JSC.JSValue.createUninitializedUint8Array(globalThis, latin1.len);
var bytes = array_buffer.asArrayBuffer(globalThis).?.slice();
@memcpy(bytes.ptr, latin1.ptr, latin1.len);
return array_buffer;
}
return list;
}
pub const UTF16Replacement = struct {
@@ -1186,6 +1277,10 @@ pub fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
}
/// Copies `latin1_` into `buf_`, re-encoding it as UTF-8.
///
/// This is a convenience wrapper: it forwards every argument unchanged to
/// `copyLatin1IntoUTF8StopOnNonASCII` with the comptime `stop` flag disabled,
/// and returns that function's `EncodeIntoResult` as-is.
pub fn copyLatin1IntoUTF8(buf_: []u8, comptime Type: type, latin1_: Type) EncodeIntoResult {
    // Named so the call site documents what the trailing boolean means.
    const stop_on_non_ascii = false;
    return copyLatin1IntoUTF8StopOnNonASCII(buf_, Type, latin1_, stop_on_non_ascii);
}
pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_: Type, comptime stop: bool) EncodeIntoResult {
if (comptime bun.FeatureFlags.latin1_is_now_ascii) {
const to_copy = @truncate(u32, @minimum(buf_.len, latin1_.len));
@memcpy(buf_.ptr, latin1_.ptr, to_copy);
@@ -1195,28 +1290,94 @@ pub fn copyLatin1IntoUTF8(buf_: []u8, comptime Type: type, latin1_: Type) Encode
var buf = buf_;
var latin1 = latin1_;
while (buf.len > 0 and latin1.len > 0) {
var read: usize = 0;
inner: {
while (@minimum(buf.len, latin1.len) >= ascii_vector_size) {
const vec: AsciiVector = latin1[0..ascii_vector_size].*;
while (latin1.len > ascii_vector_size) {
const vec: AsciiVector = latin1[0..ascii_vector_size].*;
if (@reduce(.Max, vec) > 127) {
if (comptime stop) return .{ .written = std.math.maxInt(u32), .read = std.math.maxInt(u32) };
break;
}
if (@reduce(.Max, vec) > 127) {
break;
buf[0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[0..ascii_vector_size].*;
latin1 = latin1[ascii_vector_size..];
buf = buf[ascii_vector_size..];
}
buf[0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[0..ascii_vector_size].*;
latin1 = latin1[ascii_vector_size..];
buf = buf[ascii_vector_size..];
{
const Int = u64;
const size = @sizeOf(Int);
while (@minimum(buf.len, latin1.len) >= size) {
const bytes = @bitCast(Int, latin1[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime stop) return .{ .written = std.math.maxInt(u32), .read = std.math.maxInt(u32) };
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(latin1[j] < 127);
}
}
buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
}
buf[0..size].* = @bitCast([size]u8, bytes);
latin1 = latin1[size..];
buf = buf[size..];
}
}
{
const Int = u32;
const size = @sizeOf(Int);
while (@minimum(buf.len, latin1.len) >= size) {
const bytes = @bitCast(Int, latin1[0..size].*);
const mask = bytes & 0x80808080;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime stop) return .{ .written = std.math.maxInt(u32), .read = std.math.maxInt(u32) };
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(latin1[j] < 127);
}
}
buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
}
buf[0..size].* = @bitCast([size]u8, bytes);
latin1 = latin1[size..];
buf = buf[size..];
}
}
while (@minimum(buf.len, latin1.len) >= 1 and latin1[0] < 127) {
buf[0] = latin1[0];
latin1 = latin1[1..];
buf = buf[1..];
}
}
while (read < latin1.len and latin1[read] < 0x80) : (read += 1) {}
const to_copy = @minimum(read, buf.len);
@memcpy(buf.ptr, latin1.ptr, to_copy);
latin1 = latin1[to_copy..];
buf = buf[to_copy..];
if (latin1.len > 0 and buf.len >= 2) {
if (comptime stop) return .{ .written = std.math.maxInt(u32), .read = std.math.maxInt(u32) };
buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]);
latin1 = latin1[1..];
buf = buf[2..];
@@ -1955,7 +2116,7 @@ pub fn escapeHTMLForUTF16Input(allocator: std.mem.Allocator, utf16: []const u16)
}
}
test "copyLatin1IntoUTF8" {
test "copyLatin1IntoUTF8 - ascii" {
var input: string = "hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!";
var output = std.mem.zeroes([500]u8);
const result = copyLatin1IntoUTF8(&output, string, input);
@@ -1965,6 +2126,28 @@ test "copyLatin1IntoUTF8" {
try std.testing.expectEqualSlices(u8, input, output[0..result.written]);
}
// Checks that copyLatin1IntoUTF8 consumes the whole input and emits the
// expected UTF-8 text when the input contains byte 169 (rendered as "©" in the
// expected strings), both as the final byte and scattered between ASCII bytes.
test "copyLatin1IntoUTF8 - latin1" {
    const Case = struct {
        latin1: string,
        utf8: string,
    };

    const cases = [_]Case{
        // Single non-ASCII byte at the very end of the input.
        .{
            .latin1 = &[_]u8{ 104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100, 32, 169 },
            .utf8 = "hello world ©",
        },
        // Several non-ASCII bytes interleaved with ASCII.
        .{
            .latin1 = &[_]u8{ 72, 169, 101, 108, 108, 169, 111, 32, 87, 111, 114, 169, 108, 100, 33 },
            .utf8 = "H©ell©o Wor©ld!",
        },
    };

    for (cases) |case| {
        // Fresh zeroed buffer per case so no output leaks between cases.
        var output = std.mem.zeroes([500]u8);
        const result = copyLatin1IntoUTF8(&output, string, case.latin1);
        try std.testing.expectEqual(case.latin1.len, result.read);
        try std.testing.expectEqualSlices(u8, case.utf8, output[0..result.written]);
    }
}
pub fn latin1ToCodepointAssumeNotASCII(char: u8, comptime CodePointType: type) CodePointType {
return @intCast(
CodePointType,
@@ -1976,6 +2159,7 @@ pub fn latin1ToCodepointAssumeNotASCII(char: u8, comptime CodePointType: type) C
}
pub fn latin1ToCodepointBytesAssumeNotASCIIWIthCharType(comptime Char: type, char: u32) [2]Char {
assert(char > 127);
return [2]Char{
@as(Char, @truncate(u8, 0xc0 | char >> 6)),
@as(Char, @truncate(u8, 0x80 | (char & 0x3f))),
@@ -2299,16 +2483,99 @@ pub fn firstNonASCIIWithType(comptime Type: type, slice: Type) ?u32 {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
if (@reduce(.Max, vec) > 127) {
const cmp = vec > max_16_ascii;
const bitmask = @ptrCast(*const AsciiVectorInt, &cmp).*;
const first = @ctz(AsciiVectorInt, bitmask);
return @as(u32, first) + @intCast(u32, slice.len - remaining.len);
const Int = u64;
const size = @sizeOf(Int);
{
const bytes = @bitCast(Int, remaining[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(remaining[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(remaining[j] < 127);
}
}
return @as(u32, first_set_byte) + @intCast(u32, slice.len - remaining.len);
}
}
{
const bytes = @bitCast(Int, remaining[size..][0..size].*);
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(remaining[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(remaining[j] < 127);
}
}
return 8 + @as(u32, first_set_byte) + @intCast(u32, slice.len - remaining.len);
}
}
break;
}
remaining = remaining[ascii_vector_size..];
}
}
{
const Int = u64;
const size = @sizeOf(Int);
while (remaining.len >= size) {
const bytes = @bitCast(Int, remaining[0..size].*);
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(remaining[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(remaining[j] < 127);
}
}
return @as(u32, first_set_byte) + @intCast(u32, slice.len - remaining.len);
}
remaining = remaining[size..];
}
}
{
const Int = u32;
const size = @sizeOf(Int);
while (remaining.len >= size) {
const bytes = @bitCast(Int, remaining[0..size].*);
const mask = bytes & 0x80808080;
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(remaining[first_set_byte] >= 127);
var j: usize = 0;
while (j < first_set_byte) : (j += 1) {
assert(remaining[j] < 127);
}
}
return @as(u32, first_set_byte) + @intCast(u32, slice.len - remaining.len);
}
remaining = remaining[size..];
}
}
for (remaining) |char, i| {
if (char > 127) {
return @truncate(u32, i + (slice.len - remaining.len));