This commit is contained in:
Dylan Conway
2024-08-09 19:28:08 -07:00
committed by GitHub
parent b9ead441c1
commit 9302b42919
31 changed files with 2027 additions and 263 deletions

View File

@@ -1340,15 +1340,7 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa
var remaining = bytes[i..];
{
const sequence: [4]u8 = switch (remaining.len) {
0 => unreachable,
1 => [_]u8{ remaining[0], 0, 0, 0 },
2 => [_]u8{ remaining[0], remaining[1], 0, 0 },
3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 },
else => remaining[0..4].*,
};
const replacement = strings.convertUTF8BytesIntoUTF16(&sequence);
const replacement = strings.convertUTF8BytesIntoUTF16(remaining);
if (comptime fail_if_invalid) {
if (replacement.fail) {
if (comptime Environment.allow_assert) assert(replacement.code_point == unicode_replacement);
@@ -1375,15 +1367,7 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa
strings.copyU8IntoU16(output.items[end..][0..j], remaining[0..j]);
remaining = remaining[j..];
const sequence: [4]u8 = switch (remaining.len) {
0 => unreachable,
1 => [_]u8{ remaining[0], 0, 0, 0 },
2 => [_]u8{ remaining[0], remaining[1], 0, 0 },
3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 },
else => remaining[0..4].*,
};
const replacement = strings.convertUTF8BytesIntoUTF16(&sequence);
const replacement = strings.convertUTF8BytesIntoUTF16(remaining);
if (comptime fail_if_invalid) {
if (replacement.fail) {
if (comptime Environment.allow_assert) assert(replacement.code_point == unicode_replacement);
@@ -1436,6 +1420,101 @@ pub fn toUTF16AllocForReal(allocator: std.mem.Allocator, bytes: []const u8, comp
};
}
/// Converts UTF-8 `bytes` to UTF-16, optionally holding back a truncated trailing sequence
/// so that a caller can retry once more input is available.
///
/// Returns `null` (without allocating) when `bytes` contains no non-ASCII byte.
/// Otherwise returns a tuple of:
///   0. the UTF-16 code units produced so far, allocated with `allocator`
///      (this is `output.items`; the backing allocation may be larger than the slice),
///   1. up to three trailing input bytes that were held back, zero-padded,
///   2. the number of meaningful bytes in (1); 0 when nothing was held back.
///
/// Bytes are only held back when `flush` is false and the final sequence failed to decode
/// because the input ended before the sequence was complete (`can_buffer` is set and the
/// lead byte announces more bytes than remain).
///
/// When `fail_if_invalid` is true, any other decoding failure returns
/// `error.InvalidByteSequence`; otherwise the code point reported by the decoder for the
/// failed sequence is appended and decoding continues.
pub fn toUTF16AllocMaybeBuffered(
    allocator: std.mem.Allocator,
    bytes: []const u8,
    comptime fail_if_invalid: bool,
    comptime flush: bool,
) error{ OutOfMemory, InvalidByteSequence }!?struct { []u16, [3]u8, u2 } {
    const first_non_ascii = strings.firstNonASCII(bytes) orelse return null;

    var output: std.ArrayListUnmanaged(u16) = if (comptime bun.FeatureFlags.use_simdutf) output: {
        const out_length = bun.simdutf.length.utf16.from.utf8(bytes);

        if (out_length == 0) {
            break :output .{};
        }

        const out = try allocator.alloc(u16, out_length);

        const res = bun.simdutf.convert.utf8.to.utf16.with_errors.le(bytes, out);
        if (res.status == .success) {
            log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, out_length });
            return .{ out, .{0} ** 3, 0 };
        }

        // Fast path failed: keep the allocation and the ASCII prefix, then fall through to
        // the byte-by-byte decoder below.
        // NOTE(review): assumes simdutf already wrote the ASCII prefix into `out` before
        // it stopped at the error - confirm.
        var list = std.ArrayListUnmanaged(u16).fromOwnedSlice(out[0..first_non_ascii]);
        list.capacity = out.len;

        break :output list;
    } else .{};
    errdefer output.deinit(allocator);

    // `start` is how many leading ASCII bytes are already present in `output`.
    const start = if (output.items.len > 0) first_non_ascii else 0;
    var remaining = bytes[start..];

    // Offset of the next non-ASCII byte relative to `remaining`. When the ASCII prefix has
    // not been copied yet (`start == 0`) the first iteration must copy it; seeding this
    // with 0 unconditionally would treat an ASCII byte as the lead byte of a sequence.
    var non_ascii: ?u32 = first_non_ascii - start;

    while (non_ascii) |i| : (non_ascii = strings.firstNonASCII(remaining)) {
        {
            // Widen the ASCII run that precedes the non-ASCII byte.
            const end = output.items.len;
            try output.ensureUnusedCapacity(allocator, i + 2); // +2 for UTF16 codepoint
            output.items.len += i;
            strings.copyU8IntoU16(output.items[end..][0..i], remaining[0..i]);
            remaining = remaining[i..];
        }

        // Zero-pad so the decoder can always look at four bytes.
        const sequence: [4]u8 = switch (remaining.len) {
            0 => unreachable,
            1 => .{ remaining[0], 0, 0, 0 },
            2 => .{ remaining[0], remaining[1], 0, 0 },
            3 => .{ remaining[0], remaining[1], remaining[2], 0 },
            else => remaining[0..4].*,
        };

        const converted_length = strings.nonASCIISequenceLength(sequence[0]);

        const converted = strings.convertUTF8BytesIntoUTF16WithLength(&sequence, converted_length, remaining.len);

        if (comptime !flush) {
            if (converted.fail and converted.can_buffer and converted_length > remaining.len) {
                // The sequence is cut off by the end of input: hand the partial bytes back.
                const buffered: [3]u8 = switch (remaining.len) {
                    else => unreachable,
                    1 => .{ remaining[0], 0, 0 },
                    2 => .{ remaining[0], remaining[1], 0 },
                    3 => .{ remaining[0], remaining[1], remaining[2] },
                };
                return .{ output.items, buffered, @intCast(remaining.len) };
            }
        }

        if (comptime fail_if_invalid) {
            if (converted.fail) {
                if (comptime Environment.allow_assert) {
                    bun.assert(converted.code_point == unicode_replacement);
                }
                return error.InvalidByteSequence;
            }
        }

        // Always advance by at least one byte so the loop makes progress.
        remaining = remaining[@max(converted.len, 1)..];

        // #define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
        switch (converted.code_point) {
            0...0xffff => |c| output.appendAssumeCapacity(@intCast(c)),
            else => |c| output.appendSliceAssumeCapacity(&.{ strings.u16Lead(c), strings.u16Trail(c) }),
        }
    }

    // Whatever is left contains no non-ASCII byte; widen it directly.
    if (remaining.len > 0) {
        try output.ensureTotalCapacityPrecise(allocator, output.items.len + remaining.len);
        output.items.len += remaining.len;
        strings.copyU8IntoU16(output.items[output.items.len - remaining.len ..], remaining);
    }

    log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, output.items.len });
    return .{ output.items, .{0} ** 3, 0 };
}
pub fn toUTF16AllocNoTrim(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool, comptime _: bool) !?[]u16 {
if (strings.firstNonASCII(bytes)) |i| {
const output_: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: {
@@ -1474,15 +1553,7 @@ pub fn toUTF16AllocNoTrim(allocator: std.mem.Allocator, bytes: []const u8, compt
var remaining = bytes[i..];
{
const sequence: [4]u8 = switch (remaining.len) {
0 => unreachable,
1 => [_]u8{ remaining[0], 0, 0, 0 },
2 => [_]u8{ remaining[0], remaining[1], 0, 0 },
3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 },
else => remaining[0..4].*,
};
const replacement = strings.convertUTF8BytesIntoUTF16(&sequence);
const replacement = strings.convertUTF8BytesIntoUTF16(remaining);
if (comptime fail_if_invalid) {
if (replacement.fail) {
if (comptime Environment.allow_assert) assert(replacement.code_point == unicode_replacement);
@@ -1509,15 +1580,7 @@ pub fn toUTF16AllocNoTrim(allocator: std.mem.Allocator, bytes: []const u8, compt
strings.copyU8IntoU16(output.items[end..][0..j], remaining[0..j]);
remaining = remaining[j..];
const sequence: [4]u8 = switch (remaining.len) {
0 => unreachable,
1 => [_]u8{ remaining[0], 0, 0, 0 },
2 => [_]u8{ remaining[0], remaining[1], 0, 0 },
3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 },
else => remaining[0..4].*,
};
const replacement = strings.convertUTF8BytesIntoUTF16(&sequence);
const replacement = strings.convertUTF8BytesIntoUTF16(remaining);
if (comptime fail_if_invalid) {
if (replacement.fail) {
if (comptime Environment.allow_assert) assert(replacement.code_point == unicode_replacement);
@@ -2076,7 +2139,9 @@ pub const UTF16Replacement = struct {
/// and a genuine error.
fail: bool = false,
pub inline fn utf8Width(replacement: UTF16Replacement) usize {
can_buffer: bool = true,
pub inline fn utf8Width(replacement: UTF16Replacement) u3 {
return switch (replacement.code_point) {
0...0x7F => 1,
(0x7F + 1)...0x7FF => 2,
@@ -2086,10 +2151,8 @@ pub const UTF16Replacement = struct {
}
};
// This variation matches WebKit behavior.
fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
fn convertUTF8BytesIntoUTF16WithLength(sequence: *const [4]u8, len: u3, remaining_len: usize) UTF16Replacement {
if (comptime Environment.allow_assert) assert(sequence[0] > 127);
const len = nonASCIISequenceLength(sequence[0]);
switch (len) {
2 => {
if (comptime Environment.allow_assert) {
@@ -2097,7 +2160,7 @@ fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
bun.assert(sequence[0] <= 0xDF);
}
if (sequence[1] < 0x80 or sequence[1] > 0xBF) {
return .{ .len = 1, .fail = true };
return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
}
return .{ .len = len, .code_point = ((@as(u32, sequence[0]) << 6) + @as(u32, sequence[1])) - 0x00003080 };
},
@@ -2109,22 +2172,22 @@ fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
switch (sequence[0]) {
0xE0 => {
if (sequence[1] < 0xA0 or sequence[1] > 0xBF) {
return .{ .len = 1, .fail = true };
return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
}
},
0xED => {
if (sequence[1] < 0x80 or sequence[1] > 0x9F) {
return .{ .len = 1, .fail = true };
return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
}
},
else => {
if (sequence[1] < 0x80 or sequence[1] > 0xBF) {
return .{ .len = 1, .fail = true };
return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
}
},
}
if (sequence[2] < 0x80 or sequence[2] > 0xBF) {
return .{ .len = 2, .fail = true };
return .{ .len = 2, .fail = true, .can_buffer = remaining_len < 3 };
}
return .{
.len = len,
@@ -2135,36 +2198,36 @@ fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
switch (sequence[0]) {
0xF0 => {
if (sequence[1] < 0x90 or sequence[1] > 0xBF) {
return .{ .len = 1, .fail = true };
return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
}
},
0xF4 => {
if (sequence[1] < 0x80 or sequence[1] > 0x8F) {
return .{ .len = 1, .fail = true };
return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
}
},
// invalid code point
// this used to be an assertion
0...(0xF0 - 1), 0xF4 + 1...std.math.maxInt(@TypeOf(sequence[0])) => {
return UTF16Replacement{ .len = 1, .fail = true };
return .{ .len = 1, .fail = true, .can_buffer = false };
},
else => {
if (sequence[1] < 0x80 or sequence[1] > 0xBF) {
return .{ .len = 1, .fail = true };
return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 };
}
},
}
if (sequence[2] < 0x80 or sequence[2] > 0xBF) {
return .{ .len = 2, .fail = true };
return .{ .len = 2, .fail = true, .can_buffer = remaining_len < 3 };
}
if (sequence[3] < 0x80 or sequence[3] > 0xBF) {
return .{ .len = 3, .fail = true };
return .{ .len = 3, .fail = true, .can_buffer = remaining_len < 4 };
}
return .{
.len = 4,
.len = len,
.code_point = ((@as(u32, sequence[0]) << 18) +
(@as(u32, sequence[1]) << 12) +
(@as(u32, sequence[2]) << 6) + @as(u32, sequence[3])) - 0x03C82080,
@@ -2176,6 +2239,21 @@ fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
}
}
/// Decodes the UTF-8 sequence at the start of `bytes` into a `UTF16Replacement`.
///
/// `bytes` must be non-empty and start with a non-ASCII byte (asserted when
/// `Environment.allow_assert` is set). Only the first four bytes are examined; shorter
/// input is zero-padded to four bytes and passed to `convertUTF8BytesIntoUTF16WithLength`
/// together with the real input length.
///
/// This variation matches WebKit behavior.
fn convertUTF8BytesIntoUTF16(bytes: []const u8) UTF16Replacement {
    if (bytes.len == 0) unreachable;

    var padded = [_]u8{0} ** 4;
    const available = @min(bytes.len, padded.len);
    @memcpy(padded[0..available], bytes[0..available]);

    if (comptime Environment.allow_assert) assert(padded[0] > 127);

    return convertUTF8BytesIntoUTF16WithLength(
        &padded,
        nonASCIISequenceLength(padded[0]),
        bytes.len,
    );
}
/// Forwards to `copyLatin1IntoUTF8StopOnNonASCII` with its final argument set to `false`,
/// returning that call's `EncodeIntoResult` unchanged.
pub fn copyLatin1IntoUTF8(buf_: []u8, comptime Type: type, latin1_: Type) EncodeIntoResult {
    const stop_on_non_ascii = false;
    return copyLatin1IntoUTF8StopOnNonASCII(buf_, Type, latin1_, stop_on_non_ascii);
}
@@ -3337,6 +3415,35 @@ pub fn encodeWTF8RuneT(p: *[4]u8, comptime R: type, r: R) u3 {
}
}
/// Encodes `code_point` as a UTF-8-style byte sequence, zero-padded to four bytes.
///
/// The sequence length is chosen purely from the numeric range of `code_point`
/// (1 byte up to 0x7f, 2 up to 0x7ff, 3 up to 0xffff, 4 otherwise); no range is
/// rejected, so surrogate values are encoded like any other three-byte code point.
pub fn wtf8Sequence(code_point: u32) [4]u8 {
    if (code_point <= 0x7f) {
        return .{ @intCast(code_point), 0, 0, 0 };
    }

    // Lowest six bits, shared by every multi-byte form.
    const last: u8 = @truncate(0x80 | (code_point & 0x3f));
    if (code_point <= 0x7ff) {
        return .{ @truncate(0xc0 | (code_point >> 6)), last, 0, 0 };
    }

    // Next six bits, shared by the three- and four-byte forms.
    const middle: u8 = @truncate(0x80 | ((code_point >> 6) & 0x3f));
    if (code_point <= 0xffff) {
        return .{ @truncate(0xe0 | (code_point >> 12)), middle, last, 0 };
    }

    const lead: u8 = @truncate(0xf0 | (code_point >> 18));
    const second: u8 = @truncate(0x80 | ((code_point >> 12) & 0x3f));
    return .{ lead, second, middle, last };
}
pub inline fn wtf8ByteSequenceLength(first_byte: u8) u3 {
return switch (first_byte) {
0 => 0,
@@ -3521,16 +3628,36 @@ pub fn isAllASCII(slice: []const u8) bool {
return true;
}
/// Returns the lead surrogate code unit for a supplementary code point, following the
/// `U16_LEAD` macro quoted below.
///
/// The result is produced with `@intCast`, so `supplementary` must be small enough that
/// `(supplementary >> 10) + 0xd7c0` fits in a `u16`.
// #define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
pub inline fn u16Lead(supplementary: anytype) u16 {
    return @intCast((supplementary >> 10) + 0xd7c0);
}
/// Returns the trail surrogate code unit for a supplementary code point, following the
/// `U16_TRAIL` macro quoted below: the low ten bits of `supplementary` or-ed with 0xdc00.
// #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
pub inline fn u16Trail(supplementary: anytype) u16 {
    return @intCast((supplementary & 0x3ff) | 0xdc00);
}
/// Reports whether the code unit is a trail surrogate, i.e. lies in 0xdc00...0xdfff.
// #define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
pub inline fn u16IsTrail(supplementary: u16) bool {
    const widened: u32 = supplementary;
    const surrogate_bits = widened & 0xfffffc00;
    return surrogate_bits == 0xdc00;
}
/// Reports whether the code unit is a lead surrogate, i.e. lies in 0xd800...0xdbff.
// #define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
pub inline fn u16IsLead(supplementary: u16) bool {
    const widened: u32 = supplementary;
    const surrogate_bits = widened & 0xfffffc00;
    return surrogate_bits == 0xd800;
}
/// Combines a lead and a trail surrogate into the supplementary code point they encode:
/// `(lead << 10) + trail - u16_surrogate_offset`.
///
/// No validation is performed here; the arithmetic is applied to whatever values are
/// passed in.
// #define U16_GET_SUPPLEMENTARY(lead, trail) \
// (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
pub inline fn u16GetSupplementary(lead: u32, trail: u32) u32 {
    return ((lead << 10) + trail) - u16_surrogate_offset;
}

/// Bias removed when combining a surrogate pair; evaluates to 56613888.
// #define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
pub const u16_surrogate_offset = (0xd800 << 10) + 0xdc00 - 0x10000;
/// Returns whatever `firstNonASCIIWithType` reports for `slice` when instantiated for
/// `[]const u8`; `null` is used by callers in this file to mean "no non-ASCII byte".
pub fn firstNonASCII(slice: []const u8) ?u32 {
    const index = firstNonASCIIWithType([]const u8, slice);
    return index;
}