mirror of
https://github.com/oven-sh/bun
synced 2026-02-09 10:28:47 +00:00
Cleanup some of the encoding code
This commit is contained in:
@@ -292,48 +292,52 @@ pub const length = struct {
|
||||
|
||||
pub const trim = struct {
|
||||
pub fn utf8_len(buf: []const u8) usize {
|
||||
if (buf.len < 3) {
|
||||
switch (buf.len) {
|
||||
const len = buf.len;
|
||||
|
||||
if (len < 3) {
|
||||
switch (len) {
|
||||
2 => {
|
||||
if (buf[buf.len - 1] >= 0b11000000) {
|
||||
return buf.len - 1;
|
||||
if (buf[len - 1] >= 0b11000000) {
|
||||
return len - 1;
|
||||
} // 2-, 3- and 4-byte characters with only 1 byte left
|
||||
if (buf[buf.len - 2] >= 0b11100000) {
|
||||
return buf.len - 2;
|
||||
if (buf[len - 2] >= 0b11100000) {
|
||||
return len - 2;
|
||||
} // 3- and 4-byte characters with only 2 bytes left
|
||||
return buf.len;
|
||||
return len;
|
||||
},
|
||||
1 => {
|
||||
if (buf[buf.len - 1] >= 0b11000000) {
|
||||
return buf.len - 1;
|
||||
if (buf[len - 1] >= 0b11000000) {
|
||||
return len - 1;
|
||||
} // 2-, 3- and 4-byte characters with only 1 byte left
|
||||
return buf.len;
|
||||
return len;
|
||||
},
|
||||
0 => return buf.len,
|
||||
0 => return len,
|
||||
else => unreachable,
|
||||
}
|
||||
}
|
||||
|
||||
if (buf[buf.len - 1] >= 0b11000000) {
|
||||
return buf.len - 1;
|
||||
if (buf[len - 1] >= 0b11000000) {
|
||||
return len - 1;
|
||||
} // 2-, 3- and 4-byte characters with only 1 byte left
|
||||
if (buf[buf.len - 2] >= 0b11100000) {
|
||||
return buf.len - 2;
|
||||
if (buf[len - 2] >= 0b11100000) {
|
||||
return len - 2;
|
||||
} // 3- and 4-byte characters with only 2 bytes left
|
||||
if (buf[buf.len - 3] >= 0b11110000) {
|
||||
return buf.len - 3;
|
||||
if (buf[len - 3] >= 0b11110000) {
|
||||
return len - 3;
|
||||
} // 4-byte characters with only 3 bytes left
|
||||
return buf.len;
|
||||
return len;
|
||||
}
|
||||
|
||||
pub fn utf16_len(buf: []const u16) usize {
|
||||
if (buf.len == 0) {
|
||||
const len = buf.len;
|
||||
|
||||
if (len == 0) {
|
||||
return 0;
|
||||
}
|
||||
if ((buf[buf.len - 1] >= 0xD800) and (buf[buf.len - 1] <= 0xDBFF)) {
|
||||
return buf.len - 1;
|
||||
if ((buf[len - 1] >= 0xD800) and (buf[len - 1] <= 0xDBFF)) {
|
||||
return len - 1;
|
||||
}
|
||||
return buf.len;
|
||||
return len;
|
||||
}
|
||||
|
||||
pub fn utf16(buf: []const u16) []const u16 {
|
||||
|
||||
@@ -875,7 +875,6 @@ pub const Encoder = struct {
|
||||
return ZigString.init(to).toExternalValue(global);
|
||||
},
|
||||
.buffer, .utf8 => {
|
||||
// JSC only supports UTF-16 strings for non-ascii text
|
||||
const converted = strings.toUTF16Alloc(allocator, input, false) catch return ZigString.init("Out of memory").toErrorInstance(global);
|
||||
if (converted) |utf16| {
|
||||
return ZigString.toExternalU16(utf16.ptr, utf16.len, global);
|
||||
@@ -886,11 +885,11 @@ pub const Encoder = struct {
|
||||
return ZigString.init(input).toValueGC(global);
|
||||
},
|
||||
.ucs2, .utf16le => {
|
||||
var output = allocator.alloc(u16, len / 2) catch return ZigString.init("Out of memory").toErrorInstance(global);
|
||||
var i: usize = 0;
|
||||
while (i < len / 2) : (i += 1) {
|
||||
output[i] = (@intCast(u16, input[2 * i + 1]) << 8) + @intCast(u16, input[2 * i]);
|
||||
}
|
||||
var output = allocator.alloc(u16, @maximum(len / 2, 1)) catch return ZigString.init("Out of memory").toErrorInstance(global);
|
||||
var output_bytes = std.mem.sliceAsBytes(output);
|
||||
output_bytes[output_bytes.len - 1] = 0;
|
||||
|
||||
@memcpy(output_bytes.ptr, input_ptr, output_bytes.len);
|
||||
return ZigString.toExternalU16(output.ptr, output.len, global);
|
||||
},
|
||||
|
||||
|
||||
@@ -936,19 +936,26 @@ const strings = @This();
|
||||
/// This is intended to be used for strings that go to JavaScript
|
||||
pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 {
|
||||
var first_non_ascii: ?u32 = null;
|
||||
var output_: ?std.ArrayList(u16) = null;
|
||||
|
||||
if (bun.FeatureFlags.use_simdutf) {
|
||||
if (bytes.len == 0)
|
||||
return &[_]u16{};
|
||||
|
||||
if (bun.simdutf.validate.ascii(bytes))
|
||||
const validated = bun.simdutf.validate.with_errors.ascii(bytes);
|
||||
if (validated.status == .success)
|
||||
return null;
|
||||
|
||||
const trimmed = bun.simdutf.trim.utf8(bytes);
|
||||
const out_length = bun.simdutf.length.utf16.from.utf8.le(trimmed);
|
||||
var out = try allocator.alloc(u16, out_length);
|
||||
const offset = @truncate(u32, validated.count);
|
||||
|
||||
const result = bun.simdutf.convert.utf8.to.utf16.with_errors.le(trimmed, out);
|
||||
const trimmed = bun.simdutf.trim.utf8(bytes[offset..]);
|
||||
const out_length = bun.simdutf.length.utf16.from.utf8.le(trimmed);
|
||||
var out = try allocator.alloc(u16, out_length + offset);
|
||||
|
||||
if (offset > 0)
|
||||
strings.copyU8IntoU16(out[0..offset], bytes[0..offset]);
|
||||
|
||||
const result = bun.simdutf.convert.utf8.to.utf16.with_errors.le(trimmed, out[offset..]);
|
||||
switch (result.status) {
|
||||
.success => {
|
||||
return out;
|
||||
@@ -959,7 +966,12 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa
|
||||
return error.InvalidByteSequence;
|
||||
}
|
||||
|
||||
first_non_ascii = @truncate(u32, result.count);
|
||||
first_non_ascii = @truncate(u32, result.count) + offset;
|
||||
output_ = std.ArrayList(u16){
|
||||
.items = out[0..first_non_ascii.?],
|
||||
.capacity = out.len,
|
||||
.allocator = allocator,
|
||||
};
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -967,10 +979,11 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa
|
||||
if (first_non_ascii orelse strings.firstNonASCII(bytes)) |i| {
|
||||
const ascii = bytes[0..i];
|
||||
const chunk = bytes[i..];
|
||||
var output = try std.ArrayList(u16).initCapacity(allocator, ascii.len + 2);
|
||||
var output = output_ orelse try std.ArrayList(u16).initCapacity(allocator, ascii.len + 2);
|
||||
errdefer output.deinit();
|
||||
output.items.len = ascii.len;
|
||||
strings.copyU8IntoU16(output.items, ascii);
|
||||
if (first_non_ascii == null)
|
||||
strings.copyU8IntoU16(output.items, ascii);
|
||||
|
||||
var remaining = chunk;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user