mirror of
https://github.com/oven-sh/bun
synced 2026-02-09 18:38:55 +00:00
* support utf16-le files * [autofix.ci] apply automated fixes --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
5764 lines
227 KiB
Zig
5764 lines
227 KiB
Zig
const std = @import("std");
|
||
const expect = std.testing.expect;
|
||
const Environment = @import("./env.zig");
|
||
const string = bun.string;
|
||
const stringZ = bun.stringZ;
|
||
const CodePoint = bun.CodePoint;
|
||
const bun = @import("root").bun;
|
||
pub const joiner = @import("./string_joiner.zig");
|
||
const log = bun.Output.scoped(.STR, true);
|
||
const js_lexer = @import("./js_lexer.zig");
|
||
|
||
/// Tags for the string encodings named in this module:
/// `ascii`, `utf8`, `latin1` and `utf16`.
pub const Encoding = enum {
    ascii,
    utf8,
    latin1,
    utf16,
};
|
||
|
||
/// Reports whether the byte `char` occurs anywhere in `self`.
pub inline fn containsChar(self: string, char: u8) bool {
    const position = indexOfChar(self, char);
    return position != null;
}
|
||
|
||
/// Reports whether `str` occurs in `self`, as decided by `indexOf`.
pub inline fn contains(self: string, str: string) bool {
    const position = indexOf(self, str);
    return position != null;
}
|
||
|
||
/// Builds a zero-terminated UTF-16 literal from `str` at compile time.
///
/// Every byte of `str` is widened to one `u16` code unit; no UTF-8 decoding
/// is performed. Must be called from a comptime context, otherwise a compile
/// error is raised.
pub fn w(comptime str: []const u8) [:0]const u16 {
    if (!@inComptime()) @compileError("strings.w() must be called in a comptime context");

    const Static = struct {
        // The code units live in a `const` so the returned slice never points
        // at a `comptime var`, which newer Zig compilers reject.
        const units: [str.len + 1]u16 = blk: {
            var buf: [str.len + 1]u16 = undefined;
            for (str, 0..) |byte, i| {
                buf[i] = byte;
            }
            buf[str.len] = 0;
            break :blk buf;
        };
        pub const literal: [:0]const u16 = units[0..str.len :0];
    };
    return Static.literal;
}
|
||
|
||
/// Converts the comptime byte string `str` into a `u16` slice by widening
/// each byte to one code unit (no UTF-8 decoding is performed).
///
/// The result refers to constant data produced at compile time.
pub fn toUTF16Literal(comptime str: []const u8) []const u16 {
    const Static = struct {
        // Stored as a `const` array so the returned slice does not reference
        // a `comptime var`, which newer Zig compilers reject.
        const units: [str.len]u16 = blk: {
            var buf: [str.len]u16 = undefined;
            for (str, 0..) |byte, i| {
                buf[i] = byte;
            }
            break :blk buf;
        };
    };
    return &Static.units;
}
|
||
|
||
/// Unsigned integer type one bit narrower than `usize`; the index type
/// returned (wrapped in an optional) by `indexOfAny` and `indexOfAny16`.
pub const OptionalUsize = std.meta.Int(.unsigned, @bitSizeOf(usize) - 1);
|
||
/// Returns the index of the first byte in `slice` that equals any byte of the
/// comptime set `str`, or null when there is none.
///
/// An empty `str` is a compile error; a single-byte `str` defers to
/// `indexOfChar`. When SIMD is enabled, whole vector-sized chunks are tested
/// first and the leftover bytes are scanned one at a time.
pub fn indexOfAny(slice: string, comptime str: anytype) ?OptionalUsize {
    switch (comptime str.len) {
        0 => @compileError("str cannot be empty"),
        1 => return indexOfChar(slice, str[0]),
        else => {},
    }

    if (slice.len == 0) return null;

    // Number of leading bytes already examined by the vector loop.
    var consumed: usize = 0;

    if (comptime Environment.enableSIMD) {
        while (slice.len - consumed >= ascii_vector_size) : (consumed += ascii_vector_size) {
            const chunk: AsciiVector = slice[consumed..][0..ascii_vector_size].*;

            // One lane per byte; a lane is 1 when that byte matches any candidate.
            var hits: AsciiVectorU1 = @bitCast(chunk == @as(AsciiVector, @splat(@as(u8, str[0]))));
            inline for (str[1..]) |candidate| {
                hits |= @bitCast(chunk == @as(AsciiVector, @splat(@as(u8, candidate))));
            }

            if (@reduce(.Max, hits) > 0) {
                const bitmask = @as(AsciiVectorInt, @bitCast(hits));
                const lane = @ctz(bitmask);
                return @as(OptionalUsize, @intCast(lane + consumed));
            }
        }

        if (comptime Environment.allow_assert) std.debug.assert(slice.len - consumed < ascii_vector_size);
    }

    for (slice[consumed..], consumed..) |byte, position| {
        if (strings.indexOfChar(str, byte) != null) {
            return @as(OptionalUsize, @intCast(position));
        }
    }

    return null;
}
|
||
/// Returns the index of the first code unit in `self` that equals any element
/// of the comptime set `str`, or null when none matches.
pub fn indexOfAny16(self: []const u16, comptime str: anytype) ?OptionalUsize {
    for (self, 0..) |unit, position| {
        const matched = blk: {
            inline for (str) |candidate| {
                if (unit == candidate) break :blk true;
            }
            break :blk false;
        };
        if (matched) return @as(OptionalUsize, @intCast(position));
    }

    return null;
}
|
||
/// Reports whether the comptime-known `str` occurs anywhere in `self`.
///
/// Each candidate window of `str.len` bytes is compared as a single integer.
/// The window slides one byte at a time so that matches at every offset are
/// found, not only those at multiples of `str.len`.
/// An empty `str` is considered to be contained in every string.
pub inline fn containsComptime(self: string, comptime str: string) bool {
    if (comptime str.len == 0) return true;

    const Int = std.meta.Int(.unsigned, str.len * 8);
    const needle: Int = comptime @bitCast(str[0..str.len].*);

    var remain = self;
    while (remain.len >= str.len) : (remain = remain[1..]) {
        if (@as(Int, @bitCast(remain[0..str.len].*)) == needle) {
            return true;
        }
    }

    return false;
}
|
||
/// Alias of `contains`.
pub const includes = contains;
|
||
|
||
/// Wraps `self` with `bun.String.static` and forwards to that value's
/// `inMapCaseInsensitive` with the given comptime map.
pub fn inMapCaseInsensitive(self: string, comptime ComptimeStringMap: anytype) ?ComptimeStringMap.Value {
    const wrapped = bun.String.static(self);
    return wrapped.inMapCaseInsensitive(ComptimeStringMap);
}
|
||
|
||
/// Reports whether any element of `in` contains `target`.
///
/// Elements of type `u8` are treated as one-byte strings; every other element
/// is converted with `bun.span` before the search.
pub inline fn containsAny(in: anytype, target: string) bool {
    for (in) |item| {
        const haystack = if (@TypeOf(item) == u8) &[1]u8{item} else bun.span(item);
        if (contains(haystack, target)) return true;
    }
    return false;
}
|
||
|
||
/// Checks `target` against the npm package-name rules described at
/// https://docs.npmjs.com/cli/v8/configuring-npm/package-json
/// - The name must be less than or equal to 214 characters. This includes the scope for scoped packages.
/// - The names of scoped packages can begin with a dot or an underscore. This is not permitted without a scope.
/// - New packages must not have uppercase letters in the name.
/// - The name ends up being part of a URL, an argument on the command line, and
///   a folder name. Therefore, the name can't contain any non-URL-safe
///   characters.
///
/// As implemented here: the name must be 1 to 214 bytes long, a leading '@'
/// marks a scoped name, and a scoped name needs exactly one '/' that is
/// followed by at least one more byte.
pub inline fn isNPMPackageName(target: string) bool {
    if (target.len == 0 or target.len > 214) return false;

    const scoped = switch (target[0]) {
        '@' => true,
        // Old packages may have capital letters
        'A'...'Z', 'a'...'z', '0'...'9', '$', '-' => false,
        else => return false,
    };

    // Position of the scope separator within `target`; 0 means "not seen yet".
    var slash_index: usize = 0;
    for (target[1..], 1..) |c, position| {
        switch (c) {
            // Old packages may have capital letters
            'A'...'Z', 'a'...'z', '0'...'9', '-', '_', '.' => {},
            '/' => {
                // Only scoped names may contain a slash, and only one.
                if (!scoped or slash_index > 0) return false;
                slash_index = position;
            },
            // issue#7045, package "@~3/svelte_mount"
            // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURIComponent#description
            // It escapes all characters except: A–Z a–z 0–9 - _ . ! ~ * ' ( )
            '!', '~', '*', '\'', '(', ')' => {
                // Accepted only inside the scope part, i.e. before the slash.
                if (!scoped or slash_index > 0) return false;
            },
            else => return false,
        }
    }

    if (!scoped) return true;
    return slash_index > 0 and slash_index + 1 < target.len;
}
|
||
|
||
/// Returns the index of the first element of `in` that contains `target`,
/// or null if no element does.
pub inline fn indexAny(in: anytype, target: string) ?usize {
    for (in, 0..) |haystack, position| {
        if (indexOf(haystack, target) == null) continue;
        return position;
    }
    return null;
}
|
||
|
||
/// Returns the index of the first byte of `target` that equals any byte in
/// the comptime set `chars`, or null when there is none.
pub inline fn indexAnyComptime(target: string, comptime chars: string) ?usize {
    for (target, 0..) |byte, position| {
        const matched = blk: {
            inline for (chars) |candidate| {
                if (candidate == byte) break :blk true;
            }
            break :blk false;
        };
        if (matched) return position;
    }
    return null;
}
|
||
|
||
/// Returns the index of the first element of `in` that is equal to `target`
/// (compared with `eqlLong`, lengths checked), or null if none is.
pub inline fn indexEqualAny(in: anytype, target: string) ?usize {
    for (in, 0..) |candidate, position| {
        const same = eqlLong(candidate, target, true);
        if (same) return position;
    }
    return null;
}
|
||
|
||
/// Allocates `count` bytes from `allocator` and fills them with `char`.
/// The caller owns the returned slice. Fails only if the allocation fails.
pub fn repeatingAlloc(allocator: std.mem.Allocator, count: usize, char: u8) ![]u8 {
    const filled = try allocator.alloc(u8, count);
    @memset(filled, char);
    return filled;
}
|
||
|
||
/// Overwrites every byte of `self` with `char`.
pub fn repeatingBuf(self: []u8, char: u8) void {
    for (self) |*slot| {
        slot.* = char;
    }
}
|
||
|
||
/// Returns the index of the first occurrence of `char` in `self` as an `i32`,
/// or -1 when `char` does not occur.
pub fn indexOfCharNeg(self: string, char: u8) i32 {
    var position: usize = 0;
    while (position < self.len) : (position += 1) {
        if (self[position] == char) return @as(i32, @intCast(position));
    }
    return -1;
}
|
||
|
||
/// Returns the position of `str` in `self` (via `std.mem.indexOf`) as an
/// `i32`, or -1 when it is not found.
pub fn indexOfSigned(self: string, str: string) i32 {
    return if (std.mem.indexOf(u8, self, str)) |found|
        @as(i32, @intCast(found))
    else
        -1;
}
|
||
|
||
/// Returns the index of the last occurrence of `char` in `self`, or null.
pub inline fn lastIndexOfChar(self: string, char: u8) ?usize {
    const position = std.mem.lastIndexOfScalar(u8, self, char);
    return position;
}
|
||
|
||
/// Returns the index of the last occurrence of `str` in `self`, or null.
pub inline fn lastIndexOf(self: string, str: string) ?usize {
    const position = std.mem.lastIndexOf(u8, self, str);
    return position;
}
|
||
|
||
/// Returns the index of the first occurrence of `str` in `self`, or null.
///
/// Non-native builds defer to `std.mem.indexOf`. Native builds use `memmem`,
/// with a single-byte needle routed to `indexOfCharUsize`. On the native path
/// an empty haystack or an empty needle yields null.
pub inline fn indexOf(self: string, str: string) ?usize {
    if (comptime !bun.Environment.isNative) {
        return std.mem.indexOf(u8, self, str);
    }

    // > Both old and new libc's have the bug that if needle is empty,
    // > haystack-1 (instead of haystack) is returned. And glibc 2.0 makes it
    // > worse, returning a pointer to the last byte of haystack. This is fixed
    // > in glibc 2.1.
    if (self.len == 0 or str.len == 0 or str.len > self.len) return null;

    if (str.len == 1) return indexOfCharUsize(self, str.ptr[0]);

    const hit = bun.C.memmem(self.ptr, self.len, str.ptr, str.len) orelse return null;

    // Convert the returned pointer back into an offset within `self`.
    const offset = @intFromPtr(hit) - @intFromPtr(self.ptr);
    std.debug.assert(offset < self.len);
    return @as(usize, @intCast(offset));
}
|
||
|
||
/// Creates a `SplitIterator` over `self` that yields the fields separated by
/// `delimiter`, positioned at the start of the buffer.
pub fn split(self: string, delimiter: string) SplitIterator {
    return .{
        .buffer = self,
        .index = 0,
        .delimiter = delimiter,
    };
}
|
||
|
||
/// Iterator over the fields of `buffer` separated by `delimiter`.
/// `index` is the start of the next field, or null once iteration is done.
pub const SplitIterator = struct {
    buffer: []const u8,
    index: ?usize,
    delimiter: []const u8,

    const Self = @This();

    /// Returns a slice of the first field. This never fails.
    /// Call this only to get the first field and then use `next` to get all subsequent fields.
    pub fn first(self: *Self) []const u8 {
        std.debug.assert(self.index.? == 0);
        return self.next().?;
    }

    /// Returns a slice of the next field, or null if splitting is complete.
    pub fn next(self: *Self) ?[]const u8 {
        const start = self.index orelse return null;

        if (indexOf(self.buffer[start..], self.delimiter)) |relative| {
            // Delimiter found: the field ends where it begins and the next
            // field starts right after it.
            const field_end = start + relative;
            self.index = field_end + self.delimiter.len;
            return self.buffer[start..field_end];
        }

        // No further delimiter: everything that is left is the last field.
        self.index = null;
        return self.buffer[start..];
    }

    /// Returns a slice of the remaining bytes. Does not affect iterator state.
    pub fn rest(self: Self) []const u8 {
        const total = self.buffer.len;
        return if (self.index) |start| self.buffer[start..total] else self.buffer[total..total];
    }

    /// Resets the iterator to the initial slice.
    pub fn reset(self: *Self) void {
        self.index = 0;
    }
};
|
||
|
||
// --
|
||
// This is faster when the string is found, by about 2x for a 8 MB file.
|
||
// It is slower when the string is NOT found
|
||
// fn indexOfPosN(comptime T: type, buf: []const u8, start_index: usize, delimiter: []const u8, comptime n: comptime_int) ?usize {
|
||
// const k = delimiter.len;
|
||
// const V8x32 = @Vector(n, T);
|
||
// const V1x32 = @Vector(n, u1);
|
||
// const Vbx32 = @Vector(n, bool);
|
||
// const first = @splat(n, delimiter[0]);
|
||
// const last = @splat(n, delimiter[k - 1]);
|
||
|
||
// var end: usize = start_index + n;
|
||
// var start: usize = end - n;
|
||
// while (end < buf.len) {
|
||
// start = end - n;
|
||
// const last_end = @min(end + k - 1, buf.len);
|
||
// const last_start = last_end - n;
|
||
|
||
// // Look for the first character in the delimiter
|
||
// const first_chunk: V8x32 = buf[start..end][0..n].*;
|
||
// const last_chunk: V8x32 = buf[last_start..last_end][0..n].*;
|
||
// const mask = @bitCast(V1x32, first == first_chunk) & @bitCast(V1x32, last == last_chunk);
|
||
|
||
// if (@reduce(.Or, mask) != 0) {
|
||
// // TODO: Use __builtin_clz???
|
||
// for (@as([n]bool, @bitCast(Vbx32, mask))) |match, i| {
|
||
// if (match and eqlLong(buf[start + i .. start + i + k], delimiter, false)) {
|
||
// return start + i;
|
||
// }
|
||
// }
|
||
// }
|
||
// end = @min(end + n, buf.len);
|
||
// }
|
||
// if (start < buf.len) return std.mem.indexOfPos(T, buf, start_index, delimiter);
|
||
// return null; // Not found
|
||
// }
|
||
|
||
/// Allocates a new buffer holding `first` followed by `second`.
/// The caller owns the returned slice. Fails only if the allocation fails.
pub fn cat(allocator: std.mem.Allocator, first: string, second: string) !string {
    const total = first.len + second.len;
    const joined = try allocator.alloc(u8, total);
    bun.copy(u8, joined, first);
    bun.copy(u8, joined[first.len..], second);
    return joined;
}
|
||
|
||
// 31 character string or a slice
/// A 32-byte value that either holds a string of at most `Max` bytes inline
/// in `remainder_buf`, or holds the address and length of an external slice
/// written into the first two machine words of `remainder_buf`.
/// `meta.is_tiny_string` tells the two representations apart.
pub const StringOrTinyString = struct {
    pub const Max = 31;
    const Buffer = [Max]u8;

    remainder_buf: Buffer = undefined,
    meta: packed struct {
        remainder_len: u7 = 0,
        is_tiny_string: u1 = 0,
    } = .{},

    comptime {
        std.debug.assert(@sizeOf(@This()) == 32);
    }

    /// Returns the stored bytes: the inline prefix of `remainder_buf` for a
    /// tiny string, otherwise the slice rebuilt from the stored address and length.
    pub inline fn slice(this: *const StringOrTinyString) []const u8 {
        // This is a switch expression instead of a statement to make sure it uses the faster assembly
        return switch (this.meta.is_tiny_string) {
            1 => this.remainder_buf[0..this.meta.remainder_len],
            0 => blk: {
                const address = std.mem.readInt(usize, this.remainder_buf[0..@sizeOf(usize)], .little);
                const length = std.mem.readInt(usize, this.remainder_buf[@sizeOf(usize) .. @sizeOf(usize) * 2], .little);
                break :blk @as([*]const u8, @ptrFromInt(address))[0..length];
            },
        };
    }

    /// Currently releases nothing in either representation; the allocator is unused.
    pub fn deinit(this: *StringOrTinyString, _: std.mem.Allocator) void {
        if (this.meta.is_tiny_string == 1) return;

        // Freeing the external slice is intentionally disabled:
        // var slice_ = this.slice();
        // allocator.free(slice_);
    }

    /// Stores `stringy` inline when it fits; otherwise stores a reference to
    /// the result of `appendy.append(string, stringy)`.
    pub fn initAppendIfNeeded(stringy: string, comptime Appender: type, appendy: Appender) !StringOrTinyString {
        if (stringy.len > StringOrTinyString.Max) {
            const appended = try appendy.append(string, stringy);
            return StringOrTinyString.init(appended);
        }

        return StringOrTinyString.init(stringy);
    }

    /// Like `initAppendIfNeeded`, but lower-cases: inline via `initLowerCase`
    /// when `stringy` fits, otherwise via `appendy.appendLowerCase`.
    pub fn initLowerCaseAppendIfNeeded(stringy: string, comptime Appender: type, appendy: Appender) !StringOrTinyString {
        if (stringy.len > StringOrTinyString.Max) {
            const appended = try appendy.appendLowerCase(string, stringy);
            return StringOrTinyString.init(appended);
        }

        return StringOrTinyString.initLowerCase(stringy);
    }

    /// Builds the non-tiny representation: the address and length of
    /// `stringy` are written into the buffer; the bytes are not copied.
    fn initReference(stringy: string) StringOrTinyString {
        var referenced: StringOrTinyString = .{ .meta = .{
            .is_tiny_string = 0,
            .remainder_len = 0,
        } };
        std.mem.writeInt(usize, referenced.remainder_buf[0..@sizeOf(usize)], @intFromPtr(stringy.ptr), .little);
        std.mem.writeInt(usize, referenced.remainder_buf[@sizeOf(usize) .. @sizeOf(usize) * 2], stringy.len, .little);
        return referenced;
    }

    /// Wraps `stringy`: copied inline when it is at most `Max` bytes long,
    /// referenced by address and length otherwise.
    pub fn init(stringy: string) StringOrTinyString {
        switch (stringy.len) {
            0 => return .{ .meta = .{
                .is_tiny_string = 1,
                .remainder_len = 0,
            } },
            1...(@sizeOf(Buffer)) => {
                @setRuntimeSafety(false);
                var tiny: StringOrTinyString = .{ .meta = .{
                    .is_tiny_string = 1,
                    .remainder_len = @as(u7, @truncate(stringy.len)),
                } };
                const len = tiny.meta.remainder_len;
                @memcpy(tiny.remainder_buf[0..len], stringy[0..len]);
                return tiny;
            },
            else => return initReference(stringy),
        }
    }

    /// Like `init`, but the inline copy is lower-cased with `copyLowercase`.
    /// Strings longer than `Max` are referenced as-is, without lower-casing.
    pub fn initLowerCase(stringy: string) StringOrTinyString {
        switch (stringy.len) {
            0 => return .{ .meta = .{
                .is_tiny_string = 1,
                .remainder_len = 0,
            } },
            1...(@sizeOf(Buffer)) => {
                @setRuntimeSafety(false);
                var tiny: StringOrTinyString = .{ .meta = .{
                    .is_tiny_string = 1,
                    .remainder_len = @as(u7, @truncate(stringy.len)),
                } };
                _ = copyLowercase(stringy, &tiny.remainder_buf);
                return tiny;
            },
            else => return initReference(stringy),
        }
    }
};
|
||
|
||
/// Copies `in` into `out`, replacing every byte in 'A'...'Z' with its
/// lowercase form, and returns `out[0..in.len]`.
///
/// Runs of bytes without uppercase letters are copied in bulk with `bun.copy`.
pub fn copyLowercase(in: string, out: []u8) string {
    var src = in;
    var dst = out;

    scan: while (true) {
        // Locate the next uppercase byte; when there is none, copy the tail and stop.
        const upper_at = for (src, 0..) |byte, i| {
            if (byte >= 'A' and byte <= 'Z') break i;
        } else {
            bun.copy(u8, dst, src);
            break :scan;
        };

        bun.copy(u8, dst, src[0..upper_at]);
        dst[upper_at] = std.ascii.toLower(src[upper_at]);

        const consumed = upper_at + 1;
        src = src[consumed..];
        dst = dst[consumed..];
    }

    return out[0..in.len];
}
|
||
|
||
/// Lower-cases `in` into `out` only if `in` contains a byte in 'A'...'Z'.
///
/// Returns `in` itself (leaving `out` untouched) when no uppercase byte is
/// present; otherwise returns `out[0..in.len]` holding the lower-cased copy.
pub fn copyLowercaseIfNeeded(in: string, out: []u8) string {
    var src = in;
    var dst = out;
    var changed = false;

    scan: while (true) {
        // Locate the next uppercase byte; when there is none, finish up.
        const upper_at = for (src, 0..) |byte, i| {
            if (byte >= 'A' and byte <= 'Z') break i;
        } else {
            // The tail only needs copying if something was written before it.
            if (changed) bun.copy(u8, dst, src);
            break :scan;
        };

        bun.copy(u8, dst, src[0..upper_at]);
        dst[upper_at] = std.ascii.toLower(src[upper_at]);

        const consumed = upper_at + 1;
        src = src[consumed..];
        dst = dst[consumed..];
        changed = true;
    }

    return if (changed) out[0..in.len] else in;
}
|
||
|
||
// Checks that `indexOf` agrees with `std.mem.indexOf` on needles that are present.
test "indexOf" {
    const Case = struct {
        haystack: []const u8,
        needle: []const u8,
    };
    const cases = [_]Case{
        .{ .haystack = "0123456789", .needle = "456" },
        .{ .haystack = "/foo/bar/baz/bacon/eggs/lettuce/tomatoe", .needle = "bacon" },
        .{ .haystack = "/foo/bar/baz/bacon////eggs/lettuce/tomatoe", .needle = "eggs" },
        .{ .haystack = "////////////////zfoo/bar/baz/bacon/eggs/lettuce/tomatoe", .needle = "/" },
        .{
            .haystack = "/okay/well/thats/even/longer/now/well/thats/even/longer/now/well/thats/even/longer/now/foo/bar/baz/bacon/eggs/lettuce/tomatoe",
            .needle = "/tomatoe",
        },
        .{
            .haystack = "/okay///////////so much length i can't believe it!much length i can't believe it!much length i can't believe it!much length i can't believe it!much length i can't believe it!much length i can't believe it!much length i can't believe it!much length i can't believe it!/well/thats/even/longer/now/well/thats/even/longer/now/well/thats/even/longer/now/foo/bar/baz/bacon/eggs/lettuce/tomatoe",
            .needle = "/tomatoe",
        },
    };

    for (cases) |case| {
        const expected = std.mem.indexOf(u8, case.haystack, case.needle).?;
        const actual = indexOf(case.haystack, case.needle).?;
        try std.testing.expectEqual(expected, actual);
    }
}
|
||
|
||
// Compares `eqlComptime` with `std.mem.eql` for equal and unequal buffers of many lengths.
test "eqlComptimeCheckLen" {
    try std.testing.expectEqual(eqlComptime("bun-darwin-aarch64.zip", "bun-darwin-aarch64.zip"), true);

    const sizes = [_]u8{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 23, 22, 24 };
    inline for (sizes) |size| {
        // Comptime reference string: `size` copies of 'a'.
        const all_a = comptime blk: {
            var filled: [size]u8 = undefined;
            @memset(&filled, 'a');
            break :blk filled;
        };

        var same: [size]u8 = undefined;
        @memset(&same, 'a');
        var same_copy: [size]u8 = undefined;
        @memset(&same_copy, 'a');
        var different: [size]u8 = undefined;
        @memset(&different, 'b');

        try std.testing.expectEqual(std.mem.eql(u8, &same, &same_copy), eqlComptime(&same, all_a));
        try std.testing.expectEqual(std.mem.eql(u8, &same, &different), eqlComptime(&different, all_a));
    }
}
|
||
|
||
// Compares `eqlComptimeUTF16` with `std.mem.eql` for equal and unequal u16 buffers of many lengths.
test "eqlComptimeUTF16" {
    try std.testing.expectEqual(eqlComptimeUTF16(toUTF16Literal("bun-darwin-aarch64.zip"), "bun-darwin-aarch64.zip"), true);

    const sizes = [_]u16{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 23, 22, 24 };
    inline for (sizes) |size| {
        // Comptime reference string: `size` bytes of 'a'.
        const all_a = comptime blk: {
            var filled: [size]u8 = undefined;
            @memset(&filled, @as(u8, 'a'));
            break :blk filled;
        };

        var same: [size]u16 = undefined;
        @memset(&same, @as(u8, 'a'));
        var same_copy: [size]u16 = undefined;
        @memset(&same_copy, @as(u8, 'a'));
        var different: [size]u16 = undefined;
        @memset(&different, @as(u16, 'b'));

        try std.testing.expectEqual(std.mem.eql(u16, &same, &same_copy), eqlComptimeUTF16(&same, &all_a));
        try std.testing.expectEqual(std.mem.eql(u16, &same, &different), eqlComptimeUTF16(&different, &all_a));
    }
}
|
||
|
||
// Checks `copyLowercase` on inputs with uppercase letters at different positions.
test "copyLowercase" {
    const cases = .{
        .{ "Hello, World!", "hello, world!" },
        .{ "_ListCache", "_listcache" },
    };

    inline for (cases) |case| {
        const input = case[0];
        var buffer = std.mem.zeroes([input.len]u8);
        const lowered = copyLowercase(input, &buffer);
        try std.testing.expectEqualStrings(lowered, case[1]);
    }
}
|
||
|
||
// Round-trips a short (inline) and a long (referenced) string and checks the struct size.
test "StringOrTinyString" {
    const correct: string = "helloooooooo";
    const big = "wawaweewaverylargeihaveachairwawaweewaverylargeihaveachairwawaweewaverylargeihaveachairwawaweewaverylargeihaveachair";

    const small_value = StringOrTinyString.init(correct);
    try std.testing.expectEqualStrings(correct, small_value.slice());

    const big_value = StringOrTinyString.init(big);
    try std.testing.expectEqualStrings(big, big_value.slice());

    try std.testing.expect(@sizeOf(StringOrTinyString) == 32);
}
|
||
|
||
// Checks that `initLowerCase` lower-cases a short string stored inline.
test "StringOrTinyString Lowercase" {
    const correct: string = "HELLO!!!!!";
    const lowered = StringOrTinyString.initLowerCase(correct);
    try std.testing.expectEqualStrings("hello!!!!!", lowered.slice());
}
|
||
|
||
/// Copies as many bytes of `src` as fit into `buf` and returns the written
/// prefix of `buf`. When `src` is longer than `buf` the copy is truncated.
pub fn copy(buf: []u8, src: []const u8) []const u8 {
    const count = @min(buf.len, src.len);
    const written = buf[0..count];
    if (count != 0) {
        @memcpy(written, src[0..count]);
    }
    return written;
}
|
||
|
||
/// Like `startsWith`, except that an empty `str` never counts as a prefix.
pub fn hasPrefix(self: string, str: string) bool {
    if (str.len == 0) return false;
    return startsWith(self, str);
}
|
||
|
||
/// Reports whether `self` begins with `str`. A `str` longer than `self`
/// yields false; otherwise the leading bytes are compared with `eqlLong`.
pub fn startsWith(self: string, str: string) bool {
    if (self.len < str.len) return false;

    const head = self[0..str.len];
    return eqlLong(head, str, false);
}
|
||
|
||
/// Reports whether `self` ends with `str`. An empty `str` always matches.
pub inline fn endsWith(self: string, str: string) bool {
    if (str.len == 0) return true;
    return @call(.always_inline, std.mem.endsWith, .{ u8, self, str });
}
|
||
|
||
/// Reports whether `self` ends with the comptime-known `str`.
pub inline fn endsWithComptime(self: string, comptime str: anytype) bool {
    if (self.len < str.len) return false;

    const tail = self[self.len - str.len .. self.len];
    return eqlComptimeIgnoreLen(tail, comptime str);
}
|
||
|
||
/// Reports whether `self` is non-empty and its first byte is `char`.
pub inline fn startsWithChar(self: string, char: u8) bool {
    if (self.len == 0) return false;
    return self[0] == char;
}
|
||
|
||
/// Reports whether `self` is non-empty and its last byte is `char`.
pub inline fn endsWithChar(self: string, char: u8) bool {
    if (self.len == 0) return false;
    return self[self.len - 1] == char;
}
|
||
|
||
/// Reports whether `self` is empty or its last byte is `char`.
pub inline fn endsWithCharOrIsZeroLength(self: string, char: u8) bool {
    if (self.len == 0) return true;
    return self[self.len - 1] == char;
}
|
||
|
||
/// Returns `this` with trailing '/' and '\\' bytes removed.
/// At least one byte is always kept, so a lone separator is returned unchanged.
pub fn withoutTrailingSlash(this: string) []const u8 {
    var end = this.len;
    while (end > 1) : (end -= 1) {
        const last = this[end - 1];
        if (last != '/' and last != '\\') break;
    }

    return this[0..end];
}
|
||
|
||
/// Does not strip the C:\
///
/// Inputs shorter than three bytes, or whose second byte is not ':', are
/// handled by `withoutTrailingSlash`. Otherwise trailing '/' and '\\' bytes
/// are removed while always keeping the first three bytes.
pub fn withoutTrailingSlashWindowsPath(this: string) []const u8 {
    const has_drive_prefix = this.len >= 3 and this[1] == ':';
    if (!has_drive_prefix) return withoutTrailingSlash(this);

    var end = this.len;
    while (end > 3) : (end -= 1) {
        const last = this[end - 1];
        if (last != '/' and last != '\\') break;
    }

    return this[0..end];
}
|
||
|
||
/// Returns a prefix of `in`: the first `dir.len + 1` bytes (capped at
/// `in.len`) with trailing slashes trimmed, extended by one byte and again
/// capped at `in.len`.
/// When assertions are enabled, `dir` must lie inside `in`
/// (checked with `bun.isSliceInBuffer`).
pub fn withTrailingSlash(dir: string, in: string) []const u8 {
    if (comptime Environment.allow_assert) std.debug.assert(bun.isSliceInBuffer(dir, in));

    const window = in[0..@min(dir.len + 1, in.len)];
    const trimmed_len = strings.withoutTrailingSlash(window).len;
    return in[0..@min(trimmed_len + 1, in.len)];
}
|
||
|
||
/// Returns `this` with all leading '/' bytes removed.
pub fn withoutLeadingSlash(this: string) []const u8 {
    var start: usize = 0;
    while (start < this.len and this[start] == '/') {
        start += 1;
    }
    return this[start..];
}
|
||
|
||
/// Reports whether the last byte of `self` equals any byte of `str`.
/// An empty `self` has no last byte, so the result is false.
pub fn endsWithAny(self: string, str: string) bool {
    // Guard the `self.len - 1` index below against empty input.
    if (self.len == 0) return false;

    const last = self[self.len - 1];
    for (str) |char| {
        if (char == last) return true;
    }

    return false;
}
|
||
|
||
/// Returns a newly allocated copy of `self` in which every '"' byte is
/// preceded by a backslash. No other byte is altered.
/// The caller owns the returned slice. Fails only if the allocation fails.
pub fn quotedAlloc(allocator: std.mem.Allocator, self: string) !string {
    var quote_count: usize = 0;
    for (self) |char| {
        if (char == '"') quote_count += 1;
    }

    // Nothing to escape: a plain duplicate is enough.
    if (quote_count == 0) {
        return allocator.dupe(u8, self);
    }

    const escaped = try allocator.alloc(u8, self.len + quote_count);
    var cursor: usize = 0;
    for (self) |char| {
        if (char == '"') {
            escaped[cursor] = '\\';
            cursor += 1;
        }
        escaped[cursor] = char;
        cursor += 1;
    }

    return escaped;
}
|
||
|
||
/// Reports whether `self` equals any entry of the comptime `list`.
pub fn eqlAnyComptime(self: string, comptime list: []const string) bool {
    inline for (list) |candidate| {
        const matched = eqlComptimeCheckLenWithType(u8, self, candidate, true);
        if (matched) return true;
    }

    return false;
}
|
||
|
||
/// Count the occurrences of a character in an ASCII byte array
/// uses SIMD
///
/// Full chunks of `ascii_vector_size` bytes are compared against a splat of
/// `char`; the comparison result is turned into a bitmask with one bit per
/// lane and the set bits are counted. Remaining bytes are counted one by one.
pub fn countChar(self: string, char: u8) usize {
    var total: usize = 0;
    var remaining = self;

    const splatted: AsciiVector = @splat(char);
    // One bit per vector lane.
    const Mask = std.meta.Int(.unsigned, ascii_vector_size);

    // The guard uses the vector width itself so the chunk load below can
    // never read past the end of `remaining`.
    while (remaining.len >= ascii_vector_size) {
        const vec: AsciiVector = remaining[0..ascii_vector_size].*;
        // Counting bits of the mask cannot wrap, unlike summing `u1` lanes.
        const mask: Mask = @bitCast(vec == splatted);
        total += @popCount(mask);
        remaining = remaining[ascii_vector_size..];
    }

    for (remaining) |c| {
        total += @intFromBool(c == char);
    }

    return total;
}
|
||
|
||
// Checks `countChar` on short inputs and on inputs longer than one vector.
test "countChar" {
    const short = "hello there";
    try std.testing.expectEqual(countChar(short, ' '), 1);
    try std.testing.expectEqual(countChar("hello;;;there", ';'), 3);
    try std.testing.expectEqual(countChar(short, 'z'), 0);

    const long_with_trailing_space = "hello there hello there hello there hello there hello there hello there hello there hello there hello there hello there hello there hello there hello there hello there ";
    try std.testing.expectEqual(countChar(long_with_trailing_space, ' '), 28);
    try std.testing.expectEqual(countChar(long_with_trailing_space, 'z'), 0);

    const long_without_trailing_space = "hello there hello there hello there hello there hello there hello there hello there hello there hello there hello there hello there hello there hello there hello there";
    try std.testing.expectEqual(countChar(long_without_trailing_space, ' '), 27);
}
|
||
|
||
/// Reports whether the last byte of `self` equals any byte of the
/// comptime-known `str`. An empty `self` yields false.
///
/// Sets shorter than 10 bytes are unrolled at compile time; longer sets are
/// delegated to `endsWithAny`.
pub fn endsWithAnyComptime(self: string, comptime str: string) bool {
    // Guard the `self.len - 1` index below against empty input.
    if (self.len == 0) return false;

    if (comptime str.len < 10) {
        const last = self[self.len - 1];
        inline for (str) |char| {
            if (char == last) return true;
        }

        return false;
    } else {
        return endsWithAny(self, str);
    }
}
|
||
|
||
/// Reports whether `self` and `other` have the same length and contents.
/// When `other` is a `*string` it is dereferenced and compared again.
pub fn eql(self: string, other: anytype) bool {
    if (self.len != other.len) return false;
    if (comptime @TypeOf(other) == *string) {
        return eql(self, other.*);
    }

    var position: usize = 0;
    while (position < self.len) : (position += 1) {
        if (self[position] != other[position]) return false;
    }
    return true;
}
|
||
|
||
/// Compares `self` and `other` with `std.ascii.eqlIgnoreCase`.
pub inline fn eqlInsensitive(self: string, other: anytype) bool {
    const same = std.ascii.eqlIgnoreCase(self, other);
    return same;
}
|
||
|
||
/// Reports whether `self` equals the comptime-known `alt` (lengths included).
pub fn eqlComptime(self: string, comptime alt: anytype) bool {
    const check_len = true;
    return eqlComptimeCheckLenWithType(u8, self, alt, check_len);
}
|
||
|
||
/// Reports whether the u16 string `self` equals the comptime byte string
/// `alt` after `alt` is converted with `toUTF16Literal` (lengths included).
pub fn eqlComptimeUTF16(self: []const u16, comptime alt: []const u8) bool {
    const check_len = true;
    return eqlComptimeCheckLenWithType(u16, self, comptime toUTF16Literal(alt), check_len);
}
|
||
|
||
/// Compares `self` with the comptime-known `alt` without checking lengths;
/// only the first `alt.len` bytes of `self` are examined.
pub fn eqlComptimeIgnoreLen(self: string, comptime alt: anytype) bool {
    const check_len = false;
    return eqlComptimeCheckLenWithType(u8, self, alt, check_len);
}
|
||
|
||
/// Reports whether `self` begins with the comptime-known `alt`.
pub fn hasPrefixComptime(self: string, comptime alt: anytype) bool {
    if (self.len < alt.len) return false;

    const head = self[0..alt.len];
    return eqlComptimeCheckLenWithType(u8, head, alt, false);
}
|
||
|
||
/// Reports whether `self` begins with the prefix "/$bunfs/".
pub fn isBunStandaloneFilePath(self: string) bool {
    const prefix = "/$bunfs/";
    return hasPrefixComptime(self, prefix);
}
|
||
|
||
/// Reports whether the u16 string `self` begins with the comptime byte
/// string `alt` after `alt` is converted with `toUTF16Literal`.
pub fn hasPrefixComptimeUTF16(self: []const u16, comptime alt: []const u8) bool {
    if (self.len < alt.len) return false;

    const head = self[0..alt.len];
    return eqlComptimeCheckLenWithType(u16, head, comptime toUTF16Literal(alt), false);
}
|
||
|
||
/// Reports whether `self` ends with the comptime-known `alt`.
pub fn hasSuffixComptime(self: string, comptime alt: anytype) bool {
    if (self.len < alt.len) return false;

    const tail = self[self.len - alt.len ..];
    return eqlComptimeCheckLenWithType(u8, tail, alt, false);
}
|
||
|
||
/// Compares the runtime slice `a` with the comptime-known slice `b`.
///
/// With `check_len` set, differing lengths yield false (and an empty `b`
/// matches only an empty `a`). Without it, the caller must guarantee that
/// `a` holds at least `b.len` elements; only those are compared.
///
/// The comparison is unrolled at compile time into integer compares of
/// decreasing width (`usize`, `u32`, `u16`, `u8`). All offsets and counts are
/// tracked in elements of `Type`, so every element is compared for any
/// element size, not just for `u8`.
inline fn eqlComptimeCheckLenWithKnownType(comptime Type: type, a: []const Type, comptime b: []const Type, comptime check_len: bool) bool {
    @setEvalBranchQuota(9999);
    if (comptime check_len) {
        if (comptime b.len == 0) {
            return a.len == 0;
        }

        if (a.len != b.len) return false;
    }

    // Index (in elements) of the next element that still has to be compared.
    comptime var i: usize = 0;

    inline for (.{ usize, u32, u16, u8 }) |Chunk| {
        // How many elements of `Type` fit into one `Chunk`; 0 means the chunk
        // is narrower than a single element and is skipped.
        const per_chunk = comptime (@sizeOf(Chunk) / @sizeOf(Type));
        if (comptime per_chunk > 0) {
            inline while (comptime i + per_chunk <= b.len) {
                const expected = comptime @as(Chunk, @bitCast(b[i..][0..per_chunk].*));
                if (@as(Chunk, @bitCast(a[i..][0..per_chunk].*)) != expected)
                    return false;
                comptime i += per_chunk;
            }
        }
    }

    return true;
}
|
||
|
||
/// Check if two strings are equal with one of the strings being a comptime-known value
///
///   strings.eqlComptime(input, "hello world");
///   strings.eqlComptime(input, "hai");
///
/// `b` may be given as a pointer or as a value; a value is passed on by
/// address. `check_len` is forwarded unchanged.
pub inline fn eqlComptimeCheckLenWithType(comptime Type: type, a: []const Type, comptime b: anytype, comptime check_len: bool) bool {
    const b_is_pointer = comptime (@typeInfo(@TypeOf(b)) == .Pointer);
    return eqlComptimeCheckLenWithKnownType(comptime Type, a, if (b_is_pointer) b else &b, comptime check_len);
}
|
||
|
||
/// Calls `eqlCaseInsensitiveASCII` with length checking disabled.
pub inline fn eqlCaseInsensitiveASCIIIgnoreLength(
    a: string,
    b: string,
) bool {
    const check_len = false;
    return eqlCaseInsensitiveASCII(a, b, check_len);
}
|
||
|
||
/// Calls `eqlCaseInsensitiveASCII` with length checking enabled.
pub inline fn eqlCaseInsensitiveASCIIICheckLength(
    a: string,
    b: string,
) bool {
    const check_len = true;
    return eqlCaseInsensitiveASCII(a, b, check_len);
}
|
||
|
||
/// Compares `a` and `b` with `strncasecmp` over `a.len` bytes.
///
/// With `check_len` set, differing lengths yield false and two empty strings
/// yield true. Beyond that point both strings are asserted to be non-empty.
pub fn eqlCaseInsensitiveASCII(a: string, b: string, comptime check_len: bool) bool {
    if (comptime check_len) {
        if (a.len != b.len) return false;
        if (a.len == 0) return true;
    }

    std.debug.assert(b.len > 0);
    std.debug.assert(a.len > 0);

    const ordering = bun.C.strncasecmp(a.ptr, b.ptr, a.len);
    return ordering == 0;
}
|
||
|
||
/// Compares `a_str` and `b_str` for equality, one machine word at a time.
///
/// With `check_len` set, differing lengths yield false (and an empty `b_str`
/// matches only an empty `a_str`). Without it, both slices must have the same
/// length; this is asserted when assertions are enabled.
///
/// The number of whole words is derived from `@sizeOf(usize)`, so the word
/// loop and the 4/2/1-byte tail together cover every byte on both 8-byte and
/// 4-byte `usize` targets.
pub fn eqlLong(a_str: string, b_str: string, comptime check_len: bool) bool {
    const len = b_str.len;

    if (comptime check_len) {
        if (len == 0) {
            return a_str.len == 0;
        }

        if (a_str.len != len) {
            return false;
        }
    } else {
        if (comptime Environment.allow_assert) std.debug.assert(b_str.len == a_str.len);
    }

    const a = a_str.ptr;
    const b = b_str.ptr;

    // Same starting address and same length: necessarily equal.
    if (a == b)
        return true;

    const word = @sizeOf(usize);
    // Number of bytes already compared.
    var offset: usize = 0;

    while (len - offset >= word) : (offset += word) {
        if (@as(usize, @bitCast(a[offset..][0..word].*)) != @as(usize, @bitCast(b[offset..][0..word].*)))
            return false;
    }

    // Fewer than `word` bytes remain; their count is given by the low bits of `len`.
    if (comptime word == 8) {
        if ((len & 4) != 0) {
            if (@as(u32, @bitCast(a[offset..][0..@sizeOf(u32)].*)) != @as(u32, @bitCast(b[offset..][0..@sizeOf(u32)].*)))
                return false;
            offset += @sizeOf(u32);
        }
    }

    if ((len & 2) != 0) {
        if (@as(u16, @bitCast(a[offset..][0..@sizeOf(u16)].*)) != @as(u16, @bitCast(b[offset..][0..@sizeOf(u16)].*)))
            return false;
        offset += @sizeOf(u16);
    }

    if (((len & 1) != 0) and a[offset] != b[offset]) return false;

    return true;
}
|
||
|
||
/// Allocates a buffer holding `self` followed by `other`.
/// The caller owns the returned slice. Fails only if the allocation fails.
pub inline fn append(allocator: std.mem.Allocator, self: string, other: string) ![]u8 {
    const joined = try allocator.alloc(u8, self.len + other.len);

    const head = joined[0..self.len];
    const tail = joined[self.len..][0..other.len];
    if (self.len > 0) @memcpy(head, self);
    if (other.len > 0) @memcpy(tail, other);

    return joined;
}
|
||
|
||
/// Returns a newly allocated buffer holding `self`, `other` and `third`
/// concatenated in that order. The caller owns the returned slice and frees it
/// with `allocator`.
///
/// Fails with the allocator's error when the buffer cannot be allocated
/// (including when the combined length overflows `usize`).
pub inline fn append3(allocator: std.mem.Allocator, self: string, other: string, third: string) ![]u8 {
    return std.mem.concat(allocator, u8, &[_][]const u8{ self, other, third });
}
|
||
|
||
/// Copies the first `parts_len` entries of `parts` back to back into `out` and
/// returns the prefix of `out` that was filled.
///
/// `parts` must be indexable at comptime (the loop is unrolled) and every entry
/// must expose `.len`; `out` must be large enough to hold all of them.
pub inline fn joinBuf(out: []u8, parts: anytype, comptime parts_len: usize) []u8 {
    var written: usize = 0;

    inline for (0..parts_len) |part_index| {
        const piece = parts[part_index];
        bun.copy(u8, out[written..], piece);
        written += piece.len;
    }

    return out[0..written];
}
|
||
|
||
/// Returns the position reported by `strings.indexOf(self, str)` as an `i32`,
/// or -1 when `indexOf` finds nothing.
pub fn index(self: string, str: string) i32 {
    const found = strings.indexOf(self, str) orelse return -1;
    return @as(i32, @intCast(found));
}
|
||
|
||
/// Returns true when the UTF-16 string `other` consists of exactly the bytes of
/// `self`, each byte widened to one 16-bit code unit.
///
/// The comparison is done element by element. Comparing the raw memory of both
/// strings would read `self.len * 2` bytes from a `self.len`-byte string and
/// would never match, because the two sides use different element widths.
pub fn eqlUtf16(comptime self: string, other: []const u16) bool {
    if (self.len != other.len) return false;

    for (self, other) |byte, unit| {
        if (@as(u16, byte) != unit) return false;
    }

    return true;
}
|
||
|
||
/// Converts the UTF-16 slice `js` to a newly allocated UTF-8 buffer by
/// forwarding to `toUTF8AllocWithType` with `[]const u16` as the input type.
pub fn toUTF8Alloc(allocator: std.mem.Allocator, js: []const u16) ![]u8 {
    const utf8 = try toUTF8AllocWithType(allocator, []const u16, js);
    return utf8;
}
|
||
|
||
/// Widens `@sizeOf(usize) / 2` bytes from `input` into the same number of
/// 16-bit units in `output`, zero-extending each byte.
///
/// The lane count is derived from the parameter types instead of being
/// hard-coded, so the body also type-checks where `usize` is not 8 bytes wide.
pub inline fn appendUTF8MachineWordToUTF16MachineWord(output: *[@sizeOf(usize) / 2]u16, input: *const [@sizeOf(usize) / 2]u8) void {
    const lanes = @sizeOf(usize) / 2;
    output[0..lanes].* = @as(
        [lanes]u16,
        @bitCast(@as(
            @Vector(lanes, u16),
            @as(@Vector(lanes, u8), @bitCast(input[0..lanes].*)),
        )),
    );
}
|
||
|
||
/// Copies bytes from `input_` into `output_`, widening each byte to a `u16`.
///
/// Copies `@min(input_.len, output_.len)` elements. When
/// `Environment.allow_assert` is enabled, `input_.len <= output_.len` is asserted.
pub inline fn copyU8IntoU16(output_: []u16, input_: []const u8) void {
    if (comptime Environment.allow_assert) std.debug.assert(input_.len <= output_.len);

    const count = @min(input_.len, output_.len);
    for (output_[0..count], input_[0..count]) |*unit, byte| {
        unit.* = byte;
    }
}
|
||
|
||
/// Variant of `copyU8IntoU16` for an output slice whose alignment is given by
/// `alignment`. Each input byte is widened to one `u16` in the output.
///
/// Elements are first copied one at a time while the output pointer is not
/// `u16`-aligned and at least `@sizeOf(usize) / 2` input bytes remain. After
/// that, an aligned output is handed to `copyU8IntoU16`; otherwise the rest is
/// copied element by element.
pub fn copyU8IntoU16WithAlignment(comptime alignment: u21, output_: []align(alignment) u16, input_: []const u8) void {
    var dest = output_;
    var src = input_;
    const word = @sizeOf(usize) / 2;
    if (comptime Environment.allow_assert) std.debug.assert(src.len <= dest.len);

    // un-aligned data access is slow, so we attempt to align the data
    while (src.len >= word and !std.mem.isAligned(@intFromPtr(dest.ptr), @alignOf(u16))) {
        dest[0] = src[0];
        dest = dest[1..];
        src = src[1..];
    }

    const dest_is_aligned = std.mem.isAligned(@intFromPtr(dest.ptr), @alignOf(u16));
    if (dest_is_aligned and src.len > 0) {
        copyU8IntoU16(@as([*]u16, @alignCast(dest.ptr))[0..dest.len], src);
        return;
    }

    for (src, 0..) |c, i| {
        dest[i] = c;
    }
}
|
||
|
||
// pub inline fn copy(output_: []u8, input_: []const u8) void {
|
||
// var output = output_;
|
||
// var input = input_;
|
||
// if (comptime Environment.allow_assert) std.debug.assert(input.len <= output.len);
|
||
|
||
// if (input.len > @sizeOf(usize) * 4) {
|
||
// comptime var i: usize = 0;
|
||
// inline while (i < 4) : (i += 1) {
|
||
// appendUTF8MachineWord(output[i * @sizeOf(usize) ..][0..@sizeOf(usize)], input[i * @sizeOf(usize) ..][0..@sizeOf(usize)]);
|
||
// }
|
||
// output = output[4 * @sizeOf(usize) ..];
|
||
// input = input[4 * @sizeOf(usize) ..];
|
||
// }
|
||
|
||
// while (input.len >= @sizeOf(usize)) {
|
||
// appendUTF8MachineWord(output[0..@sizeOf(usize)], input[0..@sizeOf(usize)]);
|
||
// output = output[@sizeOf(usize)..];
|
||
// input = input[@sizeOf(usize)..];
|
||
// }
|
||
|
||
// for (input) |c, i| {
|
||
// output[i] = c;
|
||
// }
|
||
// }
|
||
|
||
/// Copies 16-bit units from `input_` into `output_`, keeping only the low
/// 8 bits of each unit.
///
/// Copies `@min(input_.len, output_.len)` elements. When
/// `Environment.enableSIMD` is set, the bulk is processed in groups of 16
/// units and the remainder one unit at a time.
pub inline fn copyU16IntoU8(output_: []u8, comptime InputType: type, input_: InputType) void {
    if (comptime Environment.allow_assert) std.debug.assert(input_.len <= output_.len);

    // https://zig.godbolt.org/z/9rTn1orcY

    const lanes = @as(usize, 16);
    const total = @min(input_.len, output_.len);

    var src = input_.ptr;
    var dst = output_.ptr;
    var tail_len = total;

    if (comptime Environment.enableSIMD) {
        // round down to the last whole group of 16 units
        const vectorized_len = total & ~(lanes - 1);
        const vectorized_end = input_.ptr + vectorized_len;

        while (vectorized_end != src) {
            const chunk: @Vector(lanes, u16) = src[0..lanes].*;
            inline for (0..lanes) |lane| {
                dst[lane] = @as(u8, @truncate(chunk[lane]));
            }

            dst += lanes;
            src += lanes;
        }

        tail_len -= vectorized_len;
    }

    const tail_end = src + tail_len;
    while (tail_end != src) {
        dst[0] = @as(u8, @truncate(src[0]));
        dst += 1;
        src += 1;
    }
}
|
||
|
||
// Alias for this file's own namespace, so its declarations can be written as `strings.name`.
const strings = @This();
|
||
|
||
/// Copies `src` into `dest`, clearing the high bit of every byte that follows
/// the leading all-ASCII prefix (the prefix itself is copied unchanged).
///
/// NOTE(review): the final loop reads one `src` byte per remaining `dest` byte,
/// so this presumably expects `dest.len == src.len` — confirm against callers.
pub fn copyLatin1IntoASCII(dest: []u8, src: []const u8) void {
    var input = src;
    var output = dest;

    const ascii_prefix_len = strings.firstNonASCII(input) orelse @as(u32, @truncate(input.len));
    if (ascii_prefix_len > 0) {
        @memcpy(output[0..ascii_prefix_len], input[0..ascii_prefix_len]);
        input = input[ascii_prefix_len..];
        output = output[ascii_prefix_len..];

        // ascii fast path
        if (input.len == 0) {
            return;
        }
    }

    if (output.len >= 16 and bun.Environment.enableSIMD) {
        const chunk_size = 16;
        // https://zig.godbolt.org/z/qezsY8T3W
        const input_chunked = input[0 .. input.len - (input.len % chunk_size)];
        const output_chunked = output[0 .. output.len - (output.len % chunk_size)];
        var input_words = std.mem.bytesAsSlice(u64, input_chunked);
        var output_words = std.mem.bytesAsSlice(u64, output_chunked);
        const word_count = @min(input_words.len, output_words.len);
        input_words = input_words[0..word_count];
        output_words = output_words[0..word_count];
        const input_words_end = input_words.ptr + input_words.len;
        // using the pointer instead of the length is super important for the codegen
        while (input_words_end != input_words.ptr) {
            const word = input_words[0];
            // this gets auto-vectorized
            const low_seven_bits = @as(u64, 0x7f7f7f7f7f7f7f7f);
            output_words[0] = word & low_seven_bits;

            input_words = input_words[1..];
            output_words = output_words[1..];
        }
        input = input[input_chunked.len..];
        output = output[output_chunked.len..];
    }

    for (output) |*out_byte| {
        out_byte.* = @as(u8, @as(u7, @truncate(input[0])));
        input = input[1..];
    }
}
|
||
|
||
/// It is common on Windows to find files that are not encoded in UTF-8. Most of these
/// start with a 'byte-order mark' code point, whose byte layout identifies the encoding.
///
/// https://en.wikipedia.org/wiki/Byte_order_mark
pub const BOM = enum {
    utf8,
    utf16_le,
    utf16_be,
    utf32_le,
    utf32_be,

    pub const utf8_bytes = [_]u8{ 0xef, 0xbb, 0xbf };
    pub const utf16_le_bytes = [_]u8{ 0xff, 0xfe };
    pub const utf16_be_bytes = [_]u8{ 0xfe, 0xff };
    pub const utf32_le_bytes = [_]u8{ 0xff, 0xfe, 0x00, 0x00 };
    pub const utf32_be_bytes = [_]u8{ 0x00, 0x00, 0xfe, 0xff };

    /// Returns the byte-order mark found at the start of `bytes`, or null.
    /// Only the UTF-8 and UTF-16 LE marks are recognized at the moment.
    ///
    /// Each mark is checked against its own length, so an input that consists
    /// of nothing but the 2-byte UTF-16 LE mark is still detected.
    pub fn detect(bytes: []const u8) ?BOM {
        if (bytes.len >= utf8_bytes.len and eqlComptimeIgnoreLen(bytes, utf8_bytes)) return .utf8;
        if (bytes.len >= utf16_le_bytes.len and eqlComptimeIgnoreLen(bytes, utf16_le_bytes)) {
            // if (bytes.len > 4 and eqlComptimeIgnoreLen(bytes[2..], utf32_le_bytes[2..]))
            //   return .utf32_le;
            return .utf16_le;
        }
        // if (eqlComptimeIgnoreLen(bytes, utf16_be_bytes)) return .utf16_be;
        // if (bytes.len > 4 and eqlComptimeIgnoreLen(bytes, utf32_le_bytes)) return .utf32_le;
        return null;
    }

    /// Returns the detected mark (or null) together with `bytes` minus that mark.
    pub fn detectAndSplit(bytes: []const u8) struct { ?BOM, []const u8 } {
        const bom = detect(bytes) orelse return .{ null, bytes };
        return .{ bom, bytes[bom.length()..] };
    }

    /// Returns the raw bytes of the mark for `bom` (the matching `*_bytes` constant).
    pub fn getHeader(bom: BOM) []const u8 {
        return switch (bom) {
            inline else => |t| comptime &@field(BOM, @tagName(t) ++ "_bytes"),
        };
    }

    /// Returns the length in bytes of the mark for `bom`.
    pub fn length(bom: BOM) usize {
        return switch (bom) {
            inline else => |t| comptime (&@field(BOM, @tagName(t) ++ "_bytes")).len,
        };
    }

    /// Moves everything after the first `prefix_len` bytes to the front of
    /// `bytes` and returns the shortened view. No allocation takes place.
    fn dropPrefixInPlace(bytes: []u8, prefix_len: usize) []u8 {
        bun.C.memmove(bytes.ptr, bytes.ptr + prefix_len, bytes.len - prefix_len);
        return bytes[0 .. bytes.len - prefix_len];
    }

    /// Views the payload after a UTF-16 LE mark as 16-bit code units.
    /// A trailing odd byte cannot form a code unit and is ignored;
    /// `std.mem.bytesAsSlice` requires a length divisible by the element size.
    fn utf16LeCodeUnits(bytes: []u8) []const u16 {
        const payload = bytes[utf16_le_bytes.len..];
        const even_len = payload.len - (payload.len % @sizeOf(u16));
        return @alignCast(std.mem.bytesAsSlice(u16, payload[0..even_len]));
    }

    /// Removes the mark from `bytes` and converts the content to UTF-8.
    ///
    /// UTF-8 input is shifted in place and a shortened view of `bytes` is returned.
    /// UTF-16 LE input is converted into a new allocation; `bytes` is then freed
    /// with `allocator` and the caller replaces it with the returned slice.
    /// All other encodings currently only have their mark removed in place.
    pub fn removeAndConvertToUTF8AndFree(bom: BOM, allocator: std.mem.Allocator, bytes: []u8) ![]u8 {
        switch (bom) {
            .utf8 => return dropPrefixInPlace(bytes, utf8_bytes.len),
            .utf16_le => {
                const out = try toUTF8Alloc(allocator, utf16LeCodeUnits(bytes));
                allocator.free(bytes);
                return out;
            },
            else => {
                // TODO: this needs to re-encode, for now we just remove the BOM
                return dropPrefixInPlace(bytes, bom.getHeader().len);
            },
        }
    }

    /// This is required for fs.zig's `use_shared_buffer` flag. we cannot free that pointer.
    /// The returned slice always points to the base of `list`'s buffer.
    ///
    /// Requires an arraylist in case it must be grown.
    pub fn removeAndConvertToUTF8WithoutDealloc(bom: BOM, allocator: std.mem.Allocator, list: *std.ArrayListUnmanaged(u8)) ![]u8 {
        const bytes = list.items;
        switch (bom) {
            .utf8 => return dropPrefixInPlace(bytes, utf8_bytes.len),
            .utf16_le => {
                const out = try toUTF8Alloc(allocator, utf16LeCodeUnits(bytes));
                // `out` is only a staging buffer: its contents are copied into the
                // list below, so it is released here instead of being handed out.
                defer allocator.free(out);

                if (list.capacity < out.len) {
                    try list.ensureTotalCapacity(allocator, out.len);
                }
                list.items.len = out.len;
                @memcpy(list.items, out);
                return list.items;
            },
            else => {
                // TODO: this needs to re-encode, for now we just remove the BOM
                return dropPrefixInPlace(bytes, bom.getHeader().len);
            },
        }
    }
};
|
||
|
||
/// @deprecated. Callers of this most likely also need to strip other byte-order
/// marks and deal with the encoding; prefer `BOM.detect` and the conversion
/// functions on `BOM`.
///
/// Returns `bytes` without a leading UTF-8 byte-order mark, if it has one.
pub fn withoutUTF8BOM(bytes: []const u8) []const u8 {
    const has_bom = strings.hasPrefixComptime(bytes, BOM.utf8_bytes);
    return if (has_bom) bytes[BOM.utf8_bytes.len..] else bytes;
}
|
||
|
||
/// Converts UTF-8 `bytes` to a newly allocated UTF-16 slice, but only when the
/// input contains at least one non-ASCII byte. All-ASCII input returns null.
/// This is intended to be used for strings that go to JavaScript.
///
/// When `bun.FeatureFlags.use_simdutf` is set, the input is first trimmed with
/// `bun.simdutf.trim.utf8` and converted in one call; the manual path below
/// only runs when that conversion reports failure or produces no length.
/// With `fail_if_invalid`, invalid input yields `error.InvalidByteSequence`.
pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 {
    const ascii_prefix_len = strings.firstNonASCII(bytes) orelse return null;

    const Slow = struct {
        /// Decodes the UTF-8 sequence at the start of `rest.*`, advances
        /// `rest.*` past it (by at least one byte) and appends the resulting
        /// UTF-16 unit(s) to `list`.
        fn appendNext(list: *std.ArrayList(u16), rest: *[]const u8) !void {
            const input = rest.*;
            // The decoder takes 4 bytes; shorter tails are zero-padded.
            const padded: [4]u8 = switch (input.len) {
                0 => unreachable,
                1 => [_]u8{ input[0], 0, 0, 0 },
                2 => [_]u8{ input[0], input[1], 0, 0 },
                3 => [_]u8{ input[0], input[1], input[2], 0 },
                else => input[0..4].*,
            };

            const decoded = strings.convertUTF8BytesIntoUTF16(&padded);
            if (comptime fail_if_invalid) {
                if (decoded.fail) {
                    if (comptime Environment.allow_assert) std.debug.assert(decoded.code_point == unicode_replacement);
                    return error.InvalidByteSequence;
                }
            }
            rest.* = input[@max(decoded.len, 1)..];

            //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
            switch (decoded.code_point) {
                0...0xffff => |c| {
                    try list.append(@as(u16, @intCast(c)));
                },
                else => |c| {
                    try list.appendSlice(&[_]u16{ strings.u16Lead(c), strings.u16Trail(c) });
                },
            }
        }
    };

    const simd_result: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: {
        const trimmed = bun.simdutf.trim.utf8(bytes);

        if (trimmed.len == 0)
            break :simd null;

        const utf16_len = bun.simdutf.length.utf16.from.utf8.le(trimmed);

        if (utf16_len == 0)
            break :simd null;

        const converted = try allocator.alloc(u16, utf16_len);
        log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, utf16_len });

        // avoid `.with_errors.le()` due to https://github.com/simdutf/simdutf/issues/213
        if (bun.simdutf.convert.utf8.to.utf16.le(trimmed, converted) != 0) return converted;

        if (comptime fail_if_invalid) {
            allocator.free(converted);
            return error.InvalidByteSequence;
        }

        // Reuse the allocation as the backing store of the manual path.
        // NOTE(review): this keeps `converted[0..ascii_prefix_len]` as already-valid
        // output, i.e. it presumably relies on the failed conversion having written
        // the ASCII prefix — confirm.
        break :simd .{
            .items = converted[0..ascii_prefix_len],
            .capacity = converted.len,
            .allocator = allocator,
        };
    } else null;

    var utf16 = simd_result orelse fallback: {
        var list = try std.ArrayList(u16).initCapacity(allocator, ascii_prefix_len + 2);
        list.items.len = ascii_prefix_len;
        strings.copyU8IntoU16(list.items, bytes[0..ascii_prefix_len]);
        break :fallback list;
    };
    errdefer utf16.deinit();

    var rest = bytes[ascii_prefix_len..];

    // `rest` starts at the first non-ASCII byte.
    try Slow.appendNext(&utf16, &rest);

    while (strings.firstNonASCII(rest)) |ascii_run_len| {
        // Widen the ASCII run in front of the next non-ASCII byte...
        const run_start = utf16.items.len;
        try utf16.ensureUnusedCapacity(ascii_run_len);
        utf16.items.len += ascii_run_len;
        strings.copyU8IntoU16(utf16.items[run_start..][0..ascii_run_len], rest[0..ascii_run_len]);
        rest = rest[ascii_run_len..];

        // ...then decode the sequence that starts there.
        try Slow.appendNext(&utf16, &rest);
    }

    // Whatever is left is pure ASCII.
    if (rest.len > 0) {
        try utf16.ensureTotalCapacityPrecise(utf16.items.len + rest.len);

        utf16.items.len += rest.len;
        strings.copyU8IntoU16(utf16.items[utf16.items.len - rest.len ..], rest);
    }

    return utf16.items;
}
|
||
|
||
/// Same conversion as `toUTF16Alloc`, except that the input is passed to
/// simdutf as-is instead of being trimmed with `bun.simdutf.trim.utf8` first.
///
/// Returns null for all-ASCII input. With `fail_if_invalid`, invalid input
/// yields `error.InvalidByteSequence`.
pub fn toUTF16AllocNoTrim(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 {
    const ascii_prefix_len = strings.firstNonASCII(bytes) orelse return null;

    const Slow = struct {
        /// Decodes the UTF-8 sequence at the start of `rest.*`, advances
        /// `rest.*` past it (by at least one byte) and appends the resulting
        /// UTF-16 unit(s) to `list`.
        fn appendNext(list: *std.ArrayList(u16), rest: *[]const u8) !void {
            const input = rest.*;
            // The decoder takes 4 bytes; shorter tails are zero-padded.
            const padded: [4]u8 = switch (input.len) {
                0 => unreachable,
                1 => [_]u8{ input[0], 0, 0, 0 },
                2 => [_]u8{ input[0], input[1], 0, 0 },
                3 => [_]u8{ input[0], input[1], input[2], 0 },
                else => input[0..4].*,
            };

            const decoded = strings.convertUTF8BytesIntoUTF16(&padded);
            if (comptime fail_if_invalid) {
                if (decoded.fail) {
                    if (comptime Environment.allow_assert) std.debug.assert(decoded.code_point == unicode_replacement);
                    return error.InvalidByteSequence;
                }
            }
            rest.* = input[@max(decoded.len, 1)..];

            //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
            switch (decoded.code_point) {
                0...0xffff => |c| {
                    try list.append(@as(u16, @intCast(c)));
                },
                else => |c| {
                    try list.appendSlice(&[_]u16{ strings.u16Lead(c), strings.u16Trail(c) });
                },
            }
        }
    };

    const simd_result: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: {
        const utf16_len = bun.simdutf.length.utf16.from.utf8.le(bytes);

        if (utf16_len == 0)
            break :simd null;

        const converted = try allocator.alloc(u16, utf16_len);
        log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, utf16_len });

        // avoid `.with_errors.le()` due to https://github.com/simdutf/simdutf/issues/213
        if (bun.simdutf.convert.utf8.to.utf16.le(bytes, converted) != 0) return converted;

        if (comptime fail_if_invalid) {
            allocator.free(converted);
            return error.InvalidByteSequence;
        }

        // Reuse the allocation as the backing store of the manual path.
        // NOTE(review): this keeps `converted[0..ascii_prefix_len]` as already-valid
        // output, i.e. it presumably relies on the failed conversion having written
        // the ASCII prefix — confirm.
        break :simd .{
            .items = converted[0..ascii_prefix_len],
            .capacity = converted.len,
            .allocator = allocator,
        };
    } else null;

    var utf16 = simd_result orelse fallback: {
        var list = try std.ArrayList(u16).initCapacity(allocator, ascii_prefix_len + 2);
        list.items.len = ascii_prefix_len;
        strings.copyU8IntoU16(list.items, bytes[0..ascii_prefix_len]);
        break :fallback list;
    };
    errdefer utf16.deinit();

    var rest = bytes[ascii_prefix_len..];

    // `rest` starts at the first non-ASCII byte.
    try Slow.appendNext(&utf16, &rest);

    while (strings.firstNonASCII(rest)) |ascii_run_len| {
        // Widen the ASCII run in front of the next non-ASCII byte...
        const run_start = utf16.items.len;
        try utf16.ensureUnusedCapacity(ascii_run_len);
        utf16.items.len += ascii_run_len;
        strings.copyU8IntoU16(utf16.items[run_start..][0..ascii_run_len], rest[0..ascii_run_len]);
        rest = rest[ascii_run_len..];

        // ...then decode the sequence that starts there.
        try Slow.appendNext(&utf16, &rest);
    }

    // Whatever is left is pure ASCII.
    if (rest.len > 0) {
        try utf16.ensureTotalCapacityPrecise(utf16.items.len + rest.len);

        utf16.items.len += rest.len;
        strings.copyU8IntoU16(utf16.items[utf16.items.len - rest.len ..], rest);
    }

    return utf16.items;
}
|
||
|
||
/// Decodes the code point at the start of the UTF-16 sequence `input`
/// (which must not be empty) and reports how many units it occupies.
///
/// - A non-surrogate unit is returned as-is with `len = 1`.
/// - A lead surrogate followed by a trail surrogate is combined, `len = 2`.
/// - A lead surrogate that is the last unit yields the default code point
///   with `len = 1` and `fail` left false.
/// - A lead surrogate followed by anything else, or a trail surrogate in
///   first position, yields `unicode_replacement` with `fail = true`, `len = 1`.
pub fn utf16CodepointWithFFFD(comptime Type: type, input: Type) UTF16Replacement {
    const first = @as(u21, input[0]);
    const surrogate_class = first & ~@as(u21, 0x03ff);

    if (surrogate_class == 0xdc00) {
        // return error.UnexpectedSecondSurrogateHalf;
        return .{ .fail = true, .len = 1, .code_point = unicode_replacement };
    }

    if (surrogate_class != 0xd800) {
        return .{ .code_point = first, .len = 1 };
    }

    // `first` is a lead surrogate from here on.
    if (input.len == 1) {
        //error.DanglingSurrogateHalf;
        return .{ .len = 1 };
    }

    const second = @as(u21, input[1]);
    if (second & ~@as(u21, 0x03ff) != 0xdc00) {
        // return error.ExpectedSecondSurrogateHalf;
        return .{ .fail = true, .len = 1, .code_point = unicode_replacement };
    }

    return .{ .len = 2, .code_point = 0x10000 + (((first & 0x03ff) << 10) | (second & 0x03ff)) };
}
|
||
|
||
/// Decodes the code point at the start of the UTF-16 sequence `input`
/// (which must not be empty) and reports how many units it occupies.
/// Unlike `utf16CodepointWithFFFD`, the `fail` flag is never set here.
///
/// - A non-surrogate unit is returned as-is with `len = 1`.
/// - A trail surrogate in first position, or a lead surrogate that is the last
///   unit, yields the default code point with `len = 1`.
/// - A lead surrogate followed by any other unit is combined with it, `len = 2`.
pub fn utf16Codepoint(comptime Type: type, input: Type) UTF16Replacement {
    const first = @as(u21, input[0]);

    switch (first & ~@as(u21, 0x03ff)) {
        0xd800 => {
            // lead surrogate
            if (input.len == 1) {
                //error.DanglingSurrogateHalf;
                return .{ .len = 1 };
            }

            const second = @as(u21, input[1]);
            // NOTE(review): `second` is not required to be a trail surrogate
            // (0xdc00..0xdfff) here, so a lead surrogate followed by an ordinary
            // unit is still merged into one code point and consumes both units.
            // This may be unintended (see `utf16CodepointWithFFFD`) — confirm.
            return .{ .len = 2, .code_point = 0x10000 + (((first & 0x03ff) << 10) | (second & 0x03ff)) };
        },
        0xdc00 => {
            // return error.UnexpectedSecondSurrogateHalf;
            return .{ .len = 1 };
        },
        else => return .{ .code_point = first, .len = 1 },
    }
}
|
||
|
||
/// Returns true when `utf8` starts with exactly one slash character (as judged
/// by `charIsAnySlash`): the first byte is a slash and the second, if present,
/// is not.
fn windowsPathIsPosixAbsolute(utf8: []const u8) bool {
    const starts_with_slash = utf8.len > 0 and charIsAnySlash(utf8[0]);
    const starts_with_two_slashes = utf8.len > 1 and charIsAnySlash(utf8[1]);
    return starts_with_slash and !starts_with_two_slashes;
}
|
||
|
||
/// Encodes the UTF-16 path `utf16` as UTF-8 into `buf` and NUL-terminates it.
///
/// The last byte of `buf` is kept free for the terminator, so `buf` must not be
/// empty. Returns the written part of `buf` as a sentinel-terminated slice.
pub fn fromWPath(buf: []u8, utf16: []const u16) [:0]const u8 {
    std.debug.assert(buf.len > 0);

    const terminator_slot = buf.len - 1;
    const encoded = copyUTF16IntoUTF8(buf[0..terminator_slot], []const u16, utf16, false);
    const written = encoded.written;

    std.debug.assert(written < buf.len);
    buf[written] = 0;
    return buf[0..written :0];
}
|
||
|
||
/// Converts `utf8` to a normalized, NUL-terminated UTF-16 path in `wbuf`.
///
/// Paths that `std.fs.path.isAbsoluteWindows` accepts are prefixed with `\??\`;
/// everything else is passed straight to `toWPathNormalized`.
pub fn toNTPath(wbuf: []u16, utf8: []const u8) [:0]const u16 {
    if (std.fs.path.isAbsoluteWindows(utf8)) {
        const prefix = [_]u16{ '\\', '?', '?', '\\' };
        wbuf[0..prefix.len].* = prefix;

        const body_len = toWPathNormalized(wbuf[prefix.len..], utf8).len;
        return wbuf[0 .. body_len + prefix.len :0];
    }

    return toWPathNormalized(wbuf, utf8);
}
|
||
|
||
// These are the same because they don't have rules like needing a trailing slash
|
||
/// Same function as `toNTPath`, exported under a second name.
pub const toNTDir = toNTPath;
|
||
|
||
/// Writes the `\\?\` prefix into `wbuf`, followed by `utf8` converted through
/// `toWPathNormalized`, and returns the whole NUL-terminated UTF-16 path.
/// `wbuf` must hold more than the 4 prefix units.
pub fn toExtendedPathNormalized(wbuf: []u16, utf8: []const u8) [:0]const u16 {
    const prefix = [_]u16{ '\\', '\\', '?', '\\' };
    std.debug.assert(wbuf.len > 4);
    wbuf[0..prefix.len].* = prefix;

    const body_len = toWPathNormalized(wbuf[prefix.len..], utf8).len;
    return wbuf[0 .. body_len + prefix.len :0];
}
|
||
|
||
/// Converts `utf8` to a normalized UTF-16 path in `wbuf`, using
/// `toExtendedPathNormalized` for paths that `std.fs.path.isAbsoluteWindows`
/// accepts and `toWPathNormalized` for all others.
pub fn toWPathNormalizeAutoExtend(wbuf: []u16, utf8: []const u8) [:0]const u16 {
    const is_absolute = std.fs.path.isAbsoluteWindows(utf8);
    return if (is_absolute)
        toExtendedPathNormalized(wbuf, utf8)
    else
        toWPathNormalized(wbuf, utf8);
}
|
||
|
||
/// Converts `utf8` to a NUL-terminated UTF-16 path in `wbuf` after normalizing it:
/// forward slashes become backslashes, and a trailing separator is dropped when
/// the path is longer than 3 bytes.
///
/// The slash replacement goes through a stack buffer of `bun.MAX_PATH_BYTES`
/// bytes, so `utf8` must fit in it.
pub fn toWPathNormalized(wbuf: []u16, utf8: []const u8) [:0]const u16 {
    var backslashed: [bun.MAX_PATH_BYTES]u8 = undefined;
    var path = utf8;

    if (bun.strings.containsChar(utf8, '/')) {
        const copy = backslashed[0..utf8.len];
        for (copy, utf8) |*dst, src| {
            dst.* = if (src == '/') '\\' else src;
        }
        path = copy;
    }

    // is there a trailing slash? Let's remove it before converting to UTF-16
    const last = path.len -% 1;
    if (path.len > 3 and bun.path.isSepAny(path[last])) {
        path = path[0..last];
    }

    return toWPath(wbuf, path);
}
|
||
|
||
/// Converts `utf8` to a NUL-terminated UTF-16 directory path in `wbuf` via
/// `toWDirPath`, after replacing forward slashes with backslashes.
///
/// The slash replacement goes through a stack buffer of `bun.MAX_PATH_BYTES`
/// bytes, so `utf8` must fit in it.
pub fn toWDirNormalized(wbuf: []u16, utf8: []const u8) [:0]const u16 {
    var backslashed: [bun.MAX_PATH_BYTES]u8 = undefined;
    var path = utf8;

    if (bun.strings.containsChar(utf8, '/')) {
        const copy = backslashed[0..utf8.len];
        for (copy, utf8) |*dst, src| {
            dst.* = if (src == '/') '\\' else src;
        }
        path = copy;
    }

    return toWDirPath(wbuf, path);
}
|
||
|
||
/// Converts `utf8` to a NUL-terminated UTF-16 path in `wbuf`.
/// Forwards to `toWPathMaybeDir` with the trailing-separator option disabled.
pub fn toWPath(wbuf: []u16, utf8: []const u8) [:0]const u16 {
|
||
return toWPathMaybeDir(wbuf, utf8, false);
|
||
}
|
||
|
||
/// Converts `utf8` to a NUL-terminated UTF-16 directory path in `wbuf`.
/// Forwards to `toWPathMaybeDir` with the trailing-separator option enabled.
pub fn toWDirPath(wbuf: []u16, utf8: []const u8) [:0]const u16 {
|
||
return toWPathMaybeDir(wbuf, utf8, true);
|
||
}
|
||
|
||
/// Debug-only sanity check for paths that are about to be handed to Windows APIs.
///
/// Does nothing unless both `Environment.allow_assert` and
/// `Environment.isWindows` are set. Otherwise it panics when `utf8` looks like a
/// POSIX absolute path (see `windowsPathIsPosixAbsolute`) or starts with `:/`.
pub fn assertIsValidWindowsPath(utf8: []const u8) void {
    if (!Environment.allow_assert or !Environment.isWindows) return;

    if (windowsPathIsPosixAbsolute(utf8)) {
        std.debug.panic("Do not pass posix paths to windows APIs, was given '{s}' (missing a root like 'C:\\', see PosixToWinNormalizer for why this is an assertion)", .{
            utf8,
        });
    }

    const drive_letter_missing = startsWith(utf8, ":/");
    if (drive_letter_missing) {
        std.debug.panic("Path passed to windows API '{s}' is almost certainly invalid. Where did the drive letter go?", .{
            utf8,
        });
    }
}
|
||
|
||
/// Converts `utf8` to UTF-16 in `wbuf`, NUL-terminates it and returns the
/// written part as a sentinel-terminated slice.
///
/// One unit of `wbuf` is reserved for the terminator and, with
/// `add_trailing_lash`, one more for a trailing backslash. The backslash is
/// appended only when the converted path is non-empty and does not already
/// end with one. The conversion status is not inspected; only the reported
/// unit count is used.
pub fn toWPathMaybeDir(wbuf: []u16, utf8: []const u8, comptime add_trailing_lash: bool) [:0]const u16 {
    std.debug.assert(wbuf.len > 0);

    assertIsValidWindowsPath(utf8);

    const reserved = 1 + @as(usize, @intFromBool(add_trailing_lash));
    const conversion = bun.simdutf.convert.utf8.to.utf16.with_errors.le(
        utf8,
        wbuf[0..wbuf.len -| reserved],
    );
    var len = conversion.count;

    if (add_trailing_lash and len > 0 and wbuf[len - 1] != '\\') {
        wbuf[len] = '\\';
        len += 1;
    }

    wbuf[len] = 0;

    return wbuf[0..len :0];
}
|
||
|
||
/// Converts `utf16` to UTF-8 with simdutf, writing into the full capacity of
/// `list_` starting at its first byte, and returns the list with its length set
/// to the number of bytes produced.
///
/// When simdutf reports a surrogate error, the conversion is redone by
/// `toUTF8ListWithTypeBun`. The list is not grown here, so the caller must have
/// reserved enough capacity beforehand.
pub fn convertUTF16ToUTF8(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) !std.ArrayList(u8) {
    var out = list_;
    const whole_buffer = out.items.ptr[0..out.capacity];

    const outcome = bun.simdutf.convert.utf16.to.utf8.with_errors.le(utf16, whole_buffer);
    if (outcome.status == .surrogate) {
        // Slow path: there was invalid UTF-16, so we need to convert it without simdutf.
        return toUTF8ListWithTypeBun(out, Type, utf16);
    }

    out.items.len = outcome.count;
    return out;
}
|
||
|
||
/// Converts `utf16` to UTF-8 in memory obtained from `allocator` and returns the
/// converted bytes.
///
/// For `[]const u16` input with `bun.FeatureFlags.use_simdutf` enabled, the
/// buffer is sized from simdutf's length estimate plus 16 bytes and filled by
/// `convertUTF16ToUTF8`. Otherwise the work is done by `toUTF8ListWithType`.
/// The returned slice is the list's `items`, whose length can be smaller than
/// the capacity that was allocated.
pub fn toUTF8AllocWithType(allocator: std.mem.Allocator, comptime Type: type, utf16: Type) ![]u8 {
    if (bun.FeatureFlags.use_simdutf and comptime Type == []const u16) {
        const utf8_len = bun.simdutf.length.utf8.from.utf16.le(utf16);
        // add 16 bytes of padding for SIMDUTF
        const scratch = try std.ArrayList(u8).initCapacity(allocator, utf8_len + 16);
        const converted = try convertUTF16ToUTF8(scratch, Type, utf16);
        return converted.items;
    }

    const scratch = try std.ArrayList(u8).initCapacity(allocator, utf16.len);
    const converted = try toUTF8ListWithType(scratch, Type, utf16);
    return converted.items;
}
|
||
|
||
/// Converts `utf16` to UTF-8 using `list_` as the output list and returns the
/// updated list.
///
/// For `[]const u16` input with `bun.FeatureFlags.use_simdutf` enabled, the list
/// is grown to simdutf's length estimate plus 16 bytes and filled by
/// `convertUTF16ToUTF8`. Every other case goes to `toUTF8ListWithTypeBun`.
pub fn toUTF8ListWithType(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) !std.ArrayList(u8) {
    if (bun.FeatureFlags.use_simdutf and comptime Type == []const u16) {
        var out = list_;
        const utf8_len = bun.simdutf.length.utf8.from.utf16.le(utf16);
        try out.ensureTotalCapacityPrecise(utf8_len + 16);
        const converted = try convertUTF16ToUTF8(out, Type, utf16);
        // Commenting out because `convertUTF16ToUTF8` may convert to WTF-8
        // which uses 3 bytes for invalid surrogates, causing the length to not
        // match from simdutf.
        // if (Environment.allow_assert) {
        //     std.debug.assert(buf.items.len == length);
        // }
        return converted;
    }

    return toUTF8ListWithTypeBun(list_, Type, utf16);
}
|
||
|
||
/// Converts Latin-1 text to UTF-8 in a new list, or returns null when
/// `isAllASCII` reports that `latin1` is pure ASCII and needs no conversion.
/// Must not be reached when `bun.JSC.is_bindgen` is set.
pub fn toUTF8FromLatin1(allocator: std.mem.Allocator, latin1: []const u8) !?std.ArrayList(u8) {
    if (bun.JSC.is_bindgen)
        unreachable;

    if (isAllASCII(latin1))
        return null;

    const empty_list = try std.ArrayList(u8).initCapacity(allocator, latin1.len);
    const converted = try allocateLatin1IntoUTF8WithList(empty_list, 0, []const u8, latin1);
    return converted;
}
|
||
|
||
/// Converts `utf16` to UTF-8 without simdutf, appending to `list_` and
/// returning the updated list.
///
/// The input is processed as alternating runs: an ASCII run (as located by
/// `firstNonASCII16`) is narrowed with `copyU16IntoU8`, then the following code
/// point is decoded with `utf16CodepointWithFFFD` and written with
/// `encodeWTF8RuneT`. Whatever remains after the last non-ASCII unit is
/// narrowed in one go.
pub fn toUTF8ListWithTypeBun(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) !std.ArrayList(u8) {
    var out = list_;
    var rest = utf16;

    while (firstNonASCII16(Type, rest)) |ascii_len| {
        const ascii_run = rest[0..ascii_len];
        rest = rest[ascii_len..];

        const decoded = utf16CodepointWithFFFD(Type, rest);
        rest = rest[decoded.len..];

        const encoded_len: usize = decoded.utf8Width();
        // Reserve room for the run, the encoded code point and an estimate for the remainder.
        if (comptime Environment.isNative) {
            try out.ensureTotalCapacityPrecise(ascii_len + encoded_len + out.items.len + @as(usize, @intFromFloat((@as(f64, @floatFromInt(@as(u52, @truncate(rest.len)))) * 1.2))));
        } else {
            try out.ensureTotalCapacityPrecise(ascii_len + encoded_len + out.items.len + rest.len + 4);
        }

        out.items.len += ascii_len;
        copyU16IntoU8(
            out.items[out.items.len - ascii_len ..],
            Type,
            ascii_run,
        );

        out.items.len += encoded_len;
        const encode_start = out.items.len - encoded_len;
        // The encoder is handed a fixed 4-byte window that starts at the new code point.
        _ = encodeWTF8RuneT(
            out.items.ptr[encode_start .. encode_start + 4][0..4],
            u32,
            @as(u32, decoded.code_point),
        );
    }

    if (rest.len > 0) {
        try out.ensureTotalCapacityPrecise(rest.len + out.items.len);
        const tail_start = out.items.len;
        out.items.len += rest.len;
        copyU16IntoU8(out.items[tail_start..], Type, rest);
    }

    log("UTF16 {d} -> {d} UTF8", .{ utf16.len, out.items.len });

    return out;
}
|
||
|
||
/// Pair of counters reported by encode-into style helpers; both default to 0.
pub const EncodeIntoResult = struct {
|
||
// presumably the number of input elements consumed — TODO confirm against the producers
read: u32 = 0,
|
||
// number of output elements produced; `fromWPath` uses it as the count of bytes written
written: u32 = 0,
|
||
};
|
||
/// Returns a newly allocated UTF-8 copy of the Latin-1 text `latin1_`.
///
/// When `bun.FeatureFlags.latin1_is_now_ascii` is set, the bytes are copied
/// verbatim. Otherwise they are converted by `allocateLatin1IntoUTF8WithList`
/// and the result is returned as an owned slice.
pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, comptime Type: type, latin1_: Type) ![]u8 {
    if (comptime bun.FeatureFlags.latin1_is_now_ascii) {
        const copy = try allocator.alloc(u8, latin1_.len);
        @memcpy(copy[0..latin1_.len], latin1_);
        return copy;
    }

    const empty_list = try std.ArrayList(u8).initCapacity(allocator, latin1_.len);
    var converted = try allocateLatin1IntoUTF8WithList(empty_list, 0, Type, latin1_);
    return try converted.toOwnedSlice();
}
|
||
|
||
/// Converts the Latin-1 text `latin1_` to UTF-8, writing into `list_` starting
/// at byte offset `offset_into_list`, and returns the list with its length set
/// to the end of the converted text.
///
/// The input is handled as alternating runs. ASCII bytes are copied unchanged,
/// first `ascii_vector_size` bytes at a time, then 8 bytes at a time, then
/// singly. Each byte above 127 is expanded to the two bytes produced by
/// `latin1ToCodepointBytesAssumeNotASCII`, growing the list as needed.
pub fn allocateLatin1IntoUTF8WithList(list_: std.ArrayList(u8), offset_into_list: usize, comptime Type: type, latin1_: Type) !std.ArrayList(u8) {
    const Word = u64;
    const word_size = @sizeOf(Word);

    var src = latin1_;
    var written: usize = offset_into_list;
    var out = list_;
    try out.ensureUnusedCapacity(src.len);

    while (src.len > 0) {
        if (comptime Environment.allow_assert) std.debug.assert(written < out.capacity);
        var dest = out.items.ptr[written..out.capacity];

        ascii_run: {
            var vectors_left = src.len / ascii_vector_size;
            while (vectors_left > 0) : (vectors_left -= 1) {
                const vec: AsciiVector = src[0..ascii_vector_size].*;

                if (@reduce(.Max, vec) > 127) {
                    // zig or LLVM doesn't do @ctz nicely with SIMD
                    if (comptime ascii_vector_size >= 8) {
                        {
                            // First 8 bytes of the vector.
                            const word = @as(Word, @bitCast(src[0..word_size].*));
                            // https://dotat.at/@/2022-06-27-tolower-swar.html
                            const high_bits = word & 0x8080808080808080;

                            if (high_bits > 0) {
                                const first_non_ascii = @ctz(high_bits) / 8;
                                if (comptime Environment.allow_assert) std.debug.assert(src[first_non_ascii] >= 127);

                                dest[0..word_size].* = @as([word_size]u8, @bitCast(word));
                                dest = dest[first_non_ascii..];
                                src = src[first_non_ascii..];
                                break :ascii_run;
                            }

                            dest[0..word_size].* = @as([word_size]u8, @bitCast(word));
                            src = src[word_size..];
                            dest = dest[word_size..];
                        }

                        if (comptime ascii_vector_size >= 16) {
                            // Next 8 bytes of the vector.
                            const word = @as(Word, @bitCast(src[0..word_size].*));
                            // https://dotat.at/@/2022-06-27-tolower-swar.html
                            const high_bits = word & 0x8080808080808080;

                            if (high_bits > 0) {
                                const first_non_ascii = @ctz(high_bits) / 8;
                                if (comptime Environment.allow_assert) std.debug.assert(src[first_non_ascii] >= 127);

                                dest[0..word_size].* = @as([word_size]u8, @bitCast(word));
                                dest = dest[first_non_ascii..];
                                src = src[first_non_ascii..];
                                break :ascii_run;
                            }
                        }
                    }
                    unreachable;
                }

                // Whole vector is ASCII: copy it as-is.
                dest[0..ascii_vector_size].* = @as([ascii_vector_size]u8, @bitCast(vec))[0..ascii_vector_size].*;
                src = src[ascii_vector_size..];
                dest = dest[ascii_vector_size..];
            }

            while (src.len >= 8) {
                const word = @as(Word, @bitCast(src[0..word_size].*));
                // https://dotat.at/@/2022-06-27-tolower-swar.html
                const high_bits = word & 0x8080808080808080;

                if (high_bits > 0) {
                    const first_non_ascii = @ctz(high_bits) / 8;
                    if (comptime Environment.allow_assert) std.debug.assert(src[first_non_ascii] >= 127);

                    dest[0..word_size].* = @as([word_size]u8, @bitCast(word));
                    src = src[first_non_ascii..];
                    dest = dest[first_non_ascii..];
                    break :ascii_run;
                }

                dest[0..word_size].* = @as([word_size]u8, @bitCast(word));
                src = src[word_size..];
                dest = dest[word_size..];
            }

            {
                if (comptime Environment.allow_assert) std.debug.assert(src.len < 8);
                const src_end = src.ptr + src.len;
                while (src.ptr != src_end and src[0] < 128) {
                    dest[0] = src[0];
                    dest = dest[1..];
                    src = src[1..];
                }
            }
        }

        // Expand the run of non-ASCII bytes that follows, two output bytes each.
        while (src.len > 0 and src[0] > 127) {
            written = @intFromPtr(dest.ptr) - @intFromPtr(out.items.ptr);
            out.items.len = written;
            try out.ensureUnusedCapacity(2 + src.len);
            // The list may have moved; re-derive the write window from it.
            dest = out.items.ptr[written..out.capacity];
            dest[0..2].* = latin1ToCodepointBytesAssumeNotASCII(src[0]);
            src = src[1..];
            dest = dest[2..];
        }

        written = @intFromPtr(dest.ptr) - @intFromPtr(out.items.ptr);
        out.items.len = written;
    }

    log("Latin1 {d} -> UTF8 {d}", .{ latin1_.len, written });

    return out;
}
|
||
|
||
/// Outcome of decoding a single code point from encoded text.
pub const UTF16Replacement = struct {
    /// Decoded code point; defaults to `unicode_replacement`.
    code_point: u32 = unicode_replacement,
    /// Number of input elements the decode accounted for; defaults to 0.
    len: u3 = 0,

    /// Explicit failure flag. It separates a genuine decoding error from
    /// input that simply already contained the Unicode replacement code point.
    fail: bool = false,

    /// Number of bytes needed to store `code_point` as UTF-8 (1 through 4).
    pub inline fn utf8Width(replacement: UTF16Replacement) usize {
        const cp = replacement.code_point;
        if (cp <= 0x7F) return 1;
        if (cp <= 0x7FF) return 2;
        if (cp <= 0xFFFF) return 3;
        return 4;
    }
};
|
||
|
||
// This variation matches WebKit behavior.
/// Decodes the multi-byte UTF-8 sequence that begins at `sequence[0]`.
///
/// The expected sequence length is taken from
/// `wtf8ByteSequenceLengthWithInvalid(sequence[0])`. On success the result
/// holds the decoded `code_point` and the sequence length in `len`. When a
/// byte is out of range, `fail` is set, `code_point` keeps its default, and
/// `len` is the number of bytes that were accepted before the bad one
/// (1, 2 or 3). Any length other than 2, 3 or 4 is reported as a failure of
/// length 1.
pub fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement {
    const lead = sequence[0];
    if (comptime Environment.allow_assert) std.debug.assert(lead > 127);

    const len = wtf8ByteSequenceLengthWithInvalid(lead);
    switch (len) {
        2 => {
            if (comptime Environment.allow_assert) {
                std.debug.assert(lead >= 0xC0);
                std.debug.assert(lead <= 0xDF);
            }

            if (sequence[1] < 0x80 or sequence[1] > 0xBF) {
                return .{ .len = 1, .fail = true };
            }

            const combined = (@as(u32, lead) << 6) + @as(u32, sequence[1]);
            return .{ .len = len, .code_point = combined - 0x00003080 };
        },
        3 => {
            if (comptime Environment.allow_assert) {
                std.debug.assert(lead >= 0xE0);
                std.debug.assert(lead <= 0xEF);
            }

            // The accepted range of the second byte depends on the lead byte:
            // 0xE0 requires at least 0xA0, and 0xED allows at most 0x9F.
            const lowest: u8 = if (lead == 0xE0) 0xA0 else 0x80;
            const highest: u8 = if (lead == 0xED) 0x9F else 0xBF;
            if (sequence[1] < lowest or sequence[1] > highest) {
                return .{ .len = 1, .fail = true };
            }

            if (sequence[2] < 0x80 or sequence[2] > 0xBF) {
                return .{ .len = 2, .fail = true };
            }

            const combined = (@as(u32, lead) << 12) +
                (@as(u32, sequence[1]) << 6) +
                @as(u32, sequence[2]);
            return .{ .len = len, .code_point = combined - 0x000E2080 };
        },
        4 => {
            // invalid code point
            // this used to be an assertion
            if (lead < 0xF0 or lead > 0xF4) {
                return UTF16Replacement{ .len = 1, .fail = true };
            }

            // 0xF0 requires a second byte of at least 0x90, and 0xF4 allows
            // at most 0x8F.
            const lowest: u8 = if (lead == 0xF0) 0x90 else 0x80;
            const highest: u8 = if (lead == 0xF4) 0x8F else 0xBF;
            if (sequence[1] < lowest or sequence[1] > highest) {
                return .{ .len = 1, .fail = true };
            }

            if (sequence[2] < 0x80 or sequence[2] > 0xBF) {
                return .{ .len = 2, .fail = true };
            }

            if (sequence[3] < 0x80 or sequence[3] > 0xBF) {
                return .{ .len = 3, .fail = true };
            }

            const combined = (@as(u32, lead) << 18) +
                (@as(u32, sequence[1]) << 12) +
                (@as(u32, sequence[2]) << 6) +
                @as(u32, sequence[3]);
            return .{ .len = 4, .code_point = combined - 0x03C82080 };
        },
        // invalid unicode sequence
        // 1 or 0 are both invalid here
        else => return UTF16Replacement{ .len = 1, .fail = true },
    }
}
|
||
|
||
/// Copies Latin-1 input into `buf_` as UTF-8.
///
/// Forwards to `copyLatin1IntoUTF8StopOnNonASCII` with `stop` disabled and
/// returns its result unchanged.
pub fn copyLatin1IntoUTF8(buf_: []u8, comptime Type: type, latin1_: Type) EncodeIntoResult {
    const stop_on_non_ascii = false;
    return copyLatin1IntoUTF8StopOnNonASCII(buf_, Type, latin1_, stop_on_non_ascii);
}
|
||
|
||
/// Copies Latin-1 input from `latin1_` into `buf_` as UTF-8.
///
/// Bytes up to 127 are copied unchanged. A byte above 127 is replaced by the
/// two bytes produced by `latin1ToCodepointBytesAssumeNotASCII`. Copying ends
/// when the input is exhausted or the output cannot hold the next character.
///
/// When `stop` is true the function does not encode non-ASCII input: on
/// finding such a byte it returns a result whose `read` and `written` are both
/// `maxInt(u32)`. (If fewer than two output bytes are left when the byte is
/// reached in the final step, the ordinary counts are returned instead.)
///
/// When `bun.FeatureFlags.latin1_is_now_ascii` is set, the input is copied
/// verbatim, limited by the shorter of the two lengths.
///
/// Returns the number of bytes stored in `buf_` (`written`) and consumed from
/// `latin1_` (`read`), each truncated to `u32`. The bulk copy steps may store
/// raw input bytes in `buf_` beyond `written`; they never write outside `buf_`.
pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_: Type, comptime stop: bool) EncodeIntoResult {
    if (comptime bun.FeatureFlags.latin1_is_now_ascii) {
        const to_copy = @as(u32, @truncate(@min(buf_.len, latin1_.len)));
        @memcpy(buf_[0..to_copy], latin1_[0..to_copy]);

        return .{ .written = to_copy, .read = to_copy };
    }

    const Int = u64;
    const size = @sizeOf(Int);
    // One bit per byte: set exactly where that byte is above 127.
    // https://dotat.at/@/2022-06-27-tolower-swar.html
    const high_bits: Int = 0x8080808080808080;
    // Sentinel returned when `stop` is set and non-ASCII input is found.
    const stopped: EncodeIntoResult = .{ .written = std.math.maxInt(u32), .read = std.math.maxInt(u32) };

    var buf = buf_;
    var latin1 = latin1_;

    log("latin1 encode {d} -> {d}", .{ buf.len, latin1.len });

    while (buf.len > 0 and latin1.len > 0) {
        // The `inner` block copies as much leading ASCII as possible and is
        // left as soon as a non-ASCII byte sits at `latin1[0]`.
        inner: {
            var remaining_runs = @min(buf.len, latin1.len) / ascii_vector_size;
            while (remaining_runs > 0) : (remaining_runs -= 1) {
                const vec: AsciiVector = latin1[0..ascii_vector_size].*;

                if (@reduce(.Max, vec) > 127) {
                    if (comptime stop) return stopped;

                    // zig or LLVM doesn't do @ctz nicely with SIMD
                    if (comptime ascii_vector_size >= 8) {
                        {
                            const bytes = @as(Int, @bitCast(latin1[0..size].*));
                            const mask = bytes & high_bits;

                            buf[0..size].* = @as([size]u8, @bitCast(bytes));

                            if (mask > 0) {
                                const first_set_byte = @ctz(mask) / 8;
                                // A set high bit means the byte is strictly above 127.
                                if (comptime Environment.allow_assert) std.debug.assert(latin1[first_set_byte] > 127);

                                buf = buf[first_set_byte..];
                                latin1 = latin1[first_set_byte..];
                                break :inner;
                            }

                            latin1 = latin1[size..];
                            buf = buf[size..];
                        }

                        if (comptime ascii_vector_size >= 16) {
                            const bytes = @as(Int, @bitCast(latin1[0..size].*));
                            const mask = bytes & high_bits;

                            buf[0..size].* = @as([size]u8, @bitCast(bytes));

                            // The first half was clean, so the non-ASCII byte
                            // reported by the vector check must be in this half.
                            if (comptime Environment.allow_assert) std.debug.assert(mask > 0);
                            const first_set_byte = @ctz(mask) / 8;
                            if (comptime Environment.allow_assert) std.debug.assert(latin1[first_set_byte] > 127);

                            buf = buf[first_set_byte..];
                            latin1 = latin1[first_set_byte..];
                            break :inner;
                        }
                    }
                    unreachable;
                }

                buf[0..ascii_vector_size].* = @as([ascii_vector_size]u8, @bitCast(vec))[0..ascii_vector_size].*;
                latin1 = latin1[ascii_vector_size..];
                buf = buf[ascii_vector_size..];
            }

            // Eight bytes at a time while both sides have room.
            while (@min(buf.len, latin1.len) >= size) {
                const bytes = @as(Int, @bitCast(latin1[0..size].*));
                buf[0..size].* = @as([size]u8, @bitCast(bytes));

                const mask = bytes & high_bits;

                if (mask > 0) {
                    const first_set_byte = @ctz(mask) / 8;
                    if (comptime stop) return stopped;
                    if (comptime Environment.allow_assert) std.debug.assert(latin1[first_set_byte] > 127);

                    buf = buf[first_set_byte..];
                    latin1 = latin1[first_set_byte..];

                    break :inner;
                }

                latin1 = latin1[size..];
                buf = buf[size..];
            }

            // Fewer than eight bytes are left on at least one side: copy ASCII
            // one byte at a time.
            {
                const end = latin1.ptr + @min(buf.len, latin1.len);
                if (comptime Environment.allow_assert) std.debug.assert(@intFromPtr(latin1.ptr + 8) > @intFromPtr(end));
                const start_ptr = @intFromPtr(buf.ptr);
                const start_ptr_latin1 = @intFromPtr(latin1.ptr);

                while (latin1.ptr != end and latin1.ptr[0] <= 127) {
                    buf.ptr[0] = latin1.ptr[0];
                    buf.ptr += 1;
                    latin1.ptr += 1;
                }

                // Only the pointers moved above; bring the lengths in line.
                buf.len -= @intFromPtr(buf.ptr) - start_ptr;
                latin1.len -= @intFromPtr(latin1.ptr) - start_ptr_latin1;
            }
        }

        if (latin1.len > 0) {
            if (buf.len >= 2) {
                if (comptime stop) return stopped;

                buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]);
                latin1 = latin1[1..];
                buf = buf[2..];
            } else {
                // No room for a two-byte character.
                break;
            }
        }
    }

    return .{
        .written = @as(u32, @truncate(buf_.len - buf.len)),
        .read = @as(u32, @truncate(latin1_.len - latin1.len)),
    };
}
|
||
|
||
/// Rewrites every non-ASCII byte of `buf_` in place as two UTF-8 bytes.
///
/// The two bytes from `latin1ToCodepointBytesAssumeNotASCII` are stored at the
/// position of the non-ASCII byte, so the byte that follows it is overwritten.
/// Scanning then resumes after those two bytes.
///
/// NOTE(review): the two-byte slice is taken even when the non-ASCII byte is
/// the last byte of `buf_` — confirm callers never pass such input.
pub fn replaceLatin1WithUTF8(buf_: []u8) void {
    var rest = buf_;
    while (true) {
        const at = strings.firstNonASCII(rest) orelse return;
        const encoded = latin1ToCodepointBytesAssumeNotASCII(rest[at]);
        rest[at..][0..2].* = encoded;
        rest = rest[at + 2 ..];
    }
}
|
||
|
||
/// Returns the number of bytes `latin1_` occupies once converted to UTF-8:
/// one per element, plus one extra for every element above 127.
pub fn elementLengthLatin1IntoUTF8(comptime Type: type, latin1_: Type) usize {
    // https://zig.godbolt.org/z/zzYexPPs9

    var rest = latin1_;
    var non_ascii: usize = 0;

    // This is about 30% faster on large input compared to auto-vectorization
    if (comptime Environment.enableSIMD) {
        const vectorized_len = rest.len - (rest.len % ascii_vector_size);
        const stop_at = rest.ptr + vectorized_len;

        while (rest.ptr != stop_at) : (rest = rest[ascii_vector_size..]) {
            const chunk: AsciiVector = rest[0..ascii_vector_size].*;

            // Shifting an unsigned 8 bit integer to the right by 7 bits always produces a value of 0 or 1.
            const top_bit = chunk >> @as(AsciiVector, @splat(@as(u8, 7)));

            // Anding that value rather than converting it into a vector of u1 produces better code from LLVM.
            const flags: AsciiVector = top_bit & @as(AsciiVector, @splat(@as(u8, 1)));

            non_ascii += @as(usize, @reduce(.Add, flags));
        }

        // an important hint to the compiler to not auto-vectorize the loop below
        if (rest.len >= ascii_vector_size) unreachable;
    }

    for (rest) |byte| {
        non_ascii += @as(usize, @intFromBool(byte > 127));
    }

    // each non-ascii latin1 character becomes 2 UTF8 characters
    return latin1_.len + non_ascii;
}
|
||
|
||
const JSC = @import("root").bun.JSC;
|
||
|
||
/// Copies Latin-1 input from `latin1_` into the UTF-16 buffer `buf_`.
///
/// Runs of ASCII are widened with `copyU8IntoU16` (or its alignment-aware
/// variant when `Buffer` is not aligned like `u16`). Each non-ASCII element
/// is converted with `latin1ToCodepointBytesAssumeNotASCII16`. Every input
/// element yields exactly one output element, and copying stops when either
/// side runs out.
///
/// Returns the number of input elements consumed (`read`) and output elements
/// stored (`written`), each truncated to `u32`.
pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult {
    var buf = buf_;
    var latin1 = latin1_;
    while (buf.len > 0 and latin1.len > 0) {
        // Length of the leading ASCII run, limited to the space left in `buf`
        // so that a long run can never be copied past the end of the output.
        const ascii_run: usize = strings.firstNonASCII(latin1) orelse latin1.len;
        const to_write = @min(ascii_run, buf.len);

        if (comptime std.meta.alignment(Buffer) != @alignOf(u16)) {
            strings.copyU8IntoU16WithAlignment(std.meta.alignment(Buffer), buf, latin1[0..to_write]);
        } else {
            strings.copyU8IntoU16(buf, latin1[0..to_write]);
        }

        latin1 = latin1[to_write..];
        buf = buf[to_write..];

        // If output space is left, the ASCII run was copied in full, so the
        // next input element (if any) is the non-ASCII one that ended it.
        if (latin1.len > 0 and buf.len >= 1) {
            buf[0] = latin1ToCodepointBytesAssumeNotASCII16(latin1[0]);
            latin1 = latin1[1..];
            buf = buf[1..];
        }
    }

    return .{
        .written = @as(u32, @truncate(buf_.len - buf.len)),
        .read = @as(u32, @truncate(latin1_.len - latin1.len)),
    };
}
|
||
|
||
/// Computes an element count for `latin1_`.
///
/// Every ASCII element counts as 1. Every non-ASCII element counts as 2 when
/// the element type of `Type` is `u8`, and as 1 otherwise.
///
/// NOTE(review): the early return compares `Type` with the child type of
/// `[]const u16` (that is, `u16`), so it only triggers when `Type` itself is
/// `u16` — confirm this is the intended condition.
pub fn elementLengthLatin1IntoUTF16(comptime Type: type, latin1_: Type) usize {
    // latin1 is always at most 1 UTF-16 code unit long
    if (comptime std.meta.Child([]const u16) == Type) {
        return latin1_.len;
    }

    const is_bytes = comptime std.meta.Child(Type) == u8;
    const findNonASCII = comptime if (is_bytes) strings.firstNonASCIIWithType else strings.firstNonASCII16;

    var total: usize = 0;
    var rest = latin1_;
    while (rest.len > 0) {
        // Count the ASCII run in front, then the single element that ended it.
        const ascii_run = findNonASCII(Type, rest) orelse @as(u32, @truncate(rest.len));
        total += ascii_run;
        rest = rest[ascii_run..];
        if (rest.len == 0) break;

        total += comptime if (is_bytes) 2 else 1;
        rest = rest[1..];
    }

    return total;
}
|
||
|
||
/// Escapes the HTML-significant characters `"`, `&`, `'`, `<` and `>` in a
/// Latin-1 string. All other bytes are copied unchanged.
///
/// Returns:
/// - `.original` when nothing needs escaping; no allocation is made and the
///   caller keeps using `latin1`,
/// - `.static` for an empty input, or a single character whose replacement is
///   a string literal,
/// - `.allocated` otherwise, holding a buffer obtained from `allocator` that
///   the caller owns.
///
/// Allocation failures are returned as errors.
pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) !Escaped(u8) {
    const Scalar = struct {
        /// Length of the longest replacement text.
        const max_entity_len = "&quot;".len;

        /// Replacement text for `char`, or null when it is emitted unchanged.
        inline fn entity(char: u8) ?[]const u8 {
            return switch (char) {
                '"' => "&quot;",
                '&' => "&amp;",
                '\'' => "&#x27;", // modified from escape-html; used to be '&#39'
                '<' => "&lt;",
                '>' => "&gt;",
                else => null,
            };
        }

        /// Output length of every possible byte value. The table has one
        /// entry per `u8` value so that byte 0xFF is a valid index.
        pub const lengths: [std.math.maxInt(u8) + 1]u4 = brk: {
            var values: [std.math.maxInt(u8) + 1]u4 = undefined;
            for (&values, 0..) |*slot, char| {
                slot.* = switch (char) {
                    '"' => "&quot;".len,
                    '&' => "&amp;".len,
                    '\'' => "&#x27;".len,
                    '<' => "&lt;".len,
                    '>' => "&gt;".len,
                    else => 1,
                };
            }

            break :brk values;
        };

        /// Writes the escaped form of `char` at `buf` and returns the number
        /// of bytes written. The caller guarantees there is enough room.
        pub inline fn append(buf: [*]u8, char: u8) usize {
            if (entity(char)) |text| {
                @memcpy(buf[0..text.len], text);
                return text.len;
            }

            buf[0] = char;
            return 1;
        }

        /// Escapes a short input of comptime-known length with an exactly
        /// sized allocation, or reports `.original` when nothing changes.
        pub inline fn push(comptime len: usize, chars: *const [len]u8, allo: std.mem.Allocator) !Escaped(u8) {
            var total: usize = 0;
            inline for (0..len) |i| {
                total += lengths[chars[i]];
            }

            if (total == len) {
                return .{ .original = {} };
            }

            const output = try allo.alloc(u8, total);
            var head = output.ptr;
            inline for (0..len) |i| {
                head += append(head, chars[i]);
            }

            return Escaped(u8){ .allocated = output };
        }

        const vec_chars = "\"&'<>";
        const vecs: [vec_chars.len]AsciiVector = brk: {
            var _vecs: [vec_chars.len]AsciiVector = undefined;
            for (vec_chars, 0..) |c, i| {
                _vecs[i] = @splat(c);
            }
            break :brk _vecs;
        };

        /// True when at least one lane of `vec` is a character to escape.
        inline fn vectorNeedsEscape(vec: AsciiVector) bool {
            return @reduce(.Max, @as(AsciiVectorU1, @bitCast((vec == vecs[0]))) |
                @as(AsciiVectorU1, @bitCast((vec == vecs[1]))) |
                @as(AsciiVectorU1, @bitCast((vec == vecs[2]))) |
                @as(AsciiVectorU1, @bitCast((vec == vecs[3]))) |
                @as(AsciiVectorU1, @bitCast((vec == vecs[4])))) == 1;
        }

        /// Appends `chars` to `out`, replacing each escapable character.
        fn appendEscaped(out: *std.ArrayList(u8), chars: []const u8) !void {
            for (chars) |char| {
                if (entity(char)) |text| {
                    try out.appendSlice(text);
                } else {
                    try out.append(char);
                }
            }
        }
    };

    @setEvalBranchQuota(5000);
    switch (latin1.len) {
        0 => return Escaped(u8){ .static = "" },
        1 => return if (Scalar.entity(latin1[0])) |text|
            Escaped(u8){ .static = text }
        else
            Escaped(u8){ .original = {} },
        2 => {
            const first: []const u8 = Scalar.entity(latin1[0]) orelse @as([]const u8, latin1[0..1]);
            const second: []const u8 = Scalar.entity(latin1[1]) orelse @as([]const u8, latin1[1..2]);
            if (first.len == 1 and second.len == 1) {
                return Escaped(u8){ .original = {} };
            }

            return Escaped(u8){ .allocated = try strings.append(allocator, first, second) };
        },

        // The simd implementation is slower for inputs less than 32 bytes.
        // One specialized copy of `push` is generated per length.
        inline 3...32 => |len| return Scalar.push(len, latin1[0..len], allocator),

        else => {
            // pass #1: look for the first character that needs escaping.
            // Most strings need none, so nothing is allocated until one is found.
            var scanned: usize = 0;
            const first_escape: usize = find: {
                if (comptime Environment.enableSIMD) {
                    while (latin1.len - scanned >= ascii_vector_size) : (scanned += ascii_vector_size) {
                        const vec: AsciiVector = latin1[scanned..][0..ascii_vector_size].*;
                        if (Scalar.vectorNeedsEscape(vec)) break;
                    }
                }

                while (scanned < latin1.len) : (scanned += 1) {
                    if (Scalar.entity(latin1[scanned]) != null) break :find scanned;
                }

                return Escaped(u8){ .original = {} };
            };

            // pass #2: copy the clean prefix, then escape the rest.
            var buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + Scalar.max_entity_len);
            errdefer buf.deinit();
            buf.appendSliceAssumeCapacity(latin1[0..first_escape]);

            var remaining = latin1[first_escape..];
            if (comptime Environment.enableSIMD) {
                while (remaining.len >= ascii_vector_size) : (remaining = remaining[ascii_vector_size..]) {
                    const chunk = remaining[0..ascii_vector_size];
                    const vec: AsciiVector = chunk.*;
                    if (Scalar.vectorNeedsEscape(vec)) {
                        try Scalar.appendEscaped(&buf, chunk);
                    } else {
                        // Nothing to escape in this chunk: copy it in one go.
                        try buf.appendSlice(chunk);
                    }
                }
            }

            try Scalar.appendEscaped(&buf, remaining);

            return Escaped(u8){ .allocated = try buf.toOwnedSlice() };
        },
    }
}
|
||
|
||
/// Result type of the HTML escaping functions for strings of `T` elements.
fn Escaped(comptime T: type) type {
    return union(enum) {
        /// The result is a byte string that was not allocated for this call.
        static: []const u8,
        /// The input needs no escaping and can be used as is.
        original,
        /// The escaped text, stored in a slice of `T`.
        allocated: []T,
    };
}
|
||
|
||
/// Escapes the HTML-significant characters `"`, `&`, `'`, `<` and `>` in a
/// UTF-16 string. Every other code unit, including both halves of a surrogate
/// pair, is copied unchanged.
///
/// Returns:
/// - `.original` when nothing needs escaping; no allocation is made and the
///   caller keeps using `utf16`,
/// - `.static` for an empty input, or a single code unit whose replacement is
///   a string literal (note that `.static` holds bytes, not UTF-16),
/// - `.allocated` otherwise, holding a buffer obtained from `allocator` that
///   the caller owns.
///
/// Allocation failures are returned as errors.
pub fn escapeHTMLForUTF16Input(allocator: std.mem.Allocator, utf16: []const u16) !Escaped(u16) {
    const Scalar = struct {
        /// Length of the longest replacement text.
        const max_entity_len = "&quot;".len;

        /// Replacement text for `unit` as bytes, or null when it is emitted
        /// unchanged.
        inline fn entity(unit: u16) ?[]const u8 {
            return switch (unit) {
                '"' => "&quot;",
                '&' => "&amp;",
                '\'' => "&#x27;", // modified from escape-html; used to be '&#39'
                '<' => "&lt;",
                '>' => "&gt;",
                else => null,
            };
        }

        /// Same replacement text as `entity`, as UTF-16 code units.
        inline fn entity16(unit: u16) ?[]const u16 {
            return switch (unit) {
                '"' => toUTF16Literal("&quot;"),
                '&' => toUTF16Literal("&amp;"),
                '\'' => toUTF16Literal("&#x27;"),
                '<' => toUTF16Literal("&lt;"),
                '>' => toUTF16Literal("&gt;"),
                else => null,
            };
        }

        const vec_chars = "\"&'<>";
        const vecs: [vec_chars.len]AsciiU16Vector = brk: {
            var _vecs: [vec_chars.len]AsciiU16Vector = undefined;
            for (vec_chars, 0..) |c, i| {
                _vecs[i] = @splat(@as(u16, c));
            }
            break :brk _vecs;
        };

        /// True when at least one lane of `vec` is a character to escape.
        inline fn vectorNeedsEscape(vec: AsciiU16Vector) bool {
            return @reduce(.Max, @as(AsciiVectorU16U1, @bitCast((vec == vecs[0]))) |
                @as(AsciiVectorU16U1, @bitCast((vec == vecs[1]))) |
                @as(AsciiVectorU16U1, @bitCast((vec == vecs[2]))) |
                @as(AsciiVectorU16U1, @bitCast((vec == vecs[3]))) |
                @as(AsciiVectorU16U1, @bitCast((vec == vecs[4])))) == 1;
        }

        /// Appends `units` to `out`, replacing each escapable code unit.
        fn appendEscaped(out: *std.ArrayList(u16), units: []const u16) !void {
            for (units) |unit| {
                if (entity16(unit)) |text| {
                    try out.appendSlice(text);
                } else {
                    try out.append(unit);
                }
            }
        }
    };

    switch (utf16.len) {
        0 => return Escaped(u16){ .static = &[_]u8{} },
        1 => return if (Scalar.entity(utf16[0])) |text|
            Escaped(u16){ .static = text }
        else
            Escaped(u16){ .original = {} },
        2 => {
            const first_entity = Scalar.entity16(utf16[0]);
            const second_entity = Scalar.entity16(utf16[1]);
            if (first_entity == null and second_entity == null) {
                return Escaped(u16){ .original = {} };
            }

            const first_16: []const u16 = first_entity orelse @as([]const u16, utf16[0..1]);
            const second_16: []const u16 = second_entity orelse @as([]const u16, utf16[1..2]);

            const buf = try allocator.alloc(u16, first_16.len + second_16.len);
            @memcpy(buf[0..first_16.len], first_16);
            @memcpy(buf[first_16.len..], second_16);
            return Escaped(u16){ .allocated = buf };
        },

        else => {
            // pass #1: look for the first code unit that needs escaping.
            // Most strings need none, so nothing is allocated until one is found.
            var scanned: usize = 0;
            const first_escape: usize = find: {
                if (comptime Environment.enableSIMD) {
                    while (utf16.len - scanned >= ascii_u16_vector_size) : (scanned += ascii_u16_vector_size) {
                        const vec: AsciiU16Vector = utf16[scanned..][0..ascii_u16_vector_size].*;
                        if (Scalar.vectorNeedsEscape(vec)) break;
                    }
                }

                // All indexing stays inside `utf16`, so a trailing non-ASCII
                // code unit never causes a read past the end of the input.
                while (scanned < utf16.len) : (scanned += 1) {
                    if (Scalar.entity(utf16[scanned]) != null) break :find scanned;
                }

                return Escaped(u16){ .original = {} };
            };

            // pass #2: copy the clean prefix, then escape the rest.
            var buf = try std.ArrayList(u16).initCapacity(allocator, utf16.len + Scalar.max_entity_len);
            errdefer buf.deinit();
            buf.appendSliceAssumeCapacity(utf16[0..first_escape]);

            var remaining = utf16[first_escape..];
            if (comptime Environment.enableSIMD) {
                while (remaining.len >= ascii_u16_vector_size) : (remaining = remaining[ascii_u16_vector_size..]) {
                    const chunk = remaining[0..ascii_u16_vector_size];
                    const vec: AsciiU16Vector = chunk.*;
                    if (Scalar.vectorNeedsEscape(vec)) {
                        try Scalar.appendEscaped(&buf, chunk);
                    } else {
                        // Nothing to escape in this chunk: copy it in one go.
                        try buf.appendSlice(chunk);
                    }
                }
            }

            try Scalar.appendEscaped(&buf, remaining);

            return Escaped(u16){ .allocated = try buf.toOwnedSlice() };
        },
    }
}
|
||
|
||
test "copyLatin1IntoUTF8 - ascii" {
    // Pure-ASCII input must come out unchanged: one byte written per byte read.
    const ascii_text: string = "hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!";
    var out_buf = std.mem.zeroes([500]u8);

    const copied = copyLatin1IntoUTF8(&out_buf, string, ascii_text);

    try std.testing.expectEqual(ascii_text.len, copied.read);
    try std.testing.expectEqual(ascii_text.len, copied.written);
    try std.testing.expectEqualSlices(u8, ascii_text, out_buf[0..copied.written]);
}
|
||
|
||
test "copyLatin1IntoUTF8 - latin1" {
    // Each case pairs raw Latin-1 bytes (169 is the copyright sign) with the
    // UTF-8 text the conversion is expected to produce.
    const Case = struct {
        latin1: string,
        utf8: string,
    };
    const cases = [_]Case{
        .{
            .latin1 = &[_]u8{ 104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100, 32, 169 },
            .utf8 = "hello world ©",
        },
        .{
            .latin1 = &[_]u8{ 72, 169, 101, 108, 108, 169, 111, 32, 87, 111, 114, 169, 108, 100, 33 },
            .utf8 = "H©ell©o Wor©ld!",
        },
    };

    for (cases) |case| {
        var out_buf = std.mem.zeroes([500]u8);
        const copied = copyLatin1IntoUTF8(&out_buf, string, case.latin1);

        // Every input byte is consumed; the output is longer because each
        // non-ASCII byte expands to a multi-byte UTF-8 sequence.
        try std.testing.expectEqual(case.latin1.len, copied.read);
        try std.testing.expectEqualSlices(u8, case.utf8, out_buf[0..copied.written]);
    }
}
|
||
|
||
/// Looks `char` up in `latin1_to_utf16_conversion_table` (via
/// `latin1ToCodepointBytesAssumeNotASCII16`) and returns the resulting 16-bit
/// value cast to `CodePointType`.
///
/// `CodePointType` must be wide enough to hold the table value; the cast is an
/// `@intCast`, so a type that is too narrow is a safety-checked error.
pub fn latin1ToCodepointAssumeNotASCII(char: u8, comptime CodePointType: type) CodePointType {
    const code_unit = latin1ToCodepointBytesAssumeNotASCII16(char);
    return @as(CodePointType, @intCast(code_unit));
}
|
||
|
||
/// Maps each single-byte input value to a UTF-16 code unit.
///
/// Every byte maps to itself except the 0x80-0x9F range, which is remapped
/// through the override row below (the values match the Windows-1252 layout,
/// e.g. 0x80 -> U+20AC; slots with no assignment there stay identity-mapped).
const latin1_to_utf16_conversion_table: [256]u16 = blk: {
    // Replacement values for bytes 0x80..0x9F, in order.
    const c1_overrides = [32]u16{
        0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
        0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
        0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
        0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
    };

    var table: [256]u16 = undefined;
    for (&table, 0..) |*entry, byte| {
        entry.* = if (byte >= 0x80 and byte <= 0x9F)
            c1_overrides[byte - 0x80]
        else
            @as(u16, @intCast(byte));
    }
    break :blk table;
};
|
||
|
||
/// Encodes `char` with `encodeWTF8Rune` and returns the first two bytes of the
/// encoding. The scratch buffer is zero-initialized, so any byte the encoder
/// does not write comes back as 0.
pub fn latin1ToCodepointBytesAssumeNotASCII(char: u32) [2]u8 {
    var encoded = std.mem.zeroes([4]u8);
    _ = encodeWTF8Rune(&encoded, @as(i32, @intCast(char)));
    return .{ encoded[0], encoded[1] };
}
|
||
|
||
/// Returns the `latin1_to_utf16_conversion_table` entry for the low 8 bits of
/// `char`; higher bits of `char` are discarded by the truncation.
pub fn latin1ToCodepointBytesAssumeNotASCII16(char: u32) u16 {
    const table_index = @as(u8, @truncate(char));
    return latin1_to_utf16_conversion_table[table_index];
}
|
||
|
||
/// Converts `utf16` into UTF-8 bytes written to `buf`, returning how many
/// UTF-16 code units were read and how many bytes were written.
///
/// For `[]const u16` input with simdutf enabled, the input is first passed
/// through `bun.simdutf.trim.utf16` (presumably dropping an incomplete trailing
/// surrogate -- confirm against the simdutf bindings) and an output-length hint
/// is computed for `copyUTF16IntoUTF8WithBuffer`. Every other configuration
/// forwards the input untouched with `utf16.len` as the hint.
pub fn copyUTF16IntoUTF8(buf: []u8, comptime Type: type, utf16: Type, comptime allow_partial_write: bool) EncodeIntoResult {
    if (comptime Type == []const u16) {
        if (bun.FeatureFlags.use_simdutf) {
            const nothing_copied: EncodeIntoResult = .{ .read = 0, .written = 0 };

            if (utf16.len == 0)
                return nothing_copied;

            const trimmed = bun.simdutf.trim.utf16(utf16);
            if (trimmed.len == 0)
                return nothing_copied;

            // Only pay for an exact length computation when the buffer is not
            // already larger than `trimmed.len * 3 + 2`; otherwise the buffer
            // size itself is used as the hint.
            const needs_exact_length = buf.len <= (trimmed.len * 3 + 2);
            const out_len = if (needs_exact_length)
                bun.simdutf.length.utf8.from.utf16.le(trimmed)
            else
                buf.len;

            return copyUTF16IntoUTF8WithBuffer(buf, Type, utf16, trimmed, out_len, allow_partial_write);
        }
    }

    return copyUTF16IntoUTF8WithBuffer(buf, Type, utf16, utf16, utf16.len, allow_partial_write);
}
|
||
|
||
/// Converts `utf16` into UTF-8 bytes in `buf` and reports the number of UTF-16
/// code units consumed (`read`) and bytes produced (`written`).
///
/// - `trimmed` / `out_len`: input slice and required output size used by the
///   simdutf fast path, which is only taken for `[]const u16` input when `buf`
///   holds at least `out_len` bytes. If simdutf reports a `.surrogate` error,
///   the scalar path below runs instead.
/// - `allow_partial_write`: when the next code point does not fit in what is
///   left of `buf`, write as many leading bytes of its encoding as fit (the
///   code units for it are NOT counted as read) instead of stopping short.
///
/// The scalar path alternates between copying ASCII runs and encoding one
/// non-ASCII code point obtained from `utf16CodepointWithFFFD`.
pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type, trimmed: Type, out_len: usize, comptime allow_partial_write: bool) EncodeIntoResult {
    var remaining = buf;
    var utf16_remaining = utf16;
    var ended_on_non_ascii = false;

    brk: {
        if (comptime Type == []const u16) {
            if (bun.FeatureFlags.use_simdutf) {
                log("UTF16 {d} -> UTF8 {d}", .{ utf16.len, out_len });
                if (remaining.len >= out_len) {
                    const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(trimmed, remaining);
                    // Unpaired surrogates are handled by the scalar loop.
                    if (result.status == .surrogate) break :brk;

                    return EncodeIntoResult{
                        .read = @as(u32, @truncate(trimmed.len)),
                        .written = @as(u32, @truncate(result.count)),
                    };
                }
            }
        }
    }

    while (firstNonASCII16(Type, utf16_remaining)) |i| {
        // Copy the ASCII run in front of the next non-ASCII code unit,
        // clamped to the space that is left.
        const end = @min(i, remaining.len);
        if (end > 0) copyU16IntoU8(remaining, Type, utf16_remaining[0..end]);
        remaining = remaining[end..];
        utf16_remaining = utf16_remaining[end..];

        if (@min(utf16_remaining.len, remaining.len) == 0)
            break;

        const replacement = utf16CodepointWithFFFD(Type, utf16_remaining);

        const width: usize = replacement.utf8Width();
        if (width > remaining.len) {
            ended_on_non_ascii = width > 1;
            if (comptime allow_partial_write) {
                // Encode the whole code point into a scratch buffer and copy
                // only the prefix that fits. `remaining.len` is at least 1
                // here (checked above) and smaller than `width` (<= 4).
                //
                // The previous hand-written per-byte expressions had the form
                // `0x80 | (cp >> n) & 0x3F`, which Zig parses as
                // `(0x80 | (cp >> n)) & 0x3F` and therefore dropped the 0x80
                // continuation marker from every byte after the first.
                var encoded = [4]u8{ 0, 0, 0, 0 };
                _ = encodeWTF8RuneT(&encoded, u32, @as(u32, replacement.code_point));
                @memcpy(remaining, encoded[0..remaining.len]);
                remaining = remaining[remaining.len..];
            }
            break;
        }

        utf16_remaining = utf16_remaining[replacement.len..];
        // `ptr[0..4]` sidesteps the slice bounds check: the encoder writes
        // only `width` bytes and `width <= remaining.len` at this point.
        _ = encodeWTF8RuneT(remaining.ptr[0..4], u32, @as(u32, replacement.code_point));
        remaining = remaining[width..];
    }

    // Whatever is left after the last non-ASCII code unit is pure ASCII
    // (unless we stopped on a code point that did not fit).
    if (remaining.len > 0 and !ended_on_non_ascii and utf16_remaining.len > 0) {
        const len = @min(remaining.len, utf16_remaining.len);
        copyU16IntoU8(remaining[0..len], Type, utf16_remaining[0..len]);
        utf16_remaining = utf16_remaining[len..];
        remaining = remaining[len..];
    }

    return .{
        .read = @as(u32, @truncate(utf16.len - utf16_remaining.len)),
        .written = @as(u32, @truncate(buf.len - remaining.len)),
    };
}
|
||
|
||
/// Returns the number of UTF-8 bytes needed to encode `utf16`.
///
/// Delegates to simdutf when it is enabled. Otherwise walks the input: ASCII
/// code units count as one byte each, and every non-ASCII code point adds the
/// width reported by `utf16Codepoint(...).utf8Width()`.
pub fn elementLengthUTF16IntoUTF8(comptime Type: type, utf16: Type) usize {
    if (bun.FeatureFlags.use_simdutf) {
        return bun.simdutf.length.utf8.from.utf16.le(utf16);
    }

    var rest = utf16;
    var total: usize = 0;

    while (firstNonASCII16(Type, rest)) |ascii_run| {
        // `ascii_run` ASCII units precede the non-ASCII one.
        total += ascii_run;
        rest = rest[ascii_run..];

        const decoded = utf16Codepoint(Type, rest);
        total += decoded.utf8Width();
        rest = rest[decoded.len..];
    }

    // No non-ASCII units remain, so the tail is one byte per unit.
    return total + rest.len;
}
|
||
|
||
/// Returns the number of UTF-16 code units needed to represent `utf8`.
///
/// Delegates to simdutf when it is enabled. The fallback counts one unit per
/// ASCII byte, one unit per 2- or 3-byte sequence and two units (a surrogate
/// pair) per 4-byte sequence, judging sequence length from the lead byte only.
/// A byte that is not a valid lead byte is counted as a single unit.
///
/// The fallback previously decoded the UTF-8 bytes with `utf16Codepoint`,
/// which interprets its input as UTF-16 and so mis-stepped through 3- and
/// 4-byte sequences.
pub fn elementLengthUTF8IntoUTF16(comptime Type: type, utf8: Type) usize {
    var utf8_remaining = utf8;
    var count: usize = 0;

    if (bun.FeatureFlags.use_simdutf) {
        return bun.simdutf.length.utf16.from.utf8.le(utf8);
    }

    while (firstNonASCII(utf8_remaining)) |i| {
        count += i;
        utf8_remaining = utf8_remaining[i..];

        // `utf8_remaining[0]` is >= 0x80 here, so the length is 1..4.
        const sequence_len: usize = wtf8ByteSequenceLength(utf8_remaining[0]);

        // Only code points above U+FFFF (4-byte sequences) need two units.
        count += @as(usize, if (sequence_len == 4) 2 else 1);

        // Clamp so a truncated trailing sequence cannot step past the end.
        utf8_remaining = utf8_remaining[@min(sequence_len, utf8_remaining.len)..];
    }

    return count + utf8_remaining.len;
}
|
||
|
||
/// Checks whether the UTF-16 `text` and the UTF-8 `str` encode the same
/// string, without allocating.
///
/// Each code point of `text` (surrogate pairs are combined; unpaired
/// surrogates are kept as-is) is encoded with `encodeWTF8Rune` and compared
/// against the next bytes of `str`.
pub fn utf16EqlString(text: []const u16, str: string) bool {
    if (text.len > str.len) {
        // Strings can't be equal if UTF-16 encoding is longer than UTF-8 encoding
        return false;
    }

    var temp = [4]u8{ 0, 0, 0, 0 };
    const n = text.len;
    var j: usize = 0;
    var i: usize = 0;
    while (i < n) : (i += 1) {
        // TODO: is it safe to just make this u32 or u21?
        var r1: i32 = text[i];
        if (r1 >= 0xD800 and r1 <= 0xDBFF and i + 1 < n) {
            const r2: i32 = text[i + 1];
            if (r2 >= 0xDC00 and r2 <= 0xDFFF) {
                // Combine the pair: 10 bits from each half, then add 0x10000.
                // The explicit parentheses matter. In Zig `+` binds tighter
                // than `<<` and `|`, so without them 0x10000 was added to the
                // low half *before* the OR, which loses the offset whenever
                // bit 16 of the high half is already set (U+20000 and above).
                r1 = (((r1 - 0xD800) << 10) | (r2 - 0xDC00)) + 0x10000;
                i += 1;
            }
        }

        const width = encodeWTF8Rune(&temp, r1);
        if (j + width > str.len) {
            return false;
        }
        if (!std.mem.eql(u8, temp[0..width], str[j..][0..width])) {
            return false;
        }
        j += width;
    }

    // Equal only if all of `str` was consumed as well.
    return j == str.len;
}
|
||
|
||
/// Encodes the code point `r` into `p` and returns the number of bytes written.
///
/// This is a clone of golang's "utf8.EncodeRune" that has been modified to
/// encode using WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for
/// more info.
///
/// `r` must be non-negative: it is converted with `@intCast` before being
/// handed to `encodeWTF8RuneT`.
pub fn encodeWTF8Rune(p: *[4]u8, r: i32) u3 {
    const rune = @as(u32, @intCast(r));
    return @call(.always_inline, encodeWTF8RuneT, .{ p, u32, rune });
}
|
||
|
||
/// Encodes `r` into `p` using the UTF-8 bit layout and returns the number of
/// bytes written (1 to 4). Only the first `n` bytes of `p` are touched, where
/// `n` is the return value.
///
/// No value is rejected: anything in 0x800...0xFFFF (surrogates included) is
/// written as three bytes, and every value outside 0...0xFFFF takes the
/// four-byte form.
pub fn encodeWTF8RuneT(p: *[4]u8, comptime R: type, r: R) u3 {
    // 0xxxxxxx
    if (r >= 0 and r <= 0x7F) {
        p[0] = @as(u8, @intCast(r));
        return 1;
    }

    // 110xxxxx 10xxxxxx
    if (r > 0x7F and r <= 0x7FF) {
        p[0] = @as(u8, @truncate(0xC0 | (r >> 6)));
        p[1] = @as(u8, @truncate(0x80 | (r & 0x3F)));
        return 2;
    }

    // 1110xxxx 10xxxxxx 10xxxxxx
    if (r > 0x7FF and r <= 0xFFFF) {
        p[0] = @as(u8, @truncate(0xE0 | (r >> 12)));
        p[1] = @as(u8, @truncate(0x80 | ((r >> 6) & 0x3F)));
        p[2] = @as(u8, @truncate(0x80 | (r & 0x3F)));
        return 3;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    p[0] = @as(u8, @truncate(0xF0 | (r >> 18)));
    p[1] = @as(u8, @truncate(0x80 | ((r >> 12) & 0x3F)));
    p[2] = @as(u8, @truncate(0x80 | ((r >> 6) & 0x3F)));
    p[3] = @as(u8, @truncate(0x80 | (r & 0x3F)));
    return 4;
}
|
||
|
||
/// Returns the length of the byte sequence introduced by `first_byte`, judged
/// from that byte alone.
///
/// A NUL byte yields 0. Other ASCII bytes yield 1, lead bytes yield 2, 3 or 4,
/// and any byte that is not a valid lead byte (continuation bytes, 0xF8 and
/// above) yields 1.
pub inline fn wtf8ByteSequenceLength(first_byte: u8) u3 {
    return switch (first_byte) {
        0x00 => 0,
        0x01...0x7F => 1,
        0xC0...0xDF => 2, // 110xxxxx
        0xE0...0xEF => 3, // 1110xxxx
        0xF0...0xF7 => 4, // 11110xxx
        else => 1, // 0x80...0xBF and 0xF8...0xFF
    };
}
|
||
|
||
/// Returns the length of the byte sequence introduced by `first_byte`, judged
/// from that byte alone.
///
/// NOTE(review): despite the name (and an earlier "0 == invalid" remark), this
/// function never returns 0. Bytes that are not valid lead bytes (continuation
/// bytes, 0xF8 and above) yield 1, and so does a NUL byte -- the latter being
/// the only difference from `wtf8ByteSequenceLength`.
pub inline fn wtf8ByteSequenceLengthWithInvalid(first_byte: u8) u3 {
    return switch (first_byte) {
        0x00...0x7F => 1,
        0xC0...0xDF => 2, // 110xxxxx
        0xE0...0xEF => 3, // 1110xxxx
        0xF0...0xF7 => 4, // 11110xxx
        else => 1, // 0x80...0xBF and 0xF8...0xFF
    };
}
|
||
|
||
/// Convert potentially ill-formed UTF-8 or UTF-16 bytes to a Unicode Codepoint.
/// Invalid codepoints are replaced with `zero` parameter
/// This is a clone of esbuild's decodeWTF8Rune
/// which was a clone of golang's "utf8.DecodeRune" that was modified to decode using WTF-8 instead.
/// Asserts a multi-byte codepoint
///
/// `len` is the sequence length (2, 3 or 4) and `p[0..len]` must be readable.
/// `zero` is returned when any trailing byte is not a continuation byte
/// (`10xxxxxx`), when the value is an overlong encoding for its length, or
/// when a 4-byte value exceeds 0x10FFFF.
pub inline fn decodeWTF8RuneTMultibyte(p: *const [4]u8, len: u3, comptime T: type, comptime zero: T) T {
    if (comptime Environment.allow_assert) std.debug.assert(len > 1);

    const s1 = p[1];
    if ((s1 & 0xC0) != 0x80) return zero;

    if (len == 2) {
        const cp = (@as(T, p[0] & 0x1F) << 6) | @as(T, s1 & 0x3F);
        if (cp < 0x80) return zero;
        return cp;
    }

    const s2 = p[2];
    if ((s2 & 0xC0) != 0x80) return zero;

    if (len == 3) {
        const cp = (@as(T, p[0] & 0x0F) << 12) | (@as(T, s1 & 0x3F) << 6) | (@as(T, s2 & 0x3F));
        if (cp < 0x800) return zero;
        return cp;
    }

    const s3 = p[3];
    // The fourth byte must be a continuation byte as well. This check was
    // missing, so a malformed 4-byte sequence could decode to an arbitrary
    // code point instead of `zero`.
    if ((s3 & 0xC0) != 0x80) return zero;

    const cp = (@as(T, p[0] & 0x07) << 18) | (@as(T, s1 & 0x3F) << 12) | (@as(T, s2 & 0x3F) << 6) | (@as(T, s3 & 0x3F));
    if (cp < 0x10000 or cp > 0x10FFFF) return zero;
    return cp;
}
|
||
|
||
// --- Lane counts -----------------------------------------------------------

/// Number of `u8` lanes processed per vector step (halved on Wasm).
pub const ascii_vector_size = if (Environment.isWasm) 8 else 16;
/// Number of `u16` lanes processed per vector step (halved on Wasm).
pub const ascii_u16_vector_size = if (Environment.isWasm) 4 else 8;

// --- Vector types ----------------------------------------------------------

pub const AsciiVector = @Vector(ascii_vector_size, u8);
pub const AsciiVectorSmall = @Vector(8, u8);
pub const AsciiU16Vector = @Vector(ascii_u16_vector_size, u16);

/// One-bit-per-lane vectors, used to hold per-lane comparison results.
pub const AsciiVectorU1 = @Vector(ascii_vector_size, u1);
pub const AsciiVectorU1Small = @Vector(8, u1);
pub const AsciiVectorU16U1 = @Vector(ascii_u16_vector_size, u1);

/// Unsigned integers with one bit per lane; a one-bit-per-lane vector can be
/// bit-cast to these to obtain a scannable bitmask.
pub const AsciiVectorInt = std.meta.Int(.unsigned, ascii_vector_size);
pub const AsciiVectorIntU16 = std.meta.Int(.unsigned, ascii_u16_vector_size);

// --- Splatted comparison bounds --------------------------------------------

/// Every lane is 127, the largest ASCII value.
pub const max_16_ascii: @Vector(ascii_vector_size, u8) = @splat(@as(u8, 127));
/// Every lane is 0x20 (space), the lowest non-control ASCII value.
pub const min_16_ascii: @Vector(ascii_vector_size, u8) = @splat(@as(u8, 0x20));
pub const max_u16_ascii: @Vector(ascii_u16_vector_size, u16) = @splat(@as(u16, 127));
pub const min_u16_ascii: @Vector(ascii_u16_vector_size, u16) = @splat(@as(u16, 0x20));
pub const max_4_ascii: @Vector(4, u8) = @splat(@as(u8, 127));
|
||
|
||
/// DFA state meaning "a complete, valid sequence has just ended".
const UTF8_ACCEPT: u8 = 0;
/// DFA state meaning "the input is invalid".
const UTF8_REJECT: u8 = 12;

/// Lookup table for the UTF-8 validation DFA used by `decodeCheck`.
///
/// Layout: 256 byte-to-character-class entries, followed by 9 transition rows
/// of 12 entries each. States are pre-multiplied by 12, so the next state is
/// `utf8d[256 + state + class]`.
const utf8d: [364]u8 = .{
    // Part 1: maps each byte to a character class. Working with classes
    // instead of raw bytes keeps the transition table small.
    // 0x00-0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x80-0x9F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    // 0xA0-0xBF
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    // 0xC0-0xDF
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 0xE0-0xFF
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

    // Part 2: transition table, one row per state, one column per character
    // class (0..11).
    0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // state 0 (accept)
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // state 12 (reject)
    12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, // state 24
    12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // state 36
    12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // state 48
    12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // state 60
    12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // state 72
    12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // state 84
    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // state 96
};
|
||
|
||
/// Advances the UTF-8 validation DFA by one input byte and returns the next
/// state. Only validity is tracked; the decoded code point is not computed.
///
/// Returns `UTF8_REJECT` if `state` would index past the end of `utf8d`.
pub fn decodeCheck(state: u8, byte: u8) u8 {
    const class: u32 = utf8d[byte];

    // The transition table starts right after the 256 class entries.
    const transition_index = @as(u32, 256) + state + class;
    return if (transition_index < utf8d.len)
        utf8d[transition_index]
    else
        UTF8_REJECT;
}
|
||
|
||
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

/// Validates `slice` as UTF-8 by running every byte through `decodeCheck`.
/// The input is valid when the automaton ends in the accept state, i.e. the
/// last sequence is complete and no byte was rejected along the way.
pub fn isValidUTF8WithoutSIMD(slice: []const u8) bool {
    var dfa_state: u8 = UTF8_ACCEPT;
    for (slice) |byte| {
        dfa_state = decodeCheck(dfa_state, byte);
    }
    return dfa_state == UTF8_ACCEPT;
}
|
||
|
||
/// Returns whether `slice` is valid UTF-8, using simdutf when it is enabled
/// and the table-driven `isValidUTF8WithoutSIMD` otherwise.
pub fn isValidUTF8(slice: []const u8) bool {
    return if (bun.FeatureFlags.use_simdutf)
        bun.simdutf.validate.utf8(slice)
    else
        isValidUTF8WithoutSIMD(slice);
}
|
||
|
||
/// Returns true when every byte of `slice` is <= 127 (an empty slice is ASCII).
///
/// At comptime this is a plain loop. At runtime it uses simdutf when enabled;
/// otherwise it scans in vector-sized chunks (if SIMD is enabled), then in
/// 8-byte words, then byte by byte.
pub fn isAllASCII(slice: []const u8) bool {
    if (@inComptime()) {
        for (slice) |char| {
            if (char > 127) return false;
        }
        return true;
    }

    if (bun.FeatureFlags.use_simdutf)
        return bun.simdutf.validate.ascii(slice);

    var rest = slice;

    // The NEON SIMD unit is 128-bit wide and includes 16 128-bit registers that can be used as 32 64-bit registers
    if (comptime Environment.enableSIMD) {
        while (rest.len >= ascii_vector_size) : (rest = rest[ascii_vector_size..]) {
            const vec: AsciiVector = rest[0..ascii_vector_size].*;
            if (@reduce(.Max, vec) > 127) return false;
        }
    }

    // SWAR: test the high bit of eight bytes at once.
    // https://dotat.at/@/2022-06-27-tolower-swar.html
    const Word = u64;
    const word_size = @sizeOf(Word);
    while (rest.len >= word_size) : (rest = rest[word_size..]) {
        const word = @as(Word, @bitCast(rest[0..word_size].*));
        if ((word & 0x8080808080808080) != 0) return false;
    }

    // Fewer than eight bytes are left.
    for (rest) |char| {
        if (char > 127) return false;
    }

    return true;
}
|
||
|
||
//#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)

/// Returns the lead (high) surrogate for a supplementary code point
/// (0x10000...0x10FFFF). The `@intCast` fails a safety check if the result
/// does not fit in 16 bits.
pub inline fn u16Lead(supplementary: anytype) u16 {
    const lead = (supplementary >> 10) + 0xd7c0;
    return @as(u16, @intCast(lead));
}
|
||
|
||
//#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)

/// Returns the trail (low) surrogate for a supplementary code point: its low
/// ten bits combined with 0xDC00.
pub inline fn u16Trail(supplementary: anytype) u16 {
    const trail = (supplementary & 0x3ff) | 0xdc00;
    return @as(u16, @intCast(trail));
}
|
||
|
||
/// Returns the index of the first byte in `slice` that is greater than 127,
/// or null when every byte is ASCII. Convenience wrapper around
/// `firstNonASCIIWithType` for `[]const u8`.
pub fn firstNonASCII(slice: []const u8) ?u32 {
    const ByteSlice = []const u8;
    return firstNonASCIIWithType(ByteSlice, slice);
}
|
||
|
||
/// Returns the index of the first byte in `slice` that is greater than 127,
/// or null when there is none.
///
/// With simdutf enabled the answer comes straight from its ASCII validator.
/// Otherwise the input is scanned in three stages: vector-sized chunks (when
/// SIMD is enabled), 8-byte words, and finally single bytes. The word stages
/// locate the offending byte with `@ctz(mask) / 8`, which assumes the lowest
/// address maps to the least significant byte of the word.
pub fn firstNonASCIIWithType(comptime Type: type, slice: Type) ?u32 {
    const Word = u64;
    const word_size = @sizeOf(Word);
    // One bit per byte lane; a set bit means that byte is >= 0x80.
    // https://dotat.at/@/2022-06-27-tolower-swar.html
    const high_bit_mask: Word = 0x8080808080808080;

    var remaining = slice;

    if (comptime bun.FeatureFlags.use_simdutf) {
        const result = bun.simdutf.validate.with_errors.ascii(slice);
        if (result.status == .success) return null;
        return @as(u32, @truncate(result.count));
    }

    if (comptime Environment.enableSIMD) {
        if (remaining.len >= ascii_vector_size) {
            const vector_start = remaining.ptr;
            const vector_end = remaining.ptr + remaining.len - (remaining.len % ascii_vector_size);

            while (remaining.ptr != vector_end) {
                const vec: AsciiVector = remaining[0..ascii_vector_size].*;

                if (@reduce(.Max, vec) > 127) {
                    // Bring `len` back in sync with the pointer that has been
                    // advanced on its own so far.
                    remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(vector_start);

                    // The hit is somewhere in this chunk: check its first
                    // word, then its second.
                    {
                        const word = @as(Word, @bitCast(remaining[0..word_size].*));
                        const high_bits = word & high_bit_mask;

                        if (high_bits > 0) {
                            const first_set_byte = @ctz(high_bits) / 8;
                            if (comptime Environment.allow_assert) {
                                std.debug.assert(remaining[first_set_byte] > 127);
                                for (0..first_set_byte) |j| {
                                    std.debug.assert(remaining[j] <= 127);
                                }
                            }

                            return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len));
                        }
                        remaining = remaining[word_size..];
                    }
                    {
                        const word = @as(Word, @bitCast(remaining[0..word_size].*));
                        const high_bits = word & high_bit_mask;

                        if (high_bits > 0) {
                            const first_set_byte = @ctz(high_bits) / 8;
                            if (comptime Environment.allow_assert) {
                                std.debug.assert(remaining[first_set_byte] > 127);
                                for (0..first_set_byte) |j| {
                                    std.debug.assert(remaining[j] <= 127);
                                }
                            }

                            return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len));
                        }
                    }
                    unreachable;
                }

                // the more intuitive way, using slices, produces worse codegen
                // specifically: it subtracts the length at the end of the loop
                // we don't need to do that
                // we only need to subtract the length once at the very end
                remaining.ptr += ascii_vector_size;
            }
            remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(vector_start);
        }
    }

    {
        const words_start = remaining.ptr;
        const words_end = remaining.ptr + remaining.len - (remaining.len % word_size);

        if (comptime Environment.enableSIMD) {
            // these assertions exist more so for LLVM
            std.debug.assert(remaining.len < ascii_vector_size);
            std.debug.assert(@intFromPtr(remaining.ptr + ascii_vector_size) > @intFromPtr(words_end));
        }

        if (remaining.len >= word_size) {
            while (remaining.ptr != words_end) {
                const word = @as(Word, @bitCast(remaining[0..word_size].*));
                const high_bits = word & high_bit_mask;

                if (high_bits > 0) {
                    remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(words_start);
                    const first_set_byte = @ctz(high_bits) / 8;
                    if (comptime Environment.allow_assert) {
                        std.debug.assert(remaining[first_set_byte] > 127);
                        for (0..first_set_byte) |j| {
                            std.debug.assert(remaining[j] <= 127);
                        }
                    }

                    return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len));
                }

                remaining.ptr += word_size;
            }
            remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(words_start);
        }
    }

    if (comptime Environment.allow_assert) std.debug.assert(remaining.len < 8);

    for (remaining) |*char| {
        if (char.* > 127) {
            // try to prevent it from reading the length of the slice
            return @as(u32, @truncate(@intFromPtr(char) - @intFromPtr(slice.ptr)));
        }
    }

    return null;
}
|
||
|
||
/// Searches `slice_[offset..]` for the first byte that is non-ASCII (> 127),
/// a control character (< 0x20), or one of `\r`, `\n`, ESC (0x1b).
///
/// Returns the index relative to the start of `slice_` (i.e. including
/// `offset`), or null when no such byte exists.
pub fn indexOfNewlineOrNonASCIIOrANSI(slice_: []const u8, offset: u32) ?u32 {
    const slice = slice_[offset..];
    if (slice.len == 0)
        return null;

    var remaining = slice;

    if (comptime Environment.enableSIMD) {
        while (remaining.len >= ascii_vector_size) : (remaining = remaining[ascii_vector_size..]) {
            const vec: AsciiVector = remaining[0..ascii_vector_size].*;

            // One bit per lane for each condition, OR-ed together.
            const above_ascii = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii)));
            const below_space = @as(AsciiVectorU1, @bitCast((vec < min_16_ascii)));
            const is_cr = @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r')))));
            const is_lf = @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n')))));
            const is_esc = @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\x1b')))));
            const hits = above_ascii | below_space | is_cr | is_lf | is_esc;

            if (@reduce(.Max, hits) > 0) {
                // The lowest set bit of the lane mask is the first match.
                const lane = @ctz(@as(AsciiVectorInt, @bitCast(hits)));
                return @as(u32, lane) + @as(u32, @intCast(slice.len - remaining.len)) + offset;
            }
        }

        if (comptime Environment.allow_assert) std.debug.assert(remaining.len < ascii_vector_size);
    }

    // Scalar tail (or the whole input when SIMD is disabled).
    const consumed = slice.len - remaining.len;
    for (remaining, 0..) |char, i| {
        if (char > 127 or char < 0x20 or char == '\n' or char == '\r' or char == '\x1b') {
            return @as(u32, @truncate(consumed + i)) + offset;
        }
    }

    return null;
}
|
||
|
||
/// Same as `indexOfNewlineOrNonASCIICheckStart` with the first-byte fast path
/// enabled.
pub fn indexOfNewlineOrNonASCII(slice_: []const u8, offset: u32) ?u32 {
    const check_first_byte = true;
    return indexOfNewlineOrNonASCIICheckStart(slice_, offset, check_first_byte);
}
|
||
|
||
/// Searches `slice_[offset..]` for the first byte that is non-ASCII (> 127),
/// a control character (< 0x20), or `\r` / `\n`.
///
/// Returns the index relative to the start of `slice_` (i.e. including
/// `offset`), or null when no such byte exists. With `check_start` set, the
/// very first byte is tested up front before any vector work is done.
pub fn indexOfNewlineOrNonASCIICheckStart(slice_: []const u8, offset: u32, comptime check_start: bool) ?u32 {
    const slice = slice_[offset..];
    if (slice.len == 0)
        return null;

    var remaining = slice;

    if (comptime check_start) {
        // this shows up in profiling
        const first = remaining[0];
        if (first > 127 or first < 0x20 or first == '\r' or first == '\n') {
            return offset;
        }
    }

    if (comptime Environment.enableSIMD) {
        while (remaining.len >= ascii_vector_size) : (remaining = remaining[ascii_vector_size..]) {
            const vec: AsciiVector = remaining[0..ascii_vector_size].*;

            // One bit per lane for each condition, OR-ed together.
            const above_ascii = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii)));
            const below_space = @as(AsciiVectorU1, @bitCast((vec < min_16_ascii)));
            const is_cr = @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r')))));
            const is_lf = @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n')))));
            const hits = above_ascii | below_space | is_cr | is_lf;

            if (@reduce(.Max, hits) > 0) {
                // The lowest set bit of the lane mask is the first match.
                const lane = @ctz(@as(AsciiVectorInt, @bitCast(hits)));
                return @as(u32, lane) + @as(u32, @intCast(slice.len - remaining.len)) + offset;
            }
        }

        if (comptime Environment.allow_assert) std.debug.assert(remaining.len < ascii_vector_size);
    }

    // Scalar tail (or the whole input when SIMD is disabled).
    const consumed = slice.len - remaining.len;
    for (remaining, 0..) |char, i| {
        if (char > 127 or char < 0x20 or char == '\n' or char == '\r') {
            return @as(u32, @truncate(consumed + i)) + offset;
        }
    }

    return null;
}
|
||
|
||
/// Returns true when `slice_` contains any byte that is non-ASCII (> 127), a
/// control character (< 0x20), `\r`, `\n`, or a double quote.
pub fn containsNewlineOrNonASCIIOrQuote(slice_: []const u8) bool {
    if (slice_.len == 0)
        return false;

    var remaining = slice_;

    if (comptime Environment.enableSIMD) {
        while (remaining.len >= ascii_vector_size) : (remaining = remaining[ascii_vector_size..]) {
            const vec: AsciiVector = remaining[0..ascii_vector_size].*;

            // One bit per lane for each condition, OR-ed together.
            const above_ascii = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii)));
            const below_space = @as(AsciiVectorU1, @bitCast((vec < min_16_ascii)));
            const is_cr = @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r')))));
            const is_lf = @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n')))));
            const is_quote = @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '"')))));
            const hits = above_ascii | below_space | is_cr | is_lf | is_quote;

            if (@reduce(.Max, hits) > 0) {
                return true;
            }
        }

        if (comptime Environment.allow_assert) std.debug.assert(remaining.len < ascii_vector_size);
    }

    // Scalar tail (or the whole input when SIMD is disabled).
    for (remaining) |char| {
        if (char > 127 or char < 0x20 or char == '\n' or char == '\r' or char == '"') {
            return true;
        }
    }

    return false;
}
|
||
|
||
/// Returns the index of the first byte in `slice` that may need escaping, or
/// null when there is none. A byte qualifies when it is a control character
/// (< 0x20), DEL or non-ASCII (>= 127), a backslash, or a double quote.
///
/// All three code paths (first-byte fast path, vector loop, scalar tail) use
/// the same `>= 127` bound. Previously only the fast path treated DEL (127)
/// as a match, so the result for a DEL byte depended on its position.
pub fn indexOfNeedsEscape(slice: []const u8) ?u32 {
    var remaining = slice;
    if (remaining.len == 0)
        return null;

    // Fast path: strings frequently need escaping right at the start.
    if (remaining[0] >= 127 or remaining[0] < 0x20 or remaining[0] == '\\' or remaining[0] == '"') {
        return 0;
    }

    if (comptime Environment.enableSIMD) {
        while (remaining.len >= ascii_vector_size) {
            const vec: AsciiVector = remaining[0..ascii_vector_size].*;
            // `>=` against the all-127 vector so DEL is flagged here too.
            const cmp = @as(AsciiVectorU1, @bitCast((vec >= max_16_ascii))) | @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
                @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\\'))))) |
                @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '"')))));

            if (@reduce(.Max, cmp) > 0) {
                // The lowest set bit of the lane mask is the first match.
                const bitmask = @as(AsciiVectorInt, @bitCast(cmp));
                const first = @ctz(bitmask);

                return @as(u32, first) + @as(u32, @truncate(@intFromPtr(remaining.ptr) - @intFromPtr(slice.ptr)));
            }

            remaining = remaining[ascii_vector_size..];
        }
    }

    for (remaining) |*char_| {
        const char = char_.*;
        if (char >= 127 or char < 0x20 or char == '\\' or char == '"') {
            return @as(u32, @truncate(@intFromPtr(char_) - @intFromPtr(slice.ptr)));
        }
    }

    return null;
}
|
||
|
||
test "indexOfNeedsEscape" {
    // Sixteen "la " groups (48 bytes) precede the first double quote.
    const first_escape = indexOfNeedsEscape(
        \\la la la la la la la la la la la la la la la la "oh!" okay "well"
        ,
    );
    try std.testing.expectEqual(first_escape.?, 48);
}
|
||
|
||
/// Returns the index of the first occurrence of `char` in the
/// NUL-terminated `sliceZ`, located with `strchr`, or null when `strchr`
/// finds nothing.
pub fn indexOfCharZ(sliceZ: [:0]const u8, char: u8) ?u63 {
    const found = bun.C.strchr(sliceZ.ptr, char) orelse return null;

    const base_addr = @intFromPtr(sliceZ.ptr);
    const found_addr = @intFromPtr(found);
    const pos = found_addr - base_addr;

    // The match must lie inside the slice.
    if (comptime Environment.allow_assert)
        std.debug.assert(base_addr <= found_addr and
            found_addr < @intFromPtr(sliceZ.ptr + sliceZ.len) and
            pos <= sliceZ.len);

    return @as(u63, @truncate(pos));
}
|
||
|
||
/// Returns the index of the first occurrence of `char` in `slice`, or null if
/// it does not occur. The index from `indexOfCharUsize` is truncated to 32
/// bits.
pub fn indexOfChar(slice: []const u8, char: u8) ?u32 {
    const index = indexOfCharUsize(slice, char) orelse return null;
    return @as(u32, @truncate(index));
}
|
||
|
||
/// Returns the index of the first occurrence of `char` in `slice`, or null if
/// it does not occur (always null for an empty slice).
///
/// Native builds use `memchr`; other targets use `std.mem.indexOfScalar`.
pub fn indexOfCharUsize(slice: []const u8, char: u8) ?usize {
    if (slice.len == 0)
        return null;

    if (comptime !Environment.isNative) {
        return std.mem.indexOfScalar(u8, slice, char);
    }

    const found = bun.C.memchr(slice.ptr, char, slice.len) orelse return null;
    const index = @intFromPtr(found) - @intFromPtr(slice.ptr);

    // Sanity-check what memchr handed back.
    std.debug.assert(index < slice.len);
    std.debug.assert(slice[index] == char);

    return index;
}
|
||
|
||
// Cross-checks `indexOfChar` against `std.mem.indexOfScalar` for needles at
// the start, middle and end of haystacks of various lengths.
test "indexOfChar" {
    const Case = struct {
        haystack: []const u8,
        needle: u8,
    };
    const cases = [_]Case{
        .{ .haystack = "fooooooboooooofoooooofoooooofoooooofoooooozball", .needle = 'b' },
        .{ .haystack = "foooooofoooooofoooooofoooooofoooooofoooooozball", .needle = 'z' },
        .{ .haystack = "foooooofoooooofoooooofoooooofoooooofoooooozball", .needle = 'a' },
        .{ .haystack = "foooooofoooooofoooooofoooooofoooooofoooooozball", .needle = 'l' },
        .{ .haystack = "baconaopsdkaposdkpaosdkpaosdkaposdkpoasdkpoaskdpoaskdpoaskdpo;", .needle = ';' },
        .{ .haystack = ";baconaopsdkaposdkpaosdkpaosdkaposdkpoasdkpoaskdpoaskdpoaskdpo;", .needle = ';' },
    };

    for (cases) |case| {
        const reference = std.mem.indexOfScalar(u8, case.haystack, case.needle).?;
        const ours = indexOfChar(case.haystack, case.needle).?;
        try std.testing.expectEqual(ours, @as(u32, @truncate(reference)));
    }
}
|
||
|
||
/// Returns the offset of the first byte in `slice` that differs from `char`,
/// or `null` when the slice is empty or consists solely of `char`.
pub fn indexOfNotChar(slice: []const u8, char: u8) ?u32 {
    if (slice.len == 0) return null;
    if (slice[0] != char) return 0;

    var rest = slice;

    if (comptime Environment.enableSIMD) {
        const needle: AsciiVector = @splat(char);
        // Compare one whole vector of bytes per iteration.
        while (rest.len >= ascii_vector_size) : (rest = rest[ascii_vector_size..]) {
            const chunk: AsciiVector = rest[0..ascii_vector_size].*;
            const differs = needle != chunk;
            if (@reduce(.Max, @as(AsciiVectorU1, @bitCast(differs))) > 0) {
                // The lowest set bit of the mask is the first differing lane.
                const mask = @as(AsciiVectorInt, @bitCast(differs));
                const lane = @ctz(mask);
                return @as(u32, lane) + @as(u32, @intCast(slice.len - rest.len));
            }
        }
    }

    // Scalar pass over whatever the vector loop left behind.
    const consumed = slice.len - rest.len;
    for (rest, 0..) |byte, i| {
        if (byte != char) {
            return @as(u32, @truncate(consumed + i));
        }
    }

    return null;
}
|
||
|
||
/// Marker stored in `hex_table` for bytes that are not hexadecimal digits.
const invalid_char: u8 = 0xff;

/// Lookup table from a byte to the value of the hex digit it spells
/// (`'0'...'9'`, `'a'...'f'`, `'A'...'F'`), or `invalid_char` otherwise.
///
/// The table has one entry for every possible `u8` (256 of them), so it can
/// be indexed with any byte, including 0xFF, without going out of bounds.
const hex_table: [256]u8 = brk: {
    var values: [256]u8 = [_]u8{invalid_char} ** 256;

    // Digits and lowercase letters map to 0..15 in order.
    for ("0123456789abcdef", 0..) |digit, value| {
        values[digit] = @intCast(value);
    }
    // Uppercase letters map to 10..15.
    for ("ABCDEF", 10..) |digit, value| {
        values[digit] = @intCast(value);
    }

    break :brk values;
};
|
||
|
||
/// Decodes pairs of hex digits from `source` into `destination` and returns
/// the number of bytes written. Invalid input is reported as
/// `error.InvalidByteSequence` (strict mode of `_decodeHexToBytes`).
pub fn decodeHexToBytes(destination: []u8, comptime Char: type, source: []const Char) !usize {
    const truncate_on_error = false;
    return _decodeHexToBytes(destination, Char, source, truncate_on_error);
}
|
||
|
||
/// Decodes pairs of hex digits from `source` into `destination`, stopping at
/// the first invalid pair instead of failing. Returns the number of bytes
/// written; any error from the underlying decoder is mapped to 0.
pub fn decodeHexToBytesTruncate(destination: []u8, comptime Char: type, source: []const Char) usize {
    const truncate_on_error = true;
    return _decodeHexToBytes(destination, Char, source, truncate_on_error) catch 0;
}
|
||
|
||
/// Shared hex decoder behind `decodeHexToBytes` and `decodeHexToBytesTruncate`.
///
/// Consumes `source` two characters at a time, writing one byte per pair into
/// `destination` until either the destination is full or fewer than two
/// characters remain. Returns the number of bytes written.
///
/// When `truncate` is true, decoding stops quietly at the first invalid pair.
/// Otherwise an invalid pair, or leftover input while destination space is
/// still available, yields `error.InvalidByteSequence`.
inline fn _decodeHexToBytes(destination: []u8, comptime Char: type, source: []const Char, comptime truncate: bool) !usize {
    var written: usize = 0;
    var consumed: usize = 0;

    while (written < destination.len and source.len - consumed > 1) {
        const hi_char = source[consumed];
        const lo_char = source[consumed + 1];

        if (comptime @sizeOf(Char) > 1) {
            // Wide characters beyond the byte range can never be hex digits.
            if (hi_char > std.math.maxInt(u8) or lo_char > std.math.maxInt(u8)) {
                if (comptime truncate) break;
                return error.InvalidByteSequence;
            }
        }

        const hi = hex_table[@as(u8, @truncate(hi_char))];
        const lo = hex_table[@as(u8, @truncate(lo_char))];
        if (hi == invalid_char or lo == invalid_char) {
            if (comptime truncate) break;
            return error.InvalidByteSequence;
        }

        destination[written] = hi << 4 | lo;
        written += 1;
        consumed += 2;
    }

    if (comptime !truncate) {
        // Unconsumed input with room still left means a dangling odd digit.
        if (written < destination.len and consumed < source.len) return error.InvalidByteSequence;
    }

    return written;
}
|
||
|
||
/// Maps a nibble value (0..15) to its lowercase ASCII hex digit.
/// Values above 15 are not expected and hit `unreachable`.
fn byte2hex(char: u8) u8 {
    if (char < 10) return char + '0';
    if (char < 16) return char - 10 + 'a';
    unreachable;
}
|
||
|
||
/// Writes the lowercase hexadecimal spelling of `source` into `destination`
/// and returns the number of bytes written (always even).
///
/// When `destination` cannot hold `source.len * 2` bytes, only as many whole
/// source bytes as fit are encoded. With assertions enabled, both slices must
/// be non-empty.
pub fn encodeBytesToHex(destination: []u8, source: []const u8) usize {
    if (comptime Environment.allow_assert) {
        std.debug.assert(destination.len > 0);
        std.debug.assert(source.len > 0);
    }
    // Round a too-small destination down to an even byte count so that only
    // complete two-digit pairs are emitted.
    const to_write = if (destination.len < source.len * 2)
        destination.len - destination.len % 2
    else
        source.len * 2;

    const to_read = to_write / 2;

    var remaining = source[0..to_read];
    var remaining_dest = destination;
    if (comptime Environment.enableSIMD) {
        // Process 16 input bytes (32 output bytes) per iteration.
        const remaining_end = remaining.ptr + remaining.len - (remaining.len % 16);
        while (remaining.ptr != remaining_end) {
            const input_chunk: @Vector(16, u8) = remaining[0..16].*;
            // High nibble of every byte.
            const input_chunk_4: @Vector(16, u8) = input_chunk >> @as(@Vector(16, u8), @splat(@as(u8, 4)));
            // Low nibble of every byte.
            const input_chunk_15: @Vector(16, u8) = input_chunk & @as(@Vector(16, u8), @splat(@as(u8, 15)));

            // This looks extremely redundant but it was the easiest way to make the compiler do the right thing
            // the more convenient "0123456789abcdef" string produces worse codegen
            // https://zig.godbolt.org/z/bfdracEeq
            // Despite its name, `lower_16` holds the digits for the HIGH nibbles.
            const lower_16 = [16]u8{
                byte2hex(input_chunk_4[0]),
                byte2hex(input_chunk_4[1]),
                byte2hex(input_chunk_4[2]),
                byte2hex(input_chunk_4[3]),
                byte2hex(input_chunk_4[4]),
                byte2hex(input_chunk_4[5]),
                byte2hex(input_chunk_4[6]),
                byte2hex(input_chunk_4[7]),
                byte2hex(input_chunk_4[8]),
                byte2hex(input_chunk_4[9]),
                byte2hex(input_chunk_4[10]),
                byte2hex(input_chunk_4[11]),
                byte2hex(input_chunk_4[12]),
                byte2hex(input_chunk_4[13]),
                byte2hex(input_chunk_4[14]),
                byte2hex(input_chunk_4[15]),
            };
            // Likewise, `upper_16` holds the digits for the LOW nibbles.
            const upper_16 = [16]u8{
                byte2hex(input_chunk_15[0]),
                byte2hex(input_chunk_15[1]),
                byte2hex(input_chunk_15[2]),
                byte2hex(input_chunk_15[3]),
                byte2hex(input_chunk_15[4]),
                byte2hex(input_chunk_15[5]),
                byte2hex(input_chunk_15[6]),
                byte2hex(input_chunk_15[7]),
                byte2hex(input_chunk_15[8]),
                byte2hex(input_chunk_15[9]),
                byte2hex(input_chunk_15[10]),
                byte2hex(input_chunk_15[11]),
                byte2hex(input_chunk_15[12]),
                byte2hex(input_chunk_15[13]),
                byte2hex(input_chunk_15[14]),
                byte2hex(input_chunk_15[15]),
            };

            // Interleave so each byte's high digit is followed by its low digit.
            const output_chunk = std.simd.interlace(.{
                lower_16,
                upper_16,
            });

            remaining_dest[0..32].* = @bitCast(output_chunk);
            remaining_dest = remaining_dest[32..];
            remaining = remaining[16..];
        }
    }

    // Scalar path for the bytes the vector loop did not cover.
    for (remaining) |c| {
        const charset = "0123456789abcdef";

        const buf: [2]u8 = .{ charset[c >> 4], charset[c & 15] };
        remaining_dest[0..2].* = buf;
        remaining_dest = remaining_dest[2..];
    }

    return to_read * 2;
}
|
||
|
||
// Round-trips 1024 bytes through lowercase hex and checks that
// `decodeHexToBytes` agrees with both `std.fmt.hexToBytes` and the input.
test "decodeHexToBytes" {
    var buffer = std.mem.zeroes([1024]u8);
    for (&buffer, 0..) |*byte, i| {
        byte.* = @as(u8, @truncate(i % 256));
    }

    var written: [2048]u8 = undefined;
    const hex = std.fmt.bufPrint(&written, "{}", .{std.fmt.fmtSliceHexLower(&buffer)}) catch unreachable;

    var good: [4096]u8 = undefined;
    var ours_buf: [4096]u8 = undefined;
    const match = try std.fmt.hexToBytes(good[0..1024], hex);
    // decodeHexToBytes returns an error union; unwrap it before using the
    // count as a slice bound.
    const ours = try decodeHexToBytes(&ours_buf, u8, hex);

    try std.testing.expectEqualSlices(u8, match, ours_buf[0..ours]);
    try std.testing.expectEqualSlices(u8, &buffer, ours_buf[0..ours]);
}
|
||
|
||
// test "formatBytesToHex" {
|
||
// var buffer = std.mem.zeroes([1024]u8);
|
||
// for (buffer) |_, i| {
|
||
// buffer[i] = @truncate(u8, i % 256);
|
||
// }
|
||
// var written: [2048]u8 = undefined;
|
||
// var hex = std.fmt.bufPrint(&written, "{}", .{std.fmt.fmtSliceHexLower(&buffer)}) catch unreachable;
|
||
// var ours_buf: [4096]u8 = undefined;
|
||
// // var ours = formatBytesToHex(&ours_buf, &buffer);
|
||
// // try std.testing.expectEqualSlices(u8, match, ours_buf[0..ours]);
|
||
// try std.testing.expectEqualSlices(u8, &buffer, ours_buf[0..ours]);
|
||
// }
|
||
|
||
/// Strips every leading occurrence of `char` from `slice`.
/// Yields an empty string when `indexOfNotChar` finds no other byte.
pub fn trimLeadingChar(slice: []const u8, char: u8) []const u8 {
    const first_kept = indexOfNotChar(slice, char) orelse return "";
    return slice[first_kept..];
}
|
||
|
||
/// Convenience wrapper around `firstNonASCII16CheckMin` with the minimum
/// check enabled, so code units below 0x20 are reported as well as those
/// above 127.
pub fn firstNonASCII16(comptime Slice: type, slice: Slice) ?u32 {
    const check_min = true;
    return firstNonASCII16CheckMin(Slice, slice, check_min);
}
|
||
|
||
/// A pair of byte offsets into a text, as produced by `indexOfLineRanges`.
/// `getLinesInText` uses them as the bounds of `text[start..end]`.
///
/// NOTE(review): the comment originally placed here reads like documentation
/// for `indexOfLineRanges` rather than for this struct. It is kept verbatim:
/// "Get the line number and the byte offsets of `line_range_count` above the
/// desired line number. The final element is the end index of the desired line."
const LineRange = struct {
    start: u32,
    end: u32,
};
|
||
/// Scans `text` for line breaks and collects up to `line_range_count` byte
/// ranges of lines, stopping once `target_line` is reached.
///
/// - Empty `text` yields an empty array.
/// - If the text contains no newline or non-ASCII byte at all, a single range
///   covering the whole text is returned when `target_line == 0`, otherwise
///   the result is empty.
/// - When the array is full and more lines are still needed, the oldest range
///   is dropped to make room.
pub fn indexOfLineRanges(text: []const u8, target_line: u32, comptime line_range_count: usize) std.BoundedArray(LineRange, line_range_count) {
    const remaining = text;
    if (remaining.len == 0) return .{};

    var ranges = std.BoundedArray(LineRange, line_range_count){};

    var current_line: u32 = 0;
    const first_newline_or_nonascii_i = strings.indexOfNewlineOrNonASCIICheckStart(text, 0, true) orelse {
        // Nothing to split on: the whole text is line 0.
        if (target_line == 0) {
            ranges.appendAssumeCapacity(.{
                .start = 0,
                .end = @truncate(text.len),
            });
        }

        return ranges;
    };

    var iter = CodepointIterator.initOffset(text, 0);
    // Start decoding at the first interesting byte instead of at offset 0.
    var cursor = CodepointIterator.Cursor{
        .i = first_newline_or_nonascii_i,
    };
    // Find the end of the first line ("\n" or "\r\n").
    const first_newline_range: LineRange = brk: {
        while (iter.next(&cursor)) {
            const codepoint = cursor.c;
            switch (codepoint) {
                '\n' => {
                    current_line += 1;
                    break :brk .{
                        .start = 0,
                        .end = cursor.i,
                    };
                },
                '\r' => {
                    if (iter.next(&cursor)) {
                        const codepoint2 = cursor.c;
                        if (codepoint2 == '\n') {
                            current_line += 1;
                            // NOTE(review): `cursor.i` now points at the '\n',
                            // so this range includes the '\r' — confirm intended.
                            break :brk .{
                                .start = 0,
                                .end = cursor.i,
                            };
                        }
                    }
                },
                else => {},
            }
        }

        // No line break anywhere: the whole text is a single line.
        ranges.appendAssumeCapacity(.{
            .start = 0,
            .end = @truncate(text.len),
        });
        return ranges;
    };

    ranges.appendAssumeCapacity(first_newline_range);

    if (target_line == 0) {
        return ranges;
    }

    var prev_end = first_newline_range.end;
    // Jump from one newline / non-ASCII byte to the next.
    while (strings.indexOfNewlineOrNonASCIICheckStart(text, cursor.i + @as(u32, cursor.width), true)) |current_i| {
        cursor.i = current_i;
        // Width 0 makes `iter.next` decode the codepoint at `current_i` itself.
        cursor.width = 0;
        // NOTE(review): when `iter.next` returns false, or a '\r' is not
        // followed by '\n', control reaches the end of this labeled block
        // without a `break :brk` value — verify this compiles and behaves as
        // intended.
        const current_line_range: LineRange = brk: {
            if (iter.next(&cursor)) {
                const codepoint = cursor.c;
                switch (codepoint) {
                    '\n' => {
                        const start = prev_end;
                        prev_end = cursor.i;
                        // NOTE(review): this end is `cursor.i + 1` while the
                        // first line above uses `cursor.i` — confirm which
                        // convention is wanted.
                        break :brk .{
                            .start = start,
                            .end = cursor.i + 1,
                        };
                    },
                    '\r' => {
                        const current_end = cursor.i;
                        if (iter.next(&cursor)) {
                            const codepoint2 = cursor.c;
                            if (codepoint2 == '\n') {
                                // Runs after the range below has been built,
                                // so `.start` still sees the old `prev_end`.
                                defer prev_end = cursor.i;
                                break :brk .{
                                    .start = prev_end,
                                    .end = current_end,
                                };
                            }
                        }
                    },
                    // A non-ASCII codepoint that is not a line break.
                    else => continue,
                }
            }
        };

        // Array full but more lines needed: drop the oldest range.
        if (ranges.len == line_range_count and current_line <= target_line) {
            var new_ranges = std.BoundedArray(LineRange, line_range_count){};
            new_ranges.appendSliceAssumeCapacity(ranges.slice()[1..]);
            ranges = new_ranges;
        }
        ranges.appendAssumeCapacity(current_line_range);

        if (current_line >= target_line) {
            return ranges;
        }

        current_line += 1;
    }

    // Ran out of text before reaching `target_line`.
    if (ranges.len == line_range_count and current_line <= target_line) {
        var new_ranges = std.BoundedArray(LineRange, line_range_count){};
        new_ranges.appendSliceAssumeCapacity(ranges.slice()[1..]);
        ranges = new_ranges;
    }

    return ranges;
}
|
||
|
||
/// Resolves the ranges reported by `indexOfLineRanges` for `line` into slices
/// of `text`. The slices are returned in the reverse of the order in which
/// the ranges were reported. Returns `null` when no range was found.
pub fn getLinesInText(text: []const u8, line: u32, comptime line_range_count: usize) ?std.BoundedArray([]const u8, line_range_count) {
    const ranges = indexOfLineRanges(text, line, line_range_count);
    if (ranges.len == 0) return null;

    var results = std.BoundedArray([]const u8, line_range_count){};
    results.len = ranges.len;

    // Fill back-to-front so the last reported range ends up first.
    const out = results.slice();
    for (ranges.slice(), 0..) |range, idx| {
        out[out.len - 1 - idx] = text[range.start..range.end];
    }

    return results;
}
|
||
|
||
/// Returns the index, relative to the start of `slice`, of the first UTF-16
/// code unit greater than 127. When `check_min` is true, code units below
/// 0x20 are reported as well. Returns `null` if no such code unit exists.
///
/// `Slice` is the slice type of `slice`; the pointer arithmetic below assumes
/// 2-byte elements.
pub fn firstNonASCII16CheckMin(comptime Slice: type, slice: Slice, comptime check_min: bool) ?u32 {
    var remaining = slice;

    if (comptime Environment.enableSIMD and Environment.isNative) {
        const end_ptr = remaining.ptr + remaining.len - (remaining.len % ascii_u16_vector_size);
        if (remaining.len > ascii_u16_vector_size) {
            const remaining_start = remaining.ptr;
            // Only `remaining.ptr` advances inside the loop; `remaining.len`
            // is corrected when a hit is found or once the loop finishes.
            while (remaining.ptr != end_ptr) {
                const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*;
                const max_value = @reduce(.Max, vec);

                if (comptime check_min) {
                    // by using @reduce here, we make it only do one comparison
                    // @reduce doesn't tell us the index though
                    const min_value = @reduce(.Min, vec);
                    if (min_value < 0x20 or max_value > 127) {
                        remaining.len -= (@intFromPtr(remaining.ptr) - @intFromPtr(remaining_start)) / 2;

                        // this is really slow
                        // it does it element-wise for every single u8 on the vector
                        // instead of doing the SIMD instructions
                        // it removes a loop, but probably is slower in the end
                        const cmp = @as(AsciiVectorU16U1, @bitCast(vec > max_u16_ascii)) |
                            @as(AsciiVectorU16U1, @bitCast(vec < min_u16_ascii));
                        const bitmask: u8 = @as(u8, @bitCast(cmp));
                        const first = @ctz(bitmask);

                        return @as(u32, @intCast(@as(u32, first) +
                            @as(u32, @intCast(slice.len - remaining.len))));
                    }
                } else {
                    if (max_value > 127) {
                        remaining.len -= (@intFromPtr(remaining.ptr) - @intFromPtr(remaining_start)) / 2;

                        const cmp = vec > max_u16_ascii;
                        const bitmask: u8 = @as(u8, @bitCast(cmp));
                        const first = @ctz(bitmask);

                        return @as(u32, @intCast(@as(u32, first) +
                            @as(u32, @intCast(slice.len - remaining.len))));
                    }
                }

                remaining.ptr += ascii_u16_vector_size;
            }
            remaining.len -= (@intFromPtr(remaining.ptr) - @intFromPtr(remaining_start)) / 2;
        }
    }

    // The vector loop may already have consumed a prefix of `slice`. Indices
    // found by the scalar pass must be offset by that prefix so the result is
    // relative to `slice`, not to the leftover tail.
    const consumed = slice.len - remaining.len;

    for (remaining, 0..) |char, i| {
        const is_hit = if (comptime check_min)
            (char > 127 or char < 0x20)
        else
            (char > 127);

        if (is_hit) {
            return @as(u32, @truncate(consumed + i));
        }
    }

    return null;
}
|
||
|
||
/// Fast path for printing template literal strings.
///
/// Returns the index of the first UTF-16 code unit in `slice` that is `$`,
/// a backtick, a backslash, below 0x20 or above 127, or `null` if there is
/// no such code unit.
pub fn @"nextUTF16NonASCIIOr$`\\"(
    comptime Slice: type,
    slice: Slice,
) ?u32 {
    var rest = slice;

    if (comptime Environment.enableSIMD and Environment.isNative) {
        const dollar: AsciiU16Vector = @splat(@as(u16, '$'));
        const backtick: AsciiU16Vector = @splat(@as(u16, '`'));
        const backslash: AsciiU16Vector = @splat(@as(u16, '\\'));

        while (rest.len >= ascii_u16_vector_size) : (rest = rest[ascii_u16_vector_size..]) {
            const chunk: AsciiU16Vector = rest[0..ascii_u16_vector_size].*;

            const too_high = @as(AsciiVectorU16U1, @bitCast((chunk > max_u16_ascii)));
            const too_low = @as(AsciiVectorU16U1, @bitCast((chunk < min_u16_ascii)));
            const is_dollar = @as(AsciiVectorU16U1, @bitCast((chunk == dollar)));
            const is_backtick = @as(AsciiVectorU16U1, @bitCast((chunk == backtick)));
            const is_backslash = @as(AsciiVectorU16U1, @bitCast((chunk == backslash)));
            const hits = too_high | too_low | is_dollar | is_backtick | is_backslash;

            // An all-zero mask has a trailing-zero count equal to the lane count.
            const mask = @as(u8, @bitCast(hits));
            const lane = @ctz(mask);
            if (lane < ascii_u16_vector_size) {
                return @as(u32, @intCast(@as(u32, lane) +
                    @as(u32, @intCast(slice.len - rest.len))));
            }
        }
    }

    // Scalar pass over the tail; indices are rebased onto `slice`.
    const consumed = slice.len - rest.len;
    for (rest, 0..) |unit, i| {
        switch (unit) {
            '$', '`', '\\', 0...0x20 - 1, 128...std.math.maxInt(u16) => {
                return @as(u32, @truncate(i + consumed));
            },
            else => {},
        }
    }

    return null;
}
|
||
|
||
// Places a single 'b' at every position of an otherwise all-'a' buffer and
// checks that `indexOfNotChar` reports exactly that position.
test "indexOfNotChar" {
    var yes: [312]u8 = undefined;
    for (0..yes.len) |i| {
        // @memset needs a pointer to the array, not the array value.
        @memset(&yes, 'a');
        yes[i] = 'b';
        // The loop capture `i` is immutable and advances on its own.
        try std.testing.expectEqual(@as(u32, @intCast(i)), indexOfNotChar(&yes, 'a').?);
    }
}
|
||
|
||
// Leading spaces are removed; interior spaces are kept.
test "trimLeadingChar" {
    const padded = " fooo bar";
    const trimmed = trimLeadingChar(padded, ' ');
    try std.testing.expectEqualStrings(trimmed, "fooo bar");
}
|
||
|
||
// A long pure-ASCII input must be accepted; one containing an emoji must not.
test "isAllASCII" {
    const ascii_only = "aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123";
    const with_emoji = "aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdoka🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123";

    try std.testing.expectEqual(true, isAllASCII(ascii_only));
    try std.testing.expectEqual(false, isAllASCII(with_emoji));
}
|
||
|
||
// Pure ASCII yields null; otherwise the byte offset of the emoji is reported,
// whether it sits early in the input or near the very end.
test "firstNonASCII" {
    const ascii_only = "aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123";
    try std.testing.expectEqual(true, firstNonASCII(ascii_only) == null);

    const emoji_early = "aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdoka🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123";
    try std.testing.expectEqual(@as(u32, 50), firstNonASCII(emoji_early).?);

    const emoji_late = "aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd12312🙂3";
    try std.testing.expectEqual(@as(u32, 366), firstNonASCII(emoji_late).?);
}
|
||
|
||
// `toUTF16Literal` widens every input byte to one u16, so the expected
// indices equal the byte offsets of the emoji in the UTF-8 literals.
// Its result is already a `[]const u16` and is passed on directly; it must
// not be reinterpreted through `std.mem.bytesAsSlice`.
test "firstNonASCII16" {
    @setEvalBranchQuota(99999);
    const yes = toUTF16Literal("aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123");
    try std.testing.expectEqual(true, firstNonASCII16(@TypeOf(yes), yes) == null);

    {
        @setEvalBranchQuota(99999);
        const no = toUTF16Literal("aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdoka🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123");
        try std.testing.expectEqual(@as(u32, 50), firstNonASCII16(@TypeOf(no), no).?);
    }
    {
        @setEvalBranchQuota(99999);
        const no = toUTF16Literal("🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123");
        try std.testing.expectEqual(@as(u32, 0), firstNonASCII16(@TypeOf(no), no).?);
    }
    {
        @setEvalBranchQuota(99999);
        const no = toUTF16Literal("a🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123");
        try std.testing.expectEqual(@as(u32, 1), firstNonASCII16(@TypeOf(no), no).?);
    }
    {
        @setEvalBranchQuota(99999);
        const no = toUTF16Literal("aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd12312🙂3");
        try std.testing.expectEqual(@as(u32, 366), firstNonASCII16(@TypeOf(no), no).?);
    }
}
|
||
|
||
// Smoke test: formats a widened literal to stderr and only checks that the
// formatter does not return an error.
test "print UTF16" {
    const utf16 = comptime toUTF16Literal("❌ ✅ opkay ");
    const stderr = std.io.getStdErr();
    try bun.fmt.str.formatUTF16(utf16, stderr.writer());
    // std.unicode.fmtUtf16le(utf16le: []const u16)
}
|
||
|
||
/// Convert potentially ill-formed UTF-8 or UTF-16 bytes to a Unicode Codepoint.
/// - Invalid codepoints are replaced with `zero` parameter
/// - Null bytes return 0
///
/// A `len` of 0 yields `zero`, a `len` of 1 yields the first byte unchanged,
/// and longer sequences are handed to `decodeWTF8RuneTMultibyte`.
pub fn decodeWTF8RuneT(p: *const [4]u8, len: u3, comptime T: type, comptime zero: T) T {
    return switch (len) {
        0 => zero,
        1 => p[0],
        else => decodeWTF8RuneTMultibyte(p, len, T, zero),
    };
}
|
||
|
||
/// Returns the length in bytes (1 to 4) of the UTF-8 sequence introduced by
/// the lead byte `r`, or 0 when `r` is not a valid lead byte.
pub fn codepointSize(comptime R: type, r: R) u3 {
    const size: u3 = switch (r) {
        0x00...0x7F => 1, // 0xxxxxxx
        0xC0...0xDF => 2, // 110xxxxx
        0xE0...0xEF => 3, // 1110xxxx
        0xF0...0xF7 => 4, // 11110xxx
        else => 0,
    };
    return size;
}
|
||
|
||
// /// Encode Type into UTF-8 bytes.
|
||
// /// - Invalid unicode data becomes U+FFFD REPLACEMENT CHARACTER.
|
||
// /// -
|
||
// pub fn encodeUTF8RuneT(out: *[4]u8, comptime R: type, c: R) u3 {
|
||
// switch (c) {
|
||
// 0b0000_0000...0b0111_1111 => {
|
||
// out[0] = @intCast(u8, c);
|
||
// return 1;
|
||
// },
|
||
// 0b1100_0000...0b1101_1111 => {
|
||
// out[0] = @truncate(u8, 0b11000000 | (c >> 6));
|
||
// out[1] = @truncate(u8, 0b10000000 | c & 0b111111);
|
||
// return 2;
|
||
// },
|
||
|
||
// 0b1110_0000...0b1110_1111 => {
|
||
// if (0xd800 <= c and c <= 0xdfff) {
|
||
// // Replacement character
|
||
// out[0..3].* = [_]u8{ 0xEF, 0xBF, 0xBD };
|
||
|
||
// return 3;
|
||
// }
|
||
|
||
// out[0] = @truncate(u8, 0b11100000 | (c >> 12));
|
||
// out[1] = @truncate(u8, 0b10000000 | (c >> 6) & 0b111111);
|
||
// out[2] = @truncate(u8, 0b10000000 | c & 0b111111);
|
||
// return 3;
|
||
// },
|
||
// 0b1111_0000...0b1111_0111 => {
|
||
// out[0] = @truncate(u8, 0b11110000 | (c >> 18));
|
||
// out[1] = @truncate(u8, 0b10000000 | (c >> 12) & 0b111111);
|
||
// out[2] = @truncate(u8, 0b10000000 | (c >> 6) & 0b111111);
|
||
// out[3] = @truncate(u8, 0b10000000 | c & 0b111111);
|
||
// return 4;
|
||
// },
|
||
// else => {
|
||
// // Replacement character
|
||
// out[0..3].* = [_]u8{ 0xEF, 0xBF, 0xBD };
|
||
|
||
// return 3;
|
||
// },
|
||
// }
|
||
// }
|
||
|
||
/// Reports whether any codepoint decoded from `text` lies above U+FFFF.
pub fn containsNonBmpCodePoint(text: string) bool {
    var iter = CodepointIterator.init(text);
    var cursor = CodepointIterator.Cursor{};

    while (iter.next(&cursor)) {
        if (cursor.c <= 0xFFFF) continue;
        return true;
    }

    return false;
}
|
||
|
||
/// Returns true when `text` is empty, contains a codepoint above U+FFFF, or
/// fails the `js_lexer` identifier checks: `isIdentifierStart` for the first
/// codepoint, `isIdentifierContinue` for every later one.
pub fn containsNonBmpCodePointOrIsInvalidIdentifier(text: string) bool {
    var iter = CodepointIterator.init(text);
    var cursor = CodepointIterator.Cursor{};

    // No first codepoint at all counts as invalid.
    if (!iter.next(&cursor)) return true;

    const first = cursor.c;
    if (first > 0xFFFF or !js_lexer.isIdentifierStart(first)) return true;

    while (iter.next(&cursor)) {
        const is_bmp_identifier_part = cursor.c <= 0xFFFF and js_lexer.isIdentifierContinue(cursor.c);
        if (!is_bmp_identifier_part) return true;
    }

    return false;
}
|
||
|
||
// Same idea as std.mem.trim, but the result keeps the exact type of `slice`
// instead of being forced to a const slice.
/// Removes every leading and trailing byte of `slice` that appears in
/// `values_to_strip`.
pub fn trim(slice: anytype, comptime values_to_strip: []const u8) @TypeOf(slice) {
    var start: usize = 0;
    var stop: usize = slice.len;

    // Advance past strippable bytes at the front.
    while (start < stop) : (start += 1) {
        if (std.mem.indexOfScalar(u8, values_to_strip, slice[start]) == null) break;
    }
    // Back up over strippable bytes at the end.
    while (stop > start) : (stop -= 1) {
        if (std.mem.indexOfScalar(u8, values_to_strip, slice[stop - 1]) == null) break;
    }

    return slice[start..stop];
}
|
||
|
||
/// The ASCII bytes treated as whitespace by `lengthOfLeadingWhitespaceASCII`:
/// space, tab, line feed, carriage return, vertical tab and form feed.
pub const whitespace_chars = [_]u8{ ' ', '\t', '\n', '\r', std.ascii.control_code.vt, std.ascii.control_code.ff };
|
||
|
||
/// Counts the bytes at the start of `slice` that are listed in
/// `whitespace_chars`. Returns `slice.len` when every byte is whitespace.
pub fn lengthOfLeadingWhitespaceASCII(slice: string) usize {
    for (slice, 0..) |byte, index| {
        const is_whitespace = std.mem.indexOfScalar(u8, &whitespace_chars, byte) != null;
        if (!is_whitespace) return index;
    }

    return slice.len;
}
|
||
|
||
/// Reports whether `_text` contains a UTF-16 surrogate pair, that is, a high
/// surrogate (0xD800...0xDBFF) immediately followed by a low surrogate
/// (0xDC00...0xDFFF). Such a pair encodes a codepoint outside the Basic
/// Multilingual Plane.
///
/// Inputs shorter than two code units cannot hold a pair and yield `false`.
pub fn containsNonBmpCodePointUTF16(_text: []const u16) bool {
    if (_text.len < 2) return false;

    // Walk every adjacent (unit, next_unit) pair. Both views have
    // `_text.len - 1` elements, so the last unit is read only as a
    // "next" element and no index goes out of bounds.
    for (_text[0 .. _text.len - 1], _text[1..]) |unit, next_unit| {
        switch (unit) {
            // Check for a high surrogate
            0xD800...0xDBFF => {
                // Check for a low surrogate
                switch (next_unit) {
                    0xDC00...0xDFFF => return true,
                    else => {},
                }
            },
            else => {},
        }
    }

    return false;
}
|
||
|
||
/// Concatenates `slices` with `delimiter` between consecutive elements by
/// forwarding to `std.mem.join`. The result is allocated with `allocator`;
/// allocation failures are propagated to the caller.
pub fn join(slices: []const string, delimiter: string, allocator: std.mem.Allocator) !string {
    const joined = try std.mem.join(allocator, delimiter, slices);
    return joined;
}
|
||
|
||
/// Orders two byte strings. Native builds compare the shared prefix with
/// `memcmp` and fall back to comparing the lengths when the prefixes match;
/// other builds defer entirely to `std.mem.order`.
pub fn order(a: []const u8, b: []const u8) std.math.Order {
    if (comptime !Environment.isNative) return std.mem.order(u8, a, b);

    const shared_len = @min(a.len, b.len);
    const cmp = bun.C.memcmp(a.ptr, b.ptr, shared_len);

    if (cmp < 0) return .lt;
    if (cmp > 0) return .gt;
    // Identical prefixes: the shorter string sorts first.
    return std.math.order(a.len, b.len);
}
|
||
|
||
/// "Less than" predicate for ascending sorts: true when `a` orders strictly
/// before `b` according to `order`.
pub fn cmpStringsAsc(_: void, a: string, b: string) bool {
    return switch (order(a, b)) {
        .lt => true,
        .eq, .gt => false,
    };
}
|
||
|
||
/// "Less than" predicate for descending sorts: true when `a` orders strictly
/// after `b` according to `order`.
pub fn cmpStringsDesc(_: void, a: string, b: string) bool {
    return switch (order(a, b)) {
        .gt => true,
        .eq, .lt => false,
    };
}
|
||
|
||
// Ascending and descending `u8` comparators obtained from `std.sort`.
// NOTE(review): no use of these two constants is visible in this part of the
// file — confirm they are still needed.
const sort_asc = std.sort.asc(u8);
const sort_desc = std.sort.desc(u8);
|
||
|
||
/// Sorts `in` in place into ascending order, as defined by `cmpStringsAsc`.
pub fn sortAsc(in: []string) void {
    // TODO: experiment with simd to see if it's faster
    const context = {};
    std.sort.pdq([]const u8, in, context, cmpStringsAsc);
}
|
||
|
||
/// Sorts `in` in place into descending order, as defined by `cmpStringsDesc`.
pub fn sortDesc(in: []string) void {
    // TODO: experiment with simd to see if it's faster
    const context = {};
    std.sort.pdq([]const u8, in, context, cmpStringsDesc);
}
|
||
|
||
/// Comparison context for sorting indices by the strings they refer to:
/// `lessThan(a, b)` compares `keys[a]` with `keys[b]`.
pub const StringArrayByIndexSorter = struct {
    keys: []const []const u8,

    /// Wraps `keys` without copying them.
    pub fn init(keys: []const []const u8) @This() {
        return .{ .keys = keys };
    }

    /// True when the key at index `a` orders strictly before the key at `b`.
    pub fn lessThan(sorter: *const @This(), a: usize, b: usize) bool {
        const lhs = sorter.keys[a];
        const rhs = sorter.keys[b];
        return strings.order(lhs, rhs) == .lt;
    }
};
|
||
|
||
/// Reports whether `c` is an ASCII hexadecimal digit
/// (`0`-`9`, `a`-`f` or `A`-`F`).
pub fn isASCIIHexDigit(c: u8) bool {
    return switch (c) {
        '0'...'9', 'a'...'f', 'A'...'F' => true,
        else => false,
    };
}
|
||
|
||
/// Converts an ASCII hex digit to its numeric value (0..15).
/// With assertions enabled, `character` must satisfy `isASCIIHexDigit`.
pub fn toASCIIHexValue(character: u8) u8 {
    if (comptime Environment.allow_assert) std.debug.assert(isASCIIHexDigit(character));

    // Everything below 'A' is treated as a decimal digit.
    if (character < 'A') return character - '0';

    // Masking to four bits makes 'a'...'f' land on the same values as 'A'...'F'.
    return (character - 'A' + 10) & 0xF;
}
|
||
|
||
/// Returns the total length in bytes (1 to 4) of the UTF-8 sequence that
/// starts with `first_byte`, or 0 when `first_byte` cannot start a sequence
/// (continuation bytes 0x80...0xBF and bytes 0xF8...0xFF).
pub inline fn utf8ByteSequenceLength(first_byte: u8) u3 {
    if (first_byte < 0x80) return 1; // 0xxxxxxx
    if (first_byte < 0xC0) return 0; // 10xxxxxx: continuation byte
    if (first_byte < 0xE0) return 2; // 110xxxxx
    if (first_byte < 0xF0) return 3; // 1110xxxx
    if (first_byte < 0xF8) return 4; // 11110xxx
    return 0;
}
|
||
|
||
/// Code point iterator over a byte string whose `Cursor` is a packed struct
/// (32-bit index, 29-bit code point, 3-bit width).
///
/// Two independent traversal styles are offered:
/// - `next(cursor)` keeps all position state in the caller-owned `Cursor`.
/// - `nextCodepoint()`, `scanUntilQuotedValueOrEOF()` and `peek()` use the
///   iterator's own `i` / `next_width` / `width` / `c` fields.
pub const PackedCodepointIterator = struct {
    const Iterator = @This();
    const CodePointType = u32;
    const zeroValue = 0;

    bytes: []const u8,
    i: usize,
    next_width: usize = 0,
    width: u3 = 0,
    c: CodePointType = zeroValue,

    pub const ZeroValue = zeroValue;

    pub const Cursor = packed struct {
        i: u32 = 0,
        c: u29 = zeroValue,
        width: u3 = 0,
        pub const CodePointType = u29;
    };

    /// Creates an iterator positioned at the start of `str`.
    pub fn init(str: string) Iterator {
        return Iterator{ .bytes = str, .i = 0, .c = zeroValue };
    }

    /// Creates an iterator positioned at byte offset `i` of `str`.
    pub fn initOffset(str: string, i: usize) Iterator {
        return Iterator{ .bytes = str, .i = i, .c = zeroValue };
    }

    /// Advances `cursor` to the code point starting at `cursor.i + cursor.width`.
    ///
    /// Returns false, leaving `cursor` untouched, when that position is at or past
    /// the end of `bytes`, or when `wtf8ByteSequenceLength` reports a length of 0.
    /// When the decoded value equals `error_char`, the cursor stores
    /// `unicode_replacement` with a width of 1.
    pub inline fn next(it: *const Iterator, cursor: *Cursor) bool {
        const pos: u32 = @as(u32, cursor.width) + cursor.i;
        if (pos >= it.bytes.len) {
            return false;
        }

        const cp_len = wtf8ByteSequenceLength(it.bytes[pos]);
        // NOTE(review): for u32 this is 0, so a decoded value of 0 (including a
        // literal NUL byte) takes the "error" branch below — confirm intended.
        const error_char = comptime std.math.minInt(CodePointType);

        const codepoint = @as(
            CodePointType,
            switch (cp_len) {
                0 => return false,
                1 => it.bytes[pos],
                // NOTE(review): forms a 4-byte view from the raw pointer regardless
                // of how many bytes remain after `pos`.
                else => decodeWTF8RuneTMultibyte(it.bytes[pos..].ptr[0..4], cp_len, CodePointType, error_char),
            },
        );

        {
            @setRuntimeSafety(false);
            cursor.* = Cursor{
                .i = pos,
                .c = if (error_char != codepoint)
                    @truncate(codepoint)
                else
                    unicode_replacement,
                .width = if (codepoint != error_char) cp_len else 1,
            };
        }

        return true;
    }

    /// Moves the iterator's own state forward by one sequence and returns a slice
    /// whose length is that sequence's length per `utf8ByteSequenceLength`.
    /// Returns "" when `i + next_width` is at or past the end of `bytes`.
    inline fn nextCodepointSlice(it: *Iterator) []const u8 {
        const bytes = it.bytes;
        const prev = it.i;
        const next_ = prev + it.next_width;
        if (bytes.len <= next_) return "";

        const cp_len = utf8ByteSequenceLength(bytes[next_]);
        it.next_width = cp_len;
        it.i = @min(next_, bytes.len);

        // NOTE(review): the slice starts at `prev` (the previous position), not at
        // `next_` where `cp_len` was measured. Left unchanged because callers in
        // this struct that only use the length depend on the current bounds
        // behavior — confirm whether `bytes[next_..]` was intended.
        const slice = bytes[prev..][0..cp_len];
        it.width = @as(u3, @intCast(slice.len));
        return slice;
    }

    /// Returns true as soon as a sequence longer than one byte is found in `slice`;
    /// returns false when `nextCodepointSlice` yields an empty slice first.
    pub fn needsUTF8Decoding(slice: string) bool {
        var it = Iterator{ .bytes = slice, .i = 0 };

        while (true) {
            const part = it.nextCodepointSlice();
            @setRuntimeSafety(false);
            switch (part.len) {
                0 => return false,
                1 => continue,
                else => return true,
            }
        }
    }

    /// Consumes code points until `quote` is seen (returning `iter.i + 1`) or the
    /// iterator can no longer advance (returning `iter.i`). A backslash followed
    /// by `quote` is skipped as an escaped quote.
    ///
    /// The previous loop condition `iter.c > -1` is always true for an unsigned
    /// code point type, so the scan never terminated when no closing quote was
    /// present. Termination is now detected explicitly.
    pub fn scanUntilQuotedValueOrEOF(iter: *Iterator, comptime quote: CodePointType) usize {
        while (true) {
            // Same end-of-input condition that `nextCodepointSlice` checks.
            const exhausted = iter.i + iter.next_width >= iter.bytes.len;
            const code_point = iter.nextCodepoint();
            // `next_width == 0` after a call means the lead byte had no valid
            // sequence length, so further calls would never make progress.
            if (exhausted or iter.next_width == 0) return iter.i;

            const keep_scanning = switch (code_point) {
                quote => false,
                '\\' => brk: {
                    if (iter.nextCodepoint() == quote) {
                        continue;
                    }
                    break :brk true;
                },
                else => true,
            };

            if (!keep_scanning) {
                return iter.i + 1;
            }
        }
    }

    /// Advances the iterator and returns the decoded code point, which is also
    /// stored in `it.c`. Yields `zeroValue` when `nextCodepointSlice` returns "".
    pub fn nextCodepoint(it: *Iterator) CodePointType {
        const slice = it.nextCodepointSlice();

        it.c = switch (slice.len) {
            0 => zeroValue,
            1 => @as(CodePointType, @intCast(slice[0])),
            2 => @as(CodePointType, @intCast(std.unicode.utf8Decode2(slice) catch unreachable)),
            3 => @as(CodePointType, @intCast(std.unicode.utf8Decode3(slice) catch unreachable)),
            4 => @as(CodePointType, @intCast(std.unicode.utf8Decode4(slice) catch unreachable)),
            else => unreachable,
        };

        return it.c;
    }

    /// Look ahead at the next n codepoints without advancing the iterator.
    /// If fewer than n codepoints are available, then return the remainder of the string.
    ///
    /// `nextCodepointSlice` returns an empty slice (not null) when it cannot
    /// advance, so that is what is tested here. The whole iterator state is
    /// restored on exit, not just `i`.
    pub fn peek(it: *Iterator, n: usize) []const u8 {
        const saved = it.*;
        defer it.* = saved;

        var end_ix = saved.i;
        var found: usize = 0;
        while (found < n) : (found += 1) {
            const next_codepoint = it.nextCodepointSlice();
            if (next_codepoint.len == 0) return it.bytes[saved.i..];
            end_ix += next_codepoint.len;
        }

        // Clamp so a truncated trailing sequence cannot push the end out of bounds.
        return it.bytes[saved.i..@min(end_ix, it.bytes.len)];
    }
};
|
||
|
||
/// Builds a code point iterator type over a byte string.
///
/// `CodePointType` is the integer type used for decoded code points and
/// `zeroValue` is the value stored in `c` initially and when
/// `nextCodepoint()` cannot produce a code point.
///
/// Two independent traversal styles are offered:
/// - `next(cursor)` keeps all position state in the caller-owned `Cursor`.
/// - `nextCodepoint()`, `scanUntilQuotedValueOrEOF()` and `peek()` use the
///   iterator's own `i` / `next_width` / `width` / `c` fields.
pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: comptime_int) type {
    return struct {
        const Iterator = @This();
        bytes: []const u8,
        i: usize,
        next_width: usize = 0,
        width: u3 = 0,
        c: CodePointType = zeroValue,

        pub const ZeroValue = zeroValue;

        pub const Cursor = struct {
            i: u32 = 0,
            c: CodePointType = zeroValue,
            width: u3 = 0,
        };

        /// Creates an iterator positioned at the start of `str`.
        pub fn init(str: string) Iterator {
            return Iterator{ .bytes = str, .i = 0, .c = zeroValue };
        }

        /// Creates an iterator positioned at byte offset `i` of `str`.
        pub fn initOffset(str: string, i: usize) Iterator {
            return Iterator{ .bytes = str, .i = i, .c = zeroValue };
        }

        /// Advances `cursor` to the code point starting at `cursor.i + cursor.width`.
        ///
        /// Returns false, leaving `cursor` untouched, when that position is at or
        /// past the end of `bytes`, or when `wtf8ByteSequenceLength` reports a
        /// length of 0. When the decoded value equals `error_char`
        /// (`minInt(CodePointType)`), the cursor stores `unicode_replacement` with
        /// a width of 1.
        pub inline fn next(it: *const Iterator, cursor: *Cursor) bool {
            const pos: u32 = @as(u32, cursor.width) + cursor.i;
            if (pos >= it.bytes.len) {
                return false;
            }

            const cp_len = wtf8ByteSequenceLength(it.bytes[pos]);
            const error_char = comptime std.math.minInt(CodePointType);

            const codepoint = @as(
                CodePointType,
                switch (cp_len) {
                    0 => return false,
                    1 => it.bytes[pos],
                    // NOTE(review): forms a 4-byte view from the raw pointer
                    // regardless of how many bytes remain after `pos`.
                    else => decodeWTF8RuneTMultibyte(it.bytes[pos..].ptr[0..4], cp_len, CodePointType, error_char),
                },
            );

            cursor.* = Cursor{
                .i = pos,
                .c = if (error_char != codepoint)
                    codepoint
                else
                    unicode_replacement,
                .width = if (codepoint != error_char) cp_len else 1,
            };

            return true;
        }

        /// Moves the iterator's own state forward by one sequence and returns a
        /// slice whose length is that sequence's length per
        /// `utf8ByteSequenceLength`. Returns "" when `i + next_width` is at or past
        /// the end of `bytes`.
        inline fn nextCodepointSlice(it: *Iterator) []const u8 {
            const bytes = it.bytes;
            const prev = it.i;
            const next_ = prev + it.next_width;
            if (bytes.len <= next_) return "";

            const cp_len = utf8ByteSequenceLength(bytes[next_]);
            it.next_width = cp_len;
            it.i = @min(next_, bytes.len);

            // NOTE(review): the slice starts at `prev` (the previous position), not
            // at `next_` where `cp_len` was measured. Left unchanged because
            // callers that only use the length depend on the current bounds
            // behavior — confirm whether `bytes[next_..]` was intended.
            const slice = bytes[prev..][0..cp_len];
            it.width = @as(u3, @intCast(slice.len));
            return slice;
        }

        /// Returns true as soon as a sequence longer than one byte is found in
        /// `slice`; returns false when `nextCodepointSlice` yields an empty slice
        /// first.
        pub fn needsUTF8Decoding(slice: string) bool {
            var it = Iterator{ .bytes = slice, .i = 0 };

            while (true) {
                const part = it.nextCodepointSlice();
                @setRuntimeSafety(false);
                switch (part.len) {
                    0 => return false,
                    1 => continue,
                    else => return true,
                }
            }
        }

        /// Consumes code points until `quote` is seen (returning `iter.i + 1`) or
        /// the iterator can no longer advance (returning `iter.i`). A backslash
        /// followed by `quote` is skipped as an escaped quote.
        ///
        /// For unsigned code point types `iter.c > -1` is always true, so the loop
        /// used to spin forever once the input was exhausted or an invalid lead
        /// byte was reached. Those two conditions are now detected explicitly; for
        /// a signed type with `zeroValue == -1` the result is unchanged.
        ///
        /// NOTE(review): with `zeroValue == -1`, an iterator fresh from `init`
        /// has `c == -1`, so the loop body never runs — confirm intended.
        pub fn scanUntilQuotedValueOrEOF(iter: *Iterator, comptime quote: CodePointType) usize {
            while (iter.c > -1) {
                // Same end-of-input condition that `nextCodepointSlice` checks.
                const exhausted = iter.i + iter.next_width >= iter.bytes.len;
                const code_point = iter.nextCodepoint();
                // `next_width == 0` after a call means the lead byte had no valid
                // sequence length, so further calls would never make progress.
                if (exhausted or iter.next_width == 0) break;

                const keep_scanning = switch (code_point) {
                    quote => false,
                    '\\' => brk: {
                        if (iter.nextCodepoint() == quote) {
                            continue;
                        }
                        break :brk true;
                    },
                    else => true,
                };

                if (!keep_scanning) {
                    return iter.i + 1;
                }
            }

            return iter.i;
        }

        /// Advances the iterator and returns the decoded code point, which is also
        /// stored in `it.c`. Yields `zeroValue` when `nextCodepointSlice` returns "".
        pub fn nextCodepoint(it: *Iterator) CodePointType {
            const slice = it.nextCodepointSlice();

            it.c = switch (slice.len) {
                0 => zeroValue,
                1 => @as(CodePointType, @intCast(slice[0])),
                2 => @as(CodePointType, @intCast(std.unicode.utf8Decode2(slice) catch unreachable)),
                3 => @as(CodePointType, @intCast(std.unicode.utf8Decode3(slice) catch unreachable)),
                4 => @as(CodePointType, @intCast(std.unicode.utf8Decode4(slice) catch unreachable)),
                else => unreachable,
            };

            return it.c;
        }

        /// Look ahead at the next n codepoints without advancing the iterator.
        /// If fewer than n codepoints are available, then return the remainder of the string.
        ///
        /// `nextCodepointSlice` returns an empty slice (not null) when it cannot
        /// advance, so that is what is tested here. The whole iterator state is
        /// restored on exit, not just `i`.
        pub fn peek(it: *Iterator, n: usize) []const u8 {
            const saved = it.*;
            defer it.* = saved;

            var end_ix = saved.i;
            for (0..n) |_| {
                const next_codepoint = it.nextCodepointSlice();
                if (next_codepoint.len == 0) return it.bytes[saved.i..];
                end_ix += next_codepoint.len;
            }

            // Clamp so a truncated trailing sequence cannot push the end out of bounds.
            return it.bytes[saved.i..@min(end_ix, it.bytes.len)];
        }
    };
}
|
||
|
||
/// `NewCodePointIterator` instantiated with `CodePoint` as the code point type and -1 as its zero value.
pub const CodepointIterator = NewCodePointIterator(CodePoint, -1);
|
||
/// `NewCodePointIterator` instantiated with `u32` as the code point type and 0 as its zero value.
pub const UnsignedCodepointIterator = NewCodePointIterator(u32, 0);
|
||
|
||
/// Builds a sorter type whose `lessThan` orders values of `Type` by the `.len`
/// of the member named `field`, shortest first.
pub fn NewLengthSorter(comptime Type: type, comptime field: string) type {
    return struct {
        const LengthSorter = @This();

        /// True when `lhs`'s field is strictly shorter than `rhs`'s.
        pub fn lessThan(_: LengthSorter, lhs: Type, rhs: Type) bool {
            const lhs_len = @field(lhs, field).len;
            const rhs_len = @field(rhs, field).len;
            return lhs_len < rhs_len;
        }
    };
}
|
||
|
||
/// Builds a sorter type whose `lessThan` orders values of `Type` by the pattern
/// key stored in the member named `field`.
///
/// Keys are expected to end with "/" or contain a single "*". Ordering rules, in
/// priority order:
/// 1. The key with the longer base sorts first, where the base length is the
///    index of "*" if present, otherwise the key length.
/// 2. With equal bases, a key containing "*" sorts before one without.
/// 3. With both containing "*", the longer key sorts first.
///
/// NOTE(review): the algorithm text this was written against says "index of
/// "*" plus one"; the code uses the bare index. Left as is — confirm intended.
pub fn NewGlobLengthSorter(comptime Type: type, comptime field: string) type {
    return struct {
        const GlobLengthSorter = @This();

        pub fn lessThan(_: GlobLengthSorter, lhs: Type, rhs: Type) bool {
            const key_a = @field(lhs, field);
            const key_b = @field(rhs, field);

            const star_a = indexOfChar(key_a, '*');
            const star_b = indexOfChar(key_b, '*');
            const base_length_a = star_a orelse key_a.len;
            const base_length_b = star_b orelse key_b.len;

            // Rule 1: longer base first.
            if (base_length_a != base_length_b) {
                return base_length_a > base_length_b;
            }

            // Rule 2: a key without "*" never sorts before the other one.
            if (star_a == null) return false;
            if (star_b == null) return true;

            // Rule 3: longer key first; equal lengths are not "less than".
            return key_a.len > key_b.len;
        }
    };
}
|
||
|
||
/// Repoints every `u8` slice field of `container` that lies inside `from` so
/// that it refers to the same offset and length inside `to`.
///
/// Fields whose memory is not fully contained in `from` are left untouched.
/// The relocation itself is done by `moveSlice`.
pub fn moveAllSlices(comptime Type: type, container: *Type, from: string, to: string) void {
    const from_start = @intFromPtr(from.ptr);
    const from_end = from_start + from.len;

    inline for (std.meta.fields(Type)) |field| {
        // Only slice fields with a `u8` child are considered.
        if (comptime (std.meta.isSlice(field.type) and std.meta.Child(field.type) == u8)) {
            const slice = @field(container, field.name);
            const slice_start = @intFromPtr(slice.ptr);

            if (from_end >= slice_start + slice.len and from_start <= slice_start) {
                @field(container, field.name) = moveSlice(slice, from, to);
            }
        }
    }
}
|
||
|
||
/// Returns the part of `to` that sits at the same offset, with the same length,
/// as `slice` occupies inside `from`.
///
/// Preconditions (checked only when `Environment.allow_assert` is set):
/// `slice` lies within `from`, `to` is at least as long as `from`, and the first
/// `from.len` bytes of `to` equal `from`.
pub fn moveSlice(slice: string, from: string, to: string) string {
    const from_addr = @intFromPtr(from.ptr);
    const slice_addr = @intFromPtr(slice.ptr);

    if (comptime Environment.allow_assert) {
        std.debug.assert(from.len <= to.len and from.len >= slice.len);
        // assert we are in bounds
        std.debug.assert(from_addr + from.len >= slice_addr + slice.len and from_addr <= slice_addr);
        std.debug.assert(eqlLong(from, to[0..from.len], false)); // data should be identical
    }

    const moved = to[slice_addr - from_addr ..][0..slice.len];

    if (comptime Environment.allow_assert) std.debug.assert(eqlLong(slice, moved, false)); // data should be identical

    return moved;
}
|
||
|
||
// Checks that `moveSlice` relocates a sub-slice of `input` to the same offset
// inside a copy of `input`. Uses `std.testing.allocator` and frees the copy so
// the test does not leak.
test "moveSlice" {
    const allocator = std.testing.allocator;

    const input: string = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
    const cloned = try allocator.dupe(u8, input);
    defer allocator.free(cloned);

    const slice = input[20..][0..10];
    const moved = moveSlice(slice, input, cloned);

    // Same bytes as before...
    try std.testing.expectEqual(true, eqlLong(moved, slice, false));
    // ...but now located inside `cloned`, at the same offset.
    try std.testing.expectEqual(@intFromPtr(cloned.ptr) + 20, @intFromPtr(moved.ptr));
    try std.testing.expectEqual(slice.len, moved.len);
}
|
||
|
||
// Checks that `moveAllSlices` repoints every field that lives inside `input`
// to the matching region of `cloned`, and leaves an unrelated field alone.
// Uses `std.testing.allocator` and frees the copy so the test does not leak.
test "moveAllSlices" {
    const allocator = std.testing.allocator;

    const Move = struct {
        foo: string,
        bar: string,
        baz: string,
        wrong: string,
    };

    const input: string = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
    var move = Move{ .foo = input[20..], .bar = input[30..], .baz = input[10..20], .wrong = "baz" };

    const cloned = try allocator.dupe(u8, input);
    defer allocator.free(cloned);

    moveAllSlices(Move, &move, input, cloned);

    const expected = Move{ .foo = cloned[20..], .bar = cloned[30..], .baz = cloned[10..20], .wrong = "bar" };
    try std.testing.expectEqual(move.foo.ptr, expected.foo.ptr);
    try std.testing.expectEqual(move.bar.ptr, expected.bar.ptr);
    try std.testing.expectEqual(move.baz.ptr, expected.baz.ptr);
    try std.testing.expectEqual(move.foo.len, expected.foo.len);
    try std.testing.expectEqual(move.bar.len, expected.bar.len);
    try std.testing.expectEqual(move.baz.len, expected.baz.len);
    // `wrong` does not point into `input`, so it must not have been touched.
    try std.testing.expect(move.wrong.ptr != expected.wrong.ptr);
}
|
||
|
||
// `join` should place the delimiter between every pair of neighbouring items.
test "join" {
    const parts = [_]string{ "abc", "def", "123", "hello" };
    const joined = try join(&parts, "-", std.heap.page_allocator);
    try std.testing.expectEqualStrings("abc-def-123-hello", joined);
}
|
||
|
||
// `sortAsc` should leave the list in ascending order; both lists are joined so
// a mismatch is reported as a single readable string diff.
test "sortAsc" {
    const allocator = std.heap.page_allocator;

    var actual = [_]string{ "abc", "def", "123", "hello" };
    const expected = [_]string{ "123", "abc", "def", "hello" };

    const expected_joined = try join(&expected, "-", allocator);
    sortAsc(&actual);
    const actual_joined = try join(&actual, "-", allocator);

    try std.testing.expectEqualStrings(expected_joined, actual_joined);
}
|
||
|
||
// `sortDesc` should leave the list in descending order; both lists are joined
// so a mismatch is reported as a single readable string diff.
test "sortDesc" {
    const allocator = std.heap.page_allocator;

    var actual = [_]string{ "abc", "def", "123", "hello" };
    const expected = [_]string{ "hello", "def", "abc", "123" };

    const expected_joined = try join(&expected, "-", allocator);
    sortDesc(&actual);
    const actual_joined = try join(&actual, "-", allocator);

    try std.testing.expectEqualStrings(expected_joined, actual_joined);
}
|
||
|
||
pub usingnamespace @import("exact_size_matcher.zig");
|
||
|
||
/// U+FFFD, the Unicode replacement character.
pub const unicode_replacement = 0xFFFD;
|
||
/// UTF-8 encoding of `unicode_replacement`, computed at compile time as a
/// fixed-size byte array.
pub const unicode_replacement_str = brk: {
    const encoded_len = std.unicode.utf8CodepointSequenceLength(unicode_replacement) catch unreachable;
    var encoded: [encoded_len]u8 = undefined;
    _ = std.unicode.utf8Encode(unicode_replacement, &encoded) catch unreachable;
    break :brk encoded;
};
|
||
|
||
// Exercises `eqlCaseInsensitiveASCII` with length checking enabled.
test "eqlCaseInsensitiveASCII" {
    const testing = std.testing;

    // Equal when only the case differs, or when identical.
    try testing.expectEqual(true, eqlCaseInsensitiveASCII("abc", "ABC", true));
    try testing.expectEqual(true, eqlCaseInsensitiveASCII("abc", "abc", true));
    try testing.expectEqual(true, eqlCaseInsensitiveASCII("aBcD", "aBcD", true));

    // Different content of the same length, and different lengths.
    try testing.expectEqual(false, eqlCaseInsensitiveASCII("aBcD", "NOOO", true));
    try testing.expectEqual(false, eqlCaseInsensitiveASCII("aBcD", "LENGTH CHECK", true));
}
|
||
|
||
/// Returns true when `input` parses as an IPv4 or IPv6 address according to
/// c-ares' `ares_inet_pton`.
///
/// `input` is copied into a NUL-terminated stack buffer first; inputs that do
/// not fit (512 bytes or longer) are rejected.
pub fn isIPAddress(input: []const u8) bool {
    var max_ip_address_buffer: [512]u8 = undefined;
    // `>=` rather than `>`: one byte must stay free for the NUL terminator
    // written below, otherwise a 512-byte input indexes past the buffer.
    if (input.len >= max_ip_address_buffer.len) return false;

    var sockaddr: std.os.sockaddr = undefined;
    @memset(std.mem.asBytes(&sockaddr), 0);
    @memcpy(max_ip_address_buffer[0..input.len], input);
    max_ip_address_buffer[input.len] = 0;

    const ip_addr_str: [:0]const u8 = max_ip_address_buffer[0..input.len :0];

    // inet_pton-style functions return 1 on success, 0 for an invalid address
    // and -1 on error, so only a positive result counts as a match.
    return bun.c_ares.ares_inet_pton(std.os.AF.INET, ip_addr_str.ptr, &sockaddr) > 0 or
        bun.c_ares.ares_inet_pton(std.os.AF.INET6, ip_addr_str.ptr, &sockaddr) > 0;
}
|
||
|
||
/// Returns true when `input` parses as an IPv6 address according to c-ares'
/// `ares_inet_pton`.
///
/// `input` is copied into a NUL-terminated stack buffer first; inputs that do
/// not fit (512 bytes or longer) are rejected.
pub fn isIPV6Address(input: []const u8) bool {
    var max_ip_address_buffer: [512]u8 = undefined;
    // `>=` rather than `>`: one byte must stay free for the NUL terminator
    // written below, otherwise a 512-byte input indexes past the buffer.
    if (input.len >= max_ip_address_buffer.len) return false;

    var sockaddr: std.os.sockaddr = undefined;
    @memset(std.mem.asBytes(&sockaddr), 0);
    @memcpy(max_ip_address_buffer[0..input.len], input);
    max_ip_address_buffer[input.len] = 0;

    const ip_addr_str: [:0]const u8 = max_ip_address_buffer[0..input.len :0];

    // inet_pton-style functions return 1 on success, 0 for an invalid address
    // and -1 on error, so only a positive result counts as a match.
    return bun.c_ares.ares_inet_pton(std.os.AF.INET6, ip_addr_str.ptr, &sockaddr) > 0;
}
|
||
|
||
/// Allocates a copy of `input` in which runs of path separators are collapsed
/// to one and the result ends with a single separator.
///
/// A leading separator is kept. A 0 byte is written just past the returned
/// slice. The returned slice is a prefix of an allocation of
/// `base.len + 2` bytes, where `base` is `withoutTrailingSlash(input)`.
/// `base` must not be empty (asserted when `Environment.allow_assert` is set).
pub fn cloneNormalizingSeparators(
    allocator: std.mem.Allocator,
    input: []const u8,
) ![]u8 {
    const sep = std.fs.path.sep;

    // remove duplicate slashes in the file path
    const base = withoutTrailingSlash(input);
    var tokens = std.mem.tokenize(u8, base, std.fs.path.sep_str);
    const buf = try allocator.alloc(u8, base.len + 2);
    if (comptime Environment.allow_assert) std.debug.assert(base.len > 0);

    // Number of bytes of `buf` written so far.
    var len: usize = 0;
    if (base[0] == sep) {
        buf[0] = sep;
        len = 1;
    }

    while (tokens.next()) |token| {
        if (token.len == 0) continue;
        bun.copy(u8, buf[len..], token);
        buf[len + token.len] = sep;
        len += token.len + 1;
    }

    // Make sure the output ends with a separator, unless it is just the
    // single leading separator.
    if (len != 1 and buf[len - 1] != sep) {
        buf[len] = sep;
        len += 1;
    }
    buf[len] = 0;

    return buf[0..len];
}
|
||
|
||
/// Returns true when at least one string in `to_check` is equal (per
/// `eqlLong` with length checking) to some string in `against`.
pub fn leftHasAnyInRight(to_check: []const string, against: []const string) bool {
    for (to_check) |candidate| {
        for (against) |known| {
            if (!eqlLong(candidate, known, true)) continue;
            return true;
        }
    }
    return false;
}
|
||
|
||
/// Returns true when `input` starts with `prefix` and the prefix is either the
/// whole input or is followed by a code point for which
/// `bun.js_lexer.isIdentifierContinue` is false.
pub fn hasPrefixWithWordBoundary(input: []const u8, comptime prefix: []const u8) bool {
    if (!hasPrefixComptime(input, prefix)) return false;
    if (input.len == prefix.len) return true;

    // Copy up to four bytes following the prefix into a zero-padded buffer so
    // the decoder always has four bytes to look at.
    const rest = input[prefix.len..];
    var lookahead = [4]u8{ 0, 0, 0, 0 };
    const available = @min(rest.len, lookahead.len);
    @memcpy(lookahead[0..available], rest[0..available]);

    const following = decodeWTF8RuneT(&lookahead, wtf8ByteSequenceLength(rest[0]), i32, -1);
    return !bun.js_lexer.isIdentifierContinue(following);
}
|
||
|
||
/// Allocates `length` bytes and copies every string in `args` into them, back
/// to back. `length` must equal the combined length of `args` (asserted).
/// The caller owns the returned memory.
pub fn concatWithLength(
    allocator: std.mem.Allocator,
    args: []const string,
    length: usize,
) !string {
    const out = try allocator.alloc(u8, length);

    var written: usize = 0;
    for (args) |arg| {
        @memcpy(out[written..][0..arg.len], arg);
        written += arg.len;
    }
    std.debug.assert(written == out.len); // all bytes should be used

    return out;
}
|
||
|
||
/// Concatenates `args` into a newly allocated string by summing their lengths
/// and delegating to `concatWithLength`. The caller owns the returned memory.
pub fn concat(
    allocator: std.mem.Allocator,
    args: []const string,
) !string {
    var total: usize = 0;
    for (args) |arg| total += arg.len;

    return concatWithLength(allocator, args, total);
}
|
||
|
||
/// Makes `dest.*` equal to the concatenation of `args`, allocating only when
/// necessary.
///
/// - If the concatenation is empty, `dest.*` becomes "".
/// - If the concatenation is shorter than 1024 bytes and equals one of
///   `interned_strings_to_check`, `dest.*` is set to that interned string.
/// - If `dest.*` already holds exactly the concatenation, nothing happens.
/// - Otherwise a new buffer is allocated from `allocator`, filled, and stored in
///   `dest.*`. The previous value of `dest.*` is not freed.
///
/// The "already equal" check previously compared against `args.len` (the number
/// of arguments) instead of each argument's length, and did not verify that
/// `dest.*` was fully consumed. It now requires `dest.*` to have exactly the
/// total length and compares it piece by piece.
pub fn concatIfNeeded(
    allocator: std.mem.Allocator,
    dest: *[]const u8,
    args: []const string,
    interned_strings_to_check: []const string,
) !void {
    var total_length: usize = 0;
    for (args) |arg| {
        total_length += arg.len;
    }

    if (total_length == 0) {
        dest.* = "";
        return;
    }

    if (total_length < 1024) {
        // The result is smaller than the stack buffer, so building a temporary
        // copy for the interned-string comparison cannot fail.
        var stack = std.heap.stackFallback(1024, allocator);
        const stack_copy = concatWithLength(stack.get(), args, total_length) catch unreachable;
        for (interned_strings_to_check) |interned| {
            if (eqlLong(stack_copy, interned, true)) {
                dest.* = interned;
                return;
            }
        }
    }

    const is_needed = brk: {
        var remain = dest.*;

        // A different length can never be an exact match; this also keeps every
        // slice below in bounds.
        if (remain.len != total_length) break :brk true;

        for (args) |arg| {
            if (!eqlLong(remain[0..arg.len], arg, true)) break :brk true;
            remain = remain[arg.len..];
        }

        break :brk false;
    };

    if (!is_needed) return;

    const buf = try allocator.alloc(u8, total_length);
    dest.* = buf;
    var remain = buf;
    for (args) |arg| {
        @memcpy(remain[0..arg.len], arg);
        remain = remain[arg.len..];
    }
    std.debug.assert(remain.len == 0);
}
|
||
|
||
/// Converts UTF-8 `input` to little-endian UTF-16 inside `buf` using simdutf and
/// returns the written part of `buf`.
///
/// Only compiles on Windows. Any conversion status other than success panics.
pub fn convertUTF8toUTF16InBuffer(
    buf: []u16,
    input: []const u8,
) []const u16 {
    if (!Environment.isWindows) @compileError("please dont't use this function on posix until fixing the todos.");

    const result = bun.simdutf.convert.utf8.to.utf16.with_errors.le(input, buf);
    return switch (result.status) {
        .success => buf[0..result.count],
        // TODO(@paperdave): handle surrogate
        .surrogate => @panic("TODO: handle surrogate in convertUTF8toUTF16"),
        else => @panic("TODO: handle error in convertUTF8toUTF16"),
    };
}
|
||
|
||
/// Converts little-endian UTF-16 `input` to UTF-8 inside `buf` using simdutf and
/// returns the written part of `buf`.
///
/// Only compiles on Windows. Any conversion status other than success panics;
/// no error is currently returned despite the error-union return type.
pub fn convertUTF16toUTF8InBuffer(
    buf: []u8,
    input: []const u16,
) ![]const u8 {
    if (!Environment.isWindows) @compileError("please dont't use this function on posix until fixing the todos.");

    const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(input, buf);
    switch (result.status) {
        .success => return buf[0..result.count],
        // TODO(@paperdave): handle surrogate
        // The message used to name `convertUTF8toUTF16`, which pointed at the
        // wrong function when this panic fired.
        .surrogate => @panic("TODO: handle surrogate in convertUTF16toUTF8InBuffer"),
        else => @panic("TODO: handle error in convertUTF16toUTF8InBuffer"),
    }
}
|
||
|
||
/// Returns true for a forward slash or a backslash.
pub inline fn charIsAnySlash(char: u8) bool {
    return switch (char) {
        '/', '\\' => true,
        else => false,
    };
}
|
||
|
||
/// Returns true when `s` begins with a Windows drive prefix: an ASCII letter
/// followed by a colon, e.g. `C:`.
///
/// The letter and colon positions were previously swapped, so `":C"` was
/// accepted and `"C:"` rejected.
pub inline fn startsWithWindowsDriveLetter(s: []const u8) bool {
    return s.len >= 2 and s[1] == ':' and switch (s[0]) {
        'a'...'z', 'A'...'Z' => true,
        else => false,
    };
}
|
||
|
||
/// Decides whether `contents` has to be escaped when written as a YAML string.
///
/// Escaping is required when the string is empty, does not start with an ASCII
/// letter, starts with "Yes", "No", "true" or "false", or contains one of the
/// listed special characters anywhere after the first byte.
pub fn mustEscapeYAMLString(contents: []const u8) bool {
    if (contents.len == 0) return true;
    if (!std.ascii.isAlphabetic(contents[0])) return true;

    inline for (.{ "Yes", "No", "true", "false" }) |keyword| {
        if (strings.hasPrefixComptime(contents, keyword)) return true;
    }

    return std.mem.indexOfAnyPos(u8, contents, 1, ": \t\r\n\x0B\x0C\\\",[]") != null;
}
|
||
|
||
/// Returns true when `path` contains `node_modules` as a complete path
/// component, i.e. surrounded by the platform path separator on both sides.
pub fn pathContainsNodeModulesFolder(path: []const u8) bool {
    const needle = comptime std.fs.path.sep_str ++ "node_modules" ++ std.fs.path.sep_str;
    return strings.contains(path, needle);
}
|
||
|
||
/// Returns true when the code point `cp` (of integer type `T`) falls in one of
/// the ranges this function treats as zero-width.
///
/// For one-byte `T`, only the control-character ranges are considered.
pub fn isZeroWidthCodepointType(comptime T: type, cp: T) bool {
    // Everything up to 0x1f, and 0x7f through the C1 control characters.
    if (cp <= 0x1f or (cp >= 0x7f and cp <= 0x9f)) {
        return true;
    }

    // None of the remaining ranges fit in a single byte.
    if (comptime @sizeOf(T) == 1) {
        return false;
    }

    return (cp >= 0x300 and cp <= 0x36f) or // Combining Diacritical Marks
        (cp >= 0x200b and cp <= 0x200f) or // Modifying Invisible Characters
        (cp >= 0x20d0 and cp <= 0x20ff) or // Combining Diacritical Marks for Symbols
        (cp >= 0xfe00 and cp <= 0xfe0f) or // Variation Selectors
        (cp >= 0xfe20 and cp <= 0xfe2f) or // Combining Half Marks
        cp == 0xfeff or // Zero Width No-Break Space (BOM, ZWNBSP)
        (cp >= 0xe0100 and cp <= 0xe01ef); // Variation Selectors
}
|
||
|
||
/// Reports whether the code point `cp` (an integer of type `T`) is rendered
/// two columns wide, i.e. whether its East Asian Width tag is `W` or `F`.
///
/// Official unicode reference: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
/// Tag legend:
/// - `W` (wide) -> true
/// - `F` (full-width) -> true
/// - `H` (half-width) -> false
/// - `N` (neutral) -> false
/// - `Na` (narrow) -> false
/// - `A` (ambiguous) -> false?
///
/// To regenerate the switch body list, run:
/// ```js
/// [...(await (await fetch("https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt")).text()).matchAll(/^([\dA-F]{4,})(?:\.\.([\dA-F]{4,}))?\s+;\s+(\w+)\s+#\s+(.*?)\s*$/gm)].flatMap(([,start, end, type, comment]) => (
/// (['W', 'F'].includes(type)) ? [` ${(end ? `0x${start}...0x${end}` : `0x${start}`)}, // ${''.padStart(17 - start.length - (end ? end.length + 5 : 0))}[${type}] ${comment}`] : []
/// )).join('\n')
/// ```
pub fn isFullWidthCodepointType(comptime T: type, cp: T) bool {
    // The first table entry is U+1100; anything below it is never wide.
    if (cp < 0x1100) return false;

    return switch (cp) {
        0x1100...0x115F, // [W] Lo [96] HANGUL CHOSEONG KIYEOK..HANGUL CHOSEONG FILLER
        0x231A...0x231B, // [W] So [2] WATCH..HOURGLASS
        0x2329, // [W] Ps LEFT-POINTING ANGLE BRACKET
        0x232A, // [W] Pe RIGHT-POINTING ANGLE BRACKET
        0x23E9...0x23EC, // [W] So [4] BLACK RIGHT-POINTING DOUBLE TRIANGLE..BLACK DOWN-POINTING DOUBLE TRIANGLE
        0x23F0, // [W] So ALARM CLOCK
        0x23F3, // [W] So HOURGLASS WITH FLOWING SAND
        0x25FD...0x25FE, // [W] Sm [2] WHITE MEDIUM SMALL SQUARE..BLACK MEDIUM SMALL SQUARE
        0x2614...0x2615, // [W] So [2] UMBRELLA WITH RAIN DROPS..HOT BEVERAGE
        0x2648...0x2653, // [W] So [12] ARIES..PISCES
        0x267F, // [W] So WHEELCHAIR SYMBOL
        0x2693, // [W] So ANCHOR
        0x26A1, // [W] So HIGH VOLTAGE SIGN
        0x26AA...0x26AB, // [W] So [2] MEDIUM WHITE CIRCLE..MEDIUM BLACK CIRCLE
        0x26BD...0x26BE, // [W] So [2] SOCCER BALL..BASEBALL
        0x26C4...0x26C5, // [W] So [2] SNOWMAN WITHOUT SNOW..SUN BEHIND CLOUD
        0x26CE, // [W] So OPHIUCHUS
        0x26D4, // [W] So NO ENTRY
        0x26EA, // [W] So CHURCH
        0x26F2...0x26F3, // [W] So [2] FOUNTAIN..FLAG IN HOLE
        0x26F5, // [W] So SAILBOAT
        0x26FA, // [W] So TENT
        0x26FD, // [W] So FUEL PUMP
        0x2705, // [W] So WHITE HEAVY CHECK MARK
        0x270A...0x270B, // [W] So [2] RAISED FIST..RAISED HAND
        0x2728, // [W] So SPARKLES
        0x274C, // [W] So CROSS MARK
        0x274E, // [W] So NEGATIVE SQUARED CROSS MARK
        0x2753...0x2755, // [W] So [3] BLACK QUESTION MARK ORNAMENT..WHITE EXCLAMATION MARK ORNAMENT
        0x2757, // [W] So HEAVY EXCLAMATION MARK SYMBOL
        0x2795...0x2797, // [W] So [3] HEAVY PLUS SIGN..HEAVY DIVISION SIGN
        0x27B0, // [W] So CURLY LOOP
        0x27BF, // [W] So DOUBLE CURLY LOOP
        0x2B1B...0x2B1C, // [W] So [2] BLACK LARGE SQUARE..WHITE LARGE SQUARE
        0x2B50, // [W] So WHITE MEDIUM STAR
        0x2B55, // [W] So HEAVY LARGE CIRCLE
        0x2E80...0x2E99, // [W] So [26] CJK RADICAL REPEAT..CJK RADICAL RAP
        0x2E9B...0x2EF3, // [W] So [89] CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE
        0x2F00...0x2FD5, // [W] So [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE
        0x2FF0...0x2FFF, // [W] So [16] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION
        0x3000, // [F] Zs IDEOGRAPHIC SPACE
        0x3001...0x3003, // [W] Po [3] IDEOGRAPHIC COMMA..DITTO MARK
        0x3004, // [W] So JAPANESE INDUSTRIAL STANDARD SYMBOL
        0x3005, // [W] Lm IDEOGRAPHIC ITERATION MARK
        0x3006, // [W] Lo IDEOGRAPHIC CLOSING MARK
        0x3007, // [W] Nl IDEOGRAPHIC NUMBER ZERO
        0x3008, // [W] Ps LEFT ANGLE BRACKET
        0x3009, // [W] Pe RIGHT ANGLE BRACKET
        0x300A, // [W] Ps LEFT DOUBLE ANGLE BRACKET
        0x300B, // [W] Pe RIGHT DOUBLE ANGLE BRACKET
        0x300C, // [W] Ps LEFT CORNER BRACKET
        0x300D, // [W] Pe RIGHT CORNER BRACKET
        0x300E, // [W] Ps LEFT WHITE CORNER BRACKET
        0x300F, // [W] Pe RIGHT WHITE CORNER BRACKET
        0x3010, // [W] Ps LEFT BLACK LENTICULAR BRACKET
        0x3011, // [W] Pe RIGHT BLACK LENTICULAR BRACKET
        0x3012...0x3013, // [W] So [2] POSTAL MARK..GETA MARK
        0x3014, // [W] Ps LEFT TORTOISE SHELL BRACKET
        0x3015, // [W] Pe RIGHT TORTOISE SHELL BRACKET
        0x3016, // [W] Ps LEFT WHITE LENTICULAR BRACKET
        0x3017, // [W] Pe RIGHT WHITE LENTICULAR BRACKET
        0x3018, // [W] Ps LEFT WHITE TORTOISE SHELL BRACKET
        0x3019, // [W] Pe RIGHT WHITE TORTOISE SHELL BRACKET
        0x301A, // [W] Ps LEFT WHITE SQUARE BRACKET
        0x301B, // [W] Pe RIGHT WHITE SQUARE BRACKET
        0x301C, // [W] Pd WAVE DASH
        0x301D, // [W] Ps REVERSED DOUBLE PRIME QUOTATION MARK
        0x301E...0x301F, // [W] Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
        0x3020, // [W] So POSTAL MARK FACE
        0x3021...0x3029, // [W] Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE
        0x302A...0x302D, // [W] Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
        0x302E...0x302F, // [W] Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK
        0x3030, // [W] Pd WAVY DASH
        0x3031...0x3035, // [W] Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
        0x3036...0x3037, // [W] So [2] CIRCLED POSTAL MARK..IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL
        0x3038...0x303A, // [W] Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
        0x303B, // [W] Lm VERTICAL IDEOGRAPHIC ITERATION MARK
        0x303C, // [W] Lo MASU MARK
        0x303D, // [W] Po PART ALTERNATION MARK
        0x303E, // [W] So IDEOGRAPHIC VARIATION INDICATOR
        0x3041...0x3096, // [W] Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE
        0x3099...0x309A, // [W] Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
        0x309B...0x309C, // [W] Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
        0x309D...0x309E, // [W] Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK
        0x309F, // [W] Lo HIRAGANA DIGRAPH YORI
        0x30A0, // [W] Pd KATAKANA-HIRAGANA DOUBLE HYPHEN
        0x30A1...0x30FA, // [W] Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO
        0x30FB, // [W] Po KATAKANA MIDDLE DOT
        0x30FC...0x30FE, // [W] Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK
        0x30FF, // [W] Lo KATAKANA DIGRAPH KOTO
        0x3105...0x312F, // [W] Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN
        0x3131...0x318E, // [W] Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE
        0x3190...0x3191, // [W] So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
        0x3192...0x3195, // [W] No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
        0x3196...0x319F, // [W] So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
        0x31A0...0x31BF, // [W] Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH
        0x31C0...0x31E3, // [W] So [36] CJK STROKE T..CJK STROKE Q
        0x31EF, // [W] So IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION
        0x31F0...0x31FF, // [W] Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
        0x3200...0x321E, // [W] So [31] PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED KOREAN CHARACTER O HU
        0x3220...0x3229, // [W] No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
        0x322A...0x3247, // [W] So [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO
        0x3250, // [W] So PARTNERSHIP SIGN
        0x3251...0x325F, // [W] No [15] CIRCLED NUMBER TWENTY ONE..CIRCLED NUMBER THIRTY FIVE
        0x3260...0x327F, // [W] So [32] CIRCLED HANGUL KIYEOK..KOREAN STANDARD SYMBOL
        0x3280...0x3289, // [W] No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
        0x328A...0x32B0, // [W] So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
        0x32B1...0x32BF, // [W] No [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY
        0x32C0...0x32FF, // [W] So [64] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..SQUARE ERA NAME REIWA
        0x3300...0x33FF, // [W] So [256] SQUARE APAATO..SQUARE GAL
        0x3400...0x4DBF, // [W] Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF
        0x4E00...0x9FFF, // [W] Lo [20992] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFF
        0xA000...0xA014, // [W] Lo [21] YI SYLLABLE IT..YI SYLLABLE E
        0xA015, // [W] Lm YI SYLLABLE WU
        0xA016...0xA48C, // [W] Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR
        0xA490...0xA4C6, // [W] So [55] YI RADICAL QOT..YI RADICAL KE
        0xA960...0xA97C, // [W] Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH
        0xAC00...0xD7A3, // [W] Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
        0xF900...0xFA6D, // [W] Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
        0xFA6E...0xFA6F, // [W] Cn [2] <reserved-FA6E>..<reserved-FA6F>
        0xFA70...0xFAD9, // [W] Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
        0xFADA...0xFAFF, // [W] Cn [38] <reserved-FADA>..<reserved-FAFF>
        0xFE10...0xFE16, // [W] Po [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK
        0xFE17, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET
        0xFE18, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET
        0xFE19, // [W] Po PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
        0xFE30, // [W] Po PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
        0xFE31...0xFE32, // [W] Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH
        0xFE33...0xFE34, // [W] Pc [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE
        0xFE35, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
        0xFE36, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS
        0xFE37, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET
        0xFE38, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET
        0xFE39, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET
        0xFE3A, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET
        0xFE3B, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET
        0xFE3C, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET
        0xFE3D, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET
        0xFE3E, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET
        0xFE3F, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET
        0xFE40, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET
        0xFE41, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
        0xFE42, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
        0xFE43, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
        0xFE44, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
        0xFE45...0xFE46, // [W] Po [2] SESAME DOT..WHITE SESAME DOT
        0xFE47, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET
        0xFE48, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET
        0xFE49...0xFE4C, // [W] Po [4] DASHED OVERLINE..DOUBLE WAVY OVERLINE
        0xFE4D...0xFE4F, // [W] Pc [3] DASHED LOW LINE..WAVY LOW LINE
        0xFE50...0xFE52, // [W] Po [3] SMALL COMMA..SMALL FULL STOP
        0xFE54...0xFE57, // [W] Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK
        0xFE58, // [W] Pd SMALL EM DASH
        0xFE59, // [W] Ps SMALL LEFT PARENTHESIS
        0xFE5A, // [W] Pe SMALL RIGHT PARENTHESIS
        0xFE5B, // [W] Ps SMALL LEFT CURLY BRACKET
        0xFE5C, // [W] Pe SMALL RIGHT CURLY BRACKET
        0xFE5D, // [W] Ps SMALL LEFT TORTOISE SHELL BRACKET
        0xFE5E, // [W] Pe SMALL RIGHT TORTOISE SHELL BRACKET
        0xFE5F...0xFE61, // [W] Po [3] SMALL NUMBER SIGN..SMALL ASTERISK
        0xFE62, // [W] Sm SMALL PLUS SIGN
        0xFE63, // [W] Pd SMALL HYPHEN-MINUS
        0xFE64...0xFE66, // [W] Sm [3] SMALL LESS-THAN SIGN..SMALL EQUALS SIGN
        0xFE68, // [W] Po SMALL REVERSE SOLIDUS
        0xFE69, // [W] Sc SMALL DOLLAR SIGN
        0xFE6A...0xFE6B, // [W] Po [2] SMALL PERCENT SIGN..SMALL COMMERCIAL AT
        0xFF01...0xFF03, // [F] Po [3] FULLWIDTH EXCLAMATION MARK..FULLWIDTH NUMBER SIGN
        0xFF04, // [F] Sc FULLWIDTH DOLLAR SIGN
        0xFF05...0xFF07, // [F] Po [3] FULLWIDTH PERCENT SIGN..FULLWIDTH APOSTROPHE
        0xFF08, // [F] Ps FULLWIDTH LEFT PARENTHESIS
        0xFF09, // [F] Pe FULLWIDTH RIGHT PARENTHESIS
        0xFF0A, // [F] Po FULLWIDTH ASTERISK
        0xFF0B, // [F] Sm FULLWIDTH PLUS SIGN
        0xFF0C, // [F] Po FULLWIDTH COMMA
        0xFF0D, // [F] Pd FULLWIDTH HYPHEN-MINUS
        0xFF0E...0xFF0F, // [F] Po [2] FULLWIDTH FULL STOP..FULLWIDTH SOLIDUS
        0xFF10...0xFF19, // [F] Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
        0xFF1A...0xFF1B, // [F] Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON
        0xFF1C...0xFF1E, // [F] Sm [3] FULLWIDTH LESS-THAN SIGN..FULLWIDTH GREATER-THAN SIGN
        0xFF1F...0xFF20, // [F] Po [2] FULLWIDTH QUESTION MARK..FULLWIDTH COMMERCIAL AT
        0xFF21...0xFF3A, // [F] Lu [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
        0xFF3B, // [F] Ps FULLWIDTH LEFT SQUARE BRACKET
        0xFF3C, // [F] Po FULLWIDTH REVERSE SOLIDUS
        0xFF3D, // [F] Pe FULLWIDTH RIGHT SQUARE BRACKET
        0xFF3E, // [F] Sk FULLWIDTH CIRCUMFLEX ACCENT
        0xFF3F, // [F] Pc FULLWIDTH LOW LINE
        0xFF40, // [F] Sk FULLWIDTH GRAVE ACCENT
        0xFF41...0xFF5A, // [F] Ll [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
        0xFF5B, // [F] Ps FULLWIDTH LEFT CURLY BRACKET
        0xFF5C, // [F] Sm FULLWIDTH VERTICAL LINE
        0xFF5D, // [F] Pe FULLWIDTH RIGHT CURLY BRACKET
        0xFF5E, // [F] Sm FULLWIDTH TILDE
        0xFF5F, // [F] Ps FULLWIDTH LEFT WHITE PARENTHESIS
        0xFF60, // [F] Pe FULLWIDTH RIGHT WHITE PARENTHESIS
        0xFFE0...0xFFE1, // [F] Sc [2] FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN
        0xFFE2, // [F] Sm FULLWIDTH NOT SIGN
        0xFFE3, // [F] Sk FULLWIDTH MACRON
        0xFFE4, // [F] So FULLWIDTH BROKEN BAR
        0xFFE5...0xFFE6, // [F] Sc [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN
        0x16FE0...0x16FE1, // [W] Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK
        0x16FE2, // [W] Po OLD CHINESE HOOK MARK
        0x16FE3, // [W] Lm OLD CHINESE ITERATION MARK
        0x16FE4, // [W] Mn KHITAN SMALL SCRIPT FILLER
        0x16FF0...0x16FF1, // [W] Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY
        0x17000...0x187F7, // [W] Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7
        0x18800...0x18AFF, // [W] Lo [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768
        0x18B00...0x18CD5, // [W] Lo [470] KHITAN SMALL SCRIPT CHARACTER-18B00..KHITAN SMALL SCRIPT CHARACTER-18CD5
        0x18D00...0x18D08, // [W] Lo [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08
        0x1AFF0...0x1AFF3, // [W] Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5
        0x1AFF5...0x1AFFB, // [W] Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
        0x1AFFD...0x1AFFE, // [W] Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
        0x1B000...0x1B0FF, // [W] Lo [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2
        0x1B100...0x1B122, // [W] Lo [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU
        0x1B132, // [W] Lo HIRAGANA LETTER SMALL KO
        0x1B150...0x1B152, // [W] Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
        0x1B155, // [W] Lo KATAKANA LETTER SMALL KO
        0x1B164...0x1B167, // [W] Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
        0x1B170...0x1B2FB, // [W] Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
        0x1F004, // [W] So MAHJONG TILE RED DRAGON
        0x1F0CF, // [W] So PLAYING CARD BLACK JOKER
        0x1F18E, // [W] So NEGATIVE SQUARED AB
        0x1F191...0x1F19A, // [W] So [10] SQUARED CL..SQUARED VS
        0x1F200...0x1F202, // [W] So [3] SQUARE HIRAGANA HOKA..SQUARED KATAKANA SA
        0x1F210...0x1F23B, // [W] So [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D
        0x1F240...0x1F248, // [W] So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557
        0x1F250...0x1F251, // [W] So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
        0x1F260...0x1F265, // [W] So [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI
        0x1F300...0x1F320, // [W] So [33] CYCLONE..SHOOTING STAR
        0x1F32D...0x1F335, // [W] So [9] HOT DOG..CACTUS
        0x1F337...0x1F37C, // [W] So [70] TULIP..BABY BOTTLE
        0x1F37E...0x1F393, // [W] So [22] BOTTLE WITH POPPING CORK..GRADUATION CAP
        0x1F3A0...0x1F3CA, // [W] So [43] CAROUSEL HORSE..SWIMMER
        0x1F3CF...0x1F3D3, // [W] So [5] CRICKET BAT AND BALL..TABLE TENNIS PADDLE AND BALL
        0x1F3E0...0x1F3F0, // [W] So [17] HOUSE BUILDING..EUROPEAN CASTLE
        0x1F3F4, // [W] So WAVING BLACK FLAG
        0x1F3F8...0x1F3FA, // [W] So [3] BADMINTON RACQUET AND SHUTTLECOCK..AMPHORA
        0x1F3FB...0x1F3FF, // [W] Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
        0x1F400...0x1F43E, // [W] So [63] RAT..PAW PRINTS
        0x1F440, // [W] So EYES
        0x1F442...0x1F4FC, // [W] So [187] EAR..VIDEOCASSETTE
        0x1F4FF...0x1F53D, // [W] So [63] PRAYER BEADS..DOWN-POINTING SMALL RED TRIANGLE
        0x1F54B...0x1F54E, // [W] So [4] KAABA..MENORAH WITH NINE BRANCHES
        0x1F550...0x1F567, // [W] So [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY
        0x1F57A, // [W] So MAN DANCING
        0x1F595...0x1F596, // [W] So [2] REVERSED HAND WITH MIDDLE FINGER EXTENDED..RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS
        0x1F5A4, // [W] So BLACK HEART
        0x1F5FB...0x1F5FF, // [W] So [5] MOUNT FUJI..MOYAI
        0x1F600...0x1F64F, // [W] So [80] GRINNING FACE..PERSON WITH FOLDED HANDS
        0x1F680...0x1F6C5, // [W] So [70] ROCKET..LEFT LUGGAGE
        0x1F6CC, // [W] So SLEEPING ACCOMMODATION
        0x1F6D0...0x1F6D2, // [W] So [3] PLACE OF WORSHIP..SHOPPING TROLLEY
        0x1F6D5...0x1F6D7, // [W] So [3] HINDU TEMPLE..ELEVATOR
        0x1F6DC...0x1F6DF, // [W] So [4] WIRELESS..RING BUOY
        0x1F6EB...0x1F6EC, // [W] So [2] AIRPLANE DEPARTURE..AIRPLANE ARRIVING
        0x1F6F4...0x1F6FC, // [W] So [9] SCOOTER..ROLLER SKATE
        0x1F7E0...0x1F7EB, // [W] So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
        0x1F7F0, // [W] So HEAVY EQUALS SIGN
        0x1F90C...0x1F93A, // [W] So [47] PINCHED FINGERS..FENCER
        0x1F93C...0x1F945, // [W] So [10] WRESTLERS..GOAL NET
        0x1F947...0x1F9FF, // [W] So [185] FIRST PLACE MEDAL..NAZAR AMULET
        0x1FA70...0x1FA7C, // [W] So [13] BALLET SHOES..CRUTCH
        0x1FA80...0x1FA88, // [W] So [9] YO-YO..FLUTE
        0x1FA90...0x1FABD, // [W] So [46] RINGED PLANET..WING
        0x1FABF...0x1FAC5, // [W] So [7] GOOSE..PERSON WITH CROWN
        0x1FACE...0x1FADB, // [W] So [14] MOOSE..PEA POD
        0x1FAE0...0x1FAE8, // [W] So [9] MELTING FACE..SHAKING FACE
        0x1FAF0...0x1FAF8, // [W] So [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND
        0x20000...0x2A6DF, // [W] Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
        0x2A6E0...0x2A6FF, // [W] Cn [32] <reserved-2A6E0>..<reserved-2A6FF>
        0x2A700...0x2B739, // [W] Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
        0x2B73A...0x2B73F, // [W] Cn [6] <reserved-2B73A>..<reserved-2B73F>
        0x2B740...0x2B81D, // [W] Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
        0x2B81E...0x2B81F, // [W] Cn [2] <reserved-2B81E>..<reserved-2B81F>
        0x2B820...0x2CEA1, // [W] Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
        0x2CEA2...0x2CEAF, // [W] Cn [14] <reserved-2CEA2>..<reserved-2CEAF>
        0x2CEB0...0x2EBE0, // [W] Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
        0x2EBE1...0x2EBEF, // [W] Cn [15] <reserved-2EBE1>..<reserved-2EBEF>
        0x2EBF0...0x2EE5D, // [W] Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D
        0x2EE5E...0x2F7FF, // [W] Cn [2466] <reserved-2EE5E>..<reserved-2F7FF>
        0x2F800...0x2FA1D, // [W] Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
        0x2FA1E...0x2FA1F, // [W] Cn [2] <reserved-2FA1E>..<reserved-2FA1F>
        0x2FA20...0x2FFFD, // [W] Cn [1502] <reserved-2FA20>..<reserved-2FFFD>
        0x30000...0x3134A, // [W] Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
        0x3134B...0x3134F, // [W] Cn [5] <reserved-3134B>..<reserved-3134F>
        0x31350...0x323AF, // [W] Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
        0x323B0...0x3FFFD, // [W] Cn [56398] <reserved-323B0>..<reserved-3FFFD>
        => true,
        else => false,
    };
}
|
||
|
||
/// Returns the number of terminal columns the code point `cp` occupies
/// (0, 1 or 2).
///
/// Convenience wrapper around `visibleCodepointWidthType` that takes the
/// code point's integer type from the argument itself.
pub fn visibleCodepointWidth(cp: anytype) u3 {
    // `visibleCodepointWidthType` returns `usize`, which does not implicitly
    // narrow to `u3`. The value is only ever 0, 1 or 2, so the cast is lossless.
    return @intCast(visibleCodepointWidthType(@TypeOf(cp), cp));
}
|
||
|
||
/// Computes the display width, in terminal columns, of the code point `cp`
/// whose integer type is `T`.
///
/// Returns 0 when `isZeroWidthCodepointType` accepts `cp`, 2 when
/// `isFullWidthCodepointType` accepts it, and 1 otherwise. The zero-width
/// check takes precedence.
pub fn visibleCodepointWidthType(comptime T: type, cp: T) usize {
    if (isZeroWidthCodepointType(T, cp)) return 0;
    return if (isFullWidthCodepointType(T, cp)) 2 else 1;
}
|
||
|
||
/// Counts the code units in `input_` whose value is strictly greater than
/// 0x1f, i.e. every unit that is not a C0 control character counts as one
/// column.
///
/// `input_` is a slice; the callers in this file pass `[]const u8` and
/// `[]const u16`. No other ranges are excluded here, so 0x7f and any unit
/// above it also count as one column.
pub fn visibleASCIIWidth(input_: anytype) usize {
    var length: usize = 0;
    var input = input_;

    if (comptime Environment.enableSIMD) {
        // https://zig.godbolt.org/z/hxhjncvq7
        const ElementType = std.meta.Child(@TypeOf(input_));
        // Number of elements that fit in one 16-byte vector.
        const simd = 16 / @sizeOf(ElementType);
        if (input.len >= simd) {
            const input_end = input.ptr + input.len - (input.len % simd);
            while (input.ptr != input_end) {
                const chunk: @Vector(simd, ElementType) = input[0..simd].*;
                input = input[simd..];

                const cmp: @Vector(simd, ElementType) = @splat(0x1f);
                // Strictly greater-than, so that 0x1f is excluded exactly as in
                // the scalar tail loop below. Using `>=` here made the result
                // depend on where in the input a 0x1f happened to sit.
                const match1: @Vector(simd, u1) = @bitCast(chunk > cmp);
                const match: @Vector(simd, ElementType) = match1;

                // Each lane is 0 or 1, so the sum is the number of matches.
                length += @reduce(.Add, match);
            }
        }

        // this is a deliberate compiler optimization
        // it disables auto-vectorizing the "input" for loop.
        if (!(input.len < simd)) unreachable;
    }

    // Scalar tail (or the whole input when SIMD is disabled).
    for (input) |c| {
        length += if (c > 0x1f) 1 else 0;
    }

    return length;
}
|
||
|
||
/// Sums the terminal column widths of the UTF-8 (WTF-8) text in `input`.
///
/// ASCII runs are measured with `visibleASCIIWidth`. Each non-ASCII sequence
/// is decoded to a code point and measured with `visibleCodepointWidthType`.
pub fn visibleUTF8Width(input: []const u8) usize {
    var bytes = input;
    var len: usize = 0;
    while (bun.strings.firstNonASCII(bytes)) |i| {
        // Everything before index `i` is ASCII.
        len += visibleASCIIWidth(bytes[0..i]);

        // The multi-byte sequence starts at `i`, so its continuation bytes must
        // be read relative to that offset, not from the start of `bytes`.
        const this_chunk = bytes[i..];
        const byte = this_chunk[0];
        const skip = bun.strings.wtf8ByteSequenceLengthWithInvalid(byte);

        // Clamp to the bytes actually available so a sequence truncated at the
        // end of the input does not index out of bounds; missing bytes are
        // zero-filled.
        // NOTE(review): presumably decodeWTF8RuneTMultibyte treats a zero
        // continuation byte as invalid and yields `unicode_replacement` — confirm.
        const cp_bytes: [4]u8 = switch (@min(@as(usize, skip), this_chunk.len)) {
            inline 1, 2, 3, 4 => |cp_len| .{
                byte,
                if (comptime cp_len > 1) this_chunk[1] else 0,
                if (comptime cp_len > 2) this_chunk[2] else 0,
                if (comptime cp_len > 3) this_chunk[3] else 0,
            },
            else => unreachable,
        };

        const cp = decodeWTF8RuneTMultibyte(&cp_bytes, skip, u32, unicode_replacement);
        len += visibleCodepointWidthType(u32, cp);

        // Advance past the sequence, never beyond the end of the input.
        bytes = bytes[@min(i + skip, bytes.len)..];
    }

    // Trailing ASCII run (or the whole input when it contains no non-ASCII bytes).
    len += visibleASCIIWidth(bytes);

    return len;
}
|
||
|
||
/// Sums the terminal column widths of the UTF-16 text in `input`.
///
/// ASCII runs are measured with `visibleASCIIWidth`. At each non-ASCII
/// position one code point is decoded with `utf16CodepointWithFFFD` and
/// measured with `visibleCodepointWidthType`.
pub fn visibleUTF16Width(input: []const u16) usize {
    var remaining = input;
    var width: usize = 0;

    while (bun.strings.firstNonASCII16CheckMin([]const u16, remaining, false)) |ascii_len| {
        // Measure the ASCII prefix, then drop it.
        width += visibleASCIIWidth(remaining[0..ascii_len]);
        remaining = remaining[ascii_len..];

        // Decode the code point that now sits at the front.
        const decoded = utf16CodepointWithFFFD([]const u16, remaining);
        width += visibleCodepointWidthType(u32, decoded.code_point);

        // Skip the consumed code units, never past the end of the slice.
        const consumed = @min(@as(usize, decoded.len), remaining.len);
        remaining = remaining[consumed..];
    }

    // Whatever is left contains no non-ASCII units.
    width += visibleASCIIWidth(remaining);
    return width;
}
|
||
|
||
/// Returns the visible width of the Latin-1 text in `input`.
///
/// Delegates directly to `visibleASCIIWidth`, treating every byte as a single
/// code unit.
pub fn visibleLatin1Width(input: []const u8) usize {
    const width = visibleASCIIWidth(input);
    return width;
}
|
||
|
||
/// Formatter wrapper that writes `data` with double quotes, newlines and
/// backslashes escaped as `\"`, `\n` and `\\`. All other bytes are written
/// unchanged.
pub const QuoteEscapeFormat = struct {
    /// The raw bytes to escape.
    data: []const u8,

    /// Writes the escaped form of `data` to `writer`. The format string and
    /// options are ignored. Any error from `writer.writeAll` is propagated.
    pub fn format(self: QuoteEscapeFormat, comptime _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
        var rest = self.data;

        while (std.mem.indexOfAny(u8, rest, "\"\n\\")) |at| {
            // Literal run preceding the character that needs escaping.
            try writer.writeAll(rest[0..at]);

            const escaped: []const u8 = switch (rest[at]) {
                '"' => "\\\"",
                '\n' => "\\n",
                '\\' => "\\\\",
                else => unreachable,
            };
            try writer.writeAll(escaped);

            rest = rest[at + 1 ..];
        }

        // Trailing literal run, if any.
        if (rest.len != 0) {
            try writer.writeAll(rest);
        }
    }
};
|