mirror of
https://github.com/oven-sh/bun
synced 2026-02-09 18:38:55 +00:00
Use Highway SIMD (#19134)
Co-authored-by: Dylan Conway <dylan.conway567@gmail.com> Co-authored-by: Dylan Conway <35280289+dylan-conway@users.noreply.github.com> Co-authored-by: Jarred-Sumner <709451+Jarred-Sumner@users.noreply.github.com>
This commit is contained in:
@@ -96,43 +96,14 @@ fn literalLength(comptime T: type, comptime str: string) usize {
|
||||
|
||||
pub const OptionalUsize = std.meta.Int(.unsigned, @bitSizeOf(usize) - 1);
|
||||
pub fn indexOfAny(slice: string, comptime str: []const u8) ?OptionalUsize {
|
||||
switch (comptime str.len) {
|
||||
return switch (comptime str.len) {
|
||||
0 => @compileError("str cannot be empty"),
|
||||
1 => return indexOfChar(slice, str[0]),
|
||||
else => {},
|
||||
}
|
||||
|
||||
var remaining = slice;
|
||||
if (remaining.len == 0) return null;
|
||||
|
||||
if (comptime Environment.enableSIMD) {
|
||||
while (remaining.len >= ascii_vector_size) {
|
||||
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
|
||||
var cmp: AsciiVectorU1 = @bitCast(vec == @as(AsciiVector, @splat(@as(u8, str[0]))));
|
||||
inline for (str[1..]) |c| {
|
||||
cmp |= @bitCast(vec == @as(AsciiVector, @splat(@as(u8, c))));
|
||||
}
|
||||
|
||||
if (@reduce(.Max, cmp) > 0) {
|
||||
const bitmask = @as(AsciiVectorInt, @bitCast(cmp));
|
||||
const first = @ctz(bitmask);
|
||||
|
||||
return @as(OptionalUsize, @intCast(first + slice.len - remaining.len));
|
||||
}
|
||||
|
||||
remaining = remaining[ascii_vector_size..];
|
||||
}
|
||||
|
||||
if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size);
|
||||
}
|
||||
|
||||
for (remaining, 0..) |c, i| {
|
||||
if (strings.indexOfChar(str, c) != null) {
|
||||
return @as(OptionalUsize, @intCast(i + slice.len - remaining.len));
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
else => if (bun.highway.indexOfAnyChar(slice, str)) |i|
|
||||
@intCast(i)
|
||||
else
|
||||
null,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn indexOfAny16(self: []const u16, comptime str: anytype) ?OptionalUsize {
|
||||
@@ -177,7 +148,7 @@ pub fn inMapCaseInsensitive(self: []const u8, comptime ComptimeStringMap: anytyp
|
||||
return bun.String.ascii(self).inMapCaseInsensitive(ComptimeStringMap);
|
||||
}
|
||||
|
||||
pub inline fn containsAny(in: anytype, target: string) bool {
|
||||
pub inline fn containsAny(in: anytype, target: anytype) bool {
|
||||
for (in) |str| if (contains(if (@TypeOf(str) == u8) &[1]u8{str} else bun.span(str), target)) return true;
|
||||
return false;
|
||||
}
|
||||
@@ -496,7 +467,7 @@ pub inline fn lastIndexOf(self: string, str: string) ?usize {
|
||||
return std.mem.lastIndexOf(u8, self, str);
|
||||
}
|
||||
|
||||
pub inline fn indexOf(self: string, str: string) ?usize {
|
||||
pub fn indexOf(self: string, str: string) ?usize {
|
||||
if (comptime !bun.Environment.isNative) {
|
||||
return std.mem.indexOf(u8, self, str);
|
||||
}
|
||||
@@ -990,16 +961,13 @@ pub fn endsWithAnyComptime(self: string, comptime str: string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn eql(self: string, other: anytype) bool {
|
||||
pub fn eql(self: string, other: []const u8) bool {
|
||||
if (self.len != other.len) return false;
|
||||
if (comptime @TypeOf(other) == *string) {
|
||||
return eql(self, other.*);
|
||||
}
|
||||
|
||||
for (self, 0..) |c, i| {
|
||||
if (other[i] != c) return false;
|
||||
}
|
||||
return true;
|
||||
return eqlLong(self, other, false);
|
||||
}
|
||||
|
||||
pub fn eqlComptimeT(comptime T: type, self: []const T, comptime alt: anytype) bool {
|
||||
@@ -1367,43 +1335,11 @@ pub fn copyU8IntoU16WithAlignment(comptime alignment: u21, output_: []align(alig
|
||||
// }
|
||||
// }
|
||||
|
||||
pub inline fn copyU16IntoU8(output_: []u8, comptime InputType: type, input_: InputType) void {
|
||||
if (comptime Environment.allow_assert) assert(input_.len <= output_.len);
|
||||
var output = output_;
|
||||
var input = input_;
|
||||
pub inline fn copyU16IntoU8(output: []u8, input: []align(1) const u16) void {
|
||||
if (comptime Environment.allow_assert) assert(input.len <= output.len);
|
||||
const count = @min(input.len, output.len);
|
||||
|
||||
// https://zig.godbolt.org/z/9rTn1orcY
|
||||
|
||||
const group = @as(usize, 16);
|
||||
// end at the last group of 16 bytes
|
||||
var input_ptr = input.ptr;
|
||||
var output_ptr = output.ptr;
|
||||
|
||||
if (comptime Environment.enableSIMD) {
|
||||
const end_len = (@min(input.len, output.len) & ~(group - 1));
|
||||
const last_vector_ptr = input.ptr + end_len;
|
||||
while (last_vector_ptr != input_ptr) {
|
||||
const input_vec1: @Vector(group, u16) = input_ptr[0..group].*;
|
||||
inline for (0..group) |i| {
|
||||
output_ptr[i] = @as(u8, @truncate(input_vec1[i]));
|
||||
}
|
||||
|
||||
output_ptr += group;
|
||||
input_ptr += group;
|
||||
}
|
||||
|
||||
input.len -= end_len;
|
||||
output.len -= end_len;
|
||||
}
|
||||
|
||||
const last_input_ptr = input_ptr + @min(input.len, output.len);
|
||||
|
||||
while (last_input_ptr != input_ptr) {
|
||||
output_ptr[0] = @as(u8, @truncate(input_ptr[0]));
|
||||
output_ptr += 1;
|
||||
input_ptr += 1;
|
||||
}
|
||||
bun.highway.copyU16ToU8(input[0..count], output[0..count]);
|
||||
}
|
||||
|
||||
const strings = @This();
|
||||
@@ -2353,11 +2289,7 @@ pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf1
|
||||
}
|
||||
list.items.len += i;
|
||||
|
||||
copyU16IntoU8(
|
||||
list.items[list.items.len - i ..],
|
||||
Type,
|
||||
to_copy,
|
||||
);
|
||||
copyU16IntoU8(list.items[list.items.len - i ..], to_copy);
|
||||
|
||||
if (comptime skip_trailing_replacement) {
|
||||
if (replacement.is_lead and utf16_remaining.len == 0) {
|
||||
@@ -2377,7 +2309,7 @@ pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf1
|
||||
try list.ensureTotalCapacityPrecise(utf16_remaining.len + list.items.len);
|
||||
const old_len = list.items.len;
|
||||
list.items.len += utf16_remaining.len;
|
||||
copyU16IntoU8(list.items[old_len..], Type, utf16_remaining);
|
||||
copyU16IntoU8(list.items[old_len..], utf16_remaining);
|
||||
}
|
||||
|
||||
log("UTF16 {d} -> {d} UTF8", .{ utf16.len, list.items.len });
|
||||
@@ -2794,43 +2726,8 @@ pub fn replaceLatin1WithUTF8(buf_: []u8) void {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn elementLengthLatin1IntoUTF8(comptime Type: type, latin1_: Type) usize {
|
||||
// https://zig.godbolt.org/z/zzYexPPs9
|
||||
|
||||
var latin1 = latin1_;
|
||||
const input_len = latin1.len;
|
||||
var total_non_ascii_count: usize = 0;
|
||||
|
||||
// This is about 30% faster on large input compared to auto-vectorization
|
||||
if (comptime Environment.enableSIMD) {
|
||||
const end = latin1.ptr + (latin1.len - (latin1.len % ascii_vector_size));
|
||||
while (latin1.ptr != end) {
|
||||
const vec: AsciiVector = latin1[0..ascii_vector_size].*;
|
||||
|
||||
// Shifting a unsigned 8 bit integer to the right by 7 bits always produces a value of 0 or 1.
|
||||
const cmp = vec >> @as(AsciiVector, @splat(
|
||||
@as(u8, 7),
|
||||
));
|
||||
|
||||
// Anding that value rather than converting it into a @Vector(16, u1) produces better code from LLVM.
|
||||
const mask: AsciiVector = cmp & @as(AsciiVector, @splat(
|
||||
@as(u8, 1),
|
||||
));
|
||||
|
||||
total_non_ascii_count += @as(usize, @reduce(.Add, mask));
|
||||
latin1 = latin1[ascii_vector_size..];
|
||||
}
|
||||
|
||||
// an important hint to the compiler to not auto-vectorize the loop below
|
||||
if (latin1.len >= ascii_vector_size) unreachable;
|
||||
}
|
||||
|
||||
for (latin1) |c| {
|
||||
total_non_ascii_count += @as(usize, @intFromBool(c > 127));
|
||||
}
|
||||
|
||||
// each non-ascii latin1 character becomes 2 UTF8 characters
|
||||
return input_len + total_non_ascii_count;
|
||||
pub fn elementLengthLatin1IntoUTF8(slice: []const u8) usize {
|
||||
return bun.simdutf.length.utf8.from.latin1(slice);
|
||||
}
|
||||
|
||||
pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult {
|
||||
@@ -2865,20 +2762,7 @@ pub fn elementLengthLatin1IntoUTF16(comptime Type: type, latin1_: Type) usize {
|
||||
return latin1_.len;
|
||||
}
|
||||
|
||||
var count: usize = 0;
|
||||
var latin1 = latin1_;
|
||||
while (latin1.len > 0) {
|
||||
const function = comptime if (std.meta.Child(Type) == u8) strings.firstNonASCIIWithType else strings.firstNonASCII16;
|
||||
const to_write = function(Type, latin1) orelse @as(u32, @truncate(latin1.len));
|
||||
count += to_write;
|
||||
latin1 = latin1[to_write..];
|
||||
if (latin1.len > 0) {
|
||||
count += comptime if (std.meta.Child(Type) == u8) 2 else 1;
|
||||
latin1 = latin1[1..];
|
||||
}
|
||||
}
|
||||
|
||||
return count;
|
||||
return bun.simdutf.length.utf16.from.latin1(latin1_);
|
||||
}
|
||||
|
||||
pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) !Escaped(u8) {
|
||||
@@ -3605,7 +3489,7 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,
|
||||
|
||||
while (firstNonASCII16(Type, utf16_remaining)) |i| {
|
||||
const end = @min(i, remaining.len);
|
||||
if (end > 0) copyU16IntoU8(remaining, Type, utf16_remaining[0..end]);
|
||||
if (end > 0) copyU16IntoU8(remaining, utf16_remaining[0..end]);
|
||||
remaining = remaining[end..];
|
||||
utf16_remaining = utf16_remaining[end..];
|
||||
|
||||
@@ -3674,7 +3558,7 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,
|
||||
|
||||
if (remaining.len > 0 and !ended_on_non_ascii and utf16_remaining.len > 0) {
|
||||
const len = @min(remaining.len, utf16_remaining.len);
|
||||
copyU16IntoU8(remaining[0..len], Type, utf16_remaining[0..len]);
|
||||
copyU16IntoU8(remaining[0..len], utf16_remaining[0..len]);
|
||||
utf16_remaining = utf16_remaining[len..];
|
||||
remaining = remaining[len..];
|
||||
}
|
||||
@@ -4014,44 +3898,7 @@ pub fn isAllASCII(slice: []const u8) bool {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (bun.FeatureFlags.use_simdutf)
|
||||
return bun.simdutf.validate.ascii(slice);
|
||||
|
||||
var remaining = slice;
|
||||
|
||||
// The NEON SIMD unit is 128-bit wide and includes 16 128-bit registers that can be used as 32 64-bit registers
|
||||
if (comptime Environment.enableSIMD) {
|
||||
const remaining_end_ptr = remaining.ptr + remaining.len - (remaining.len % ascii_vector_size);
|
||||
while (remaining.ptr != remaining_end_ptr) : (remaining.ptr += ascii_vector_size) {
|
||||
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
|
||||
|
||||
if (@reduce(.Max, vec) > 127) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const Int = u64;
|
||||
const size = @sizeOf(Int);
|
||||
const remaining_last8 = slice.ptr + slice.len - (slice.len % size);
|
||||
while (remaining.ptr != remaining_last8) : (remaining.ptr += size) {
|
||||
const bytes = @as(Int, @bitCast(remaining[0..size].*));
|
||||
// https://dotat.at/@/2022-06-27-tolower-swar.html
|
||||
const mask = bytes & 0x8080808080808080;
|
||||
|
||||
if (mask > 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
const final = slice.ptr + slice.len;
|
||||
while (remaining.ptr != final) : (remaining.ptr += 1) {
|
||||
if (remaining[0] > 127) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
return bun.simdutf.validate.ascii(slice);
|
||||
}
|
||||
|
||||
// #define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
|
||||
@@ -4085,296 +3932,67 @@ pub inline fn u16GetSupplementary(lead: u32, trail: u32) u32 {
|
||||
pub const u16_surrogate_offset = 56613888;
|
||||
|
||||
pub fn firstNonASCII(slice: []const u8) ?u32 {
|
||||
return firstNonASCIIWithType([]const u8, slice);
|
||||
}
|
||||
|
||||
pub fn firstNonASCIIWithType(comptime Type: type, slice: Type) ?u32 {
|
||||
var remaining = slice;
|
||||
|
||||
if (comptime bun.FeatureFlags.use_simdutf) {
|
||||
const result = bun.simdutf.validate.with_errors.ascii(slice);
|
||||
if (result.status == .success) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return @as(u32, @truncate(result.count));
|
||||
}
|
||||
|
||||
if (comptime Environment.enableSIMD) {
|
||||
if (remaining.len >= ascii_vector_size) {
|
||||
const remaining_start = remaining.ptr;
|
||||
const remaining_end = remaining.ptr + remaining.len - (remaining.len % ascii_vector_size);
|
||||
|
||||
while (remaining.ptr != remaining_end) {
|
||||
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
|
||||
|
||||
if (@reduce(.Max, vec) > 127) {
|
||||
const Int = u64;
|
||||
const size = @sizeOf(Int);
|
||||
remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start);
|
||||
|
||||
{
|
||||
const bytes = @as(Int, @bitCast(remaining[0..size].*));
|
||||
// https://dotat.at/@/2022-06-27-tolower-swar.html
|
||||
const mask = bytes & 0x8080808080808080;
|
||||
|
||||
if (mask > 0) {
|
||||
const first_set_byte = @ctz(mask) / 8;
|
||||
if (comptime Environment.isDebug) {
|
||||
bun.assert(remaining[first_set_byte] > 127);
|
||||
for (0..first_set_byte) |j| {
|
||||
bun.assert(remaining[j] <= 127);
|
||||
}
|
||||
}
|
||||
|
||||
return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len));
|
||||
}
|
||||
remaining = remaining[size..];
|
||||
}
|
||||
{
|
||||
const bytes = @as(Int, @bitCast(remaining[0..size].*));
|
||||
const mask = bytes & 0x8080808080808080;
|
||||
|
||||
if (mask > 0) {
|
||||
const first_set_byte = @ctz(mask) / 8;
|
||||
if (comptime Environment.isDebug) {
|
||||
bun.assert(remaining[first_set_byte] > 127);
|
||||
for (0..first_set_byte) |j| {
|
||||
bun.assert(remaining[j] <= 127);
|
||||
}
|
||||
}
|
||||
|
||||
return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len));
|
||||
}
|
||||
}
|
||||
unreachable;
|
||||
}
|
||||
|
||||
// the more intuitive way, using slices, produces worse codegen
|
||||
// specifically: it subtracts the length at the end of the loop
|
||||
// we don't need to do that
|
||||
// we only need to subtract the length once at the very end
|
||||
remaining.ptr += ascii_vector_size;
|
||||
}
|
||||
remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
const Int = u64;
|
||||
const size = @sizeOf(Int);
|
||||
const remaining_start = remaining.ptr;
|
||||
const remaining_end = remaining.ptr + remaining.len - (remaining.len % size);
|
||||
|
||||
if (comptime Environment.enableSIMD) {
|
||||
// these assertions exist more so for LLVM
|
||||
bun.unsafeAssert(remaining.len < ascii_vector_size);
|
||||
bun.unsafeAssert(@intFromPtr(remaining.ptr + ascii_vector_size) > @intFromPtr(remaining_end));
|
||||
}
|
||||
|
||||
if (remaining.len >= size) {
|
||||
while (remaining.ptr != remaining_end) {
|
||||
const bytes = @as(Int, @bitCast(remaining[0..size].*));
|
||||
// https://dotat.at/@/2022-06-27-tolower-swar.html
|
||||
const mask = bytes & 0x8080808080808080;
|
||||
|
||||
if (mask > 0) {
|
||||
remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start);
|
||||
const first_set_byte = @ctz(mask) / 8;
|
||||
if (comptime Environment.isDebug) {
|
||||
bun.unsafeAssert(remaining[first_set_byte] > 127);
|
||||
for (0..first_set_byte) |j| {
|
||||
bun.unsafeAssert(remaining[j] <= 127);
|
||||
}
|
||||
}
|
||||
|
||||
return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len));
|
||||
}
|
||||
|
||||
remaining.ptr += size;
|
||||
}
|
||||
remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start);
|
||||
}
|
||||
}
|
||||
|
||||
if (comptime Environment.allow_assert) assert(remaining.len < 8);
|
||||
|
||||
for (remaining) |*char| {
|
||||
if (char.* > 127) {
|
||||
// try to prevent it from reading the length of the slice
|
||||
return @as(u32, @truncate(@intFromPtr(char) - @intFromPtr(slice.ptr)));
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
pub fn indexOfNewlineOrNonASCIIOrANSI(slice_: []const u8, offset: u32) ?u32 {
|
||||
const slice = slice_[offset..];
|
||||
var remaining = slice;
|
||||
|
||||
if (remaining.len == 0)
|
||||
const result = bun.simdutf.validate.with_errors.ascii(slice);
|
||||
if (result.status == .success) {
|
||||
return null;
|
||||
|
||||
if (comptime Environment.enableSIMD) {
|
||||
while (remaining.len >= ascii_vector_size) {
|
||||
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
|
||||
const cmp = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
|
||||
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r'))))) |
|
||||
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n'))))) |
|
||||
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\x1b')))));
|
||||
|
||||
if (@reduce(.Max, cmp) > 0) {
|
||||
const bitmask = @as(AsciiVectorInt, @bitCast(cmp));
|
||||
const first = @ctz(bitmask);
|
||||
|
||||
return @as(u32, first) + @as(u32, @intCast(slice.len - remaining.len)) + offset;
|
||||
}
|
||||
|
||||
remaining = remaining[ascii_vector_size..];
|
||||
}
|
||||
|
||||
if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size);
|
||||
}
|
||||
|
||||
for (remaining) |*char_| {
|
||||
const char = char_.*;
|
||||
if (char > 127 or char < 0x20 or char == '\n' or char == '\r' or char == '\x1b') {
|
||||
return @as(u32, @truncate((@intFromPtr(char_) - @intFromPtr(slice.ptr)))) + offset;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
return @as(u32, @truncate(result.count));
|
||||
}
|
||||
|
||||
pub const indexOfNewlineOrNonASCIIOrANSI = indexOfNewlineOrNonASCII;
|
||||
|
||||
/// Checks if slice[offset..] has any < 0x20 or > 127 characters
|
||||
pub fn indexOfNewlineOrNonASCII(slice_: []const u8, offset: u32) ?u32 {
|
||||
return indexOfNewlineOrNonASCIICheckStart(slice_, offset, true);
|
||||
}
|
||||
|
||||
pub fn indexOfSpaceOrNewlineOrNonASCII(slice_: []const u8, offset: u32) ?u32 {
|
||||
const slice = slice_[offset..];
|
||||
const remaining = slice;
|
||||
|
||||
if (remaining.len == 0)
|
||||
return null;
|
||||
|
||||
if (remaining[0] > 127 or (remaining[0] < 0x20 and remaining[0] != 0x09)) {
|
||||
return offset;
|
||||
}
|
||||
|
||||
const i = bun.highway.indexOfSpaceOrNewlineOrNonASCII(remaining) orelse return null;
|
||||
return @as(u32, @truncate(i)) + offset;
|
||||
}
|
||||
|
||||
pub fn indexOfNewlineOrNonASCIICheckStart(slice_: []const u8, offset: u32, comptime check_start: bool) ?u32 {
|
||||
const slice = slice_[offset..];
|
||||
var remaining = slice;
|
||||
const remaining = slice;
|
||||
|
||||
if (remaining.len == 0)
|
||||
return null;
|
||||
|
||||
if (comptime check_start) {
|
||||
// this shows up in profiling
|
||||
if (remaining[0] > 127 or remaining[0] < 0x20 or remaining[0] == '\r' or remaining[0] == '\n') {
|
||||
if (remaining[0] > 127 or (remaining[0] < 0x20 and remaining[0] != 0x09)) {
|
||||
return offset;
|
||||
}
|
||||
}
|
||||
|
||||
if (comptime Environment.enableSIMD) {
|
||||
while (remaining.len >= ascii_vector_size) {
|
||||
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
|
||||
const cmp = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
|
||||
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r'))))) |
|
||||
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n')))));
|
||||
|
||||
if (@reduce(.Max, cmp) > 0) {
|
||||
const bitmask = @as(AsciiVectorInt, @bitCast(cmp));
|
||||
const first = @ctz(bitmask);
|
||||
|
||||
return @as(u32, first) + @as(u32, @intCast(slice.len - remaining.len)) + offset;
|
||||
}
|
||||
|
||||
remaining = remaining[ascii_vector_size..];
|
||||
}
|
||||
|
||||
if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size);
|
||||
}
|
||||
|
||||
for (remaining) |*char_| {
|
||||
const char = char_.*;
|
||||
if (char > 127 or char < 0x20 or char == '\n' or char == '\r') {
|
||||
return @as(u32, @truncate((@intFromPtr(char_) - @intFromPtr(slice.ptr)))) + offset;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
const i = bun.highway.indexOfNewlineOrNonASCII(remaining) orelse return null;
|
||||
return @as(u32, @truncate(i)) + offset;
|
||||
}
|
||||
|
||||
pub fn containsNewlineOrNonASCIIOrQuote(slice_: []const u8) bool {
|
||||
const slice = slice_;
|
||||
var remaining = slice;
|
||||
|
||||
if (remaining.len == 0)
|
||||
return false;
|
||||
|
||||
if (comptime Environment.enableSIMD) {
|
||||
while (remaining.len >= ascii_vector_size) {
|
||||
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
|
||||
const cmp = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
|
||||
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r'))))) |
|
||||
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n'))))) |
|
||||
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '"')))));
|
||||
|
||||
if (@reduce(.Max, cmp) > 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
remaining = remaining[ascii_vector_size..];
|
||||
}
|
||||
|
||||
if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size);
|
||||
}
|
||||
|
||||
for (remaining) |*char_| {
|
||||
const char = char_.*;
|
||||
if (char > 127 or char < 0x20 or char == '\n' or char == '\r' or char == '"') {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
pub fn containsNewlineOrNonASCIIOrQuote(text: []const u8) bool {
|
||||
return bun.highway.containsNewlineOrNonASCIIOrQuote(text);
|
||||
}
|
||||
|
||||
/// JSON escape
|
||||
pub fn indexOfNeedsEscape(slice: []const u8, comptime quote_char: u8) ?u32 {
|
||||
var remaining = slice;
|
||||
if (remaining.len == 0)
|
||||
/// Supports:
|
||||
/// - `"`
|
||||
/// - `'`
|
||||
/// - "`"
|
||||
pub fn indexOfNeedsEscapeForJavaScriptString(slice: []const u8, quote_char: u8) ?u32 {
|
||||
if (slice.len == 0)
|
||||
return null;
|
||||
|
||||
if (remaining[0] >= 127 or remaining[0] < 0x20 or remaining[0] == '\\' or remaining[0] == quote_char or (quote_char == '`' and remaining[0] == '$')) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (comptime Environment.enableSIMD) {
|
||||
while (remaining.len >= ascii_vector_size) {
|
||||
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
|
||||
const cmp: AsciiVectorU1 = if (comptime quote_char == '`') ( //
|
||||
@as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) |
|
||||
@as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
|
||||
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\\'))))) |
|
||||
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, quote_char))))) |
|
||||
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '$'))))) //
|
||||
) else ( //
|
||||
@as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) |
|
||||
@as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
|
||||
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\\'))))) |
|
||||
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, quote_char))))) //
|
||||
);
|
||||
|
||||
if (@reduce(.Max, cmp) > 0) {
|
||||
const bitmask = @as(AsciiVectorInt, @bitCast(cmp));
|
||||
const first = @ctz(bitmask);
|
||||
|
||||
return @as(u32, first) + @as(u32, @truncate(@intFromPtr(remaining.ptr) - @intFromPtr(slice.ptr)));
|
||||
}
|
||||
|
||||
remaining = remaining[ascii_vector_size..];
|
||||
}
|
||||
}
|
||||
|
||||
for (remaining) |*char_| {
|
||||
const char = char_.*;
|
||||
if (char > 127 or char < 0x20 or char == '\\' or char == quote_char or (quote_char == '`' and char == '$')) {
|
||||
return @as(u32, @truncate(@intFromPtr(char_) - @intFromPtr(slice.ptr)));
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
return bun.highway.indexOfNeedsEscapeForJavaScriptString(slice, quote_char);
|
||||
}
|
||||
|
||||
pub fn indexOfNeedsURLEncode(slice: []const u8) ?u32 {
|
||||
@@ -4447,15 +4065,7 @@ pub fn indexOfNeedsURLEncode(slice: []const u8) ?u32 {
|
||||
}
|
||||
|
||||
pub fn indexOfCharZ(sliceZ: [:0]const u8, char: u8) ?u63 {
|
||||
const ptr = bun.C.strchr(sliceZ.ptr, char) orelse return null;
|
||||
const pos = @intFromPtr(ptr) - @intFromPtr(sliceZ.ptr);
|
||||
|
||||
if (comptime Environment.isDebug)
|
||||
bun.assert(@intFromPtr(sliceZ.ptr) <= @intFromPtr(ptr) and
|
||||
@intFromPtr(ptr) < @intFromPtr(sliceZ.ptr + sliceZ.len) and
|
||||
pos <= sliceZ.len);
|
||||
|
||||
return @as(u63, @truncate(pos));
|
||||
return @truncate(bun.highway.indexOfChar(sliceZ, char) orelse return null);
|
||||
}
|
||||
|
||||
pub fn indexOfChar(slice: []const u8, char: u8) ?u32 {
|
||||
@@ -4463,19 +4073,11 @@ pub fn indexOfChar(slice: []const u8, char: u8) ?u32 {
|
||||
}
|
||||
|
||||
pub fn indexOfCharUsize(slice: []const u8, char: u8) ?usize {
|
||||
if (slice.len == 0)
|
||||
return null;
|
||||
|
||||
if (comptime !Environment.isNative) {
|
||||
return std.mem.indexOfScalar(u8, slice, char);
|
||||
}
|
||||
|
||||
const ptr = bun.C.memchr(slice.ptr, char, slice.len) orelse return null;
|
||||
const i = @intFromPtr(ptr) - @intFromPtr(slice.ptr);
|
||||
bun.assert(i < slice.len);
|
||||
bun.assert(slice[i] == char);
|
||||
|
||||
return i;
|
||||
return bun.highway.indexOfChar(slice, char);
|
||||
}
|
||||
|
||||
pub fn indexOfCharPos(slice: []const u8, char: u8, start_index: usize) ?usize {
|
||||
@@ -4485,13 +4087,9 @@ pub fn indexOfCharPos(slice: []const u8, char: u8, start_index: usize) ?usize {
|
||||
|
||||
if (start_index >= slice.len) return null;
|
||||
|
||||
const ptr = bun.C.memchr(slice.ptr + start_index, char, slice.len - start_index) orelse
|
||||
return null;
|
||||
const i = @intFromPtr(ptr) - @intFromPtr(slice.ptr);
|
||||
bun.assert(i < slice.len);
|
||||
bun.assert(slice[i] == char);
|
||||
|
||||
return i;
|
||||
const result = bun.highway.indexOfChar(slice[start_index..], char) orelse return null;
|
||||
bun.debugAssert(slice.len > result + start_index);
|
||||
return result + start_index;
|
||||
}
|
||||
|
||||
pub fn indexOfAnyPosComptime(slice: []const u8, comptime chars: []const u8, start_index: usize) ?usize {
|
||||
@@ -4934,47 +4532,6 @@ pub fn firstNonASCII16(comptime Slice: type, slice: Slice) ?u32 {
|
||||
return null;
|
||||
}
|
||||
|
||||
/// Fast path for printing template literal strings
|
||||
pub fn @"nextUTF16NonASCIIOr$`\\"(
|
||||
comptime Slice: type,
|
||||
slice: Slice,
|
||||
) ?u32 {
|
||||
var remaining = slice;
|
||||
|
||||
if (comptime Environment.enableSIMD and Environment.isNative) {
|
||||
while (remaining.len >= ascii_u16_vector_size) {
|
||||
const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*;
|
||||
|
||||
const cmp = @as(AsciiVectorU16U1, @bitCast((vec > max_u16_ascii))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec < min_u16_ascii))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec == @as(AsciiU16Vector, @splat(@as(u16, '$')))))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec == @as(AsciiU16Vector, @splat(@as(u16, '`')))))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec == @as(AsciiU16Vector, @splat(@as(u16, '\\'))))));
|
||||
|
||||
const bitmask = @as(u8, @bitCast(cmp));
|
||||
const first = @ctz(bitmask);
|
||||
if (first < ascii_u16_vector_size) {
|
||||
return @as(u32, @intCast(@as(u32, first) +
|
||||
@as(u32, @intCast(slice.len - remaining.len))));
|
||||
}
|
||||
|
||||
remaining = remaining[ascii_u16_vector_size..];
|
||||
}
|
||||
}
|
||||
|
||||
for (remaining, 0..) |char, i| {
|
||||
switch (char) {
|
||||
'$', '`', '\\', 0...0x20 - 1, 128...std.math.maxInt(u16) => {
|
||||
return @as(u32, @truncate(i + (slice.len - remaining.len)));
|
||||
},
|
||||
|
||||
else => {},
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/// Convert potentially ill-formed UTF-8 or UTF-16 bytes to a Unicode Codepoint.
|
||||
/// - Invalid codepoints are replaced with `zero` parameter
|
||||
/// - Null bytes return 0
|
||||
@@ -5097,31 +4654,6 @@ pub fn lengthOfLeadingWhitespaceASCII(slice: string) usize {
|
||||
return slice.len;
|
||||
}
|
||||
|
||||
pub fn containsNonBmpCodePointUTF16(_text: []const u16) bool {
|
||||
const n = _text.len;
|
||||
if (n > 0) {
|
||||
var i: usize = 0;
|
||||
const text = _text[0 .. n - 1];
|
||||
while (i < n - 1) : (i += 1) {
|
||||
switch (text[i]) {
|
||||
// Check for a high surrogate
|
||||
0xD800...0xDBFF => {
|
||||
// Check for a low surrogate
|
||||
switch (text[i + 1]) {
|
||||
0xDC00...0xDFFF => {
|
||||
return true;
|
||||
},
|
||||
else => {},
|
||||
}
|
||||
},
|
||||
else => {},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
pub fn join(slices: []const string, delimiter: string, allocator: std.mem.Allocator) !string {
|
||||
return try std.mem.join(allocator, delimiter, slices);
|
||||
}
|
||||
@@ -5238,6 +4770,75 @@ pub fn NewCodePointIterator(comptime CodePointType_: type, comptime zeroValue: c
|
||||
return Iterator{ .bytes = str, .i = i, .c = zeroValue };
|
||||
}
|
||||
|
||||
const SkipResult = enum {
|
||||
eof,
|
||||
found,
|
||||
not_found,
|
||||
};
|
||||
|
||||
/// Advance forward until the scalar function returns true.
|
||||
/// THe simd function is "best effort" and expected to sometimes return a result which `scalar` will return false for.
|
||||
/// This is because we don't decode UTF-8 in the SIMD code path.
|
||||
pub fn skip(it: *const Iterator, cursor: *Cursor, simd: *const fn (input: []const u8) ?usize, scalar: *const fn (CodePointType) bool) SkipResult {
|
||||
while (true) {
|
||||
// 1. Get current position. Check for EOF.
|
||||
const current_byte_index = cursor.i;
|
||||
if (current_byte_index >= it.bytes.len) {
|
||||
return .not_found; // Reached end without finding
|
||||
}
|
||||
|
||||
// 2. Decode the *next* character using the standard iterator method.
|
||||
if (!next(it, cursor)) {
|
||||
return .not_found; // Reached end or error during decode
|
||||
}
|
||||
|
||||
// 3. Check if the character just decoded matches the scalar condition.
|
||||
if (scalar(it.c)) {
|
||||
return .found; // Found it!
|
||||
}
|
||||
|
||||
// 4. Optimization: Can we skip ahead using SIMD?
|
||||
// Scan starting from the byte *after* the character we just decoded.
|
||||
const next_scan_start_index = cursor.i;
|
||||
if (next_scan_start_index >= it.bytes.len) {
|
||||
// Just decoded the last character and it didn't match.
|
||||
return .not_found;
|
||||
}
|
||||
const remaining_slice = it.bytes[next_scan_start_index..];
|
||||
if (remaining_slice.len == 0) {
|
||||
return .not_found;
|
||||
}
|
||||
|
||||
// Ask SIMD for the next potential candidate.
|
||||
if (simd(remaining_slice)) |pos| {
|
||||
// SIMD found a potential candidate `pos` bytes ahead.
|
||||
if (pos > 0) {
|
||||
// Jump the byte index to the start of the potential candidate.
|
||||
cursor.i = next_scan_start_index + @as(u32, @intCast(pos));
|
||||
// Reset width so next() decodes correctly from the jumped position.
|
||||
cursor.width = 0;
|
||||
// Loop will continue, starting the decode from the new cursor.i.
|
||||
continue;
|
||||
}
|
||||
// If pos == 0, SIMD suggests the *immediate next* character.
|
||||
// No jump needed, just let the loop iterate naturally.
|
||||
// Fallthrough to the end of the loop.
|
||||
} else {
|
||||
// SIMD found no potential candidates in the rest of the string.
|
||||
// Since the SIMD search set is a superset of the scalar check set,
|
||||
// we can guarantee that no character satisfying `scalar` exists further.
|
||||
// Since the current character (decoded in step 2) also didn't match,
|
||||
// we can conclude the target character is not found.
|
||||
return .not_found;
|
||||
}
|
||||
|
||||
// If we reach here, it means SIMD returned pos=0.
|
||||
// Loop continues to the next iteration, processing the immediate next char.
|
||||
} // End while true
|
||||
|
||||
unreachable;
|
||||
}
|
||||
|
||||
pub inline fn next(it: *const Iterator, cursor: *Cursor) bool {
|
||||
const pos: u32 = @as(u32, cursor.width) + cursor.i;
|
||||
if (pos >= it.bytes.len) {
|
||||
@@ -5527,6 +5128,16 @@ pub fn leftHasAnyInRight(to_check: []const string, against: []const string) bool
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Returns true if the input has the prefix and the next character is not an identifier character
|
||||
/// Also returns true if the input ends with the prefix (i.e. EOF)
|
||||
///
|
||||
/// Example:
|
||||
/// ```zig
|
||||
/// // returns true
|
||||
/// hasPrefixWithWordBoundary("console.log", "console") // true
|
||||
/// hasPrefixWithWordBoundary("console.log", "log") // false
|
||||
/// hasPrefixWithWordBoundary("console.log", "console.log") // true
|
||||
/// ```
|
||||
pub fn hasPrefixWithWordBoundary(input: []const u8, comptime prefix: []const u8) bool {
|
||||
if (hasPrefixComptime(input, prefix)) {
|
||||
if (input.len == prefix.len) return true;
|
||||
@@ -5708,7 +5319,6 @@ pub fn mustEscapeYAMLString(contents: []const u8) bool {
|
||||
else => true,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn pathContainsNodeModulesFolder(path: []const u8) bool {
|
||||
return strings.contains(path, comptime std.fs.path.sep_str ++ "node_modules" ++ std.fs.path.sep_str);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user