Use Highway SIMD (#19134)

Co-authored-by: Dylan Conway <dylan.conway567@gmail.com>
Co-authored-by: Dylan Conway <35280289+dylan-conway@users.noreply.github.com>
Co-authored-by: Jarred-Sumner <709451+Jarred-Sumner@users.noreply.github.com>
This commit is contained in:
Jarred Sumner
2025-04-21 23:28:03 -07:00
committed by GitHub
parent b117d14650
commit 0471254e4e
19 changed files with 1841 additions and 872 deletions

View File

@@ -96,43 +96,14 @@ fn literalLength(comptime T: type, comptime str: string) usize {
pub const OptionalUsize = std.meta.Int(.unsigned, @bitSizeOf(usize) - 1);
pub fn indexOfAny(slice: string, comptime str: []const u8) ?OptionalUsize {
switch (comptime str.len) {
return switch (comptime str.len) {
0 => @compileError("str cannot be empty"),
1 => return indexOfChar(slice, str[0]),
else => {},
}
var remaining = slice;
if (remaining.len == 0) return null;
if (comptime Environment.enableSIMD) {
while (remaining.len >= ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
var cmp: AsciiVectorU1 = @bitCast(vec == @as(AsciiVector, @splat(@as(u8, str[0]))));
inline for (str[1..]) |c| {
cmp |= @bitCast(vec == @as(AsciiVector, @splat(@as(u8, c))));
}
if (@reduce(.Max, cmp) > 0) {
const bitmask = @as(AsciiVectorInt, @bitCast(cmp));
const first = @ctz(bitmask);
return @as(OptionalUsize, @intCast(first + slice.len - remaining.len));
}
remaining = remaining[ascii_vector_size..];
}
if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size);
}
for (remaining, 0..) |c, i| {
if (strings.indexOfChar(str, c) != null) {
return @as(OptionalUsize, @intCast(i + slice.len - remaining.len));
}
}
return null;
else => if (bun.highway.indexOfAnyChar(slice, str)) |i|
@intCast(i)
else
null,
};
}
pub fn indexOfAny16(self: []const u16, comptime str: anytype) ?OptionalUsize {
@@ -177,7 +148,7 @@ pub fn inMapCaseInsensitive(self: []const u8, comptime ComptimeStringMap: anytyp
return bun.String.ascii(self).inMapCaseInsensitive(ComptimeStringMap);
}
pub inline fn containsAny(in: anytype, target: string) bool {
pub inline fn containsAny(in: anytype, target: anytype) bool {
for (in) |str| if (contains(if (@TypeOf(str) == u8) &[1]u8{str} else bun.span(str), target)) return true;
return false;
}
@@ -496,7 +467,7 @@ pub inline fn lastIndexOf(self: string, str: string) ?usize {
return std.mem.lastIndexOf(u8, self, str);
}
pub inline fn indexOf(self: string, str: string) ?usize {
pub fn indexOf(self: string, str: string) ?usize {
if (comptime !bun.Environment.isNative) {
return std.mem.indexOf(u8, self, str);
}
@@ -990,16 +961,13 @@ pub fn endsWithAnyComptime(self: string, comptime str: string) bool {
}
}
pub fn eql(self: string, other: anytype) bool {
pub fn eql(self: string, other: []const u8) bool {
if (self.len != other.len) return false;
if (comptime @TypeOf(other) == *string) {
return eql(self, other.*);
}
for (self, 0..) |c, i| {
if (other[i] != c) return false;
}
return true;
return eqlLong(self, other, false);
}
pub fn eqlComptimeT(comptime T: type, self: []const T, comptime alt: anytype) bool {
@@ -1367,43 +1335,11 @@ pub fn copyU8IntoU16WithAlignment(comptime alignment: u21, output_: []align(alig
// }
// }
pub inline fn copyU16IntoU8(output_: []u8, comptime InputType: type, input_: InputType) void {
if (comptime Environment.allow_assert) assert(input_.len <= output_.len);
var output = output_;
var input = input_;
pub inline fn copyU16IntoU8(output: []u8, input: []align(1) const u16) void {
if (comptime Environment.allow_assert) assert(input.len <= output.len);
const count = @min(input.len, output.len);
// https://zig.godbolt.org/z/9rTn1orcY
const group = @as(usize, 16);
// end at the last group of 16 bytes
var input_ptr = input.ptr;
var output_ptr = output.ptr;
if (comptime Environment.enableSIMD) {
const end_len = (@min(input.len, output.len) & ~(group - 1));
const last_vector_ptr = input.ptr + end_len;
while (last_vector_ptr != input_ptr) {
const input_vec1: @Vector(group, u16) = input_ptr[0..group].*;
inline for (0..group) |i| {
output_ptr[i] = @as(u8, @truncate(input_vec1[i]));
}
output_ptr += group;
input_ptr += group;
}
input.len -= end_len;
output.len -= end_len;
}
const last_input_ptr = input_ptr + @min(input.len, output.len);
while (last_input_ptr != input_ptr) {
output_ptr[0] = @as(u8, @truncate(input_ptr[0]));
output_ptr += 1;
input_ptr += 1;
}
bun.highway.copyU16ToU8(input[0..count], output[0..count]);
}
const strings = @This();
@@ -2353,11 +2289,7 @@ pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf1
}
list.items.len += i;
copyU16IntoU8(
list.items[list.items.len - i ..],
Type,
to_copy,
);
copyU16IntoU8(list.items[list.items.len - i ..], to_copy);
if (comptime skip_trailing_replacement) {
if (replacement.is_lead and utf16_remaining.len == 0) {
@@ -2377,7 +2309,7 @@ pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf1
try list.ensureTotalCapacityPrecise(utf16_remaining.len + list.items.len);
const old_len = list.items.len;
list.items.len += utf16_remaining.len;
copyU16IntoU8(list.items[old_len..], Type, utf16_remaining);
copyU16IntoU8(list.items[old_len..], utf16_remaining);
}
log("UTF16 {d} -> {d} UTF8", .{ utf16.len, list.items.len });
@@ -2794,43 +2726,8 @@ pub fn replaceLatin1WithUTF8(buf_: []u8) void {
}
}
pub fn elementLengthLatin1IntoUTF8(comptime Type: type, latin1_: Type) usize {
// https://zig.godbolt.org/z/zzYexPPs9
var latin1 = latin1_;
const input_len = latin1.len;
var total_non_ascii_count: usize = 0;
// This is about 30% faster on large input compared to auto-vectorization
if (comptime Environment.enableSIMD) {
const end = latin1.ptr + (latin1.len - (latin1.len % ascii_vector_size));
while (latin1.ptr != end) {
const vec: AsciiVector = latin1[0..ascii_vector_size].*;
// Shifting a unsigned 8 bit integer to the right by 7 bits always produces a value of 0 or 1.
const cmp = vec >> @as(AsciiVector, @splat(
@as(u8, 7),
));
// Anding that value rather than converting it into a @Vector(16, u1) produces better code from LLVM.
const mask: AsciiVector = cmp & @as(AsciiVector, @splat(
@as(u8, 1),
));
total_non_ascii_count += @as(usize, @reduce(.Add, mask));
latin1 = latin1[ascii_vector_size..];
}
// an important hint to the compiler to not auto-vectorize the loop below
if (latin1.len >= ascii_vector_size) unreachable;
}
for (latin1) |c| {
total_non_ascii_count += @as(usize, @intFromBool(c > 127));
}
// each non-ascii latin1 character becomes 2 UTF8 characters
return input_len + total_non_ascii_count;
pub fn elementLengthLatin1IntoUTF8(slice: []const u8) usize {
return bun.simdutf.length.utf8.from.latin1(slice);
}
pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult {
@@ -2865,20 +2762,7 @@ pub fn elementLengthLatin1IntoUTF16(comptime Type: type, latin1_: Type) usize {
return latin1_.len;
}
var count: usize = 0;
var latin1 = latin1_;
while (latin1.len > 0) {
const function = comptime if (std.meta.Child(Type) == u8) strings.firstNonASCIIWithType else strings.firstNonASCII16;
const to_write = function(Type, latin1) orelse @as(u32, @truncate(latin1.len));
count += to_write;
latin1 = latin1[to_write..];
if (latin1.len > 0) {
count += comptime if (std.meta.Child(Type) == u8) 2 else 1;
latin1 = latin1[1..];
}
}
return count;
return bun.simdutf.length.utf16.from.latin1(latin1_);
}
pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) !Escaped(u8) {
@@ -3605,7 +3489,7 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,
while (firstNonASCII16(Type, utf16_remaining)) |i| {
const end = @min(i, remaining.len);
if (end > 0) copyU16IntoU8(remaining, Type, utf16_remaining[0..end]);
if (end > 0) copyU16IntoU8(remaining, utf16_remaining[0..end]);
remaining = remaining[end..];
utf16_remaining = utf16_remaining[end..];
@@ -3674,7 +3558,7 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,
if (remaining.len > 0 and !ended_on_non_ascii and utf16_remaining.len > 0) {
const len = @min(remaining.len, utf16_remaining.len);
copyU16IntoU8(remaining[0..len], Type, utf16_remaining[0..len]);
copyU16IntoU8(remaining[0..len], utf16_remaining[0..len]);
utf16_remaining = utf16_remaining[len..];
remaining = remaining[len..];
}
@@ -4014,44 +3898,7 @@ pub fn isAllASCII(slice: []const u8) bool {
return true;
}
if (bun.FeatureFlags.use_simdutf)
return bun.simdutf.validate.ascii(slice);
var remaining = slice;
// The NEON SIMD unit is 128-bit wide and includes 16 128-bit registers that can be used as 32 64-bit registers
if (comptime Environment.enableSIMD) {
const remaining_end_ptr = remaining.ptr + remaining.len - (remaining.len % ascii_vector_size);
while (remaining.ptr != remaining_end_ptr) : (remaining.ptr += ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
if (@reduce(.Max, vec) > 127) {
return false;
}
}
}
const Int = u64;
const size = @sizeOf(Int);
const remaining_last8 = slice.ptr + slice.len - (slice.len % size);
while (remaining.ptr != remaining_last8) : (remaining.ptr += size) {
const bytes = @as(Int, @bitCast(remaining[0..size].*));
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
return false;
}
}
const final = slice.ptr + slice.len;
while (remaining.ptr != final) : (remaining.ptr += 1) {
if (remaining[0] > 127) {
return false;
}
}
return true;
return bun.simdutf.validate.ascii(slice);
}
// #define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
@@ -4085,296 +3932,67 @@ pub inline fn u16GetSupplementary(lead: u32, trail: u32) u32 {
pub const u16_surrogate_offset = 56613888;
pub fn firstNonASCII(slice: []const u8) ?u32 {
return firstNonASCIIWithType([]const u8, slice);
}
pub fn firstNonASCIIWithType(comptime Type: type, slice: Type) ?u32 {
var remaining = slice;
if (comptime bun.FeatureFlags.use_simdutf) {
const result = bun.simdutf.validate.with_errors.ascii(slice);
if (result.status == .success) {
return null;
}
return @as(u32, @truncate(result.count));
}
if (comptime Environment.enableSIMD) {
if (remaining.len >= ascii_vector_size) {
const remaining_start = remaining.ptr;
const remaining_end = remaining.ptr + remaining.len - (remaining.len % ascii_vector_size);
while (remaining.ptr != remaining_end) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
if (@reduce(.Max, vec) > 127) {
const Int = u64;
const size = @sizeOf(Int);
remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start);
{
const bytes = @as(Int, @bitCast(remaining[0..size].*));
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(mask) / 8;
if (comptime Environment.isDebug) {
bun.assert(remaining[first_set_byte] > 127);
for (0..first_set_byte) |j| {
bun.assert(remaining[j] <= 127);
}
}
return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len));
}
remaining = remaining[size..];
}
{
const bytes = @as(Int, @bitCast(remaining[0..size].*));
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(mask) / 8;
if (comptime Environment.isDebug) {
bun.assert(remaining[first_set_byte] > 127);
for (0..first_set_byte) |j| {
bun.assert(remaining[j] <= 127);
}
}
return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len));
}
}
unreachable;
}
// the more intuitive way, using slices, produces worse codegen
// specifically: it subtracts the length at the end of the loop
// we don't need to do that
// we only need to subtract the length once at the very end
remaining.ptr += ascii_vector_size;
}
remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start);
}
}
{
const Int = u64;
const size = @sizeOf(Int);
const remaining_start = remaining.ptr;
const remaining_end = remaining.ptr + remaining.len - (remaining.len % size);
if (comptime Environment.enableSIMD) {
// these assertions exist more so for LLVM
bun.unsafeAssert(remaining.len < ascii_vector_size);
bun.unsafeAssert(@intFromPtr(remaining.ptr + ascii_vector_size) > @intFromPtr(remaining_end));
}
if (remaining.len >= size) {
while (remaining.ptr != remaining_end) {
const bytes = @as(Int, @bitCast(remaining[0..size].*));
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start);
const first_set_byte = @ctz(mask) / 8;
if (comptime Environment.isDebug) {
bun.unsafeAssert(remaining[first_set_byte] > 127);
for (0..first_set_byte) |j| {
bun.unsafeAssert(remaining[j] <= 127);
}
}
return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len));
}
remaining.ptr += size;
}
remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start);
}
}
if (comptime Environment.allow_assert) assert(remaining.len < 8);
for (remaining) |*char| {
if (char.* > 127) {
// try to prevent it from reading the length of the slice
return @as(u32, @truncate(@intFromPtr(char) - @intFromPtr(slice.ptr)));
}
}
return null;
}
pub fn indexOfNewlineOrNonASCIIOrANSI(slice_: []const u8, offset: u32) ?u32 {
const slice = slice_[offset..];
var remaining = slice;
if (remaining.len == 0)
const result = bun.simdutf.validate.with_errors.ascii(slice);
if (result.status == .success) {
return null;
if (comptime Environment.enableSIMD) {
while (remaining.len >= ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
const cmp = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\x1b')))));
if (@reduce(.Max, cmp) > 0) {
const bitmask = @as(AsciiVectorInt, @bitCast(cmp));
const first = @ctz(bitmask);
return @as(u32, first) + @as(u32, @intCast(slice.len - remaining.len)) + offset;
}
remaining = remaining[ascii_vector_size..];
}
if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size);
}
for (remaining) |*char_| {
const char = char_.*;
if (char > 127 or char < 0x20 or char == '\n' or char == '\r' or char == '\x1b') {
return @as(u32, @truncate((@intFromPtr(char_) - @intFromPtr(slice.ptr)))) + offset;
}
}
return null;
return @as(u32, @truncate(result.count));
}
pub const indexOfNewlineOrNonASCIIOrANSI = indexOfNewlineOrNonASCII;
/// Checks if slice[offset..] has any < 0x20 or > 127 characters
pub fn indexOfNewlineOrNonASCII(slice_: []const u8, offset: u32) ?u32 {
return indexOfNewlineOrNonASCIICheckStart(slice_, offset, true);
}
pub fn indexOfSpaceOrNewlineOrNonASCII(slice_: []const u8, offset: u32) ?u32 {
const slice = slice_[offset..];
const remaining = slice;
if (remaining.len == 0)
return null;
if (remaining[0] > 127 or (remaining[0] < 0x20 and remaining[0] != 0x09)) {
return offset;
}
const i = bun.highway.indexOfSpaceOrNewlineOrNonASCII(remaining) orelse return null;
return @as(u32, @truncate(i)) + offset;
}
pub fn indexOfNewlineOrNonASCIICheckStart(slice_: []const u8, offset: u32, comptime check_start: bool) ?u32 {
const slice = slice_[offset..];
var remaining = slice;
const remaining = slice;
if (remaining.len == 0)
return null;
if (comptime check_start) {
// this shows up in profiling
if (remaining[0] > 127 or remaining[0] < 0x20 or remaining[0] == '\r' or remaining[0] == '\n') {
if (remaining[0] > 127 or (remaining[0] < 0x20 and remaining[0] != 0x09)) {
return offset;
}
}
if (comptime Environment.enableSIMD) {
while (remaining.len >= ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
const cmp = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n')))));
if (@reduce(.Max, cmp) > 0) {
const bitmask = @as(AsciiVectorInt, @bitCast(cmp));
const first = @ctz(bitmask);
return @as(u32, first) + @as(u32, @intCast(slice.len - remaining.len)) + offset;
}
remaining = remaining[ascii_vector_size..];
}
if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size);
}
for (remaining) |*char_| {
const char = char_.*;
if (char > 127 or char < 0x20 or char == '\n' or char == '\r') {
return @as(u32, @truncate((@intFromPtr(char_) - @intFromPtr(slice.ptr)))) + offset;
}
}
return null;
const i = bun.highway.indexOfNewlineOrNonASCII(remaining) orelse return null;
return @as(u32, @truncate(i)) + offset;
}
pub fn containsNewlineOrNonASCIIOrQuote(slice_: []const u8) bool {
const slice = slice_;
var remaining = slice;
if (remaining.len == 0)
return false;
if (comptime Environment.enableSIMD) {
while (remaining.len >= ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
const cmp = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '"')))));
if (@reduce(.Max, cmp) > 0) {
return true;
}
remaining = remaining[ascii_vector_size..];
}
if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size);
}
for (remaining) |*char_| {
const char = char_.*;
if (char > 127 or char < 0x20 or char == '\n' or char == '\r' or char == '"') {
return true;
}
}
return false;
pub fn containsNewlineOrNonASCIIOrQuote(text: []const u8) bool {
return bun.highway.containsNewlineOrNonASCIIOrQuote(text);
}
/// JSON escape
pub fn indexOfNeedsEscape(slice: []const u8, comptime quote_char: u8) ?u32 {
var remaining = slice;
if (remaining.len == 0)
/// Supports:
/// - `"`
/// - `'`
/// - "`"
pub fn indexOfNeedsEscapeForJavaScriptString(slice: []const u8, quote_char: u8) ?u32 {
if (slice.len == 0)
return null;
if (remaining[0] >= 127 or remaining[0] < 0x20 or remaining[0] == '\\' or remaining[0] == quote_char or (quote_char == '`' and remaining[0] == '$')) {
return 0;
}
if (comptime Environment.enableSIMD) {
while (remaining.len >= ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
const cmp: AsciiVectorU1 = if (comptime quote_char == '`') ( //
@as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) |
@as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\\'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, quote_char))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '$'))))) //
) else ( //
@as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) |
@as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\\'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, quote_char))))) //
);
if (@reduce(.Max, cmp) > 0) {
const bitmask = @as(AsciiVectorInt, @bitCast(cmp));
const first = @ctz(bitmask);
return @as(u32, first) + @as(u32, @truncate(@intFromPtr(remaining.ptr) - @intFromPtr(slice.ptr)));
}
remaining = remaining[ascii_vector_size..];
}
}
for (remaining) |*char_| {
const char = char_.*;
if (char > 127 or char < 0x20 or char == '\\' or char == quote_char or (quote_char == '`' and char == '$')) {
return @as(u32, @truncate(@intFromPtr(char_) - @intFromPtr(slice.ptr)));
}
}
return null;
return bun.highway.indexOfNeedsEscapeForJavaScriptString(slice, quote_char);
}
pub fn indexOfNeedsURLEncode(slice: []const u8) ?u32 {
@@ -4447,15 +4065,7 @@ pub fn indexOfNeedsURLEncode(slice: []const u8) ?u32 {
}
pub fn indexOfCharZ(sliceZ: [:0]const u8, char: u8) ?u63 {
const ptr = bun.C.strchr(sliceZ.ptr, char) orelse return null;
const pos = @intFromPtr(ptr) - @intFromPtr(sliceZ.ptr);
if (comptime Environment.isDebug)
bun.assert(@intFromPtr(sliceZ.ptr) <= @intFromPtr(ptr) and
@intFromPtr(ptr) < @intFromPtr(sliceZ.ptr + sliceZ.len) and
pos <= sliceZ.len);
return @as(u63, @truncate(pos));
return @truncate(bun.highway.indexOfChar(sliceZ, char) orelse return null);
}
pub fn indexOfChar(slice: []const u8, char: u8) ?u32 {
@@ -4463,19 +4073,11 @@ pub fn indexOfChar(slice: []const u8, char: u8) ?u32 {
}
pub fn indexOfCharUsize(slice: []const u8, char: u8) ?usize {
if (slice.len == 0)
return null;
if (comptime !Environment.isNative) {
return std.mem.indexOfScalar(u8, slice, char);
}
const ptr = bun.C.memchr(slice.ptr, char, slice.len) orelse return null;
const i = @intFromPtr(ptr) - @intFromPtr(slice.ptr);
bun.assert(i < slice.len);
bun.assert(slice[i] == char);
return i;
return bun.highway.indexOfChar(slice, char);
}
pub fn indexOfCharPos(slice: []const u8, char: u8, start_index: usize) ?usize {
@@ -4485,13 +4087,9 @@ pub fn indexOfCharPos(slice: []const u8, char: u8, start_index: usize) ?usize {
if (start_index >= slice.len) return null;
const ptr = bun.C.memchr(slice.ptr + start_index, char, slice.len - start_index) orelse
return null;
const i = @intFromPtr(ptr) - @intFromPtr(slice.ptr);
bun.assert(i < slice.len);
bun.assert(slice[i] == char);
return i;
const result = bun.highway.indexOfChar(slice[start_index..], char) orelse return null;
bun.debugAssert(slice.len > result + start_index);
return result + start_index;
}
pub fn indexOfAnyPosComptime(slice: []const u8, comptime chars: []const u8, start_index: usize) ?usize {
@@ -4934,47 +4532,6 @@ pub fn firstNonASCII16(comptime Slice: type, slice: Slice) ?u32 {
return null;
}
/// Fast path for printing template literal strings
pub fn @"nextUTF16NonASCIIOr$`\\"(
comptime Slice: type,
slice: Slice,
) ?u32 {
var remaining = slice;
if (comptime Environment.enableSIMD and Environment.isNative) {
while (remaining.len >= ascii_u16_vector_size) {
const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*;
const cmp = @as(AsciiVectorU16U1, @bitCast((vec > max_u16_ascii))) |
@as(AsciiVectorU16U1, @bitCast((vec < min_u16_ascii))) |
@as(AsciiVectorU16U1, @bitCast((vec == @as(AsciiU16Vector, @splat(@as(u16, '$')))))) |
@as(AsciiVectorU16U1, @bitCast((vec == @as(AsciiU16Vector, @splat(@as(u16, '`')))))) |
@as(AsciiVectorU16U1, @bitCast((vec == @as(AsciiU16Vector, @splat(@as(u16, '\\'))))));
const bitmask = @as(u8, @bitCast(cmp));
const first = @ctz(bitmask);
if (first < ascii_u16_vector_size) {
return @as(u32, @intCast(@as(u32, first) +
@as(u32, @intCast(slice.len - remaining.len))));
}
remaining = remaining[ascii_u16_vector_size..];
}
}
for (remaining, 0..) |char, i| {
switch (char) {
'$', '`', '\\', 0...0x20 - 1, 128...std.math.maxInt(u16) => {
return @as(u32, @truncate(i + (slice.len - remaining.len)));
},
else => {},
}
}
return null;
}
/// Convert potentially ill-formed UTF-8 or UTF-16 bytes to a Unicode Codepoint.
/// - Invalid codepoints are replaced with `zero` parameter
/// - Null bytes return 0
@@ -5097,31 +4654,6 @@ pub fn lengthOfLeadingWhitespaceASCII(slice: string) usize {
return slice.len;
}
pub fn containsNonBmpCodePointUTF16(_text: []const u16) bool {
const n = _text.len;
if (n > 0) {
var i: usize = 0;
const text = _text[0 .. n - 1];
while (i < n - 1) : (i += 1) {
switch (text[i]) {
// Check for a high surrogate
0xD800...0xDBFF => {
// Check for a low surrogate
switch (text[i + 1]) {
0xDC00...0xDFFF => {
return true;
},
else => {},
}
},
else => {},
}
}
}
return false;
}
pub fn join(slices: []const string, delimiter: string, allocator: std.mem.Allocator) !string {
return try std.mem.join(allocator, delimiter, slices);
}
@@ -5238,6 +4770,75 @@ pub fn NewCodePointIterator(comptime CodePointType_: type, comptime zeroValue: c
return Iterator{ .bytes = str, .i = i, .c = zeroValue };
}
const SkipResult = enum {
eof,
found,
not_found,
};
/// Advance forward until the scalar function returns true.
/// THe simd function is "best effort" and expected to sometimes return a result which `scalar` will return false for.
/// This is because we don't decode UTF-8 in the SIMD code path.
pub fn skip(it: *const Iterator, cursor: *Cursor, simd: *const fn (input: []const u8) ?usize, scalar: *const fn (CodePointType) bool) SkipResult {
while (true) {
// 1. Get current position. Check for EOF.
const current_byte_index = cursor.i;
if (current_byte_index >= it.bytes.len) {
return .not_found; // Reached end without finding
}
// 2. Decode the *next* character using the standard iterator method.
if (!next(it, cursor)) {
return .not_found; // Reached end or error during decode
}
// 3. Check if the character just decoded matches the scalar condition.
if (scalar(it.c)) {
return .found; // Found it!
}
// 4. Optimization: Can we skip ahead using SIMD?
// Scan starting from the byte *after* the character we just decoded.
const next_scan_start_index = cursor.i;
if (next_scan_start_index >= it.bytes.len) {
// Just decoded the last character and it didn't match.
return .not_found;
}
const remaining_slice = it.bytes[next_scan_start_index..];
if (remaining_slice.len == 0) {
return .not_found;
}
// Ask SIMD for the next potential candidate.
if (simd(remaining_slice)) |pos| {
// SIMD found a potential candidate `pos` bytes ahead.
if (pos > 0) {
// Jump the byte index to the start of the potential candidate.
cursor.i = next_scan_start_index + @as(u32, @intCast(pos));
// Reset width so next() decodes correctly from the jumped position.
cursor.width = 0;
// Loop will continue, starting the decode from the new cursor.i.
continue;
}
// If pos == 0, SIMD suggests the *immediate next* character.
// No jump needed, just let the loop iterate naturally.
// Fallthrough to the end of the loop.
} else {
// SIMD found no potential candidates in the rest of the string.
// Since the SIMD search set is a superset of the scalar check set,
// we can guarantee that no character satisfying `scalar` exists further.
// Since the current character (decoded in step 2) also didn't match,
// we can conclude the target character is not found.
return .not_found;
}
// If we reach here, it means SIMD returned pos=0.
// Loop continues to the next iteration, processing the immediate next char.
} // End while true
unreachable;
}
pub inline fn next(it: *const Iterator, cursor: *Cursor) bool {
const pos: u32 = @as(u32, cursor.width) + cursor.i;
if (pos >= it.bytes.len) {
@@ -5527,6 +5128,16 @@ pub fn leftHasAnyInRight(to_check: []const string, against: []const string) bool
return false;
}
/// Returns true if the input has the prefix and the next character is not an identifier character
/// Also returns true if the input ends with the prefix (i.e. EOF)
///
/// Example:
/// ```zig
/// // returns true
/// hasPrefixWithWordBoundary("console.log", "console") // true
/// hasPrefixWithWordBoundary("console.log", "log") // false
/// hasPrefixWithWordBoundary("console.log", "console.log") // true
/// ```
pub fn hasPrefixWithWordBoundary(input: []const u8, comptime prefix: []const u8) bool {
if (hasPrefixComptime(input, prefix)) {
if (input.len == prefix.len) return true;
@@ -5708,7 +5319,6 @@ pub fn mustEscapeYAMLString(contents: []const u8) bool {
else => true,
};
}
pub fn pathContainsNodeModulesFolder(path: []const u8) bool {
return strings.contains(path, comptime std.fs.path.sep_str ++ "node_modules" ++ std.fs.path.sep_str);
}