Use Highway SIMD (#19134)

Co-authored-by: Dylan Conway <dylan.conway567@gmail.com>
Co-authored-by: Dylan Conway <35280289+dylan-conway@users.noreply.github.com>
Co-authored-by: Jarred-Sumner <709451+Jarred-Sumner@users.noreply.github.com>
This commit is contained in:
Jarred Sumner
2025-04-21 23:28:03 -07:00
committed by GitHub
parent b117d14650
commit 0471254e4e
19 changed files with 1841 additions and 872 deletions

View File

@@ -1089,6 +1089,7 @@ set(BUN_DEPENDENCIES
BoringSSL
Brotli
Cares
Highway
LibDeflate
LolHtml
Lshpack

View File

@@ -0,0 +1,33 @@
# Register Google Highway (portable SIMD library) as a vendored dependency,
# pinned to a specific commit for reproducible builds.
register_repository(
NAME
highway
REPOSITORY
google/highway
COMMIT
12b325bc1793dee68ab2157995a690db859fe9e0
)
set(HIGHWAY_CMAKE_ARGS
# Build a static library
-DBUILD_SHARED_LIBS=OFF
# Enable position-independent code for linking into the main executable
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
# Disable unnecessary components
-DHWY_ENABLE_TESTS=OFF
-DHWY_ENABLE_EXAMPLES=OFF
# NOTE(review): highway_strings.cpp includes hwy/contrib/algo/find-inl.h.
# Disabling contrib here only skips building contrib *targets*; the headers
# remain reachable through the source-tree include path below — confirm.
-DHWY_ENABLE_CONTRIB=OFF
# Disable building of the install target
-DHWY_ENABLE_INSTALL=OFF
)
register_cmake_command(
TARGET
highway
LIBRARIES
hwy
ARGS
${HIGHWAY_CMAKE_ARGS}
INCLUDES
# Include from the source tree root so <hwy/...> (and contrib headers) resolve.
.
hwy
)

View File

@@ -367,4 +367,10 @@ SIMDUTFResult simdutf__base64_decode_from_binary16(const char16_t* input, size_t
return { .error = res.error, .count = res.count };
}
// Returns the number of UTF-16 code units needed to encode a Latin-1 string:
// exactly one code unit per input byte, so only `length` matters. The input
// pointer is kept to mirror the other simdutf__* wrapper signatures.
size_t simdutf__utf16_length_from_latin1(const char* input, size_t length)
{
UNUSED_PARAM(input);
return simdutf::utf16_length_from_latin1(length);
}
}

View File

@@ -95,6 +95,7 @@ pub extern fn simdutf__utf8_length_from_utf32(input: [*c]const c_uint, length: u
pub extern fn simdutf__utf16_length_from_utf32(input: [*c]const c_uint, length: usize) usize;
pub extern fn simdutf__utf32_length_from_utf8(input: [*]const u8, length: usize) usize;
pub extern fn simdutf__utf8_length_from_latin1(input: [*]const u8, length: usize) usize;
pub extern fn simdutf__utf16_length_from_latin1(input: [*]const u8, length: usize) usize;
pub const validate = struct {
pub const with_errors = struct {
@@ -295,6 +296,10 @@ pub const length = struct {
JSC.markBinding(@src());
return simdutf__utf16_length_from_utf32(input.ptr, input.len);
}
pub fn latin1(input: []const u8) usize {
return simdutf__utf16_length_from_latin1(input.ptr, input.len);
}
};
};

View File

@@ -0,0 +1,790 @@
// Must be first
#include "root.h"
#undef HWY_TARGET_INCLUDE
// Correct path to this file relative to the build root (CMakeLists.txt)
#define HWY_TARGET_INCLUDE "highway_strings.cpp"
#include <hwy/foreach_target.h> // Must come before highway.h
// Now include Highway and other headers
#include <hwy/highway.h>
#include <hwy/aligned_allocator.h>
#include <hwy/contrib/algo/find-inl.h>
#include <cstring> // For memcmp
#include <algorithm> // For std::min, std::max
#include <cstddef>
#include <cstdint>
// Wrap the SIMD implementations in the Highway namespaces
HWY_BEFORE_NAMESPACE();
namespace bun {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE; // Alias for convenience
// Type alias for SIMD vector tag
using D8 = hn::ScalableTag<uint8_t>;
// Returns the index of the first occurrence of `needle` in `haystack`, or
// `haystack_len` when the byte is not present (the not-found sentinel used
// throughout this file — callers compare against the length, not -1).
size_t IndexOfCharImpl(const uint8_t* HWY_RESTRICT haystack, size_t haystack_len,
uint8_t needle)
{
D8 d;
// Use the Find function from find-inl.h which handles both vectorized and scalar cases
const size_t pos = hn::Find<D8>(d, needle, haystack, haystack_len);
// hn::Find may return a value >= haystack_len when not found; clamp to the
// haystack_len sentinel expected by callers.
return (pos < haystack_len) ? pos : haystack_len;
}
// --- Implementation Details ---
// Returns the index of the first byte of `text` that equals any byte in
// `chars`, or `text_len` when no byte matches (the not-found sentinel used
// throughout this file). `chars_len` is expected to be in [2, 16]; a single
// needle should go through IndexOfCharImpl instead.
size_t IndexOfAnyCharImpl(const uint8_t* HWY_RESTRICT text, size_t text_len, const uint8_t* HWY_RESTRICT chars, size_t chars_len)
{
    if (text_len == 0) return 0;
    D8 d;
    const size_t N = hn::Lanes(d);
    if (chars_len == 1) {
        // Single-needle searches must use IndexOfCharImpl.
        ASSERT_NOT_REACHED_WITH_MESSAGE("chars_len == 1");
    } else if (chars_len == 2) {
        // 2 character implementation
        // covers the most common case:
        //
        // - { '\r', '\n' }
        // - { '\\', '/' }
        // - { ' ', '\t' }
        //
        const auto vec_char1 = hn::Set(d, chars[0]);
        const auto vec_char2 = hn::Set(d, chars[1]);
        size_t i = 0;
        const size_t simd_text_len = text_len - (text_len % N);
        for (; i < simd_text_len; i += N) {
            // A full vector is always in range here (i + N <= text_len), so use
            // a plain unaligned load like every other loop in this file; the
            // previous LoadN(d, ..., N) only added a redundant lane-count clamp.
            const auto text_vec = hn::LoadU(d, text + i);
            const auto found_mask = hn::Or(hn::Eq(text_vec, vec_char2), hn::Eq(text_vec, vec_char1));
            const intptr_t pos = hn::FindFirstTrue(d, found_mask);
            if (pos >= 0) {
                return i + pos;
            }
        }
        // Scalar tail for the final partial vector.
        for (; i < text_len; ++i) {
            const uint8_t text_char = text[i];
            if (text_char == chars[0] || text_char == chars[1]) {
                return i;
            }
        }
        return text_len;
    } else {
        ASSERT(chars_len <= 16);
        // Broadcast up to 16 needles once, outside the scan loop.
        constexpr size_t kMaxPreloadedChars = 16;
        hn::Vec<D8> char_vecs[kMaxPreloadedChars];
        const size_t num_chars_to_preload = std::min(chars_len, kMaxPreloadedChars);
        for (size_t c = 0; c < num_chars_to_preload; ++c) {
            char_vecs[c] = hn::Set(d, chars[c]);
        }
        const size_t simd_text_len = text_len - (text_len % N);
        size_t i = 0;
        for (; i < simd_text_len; i += N) {
            // Full vector in range here as well — see comment above.
            const auto text_vec = hn::LoadU(d, text + i);
            auto found_mask = hn::MaskFalse(d);
            for (size_t c = 0; c < num_chars_to_preload; ++c) {
                found_mask = hn::Or(found_mask, hn::Eq(text_vec, char_vecs[c]));
            }
            // Defensive: handles chars_len > 16 even though the ASSERT above
            // should make this unreachable in debug builds.
            if (chars_len > num_chars_to_preload) {
                for (size_t c = num_chars_to_preload; c < chars_len; ++c) {
                    found_mask = hn::Or(found_mask, hn::Eq(text_vec, hn::Set(d, chars[c])));
                }
            }
            const intptr_t pos = hn::FindFirstTrue(d, found_mask);
            if (pos >= 0) {
                return i + pos;
            }
        }
        // Scalar tail for the final partial vector.
        for (; i < text_len; ++i) {
            const uint8_t text_char = text[i];
            for (size_t c = 0; c < chars_len; ++c) {
                if (text_char == chars[c]) {
                    return i;
                }
            }
        }
    }
    return text_len;
}
// Narrows `count` u16 values into u8 output by truncation (keeps the low
// byte of each element). Caller guarantees `input` and `output` do not alias
// (HWY_RESTRICT) and that both buffers hold at least `count` elements.
void CopyU16ToU8Impl(const uint16_t* HWY_RESTRICT input, size_t count,
uint8_t* HWY_RESTRICT output)
{
// Tag for the output vector type (u8)
const hn::ScalableTag<uint8_t> d8;
// Tag for the input vector type (u16). OrderedTruncate2To takes two u16 vectors
// (each N/2 lanes) to produce one u8 vector (N lanes).
// Repartition<uint16_t, decltype(d8)> gives a u16 tag with N/2 lanes.
const hn::Repartition<uint16_t, decltype(d8)> d16;
const size_t N8 = hn::Lanes(d8); // Number of u8 lanes processed per iteration
const size_t N16 = hn::Lanes(d16); // Number of u16 lanes per input vector load
// Sanity check: we should load 2*N16 u16 elements to produce N8 u8 elements.
// Since sizeof(u16) == 2 * sizeof(u8), N16 should be N8 / 2.
// static_assert(N16 * 2 == N8, "Lane configuration mismatch"); // Highway ensures this
size_t i = 0;
const size_t simd_count = count - (count % N8);
// Process N8 elements (u8 output size) per iteration. This corresponds to
// loading N8 u16 input elements (2 vectors of N16 lanes each).
for (; i < simd_count; i += N8) {
// Load two input vectors of u16 (unaligned loads; see the alignment note in
// the highway_copy_u16_to_u8 wrapper).
const auto in1 = hn::LoadU(d16, input + i);
const auto in2 = hn::LoadU(d16, input + i + N16);
// Truncate and interleave into a single u8 vector
// OrderedTruncate2To(d_narrow, vec_wide_a, vec_wide_b)
const hn::Vec<decltype(d8)> result8 = hn::OrderedTruncate2To(d8, in1, in2);
// Store the resulting u8 vector
hn::StoreU(result8, d8, output + i);
}
// Handle remaining elements (< N8)
for (; i < count; ++i) {
output[i] = static_cast<uint8_t>(input[i]); // Truncation happens here
}
}
// Implementation for scanCharFrequency (Unchanged from previous correct version)
void ScanCharFrequencyImpl(const uint8_t* HWY_RESTRICT text, size_t text_len, int32_t* HWY_RESTRICT freqs, int32_t delta)
{
if (text_len == 0 || delta == 0) return;
D8 d;
const size_t N = hn::Lanes(d);
const auto vec_a = hn::Set(d, 'a');
const auto vec_z = hn::Set(d, 'z');
const auto vec_A = hn::Set(d, 'A');
const auto vec_Z = hn::Set(d, 'Z');
const auto vec_0 = hn::Set(d, '0');
const auto vec_9 = hn::Set(d, '9');
const auto vec_underscore = hn::Set(d, '_');
const auto vec_dollar = hn::Set(d, '$');
const auto vec_offset_a = hn::Set(d, 'a');
const auto vec_offset_A = hn::Set(d, 'A');
const auto vec_offset_0 = hn::Set(d, '0');
size_t i = 0;
size_t simd_text_len = text_len - (text_len % N);
for (; i < simd_text_len; i += N) {
const auto text_vec = hn::LoadU(d, text + i);
const auto mask_az = hn::And(hn::Ge(text_vec, vec_a), hn::Le(text_vec, vec_z));
const auto mask_AZ = hn::And(hn::Ge(text_vec, vec_A), hn::Le(text_vec, vec_Z));
const auto mask_09 = hn::And(hn::Ge(text_vec, vec_0), hn::Le(text_vec, vec_9));
const auto mask_underscore = hn::Eq(text_vec, vec_underscore);
const auto mask_dollar = hn::Eq(text_vec, vec_dollar);
auto valid_mask = hn::Or(mask_az, hn::Or(mask_AZ, hn::Or(mask_09, hn::Or(mask_underscore, mask_dollar))));
if (hn::AllFalse(d, valid_mask)) continue;
const auto idx_az = hn::Sub(text_vec, vec_offset_a);
const auto idx_AZ = hn::Add(hn::Sub(text_vec, vec_offset_A), hn::Set(d, uint8_t { 26 }));
const auto idx_09 = hn::Add(hn::Sub(text_vec, vec_offset_0), hn::Set(d, uint8_t { 52 }));
auto indices_vec = hn::Zero(d);
indices_vec = hn::IfThenElse(mask_az, idx_az, indices_vec);
indices_vec = hn::IfThenElse(mask_AZ, idx_AZ, indices_vec);
indices_vec = hn::IfThenElse(mask_09, idx_09, indices_vec);
indices_vec = hn::IfThenElse(mask_underscore, hn::Set(d, uint8_t { 62 }), indices_vec);
indices_vec = hn::IfThenElse(mask_dollar, hn::Set(d, uint8_t { 63 }), indices_vec);
alignas(HWY_ALIGNMENT) uint8_t indices_array[HWY_MAX_LANES_D(D8)];
alignas(HWY_ALIGNMENT) uint8_t valid_bits_array[(HWY_MAX_LANES_D(D8) + 7) / 8];
hn::Store(indices_vec, d, indices_array);
hn::StoreMaskBits(d, valid_mask, valid_bits_array);
for (size_t j = 0; j < N; ++j) {
if ((valid_bits_array[j / 8] >> (j % 8)) & 1) {
assert(indices_array[j] < 64);
freqs[indices_array[j]] += delta;
}
}
}
for (; i < text_len; ++i) {
const uint8_t c = text[i];
if (c >= 'a' && c <= 'z')
freqs[c - 'a'] += delta;
else if (c >= 'A' && c <= 'Z')
freqs[c - 'A' + 26] += delta;
else if (c >= '0' && c <= '9')
freqs[c - '0' + 52] += delta;
else if (c == '_')
freqs[62] += delta;
else if (c == '$')
freqs[63] += delta;
}
}
// Implementation for finding interesting characters in string literals
// Implementation for finding interesting characters in string literals
// Returns the index of the first byte that terminates simple copying of a
// string literal: the closing quote, a backslash (escape), or any byte
// outside printable ASCII [0x20, 0x7E]. Returns text_len when none is found.
size_t IndexOfInterestingCharacterInStringLiteralImpl(const uint8_t* HWY_RESTRICT text, size_t text_len, uint8_t quote)
{
ASSERT(text_len > 0);
D8 d;
const size_t N = hn::Lanes(d);
const auto vec_quote = hn::Set(d, quote);
const auto vec_backslash = hn::Set(d, '\\');
const auto vec_min_ascii = hn::Set(d, uint8_t { 0x20 }); // Space
const auto vec_max_ascii = hn::Set(d, uint8_t { 0x7E }); // ~
const size_t simd_text_len = text_len - (text_len % N);
size_t i = 0;
for (; i < simd_text_len; i += N) {
const auto text_vec = hn::LoadN(d, text + i, N);
// Check for quote, backslash, or characters outside printable ASCII range
const auto mask_quote = hn::Eq(text_vec, vec_quote);
const auto mask_backslash = hn::Eq(text_vec, vec_backslash);
const auto mask_lt_min = hn::Lt(text_vec, vec_min_ascii);
const auto mask_gt_max = hn::Gt(text_vec, vec_max_ascii);
const auto found_mask = hn::Or(
hn::Or(mask_quote, mask_backslash),
hn::Or(mask_lt_min, mask_gt_max));
const intptr_t pos = hn::FindFirstTrue(d, found_mask);
if (pos >= 0) {
return i + pos;
}
}
// Scalar tail — same predicate as the vector path above.
for (; i < text_len; ++i) {
const uint8_t c = text[i];
if (c == quote || c == '\\' || (c < 0x20 || c > 0x7E)) {
return i;
}
}
return text_len;
}
// Returns the index of the first '#', '@', control character (< 0x20, which
// includes '\r' and '\n'), or non-ASCII byte (> 127) in the buffer; returns
// search_len when none is present. search_len must be > 0.
size_t IndexOfNewlineOrNonASCIIOrHashOrAtImpl(const uint8_t* HWY_RESTRICT start_ptr, size_t search_len)
{
    ASSERT(search_len > 0);
    D8 d;
    const size_t N = hn::Lanes(d);
    const auto vec_hash = hn::Set(d, '#');
    const auto vec_at = hn::Set(d, '@');
    const auto vec_min_ascii = hn::Set(d, uint8_t { 0x20 });
    // 127, not 0x7E: the scalar tail below and the sibling
    // IndexOfNewlineOrNonASCIIImpl only treat bytes > 127 as non-ASCII. The
    // previous 0x7E bound made the vector path flag 0x7F (DEL) while the
    // scalar tail did not, so the result depended on where in the buffer the
    // byte happened to fall.
    const auto vec_max_ascii = hn::Set(d, uint8_t { 127 });
    size_t i = 0;
    const size_t simd_text_len = search_len - (search_len % N);
    for (; i < simd_text_len; i += N) {
        const auto vec = hn::LoadU(d, start_ptr + i);
        const auto mask_hash = hn::Eq(vec, vec_hash);
        const auto mask_at = hn::Eq(vec, vec_at);
        const auto mask_lt_min = hn::Lt(vec, vec_min_ascii);
        const auto mask_gt_max = hn::Gt(vec, vec_max_ascii);
        const auto found_mask = hn::Or(hn::Or(mask_hash, mask_at), hn::Or(mask_lt_min, mask_gt_max));
        const intptr_t pos = hn::FindFirstTrue(d, found_mask);
        if (pos >= 0) {
            return i + pos;
        }
    }
    // Scalar tail — same predicate as the vector path above.
    for (; i < search_len; ++i) {
        const uint8_t char_ = start_ptr[i];
        if (char_ == '#' || char_ == '@' || char_ < 0x20 || char_ > 127) {
            return i;
        }
    }
    return search_len;
}
// Returns the index of the first byte outside the plain-ASCII range
// [0x20, 127] — i.e. any control character (including '\r' and '\n') or any
// byte > 127 — or search_len when every byte is plain ASCII.
// search_len must be > 0.
size_t IndexOfNewlineOrNonASCIIImpl(const uint8_t* HWY_RESTRICT start_ptr, size_t search_len)
{
    ASSERT(search_len > 0);
    D8 d;
    const size_t lanes = hn::Lanes(d);
    // Bounds of the "boring" byte range; anything outside stops the scan.
    const auto v_low = hn::Set(d, uint8_t { 0x20 });
    const auto v_high = hn::Set(d, uint8_t { 127 });
    // FUTURE TODO: normalize tabs
    // Some tests involving githubactions depend on tabs not being normalized right now.
    const size_t full_chunks_end = search_len - (search_len % lanes);
    size_t offset = 0;
    // Vectorized scan over full chunks.
    while (offset < full_chunks_end) {
        const auto chunk = hn::LoadU(d, start_ptr + offset);
        const auto interesting = hn::Or(hn::Lt(chunk, v_low), hn::Gt(chunk, v_high));
        const intptr_t lane = hn::FindFirstTrue(d, interesting);
        if (lane >= 0) {
            return offset + static_cast<size_t>(lane);
        }
        offset += lanes;
    }
    // Scalar scan over the final partial chunk.
    while (offset < search_len) {
        const uint8_t b = start_ptr[offset];
        if (b < 0x20 || b > 127) {
            return offset;
        }
        ++offset;
    }
    return search_len;
}
// Returns the index of the first byte that is <= ' ' (space or any control
// character, including '\r'/'\n') or > 127 (non-ASCII); returns search_len
// when no such byte exists. search_len must be > 0.
size_t IndexOfSpaceOrNewlineOrNonASCIIImpl(const uint8_t* HWY_RESTRICT start_ptr, size_t search_len)
{
ASSERT(search_len > 0);
D8 d;
const size_t N = hn::Lanes(d);
// Lt(vec, ' ' + 1) is equivalent to the scalar `char_ <= ' '` check below.
const uint8_t after_space = ' ' + 1;
const auto vec_min_ascii_including_space = hn::Set(d, after_space);
const auto vec_max_ascii = hn::Set(d, uint8_t { 127 });
size_t simd_text_len = search_len - (search_len % N);
size_t i = 0;
for (; i < simd_text_len; i += N) {
const auto vec = hn::LoadU(d, start_ptr + i);
const auto mask_lt_min = hn::Lt(vec, vec_min_ascii_including_space);
const auto mask_gt_max = hn::Gt(vec, vec_max_ascii);
const auto found_mask = hn::Or(mask_gt_max, mask_lt_min);
const intptr_t pos = hn::FindFirstTrue(d, found_mask);
if (pos >= 0) {
return i + pos;
}
}
// Scalar tail — same predicate as the vector path above.
for (; i < search_len; ++i) {
const uint8_t char_ = start_ptr[i];
if (char_ <= ' ' || char_ > 127) {
return i;
}
}
return search_len;
}
// Returns true if `text` contains any control character (< 0x20, which
// includes '\r'/'\n'), any non-ASCII byte (> 127), or a double quote '"'.
// Only existence is reported, not the position. text_len must be > 0.
bool ContainsNewlineOrNonASCIIOrQuoteImpl(const uint8_t* HWY_RESTRICT text, size_t text_len)
{
ASSERT(text_len > 0);
D8 d;
const size_t N = hn::Lanes(d);
// SIMD constants
const auto vec_max_ascii = hn::Set(d, uint8_t { 127 });
const auto vec_min_ascii = hn::Set(d, uint8_t { 0x20 });
const auto vec_quote = hn::Set(d, uint8_t { '"' });
size_t i = 0;
const size_t simd_text_len = text_len - (text_len % N);
// Process full vectors
for (; i < simd_text_len; i += N) {
const auto vec = hn::LoadU(d, text + i);
const auto mask_lt_min = hn::Lt(vec, vec_min_ascii);
const auto mask_gt_max = hn::Gt(vec, vec_max_ascii);
const auto mask_quote_eq = hn::Eq(vec, vec_quote);
const auto found_mask = hn::Or(hn::Or(mask_gt_max, mask_lt_min), mask_quote_eq);
// Existence check only, so AllFalse is enough — no FindFirstTrue needed.
if (!hn::AllFalse(d, found_mask)) {
return true;
}
}
// Scalar check for the remainder
for (; i < text_len; ++i) {
const uint8_t char_ = text[i];
if (char_ > 127 || char_ < 0x20 || char_ == '"') {
return true;
}
}
return false;
}
// Finds the first byte of `text` that must be escaped when emitting it inside
// a JavaScript string delimited by `quote_char`; returns text_len when the
// whole string can be emitted verbatim. A byte needs escaping when it is:
//   - >= 127 (DEL or non-ASCII),
//   - a control character below 0x20 other than tab (0x09),
//   - a backslash or the quote character itself,
//   - '$' when emitting into a template literal (is_backtick).
// text_len must be > 0.
template<bool is_backtick>
static size_t IndexOfNeedsEscapeForJavaScriptStringImpl(const uint8_t* HWY_RESTRICT text, size_t text_len, uint8_t quote_char)
{
    ASSERT(text_len > 0);
    D8 d;
    const size_t N = hn::Lanes(d);
    // Set up SIMD constants
    const auto vec_backslash = hn::Set(d, uint8_t { '\\' });
    const auto vec_min_ascii = hn::Set(d, uint8_t { 0x20 });
    const auto vec_tab = hn::Set(d, uint8_t { 0x09 });
    // Bytes >= 127 need escaping, so compare with Gt against 126.
    const auto vec_max_plain = hn::Set(d, uint8_t { 126 });
    const auto vec_quote = hn::Set(d, quote_char);
    const auto vec_dollar = hn::Set(d, uint8_t { '$' });
    ASSERT(is_backtick || quote_char != '`');
    // Calculate how many full SIMD vectors we can process
    const size_t simd_text_len = text_len - (text_len % N);
    size_t i = 0;
    // Process chunks of the string
    for (; i < simd_text_len; i += N) {
        const auto text_vec = hn::LoadU(d, text + i);
        // Keep this predicate in sync with the scalar tail below: the previous
        // vector predicate flagged tab and missed 127, so the answer depended
        // on whether the byte landed in a full vector or in the tail.
        const auto mask_ge_127 = hn::Gt(text_vec, vec_max_plain);
        const auto mask_ctrl = hn::And(hn::Lt(text_vec, vec_min_ascii), hn::Ne(text_vec, vec_tab));
        const auto mask_backslash = hn::Eq(text_vec, vec_backslash);
        const auto mask_quote = hn::Eq(text_vec, vec_quote);
        auto found_mask = hn::Or(
            hn::Or(mask_ge_127, mask_ctrl),
            hn::Or(mask_backslash, mask_quote));
        if constexpr (is_backtick) {
            // Template literals additionally need '$' escaped (for "${").
            found_mask = hn::Or(found_mask, hn::Eq(text_vec, vec_dollar));
        }
        const intptr_t pos = hn::FindFirstTrue(d, found_mask);
        if (pos >= 0) {
            return i + pos;
        }
    }
    // Scalar check for the remainder
    for (; i < text_len; ++i) {
        const uint8_t char_ = text[i];
        if (char_ >= 127 || (char_ < 0x20 && char_ != 0x09) || char_ == '\\' || char_ == quote_char || (is_backtick && char_ == '$')) {
            return i;
        }
    }
    return text_len; // No characters needing escape found
}
// Non-template entry points for the template above: HWY_EXPORT/dispatch
// tables need plain function names, so each instantiation gets a forwarder.
size_t IndexOfNeedsEscapeForJavaScriptStringImplBacktick(const uint8_t* HWY_RESTRICT text, size_t text_len, uint8_t quote_char)
{
return IndexOfNeedsEscapeForJavaScriptStringImpl<true>(text, text_len, quote_char);
}
// Forwarder for the single/double-quote instantiation ('$' not special).
size_t IndexOfNeedsEscapeForJavaScriptStringImplQuote(const uint8_t* HWY_RESTRICT text, size_t text_len, uint8_t quote_char)
{
return IndexOfNeedsEscapeForJavaScriptStringImpl<false>(text, text_len, quote_char);
}
// Highway implementation of memmem
// Returns a pointer to the first occurrence of `needle` in `haystack`,
// or nullptr if not found. The return type is non-const `uint8_t*`
// to match the standard C `memmem` signature, even though the input
// is const. The caller should handle constness appropriately.
// SIMD memmem: scans for the needle's first byte with vector compares, then
// confirms each candidate with memcmp. Returns haystack for an empty needle
// and nullptr when not found, matching libc memmem semantics.
uint8_t* MemMemImpl(const uint8_t* haystack, size_t haystack_len,
const uint8_t* needle, size_t needle_len)
{
// --- Edge Cases ---
if (HWY_UNLIKELY(needle_len == 0)) {
return const_cast<uint8_t*>(haystack);
}
if (HWY_UNLIKELY(haystack_len < needle_len)) {
return nullptr;
}
if (HWY_UNLIKELY(needle_len == 1)) {
// Single-byte needle: delegate to the dedicated char search.
size_t index = IndexOfCharImpl(haystack, haystack_len, needle[0]);
if (index != haystack_len) {
return const_cast<uint8_t*>(haystack + index);
}
return nullptr;
}
// --- SIMD Setup ---
const hn::ScalableTag<uint8_t> d;
const size_t N = hn::Lanes(d);
const uint8_t first_needle_char = needle[0];
const hn::Vec<decltype(d)> v_first_needle = hn::Set(d, first_needle_char);
// Last index at which a full needle can still fit.
const size_t last_possible_start = haystack_len - needle_len;
// --- SIMD Main Loop ---
size_t i = 0;
while (i + N <= haystack_len && i <= last_possible_start) {
const hn::Vec<decltype(d)> haystack_vec = hn::LoadU(d, haystack + i);
hn::Mask<decltype(d)> m_starts = hn::Eq(haystack_vec, v_first_needle);
// Iterate through potential matches within this vector chunk using FindFirstTrue
while (!hn::AllFalse(d, m_starts)) {
const intptr_t bit_idx_ptr = hn::FindFirstTrue(d, m_starts);
// Loop condition guarantees FindFirstTrue finds something
HWY_ASSERT(bit_idx_ptr >= 0);
const size_t bit_idx = static_cast<size_t>(bit_idx_ptr);
const size_t potential_pos = i + bit_idx;
// Double-check bounds (essential if N > needle_len, and correct otherwise)
if (potential_pos <= last_possible_start) {
if (memcmp(haystack + potential_pos, needle, needle_len) == 0) {
return const_cast<uint8_t*>(haystack + potential_pos);
}
} else {
// Optimization: If the first match found in this chunk is already
// beyond the last possible start, no subsequent match in this
// chunk can be valid.
goto remainder_check; // Exit both loops and proceed to scalar remainder
}
// Clear the found bit to find the next one in the next iteration.
// SetOnlyFirst creates a mask with only the first true bit set.
// AndNot removes that bit from m_starts.
const hn::Mask<decltype(d)> first_bit_mask = hn::SetOnlyFirst(m_starts);
m_starts = hn::AndNot(first_bit_mask, m_starts);
} // End while (!AllFalse)
i += N;
} // End SIMD loop
remainder_check:
// --- Scalar Remainder Loop ---
// Check any remaining bytes that couldn't form a full vector load
// or potential starts within the last vector load that weren't checked
// because they were past last_possible_start.
// Start `i` from where the SIMD loop *could* have last started a valid check.
// Rewinding by a full vector re-checks some positions already examined by
// the SIMD loop; this is redundant but harmless, and keeps the logic simple.
size_t remainder_start = (i >= N) ? (i - N) : 0;
// Ensure we re-check any potential starts the SIMD loop might have skipped
// due to the bounds check optimization or being in the final partial vector.
for (; remainder_start <= last_possible_start; ++remainder_start) {
// Optimization: Check first character before expensive memcmp
if (haystack[remainder_start] == first_needle_char) {
if (memcmp(haystack + remainder_start, needle, needle_len) == 0) {
return const_cast<uint8_t*>(haystack + remainder_start);
}
}
}
return nullptr; // Not found
}
// Implementation for WebSocket mask application
// Implementation for WebSocket mask application
// Copies `length` bytes from input to output, XORing each byte with the
// repeating 4-byte WebSocket mask (RFC 6455 masking). When skip_mask is set,
// the data is copied unchanged.
void FillWithSkipMaskImpl(const uint8_t* HWY_RESTRICT mask, size_t mask_len, uint8_t* HWY_RESTRICT output, const uint8_t* HWY_RESTRICT input, size_t length, bool skip_mask)
{
ASSERT(mask_len == 4);
ASSERT(length > 0);
// If we're skipping masking or there's no data, return early
if (skip_mask) {
std::memcpy(output, input, length);
return;
}
D8 d;
const size_t N = hn::Lanes(d);
// Create a vector filled with the mask pattern repeating every 4 bytes
// NOTE(review): both this fill loop and the i += N advance assume
// HWY_MAX_LANES_D(D8) and Lanes(d) are multiples of 4 so the 4-byte mask
// phase stays aligned across iterations. That holds for the SIMD targets,
// but not for HWY_SCALAR (N == 1) — confirm the scalar target is excluded
// from the build.
alignas(HWY_ALIGNMENT) uint8_t mask_pattern[HWY_MAX_LANES_D(D8)] = {};
for (size_t i = 0; i < HWY_MAX_LANES_D(D8); i += 4) {
mask_pattern[i] = mask[0];
mask_pattern[i + 1] = mask[1];
mask_pattern[i + 2] = mask[2];
mask_pattern[i + 3] = mask[3];
}
const auto mask_vec = hn::Load(d, mask_pattern);
// Process data in chunks of size N
size_t i = 0;
const size_t vector_length = length - (length % N);
for (; i < vector_length; i += N) {
// Load input data
const auto input_vec = hn::LoadU(d, input + i);
// XOR with mask
const auto masked_vec = hn::Xor(input_vec, mask_vec);
// Store result
hn::StoreU(masked_vec, d, output + i);
}
// Handle remaining bytes with scalar operations
for (; i < length; ++i) {
output[i] = input[i] ^ mask[i % 4];
}
}
} // namespace HWY_NAMESPACE
} // namespace bun
HWY_AFTER_NAMESPACE();
// HWY_ONCE ensures this block is only included once,
// in the final pass after all target-specific code is generated.
#if HWY_ONCE
namespace bun {
// Define the dispatch tables. The names here must exactly match
// the *Impl function names defined within the HWY_NAMESPACE block above.
// Each HWY_EXPORT emits a per-target function-pointer table that
// HWY_DYNAMIC_DISPATCH consults at runtime to pick the best implementation
// for the current CPU.
HWY_EXPORT(ContainsNewlineOrNonASCIIOrQuoteImpl);
HWY_EXPORT(CopyU16ToU8Impl);
HWY_EXPORT(FillWithSkipMaskImpl);
HWY_EXPORT(IndexOfAnyCharImpl);
HWY_EXPORT(IndexOfCharImpl);
HWY_EXPORT(IndexOfInterestingCharacterInStringLiteralImpl);
HWY_EXPORT(IndexOfNeedsEscapeForJavaScriptStringImplBacktick);
HWY_EXPORT(IndexOfNeedsEscapeForJavaScriptStringImplQuote);
HWY_EXPORT(IndexOfNewlineOrNonASCIIImpl);
HWY_EXPORT(IndexOfNewlineOrNonASCIIOrHashOrAtImpl);
HWY_EXPORT(IndexOfSpaceOrNewlineOrNonASCIIImpl);
HWY_EXPORT(MemMemImpl);
HWY_EXPORT(ScanCharFrequencyImpl);
} // namespace bun
// Define the C-callable wrappers that use HWY_DYNAMIC_DISPATCH.
// These need to be defined *after* the HWY_EXPORT block.
extern "C" {
// C-callable memmem: dispatches to the best SIMD implementation for the CPU.
// Returns a pointer into `haystack` (non-const, matching libc memmem) or
// nullptr when the needle is not found.
void* highway_memmem(const uint8_t* haystack, size_t haystack_len, const uint8_t* needle, size_t needle_len)
{
return HWY_DYNAMIC_DISPATCH(bun::MemMemImpl)(haystack, haystack_len, needle, needle_len);
}
// Internal dispatch helper for highway_copy_u16_to_u8; callers must go
// through the public wrapper below, which handles the unaligned-first-element
// case.
static void highway_copy_u16_to_u8_impl(
const uint16_t* input,
size_t count,
uint8_t* output)
{
return HWY_DYNAMIC_DISPATCH(bun::CopyU16ToU8Impl)(input, count, output);
}
// Public entry point: truncating copy of `count` u16 values into u8 output.
// Tolerates a u16 pointer that is not 2-byte aligned by peeling the first
// element.
void highway_copy_u16_to_u8(
// No HWY_RESTRICT
const uint16_t* input,
size_t count,
// No HWY_RESTRICT
uint8_t* output)
{
if (count == 0) {
return;
}
// Check alignment of the input pointer
// NOTE(review): CopyU16ToU8Impl uses unaligned loads (LoadU) throughout, so
// this peeling step looks defensive rather than required — confirm whether
// callers can actually pass a misaligned uint16_t* (e.g. reinterpreted byte
// buffers) before simplifying.
if (!hwy::IsAligned(input, alignof(uint16_t))) {
// Handle the first unaligned element scalar-ly
output[0] = static_cast<uint8_t>(input[0]);
// Call the core implementation with adjusted pointers and count,
// which are now guaranteed to be aligned or have count == 0.
// The HWY_RESTRICT inside CopyU16ToU8Impl is now valid for the
// ranges it operates on.
if (count > 1)
highway_copy_u16_to_u8_impl(input + 1, count - 1, output + 1);
} else {
// Input is already aligned, call the core implementation directly.
highway_copy_u16_to_u8_impl(input, count, output);
}
}
// The wrappers below are thin C-callable shims over HWY_DYNAMIC_DISPATCH.
// Each one returns the not-found sentinel of its Impl (the input length, or
// false) — see the Impl definitions above for the exact predicates.

// First index of any byte of `chars` in `text`; `text_len` if none.
size_t highway_index_of_any_char(const uint8_t* HWY_RESTRICT text, size_t text_len, const uint8_t* HWY_RESTRICT chars, size_t chars_len)
{
return HWY_DYNAMIC_DISPATCH(bun::IndexOfAnyCharImpl)(text, text_len, chars, chars_len);
}
// Adds `delta` per [a-zA-Z0-9_$] occurrence into the 64-slot `freqs` table.
void highway_char_frequency(const uint8_t* HWY_RESTRICT text, size_t text_len,
int32_t* freqs, int32_t delta)
{
HWY_DYNAMIC_DISPATCH(bun::ScanCharFrequencyImpl)(text, text_len, freqs, delta);
}
// First index of `needle`; `haystack_len` if absent.
size_t highway_index_of_char(const uint8_t* HWY_RESTRICT haystack, size_t haystack_len,
uint8_t needle)
{
return HWY_DYNAMIC_DISPATCH(bun::IndexOfCharImpl)(haystack, haystack_len, needle);
}
// First quote/backslash/non-printable byte in a string literal; `text_len` if none.
size_t highway_index_of_interesting_character_in_string_literal(const uint8_t* HWY_RESTRICT text, size_t text_len, uint8_t quote)
{
return HWY_DYNAMIC_DISPATCH(bun::IndexOfInterestingCharacterInStringLiteralImpl)(text, text_len, quote);
}
// First control (< 0x20) or non-ASCII byte; `haystack_len` if none.
size_t highway_index_of_newline_or_non_ascii(const uint8_t* HWY_RESTRICT haystack, size_t haystack_len)
{
return HWY_DYNAMIC_DISPATCH(bun::IndexOfNewlineOrNonASCIIImpl)(haystack, haystack_len);
}
// Same as above, but also stops on '#' and '@'.
size_t highway_index_of_newline_or_non_ascii_or_hash_or_at(const uint8_t* HWY_RESTRICT haystack, size_t haystack_len)
{
return HWY_DYNAMIC_DISPATCH(bun::IndexOfNewlineOrNonASCIIOrHashOrAtImpl)(haystack, haystack_len);
}
// True if `text` contains a control byte, non-ASCII byte, or '"'.
bool highway_contains_newline_or_non_ascii_or_quote(const uint8_t* HWY_RESTRICT text, size_t text_len)
{
return HWY_DYNAMIC_DISPATCH(bun::ContainsNewlineOrNonASCIIOrQuoteImpl)(text, text_len);
}
// First byte needing escape in a JS string; selects the template-literal
// variant ('$' also escaped) when the delimiter is a backtick.
size_t highway_index_of_needs_escape_for_javascript_string(const uint8_t* HWY_RESTRICT text, size_t text_len, uint8_t quote_char)
{
if (quote_char == '`') {
return HWY_DYNAMIC_DISPATCH(bun::IndexOfNeedsEscapeForJavaScriptStringImplBacktick)(text, text_len, quote_char);
} else {
return HWY_DYNAMIC_DISPATCH(bun::IndexOfNeedsEscapeForJavaScriptStringImplQuote)(text, text_len, quote_char);
}
}
// First byte <= ' ' or > 127; `text_len` if none.
size_t highway_index_of_space_or_newline_or_non_ascii(const uint8_t* HWY_RESTRICT text, size_t text_len)
{
return HWY_DYNAMIC_DISPATCH(bun::IndexOfSpaceOrNewlineOrNonASCIIImpl)(text, text_len);
}
// WebSocket (RFC 6455) masking: output = input XOR repeating 4-byte mask,
// or a plain copy when skip_mask is set.
void highway_fill_with_skip_mask(
const uint8_t* mask, // 4-byte mask array
size_t mask_len, // Should be 4
uint8_t* output, // Output buffer
const uint8_t* input, // Input buffer
size_t length, // Length of input/output
bool skip_mask) // Whether to skip masking
{
HWY_DYNAMIC_DISPATCH(bun::FillWithSkipMaskImpl)(mask, mask_len, output, input, length, skip_mask);
}
} // extern "C"
#if OS(DARWIN)
// On macOS, override the libc memmem with our implementation
// This uses inline assembly to ensure the symbol is exported with the correct name
// (leading underscore is the Mach-O C symbol mangling).
__asm__(".globl _memmem");
__asm__(".set _memmem, _highway_memmem");
#elif OS(LINUX)
// On Linux, override the libc memmem with our implementation
// This uses the GNU-specific attribute to alias our function to the libc symbol
// The alias will be visible across the entire program, not just this file
extern "C" {
// Using both "default" visibility and "weak" ensures our implementation is used
// throughout the entire program when linked, not just in this object file
// NOTE(review): a weak symbol yields to any strong definition, so whether this
// actually wins over libc's memmem depends on link order and static vs dynamic
// linking — confirm. Also note the alias relies on (const void*, size_t,
// const void*, size_t) and (const uint8_t*, ...) having identical ABI.
__attribute__((visibility("default"), weak, used)) void* memmem(const void* haystack, size_t haystacklen, const void* needle, size_t needlelen)
__attribute__((alias("highway_memmem")));
}
#endif
#endif // HWY_ONCE

View File

@@ -355,7 +355,7 @@ pub const Encoder = struct {
switch (comptime encoding) {
.utf8 => {
return strings.elementLengthLatin1IntoUTF8([]const u8, input[0..len]);
return strings.elementLengthLatin1IntoUTF8(input[0..len]);
},
.latin1, .ascii, .buffer => {
@@ -395,7 +395,7 @@ pub const Encoder = struct {
},
.latin1, .ascii, .buffer => {
const out = @min(len, to_len);
strings.copyU16IntoU8(to[0..to_len], []const u16, input[0..out]);
strings.copyU16IntoU8(to[0..to_len], input[0..out]);
return out;
},
// string is already encoded, just need to copy the data
@@ -404,7 +404,7 @@ pub const Encoder = struct {
const bytes_input_len = len * 2;
const written = @min(bytes_input_len, to_len);
const input_u8 = @as([*]const u8, @ptrCast(input));
strings.copyU16IntoU8(to[0..written], []const u8, input_u8[0..written]);
bun.memmove(to[0..written], input_u8[0..written]);
return written;
} else {
const bytes_input_len = len * 2;
@@ -413,7 +413,7 @@ pub const Encoder = struct {
const fixed_len = (written / 2) * 2;
const input_u8 = @as([*]const u8, @ptrCast(input));
strings.copyU16IntoU8(to[0..written], []const u8, input_u8[0..fixed_len]);
bun.memmove(to[0..written], input_u8[0..fixed_len]);
return fixed_len;
}
},
@@ -503,7 +503,7 @@ pub const Encoder = struct {
},
.latin1, .buffer, .ascii => {
var to = allocator.alloc(u8, len) catch return &[_]u8{};
strings.copyU16IntoU8(to[0..len], []const u16, input[0..len]);
strings.copyU16IntoU8(to[0..len], input[0..len]);
return to;
},
// string is already encoded, just need to copy the data

View File

@@ -3595,3 +3595,4 @@ pub fn freeSensitive(allocator: std.mem.Allocator, slice: anytype) void {
pub const server = @import("./bun.js/api/server.zig");
pub const macho = @import("./macho.zig");
pub const valkey = @import("./valkey/index.zig");
pub const highway = @import("./highway.zig");

View File

@@ -2986,7 +2986,7 @@ pub fn parse_attribute_selector(comptime Impl: type, parser: *SelectorParser, in
};
const never_matches = switch (operator) {
.equal, .dash_match => false,
.includes => value_str.len == 0 or std.mem.indexOfAny(u8, value_str, SELECTOR_WHITESPACE) != null,
.includes => value_str.len == 0 or bun.strings.indexOfAny(value_str, SELECTOR_WHITESPACE) != null,
.prefix, .substring, .suffix => value_str.len == 0,
};

View File

@@ -1422,43 +1422,10 @@ pub fn GlobWalker_(
return filepath.len > 0 and filepath[0] == '.';
}
const syntax_tokens = "*[{?!";
fn checkSpecialSyntax(pattern: []const u8) bool {
if (pattern.len < 16) {
for (pattern[0..]) |c| {
switch (c) {
'*', '[', '{', '?', '!' => return true,
else => {},
}
}
return false;
}
const syntax_tokens = comptime [_]u8{ '*', '[', '{', '?', '!' };
const needles: [syntax_tokens.len]@Vector(16, u8) = comptime needles: {
var needles: [syntax_tokens.len]@Vector(16, u8) = undefined;
for (syntax_tokens, 0..) |tok, i| {
needles[i] = @splat(tok);
}
break :needles needles;
};
var i: usize = 0;
while (i + 16 <= pattern.len) : (i += 16) {
const haystack: @Vector(16, u8) = pattern[i..][0..16].*;
inline for (needles) |needle| {
if (std.simd.firstTrue(needle == haystack) != null) return true;
}
}
if (i < pattern.len) {
for (pattern[i..]) |c| {
inline for (syntax_tokens) |tok| {
if (c == tok) return true;
}
}
}
return false;
return bun.strings.indexOfAny(pattern, syntax_tokens) != null;
}
fn makeComponent(

305
src/highway.zig Normal file
View File

@@ -0,0 +1,305 @@
const std = @import("std");
const bun = @import("bun");
const strings = bun.strings;
const string = bun.string;
const Environment = bun.Environment;
// Extern declarations for the C-callable wrappers defined in
// highway_strings.cpp. All index-returning functions use the input length as
// the not-found sentinel; the Zig wrappers below translate that to `null`.
extern "c" fn highway_char_frequency(
text: [*]const u8,
text_len: usize,
freqs: [*]i32,
delta: i32,
) void;
extern "c" fn highway_index_of_char(
haystack: [*]const u8,
haystack_len: usize,
needle: u8,
) usize;
extern "c" fn highway_index_of_interesting_character_in_string_literal(
noalias text: [*]const u8,
text_len: usize,
quote: u8,
) usize;
extern "c" fn highway_index_of_newline_or_non_ascii(
noalias haystack: [*]const u8,
haystack_len: usize,
) usize;
// NOTE(review): no matching extern "C" definition for this symbol is visible
// in highway_strings.cpp — confirm it is defined elsewhere, otherwise any use
// of indexOfNewlineOrNonASCIIOrANSI will fail to link.
extern "c" fn highway_index_of_newline_or_non_ascii_or_ansi(
noalias haystack: [*]const u8,
haystack_len: usize,
) usize;
extern "c" fn highway_index_of_newline_or_non_ascii_or_hash_or_at(
noalias haystack: [*]const u8,
haystack_len: usize,
) usize;
extern "c" fn highway_index_of_space_or_newline_or_non_ascii(
noalias haystack: [*]const u8,
haystack_len: usize,
) usize;
extern "c" fn highway_contains_newline_or_non_ascii_or_quote(
noalias text: [*]const u8,
text_len: usize,
) bool;
extern "c" fn highway_index_of_needs_escape_for_javascript_string(
noalias text: [*]const u8,
text_len: usize,
quote_char: u8,
) usize;
extern "c" fn highway_index_of_any_char(
noalias text: [*]const u8,
text_len: usize,
noalias chars: [*]const u8,
chars_len: usize,
) usize;
extern "c" fn highway_fill_with_skip_mask(
mask: [*]const u8,
mask_len: usize,
output: [*]u8,
input: [*]const u8,
length: usize,
skip_mask: bool,
) void;
/// Count frequencies of [a-zA-Z0-9_$] characters in a string.
/// For every occurrence, `delta` is added to the corresponding slot of
/// the 64-entry `freqs` table. A zero delta or empty input is a no-op.
pub fn scanCharFrequency(text: string, freqs: *[64]i32, delta: i32) void {
    if (delta != 0 and text.len != 0) {
        highway_char_frequency(text.ptr, text.len, freqs.ptr, delta);
    }
}
/// Index of the first occurrence of `needle` in `haystack`, or null.
/// The C kernel reports "not found" by returning `haystack.len`.
pub fn indexOfChar(haystack: string, needle: u8) ?usize {
    if (haystack.len == 0) return null;

    const idx = highway_index_of_char(haystack.ptr, haystack.len, needle);
    if (idx == haystack.len) return null;

    // Sanity-check the kernel's answer in debug builds.
    bun.debugAssert(haystack[idx] == needle);
    return idx;
}
/// Index of the first "interesting" byte while lexing a string literal
/// quoted with `quote_type` (as decided by the Highway kernel), or null
/// when the rest of the slice is uninteresting.
pub fn indexOfInterestingCharacterInStringLiteral(slice: string, quote_type: u8) ?usize {
    if (slice.len == 0) return null;

    const idx = highway_index_of_interesting_character_in_string_literal(slice.ptr, slice.len, quote_type);
    return if (idx == slice.len) null else idx;
}
/// Index of the first newline, control, or non-ASCII byte, or null when
/// the whole haystack is printable ASCII. Asserts a non-empty input.
pub fn indexOfNewlineOrNonASCII(haystack: string) ?usize {
    bun.debugAssert(haystack.len > 0);

    const idx = highway_index_of_newline_or_non_ascii(haystack.ptr, haystack.len);
    if (idx == haystack.len) return null;

    if (comptime Environment.isDebug) {
        const c = haystack[idx];
        const interesting = c > 127 or c < 0x20 or c == '\r' or c == '\n';
        if (!interesting) @panic("Invalid character found in indexOfNewlineOrNonASCII");
    }

    return idx;
}
/// Like `indexOfNewlineOrNonASCII`, but the underlying kernel also stops
/// on ANSI escape bytes. Asserts a non-empty input; returns null when
/// nothing interesting is present.
pub fn indexOfNewlineOrNonASCIIOrANSI(haystack: string) ?usize {
    bun.debugAssert(haystack.len > 0);

    const idx = highway_index_of_newline_or_non_ascii_or_ansi(haystack.ptr, haystack.len);
    if (idx == haystack.len) return null;

    if (comptime Environment.isDebug) {
        const c = haystack[idx];
        // ESC (0x1b) is covered by the `< 0x20` control-character check.
        const interesting = c > 127 or c < 0x20 or c == '\r' or c == '\n';
        if (!interesting) @panic("Invalid character found in indexOfNewlineOrNonASCIIOrANSI");
    }

    return idx;
}
/// Checks if the string contains any newlines, non-ASCII characters, or quotes.
/// An empty string trivially contains none of them.
pub fn containsNewlineOrNonASCIIOrQuote(text: string) bool {
    return text.len != 0 and highway_contains_newline_or_non_ascii_or_quote(text.ptr, text.len);
}
/// Finds the first character that needs escaping in a JavaScript string
/// literal quoted with `quote_char`: bytes above ASCII (> 127), control
/// characters (< 0x20), backslash, the quote character itself, and for
/// backtick strings also `$`. Returns null if nothing needs escaping.
pub fn indexOfNeedsEscapeForJavaScriptString(slice: string, quote_char: u8) ?u32 {
    if (slice.len == 0) return null;

    const idx = highway_index_of_needs_escape_for_javascript_string(slice.ptr, slice.len, quote_char);
    if (idx == slice.len) return null;

    if (comptime Environment.isDebug) {
        const c = slice[idx];
        const needs_escape = c > 127 or c < 0x20 or c == '\\' or c == quote_char or c == '$' or c == '\r' or c == '\n';
        if (!needs_escape) {
            @panic("Invalid character found in indexOfNeedsEscapeForJavaScriptString");
        }
    }

    return @truncate(idx);
}
/// Index of the first byte of `haystack` that appears in `chars`, or null.
pub fn indexOfAnyChar(haystack: string, chars: string) ?usize {
    if (chars.len == 0 or haystack.len == 0) return null;

    const idx = highway_index_of_any_char(haystack.ptr, haystack.len, chars.ptr, chars.len);
    if (idx == haystack.len) return null;

    if (comptime Environment.isDebug) {
        // The reported byte must actually be one of the needles.
        if (std.mem.indexOfScalar(u8, chars, haystack[idx]) == null) {
            @panic("Invalid character found in indexOfAnyChar");
        }
    }

    return idx;
}
// Narrows `count` u16 code units into bytes at `output`.
// NOTE(review): judging by the scalar code this replaces in strings.zig
// (`@truncate` per element), this keeps the low byte of each unit —
// confirm against the C implementation.
extern "c" fn highway_copy_u16_to_u8(
input: [*]align(1) const u16,
count: usize,
output: [*]u8,
) void;
/// Copy `input.len` UTF-16 code units into `output` as single bytes.
/// Caller must guarantee `output.len >= input.len`; no bounds check here.
pub fn copyU16ToU8(input: []align(1) const u16, output: []u8) void {
highway_copy_u16_to_u8(input.ptr, input.len, output.ptr);
}
/// Apply a WebSocket mask to data using SIMD acceleration.
/// If `skip_mask` is true, `input` is copied into `output` without masking;
/// otherwise each byte is XORed with the repeating 4-byte `mask`.
pub fn fillWithSkipMask(mask: [4]u8, output: []u8, input: []const u8, skip_mask: bool) void {
    if (input.len == 0) return;
    highway_fill_with_skip_mask(&mask, 4, output.ptr, input.ptr, input.len, skip_mask);
}
/// Useful for single-line JavaScript comments.
/// Scans for:
/// - `\n`, `\r`
/// - Non-ASCII characters (which implicitly include `\n`, `\r`)
/// - `#`
/// - `@`
/// Returns null when none of the above occur in `haystack`.
pub fn indexOfNewlineOrNonASCIIOrHashOrAt(haystack: string) ?usize {
    if (haystack.len == 0) return null;

    const idx = highway_index_of_newline_or_non_ascii_or_hash_or_at(haystack.ptr, haystack.len);
    return if (idx == haystack.len) null else idx;
}
/// Scans for:
/// - " "
/// - Non-ASCII characters (which implicitly include `\n`, `\r`, '\t')
/// Returns null when the haystack contains neither.
pub fn indexOfSpaceOrNewlineOrNonASCII(haystack: string) ?usize {
    if (haystack.len == 0) return null;

    const idx = highway_index_of_space_or_newline_or_non_ascii(haystack.ptr, haystack.len);
    return if (idx == haystack.len) null else idx;
}

View File

@@ -731,80 +731,17 @@ pub const Mask = struct {
const mask = mask_buf.*;
const skip_mask = @as(u32, @bitCast(mask)) == 0;
if (!skip_mask) {
fillWithSkipMask(mask, output_, input_, false);
} else {
fillWithSkipMask(mask, output_, input_, true);
}
fillWithSkipMask(mask, output_, input_, skip_mask);
}
fn fillWithSkipMask(mask: [4]u8, output_: []u8, input_: []const u8, comptime skip_mask: bool) void {
var input = input_;
var output = output_;
if (comptime Environment.enableSIMD) {
if (input.len >= strings.ascii_vector_size) {
const vec: strings.AsciiVector = brk: {
var in: [strings.ascii_vector_size]u8 = undefined;
comptime var i: usize = 0;
inline while (i < strings.ascii_vector_size) : (i += 4) {
in[i..][0..4].* = mask;
}
break :brk @as(strings.AsciiVector, in);
};
const end_ptr_wrapped_to_last_16 = input.ptr + input.len - (input.len % strings.ascii_vector_size);
if (comptime skip_mask) {
while (input.ptr != end_ptr_wrapped_to_last_16) {
const input_vec: strings.AsciiVector = @as(strings.AsciiVector, input[0..strings.ascii_vector_size].*);
output.ptr[0..strings.ascii_vector_size].* = input_vec;
output = output[strings.ascii_vector_size..];
input = input[strings.ascii_vector_size..];
}
} else {
while (input.ptr != end_ptr_wrapped_to_last_16) {
const input_vec: strings.AsciiVector = @as(strings.AsciiVector, input[0..strings.ascii_vector_size].*);
output.ptr[0..strings.ascii_vector_size].* = input_vec ^ vec;
output = output[strings.ascii_vector_size..];
input = input[strings.ascii_vector_size..];
}
}
}
// hint to the compiler not to vectorize the next loop
bun.assert(input.len < strings.ascii_vector_size);
}
if (comptime !skip_mask) {
while (input.len >= 4) {
const input_vec: [4]u8 = input[0..4].*;
output.ptr[0..4].* = [4]u8{
input_vec[0] ^ mask[0],
input_vec[1] ^ mask[1],
input_vec[2] ^ mask[2],
input_vec[3] ^ mask[3],
};
output = output[4..];
input = input[4..];
}
} else {
while (input.len >= 4) {
const input_vec: [4]u8 = input[0..4].*;
output.ptr[0..4].* = input_vec;
output = output[4..];
input = input[4..];
}
}
if (comptime !skip_mask) {
for (input, 0..) |c, i| {
output[i] = c ^ mask[i % 4];
}
} else {
for (input, 0..) |c, i| {
output[i] = c;
}
fn fillWithSkipMask(mask: [4]u8, output_: []u8, input_: []const u8, skip_mask: bool) void {
const input = input_;
const output = output_;
if (input.len == 0) {
@branchHint(.unlikely);
return;
}
return bun.highway.fillWithSkipMask(mask, output, input, skip_mask);
}
};
@@ -902,7 +839,7 @@ const Copy = union(enum) {
return WebsocketHeader.frameSizeIncludingMask(byte_len.*);
},
.latin1 => {
byte_len.* = strings.elementLengthLatin1IntoUTF8([]const u8, this.latin1);
byte_len.* = strings.elementLengthLatin1IntoUTF8(this.latin1);
return WebsocketHeader.frameSizeIncludingMask(byte_len.*);
},
.bytes => {

View File

@@ -806,6 +806,10 @@ fn NewLexer_(
return if (!(cp_len + it.current > it.source.contents.len)) it.source.contents[it.current .. cp_len + it.current] else "";
}
fn remaining(it: *const LexerType) []const u8 {
return it.source.contents[it.current..];
}
inline fn nextCodepoint(it: *LexerType) CodePoint {
if (it.current >= it.source.contents.len) {
it.end = it.source.contents.len;
@@ -1498,26 +1502,14 @@ fn NewLexer_(
lexer.token = .t_slash_equals;
},
'/' => {
singleLineComment: while (true) {
lexer.step();
switch (lexer.code_point) {
'\r', '\n', 0x2028, 0x2029 => {
break :singleLineComment;
},
-1 => {
break :singleLineComment;
},
else => {},
}
}
lexer.scanSingleLineComment();
if (comptime is_json) {
if (!json.allow_comments) {
try lexer.addRangeError(lexer.range(), "JSON does not support comments", .{}, true);
return;
}
}
lexer.scanCommentText();
lexer.scanCommentText(false);
continue;
},
'*' => {
@@ -1571,7 +1563,7 @@ fn NewLexer_(
return;
}
}
lexer.scanCommentText();
lexer.scanCommentText(true);
continue;
},
else => {
@@ -1890,7 +1882,7 @@ fn NewLexer_(
}
}
fn scanCommentText(lexer: *LexerType) void {
fn scanCommentText(lexer: *LexerType, for_pragma: bool) void {
const text = lexer.source.contents[lexer.start..lexer.end];
const has_legal_annotation = text.len > 2 and text[2] == '!';
const is_multiline_comment = text.len > 1 and text[1] == '*';
@@ -1922,120 +1914,132 @@ fn NewLexer_(
if (comptime is_json)
return;
var rest = text[0..end_comment_text];
const end = rest.ptr + rest.len;
if (comptime Environment.enableSIMD) {
const wrapped_len = rest.len - (rest.len % strings.ascii_vector_size);
const comment_end = rest.ptr + wrapped_len;
while (rest.ptr != comment_end) {
const vec: strings.AsciiVector = rest.ptr[0..strings.ascii_vector_size].*;
// lookahead for any # or @ characters
const hashtag = @as(strings.AsciiVectorU1, @bitCast(vec == @as(strings.AsciiVector, @splat(@as(u8, '#')))));
const at = @as(strings.AsciiVectorU1, @bitCast(vec == @as(strings.AsciiVector, @splat(@as(u8, '@')))));
if (@reduce(.Max, hashtag + at) == 1) {
rest.len = @intFromPtr(end) - @intFromPtr(rest.ptr);
if (comptime Environment.allow_assert) {
bun.assert(
strings.containsChar(&@as([strings.ascii_vector_size]u8, vec), '#') or
strings.containsChar(&@as([strings.ascii_vector_size]u8, vec), '@'),
);
}
for (@as([strings.ascii_vector_size]u8, vec), 0..) |c, i| {
switch (c) {
'@', '#' => {
const chunk = rest[i + 1 ..];
if (!lexer.has_pure_comment_before) {
if (strings.hasPrefixWithWordBoundary(chunk, "__PURE__")) {
lexer.has_pure_comment_before = true;
continue;
}
// TODO: implement NO_SIDE_EFFECTS
// else if (strings.hasPrefixWithWordBoundary(chunk, "__NO_SIDE_EFFECTS__")) {
// lexer.has_no_side_effect_comment_before = true;
// continue;
// }
}
if (strings.hasPrefixWithWordBoundary(chunk, "jsx")) {
if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsx", chunk)) |span| {
lexer.jsx_pragma._jsx = span;
}
} else if (strings.hasPrefixWithWordBoundary(chunk, "jsxFrag")) {
if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsxFrag", chunk)) |span| {
lexer.jsx_pragma._jsxFrag = span;
}
} else if (strings.hasPrefixWithWordBoundary(chunk, "jsxRuntime")) {
if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsxRuntime", chunk)) |span| {
lexer.jsx_pragma._jsxRuntime = span;
}
} else if (strings.hasPrefixWithWordBoundary(chunk, "jsxImportSource")) {
if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsxImportSource", chunk)) |span| {
lexer.jsx_pragma._jsxImportSource = span;
}
} else if (i == 2 and strings.hasPrefixComptime(chunk, " sourceMappingURL=")) {
if (PragmaArg.scan(.no_space_first, lexer.start + i + 1, " sourceMappingURL=", chunk)) |span| {
lexer.source_mapping_url = span;
}
}
},
else => {},
}
}
}
rest.ptr += strings.ascii_vector_size;
}
rest.len = @intFromPtr(end) - @intFromPtr(rest.ptr);
if (!for_pragma) {
return;
}
if (comptime Environment.allow_assert)
bun.assert(rest.len == 0 or bun.isSliceInBuffer(rest, text));
var rest = text[0..end_comment_text];
while (rest.len > 0) {
const c = rest[0];
rest = rest[1..];
while (strings.indexOfAny(rest, "@#")) |i| {
const c = rest[i];
rest = rest[@min(i + 1, rest.len)..];
switch (c) {
'@', '#' => {
const chunk = rest;
const i = @intFromPtr(chunk.ptr) - @intFromPtr(text.ptr);
if (!lexer.has_pure_comment_before) {
if (strings.hasPrefixWithWordBoundary(chunk, "__PURE__")) {
lexer.has_pure_comment_before = true;
continue;
}
}
const offset = lexer.scanPragma(lexer.start + i + (text.len - rest.len), chunk, false);
if (strings.hasPrefixWithWordBoundary(chunk, "jsx")) {
if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsx", chunk)) |span| {
lexer.jsx_pragma._jsx = span;
}
} else if (strings.hasPrefixWithWordBoundary(chunk, "jsxFrag")) {
if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsxFrag", chunk)) |span| {
lexer.jsx_pragma._jsxFrag = span;
}
} else if (strings.hasPrefixWithWordBoundary(chunk, "jsxRuntime")) {
if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsxRuntime", chunk)) |span| {
lexer.jsx_pragma._jsxRuntime = span;
}
} else if (strings.hasPrefixWithWordBoundary(chunk, "jsxImportSource")) {
if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsxImportSource", chunk)) |span| {
lexer.jsx_pragma._jsxImportSource = span;
}
} else if (i == 2 and strings.hasPrefixComptime(chunk, " sourceMappingURL=")) {
if (PragmaArg.scan(.no_space_first, lexer.start + i + 1, " sourceMappingURL=", chunk)) |span| {
lexer.source_mapping_url = span;
}
}
rest = rest[
// The @min is necessary because the file could end
// with a pragma and hasPrefixWithWordBoundary
// returns true when that "word boundary" is EOF
@min(offset, rest.len)..];
},
else => {},
}
}
}
/// This scans a "// comment" in a single pass over the input.
fn scanSingleLineComment(lexer: *LexerType) void {
while (true) {
// Find index of newline (ASCII/Unicode), non-ASCII, '#', or '@'.
if (bun.highway.indexOfNewlineOrNonASCIIOrHashOrAt(lexer.remaining())) |relative_index| {
const absolute_index = lexer.current + relative_index;
lexer.current = absolute_index; // Move TO the interesting char
lexer.step(); // Consume the interesting char, sets code_point, advances current
switch (lexer.code_point) {
'\r', '\n', 0x2028, 0x2029 => { // Is it a line terminator?
// Found the end of the comment line.
return; // Stop scanning. Lexer state is ready for the next token.
},
-1 => {
return;
}, // EOF? Stop.
'#', '@' => {
if (comptime !is_json) {
const pragma_trigger_pos = lexer.end; // Position OF #/@
// Use remaining() which starts *after* the consumed #/@
const chunk = lexer.remaining();
const offset = lexer.scanPragma(pragma_trigger_pos, chunk, true);
if (offset > 0) {
// Pragma found (e.g., __PURE__).
// Advance current past the pragma's argument text.
// 'current' is already after the #/@ trigger.
lexer.current += offset;
// Do NOT consume the character immediately after the pragma.
// Let the main loop find the actual line terminator.
// Continue the outer loop from the position AFTER the pragma arg.
continue;
}
// If offset == 0, it wasn't a valid pragma start.
}
// Not a pragma or is_json. Treat #/@ as a normal comment character.
// The character was consumed by step(). Let the outer loop continue.
continue;
},
else => {
// Non-ASCII (but not LS/PS), etc. Treat as normal comment char.
// The character was consumed by step(). Let the outer loop continue.
continue;
},
}
} else { // Highway found nothing until EOF
// Consume the rest of the line.
lexer.end = lexer.source.contents.len;
lexer.current = lexer.source.contents.len;
lexer.code_point = -1; // Set EOF state
return;
}
}
unreachable;
}
/// Scans the string for a pragma.
/// offset is used when there's an issue with the JSX pragma later on.
/// Returns the byte length to advance by if found, otherwise 0.
fn scanPragma(lexer: *LexerType, offset_for_errors: usize, chunk: string, allow_newline: bool) usize {
if (!lexer.has_pure_comment_before) {
if (strings.hasPrefixWithWordBoundary(chunk, "__PURE__")) {
lexer.has_pure_comment_before = true;
return "__PURE__".len;
}
}
if (strings.hasPrefixWithWordBoundary(chunk, "jsx")) {
if (PragmaArg.scan(.skip_space_first, lexer.start + offset_for_errors, "jsx", chunk, allow_newline)) |span| {
lexer.jsx_pragma._jsx = span;
return "jsx".len +
if (span.range.len > 0) @as(usize, @intCast(span.range.len)) else 0;
}
} else if (strings.hasPrefixWithWordBoundary(chunk, "jsxFrag")) {
if (PragmaArg.scan(.skip_space_first, lexer.start + offset_for_errors, "jsxFrag", chunk, allow_newline)) |span| {
lexer.jsx_pragma._jsxFrag = span;
return "jsxFrag".len +
if (span.range.len > 0) @as(usize, @intCast(span.range.len)) else 0;
}
} else if (strings.hasPrefixWithWordBoundary(chunk, "jsxRuntime")) {
if (PragmaArg.scan(.skip_space_first, lexer.start + offset_for_errors, "jsxRuntime", chunk, allow_newline)) |span| {
lexer.jsx_pragma._jsxRuntime = span;
return "jsxRuntime".len +
if (span.range.len > 0) @as(usize, @intCast(span.range.len)) else 0;
}
} else if (strings.hasPrefixWithWordBoundary(chunk, "jsxImportSource")) {
if (PragmaArg.scan(.skip_space_first, lexer.start + offset_for_errors, "jsxImportSource", chunk, allow_newline)) |span| {
lexer.jsx_pragma._jsxImportSource = span;
return "jsxImportSource".len +
if (span.range.len > 0) @as(usize, @intCast(span.range.len)) else 0;
}
} else if (chunk.len >= " sourceMappingURL=".len + 1 and strings.hasPrefixComptime(chunk, " sourceMappingURL=")) { // Check includes space for prefix
return PragmaArg.scanSourceMappingURLValue(lexer.start, offset_for_errors, chunk, &lexer.source_mapping_url);
}
return 0;
}
// TODO: implement this
pub fn removeMultilineCommentIndent(_: *LexerType, _: string, text: string) string {
return text;
@@ -2123,7 +2127,7 @@ fn NewLexer_(
return js_ast.E.String.init(try lexer.allocator.dupe(u16, lexer.temp_buffer_u16.items));
} else {
const result = try lexer.allocator.alloc(u8, lexer.temp_buffer_u16.items.len);
strings.copyU16IntoU8(result, []const u16, lexer.temp_buffer_u16.items);
strings.copyU16IntoU8(result, lexer.temp_buffer_u16.items);
return js_ast.E.String.init(result);
}
},
@@ -3267,7 +3271,48 @@ pub const PragmaArg = enum {
no_space_first,
skip_space_first,
pub fn scan(kind: PragmaArg, offset_: usize, pragma: string, text_: string) ?js_ast.Span {
/// True for the code points JavaScript treats as line terminators:
/// CR, LF, U+2028 LINE SEPARATOR, U+2029 PARAGRAPH SEPARATOR.
pub fn isNewline(c: CodePoint) bool {
    return switch (c) {
        '\r', '\n', 0x2028, 0x2029 => true,
        else => false,
    };
}
// These can be extremely long, so we use SIMD.
/// "//# sourceMappingURL=data:/adspaoksdpkz"
/// ^^^^^^^^^^^^^^^^^^
pub fn scanSourceMappingURLValue(start: usize, offset_for_errors: usize, chunk: string, result: *?js_ast.Span) usize {
const prefix: u32 = " sourceMappingURL=".len;
const url_and_rest_of_code = chunk[prefix..]; // Slice containing only the potential argument
const url_len: usize = brk: {
if (bun.strings.indexOfSpaceOrNewlineOrNonASCII(url_and_rest_of_code, 0)) |delimiter_pos_in_arg| {
// SIMD found the delimiter at index 'delimiter_pos_in_arg' relative to url start.
// The argument's length is exactly this index.
break :brk delimiter_pos_in_arg;
} else {
// SIMD found no delimiter in the entire url.
// The argument is the whole chunk.
break :brk url_and_rest_of_code.len;
}
};
// Now we have the correct argument length (url_len) and the argument text.
const url = url_and_rest_of_code[0..url_len];
// Calculate absolute start location of the argument
const absolute_arg_start = start + offset_for_errors + prefix;
result.* = js_ast.Span{
.range = logger.Range{
.len = @as(i32, @intCast(url_len)), // Correct length
.loc = .{ .start = @as(i32, @intCast(absolute_arg_start)) }, // Correct start
},
.text = url,
};
// Return total length consumed from the start of the chunk
return prefix + url_len; // Correct total length
}
pub fn scan(kind: PragmaArg, offset_: usize, pragma: string, text_: string, allow_newline: bool) ?js_ast.Span {
var text = text_[pragma.len..];
var iter = strings.CodepointIterator.init(text);
@@ -3297,7 +3342,7 @@ pub const PragmaArg = enum {
}
var i: usize = 0;
while (!isWhitespace(cursor.c)) {
while (!isWhitespace(cursor.c) and (!allow_newline or !isNewline(cursor.c))) {
i += cursor.width;
if (i >= text.len) {
break;
@@ -3356,28 +3401,5 @@ fn skipToInterestingCharacterInMultilineComment(text_: []const u8) ?u32 {
}
fn indexOfInterestingCharacterInStringLiteral(text_: []const u8, quote: u8) ?usize {
var text = text_;
const quote_: @Vector(strings.ascii_vector_size, u8) = @splat(@as(u8, quote));
const backslash: @Vector(strings.ascii_vector_size, u8) = @splat(@as(u8, '\\'));
const V1x16 = strings.AsciiVectorU1;
while (text.len >= strings.ascii_vector_size) {
const vec: strings.AsciiVector = text[0..strings.ascii_vector_size].*;
const any_significant =
@as(V1x16, @bitCast(vec > strings.max_16_ascii)) |
@as(V1x16, @bitCast(vec < strings.min_16_ascii)) |
@as(V1x16, @bitCast(quote_ == vec)) |
@as(V1x16, @bitCast(backslash == vec));
if (@reduce(.Max, any_significant) > 0) {
const bitmask = @as(u16, @bitCast(any_significant));
const first = @ctz(bitmask);
bun.assert(first < strings.ascii_vector_size);
return first + (@intFromPtr(text.ptr) - @intFromPtr(text_.ptr));
}
text = text[strings.ascii_vector_size..];
}
return null;
return bun.highway.indexOfInterestingCharacterInStringLiteral(text_, quote);
}

View File

@@ -165,7 +165,7 @@ pub fn estimateLengthForUTF8(input: []const u8, comptime ascii_only: bool, compt
var remaining = input;
var len: usize = 2; // for quotes
while (strings.indexOfNeedsEscape(remaining, quote_char)) |i| {
while (strings.indexOfNeedsEscapeForJavaScriptString(remaining, quote_char)) |i| {
len += i;
remaining = remaining[i..];
const char_len = strings.wtf8ByteSequenceLengthWithInvalid(remaining[0]);
@@ -249,7 +249,7 @@ pub fn writePreQuotedString(text_in: []const u8, comptime Writer: type, writer:
switch (encoding) {
.ascii, .utf8 => {
if (strings.indexOfNeedsEscape(remain, quote_char)) |j| {
if (strings.indexOfNeedsEscapeForJavaScriptString(remain, quote_char)) |j| {
const text_chunk = text[i .. i + clamped_width];
try writer.writeAll(text_chunk);
i += clamped_width;

View File

@@ -95,60 +95,6 @@ fn StackStack(comptime T: type, comptime SizeType: type, comptime N: SizeType) t
};
}
/// This may have false positives but it is fast.
/// Returns true once both `{` and `}` have been seen anywhere in `src`
/// (in any order) — a cheap pre-filter before running the real brace
/// expansion parser.
fn fastDetect(src: []const u8) bool {
    var has_open = false;
    var has_close = false;

    // Short inputs: plain scalar scan, no SIMD setup cost.
    if (src.len < 16) {
        for (src) |char| {
            switch (char) {
                '{' => has_open = true,
                '}' => has_close = true,
                // Fixed: switch on u8 must be exhaustive.
                else => {},
            }
            // Fixed: was `has_close and has_close`, which never noticed `{`.
            if (has_open and has_close) return true;
        }
        return false;
    }

    // Two needles only: `{` and `}`. (A stray third `"` initializer in a
    // `[2]`-typed array was removed — it could not compile and the function
    // never consults quotes.)
    const open_needle: @Vector(16, u8) = @splat('{');
    const close_needle: @Vector(16, u8) = @splat('}');

    // Fixed: `i` must advance by 16 each iteration (it was `const` and the
    // loop never incremented it).
    var i: usize = 0;
    while (i + 16 <= src.len) : (i += 16) {
        const haystack: @Vector(16, u8) = src[i..][0..16].*;
        if (std.simd.firstTrue(open_needle == haystack) != null) has_open = true;
        if (std.simd.firstTrue(close_needle == haystack) != null) has_close = true;
        if (has_open and has_close) return true;
    }

    // Scalar tail for the remaining < 16 bytes (fixed: previously rescanned
    // all of `src` instead of only the tail).
    for (src[i..]) |char| {
        switch (char) {
            '{' => has_open = true,
            '}' => has_close = true,
            else => {},
        }
        if (has_open and has_close) return true;
    }

    return false;
}
const ExpandError = StackError || ParserError;
/// `out` is preallocated by using the result from `calculateExpandedAmount`

View File

@@ -3563,26 +3563,10 @@ var stderr_mutex = bun.Mutex{};
pub fn hasEqSign(str: []const u8) ?u32 {
if (isAllAscii(str)) {
if (str.len < 16)
return hasEqSignAsciiSlow(str);
const needles: @Vector(16, u8) = @splat('=');
var i: u32 = 0;
while (i + 16 <= str.len) : (i += 16) {
const haystack = str[i..][0..16].*;
const result = haystack == needles;
if (std.simd.firstTrue(result)) |idx| {
return @intCast(i + idx);
}
}
return i + (hasEqSignAsciiSlow(str[i..]) orelse return null);
return bun.strings.indexOfChar(str, '=');
}
// TODO actually i think that this can also use the simd stuff
var iter = CodepointIterator.init(str);
var cursor = CodepointIterator.Cursor{};
while (iter.next(&cursor)) {
@@ -3594,11 +3578,6 @@ pub fn hasEqSign(str: []const u8) ?u32 {
return null;
}
pub fn hasEqSignAsciiSlow(str: []const u8) ?u32 {
for (str, 0..) |c, i| if (c == '=') return @intCast(i);
return null;
}
pub const CmdEnvIter = struct {
env: *const bun.StringArrayHashMap([:0]const u8),
iter: bun.StringArrayHashMap([:0]const u8).Iterator,

View File

@@ -96,43 +96,14 @@ fn literalLength(comptime T: type, comptime str: string) usize {
pub const OptionalUsize = std.meta.Int(.unsigned, @bitSizeOf(usize) - 1);
pub fn indexOfAny(slice: string, comptime str: []const u8) ?OptionalUsize {
switch (comptime str.len) {
return switch (comptime str.len) {
0 => @compileError("str cannot be empty"),
1 => return indexOfChar(slice, str[0]),
else => {},
}
var remaining = slice;
if (remaining.len == 0) return null;
if (comptime Environment.enableSIMD) {
while (remaining.len >= ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
var cmp: AsciiVectorU1 = @bitCast(vec == @as(AsciiVector, @splat(@as(u8, str[0]))));
inline for (str[1..]) |c| {
cmp |= @bitCast(vec == @as(AsciiVector, @splat(@as(u8, c))));
}
if (@reduce(.Max, cmp) > 0) {
const bitmask = @as(AsciiVectorInt, @bitCast(cmp));
const first = @ctz(bitmask);
return @as(OptionalUsize, @intCast(first + slice.len - remaining.len));
}
remaining = remaining[ascii_vector_size..];
}
if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size);
}
for (remaining, 0..) |c, i| {
if (strings.indexOfChar(str, c) != null) {
return @as(OptionalUsize, @intCast(i + slice.len - remaining.len));
}
}
return null;
else => if (bun.highway.indexOfAnyChar(slice, str)) |i|
@intCast(i)
else
null,
};
}
pub fn indexOfAny16(self: []const u16, comptime str: anytype) ?OptionalUsize {
@@ -177,7 +148,7 @@ pub fn inMapCaseInsensitive(self: []const u8, comptime ComptimeStringMap: anytyp
return bun.String.ascii(self).inMapCaseInsensitive(ComptimeStringMap);
}
pub inline fn containsAny(in: anytype, target: string) bool {
pub inline fn containsAny(in: anytype, target: anytype) bool {
for (in) |str| if (contains(if (@TypeOf(str) == u8) &[1]u8{str} else bun.span(str), target)) return true;
return false;
}
@@ -496,7 +467,7 @@ pub inline fn lastIndexOf(self: string, str: string) ?usize {
return std.mem.lastIndexOf(u8, self, str);
}
pub inline fn indexOf(self: string, str: string) ?usize {
pub fn indexOf(self: string, str: string) ?usize {
if (comptime !bun.Environment.isNative) {
return std.mem.indexOf(u8, self, str);
}
@@ -990,16 +961,13 @@ pub fn endsWithAnyComptime(self: string, comptime str: string) bool {
}
}
pub fn eql(self: string, other: anytype) bool {
pub fn eql(self: string, other: []const u8) bool {
if (self.len != other.len) return false;
if (comptime @TypeOf(other) == *string) {
return eql(self, other.*);
}
for (self, 0..) |c, i| {
if (other[i] != c) return false;
}
return true;
return eqlLong(self, other, false);
}
pub fn eqlComptimeT(comptime T: type, self: []const T, comptime alt: anytype) bool {
@@ -1367,43 +1335,11 @@ pub fn copyU8IntoU16WithAlignment(comptime alignment: u21, output_: []align(alig
// }
// }
pub inline fn copyU16IntoU8(output_: []u8, comptime InputType: type, input_: InputType) void {
if (comptime Environment.allow_assert) assert(input_.len <= output_.len);
var output = output_;
var input = input_;
pub inline fn copyU16IntoU8(output: []u8, input: []align(1) const u16) void {
if (comptime Environment.allow_assert) assert(input.len <= output.len);
const count = @min(input.len, output.len);
// https://zig.godbolt.org/z/9rTn1orcY
const group = @as(usize, 16);
// end at the last group of 16 bytes
var input_ptr = input.ptr;
var output_ptr = output.ptr;
if (comptime Environment.enableSIMD) {
const end_len = (@min(input.len, output.len) & ~(group - 1));
const last_vector_ptr = input.ptr + end_len;
while (last_vector_ptr != input_ptr) {
const input_vec1: @Vector(group, u16) = input_ptr[0..group].*;
inline for (0..group) |i| {
output_ptr[i] = @as(u8, @truncate(input_vec1[i]));
}
output_ptr += group;
input_ptr += group;
}
input.len -= end_len;
output.len -= end_len;
}
const last_input_ptr = input_ptr + @min(input.len, output.len);
while (last_input_ptr != input_ptr) {
output_ptr[0] = @as(u8, @truncate(input_ptr[0]));
output_ptr += 1;
input_ptr += 1;
}
bun.highway.copyU16ToU8(input[0..count], output[0..count]);
}
const strings = @This();
@@ -2353,11 +2289,7 @@ pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf1
}
list.items.len += i;
copyU16IntoU8(
list.items[list.items.len - i ..],
Type,
to_copy,
);
copyU16IntoU8(list.items[list.items.len - i ..], to_copy);
if (comptime skip_trailing_replacement) {
if (replacement.is_lead and utf16_remaining.len == 0) {
@@ -2377,7 +2309,7 @@ pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf1
try list.ensureTotalCapacityPrecise(utf16_remaining.len + list.items.len);
const old_len = list.items.len;
list.items.len += utf16_remaining.len;
copyU16IntoU8(list.items[old_len..], Type, utf16_remaining);
copyU16IntoU8(list.items[old_len..], utf16_remaining);
}
log("UTF16 {d} -> {d} UTF8", .{ utf16.len, list.items.len });
@@ -2794,43 +2726,8 @@ pub fn replaceLatin1WithUTF8(buf_: []u8) void {
}
}
pub fn elementLengthLatin1IntoUTF8(comptime Type: type, latin1_: Type) usize {
// https://zig.godbolt.org/z/zzYexPPs9
var latin1 = latin1_;
const input_len = latin1.len;
var total_non_ascii_count: usize = 0;
// This is about 30% faster on large input compared to auto-vectorization
if (comptime Environment.enableSIMD) {
const end = latin1.ptr + (latin1.len - (latin1.len % ascii_vector_size));
while (latin1.ptr != end) {
const vec: AsciiVector = latin1[0..ascii_vector_size].*;
// Shifting a unsigned 8 bit integer to the right by 7 bits always produces a value of 0 or 1.
const cmp = vec >> @as(AsciiVector, @splat(
@as(u8, 7),
));
// Anding that value rather than converting it into a @Vector(16, u1) produces better code from LLVM.
const mask: AsciiVector = cmp & @as(AsciiVector, @splat(
@as(u8, 1),
));
total_non_ascii_count += @as(usize, @reduce(.Add, mask));
latin1 = latin1[ascii_vector_size..];
}
// an important hint to the compiler to not auto-vectorize the loop below
if (latin1.len >= ascii_vector_size) unreachable;
}
for (latin1) |c| {
total_non_ascii_count += @as(usize, @intFromBool(c > 127));
}
// each non-ascii latin1 character becomes 2 UTF8 characters
return input_len + total_non_ascii_count;
pub fn elementLengthLatin1IntoUTF8(slice: []const u8) usize {
return bun.simdutf.length.utf8.from.latin1(slice);
}
pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult {
@@ -2865,20 +2762,7 @@ pub fn elementLengthLatin1IntoUTF16(comptime Type: type, latin1_: Type) usize {
return latin1_.len;
}
var count: usize = 0;
var latin1 = latin1_;
while (latin1.len > 0) {
const function = comptime if (std.meta.Child(Type) == u8) strings.firstNonASCIIWithType else strings.firstNonASCII16;
const to_write = function(Type, latin1) orelse @as(u32, @truncate(latin1.len));
count += to_write;
latin1 = latin1[to_write..];
if (latin1.len > 0) {
count += comptime if (std.meta.Child(Type) == u8) 2 else 1;
latin1 = latin1[1..];
}
}
return count;
return bun.simdutf.length.utf16.from.latin1(latin1_);
}
pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) !Escaped(u8) {
@@ -3605,7 +3489,7 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,
while (firstNonASCII16(Type, utf16_remaining)) |i| {
const end = @min(i, remaining.len);
if (end > 0) copyU16IntoU8(remaining, Type, utf16_remaining[0..end]);
if (end > 0) copyU16IntoU8(remaining, utf16_remaining[0..end]);
remaining = remaining[end..];
utf16_remaining = utf16_remaining[end..];
@@ -3674,7 +3558,7 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,
if (remaining.len > 0 and !ended_on_non_ascii and utf16_remaining.len > 0) {
const len = @min(remaining.len, utf16_remaining.len);
copyU16IntoU8(remaining[0..len], Type, utf16_remaining[0..len]);
copyU16IntoU8(remaining[0..len], utf16_remaining[0..len]);
utf16_remaining = utf16_remaining[len..];
remaining = remaining[len..];
}
@@ -4014,44 +3898,7 @@ pub fn isAllASCII(slice: []const u8) bool {
return true;
}
if (bun.FeatureFlags.use_simdutf)
return bun.simdutf.validate.ascii(slice);
var remaining = slice;
// The NEON SIMD unit is 128-bit wide and includes 16 128-bit registers that can be used as 32 64-bit registers
if (comptime Environment.enableSIMD) {
const remaining_end_ptr = remaining.ptr + remaining.len - (remaining.len % ascii_vector_size);
while (remaining.ptr != remaining_end_ptr) : (remaining.ptr += ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
if (@reduce(.Max, vec) > 127) {
return false;
}
}
}
const Int = u64;
const size = @sizeOf(Int);
const remaining_last8 = slice.ptr + slice.len - (slice.len % size);
while (remaining.ptr != remaining_last8) : (remaining.ptr += size) {
const bytes = @as(Int, @bitCast(remaining[0..size].*));
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
return false;
}
}
const final = slice.ptr + slice.len;
while (remaining.ptr != final) : (remaining.ptr += 1) {
if (remaining[0] > 127) {
return false;
}
}
return true;
return bun.simdutf.validate.ascii(slice);
}
// #define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
@@ -4085,296 +3932,67 @@ pub inline fn u16GetSupplementary(lead: u32, trail: u32) u32 {
pub const u16_surrogate_offset = 56613888;
/// Byte offset of the first non-ASCII byte (> 127) in `slice`,
/// or null when the slice is entirely ASCII.
pub fn firstNonASCII(slice: []const u8) ?u32 {
return firstNonASCIIWithType([]const u8, slice);
}
pub fn firstNonASCIIWithType(comptime Type: type, slice: Type) ?u32 {
var remaining = slice;
if (comptime bun.FeatureFlags.use_simdutf) {
const result = bun.simdutf.validate.with_errors.ascii(slice);
if (result.status == .success) {
return null;
}
return @as(u32, @truncate(result.count));
}
if (comptime Environment.enableSIMD) {
if (remaining.len >= ascii_vector_size) {
const remaining_start = remaining.ptr;
const remaining_end = remaining.ptr + remaining.len - (remaining.len % ascii_vector_size);
while (remaining.ptr != remaining_end) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
if (@reduce(.Max, vec) > 127) {
const Int = u64;
const size = @sizeOf(Int);
remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start);
{
const bytes = @as(Int, @bitCast(remaining[0..size].*));
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(mask) / 8;
if (comptime Environment.isDebug) {
bun.assert(remaining[first_set_byte] > 127);
for (0..first_set_byte) |j| {
bun.assert(remaining[j] <= 127);
}
}
return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len));
}
remaining = remaining[size..];
}
{
const bytes = @as(Int, @bitCast(remaining[0..size].*));
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
const first_set_byte = @ctz(mask) / 8;
if (comptime Environment.isDebug) {
bun.assert(remaining[first_set_byte] > 127);
for (0..first_set_byte) |j| {
bun.assert(remaining[j] <= 127);
}
}
return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len));
}
}
unreachable;
}
// the more intuitive way, using slices, produces worse codegen
// specifically: it subtracts the length at the end of the loop
// we don't need to do that
// we only need to subtract the length once at the very end
remaining.ptr += ascii_vector_size;
}
remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start);
}
}
{
const Int = u64;
const size = @sizeOf(Int);
const remaining_start = remaining.ptr;
const remaining_end = remaining.ptr + remaining.len - (remaining.len % size);
if (comptime Environment.enableSIMD) {
// these assertions exist more so for LLVM
bun.unsafeAssert(remaining.len < ascii_vector_size);
bun.unsafeAssert(@intFromPtr(remaining.ptr + ascii_vector_size) > @intFromPtr(remaining_end));
}
if (remaining.len >= size) {
while (remaining.ptr != remaining_end) {
const bytes = @as(Int, @bitCast(remaining[0..size].*));
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start);
const first_set_byte = @ctz(mask) / 8;
if (comptime Environment.isDebug) {
bun.unsafeAssert(remaining[first_set_byte] > 127);
for (0..first_set_byte) |j| {
bun.unsafeAssert(remaining[j] <= 127);
}
}
return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len));
}
remaining.ptr += size;
}
remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start);
}
}
if (comptime Environment.allow_assert) assert(remaining.len < 8);
for (remaining) |*char| {
if (char.* > 127) {
// try to prevent it from reading the length of the slice
return @as(u32, @truncate(@intFromPtr(char) - @intFromPtr(slice.ptr)));
}
}
return null;
}
pub fn indexOfNewlineOrNonASCIIOrANSI(slice_: []const u8, offset: u32) ?u32 {
const slice = slice_[offset..];
var remaining = slice;
if (remaining.len == 0)
const result = bun.simdutf.validate.with_errors.ascii(slice);
if (result.status == .success) {
return null;
if (comptime Environment.enableSIMD) {
while (remaining.len >= ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
const cmp = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\x1b')))));
if (@reduce(.Max, cmp) > 0) {
const bitmask = @as(AsciiVectorInt, @bitCast(cmp));
const first = @ctz(bitmask);
return @as(u32, first) + @as(u32, @intCast(slice.len - remaining.len)) + offset;
}
remaining = remaining[ascii_vector_size..];
}
if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size);
}
for (remaining) |*char_| {
const char = char_.*;
if (char > 127 or char < 0x20 or char == '\n' or char == '\r' or char == '\x1b') {
return @as(u32, @truncate((@intFromPtr(char_) - @intFromPtr(slice.ptr)))) + offset;
}
}
return null;
return @as(u32, @truncate(result.count));
}
pub const indexOfNewlineOrNonASCIIOrANSI = indexOfNewlineOrNonASCII;
/// Returns the index (relative to the start of `slice_`) of the first byte at
/// or after `offset` that is < 0x20 or > 127; null when no such byte exists.
pub fn indexOfNewlineOrNonASCII(slice_: []const u8, offset: u32) ?u32 {
// check_start=true: the byte at `offset` itself is also examined.
return indexOfNewlineOrNonASCIICheckStart(slice_, offset, true);
}
/// Index (relative to the start of `slice_`) of the first byte at or after
/// `offset` that is non-ASCII (> 127), a control byte other than tab (0x09),
/// or a space/newline as matched by the Highway kernel; null when none exists.
pub fn indexOfSpaceOrNewlineOrNonASCII(slice_: []const u8, offset: u32) ?u32 {
const slice = slice_[offset..];
const remaining = slice;
if (remaining.len == 0)
return null;
// Check the very first byte scalar-side before dispatching to SIMD.
if (remaining[0] > 127 or (remaining[0] < 0x20 and remaining[0] != 0x09)) {
return offset;
}
// `i` is relative to `remaining`; translate back into the caller's frame.
const i = bun.highway.indexOfSpaceOrNewlineOrNonASCII(remaining) orelse return null;
return @as(u32, @truncate(i)) + offset;
}
pub fn indexOfNewlineOrNonASCIICheckStart(slice_: []const u8, offset: u32, comptime check_start: bool) ?u32 {
const slice = slice_[offset..];
var remaining = slice;
const remaining = slice;
if (remaining.len == 0)
return null;
if (comptime check_start) {
// this shows up in profiling
if (remaining[0] > 127 or remaining[0] < 0x20 or remaining[0] == '\r' or remaining[0] == '\n') {
if (remaining[0] > 127 or (remaining[0] < 0x20 and remaining[0] != 0x09)) {
return offset;
}
}
if (comptime Environment.enableSIMD) {
while (remaining.len >= ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
const cmp = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n')))));
if (@reduce(.Max, cmp) > 0) {
const bitmask = @as(AsciiVectorInt, @bitCast(cmp));
const first = @ctz(bitmask);
return @as(u32, first) + @as(u32, @intCast(slice.len - remaining.len)) + offset;
}
remaining = remaining[ascii_vector_size..];
}
if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size);
}
for (remaining) |*char_| {
const char = char_.*;
if (char > 127 or char < 0x20 or char == '\n' or char == '\r') {
return @as(u32, @truncate((@intFromPtr(char_) - @intFromPtr(slice.ptr)))) + offset;
}
}
return null;
const i = bun.highway.indexOfNewlineOrNonASCII(remaining) orelse return null;
return @as(u32, @truncate(i)) + offset;
}
pub fn containsNewlineOrNonASCIIOrQuote(slice_: []const u8) bool {
const slice = slice_;
var remaining = slice;
if (remaining.len == 0)
return false;
if (comptime Environment.enableSIMD) {
while (remaining.len >= ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
const cmp = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '"')))));
if (@reduce(.Max, cmp) > 0) {
return true;
}
remaining = remaining[ascii_vector_size..];
}
if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size);
}
for (remaining) |*char_| {
const char = char_.*;
if (char > 127 or char < 0x20 or char == '\n' or char == '\r' or char == '"') {
return true;
}
}
return false;
/// True if `text` contains a non-ASCII byte (> 127), a control byte
/// (< 0x20, including \r and \n), or a double quote `"`.
pub fn containsNewlineOrNonASCIIOrQuote(text: []const u8) bool {
// Vectorized scan via the Highway bindings.
return bun.highway.containsNewlineOrNonASCIIOrQuote(text);
}
/// JSON escape
pub fn indexOfNeedsEscape(slice: []const u8, comptime quote_char: u8) ?u32 {
var remaining = slice;
if (remaining.len == 0)
/// Supports:
/// - `"`
/// - `'`
/// - "`"
pub fn indexOfNeedsEscapeForJavaScriptString(slice: []const u8, quote_char: u8) ?u32 {
if (slice.len == 0)
return null;
if (remaining[0] >= 127 or remaining[0] < 0x20 or remaining[0] == '\\' or remaining[0] == quote_char or (quote_char == '`' and remaining[0] == '$')) {
return 0;
}
if (comptime Environment.enableSIMD) {
while (remaining.len >= ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
const cmp: AsciiVectorU1 = if (comptime quote_char == '`') ( //
@as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) |
@as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\\'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, quote_char))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '$'))))) //
) else ( //
@as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) |
@as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\\'))))) |
@as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, quote_char))))) //
);
if (@reduce(.Max, cmp) > 0) {
const bitmask = @as(AsciiVectorInt, @bitCast(cmp));
const first = @ctz(bitmask);
return @as(u32, first) + @as(u32, @truncate(@intFromPtr(remaining.ptr) - @intFromPtr(slice.ptr)));
}
remaining = remaining[ascii_vector_size..];
}
}
for (remaining) |*char_| {
const char = char_.*;
if (char > 127 or char < 0x20 or char == '\\' or char == quote_char or (quote_char == '`' and char == '$')) {
return @as(u32, @truncate(@intFromPtr(char_) - @intFromPtr(slice.ptr)));
}
}
return null;
return bun.highway.indexOfNeedsEscapeForJavaScriptString(slice, quote_char);
}
pub fn indexOfNeedsURLEncode(slice: []const u8) ?u32 {
@@ -4447,15 +4065,7 @@ pub fn indexOfNeedsURLEncode(slice: []const u8) ?u32 {
}
pub fn indexOfCharZ(sliceZ: [:0]const u8, char: u8) ?u63 {
const ptr = bun.C.strchr(sliceZ.ptr, char) orelse return null;
const pos = @intFromPtr(ptr) - @intFromPtr(sliceZ.ptr);
if (comptime Environment.isDebug)
bun.assert(@intFromPtr(sliceZ.ptr) <= @intFromPtr(ptr) and
@intFromPtr(ptr) < @intFromPtr(sliceZ.ptr + sliceZ.len) and
pos <= sliceZ.len);
return @as(u63, @truncate(pos));
return @truncate(bun.highway.indexOfChar(sliceZ, char) orelse return null);
}
pub fn indexOfChar(slice: []const u8, char: u8) ?u32 {
@@ -4463,19 +4073,11 @@ pub fn indexOfChar(slice: []const u8, char: u8) ?u32 {
}
pub fn indexOfCharUsize(slice: []const u8, char: u8) ?usize {
if (slice.len == 0)
return null;
if (comptime !Environment.isNative) {
return std.mem.indexOfScalar(u8, slice, char);
}
const ptr = bun.C.memchr(slice.ptr, char, slice.len) orelse return null;
const i = @intFromPtr(ptr) - @intFromPtr(slice.ptr);
bun.assert(i < slice.len);
bun.assert(slice[i] == char);
return i;
return bun.highway.indexOfChar(slice, char);
}
pub fn indexOfCharPos(slice: []const u8, char: u8, start_index: usize) ?usize {
@@ -4485,13 +4087,9 @@ pub fn indexOfCharPos(slice: []const u8, char: u8, start_index: usize) ?usize {
if (start_index >= slice.len) return null;
const ptr = bun.C.memchr(slice.ptr + start_index, char, slice.len - start_index) orelse
return null;
const i = @intFromPtr(ptr) - @intFromPtr(slice.ptr);
bun.assert(i < slice.len);
bun.assert(slice[i] == char);
return i;
const result = bun.highway.indexOfChar(slice[start_index..], char) orelse return null;
bun.debugAssert(slice.len > result + start_index);
return result + start_index;
}
pub fn indexOfAnyPosComptime(slice: []const u8, comptime chars: []const u8, start_index: usize) ?usize {
@@ -4934,47 +4532,6 @@ pub fn firstNonASCII16(comptime Slice: type, slice: Slice) ?u32 {
return null;
}
/// Fast path for printing template literal strings
pub fn @"nextUTF16NonASCIIOr$`\\"(
comptime Slice: type,
slice: Slice,
) ?u32 {
var remaining = slice;
if (comptime Environment.enableSIMD and Environment.isNative) {
while (remaining.len >= ascii_u16_vector_size) {
const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*;
const cmp = @as(AsciiVectorU16U1, @bitCast((vec > max_u16_ascii))) |
@as(AsciiVectorU16U1, @bitCast((vec < min_u16_ascii))) |
@as(AsciiVectorU16U1, @bitCast((vec == @as(AsciiU16Vector, @splat(@as(u16, '$')))))) |
@as(AsciiVectorU16U1, @bitCast((vec == @as(AsciiU16Vector, @splat(@as(u16, '`')))))) |
@as(AsciiVectorU16U1, @bitCast((vec == @as(AsciiU16Vector, @splat(@as(u16, '\\'))))));
const bitmask = @as(u8, @bitCast(cmp));
const first = @ctz(bitmask);
if (first < ascii_u16_vector_size) {
return @as(u32, @intCast(@as(u32, first) +
@as(u32, @intCast(slice.len - remaining.len))));
}
remaining = remaining[ascii_u16_vector_size..];
}
}
for (remaining, 0..) |char, i| {
switch (char) {
'$', '`', '\\', 0...0x20 - 1, 128...std.math.maxInt(u16) => {
return @as(u32, @truncate(i + (slice.len - remaining.len)));
},
else => {},
}
}
return null;
}
/// Convert potentially ill-formed UTF-8 or UTF-16 bytes to a Unicode Codepoint.
/// - Invalid codepoints are replaced with `zero` parameter
/// - Null bytes return 0
@@ -5097,31 +4654,6 @@ pub fn lengthOfLeadingWhitespaceASCII(slice: string) usize {
return slice.len;
}
pub fn containsNonBmpCodePointUTF16(_text: []const u16) bool {
const n = _text.len;
if (n > 0) {
var i: usize = 0;
const text = _text[0 .. n - 1];
while (i < n - 1) : (i += 1) {
switch (text[i]) {
// Check for a high surrogate
0xD800...0xDBFF => {
// Check for a low surrogate
switch (text[i + 1]) {
0xDC00...0xDFFF => {
return true;
},
else => {},
}
},
else => {},
}
}
}
return false;
}
pub fn join(slices: []const string, delimiter: string, allocator: std.mem.Allocator) !string {
return try std.mem.join(allocator, delimiter, slices);
}
@@ -5238,6 +4770,75 @@ pub fn NewCodePointIterator(comptime CodePointType_: type, comptime zeroValue: c
return Iterator{ .bytes = str, .i = i, .c = zeroValue };
}
/// Outcome of a forward scan (see `skip`).
const SkipResult = enum {
// Reached end of input before any candidate could be evaluated.
// NOTE(review): `skip` below only ever returns .found/.not_found;
// confirm whether other callers produce .eof or it can be removed.
eof,
// A code point satisfying the scalar predicate was found.
found,
// Scanned to the end of input without a match.
not_found,
};
/// Advance forward until the scalar function returns true.
/// The simd function is "best effort" and expected to sometimes return a result which `scalar` will return false for.
/// This is because we don't decode UTF-8 in the SIMD code path.
/// Correctness requires the SIMD candidate byte set to be a superset of the
/// scalar match set; the early `.not_found` below relies on that.
pub fn skip(it: *const Iterator, cursor: *Cursor, simd: *const fn (input: []const u8) ?usize, scalar: *const fn (CodePointType) bool) SkipResult {
while (true) {
// 1. Get current position. Check for EOF.
const current_byte_index = cursor.i;
if (current_byte_index >= it.bytes.len) {
return .not_found; // Reached end without finding
}
// 2. Decode the *next* character using the standard iterator method.
if (!next(it, cursor)) {
return .not_found; // Reached end or error during decode
}
// 3. Check if the character just decoded matches the scalar condition.
if (scalar(it.c)) {
return .found; // Found it!
}
// 4. Optimization: Can we skip ahead using SIMD?
// Scan starting from the byte *after* the character we just decoded.
const next_scan_start_index = cursor.i;
if (next_scan_start_index >= it.bytes.len) {
// Just decoded the last character and it didn't match.
return .not_found;
}
const remaining_slice = it.bytes[next_scan_start_index..];
if (remaining_slice.len == 0) {
return .not_found;
}
// Ask SIMD for the next potential candidate.
if (simd(remaining_slice)) |pos| {
// SIMD found a potential candidate `pos` bytes ahead.
if (pos > 0) {
// Jump the byte index to the start of the potential candidate.
cursor.i = next_scan_start_index + @as(u32, @intCast(pos));
// Reset width so next() decodes correctly from the jumped position.
cursor.width = 0;
// Loop will continue, starting the decode from the new cursor.i.
continue;
}
// If pos == 0, SIMD suggests the *immediate next* character.
// No jump needed, just let the loop iterate naturally.
// Fallthrough to the end of the loop.
} else {
// SIMD found no potential candidates in the rest of the string.
// Since the SIMD search set is a superset of the scalar check set,
// we can guarantee that no character satisfying `scalar` exists further.
// Since the current character (decoded in step 2) also didn't match,
// we can conclude the target character is not found.
return .not_found;
}
// If we reach here, it means SIMD returned pos=0.
// Loop continues to the next iteration, processing the immediate next char.
} // End while true
unreachable;
}
pub inline fn next(it: *const Iterator, cursor: *Cursor) bool {
const pos: u32 = @as(u32, cursor.width) + cursor.i;
if (pos >= it.bytes.len) {
@@ -5527,6 +5128,16 @@ pub fn leftHasAnyInRight(to_check: []const string, against: []const string) bool
return false;
}
/// Returns true if the input has the prefix and the next character is not an identifier character
/// Also returns true if the input ends with the prefix (i.e. EOF)
///
/// Example:
/// ```zig
/// // returns true
/// hasPrefixWithWordBoundary("console.log", "console") // true
/// hasPrefixWithWordBoundary("console.log", "log") // false
/// hasPrefixWithWordBoundary("console.log", "console.log") // true
/// ```
pub fn hasPrefixWithWordBoundary(input: []const u8, comptime prefix: []const u8) bool {
if (hasPrefixComptime(input, prefix)) {
if (input.len == prefix.len) return true;
@@ -5708,7 +5319,6 @@ pub fn mustEscapeYAMLString(contents: []const u8) bool {
else => true,
};
}
/// True if `path` contains a `node_modules` directory component, i.e. the
/// literal "node_modules" delimited by the platform path separator on both
/// sides. A path merely *ending* in "node_modules" (no trailing separator)
/// does not match.
pub fn pathContainsNodeModulesFolder(path: []const u8) bool {
return strings.contains(path, comptime std.fs.path.sep_str ++ "node_modules" ++ std.fs.path.sep_str);
}

View File

@@ -0,0 +1,353 @@
import { describe } from "bun:test";
import { itBundled } from "./expectBundled";
// Bundler regression tests for single-line (`//`) comment handling: newline
// flavors, non-ASCII / emoji / invalid-surrogate payloads, sourceMappingURL
// pragmas, and `#__PURE__` annotations carried in single-line comments.
// Fix: dropped an unused `api.readFile("/out.js")` local in "unix newlines".
describe("single-line comments", () => {
  itBundled("unix newlines", {
    files: {
      "/entry.js": `// This is a comment\nconsole.log("hello");\n// Another comment\n`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("windows newlines", {
    files: {
      "/entry.js": `// This is a comment\r\nconsole.log("hello");\r\n// Another comment\r\n`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("no trailing newline", {
    files: {
      "/entry.js": `// This is a comment\nconsole.log("hello");\n// No newline at end`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  // Comment bytes outside ASCII must not confuse the lexer's fast path.
  itBundled("non-ascii characters", {
    files: {
      "/entry.js": `// 你好,世界\n// Привет, мир\n// こんにちは世界\nconsole.log("hello");\n`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("emoji", {
    files: {
      "/entry.js": `// 🚀 🔥 💯\nconsole.log("hello");\n`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  // Lone surrogates are invalid UTF-16 but must still be tolerated in comments.
  itBundled("invalid surrogate pair at beginning", {
    files: {
      "/entry.js": `// \uDC00 invalid surrogate\nconsole.log("hello");\n`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("invalid surrogate pair at end", {
    files: {
      "/entry.js": `// invalid surrogate \uD800\nconsole.log("hello");\n`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("invalid surrogate pair in middle", {
    files: {
      "/entry.js": `// invalid \uD800\uDC00\uD800 surrogate\nconsole.log("hello");\n`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("multiple comments on same line", {
    files: {
      "/entry.js": `const x = 5; // first comment // second comment\nconsole.log(x);\n`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("console.log(x)");
    },
  });
  itBundled("comment with ASI", {
    files: {
      "/entry.js": `const x = 5// first comment // second comment\nconsole.log(x)`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("console.log(x)");
    },
  });
  itBundled("comment at end of file without newline", {
    files: {
      "/entry.js": `console.log("hello"); //`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("empty comments", {
    files: {
      "/entry.js": `//\n//\nconsole.log("hello");\n//`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("comments with special characters", {
    files: {
      "/entry.js": `// Comment with \\ backslash\n// Comment with \" quote\n// Comment with \t tab\nconsole.log("hello");\n`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("comments with control characters", {
    files: {
      "/entry.js": `// Comment with \u0000 NULL\n// Comment with \u0001 SOH\nconsole.log("hello");\n`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("comments with minification", {
    files: {
      "/entry.js": `// This should be removed\nconsole.log("hello");\n// This too`,
    },
    minifyWhitespace: true,
    minifySyntax: true,
    onAfterBundle(api) {
      api.expectFile("/out.js").toEqualIgnoringWhitespace('console.log("hello");');
    },
  });
  // Empty / comment-only entry points, with and without minification.
  for (const minify of [true, false]) {
    itBundled(
      `some code and an empty comment without newline preceding ${minify ? "with minification" : "without minification"}`,
      {
        files: {
          "/entry.js": `console.log("hello");//`,
        },
        minifyWhitespace: minify,
        minifySyntax: minify,
        run: {
          stdout: "hello",
        },
      },
    );
    itBundled(`some code and then only an empty comment ${minify ? "with minification" : "without minification"}`, {
      files: {
        "/entry.js": `console.log("hello");\n//`,
      },
      minifyWhitespace: minify,
      minifySyntax: minify,
      run: {
        stdout: "",
      },
    });
    itBundled(`only an empty comment ${minify ? "with minification" : "without minification"}`, {
      files: {
        "/entry.js": `//`,
      },
      minifyWhitespace: minify,
      minifySyntax: minify,
      run: {
        stdout: "",
      },
    });
    // NOTE(review): registered twice (once per loop iteration) under the same
    // name; the title probably meant to interpolate `minify` — confirm.
    itBundled("only a comment", {
      files: {
        "/entry.js": `// This is a comment`,
      },
      minifyWhitespace: true,
      minifySyntax: true,
      run: {
        stdout: "",
      },
    });
  }
  // sourceMappingURL pragmas (with `=` / `==` base64 padding) must survive in
  // any position relative to code and other comments.
  itBundled("trailing //# sourceMappingURL=", {
    files: {
      "/entry.js": `// This is a comment\nconsole.log("hello");\n//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZXhhbXBsZS5qcyIsInNvdXJjZSI6Ii8vZXhhbXBsZS5qcyJ9`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("trailing //# sourceMappingURL= with == at end", {
    files: {
      "/entry.js": `// This is a comment\nconsole.log("hello");\n//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZXhhbXBsZS5qcyIsInNvdXJjZSI6Ii8vZXhhbXBsZS5qcyJ9==`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("trailing //# sourceMappingURL= with = at end", {
    files: {
      "/entry.js": `// This is a comment\nconsole.log("hello");\n//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZXhhbXBsZS5qcyIsInNvdXJjZSI6Ii8vZXhhbXBsZS5qcyJ9=`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("leading //# sourceMappingURL= with = at end", {
    files: {
      "/entry.js": `//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZXhhbXBsZS5qcyIsInNvdXJjZSI6Ii8vZXhhbXBsZS5qcyJ9=\n// This is a comment\nconsole.log("hello");`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("leading trailing newline //# sourceMappingURL= with = at end", {
    files: {
      "/entry.js": `//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZXhhbXBsZS5qcyIsInNvdXJjZSI6Ii8vZXhhbXBsZS5qcyJ9=\n// This is a comment\nconsole.log("hello");\n`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("leading newline and sourcemap, trailing newline //# sourceMappingURL= with = at end", {
    files: {
      "/entry.js": `\n//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZXhhbXBsZS5qcyIsInNvdXJjZSI6Ii8vZXhhbXBsZS5qcyJ9=\n// This is a comment\nconsole.log("hello");\n`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  // A `#__PURE__` single-line comment marks the following call as
  // side-effect-free, so the unused console.log is dropped from the bundle.
  itBundled("__PURE__ comment in single-line comment basic", {
    files: {
      "/entry.js": `//#__PURE__\nconsole.log("hello");`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").not.toContain("hello");
    },
  });
  itBundled("__PURE__ comment in single-line comment with spaces", {
    files: {
      "/entry.js": `// #__PURE__ \nconsole.log("hello");`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").not.toContain("hello");
    },
  });
  itBundled("__PURE__ comment in single-line comment with text before", {
    files: {
      "/entry.js": `// some text #__PURE__\nconsole.log("hello");`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").not.toContain("hello");
    },
  });
  itBundled("__PURE__ comment in single-line comment with text after", {
    files: {
      "/entry.js": `// #__PURE__ some text\nconsole.log("hello");`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").not.toContain("hello");
    },
  });
  itBundled("__PURE__ comment in single-line comment with unicode characters", {
    files: {
      "/entry.js": `// 你好 #__PURE__ 世界\nconsole.log("hello");`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").not.toContain("hello");
    },
  });
  itBundled("__PURE__ comment in single-line comment with emoji", {
    files: {
      "/entry.js": `// 🚀 #__PURE__ 🔥\nconsole.log("hello");`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").not.toContain("hello");
    },
  });
  itBundled("__PURE__ comment in single-line comment with invalid surrogate pair", {
    files: {
      "/entry.js": `// \uD800 #__PURE__ \uDC00\nconsole.log("hello");`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").not.toContain("hello");
    },
  });
  itBundled("multiple __PURE__ comments in single-line comments", {
    files: {
      "/entry.js": `//#__PURE__\nconsole.log("hello");\n//#__PURE__\nconsole.log("world");`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").not.toContain("hello");
      api.expectFile("/out.js").not.toContain("world");
    },
  });
  itBundled("__PURE__ comment in single-line comment with minification", {
    files: {
      "/entry.js": `//#__PURE__\nconsole.log("hello");`,
    },
    minifyWhitespace: true,
    minifySyntax: true,
    onAfterBundle(api) {
      api.expectFile("/out.js").not.toContain("hello");
    },
  });
  itBundled("__PURE__ comment in single-line comment with windows newlines", {
    files: {
      "/entry.js": `//#__PURE__\r\nconsole.log("hello");`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").not.toContain("hello");
    },
  });
  // At EOF the annotation has no following call, so nothing is dropped.
  itBundled("__PURE__ comment in single-line comment at end of file", {
    files: {
      "/entry.js": `console.log("hello");\n//#__PURE__`,
    },
    onAfterBundle(api) {
      api.expectFile("/out.js").toContain("hello");
    },
  });
  itBundled("__PURE__ comment in single-line comment in middle of a statement", {
    files: {
      "/entry.js": `console.log(//#__PURE__\n123);`,
    },
    run: {
      stdout: "123",
    },
  });
});

View File

@@ -204,7 +204,6 @@ describe("bundler", () => {
`,
});
itBundledDevAndProd("jsx/Classic", {
todo: true,
files: {
"/index.jsx": /* js*/ `
import { print } from 'bun-test-helpers'
@@ -226,7 +225,6 @@ describe("bundler", () => {
},
});
itBundledDevAndProd("jsx/ClassicPragma", {
todo: true,
files: {
"/index.jsx": /* js*/ `
// @jsx fn
@@ -298,7 +296,6 @@ describe("bundler", () => {
`,
});
itBundledDevAndProd("jsx/Factory", {
todo: true,
files: {
"/index.jsx": /* js*/ `
const h = () => 'hello'
@@ -322,7 +319,6 @@ describe("bundler", () => {
},
});
itBundledDevAndProd("jsx/FactoryImport", {
todo: false,
files: {
"/index.jsx": /* js*/ `
import { h, fragment } from './jsx.ts';
@@ -353,7 +349,6 @@ describe("bundler", () => {
},
});
itBundledDevAndProd("jsx/FactoryImportExplicitReactDefault", {
todo: false,
files: {
"/index.jsx": /* js*/ `
import { print } from 'bun-test-helpers'
@@ -374,7 +369,6 @@ describe("bundler", () => {
},
});
itBundledDevAndProd("jsx/FactoryImportExplicitReactDefaultExternal", {
todo: false,
files: {
"/index.jsx": /* js*/ `
import { print } from 'bun-test-helpers'
@@ -397,4 +391,24 @@ describe("bundler", () => {
expect(file).toContain('import * as React from "react"');
},
});
// Verifies the inline `// @jsxImportSource hello` pragma: the JSX element is
// lowered through the named package's jsx-dev-runtime (jsxDEV), as shown by
// the hello_jsxDEV symbol in the expected stdout, instead of react's runtime.
itBundled("jsx/jsxImportSource pragma works", {
files: {
"/index.jsx": /* jsx */ `
// @jsxImportSource hello
console.log(<div>Hello World</div>);
`,
"/node_modules/hello/jsx-dev-runtime.js": /* js */ `
export function jsxDEV(type, props, key) {
return {
$$typeof: Symbol("hello_jsxDEV"), type, props, key
}
}
`,
},
outdir: "/out",
target: "browser",
run: {
stdout: `{\n $$typeof: Symbol(hello_jsxDEV),\n type: \"div\",\n props: {\n children: \"Hello World\",\n },\n key: undefined,\n}`,
},
});
});

View File

@@ -12,7 +12,7 @@ const words: Record<string, { reason: string; limit?: number; regex?: boolean }>
"std.debug.assert": { reason: "Use bun.assert instead", limit: 26 },
"std.debug.dumpStackTrace": { reason: "Use bun.handleErrorReturnTrace or bun.crash_handler.dumpStackTrace instead" },
"std.debug.print": { reason: "Don't let this be committed", limit: 0 },
"std.mem.indexOfAny(u8": { reason: "Use bun.strings.indexOfAny", limit: 3 },
"std.mem.indexOfAny(u8": { reason: "Use bun.strings.indexOfAny", limit: 2 },
"std.StringArrayHashMapUnmanaged(": { reason: "bun.StringArrayHashMapUnmanaged has a faster `eql`", limit: 12 },
"std.StringArrayHashMap(": { reason: "bun.StringArrayHashMap has a faster `eql`", limit: 1 },
"std.StringHashMapUnmanaged(": { reason: "bun.StringHashMapUnmanaged has a faster `eql`" },