diff --git a/cmake/targets/BuildBun.cmake b/cmake/targets/BuildBun.cmake index 72f671976d..2ee427e8ca 100644 --- a/cmake/targets/BuildBun.cmake +++ b/cmake/targets/BuildBun.cmake @@ -1089,6 +1089,7 @@ set(BUN_DEPENDENCIES BoringSSL Brotli Cares + Highway LibDeflate LolHtml Lshpack diff --git a/cmake/targets/BuildHighway.cmake b/cmake/targets/BuildHighway.cmake new file mode 100644 index 0000000000..5f8664486d --- /dev/null +++ b/cmake/targets/BuildHighway.cmake @@ -0,0 +1,33 @@ +register_repository( + NAME + highway + REPOSITORY + google/highway + COMMIT + 12b325bc1793dee68ab2157995a690db859fe9e0 +) + +set(HIGHWAY_CMAKE_ARGS + # Build a static library + -DBUILD_SHARED_LIBS=OFF + # Enable position-independent code for linking into the main executable + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + # Disable unnecessary components + -DHWY_ENABLE_TESTS=OFF + -DHWY_ENABLE_EXAMPLES=OFF + -DHWY_ENABLE_CONTRIB=OFF + # Disable building of the install target + -DHWY_ENABLE_INSTALL=OFF +) + +register_cmake_command( + TARGET + highway + LIBRARIES + hwy + ARGS + ${HIGHWAY_CMAKE_ARGS} + INCLUDES + . + hwy +) \ No newline at end of file diff --git a/src/bun.js/bindings/bun-simdutf.cpp b/src/bun.js/bindings/bun-simdutf.cpp index 6aba5caa74..9faa3d8b84 100644 --- a/src/bun.js/bindings/bun-simdutf.cpp +++ b/src/bun.js/bindings/bun-simdutf.cpp @@ -367,4 +367,10 @@ SIMDUTFResult simdutf__base64_decode_from_binary16(const char16_t* input, size_t return { .error = res.error, .count = res.count }; } + +size_t simdutf__utf16_length_from_latin1(const char* input, size_t length) +{ + UNUSED_PARAM(input); + return simdutf::utf16_length_from_latin1(length); +} } diff --git a/src/bun.js/bindings/bun-simdutf.zig b/src/bun.js/bindings/bun-simdutf.zig index 006bfb4a46..3316e6780d 100644 --- a/src/bun.js/bindings/bun-simdutf.zig +++ b/src/bun.js/bindings/bun-simdutf.zig @@ -95,6 +95,7 @@ pub extern fn simdutf__utf8_length_from_utf32(input: [*c]const c_uint, length: u pub extern fn simdutf__utf16_length_from_utf32(input: [*c]const c_uint, length: usize) usize; pub extern fn simdutf__utf32_length_from_utf8(input: [*]const u8, length: usize) usize; pub extern fn simdutf__utf8_length_from_latin1(input: [*]const u8, length: usize) usize; +pub extern fn simdutf__utf16_length_from_latin1(input: [*]const u8, length: usize) usize; pub const validate = struct { pub const with_errors = struct { @@ -295,6 +296,10 @@ pub const length = struct { JSC.markBinding(@src()); return simdutf__utf16_length_from_utf32(input.ptr, input.len); } + + pub fn latin1(input: []const u8) usize { + return simdutf__utf16_length_from_latin1(input.ptr, input.len); + } }; }; diff --git a/src/bun.js/bindings/highway_strings.cpp b/src/bun.js/bindings/highway_strings.cpp new file mode 100644 index 0000000000..cc5b76a4d6 --- /dev/null +++ b/src/bun.js/bindings/highway_strings.cpp @@ -0,0 +1,790 @@ +// Must be first +#include "root.h" +#undef HWY_TARGET_INCLUDE +// Correct path to this file relative to the build root (CMakeLists.txt) +#define HWY_TARGET_INCLUDE "highway_strings.cpp" +#include // Must come before highway.h + +// Now include Highway and other headers +#include +#include + +#include + +#include // For memcmp +#include // For std::min, std::max +#include +#include + +// Wrap the SIMD implementations in the Highway namespaces +HWY_BEFORE_NAMESPACE(); +namespace bun { +namespace HWY_NAMESPACE { + +namespace hn = hwy::HWY_NAMESPACE; // Alias for convenience + +// Type alias for SIMD vector tag +using D8 = hn::ScalableTag; + +size_t IndexOfCharImpl(const 
uint8_t* HWY_RESTRICT haystack, size_t haystack_len, + uint8_t needle) +{ + D8 d; + // Use the Find function from find-inl.h which handles both vectorized and scalar cases + const size_t pos = hn::Find(d, needle, haystack, haystack_len); + + // Convert to int64_t and return -1 if not found + return (pos < haystack_len) ? pos : haystack_len; +} + +// --- Implementation Details --- + +size_t IndexOfAnyCharImpl(const uint8_t* HWY_RESTRICT text, size_t text_len, const uint8_t* HWY_RESTRICT chars, size_t chars_len) +{ + if (text_len == 0) return 0; + D8 d; + const size_t N = hn::Lanes(d); + + if (chars_len == 1) { + ASSERT_NOT_REACHED_WITH_MESSAGE("chars_len == 1"); + } else if (chars_len == 2) { + // 2 character implemenation + // covers the most common case: + // + // - { '\r', '\n' } + // - { '\\', '/' } + // - { ' ', '\t' } + // + const auto vec_char1 = hn::Set(d, chars[0]); + const auto vec_char2 = hn::Set(d, chars[1]); + + size_t i = 0; + const size_t simd_text_len = text_len - (text_len % N); + for (; i < simd_text_len; i += N) { + const auto text_vec = hn::LoadN(d, text + i, N); + const auto found_mask = hn::Or(hn::Eq(text_vec, vec_char2), hn::Eq(text_vec, vec_char1)); + + const intptr_t pos = hn::FindFirstTrue(d, found_mask); + if (pos >= 0) { + return i + pos; + } + } + + for (; i < text_len; ++i) { + const uint8_t text_char = text[i]; + if (text_char == chars[0] || text_char == chars[1]) { + return i; + } + } + + return text_len; + } else { + ASSERT(chars_len <= 16); + constexpr size_t kMaxPreloadedChars = 16; + hn::Vec char_vecs[kMaxPreloadedChars]; + const size_t num_chars_to_preload = std::min(chars_len, kMaxPreloadedChars); + for (size_t c = 0; c < num_chars_to_preload; ++c) { + char_vecs[c] = hn::Set(d, chars[c]); + } + + const size_t simd_text_len = text_len - (text_len % N); + size_t i = 0; + + for (; i < simd_text_len; i += N) { + const auto text_vec = hn::LoadN(d, text + i, N); + auto found_mask = hn::MaskFalse(d); + + for (size_t c = 0; c < num_chars_to_preload; ++c) { + found_mask = hn::Or(found_mask, hn::Eq(text_vec, char_vecs[c])); + } + if (chars_len > num_chars_to_preload) { + for (size_t c = num_chars_to_preload; c < chars_len; ++c) { + found_mask = hn::Or(found_mask, hn::Eq(text_vec, hn::Set(d, chars[c]))); + } + } + + const intptr_t pos = hn::FindFirstTrue(d, found_mask); + if (pos >= 0) { + return i + pos; + } + } + + for (; i < text_len; ++i) { + const uint8_t text_char = text[i]; + for (size_t c = 0; c < chars_len; ++c) { + if (text_char == chars[c]) { + return i; + } + } + } + } + + return text_len; +} + +void CopyU16ToU8Impl(const uint16_t* HWY_RESTRICT input, size_t count, + uint8_t* HWY_RESTRICT output) +{ + // Tag for the output vector type (u8) + const hn::ScalableTag d8; + // Tag for the input vector type (u16). OrderedTruncate2To takes two u16 vectors + // (each N/2 lanes) to produce one u8 vector (N lanes). + // Repartition gives a u16 tag with N/2 lanes. + const hn::Repartition d16; + + const size_t N8 = hn::Lanes(d8); // Number of u8 lanes processed per iteration + const size_t N16 = hn::Lanes(d16); // Number of u16 lanes per input vector load + + // Sanity check: we should load 2*N16 u16 elements to produce N8 u8 elements. + // Since sizeof(u16) == 2 * sizeof(u8), N16 should be N8 / 2. + // static_assert(N16 * 2 == N8, "Lane configuration mismatch"); // Highway ensures this + + size_t i = 0; + const size_t simd_count = count - (count % N8); + // Process N8 elements (u8 output size) per iteration. 
This corresponds to + // loading N8 u16 input elements (2 vectors of N16 lanes each). + for (; i < simd_count; i += N8) { + // Load two input vectors of u16 + const auto in1 = hn::LoadU(d16, input + i); + const auto in2 = hn::LoadU(d16, input + i + N16); + + // Truncate and interleave into a single u8 vector + // OrderedTruncate2To(d_narrow, vec_wide_a, vec_wide_b) + const hn::Vec result8 = hn::OrderedTruncate2To(d8, in1, in2); + + // Store the resulting u8 vector + hn::StoreU(result8, d8, output + i); + } + + // Handle remaining elements (< N8) + for (; i < count; ++i) { + output[i] = static_cast(input[i]); // Truncation happens here + } +} + +// Implementation for scanCharFrequency (Unchanged from previous correct version) +void ScanCharFrequencyImpl(const uint8_t* HWY_RESTRICT text, size_t text_len, int32_t* HWY_RESTRICT freqs, int32_t delta) +{ + if (text_len == 0 || delta == 0) return; + D8 d; + const size_t N = hn::Lanes(d); + + const auto vec_a = hn::Set(d, 'a'); + const auto vec_z = hn::Set(d, 'z'); + const auto vec_A = hn::Set(d, 'A'); + const auto vec_Z = hn::Set(d, 'Z'); + const auto vec_0 = hn::Set(d, '0'); + const auto vec_9 = hn::Set(d, '9'); + const auto vec_underscore = hn::Set(d, '_'); + const auto vec_dollar = hn::Set(d, '$'); + + const auto vec_offset_a = hn::Set(d, 'a'); + const auto vec_offset_A = hn::Set(d, 'A'); + const auto vec_offset_0 = hn::Set(d, '0'); + + size_t i = 0; + size_t simd_text_len = text_len - (text_len % N); + for (; i < simd_text_len; i += N) { + const auto text_vec = hn::LoadU(d, text + i); + const auto mask_az = hn::And(hn::Ge(text_vec, vec_a), hn::Le(text_vec, vec_z)); + const auto mask_AZ = hn::And(hn::Ge(text_vec, vec_A), hn::Le(text_vec, vec_Z)); + const auto mask_09 = hn::And(hn::Ge(text_vec, vec_0), hn::Le(text_vec, vec_9)); + const auto mask_underscore = hn::Eq(text_vec, vec_underscore); + const auto mask_dollar = hn::Eq(text_vec, vec_dollar); + auto valid_mask = hn::Or(mask_az, hn::Or(mask_AZ, hn::Or(mask_09, hn::Or(mask_underscore, mask_dollar)))); + if (hn::AllFalse(d, valid_mask)) continue; + + const auto idx_az = hn::Sub(text_vec, vec_offset_a); + const auto idx_AZ = hn::Add(hn::Sub(text_vec, vec_offset_A), hn::Set(d, uint8_t { 26 })); + const auto idx_09 = hn::Add(hn::Sub(text_vec, vec_offset_0), hn::Set(d, uint8_t { 52 })); + + auto indices_vec = hn::Zero(d); + indices_vec = hn::IfThenElse(mask_az, idx_az, indices_vec); + indices_vec = hn::IfThenElse(mask_AZ, idx_AZ, indices_vec); + indices_vec = hn::IfThenElse(mask_09, idx_09, indices_vec); + indices_vec = hn::IfThenElse(mask_underscore, hn::Set(d, uint8_t { 62 }), indices_vec); + indices_vec = hn::IfThenElse(mask_dollar, hn::Set(d, uint8_t { 63 }), indices_vec); + + alignas(HWY_ALIGNMENT) uint8_t indices_array[HWY_MAX_LANES_D(D8)]; + alignas(HWY_ALIGNMENT) uint8_t valid_bits_array[(HWY_MAX_LANES_D(D8) + 7) / 8]; + hn::Store(indices_vec, d, indices_array); + hn::StoreMaskBits(d, valid_mask, valid_bits_array); + + for (size_t j = 0; j < N; ++j) { + if ((valid_bits_array[j / 8] >> (j % 8)) & 1) { + assert(indices_array[j] < 64); + freqs[indices_array[j]] += delta; + } + } + } + + for (; i < text_len; ++i) { + const uint8_t c = text[i]; + if (c >= 'a' && c <= 'z') + freqs[c - 'a'] += delta; + else if (c >= 'A' && c <= 'Z') + freqs[c - 'A' + 26] += delta; + else if (c >= '0' && c <= '9') + freqs[c - '0' + 52] += delta; + else if (c == '_') + freqs[62] += delta; + else if (c == '$') + freqs[63] += delta; + } +} + +// Implementation for finding interesting characters in string literals 
+size_t IndexOfInterestingCharacterInStringLiteralImpl(const uint8_t* HWY_RESTRICT text, size_t text_len, uint8_t quote) +{ + ASSERT(text_len > 0); + D8 d; + const size_t N = hn::Lanes(d); + + const auto vec_quote = hn::Set(d, quote); + const auto vec_backslash = hn::Set(d, '\\'); + const auto vec_min_ascii = hn::Set(d, uint8_t { 0x20 }); // Space + const auto vec_max_ascii = hn::Set(d, uint8_t { 0x7E }); // ~ + + const size_t simd_text_len = text_len - (text_len % N); + size_t i = 0; + for (; i < simd_text_len; i += N) { + const auto text_vec = hn::LoadN(d, text + i, N); + + // Check for quote, backslash, or characters outside printable ASCII range + const auto mask_quote = hn::Eq(text_vec, vec_quote); + const auto mask_backslash = hn::Eq(text_vec, vec_backslash); + const auto mask_lt_min = hn::Lt(text_vec, vec_min_ascii); + const auto mask_gt_max = hn::Gt(text_vec, vec_max_ascii); + + const auto found_mask = hn::Or( + hn::Or(mask_quote, mask_backslash), + hn::Or(mask_lt_min, mask_gt_max)); + + const intptr_t pos = hn::FindFirstTrue(d, found_mask); + if (pos >= 0) { + return i + pos; + } + } + + for (; i < text_len; ++i) { + const uint8_t c = text[i]; + if (c == quote || c == '\\' || (c < 0x20 || c > 0x7E)) { + return i; + } + } + + return text_len; +} + +size_t IndexOfNewlineOrNonASCIIOrHashOrAtImpl(const uint8_t* HWY_RESTRICT start_ptr, size_t search_len) +{ + ASSERT(search_len > 0); + + D8 d; + const size_t N = hn::Lanes(d); + + const auto vec_hash = hn::Set(d, '#'); + const auto vec_at = hn::Set(d, '@'); + const auto vec_min_ascii = hn::Set(d, uint8_t { 0x20 }); + const auto vec_max_ascii = hn::Set(d, uint8_t { 0x7E }); + + size_t i = 0; + const size_t simd_text_len = search_len - (search_len % N); + for (; i < simd_text_len; i += N) { + const auto vec = hn::LoadU(d, start_ptr + i); + + const auto mask_hash = hn::Eq(vec, vec_hash); + const auto mask_at = hn::Eq(vec, vec_at); + const auto mask_lt_min = hn::Lt(vec, vec_min_ascii); + const auto mask_gt_max = hn::Gt(vec, vec_max_ascii); + + const auto found_mask = hn::Or(hn::Or(mask_hash, mask_at), hn::Or(mask_lt_min, mask_gt_max)); + + const intptr_t pos = hn::FindFirstTrue(d, found_mask); + if (pos >= 0) { + return i + pos; + } + } + + for (; i < search_len; ++i) { + const uint8_t char_ = start_ptr[i]; + if (char_ == '#' || char_ == '@' || char_ < 0x20 || char_ > 127) { + return i; + } + } + + return search_len; +} + +size_t IndexOfNewlineOrNonASCIIImpl(const uint8_t* HWY_RESTRICT start_ptr, size_t search_len) +{ + ASSERT(search_len > 0); + + D8 d; + const size_t N = hn::Lanes(d); + + // SIMD constants + const auto vec_max_ascii = hn::Set(d, uint8_t { 127 }); + const auto vec_min_ascii = hn::Set(d, uint8_t { 0x20 }); + + // FUTURE TODO: normalize tabs + // Some tests involving githubactions depend on tabs not being normalized right now. 
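+ // Bytes below 0x20 or above 0x7F are flagged; '\r' and '\n' fall inside the control range, so no separate newline comparison is needed.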
+ + size_t i = 0; + const size_t simd_text_len = search_len - (search_len % N); + // Process full vectors + for (; i < simd_text_len; i += N) { + const auto vec = hn::LoadU(d, start_ptr + i); + const auto mask_lt_min = hn::Lt(vec, vec_min_ascii); + const auto mask_gt_max = hn::Gt(vec, vec_max_ascii); + + const auto found_mask = hn::Or(mask_gt_max, mask_lt_min); + + const intptr_t pos = hn::FindFirstTrue(d, found_mask); + if (pos >= 0) { + return i + pos; + } + } + + // Scalar check for the remainder + for (; i < search_len; ++i) { + const uint8_t char_ = start_ptr[i]; + if (char_ > 127 || char_ < 0x20) { + return i; + } + } + + return search_len; +} + +size_t IndexOfSpaceOrNewlineOrNonASCIIImpl(const uint8_t* HWY_RESTRICT start_ptr, size_t search_len) +{ + ASSERT(search_len > 0); + + D8 d; + const size_t N = hn::Lanes(d); + + const uint8_t after_space = ' ' + 1; + + const auto vec_min_ascii_including_space = hn::Set(d, after_space); + const auto vec_max_ascii = hn::Set(d, uint8_t { 127 }); + size_t simd_text_len = search_len - (search_len % N); + + size_t i = 0; + for (; i < simd_text_len; i += N) { + const auto vec = hn::LoadU(d, start_ptr + i); + const auto mask_lt_min = hn::Lt(vec, vec_min_ascii_including_space); + const auto mask_gt_max = hn::Gt(vec, vec_max_ascii); + const auto found_mask = hn::Or(mask_gt_max, mask_lt_min); + const intptr_t pos = hn::FindFirstTrue(d, found_mask); + if (pos >= 0) { + return i + pos; + } + } + + for (; i < search_len; ++i) { + const uint8_t char_ = start_ptr[i]; + if (char_ <= ' ' || char_ > 127) { + return i; + } + } + + return search_len; +} + +bool ContainsNewlineOrNonASCIIOrQuoteImpl(const uint8_t* HWY_RESTRICT text, size_t text_len) +{ + ASSERT(text_len > 0); + + D8 d; + const size_t N = hn::Lanes(d); + + // SIMD constants + const auto vec_max_ascii = hn::Set(d, uint8_t { 127 }); + const auto vec_min_ascii = hn::Set(d, uint8_t { 0x20 }); + const auto vec_quote = hn::Set(d, uint8_t { '"' }); + + size_t i = 0; + const size_t simd_text_len = text_len - (text_len % N); + + // Process full vectors + for (; i < simd_text_len; i += N) { + const auto vec = hn::LoadU(d, text + i); + const auto mask_lt_min = hn::Lt(vec, vec_min_ascii); + const auto mask_gt_max = hn::Gt(vec, vec_max_ascii); + + const auto mask_quote_eq = hn::Eq(vec, vec_quote); + + const auto found_mask = hn::Or(hn::Or(mask_gt_max, mask_lt_min), mask_quote_eq); + + if (!hn::AllFalse(d, found_mask)) { + return true; + } + } + + // Scalar check for the remainder + for (; i < text_len; ++i) { + const uint8_t char_ = text[i]; + if (char_ > 127 || char_ < 0x20 || char_ == '"') { + return true; + } + } + + return false; +} + +template +static size_t IndexOfNeedsEscapeForJavaScriptStringImpl(const uint8_t* HWY_RESTRICT text, size_t text_len, uint8_t quote_char) +{ + ASSERT(text_len > 0); + + D8 d; + const size_t N = hn::Lanes(d); + + // Set up SIMD constants + const auto vec_backslash = hn::Set(d, uint8_t { '\\' }); + const auto vec_min_ascii = hn::Set(d, uint8_t { 0x20 }); + const auto vec_max_ascii = hn::Set(d, uint8_t { 127 }); + const auto vec_quote = hn::Set(d, quote_char); + + const auto vec_dollar = hn::Set(d, uint8_t { '$' }); + ASSERT(is_backtick || quote_char != '`'); + + // Calculate how many full SIMD vectors we can process + const size_t simd_text_len = text_len - (text_len % N); + size_t i = 0; + + // Process chunks of the string + for (; i < simd_text_len; i += N) { + const auto text_vec = hn::LoadN(d, text + i, N); + + // Check for characters that need escaping + const auto 
mask_gt_max = hn::Gt(text_vec, vec_max_ascii); + const auto mask_lt_min = hn::Lt(text_vec, vec_min_ascii); + const auto mask_backslash = hn::Eq(text_vec, vec_backslash); + const auto mask_quote = hn::Eq(text_vec, vec_quote); + + auto found_mask = !is_backtick ? hn::Or( + hn::Or(mask_gt_max, mask_lt_min), + hn::Or(mask_backslash, mask_quote)) + : hn::Or( + hn::Or( + hn::Or(mask_gt_max, mask_lt_min), + hn::Or(mask_backslash, mask_quote)), + hn::Eq(text_vec, vec_dollar)); + + const intptr_t pos = hn::FindFirstTrue(d, found_mask); + if (pos >= 0) { + return i + pos; + } + } + + // Scalar check for the remainder + for (; i < text_len; ++i) { + const uint8_t char_ = text[i]; + if (char_ >= 127 || (char_ < 0x20 && char_ != 0x09) || char_ == '\\' || char_ == quote_char || (is_backtick && char_ == '$')) { + return i; + } + } + + return text_len; // No characters needing escape found +} + +size_t IndexOfNeedsEscapeForJavaScriptStringImplBacktick(const uint8_t* HWY_RESTRICT text, size_t text_len, uint8_t quote_char) +{ + return IndexOfNeedsEscapeForJavaScriptStringImpl(text, text_len, quote_char); +} + +size_t IndexOfNeedsEscapeForJavaScriptStringImplQuote(const uint8_t* HWY_RESTRICT text, size_t text_len, uint8_t quote_char) +{ + return IndexOfNeedsEscapeForJavaScriptStringImpl(text, text_len, quote_char); +} + +// Highway implementation of memmem +// Returns a pointer to the first occurrence of `needle` in `haystack`, +// or nullptr if not found. The return type is non-const `uint8_t*` +// to match the standard C `memmem` signature, even though the input +// is const. The caller should handle constness appropriately. +uint8_t* MemMemImpl(const uint8_t* haystack, size_t haystack_len, + const uint8_t* needle, size_t needle_len) +{ + // --- Edge Cases --- + if (HWY_UNLIKELY(needle_len == 0)) { + return const_cast(haystack); + } + if (HWY_UNLIKELY(haystack_len < needle_len)) { + return nullptr; + } + if (HWY_UNLIKELY(needle_len == 1)) { + size_t index = IndexOfCharImpl(haystack, haystack_len, needle[0]); + if (index != haystack_len) { + return const_cast(haystack + index); + } + return nullptr; + } + + // --- SIMD Setup --- + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + const uint8_t first_needle_char = needle[0]; + const hn::Vec v_first_needle = hn::Set(d, first_needle_char); + const size_t last_possible_start = haystack_len - needle_len; + + // --- SIMD Main Loop --- + size_t i = 0; + while (i + N <= haystack_len && i <= last_possible_start) { + const hn::Vec haystack_vec = hn::LoadU(d, haystack + i); + hn::Mask m_starts = hn::Eq(haystack_vec, v_first_needle); + + // Iterate through potential matches within this vector chunk using FindFirstTrue + while (!hn::AllFalse(d, m_starts)) { + const intptr_t bit_idx_ptr = hn::FindFirstTrue(d, m_starts); + // Loop condition guarantees FindFirstTrue finds something + HWY_ASSERT(bit_idx_ptr >= 0); + const size_t bit_idx = static_cast(bit_idx_ptr); + + const size_t potential_pos = i + bit_idx; + + // Double-check bounds (essential if N > needle_len, and correct otherwise) + if (potential_pos <= last_possible_start) { + if (memcmp(haystack + potential_pos, needle, needle_len) == 0) { + return const_cast(haystack + potential_pos); + } + } else { + // Optimization: If the first match found in this chunk is already + // beyond the last possible start, no subsequent match in this + // chunk can be valid. + goto remainder_check; // Exit both loops and proceed to scalar remainder + } + + // Clear the found bit to find the next one in the next iteration. 
+ // SetOnlyFirst creates a mask with only the first true bit set. + // AndNot removes that bit from m_starts. + const hn::Mask first_bit_mask = hn::SetOnlyFirst(m_starts); + m_starts = hn::AndNot(first_bit_mask, m_starts); + } // End while (!AllFalse) + + i += N; + } // End SIMD loop + +remainder_check: + // --- Scalar Remainder Loop --- + // Check any remaining bytes that couldn't form a full vector load + // or potential starts within the last vector load that weren't checked + // because they were past last_possible_start. + // Start `i` from where the SIMD loop *could* have last started a valid check. + size_t remainder_start = (i >= N) ? (i - N) : 0; + // Ensure we re-check any potential starts the SIMD loop might have skipped + // due to the bounds check optimization or being in the final partial vector. + for (; remainder_start <= last_possible_start; ++remainder_start) { + // Optimization: Check first character before expensive memcmp + if (haystack[remainder_start] == first_needle_char) { + if (memcmp(haystack + remainder_start, needle, needle_len) == 0) { + return const_cast(haystack + remainder_start); + } + } + } + + return nullptr; // Not found +} + +// Implementation for WebSocket mask application +void FillWithSkipMaskImpl(const uint8_t* HWY_RESTRICT mask, size_t mask_len, uint8_t* HWY_RESTRICT output, const uint8_t* HWY_RESTRICT input, size_t length, bool skip_mask) +{ + ASSERT(mask_len == 4); + + ASSERT(length > 0); + + // If we're skipping masking or there's no data, return early + if (skip_mask) { + std::memcpy(output, input, length); + return; + } + + D8 d; + const size_t N = hn::Lanes(d); + + // Create a vector filled with the mask pattern repeating every 4 bytes + alignas(HWY_ALIGNMENT) uint8_t mask_pattern[HWY_MAX_LANES_D(D8)] = {}; + for (size_t i = 0; i < HWY_MAX_LANES_D(D8); i += 4) { + mask_pattern[i] = mask[0]; + mask_pattern[i + 1] = mask[1]; + mask_pattern[i + 2] = mask[2]; + mask_pattern[i + 3] = mask[3]; + } + const auto mask_vec = hn::Load(d, mask_pattern); + + // Process data in chunks of size N + size_t i = 0; + const size_t vector_length = length - (length % N); + for (; i < vector_length; i += N) { + // Load input data + const auto input_vec = hn::LoadU(d, input + i); + // XOR with mask + const auto masked_vec = hn::Xor(input_vec, mask_vec); + // Store result + hn::StoreU(masked_vec, d, output + i); + } + + // Handle remaining bytes with scalar operations + for (; i < length; ++i) { + output[i] = input[i] ^ mask[i % 4]; + } +} + +} // namespace HWY_NAMESPACE +} // namespace bun +HWY_AFTER_NAMESPACE(); + +// HWY_ONCE ensures this block is only included once, +// in the final pass after all target-specific code is generated. +#if HWY_ONCE + +namespace bun { + +// Define the dispatch tables. The names here must exactly match +// the *Impl function names defined within the HWY_NAMESPACE block above. +HWY_EXPORT(ContainsNewlineOrNonASCIIOrQuoteImpl); +HWY_EXPORT(CopyU16ToU8Impl); +HWY_EXPORT(FillWithSkipMaskImpl); +HWY_EXPORT(IndexOfAnyCharImpl); +HWY_EXPORT(IndexOfCharImpl); +HWY_EXPORT(IndexOfInterestingCharacterInStringLiteralImpl); +HWY_EXPORT(IndexOfNeedsEscapeForJavaScriptStringImplBacktick); +HWY_EXPORT(IndexOfNeedsEscapeForJavaScriptStringImplQuote); +HWY_EXPORT(IndexOfNewlineOrNonASCIIImpl); +HWY_EXPORT(IndexOfNewlineOrNonASCIIOrHashOrAtImpl); +HWY_EXPORT(IndexOfSpaceOrNewlineOrNonASCIIImpl); +HWY_EXPORT(MemMemImpl); +HWY_EXPORT(ScanCharFrequencyImpl); +} // namespace bun + +// Define the C-callable wrappers that use HWY_DYNAMIC_DISPATCH. 
+// These need to be defined *after* the HWY_EXPORT block. +extern "C" { + +void* highway_memmem(const uint8_t* haystack, size_t haystack_len, const uint8_t* needle, size_t needle_len) +{ + return HWY_DYNAMIC_DISPATCH(bun::MemMemImpl)(haystack, haystack_len, needle, needle_len); +} + +static void highway_copy_u16_to_u8_impl( + const uint16_t* input, + size_t count, + uint8_t* output) +{ + return HWY_DYNAMIC_DISPATCH(bun::CopyU16ToU8Impl)(input, count, output); +} + +void highway_copy_u16_to_u8( + // No HWY_RESTRICT + const uint16_t* input, + + size_t count, + // No HWY_RESTRICT + uint8_t* output) +{ + + if (count == 0) { + return; + } + + // Check alignment of the input pointer + if (!hwy::IsAligned(input, alignof(uint16_t))) { + // Handle the first unaligned element scalar-ly + output[0] = static_cast(input[0]); + + // Call the core implementation with adjusted pointers and count, + // which are now guaranteed to be aligned or have count == 0. + // The HWY_RESTRICT inside CopyU16ToU8Impl is now valid for the + // ranges it operates on. + if (count > 1) + highway_copy_u16_to_u8_impl(input + 1, count - 1, output + 1); + } else { + // Input is already aligned, call the core implementation directly. + highway_copy_u16_to_u8_impl(input, count, output); + } +} +size_t highway_index_of_any_char(const uint8_t* HWY_RESTRICT text, size_t text_len, const uint8_t* HWY_RESTRICT chars, size_t chars_len) +{ + return HWY_DYNAMIC_DISPATCH(bun::IndexOfAnyCharImpl)(text, text_len, chars, chars_len); +} + +void highway_char_frequency(const uint8_t* HWY_RESTRICT text, size_t text_len, + int32_t* freqs, int32_t delta) +{ + HWY_DYNAMIC_DISPATCH(bun::ScanCharFrequencyImpl)(text, text_len, freqs, delta); +} + +size_t highway_index_of_char(const uint8_t* HWY_RESTRICT haystack, size_t haystack_len, + uint8_t needle) +{ + return HWY_DYNAMIC_DISPATCH(bun::IndexOfCharImpl)(haystack, haystack_len, needle); +} + +size_t highway_index_of_interesting_character_in_string_literal(const uint8_t* HWY_RESTRICT text, size_t text_len, uint8_t quote) +{ + return HWY_DYNAMIC_DISPATCH(bun::IndexOfInterestingCharacterInStringLiteralImpl)(text, text_len, quote); +} + +size_t highway_index_of_newline_or_non_ascii(const uint8_t* HWY_RESTRICT haystack, size_t haystack_len) +{ + return HWY_DYNAMIC_DISPATCH(bun::IndexOfNewlineOrNonASCIIImpl)(haystack, haystack_len); +} + +size_t highway_index_of_newline_or_non_ascii_or_hash_or_at(const uint8_t* HWY_RESTRICT haystack, size_t haystack_len) +{ + return HWY_DYNAMIC_DISPATCH(bun::IndexOfNewlineOrNonASCIIOrHashOrAtImpl)(haystack, haystack_len); +} + +bool highway_contains_newline_or_non_ascii_or_quote(const uint8_t* HWY_RESTRICT text, size_t text_len) +{ + return HWY_DYNAMIC_DISPATCH(bun::ContainsNewlineOrNonASCIIOrQuoteImpl)(text, text_len); +} + +size_t highway_index_of_needs_escape_for_javascript_string(const uint8_t* HWY_RESTRICT text, size_t text_len, uint8_t quote_char) +{ + if (quote_char == '`') { + return HWY_DYNAMIC_DISPATCH(bun::IndexOfNeedsEscapeForJavaScriptStringImplBacktick)(text, text_len, quote_char); + } else { + return HWY_DYNAMIC_DISPATCH(bun::IndexOfNeedsEscapeForJavaScriptStringImplQuote)(text, text_len, quote_char); + } +} + +size_t highway_index_of_space_or_newline_or_non_ascii(const uint8_t* HWY_RESTRICT text, size_t text_len) +{ + return HWY_DYNAMIC_DISPATCH(bun::IndexOfSpaceOrNewlineOrNonASCIIImpl)(text, text_len); +} + +void highway_fill_with_skip_mask( + const uint8_t* mask, // 4-byte mask array + size_t mask_len, // Should be 4 + uint8_t* output, // Output buffer + 
const uint8_t* input, // Input buffer + size_t length, // Length of input/output + bool skip_mask) // Whether to skip masking +{ + HWY_DYNAMIC_DISPATCH(bun::FillWithSkipMaskImpl)(mask, mask_len, output, input, length, skip_mask); +} + +} // extern "C" + +#if OS(DARWIN) +// On macOS, override the libc memmem with our implementation +// This uses inline assembly to ensure the symbol is exported with the correct name +__asm__(".globl _memmem"); +__asm__(".set _memmem, _highway_memmem"); +#elif OS(LINUX) +// On Linux, override the libc memmem with our implementation +// This uses the GNU-specific attribute to alias our function to the libc symbol +// The alias will be visible across the entire program, not just this file +extern "C" { +// Using both "default" visibility and "weak" ensures our implementation is used +// throughout the entire program when linked, not just in this object file +__attribute__((visibility("default"), weak, used)) void* memmem(const void* haystack, size_t haystacklen, const void* needle, size_t needlelen) + __attribute__((alias("highway_memmem"))); +} + +#endif + +#endif // HWY_ONCE diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig index f12a653c78..e2077779cb 100644 --- a/src/bun.js/webcore/encoding.zig +++ b/src/bun.js/webcore/encoding.zig @@ -355,7 +355,7 @@ pub const Encoder = struct { switch (comptime encoding) { .utf8 => { - return strings.elementLengthLatin1IntoUTF8([]const u8, input[0..len]); + return strings.elementLengthLatin1IntoUTF8(input[0..len]); }, .latin1, .ascii, .buffer => { @@ -395,7 +395,7 @@ pub const Encoder = struct { }, .latin1, .ascii, .buffer => { const out = @min(len, to_len); - strings.copyU16IntoU8(to[0..to_len], []const u16, input[0..out]); + strings.copyU16IntoU8(to[0..to_len], input[0..out]); return out; }, // string is already encoded, just need to copy the data @@ -404,7 +404,7 @@ pub const Encoder = struct { const bytes_input_len = len * 2; const written = @min(bytes_input_len, to_len); const input_u8 = @as([*]const u8, @ptrCast(input)); - strings.copyU16IntoU8(to[0..written], []const u8, input_u8[0..written]); + bun.memmove(to[0..written], input_u8[0..written]); return written; } else { const bytes_input_len = len * 2; @@ -413,7 +413,7 @@ pub const Encoder = struct { const fixed_len = (written / 2) * 2; const input_u8 = @as([*]const u8, @ptrCast(input)); - strings.copyU16IntoU8(to[0..written], []const u8, input_u8[0..fixed_len]); + bun.memmove(to[0..written], input_u8[0..fixed_len]); return fixed_len; } }, @@ -503,7 +503,7 @@ pub const Encoder = struct { }, .latin1, .buffer, .ascii => { var to = allocator.alloc(u8, len) catch return &[_]u8{}; - strings.copyU16IntoU8(to[0..len], []const u16, input[0..len]); + strings.copyU16IntoU8(to[0..len], input[0..len]); return to; }, // string is already encoded, just need to copy the data diff --git a/src/bun.zig b/src/bun.zig index 49a952a844..5b83f866d1 100644 --- a/src/bun.zig +++ b/src/bun.zig @@ -3595,3 +3595,4 @@ pub fn freeSensitive(allocator: std.mem.Allocator, slice: anytype) void { pub const server = @import("./bun.js/api/server.zig"); pub const macho = @import("./macho.zig"); pub const valkey = @import("./valkey/index.zig"); +pub const highway = @import("./highway.zig"); diff --git a/src/css/selectors/parser.zig b/src/css/selectors/parser.zig index 50a997b192..c84be6b2f8 100644 --- a/src/css/selectors/parser.zig +++ b/src/css/selectors/parser.zig @@ -2986,7 +2986,7 @@ pub fn parse_attribute_selector(comptime Impl: type, parser: *SelectorParser, in }; 
const never_matches = switch (operator) { .equal, .dash_match => false, - .includes => value_str.len == 0 or std.mem.indexOfAny(u8, value_str, SELECTOR_WHITESPACE) != null, + .includes => value_str.len == 0 or bun.strings.indexOfAny(value_str, SELECTOR_WHITESPACE) != null, .prefix, .substring, .suffix => value_str.len == 0, }; diff --git a/src/glob/GlobWalker.zig b/src/glob/GlobWalker.zig index 3e0ded6647..39c08beefb 100644 --- a/src/glob/GlobWalker.zig +++ b/src/glob/GlobWalker.zig @@ -1422,43 +1422,10 @@ pub fn GlobWalker_( return filepath.len > 0 and filepath[0] == '.'; } + const syntax_tokens = "*[{?!"; + fn checkSpecialSyntax(pattern: []const u8) bool { - if (pattern.len < 16) { - for (pattern[0..]) |c| { - switch (c) { - '*', '[', '{', '?', '!' => return true, - else => {}, - } - } - return false; - } - - const syntax_tokens = comptime [_]u8{ '*', '[', '{', '?', '!' }; - const needles: [syntax_tokens.len]@Vector(16, u8) = comptime needles: { - var needles: [syntax_tokens.len]@Vector(16, u8) = undefined; - for (syntax_tokens, 0..) |tok, i| { - needles[i] = @splat(tok); - } - break :needles needles; - }; - - var i: usize = 0; - while (i + 16 <= pattern.len) : (i += 16) { - const haystack: @Vector(16, u8) = pattern[i..][0..16].*; - inline for (needles) |needle| { - if (std.simd.firstTrue(needle == haystack) != null) return true; - } - } - - if (i < pattern.len) { - for (pattern[i..]) |c| { - inline for (syntax_tokens) |tok| { - if (c == tok) return true; - } - } - } - - return false; + return bun.strings.indexOfAny(pattern, syntax_tokens) != null; } fn makeComponent( diff --git a/src/highway.zig b/src/highway.zig new file mode 100644 index 0000000000..7a7570a075 --- /dev/null +++ b/src/highway.zig @@ -0,0 +1,305 @@ +const std = @import("std"); +const bun = @import("bun"); +const strings = bun.strings; +const string = bun.string; +const Environment = bun.Environment; + +extern "c" fn highway_char_frequency( + text: [*]const u8, + text_len: usize, + freqs: [*]i32, + delta: i32, +) void; + +extern "c" fn highway_index_of_char( + haystack: [*]const u8, + haystack_len: usize, + needle: u8, +) usize; + +extern "c" fn highway_index_of_interesting_character_in_string_literal( + noalias text: [*]const u8, + text_len: usize, + quote: u8, +) usize; + +extern "c" fn highway_index_of_newline_or_non_ascii( + noalias haystack: [*]const u8, + haystack_len: usize, +) usize; + +extern "c" fn highway_index_of_newline_or_non_ascii_or_ansi( + noalias haystack: [*]const u8, + haystack_len: usize, +) usize; + +extern "c" fn highway_index_of_newline_or_non_ascii_or_hash_or_at( + noalias haystack: [*]const u8, + haystack_len: usize, +) usize; + +extern "c" fn highway_index_of_space_or_newline_or_non_ascii( + noalias haystack: [*]const u8, + haystack_len: usize, +) usize; + +extern "c" fn highway_contains_newline_or_non_ascii_or_quote( + noalias text: [*]const u8, + text_len: usize, +) bool; + +extern "c" fn highway_index_of_needs_escape_for_javascript_string( + noalias text: [*]const u8, + text_len: usize, + quote_char: u8, +) usize; + +extern "c" fn highway_index_of_any_char( + noalias text: [*]const u8, + text_len: usize, + noalias chars: [*]const u8, + chars_len: usize, +) usize; + +extern "c" fn highway_fill_with_skip_mask( + mask: [*]const u8, + mask_len: usize, + output: [*]u8, + input: [*]const u8, + length: usize, + skip_mask: bool, +) void; + +/// Count frequencies of [a-zA-Z0-9_$] characters in a string +/// Updates the provided frequency array with counts (adds delta for each occurrence) +pub fn 
scanCharFrequency(text: string, freqs: *[64]i32, delta: i32) void { + if (text.len == 0 or delta == 0) { + return; + } + + highway_char_frequency( + text.ptr, + text.len, + freqs.ptr, + delta, + ); +} + +pub fn indexOfChar(haystack: string, needle: u8) ?usize { + if (haystack.len == 0) { + return null; + } + + const result = highway_index_of_char( + haystack.ptr, + haystack.len, + needle, + ); + + if (result == haystack.len) { + return null; + } + + bun.debugAssert(haystack[result] == needle); + + return result; +} + +pub fn indexOfInterestingCharacterInStringLiteral(slice: string, quote_type: u8) ?usize { + if (slice.len == 0) { + return null; + } + + const result = highway_index_of_interesting_character_in_string_literal( + slice.ptr, + slice.len, + quote_type, + ); + + if (result == slice.len) { + return null; + } + + return result; +} + +pub fn indexOfNewlineOrNonASCII(haystack: string) ?usize { + bun.debugAssert(haystack.len > 0); + + const result = highway_index_of_newline_or_non_ascii( + haystack.ptr, + haystack.len, + ); + + if (result == haystack.len) { + return null; + } + if (comptime Environment.isDebug) { + const haystack_char = haystack[result]; + if (!(haystack_char > 127 or haystack_char < 0x20 or haystack_char == '\r' or haystack_char == '\n')) { + @panic("Invalid character found in indexOfNewlineOrNonASCII"); + } + } + + return result; +} + +pub fn indexOfNewlineOrNonASCIIOrANSI(haystack: string) ?usize { + bun.debugAssert(haystack.len > 0); + + const result = highway_index_of_newline_or_non_ascii_or_ansi( + haystack.ptr, + haystack.len, + ); + + if (result == haystack.len) { + return null; + } + if (comptime Environment.isDebug) { + const haystack_char = haystack[result]; + if (!(haystack_char > 127 or haystack_char < 0x20 or haystack_char == '\r' or haystack_char == '\n')) { + @panic("Invalid character found in indexOfNewlineOrNonASCIIOrANSI"); + } + } + + return result; +} + +/// Checks if the string contains any newlines, non-ASCII characters, or quotes +pub fn containsNewlineOrNonASCIIOrQuote(text: string) bool { + if (text.len == 0) { + return false; + } + + return highway_contains_newline_or_non_ascii_or_quote( + text.ptr, + text.len, + ); +} + +/// Finds the first character that needs escaping in a JavaScript string +/// Looks for characters above ASCII (> 127), control characters (< 0x20), +/// backslash characters (`\`), the quote character itself, and for backtick +/// strings also the dollar sign (`$`) +pub fn indexOfNeedsEscapeForJavaScriptString(slice: string, quote_char: u8) ?u32 { + if (slice.len == 0) { + return null; + } + + const result = highway_index_of_needs_escape_for_javascript_string( + slice.ptr, + slice.len, + quote_char, + ); + + if (result == slice.len) { + return null; + } + + if (comptime Environment.isDebug) { + const haystack_char = slice[result]; + if (!(haystack_char > 127 or haystack_char < 0x20 or haystack_char == '\\' or haystack_char == quote_char or haystack_char == '$' or haystack_char == '\r' or haystack_char == '\n')) { + @panic("Invalid character found in indexOfNeedsEscapeForJavaScriptString"); + } + } + + return @truncate(result); +} + +pub fn indexOfAnyChar(haystack: string, chars: string) ?usize { + if (haystack.len == 0 or chars.len == 0) { + return null; + } + + const result = highway_index_of_any_char(haystack.ptr, haystack.len, chars.ptr, chars.len); + + if (result == haystack.len) { + return null; + } + + if (comptime Environment.isDebug) { + const haystack_char = haystack[result]; + var found = false; + for (chars) |c| 
{ + if (c == haystack_char) { + found = true; + break; + } + } + if (!found) { + @panic("Invalid character found in indexOfAnyChar"); + } + } + + return result; +} + +extern "c" fn highway_copy_u16_to_u8( + input: [*]align(1) const u16, + count: usize, + output: [*]u8, +) void; + +pub fn copyU16ToU8(input: []align(1) const u16, output: []u8) void { + highway_copy_u16_to_u8(input.ptr, input.len, output.ptr); +} + +/// Apply a WebSocket mask to data using SIMD acceleration +/// If skip_mask is true, data is copied without masking +pub fn fillWithSkipMask(mask: [4]u8, output: []u8, input: []const u8, skip_mask: bool) void { + if (input.len == 0) { + return; + } + + highway_fill_with_skip_mask( + &mask, + 4, + output.ptr, + input.ptr, + input.len, + skip_mask, + ); +} + +/// Useful for single-line JavaScript comments. +/// Scans for: +/// - `\n`, `\r` +/// - Non-ASCII characters (which implicitly include `\n`, `\r`) +/// - `#` +/// - `@` +pub fn indexOfNewlineOrNonASCIIOrHashOrAt(haystack: string) ?usize { + if (haystack.len == 0) { + return null; + } + + const result = highway_index_of_newline_or_non_ascii_or_hash_or_at( + haystack.ptr, + haystack.len, + ); + + if (result == haystack.len) { + return null; + } + + return result; +} + +/// Scans for: +/// - " " +/// - Non-ASCII characters (which implicitly include `\n`, `\r`, '\t') +pub fn indexOfSpaceOrNewlineOrNonASCII(haystack: string) ?usize { + if (haystack.len == 0) { + return null; + } + + const result = highway_index_of_space_or_newline_or_non_ascii( + haystack.ptr, + haystack.len, + ); + + if (result == haystack.len) { + return null; + } + + return result; +} diff --git a/src/http/websocket_http_client.zig b/src/http/websocket_http_client.zig index 57f38e0e71..04dd6d1642 100644 --- a/src/http/websocket_http_client.zig +++ b/src/http/websocket_http_client.zig @@ -731,80 +731,17 @@ pub const Mask = struct { const mask = mask_buf.*; const skip_mask = @as(u32, @bitCast(mask)) == 0; - if (!skip_mask) { - fillWithSkipMask(mask, output_, input_, false); - } else { - fillWithSkipMask(mask, output_, input_, true); - } + fillWithSkipMask(mask, output_, input_, skip_mask); } - fn fillWithSkipMask(mask: [4]u8, output_: []u8, input_: []const u8, comptime skip_mask: bool) void { - var input = input_; - var output = output_; - - if (comptime Environment.enableSIMD) { - if (input.len >= strings.ascii_vector_size) { - const vec: strings.AsciiVector = brk: { - var in: [strings.ascii_vector_size]u8 = undefined; - comptime var i: usize = 0; - inline while (i < strings.ascii_vector_size) : (i += 4) { - in[i..][0..4].* = mask; - } - break :brk @as(strings.AsciiVector, in); - }; - const end_ptr_wrapped_to_last_16 = input.ptr + input.len - (input.len % strings.ascii_vector_size); - - if (comptime skip_mask) { - while (input.ptr != end_ptr_wrapped_to_last_16) { - const input_vec: strings.AsciiVector = @as(strings.AsciiVector, input[0..strings.ascii_vector_size].*); - output.ptr[0..strings.ascii_vector_size].* = input_vec; - output = output[strings.ascii_vector_size..]; - input = input[strings.ascii_vector_size..]; - } - } else { - while (input.ptr != end_ptr_wrapped_to_last_16) { - const input_vec: strings.AsciiVector = @as(strings.AsciiVector, input[0..strings.ascii_vector_size].*); - output.ptr[0..strings.ascii_vector_size].* = input_vec ^ vec; - output = output[strings.ascii_vector_size..]; - input = input[strings.ascii_vector_size..]; - } - } - } - - // hint to the compiler not to vectorize the next loop - bun.assert(input.len < strings.ascii_vector_size); 
- } - - if (comptime !skip_mask) { - while (input.len >= 4) { - const input_vec: [4]u8 = input[0..4].*; - output.ptr[0..4].* = [4]u8{ - input_vec[0] ^ mask[0], - input_vec[1] ^ mask[1], - input_vec[2] ^ mask[2], - input_vec[3] ^ mask[3], - }; - output = output[4..]; - input = input[4..]; - } - } else { - while (input.len >= 4) { - const input_vec: [4]u8 = input[0..4].*; - output.ptr[0..4].* = input_vec; - output = output[4..]; - input = input[4..]; - } - } - - if (comptime !skip_mask) { - for (input, 0..) |c, i| { - output[i] = c ^ mask[i % 4]; - } - } else { - for (input, 0..) |c, i| { - output[i] = c; - } + fn fillWithSkipMask(mask: [4]u8, output_: []u8, input_: []const u8, skip_mask: bool) void { + const input = input_; + const output = output_; + if (input.len == 0) { + @branchHint(.unlikely); + return; } + return bun.highway.fillWithSkipMask(mask, output, input, skip_mask); } }; @@ -902,7 +839,7 @@ const Copy = union(enum) { return WebsocketHeader.frameSizeIncludingMask(byte_len.*); }, .latin1 => { - byte_len.* = strings.elementLengthLatin1IntoUTF8([]const u8, this.latin1); + byte_len.* = strings.elementLengthLatin1IntoUTF8(this.latin1); return WebsocketHeader.frameSizeIncludingMask(byte_len.*); }, .bytes => { diff --git a/src/js_lexer.zig b/src/js_lexer.zig index c27609e2c1..51fb58fbed 100644 --- a/src/js_lexer.zig +++ b/src/js_lexer.zig @@ -806,6 +806,10 @@ fn NewLexer_( return if (!(cp_len + it.current > it.source.contents.len)) it.source.contents[it.current .. cp_len + it.current] else ""; } + fn remaining(it: *const LexerType) []const u8 { + return it.source.contents[it.current..]; + } + inline fn nextCodepoint(it: *LexerType) CodePoint { if (it.current >= it.source.contents.len) { it.end = it.source.contents.len; @@ -1498,26 +1502,14 @@ fn NewLexer_( lexer.token = .t_slash_equals; }, '/' => { - singleLineComment: while (true) { - lexer.step(); - switch (lexer.code_point) { - '\r', '\n', 0x2028, 0x2029 => { - break :singleLineComment; - }, - -1 => { - break :singleLineComment; - }, - else => {}, - } - } - + lexer.scanSingleLineComment(); if (comptime is_json) { if (!json.allow_comments) { try lexer.addRangeError(lexer.range(), "JSON does not support comments", .{}, true); return; } } - lexer.scanCommentText(); + lexer.scanCommentText(false); continue; }, '*' => { @@ -1571,7 +1563,7 @@ fn NewLexer_( return; } } - lexer.scanCommentText(); + lexer.scanCommentText(true); continue; }, else => { @@ -1890,7 +1882,7 @@ fn NewLexer_( } } - fn scanCommentText(lexer: *LexerType) void { + fn scanCommentText(lexer: *LexerType, for_pragma: bool) void { const text = lexer.source.contents[lexer.start..lexer.end]; const has_legal_annotation = text.len > 2 and text[2] == '!'; const is_multiline_comment = text.len > 1 and text[1] == '*'; @@ -1922,120 +1914,132 @@ fn NewLexer_( if (comptime is_json) return; - var rest = text[0..end_comment_text]; - const end = rest.ptr + rest.len; - - if (comptime Environment.enableSIMD) { - const wrapped_len = rest.len - (rest.len % strings.ascii_vector_size); - const comment_end = rest.ptr + wrapped_len; - while (rest.ptr != comment_end) { - const vec: strings.AsciiVector = rest.ptr[0..strings.ascii_vector_size].*; - - // lookahead for any # or @ characters - const hashtag = @as(strings.AsciiVectorU1, @bitCast(vec == @as(strings.AsciiVector, @splat(@as(u8, '#'))))); - const at = @as(strings.AsciiVectorU1, @bitCast(vec == @as(strings.AsciiVector, @splat(@as(u8, '@'))))); - - if (@reduce(.Max, hashtag + at) == 1) { - rest.len = @intFromPtr(end) - 
@intFromPtr(rest.ptr); - if (comptime Environment.allow_assert) { - bun.assert( - strings.containsChar(&@as([strings.ascii_vector_size]u8, vec), '#') or - strings.containsChar(&@as([strings.ascii_vector_size]u8, vec), '@'), - ); - } - - for (@as([strings.ascii_vector_size]u8, vec), 0..) |c, i| { - switch (c) { - '@', '#' => { - const chunk = rest[i + 1 ..]; - if (!lexer.has_pure_comment_before) { - if (strings.hasPrefixWithWordBoundary(chunk, "__PURE__")) { - lexer.has_pure_comment_before = true; - continue; - } - // TODO: implement NO_SIDE_EFFECTS - // else if (strings.hasPrefixWithWordBoundary(chunk, "__NO_SIDE_EFFECTS__")) { - // lexer.has_no_side_effect_comment_before = true; - // continue; - // } - } - - if (strings.hasPrefixWithWordBoundary(chunk, "jsx")) { - if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsx", chunk)) |span| { - lexer.jsx_pragma._jsx = span; - } - } else if (strings.hasPrefixWithWordBoundary(chunk, "jsxFrag")) { - if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsxFrag", chunk)) |span| { - lexer.jsx_pragma._jsxFrag = span; - } - } else if (strings.hasPrefixWithWordBoundary(chunk, "jsxRuntime")) { - if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsxRuntime", chunk)) |span| { - lexer.jsx_pragma._jsxRuntime = span; - } - } else if (strings.hasPrefixWithWordBoundary(chunk, "jsxImportSource")) { - if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsxImportSource", chunk)) |span| { - lexer.jsx_pragma._jsxImportSource = span; - } - } else if (i == 2 and strings.hasPrefixComptime(chunk, " sourceMappingURL=")) { - if (PragmaArg.scan(.no_space_first, lexer.start + i + 1, " sourceMappingURL=", chunk)) |span| { - lexer.source_mapping_url = span; - } - } - }, - else => {}, - } - } - } - - rest.ptr += strings.ascii_vector_size; - } - rest.len = @intFromPtr(end) - @intFromPtr(rest.ptr); + if (!for_pragma) { + return; } - if (comptime Environment.allow_assert) - bun.assert(rest.len == 0 or bun.isSliceInBuffer(rest, text)); + var rest = text[0..end_comment_text]; - while (rest.len > 0) { - const c = rest[0]; - rest = rest[1..]; + while (strings.indexOfAny(rest, "@#")) |i| { + const c = rest[i]; + rest = rest[@min(i + 1, rest.len)..]; switch (c) { '@', '#' => { const chunk = rest; - const i = @intFromPtr(chunk.ptr) - @intFromPtr(text.ptr); - if (!lexer.has_pure_comment_before) { - if (strings.hasPrefixWithWordBoundary(chunk, "__PURE__")) { - lexer.has_pure_comment_before = true; - continue; - } - } + const offset = lexer.scanPragma(lexer.start + i + (text.len - rest.len), chunk, false); - if (strings.hasPrefixWithWordBoundary(chunk, "jsx")) { - if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsx", chunk)) |span| { - lexer.jsx_pragma._jsx = span; - } - } else if (strings.hasPrefixWithWordBoundary(chunk, "jsxFrag")) { - if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsxFrag", chunk)) |span| { - lexer.jsx_pragma._jsxFrag = span; - } - } else if (strings.hasPrefixWithWordBoundary(chunk, "jsxRuntime")) { - if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsxRuntime", chunk)) |span| { - lexer.jsx_pragma._jsxRuntime = span; - } - } else if (strings.hasPrefixWithWordBoundary(chunk, "jsxImportSource")) { - if (PragmaArg.scan(.skip_space_first, lexer.start + i + 1, "jsxImportSource", chunk)) |span| { - lexer.jsx_pragma._jsxImportSource = span; - } - } else if (i == 2 and strings.hasPrefixComptime(chunk, " sourceMappingURL=")) { - if (PragmaArg.scan(.no_space_first, lexer.start + i + 1, " sourceMappingURL=", 
chunk)) |span| { - lexer.source_mapping_url = span; - } - } + rest = rest[ + // The @min is necessary because the file could end + // with a pragma and hasPrefixWithWordBoundary + // returns true when that "word boundary" is EOF + @min(offset, rest.len)..]; }, else => {}, } } } + /// This scans a "// comment" in a single pass over the input. + fn scanSingleLineComment(lexer: *LexerType) void { + while (true) { + // Find index of newline (ASCII/Unicode), non-ASCII, '#', or '@'. + if (bun.highway.indexOfNewlineOrNonASCIIOrHashOrAt(lexer.remaining())) |relative_index| { + const absolute_index = lexer.current + relative_index; + lexer.current = absolute_index; // Move TO the interesting char + + lexer.step(); // Consume the interesting char, sets code_point, advances current + + switch (lexer.code_point) { + '\r', '\n', 0x2028, 0x2029 => { // Is it a line terminator? + // Found the end of the comment line. + return; // Stop scanning. Lexer state is ready for the next token. + }, + -1 => { + return; + }, // EOF? Stop. + + '#', '@' => { + if (comptime !is_json) { + const pragma_trigger_pos = lexer.end; // Position OF #/@ + // Use remaining() which starts *after* the consumed #/@ + const chunk = lexer.remaining(); + + const offset = lexer.scanPragma(pragma_trigger_pos, chunk, true); + + if (offset > 0) { + // Pragma found (e.g., __PURE__). + // Advance current past the pragma's argument text. + // 'current' is already after the #/@ trigger. + lexer.current += offset; + // Do NOT consume the character immediately after the pragma. + // Let the main loop find the actual line terminator. + + // Continue the outer loop from the position AFTER the pragma arg. + continue; + } + // If offset == 0, it wasn't a valid pragma start. + } + // Not a pragma or is_json. Treat #/@ as a normal comment character. + // The character was consumed by step(). Let the outer loop continue. + continue; + }, + else => { + // Non-ASCII (but not LS/PS), etc. Treat as normal comment char. + // The character was consumed by step(). Let the outer loop continue. + continue; + }, + } + } else { // Highway found nothing until EOF + // Consume the rest of the line. + lexer.end = lexer.source.contents.len; + lexer.current = lexer.source.contents.len; + lexer.code_point = -1; // Set EOF state + return; + } + } + unreachable; + } + /// Scans the string for a pragma. + /// offset is used when there's an issue with the JSX pragma later on. + /// Returns the byte length to advance by if found, otherwise 0. 
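+ /// Recognized pragmas: __PURE__, jsx, jsxFrag, jsxRuntime, jsxImportSource, and sourceMappingURL=.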
+ fn scanPragma(lexer: *LexerType, offset_for_errors: usize, chunk: string, allow_newline: bool) usize { + if (!lexer.has_pure_comment_before) { + if (strings.hasPrefixWithWordBoundary(chunk, "__PURE__")) { + lexer.has_pure_comment_before = true; + return "__PURE__".len; + } + } + + if (strings.hasPrefixWithWordBoundary(chunk, "jsx")) { + if (PragmaArg.scan(.skip_space_first, lexer.start + offset_for_errors, "jsx", chunk, allow_newline)) |span| { + lexer.jsx_pragma._jsx = span; + return "jsx".len + + if (span.range.len > 0) @as(usize, @intCast(span.range.len)) else 0; + } + } else if (strings.hasPrefixWithWordBoundary(chunk, "jsxFrag")) { + if (PragmaArg.scan(.skip_space_first, lexer.start + offset_for_errors, "jsxFrag", chunk, allow_newline)) |span| { + lexer.jsx_pragma._jsxFrag = span; + return "jsxFrag".len + + if (span.range.len > 0) @as(usize, @intCast(span.range.len)) else 0; + } + } else if (strings.hasPrefixWithWordBoundary(chunk, "jsxRuntime")) { + if (PragmaArg.scan(.skip_space_first, lexer.start + offset_for_errors, "jsxRuntime", chunk, allow_newline)) |span| { + lexer.jsx_pragma._jsxRuntime = span; + return "jsxRuntime".len + + if (span.range.len > 0) @as(usize, @intCast(span.range.len)) else 0; + } + } else if (strings.hasPrefixWithWordBoundary(chunk, "jsxImportSource")) { + if (PragmaArg.scan(.skip_space_first, lexer.start + offset_for_errors, "jsxImportSource", chunk, allow_newline)) |span| { + lexer.jsx_pragma._jsxImportSource = span; + return "jsxImportSource".len + + if (span.range.len > 0) @as(usize, @intCast(span.range.len)) else 0; + } + } else if (chunk.len >= " sourceMappingURL=".len + 1 and strings.hasPrefixComptime(chunk, " sourceMappingURL=")) { // Check includes space for prefix + return PragmaArg.scanSourceMappingURLValue(lexer.start, offset_for_errors, chunk, &lexer.source_mapping_url); + } + + return 0; + } // TODO: implement this pub fn removeMultilineCommentIndent(_: *LexerType, _: string, text: string) string { return text; @@ -2123,7 +2127,7 @@ fn NewLexer_( return js_ast.E.String.init(try lexer.allocator.dupe(u16, lexer.temp_buffer_u16.items)); } else { const result = try lexer.allocator.alloc(u8, lexer.temp_buffer_u16.items.len); - strings.copyU16IntoU8(result, []const u16, lexer.temp_buffer_u16.items); + strings.copyU16IntoU8(result, lexer.temp_buffer_u16.items); return js_ast.E.String.init(result); } }, @@ -3267,7 +3271,48 @@ pub const PragmaArg = enum { no_space_first, skip_space_first, - pub fn scan(kind: PragmaArg, offset_: usize, pragma: string, text_: string) ?js_ast.Span { + pub fn isNewline(c: CodePoint) bool { + return c == '\r' or c == '\n' or c == 0x2028 or c == 0x2029; + } + + // These can be extremely long, so we use SIMD. + /// "//# sourceMappingURL=data:/adspaoksdpkz" + /// ^^^^^^^^^^^^^^^^^^ + pub fn scanSourceMappingURLValue(start: usize, offset_for_errors: usize, chunk: string, result: *?js_ast.Span) usize { + const prefix: u32 = " sourceMappingURL=".len; + const url_and_rest_of_code = chunk[prefix..]; // Slice containing only the potential argument + + const url_len: usize = brk: { + if (bun.strings.indexOfSpaceOrNewlineOrNonASCII(url_and_rest_of_code, 0)) |delimiter_pos_in_arg| { + // SIMD found the delimiter at index 'delimiter_pos_in_arg' relative to url start. + // The argument's length is exactly this index. + break :brk delimiter_pos_in_arg; + } else { + // SIMD found no delimiter in the entire url. + // The argument is the whole chunk. 
+ break :brk url_and_rest_of_code.len; + } + }; + + // Now we have the correct argument length (url_len) and the argument text. + const url = url_and_rest_of_code[0..url_len]; + + // Calculate absolute start location of the argument + const absolute_arg_start = start + offset_for_errors + prefix; + + result.* = js_ast.Span{ + .range = logger.Range{ + .len = @as(i32, @intCast(url_len)), // Correct length + .loc = .{ .start = @as(i32, @intCast(absolute_arg_start)) }, // Correct start + }, + .text = url, + }; + + // Return total length consumed from the start of the chunk + return prefix + url_len; // Correct total length + } + + pub fn scan(kind: PragmaArg, offset_: usize, pragma: string, text_: string, allow_newline: bool) ?js_ast.Span { var text = text_[pragma.len..]; var iter = strings.CodepointIterator.init(text); @@ -3297,7 +3342,7 @@ pub const PragmaArg = enum { } var i: usize = 0; - while (!isWhitespace(cursor.c)) { + while (!isWhitespace(cursor.c) and (!allow_newline or !isNewline(cursor.c))) { i += cursor.width; if (i >= text.len) { break; @@ -3356,28 +3401,5 @@ fn skipToInterestingCharacterInMultilineComment(text_: []const u8) ?u32 { } fn indexOfInterestingCharacterInStringLiteral(text_: []const u8, quote: u8) ?usize { - var text = text_; - const quote_: @Vector(strings.ascii_vector_size, u8) = @splat(@as(u8, quote)); - const backslash: @Vector(strings.ascii_vector_size, u8) = @splat(@as(u8, '\\')); - const V1x16 = strings.AsciiVectorU1; - - while (text.len >= strings.ascii_vector_size) { - const vec: strings.AsciiVector = text[0..strings.ascii_vector_size].*; - - const any_significant = - @as(V1x16, @bitCast(vec > strings.max_16_ascii)) | - @as(V1x16, @bitCast(vec < strings.min_16_ascii)) | - @as(V1x16, @bitCast(quote_ == vec)) | - @as(V1x16, @bitCast(backslash == vec)); - - if (@reduce(.Max, any_significant) > 0) { - const bitmask = @as(u16, @bitCast(any_significant)); - const first = @ctz(bitmask); - bun.assert(first < strings.ascii_vector_size); - return first + (@intFromPtr(text.ptr) - @intFromPtr(text_.ptr)); - } - text = text[strings.ascii_vector_size..]; - } - - return null; + return bun.highway.indexOfInterestingCharacterInStringLiteral(text_, quote); } diff --git a/src/js_printer.zig b/src/js_printer.zig index a94951f00e..19137c9f72 100644 --- a/src/js_printer.zig +++ b/src/js_printer.zig @@ -165,7 +165,7 @@ pub fn estimateLengthForUTF8(input: []const u8, comptime ascii_only: bool, compt var remaining = input; var len: usize = 2; // for quotes - while (strings.indexOfNeedsEscape(remaining, quote_char)) |i| { + while (strings.indexOfNeedsEscapeForJavaScriptString(remaining, quote_char)) |i| { len += i; remaining = remaining[i..]; const char_len = strings.wtf8ByteSequenceLengthWithInvalid(remaining[0]); @@ -249,7 +249,7 @@ pub fn writePreQuotedString(text_in: []const u8, comptime Writer: type, writer: switch (encoding) { .ascii, .utf8 => { - if (strings.indexOfNeedsEscape(remain, quote_char)) |j| { + if (strings.indexOfNeedsEscapeForJavaScriptString(remain, quote_char)) |j| { const text_chunk = text[i .. 
i + clamped_width]; try writer.writeAll(text_chunk); i += clamped_width; diff --git a/src/shell/braces.zig b/src/shell/braces.zig index d739140ccc..a696eb0bd2 100644 --- a/src/shell/braces.zig +++ b/src/shell/braces.zig @@ -95,60 +95,6 @@ fn StackStack(comptime T: type, comptime SizeType: type, comptime N: SizeType) t }; } -/// This may have false positives but it is fast -fn fastDetect(src: []const u8) bool { - var has_open = false; - var has_close = false; - if (src.len < 16) { - for (src) |char| { - switch (char) { - '{' => { - has_open = true; - }, - '}' => { - has_close = true; - }, - } - if (has_close and has_close) return true; - } - return false; - } - - const needles = comptime [2]@Vector(16, u8){ - @splat('{'), - @splat('}'), - @splat('"'), - }; - - const i: usize = 0; - while (i + 16 <= src.len) { - const haystack = src[i .. i + 16].*; - if (std.simd.firstTrue(needles[0] == haystack)) { - has_open = true; - } - if (std.simd.firstTrue(needles[1] == haystack)) { - has_close = true; - } - if (has_open and has_close) return true; - } - - if (i < src.len) { - for (src) |char| { - switch (char) { - '{' => { - has_open = true; - }, - '}' => { - has_close = true; - }, - } - if (has_close and has_open) return true; - } - return false; - } - return false; -} - const ExpandError = StackError || ParserError; /// `out` is preallocated by using the result from `calculateExpandedAmount` diff --git a/src/shell/shell.zig b/src/shell/shell.zig index 6193fe6c19..eb89c72c2e 100644 --- a/src/shell/shell.zig +++ b/src/shell/shell.zig @@ -3563,26 +3563,10 @@ var stderr_mutex = bun.Mutex{}; pub fn hasEqSign(str: []const u8) ?u32 { if (isAllAscii(str)) { - if (str.len < 16) - return hasEqSignAsciiSlow(str); - - const needles: @Vector(16, u8) = @splat('='); - - var i: u32 = 0; - while (i + 16 <= str.len) : (i += 16) { - const haystack = str[i..][0..16].*; - const result = haystack == needles; - - if (std.simd.firstTrue(result)) |idx| { - return @intCast(i + idx); - } - } - - return i + (hasEqSignAsciiSlow(str[i..]) orelse return null); + return bun.strings.indexOfChar(str, '='); } // TODO actually i think that this can also use the simd stuff - var iter = CodepointIterator.init(str); var cursor = CodepointIterator.Cursor{}; while (iter.next(&cursor)) { @@ -3594,11 +3578,6 @@ pub fn hasEqSign(str: []const u8) ?u32 { return null; } -pub fn hasEqSignAsciiSlow(str: []const u8) ?u32 { - for (str, 0..) 
|c, i| if (c == '=') return @intCast(i); - return null; -} - pub const CmdEnvIter = struct { env: *const bun.StringArrayHashMap([:0]const u8), iter: bun.StringArrayHashMap([:0]const u8).Iterator, diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 2bba0b7cf8..3025727c86 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -96,43 +96,14 @@ fn literalLength(comptime T: type, comptime str: string) usize { pub const OptionalUsize = std.meta.Int(.unsigned, @bitSizeOf(usize) - 1); pub fn indexOfAny(slice: string, comptime str: []const u8) ?OptionalUsize { - switch (comptime str.len) { + return switch (comptime str.len) { 0 => @compileError("str cannot be empty"), 1 => return indexOfChar(slice, str[0]), - else => {}, - } - - var remaining = slice; - if (remaining.len == 0) return null; - - if (comptime Environment.enableSIMD) { - while (remaining.len >= ascii_vector_size) { - const vec: AsciiVector = remaining[0..ascii_vector_size].*; - var cmp: AsciiVectorU1 = @bitCast(vec == @as(AsciiVector, @splat(@as(u8, str[0])))); - inline for (str[1..]) |c| { - cmp |= @bitCast(vec == @as(AsciiVector, @splat(@as(u8, c)))); - } - - if (@reduce(.Max, cmp) > 0) { - const bitmask = @as(AsciiVectorInt, @bitCast(cmp)); - const first = @ctz(bitmask); - - return @as(OptionalUsize, @intCast(first + slice.len - remaining.len)); - } - - remaining = remaining[ascii_vector_size..]; - } - - if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size); - } - - for (remaining, 0..) |c, i| { - if (strings.indexOfChar(str, c) != null) { - return @as(OptionalUsize, @intCast(i + slice.len - remaining.len)); - } - } - - return null; + else => if (bun.highway.indexOfAnyChar(slice, str)) |i| + @intCast(i) + else + null, + }; } pub fn indexOfAny16(self: []const u16, comptime str: anytype) ?OptionalUsize { @@ -177,7 +148,7 @@ pub fn inMapCaseInsensitive(self: []const u8, comptime ComptimeStringMap: anytyp return bun.String.ascii(self).inMapCaseInsensitive(ComptimeStringMap); } -pub inline fn containsAny(in: anytype, target: string) bool { +pub inline fn containsAny(in: anytype, target: anytype) bool { for (in) |str| if (contains(if (@TypeOf(str) == u8) &[1]u8{str} else bun.span(str), target)) return true; return false; } @@ -496,7 +467,7 @@ pub inline fn lastIndexOf(self: string, str: string) ?usize { return std.mem.lastIndexOf(u8, self, str); } -pub inline fn indexOf(self: string, str: string) ?usize { +pub fn indexOf(self: string, str: string) ?usize { if (comptime !bun.Environment.isNative) { return std.mem.indexOf(u8, self, str); } @@ -990,16 +961,13 @@ pub fn endsWithAnyComptime(self: string, comptime str: string) bool { } } -pub fn eql(self: string, other: anytype) bool { +pub fn eql(self: string, other: []const u8) bool { if (self.len != other.len) return false; if (comptime @TypeOf(other) == *string) { return eql(self, other.*); } - for (self, 0..) 
|c, i| { - if (other[i] != c) return false; - } - return true; + return eqlLong(self, other, false); } pub fn eqlComptimeT(comptime T: type, self: []const T, comptime alt: anytype) bool { @@ -1367,43 +1335,11 @@ pub fn copyU8IntoU16WithAlignment(comptime alignment: u21, output_: []align(alig // } // } -pub inline fn copyU16IntoU8(output_: []u8, comptime InputType: type, input_: InputType) void { - if (comptime Environment.allow_assert) assert(input_.len <= output_.len); - var output = output_; - var input = input_; +pub inline fn copyU16IntoU8(output: []u8, input: []align(1) const u16) void { if (comptime Environment.allow_assert) assert(input.len <= output.len); + const count = @min(input.len, output.len); - // https://zig.godbolt.org/z/9rTn1orcY - - const group = @as(usize, 16); - // end at the last group of 16 bytes - var input_ptr = input.ptr; - var output_ptr = output.ptr; - - if (comptime Environment.enableSIMD) { - const end_len = (@min(input.len, output.len) & ~(group - 1)); - const last_vector_ptr = input.ptr + end_len; - while (last_vector_ptr != input_ptr) { - const input_vec1: @Vector(group, u16) = input_ptr[0..group].*; - inline for (0..group) |i| { - output_ptr[i] = @as(u8, @truncate(input_vec1[i])); - } - - output_ptr += group; - input_ptr += group; - } - - input.len -= end_len; - output.len -= end_len; - } - - const last_input_ptr = input_ptr + @min(input.len, output.len); - - while (last_input_ptr != input_ptr) { - output_ptr[0] = @as(u8, @truncate(input_ptr[0])); - output_ptr += 1; - input_ptr += 1; - } + bun.highway.copyU16ToU8(input[0..count], output[0..count]); } const strings = @This(); @@ -2353,11 +2289,7 @@ pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf1 } list.items.len += i; - copyU16IntoU8( - list.items[list.items.len - i ..], - Type, - to_copy, - ); + copyU16IntoU8(list.items[list.items.len - i ..], to_copy); if (comptime skip_trailing_replacement) { if (replacement.is_lead and utf16_remaining.len == 0) { @@ -2377,7 +2309,7 @@ pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf1 try list.ensureTotalCapacityPrecise(utf16_remaining.len + list.items.len); const old_len = list.items.len; list.items.len += utf16_remaining.len; - copyU16IntoU8(list.items[old_len..], Type, utf16_remaining); + copyU16IntoU8(list.items[old_len..], utf16_remaining); } log("UTF16 {d} -> {d} UTF8", .{ utf16.len, list.items.len }); @@ -2794,43 +2726,8 @@ pub fn replaceLatin1WithUTF8(buf_: []u8) void { } } -pub fn elementLengthLatin1IntoUTF8(comptime Type: type, latin1_: Type) usize { - // https://zig.godbolt.org/z/zzYexPPs9 - - var latin1 = latin1_; - const input_len = latin1.len; - var total_non_ascii_count: usize = 0; - - // This is about 30% faster on large input compared to auto-vectorization - if (comptime Environment.enableSIMD) { - const end = latin1.ptr + (latin1.len - (latin1.len % ascii_vector_size)); - while (latin1.ptr != end) { - const vec: AsciiVector = latin1[0..ascii_vector_size].*; - - // Shifting a unsigned 8 bit integer to the right by 7 bits always produces a value of 0 or 1. - const cmp = vec >> @as(AsciiVector, @splat( - @as(u8, 7), - )); - - // Anding that value rather than converting it into a @Vector(16, u1) produces better code from LLVM. 
- const mask: AsciiVector = cmp & @as(AsciiVector, @splat( - @as(u8, 1), - )); - - total_non_ascii_count += @as(usize, @reduce(.Add, mask)); - latin1 = latin1[ascii_vector_size..]; - } - - // an important hint to the compiler to not auto-vectorize the loop below - if (latin1.len >= ascii_vector_size) unreachable; - } - - for (latin1) |c| { - total_non_ascii_count += @as(usize, @intFromBool(c > 127)); - } - - // each non-ascii latin1 character becomes 2 UTF8 characters - return input_len + total_non_ascii_count; +pub fn elementLengthLatin1IntoUTF8(slice: []const u8) usize { + return bun.simdutf.length.utf8.from.latin1(slice); } pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult { @@ -2865,20 +2762,7 @@ pub fn elementLengthLatin1IntoUTF16(comptime Type: type, latin1_: Type) usize { return latin1_.len; } - var count: usize = 0; - var latin1 = latin1_; - while (latin1.len > 0) { - const function = comptime if (std.meta.Child(Type) == u8) strings.firstNonASCIIWithType else strings.firstNonASCII16; - const to_write = function(Type, latin1) orelse @as(u32, @truncate(latin1.len)); - count += to_write; - latin1 = latin1[to_write..]; - if (latin1.len > 0) { - count += comptime if (std.meta.Child(Type) == u8) 2 else 1; - latin1 = latin1[1..]; - } - } - - return count; + return bun.simdutf.length.utf16.from.latin1(latin1_); } pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) !Escaped(u8) { @@ -3605,7 +3489,7 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type, while (firstNonASCII16(Type, utf16_remaining)) |i| { const end = @min(i, remaining.len); - if (end > 0) copyU16IntoU8(remaining, Type, utf16_remaining[0..end]); + if (end > 0) copyU16IntoU8(remaining, utf16_remaining[0..end]); remaining = remaining[end..]; utf16_remaining = utf16_remaining[end..]; @@ -3674,7 +3558,7 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type, if (remaining.len > 0 and !ended_on_non_ascii and utf16_remaining.len > 0) { const len = @min(remaining.len, utf16_remaining.len); - copyU16IntoU8(remaining[0..len], Type, utf16_remaining[0..len]); + copyU16IntoU8(remaining[0..len], utf16_remaining[0..len]); utf16_remaining = utf16_remaining[len..]; remaining = remaining[len..]; } @@ -4014,44 +3898,7 @@ pub fn isAllASCII(slice: []const u8) bool { return true; } - if (bun.FeatureFlags.use_simdutf) - return bun.simdutf.validate.ascii(slice); - - var remaining = slice; - - // The NEON SIMD unit is 128-bit wide and includes 16 128-bit registers that can be used as 32 64-bit registers - if (comptime Environment.enableSIMD) { - const remaining_end_ptr = remaining.ptr + remaining.len - (remaining.len % ascii_vector_size); - while (remaining.ptr != remaining_end_ptr) : (remaining.ptr += ascii_vector_size) { - const vec: AsciiVector = remaining[0..ascii_vector_size].*; - - if (@reduce(.Max, vec) > 127) { - return false; - } - } - } - - const Int = u64; - const size = @sizeOf(Int); - const remaining_last8 = slice.ptr + slice.len - (slice.len % size); - while (remaining.ptr != remaining_last8) : (remaining.ptr += size) { - const bytes = @as(Int, @bitCast(remaining[0..size].*)); - // https://dotat.at/@/2022-06-27-tolower-swar.html - const mask = bytes & 0x8080808080808080; - - if (mask > 0) { - return false; - } - } - - const final = slice.ptr + slice.len; - while (remaining.ptr != final) : (remaining.ptr += 1) { - if (remaining[0] > 127) { - return false; - } - } - - return true; + 
return bun.simdutf.validate.ascii(slice); } // #define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) @@ -4085,296 +3932,67 @@ pub inline fn u16GetSupplementary(lead: u32, trail: u32) u32 { pub const u16_surrogate_offset = 56613888; pub fn firstNonASCII(slice: []const u8) ?u32 { - return firstNonASCIIWithType([]const u8, slice); -} - -pub fn firstNonASCIIWithType(comptime Type: type, slice: Type) ?u32 { - var remaining = slice; - - if (comptime bun.FeatureFlags.use_simdutf) { - const result = bun.simdutf.validate.with_errors.ascii(slice); - if (result.status == .success) { - return null; - } - - return @as(u32, @truncate(result.count)); - } - - if (comptime Environment.enableSIMD) { - if (remaining.len >= ascii_vector_size) { - const remaining_start = remaining.ptr; - const remaining_end = remaining.ptr + remaining.len - (remaining.len % ascii_vector_size); - - while (remaining.ptr != remaining_end) { - const vec: AsciiVector = remaining[0..ascii_vector_size].*; - - if (@reduce(.Max, vec) > 127) { - const Int = u64; - const size = @sizeOf(Int); - remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start); - - { - const bytes = @as(Int, @bitCast(remaining[0..size].*)); - // https://dotat.at/@/2022-06-27-tolower-swar.html - const mask = bytes & 0x8080808080808080; - - if (mask > 0) { - const first_set_byte = @ctz(mask) / 8; - if (comptime Environment.isDebug) { - bun.assert(remaining[first_set_byte] > 127); - for (0..first_set_byte) |j| { - bun.assert(remaining[j] <= 127); - } - } - - return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len)); - } - remaining = remaining[size..]; - } - { - const bytes = @as(Int, @bitCast(remaining[0..size].*)); - const mask = bytes & 0x8080808080808080; - - if (mask > 0) { - const first_set_byte = @ctz(mask) / 8; - if (comptime Environment.isDebug) { - bun.assert(remaining[first_set_byte] > 127); - for (0..first_set_byte) |j| { - bun.assert(remaining[j] <= 127); - } - } - - return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len)); - } - } - unreachable; - } - - // the more intuitive way, using slices, produces worse codegen - // specifically: it subtracts the length at the end of the loop - // we don't need to do that - // we only need to subtract the length once at the very end - remaining.ptr += ascii_vector_size; - } - remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start); - } - } - - { - const Int = u64; - const size = @sizeOf(Int); - const remaining_start = remaining.ptr; - const remaining_end = remaining.ptr + remaining.len - (remaining.len % size); - - if (comptime Environment.enableSIMD) { - // these assertions exist more so for LLVM - bun.unsafeAssert(remaining.len < ascii_vector_size); - bun.unsafeAssert(@intFromPtr(remaining.ptr + ascii_vector_size) > @intFromPtr(remaining_end)); - } - - if (remaining.len >= size) { - while (remaining.ptr != remaining_end) { - const bytes = @as(Int, @bitCast(remaining[0..size].*)); - // https://dotat.at/@/2022-06-27-tolower-swar.html - const mask = bytes & 0x8080808080808080; - - if (mask > 0) { - remaining.len -= @intFromPtr(remaining.ptr) - @intFromPtr(remaining_start); - const first_set_byte = @ctz(mask) / 8; - if (comptime Environment.isDebug) { - bun.unsafeAssert(remaining[first_set_byte] > 127); - for (0..first_set_byte) |j| { - bun.unsafeAssert(remaining[j] <= 127); - } - } - - return @as(u32, first_set_byte) + @as(u32, @intCast(slice.len - remaining.len)); - } - - remaining.ptr += size; - } - remaining.len -= 
@intFromPtr(remaining.ptr) - @intFromPtr(remaining_start); - } - } - - if (comptime Environment.allow_assert) assert(remaining.len < 8); - - for (remaining) |*char| { - if (char.* > 127) { - // try to prevent it from reading the length of the slice - return @as(u32, @truncate(@intFromPtr(char) - @intFromPtr(slice.ptr))); - } - } - - return null; -} - -pub fn indexOfNewlineOrNonASCIIOrANSI(slice_: []const u8, offset: u32) ?u32 { - const slice = slice_[offset..]; - var remaining = slice; - - if (remaining.len == 0) + const result = bun.simdutf.validate.with_errors.ascii(slice); + if (result.status == .success) { return null; - - if (comptime Environment.enableSIMD) { - while (remaining.len >= ascii_vector_size) { - const vec: AsciiVector = remaining[0..ascii_vector_size].*; - const cmp = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) | - @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r'))))) | - @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n'))))) | - @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\x1b'))))); - - if (@reduce(.Max, cmp) > 0) { - const bitmask = @as(AsciiVectorInt, @bitCast(cmp)); - const first = @ctz(bitmask); - - return @as(u32, first) + @as(u32, @intCast(slice.len - remaining.len)) + offset; - } - - remaining = remaining[ascii_vector_size..]; - } - - if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size); } - for (remaining) |*char_| { - const char = char_.*; - if (char > 127 or char < 0x20 or char == '\n' or char == '\r' or char == '\x1b') { - return @as(u32, @truncate((@intFromPtr(char_) - @intFromPtr(slice.ptr)))) + offset; - } - } - - return null; + return @as(u32, @truncate(result.count)); } +pub const indexOfNewlineOrNonASCIIOrANSI = indexOfNewlineOrNonASCII; + +/// Checks if slice[offset..] 
has any < 0x20 or > 127 characters pub fn indexOfNewlineOrNonASCII(slice_: []const u8, offset: u32) ?u32 { return indexOfNewlineOrNonASCIICheckStart(slice_, offset, true); } +pub fn indexOfSpaceOrNewlineOrNonASCII(slice_: []const u8, offset: u32) ?u32 { + const slice = slice_[offset..]; + const remaining = slice; + + if (remaining.len == 0) + return null; + + if (remaining[0] > 127 or (remaining[0] < 0x20 and remaining[0] != 0x09)) { + return offset; + } + + const i = bun.highway.indexOfSpaceOrNewlineOrNonASCII(remaining) orelse return null; + return @as(u32, @truncate(i)) + offset; +} + pub fn indexOfNewlineOrNonASCIICheckStart(slice_: []const u8, offset: u32, comptime check_start: bool) ?u32 { const slice = slice_[offset..]; - var remaining = slice; + const remaining = slice; if (remaining.len == 0) return null; if (comptime check_start) { // this shows up in profiling - if (remaining[0] > 127 or remaining[0] < 0x20 or remaining[0] == '\r' or remaining[0] == '\n') { + if (remaining[0] > 127 or (remaining[0] < 0x20 and remaining[0] != 0x09)) { return offset; } } - if (comptime Environment.enableSIMD) { - while (remaining.len >= ascii_vector_size) { - const vec: AsciiVector = remaining[0..ascii_vector_size].*; - const cmp = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) | - @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r'))))) | - @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n'))))); - - if (@reduce(.Max, cmp) > 0) { - const bitmask = @as(AsciiVectorInt, @bitCast(cmp)); - const first = @ctz(bitmask); - - return @as(u32, first) + @as(u32, @intCast(slice.len - remaining.len)) + offset; - } - - remaining = remaining[ascii_vector_size..]; - } - - if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size); - } - - for (remaining) |*char_| { - const char = char_.*; - if (char > 127 or char < 0x20 or char == '\n' or char == '\r') { - return @as(u32, @truncate((@intFromPtr(char_) - @intFromPtr(slice.ptr)))) + offset; - } - } - - return null; + const i = bun.highway.indexOfNewlineOrNonASCII(remaining) orelse return null; + return @as(u32, @truncate(i)) + offset; } -pub fn containsNewlineOrNonASCIIOrQuote(slice_: []const u8) bool { - const slice = slice_; - var remaining = slice; - - if (remaining.len == 0) - return false; - - if (comptime Environment.enableSIMD) { - while (remaining.len >= ascii_vector_size) { - const vec: AsciiVector = remaining[0..ascii_vector_size].*; - const cmp = @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) | - @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\r'))))) | - @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\n'))))) | - @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '"'))))); - - if (@reduce(.Max, cmp) > 0) { - return true; - } - - remaining = remaining[ascii_vector_size..]; - } - - if (comptime Environment.allow_assert) assert(remaining.len < ascii_vector_size); - } - - for (remaining) |*char_| { - const char = char_.*; - if (char > 127 or char < 0x20 or char == '\n' or char == '\r' or char == '"') { - return true; - } - } - - return false; +pub fn containsNewlineOrNonASCIIOrQuote(text: []const u8) bool { + return bun.highway.containsNewlineOrNonASCIIOrQuote(text); } -/// JSON escape -pub fn indexOfNeedsEscape(slice: []const u8, comptime quote_char: u8) ?u32 { - var remaining = slice; - if (remaining.len == 0) +/// Supports: 
+/// - `"` +/// - `'` +/// - "`" +pub fn indexOfNeedsEscapeForJavaScriptString(slice: []const u8, quote_char: u8) ?u32 { + if (slice.len == 0) return null; - if (remaining[0] >= 127 or remaining[0] < 0x20 or remaining[0] == '\\' or remaining[0] == quote_char or (quote_char == '`' and remaining[0] == '$')) { - return 0; - } - - if (comptime Environment.enableSIMD) { - while (remaining.len >= ascii_vector_size) { - const vec: AsciiVector = remaining[0..ascii_vector_size].*; - const cmp: AsciiVectorU1 = if (comptime quote_char == '`') ( // - @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | - @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) | - @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\\'))))) | - @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, quote_char))))) | - @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '$'))))) // - ) else ( // - @as(AsciiVectorU1, @bitCast((vec > max_16_ascii))) | - @as(AsciiVectorU1, @bitCast((vec < min_16_ascii))) | - @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, '\\'))))) | - @as(AsciiVectorU1, @bitCast(vec == @as(AsciiVector, @splat(@as(u8, quote_char))))) // - ); - - if (@reduce(.Max, cmp) > 0) { - const bitmask = @as(AsciiVectorInt, @bitCast(cmp)); - const first = @ctz(bitmask); - - return @as(u32, first) + @as(u32, @truncate(@intFromPtr(remaining.ptr) - @intFromPtr(slice.ptr))); - } - - remaining = remaining[ascii_vector_size..]; - } - } - - for (remaining) |*char_| { - const char = char_.*; - if (char > 127 or char < 0x20 or char == '\\' or char == quote_char or (quote_char == '`' and char == '$')) { - return @as(u32, @truncate(@intFromPtr(char_) - @intFromPtr(slice.ptr))); - } - } - - return null; + return bun.highway.indexOfNeedsEscapeForJavaScriptString(slice, quote_char); } pub fn indexOfNeedsURLEncode(slice: []const u8) ?u32 { @@ -4447,15 +4065,7 @@ pub fn indexOfNeedsURLEncode(slice: []const u8) ?u32 { } pub fn indexOfCharZ(sliceZ: [:0]const u8, char: u8) ?u63 { - const ptr = bun.C.strchr(sliceZ.ptr, char) orelse return null; - const pos = @intFromPtr(ptr) - @intFromPtr(sliceZ.ptr); - - if (comptime Environment.isDebug) - bun.assert(@intFromPtr(sliceZ.ptr) <= @intFromPtr(ptr) and - @intFromPtr(ptr) < @intFromPtr(sliceZ.ptr + sliceZ.len) and - pos <= sliceZ.len); - - return @as(u63, @truncate(pos)); + return @truncate(bun.highway.indexOfChar(sliceZ, char) orelse return null); } pub fn indexOfChar(slice: []const u8, char: u8) ?u32 { @@ -4463,19 +4073,11 @@ pub fn indexOfChar(slice: []const u8, char: u8) ?u32 { } pub fn indexOfCharUsize(slice: []const u8, char: u8) ?usize { - if (slice.len == 0) - return null; - if (comptime !Environment.isNative) { return std.mem.indexOfScalar(u8, slice, char); } - const ptr = bun.C.memchr(slice.ptr, char, slice.len) orelse return null; - const i = @intFromPtr(ptr) - @intFromPtr(slice.ptr); - bun.assert(i < slice.len); - bun.assert(slice[i] == char); - - return i; + return bun.highway.indexOfChar(slice, char); } pub fn indexOfCharPos(slice: []const u8, char: u8, start_index: usize) ?usize { @@ -4485,13 +4087,9 @@ pub fn indexOfCharPos(slice: []const u8, char: u8, start_index: usize) ?usize { if (start_index >= slice.len) return null; - const ptr = bun.C.memchr(slice.ptr + start_index, char, slice.len - start_index) orelse - return null; - const i = @intFromPtr(ptr) - @intFromPtr(slice.ptr); - bun.assert(i < slice.len); - bun.assert(slice[i] == char); - - return i; + const result = bun.highway.indexOfChar(slice[start_index..], char) 
orelse return null; + bun.debugAssert(slice.len > result + start_index); + return result + start_index; } pub fn indexOfAnyPosComptime(slice: []const u8, comptime chars: []const u8, start_index: usize) ?usize { @@ -4934,47 +4532,6 @@ pub fn firstNonASCII16(comptime Slice: type, slice: Slice) ?u32 { return null; } -/// Fast path for printing template literal strings -pub fn @"nextUTF16NonASCIIOr$`\\"( - comptime Slice: type, - slice: Slice, -) ?u32 { - var remaining = slice; - - if (comptime Environment.enableSIMD and Environment.isNative) { - while (remaining.len >= ascii_u16_vector_size) { - const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*; - - const cmp = @as(AsciiVectorU16U1, @bitCast((vec > max_u16_ascii))) | - @as(AsciiVectorU16U1, @bitCast((vec < min_u16_ascii))) | - @as(AsciiVectorU16U1, @bitCast((vec == @as(AsciiU16Vector, @splat(@as(u16, '$')))))) | - @as(AsciiVectorU16U1, @bitCast((vec == @as(AsciiU16Vector, @splat(@as(u16, '`')))))) | - @as(AsciiVectorU16U1, @bitCast((vec == @as(AsciiU16Vector, @splat(@as(u16, '\\')))))); - - const bitmask = @as(u8, @bitCast(cmp)); - const first = @ctz(bitmask); - if (first < ascii_u16_vector_size) { - return @as(u32, @intCast(@as(u32, first) + - @as(u32, @intCast(slice.len - remaining.len)))); - } - - remaining = remaining[ascii_u16_vector_size..]; - } - } - - for (remaining, 0..) |char, i| { - switch (char) { - '$', '`', '\\', 0...0x20 - 1, 128...std.math.maxInt(u16) => { - return @as(u32, @truncate(i + (slice.len - remaining.len))); - }, - - else => {}, - } - } - - return null; -} - /// Convert potentially ill-formed UTF-8 or UTF-16 bytes to a Unicode Codepoint. /// - Invalid codepoints are replaced with `zero` parameter /// - Null bytes return 0 @@ -5097,31 +4654,6 @@ pub fn lengthOfLeadingWhitespaceASCII(slice: string) usize { return slice.len; } -pub fn containsNonBmpCodePointUTF16(_text: []const u16) bool { - const n = _text.len; - if (n > 0) { - var i: usize = 0; - const text = _text[0 .. n - 1]; - while (i < n - 1) : (i += 1) { - switch (text[i]) { - // Check for a high surrogate - 0xD800...0xDBFF => { - // Check for a low surrogate - switch (text[i + 1]) { - 0xDC00...0xDFFF => { - return true; - }, - else => {}, - } - }, - else => {}, - } - } - } - - return false; -} - pub fn join(slices: []const string, delimiter: string, allocator: std.mem.Allocator) !string { return try std.mem.join(allocator, delimiter, slices); } @@ -5238,6 +4770,75 @@ pub fn NewCodePointIterator(comptime CodePointType_: type, comptime zeroValue: c return Iterator{ .bytes = str, .i = i, .c = zeroValue }; } + const SkipResult = enum { + eof, + found, + not_found, + }; + + /// Advance forward until the scalar function returns true. + /// THe simd function is "best effort" and expected to sometimes return a result which `scalar` will return false for. + /// This is because we don't decode UTF-8 in the SIMD code path. + pub fn skip(it: *const Iterator, cursor: *Cursor, simd: *const fn (input: []const u8) ?usize, scalar: *const fn (CodePointType) bool) SkipResult { + while (true) { + // 1. Get current position. Check for EOF. + const current_byte_index = cursor.i; + if (current_byte_index >= it.bytes.len) { + return .not_found; // Reached end without finding + } + + // 2. Decode the *next* character using the standard iterator method. + if (!next(it, cursor)) { + return .not_found; // Reached end or error during decode + } + + // 3. Check if the character just decoded matches the scalar condition. + if (scalar(it.c)) { + return .found; // Found it! 
+ } + + // 4. Optimization: Can we skip ahead using SIMD? + // Scan starting from the byte *after* the character we just decoded. + const next_scan_start_index = cursor.i; + if (next_scan_start_index >= it.bytes.len) { + // Just decoded the last character and it didn't match. + return .not_found; + } + const remaining_slice = it.bytes[next_scan_start_index..]; + if (remaining_slice.len == 0) { + return .not_found; + } + + // Ask SIMD for the next potential candidate. + if (simd(remaining_slice)) |pos| { + // SIMD found a potential candidate `pos` bytes ahead. + if (pos > 0) { + // Jump the byte index to the start of the potential candidate. + cursor.i = next_scan_start_index + @as(u32, @intCast(pos)); + // Reset width so next() decodes correctly from the jumped position. + cursor.width = 0; + // Loop will continue, starting the decode from the new cursor.i. + continue; + } + // If pos == 0, SIMD suggests the *immediate next* character. + // No jump needed, just let the loop iterate naturally. + // Fallthrough to the end of the loop. + } else { + // SIMD found no potential candidates in the rest of the string. + // Since the SIMD search set is a superset of the scalar check set, + // we can guarantee that no character satisfying `scalar` exists further. + // Since the current character (decoded in step 2) also didn't match, + // we can conclude the target character is not found. + return .not_found; + } + + // If we reach here, it means SIMD returned pos=0. + // Loop continues to the next iteration, processing the immediate next char. + } // End while true + + unreachable; + } + pub inline fn next(it: *const Iterator, cursor: *Cursor) bool { const pos: u32 = @as(u32, cursor.width) + cursor.i; if (pos >= it.bytes.len) { @@ -5527,6 +5128,16 @@ pub fn leftHasAnyInRight(to_check: []const string, against: []const string) bool return false; } +/// Returns true if the input has the prefix and the next character is not an identifier character +/// Also returns true if the input ends with the prefix (i.e. 
EOF) +/// +/// Example: +/// ```zig +/// // returns true +/// hasPrefixWithWordBoundary("console.log", "console") // true +/// hasPrefixWithWordBoundary("console.log", "log") // false +/// hasPrefixWithWordBoundary("console.log", "console.log") // true +/// ``` pub fn hasPrefixWithWordBoundary(input: []const u8, comptime prefix: []const u8) bool { if (hasPrefixComptime(input, prefix)) { if (input.len == prefix.len) return true; @@ -5708,7 +5319,6 @@ pub fn mustEscapeYAMLString(contents: []const u8) bool { else => true, }; } - pub fn pathContainsNodeModulesFolder(path: []const u8) bool { return strings.contains(path, comptime std.fs.path.sep_str ++ "node_modules" ++ std.fs.path.sep_str); } diff --git a/test/bundler/bundler_comments.test.ts b/test/bundler/bundler_comments.test.ts new file mode 100644 index 0000000000..7abcd563bc --- /dev/null +++ b/test/bundler/bundler_comments.test.ts @@ -0,0 +1,353 @@ +import { describe } from "bun:test"; +import { itBundled } from "./expectBundled"; + +describe("single-line comments", () => { + itBundled("unix newlines", { + files: { + "/entry.js": `// This is a comment\nconsole.log("hello");\n// Another comment\n`, + }, + onAfterBundle(api) { + const output = api.readFile("/out.js"); + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("windows newlines", { + files: { + "/entry.js": `// This is a comment\r\nconsole.log("hello");\r\n// Another comment\r\n`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("no trailing newline", { + files: { + "/entry.js": `// This is a comment\nconsole.log("hello");\n// No newline at end`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("non-ascii characters", { + files: { + "/entry.js": `// 你好,世界\n// Привет, мир\n// こんにちは世界\nconsole.log("hello");\n`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("emoji", { + files: { + "/entry.js": `// 🚀 🔥 💯\nconsole.log("hello");\n`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("invalid surrogate pair at beginning", { + files: { + "/entry.js": `// \uDC00 invalid surrogate\nconsole.log("hello");\n`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("invalid surrogate pair at end", { + files: { + "/entry.js": `// invalid surrogate \uD800\nconsole.log("hello");\n`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("invalid surrogate pair in middle", { + files: { + "/entry.js": `// invalid \uD800\uDC00\uD800 surrogate\nconsole.log("hello");\n`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("multiple comments on same line", { + files: { + "/entry.js": `const x = 5; // first comment // second comment\nconsole.log(x);\n`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("console.log(x)"); + }, + }); + + itBundled("comment with ASI", { + files: { + "/entry.js": `const x = 5// first comment // second comment\nconsole.log(x)`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("console.log(x)"); + }, + }); + + itBundled("comment at end of file without newline", { + files: { + "/entry.js": `console.log("hello"); //`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("empty comments", { + files: { + "/entry.js": 
`//\n//\nconsole.log("hello");\n//`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("comments with special characters", { + files: { + "/entry.js": `// Comment with \\ backslash\n// Comment with \" quote\n// Comment with \t tab\nconsole.log("hello");\n`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("comments with control characters", { + files: { + "/entry.js": `// Comment with \u0000 NULL\n// Comment with \u0001 SOH\nconsole.log("hello");\n`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("comments with minification", { + files: { + "/entry.js": `// This should be removed\nconsole.log("hello");\n// This too`, + }, + minifyWhitespace: true, + minifySyntax: true, + onAfterBundle(api) { + api.expectFile("/out.js").toEqualIgnoringWhitespace('console.log("hello");'); + }, + }); + + for (const minify of [true, false]) { + itBundled( + `some code and an empty comment without newline preceding ${minify ? "with minification" : "without minification"}`, + { + files: { + "/entry.js": `console.log("hello");//`, + }, + minifyWhitespace: minify, + minifySyntax: minify, + run: { + stdout: "hello", + }, + }, + ); + itBundled(`some code and then only an empty comment ${minify ? "with minification" : "without minification"}`, { + files: { + "/entry.js": `console.log("hello");\n//`, + }, + minifyWhitespace: minify, + minifySyntax: minify, + run: { + stdout: "hello", + }, + }); + itBundled(`only an empty comment ${minify ? "with minification" : "without minification"}`, { + files: { + "/entry.js": `//`, + }, + minifyWhitespace: minify, + minifySyntax: minify, + run: { + stdout: "", + }, + }); + itBundled("only a comment", { + files: { + "/entry.js": `// This is a comment`, + }, + minifyWhitespace: true, + minifySyntax: true, + run: { + stdout: "", + }, + }); + } + + itBundled("trailing //# sourceMappingURL=", { + files: { + "/entry.js": `// This is a comment\nconsole.log("hello");\n//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZXhhbXBsZS5qcyIsInNvdXJjZSI6Ii8vZXhhbXBsZS5qcyJ9`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("trailing //# sourceMappingURL= with == at end", { + files: { + "/entry.js": `// This is a comment\nconsole.log("hello");\n//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZXhhbXBsZS5qcyIsInNvdXJjZSI6Ii8vZXhhbXBsZS5qcyJ9==`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("trailing //# sourceMappingURL= with = at end", { + files: { + "/entry.js": `// This is a comment\nconsole.log("hello");\n//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZXhhbXBsZS5qcyIsInNvdXJjZSI6Ii8vZXhhbXBsZS5qcyJ9=`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("leading //# sourceMappingURL= with = at end", { + files: { + "/entry.js": `//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZXhhbXBsZS5qcyIsInNvdXJjZSI6Ii8vZXhhbXBsZS5qcyJ9=\n// This is a comment\nconsole.log("hello");`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("leading trailing newline //# sourceMappingURL= with = at end", { + files: { + "/entry.js": `//# 
sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZXhhbXBsZS5qcyIsInNvdXJjZSI6Ii8vZXhhbXBsZS5qcyJ9=\n// This is a comment\nconsole.log("hello");\n`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("leading newline and sourcemap, trailing newline //# sourceMappingURL= with = at end", { + files: { + "/entry.js": `\n//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiZXhhbXBsZS5qcyIsInNvdXJjZSI6Ii8vZXhhbXBsZS5qcyJ9=\n// This is a comment\nconsole.log("hello");\n`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("__PURE__ comment in single-line comment basic", { + files: { + "/entry.js": `//#__PURE__\nconsole.log("hello");`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").not.toContain("hello"); + }, + }); + + itBundled("__PURE__ comment in single-line comment with spaces", { + files: { + "/entry.js": `// #__PURE__ \nconsole.log("hello");`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").not.toContain("hello"); + }, + }); + + itBundled("__PURE__ comment in single-line comment with text before", { + files: { + "/entry.js": `// some text #__PURE__\nconsole.log("hello");`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").not.toContain("hello"); + }, + }); + + itBundled("__PURE__ comment in single-line comment with text after", { + files: { + "/entry.js": `// #__PURE__ some text\nconsole.log("hello");`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").not.toContain("hello"); + }, + }); + + itBundled("__PURE__ comment in single-line comment with unicode characters", { + files: { + "/entry.js": `// 你好 #__PURE__ 世界\nconsole.log("hello");`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").not.toContain("hello"); + }, + }); + + itBundled("__PURE__ comment in single-line comment with emoji", { + files: { + "/entry.js": `// 🚀 #__PURE__ 🔥\nconsole.log("hello");`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").not.toContain("hello"); + }, + }); + + itBundled("__PURE__ comment in single-line comment with invalid surrogate pair", { + files: { + "/entry.js": `// \uD800 #__PURE__ \uDC00\nconsole.log("hello");`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").not.toContain("hello"); + }, + }); + + itBundled("multiple __PURE__ comments in single-line comments", { + files: { + "/entry.js": `//#__PURE__\nconsole.log("hello");\n//#__PURE__\nconsole.log("world");`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").not.toContain("hello"); + api.expectFile("/out.js").not.toContain("world"); + }, + }); + + itBundled("__PURE__ comment in single-line comment with minification", { + files: { + "/entry.js": `//#__PURE__\nconsole.log("hello");`, + }, + minifyWhitespace: true, + minifySyntax: true, + onAfterBundle(api) { + api.expectFile("/out.js").not.toContain("hello"); + }, + }); + + itBundled("__PURE__ comment in single-line comment with windows newlines", { + files: { + "/entry.js": `//#__PURE__\r\nconsole.log("hello");`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").not.toContain("hello"); + }, + }); + + itBundled("__PURE__ comment in single-line comment at end of file", { + files: { + "/entry.js": `console.log("hello");\n//#__PURE__`, + }, + onAfterBundle(api) { + api.expectFile("/out.js").toContain("hello"); + }, + }); + + itBundled("__PURE__ comment in single-line comment in middle of a statement", { + files: { + "/entry.js": `console.log(//#__PURE__\n123);`, + }, + run: { + stdout: "123", + }, 
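+    // The pragma comment sits in the middle of an argument list; scanning it must
+    // stop at the newline so the 123 on the following line still parses and prints.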
+ }); +}); diff --git a/test/bundler/bundler_jsx.test.ts b/test/bundler/bundler_jsx.test.ts index fbdcf18ef4..8afaf3c991 100644 --- a/test/bundler/bundler_jsx.test.ts +++ b/test/bundler/bundler_jsx.test.ts @@ -204,7 +204,6 @@ describe("bundler", () => { `, }); itBundledDevAndProd("jsx/Classic", { - todo: true, files: { "/index.jsx": /* js*/ ` import { print } from 'bun-test-helpers' @@ -226,7 +225,6 @@ describe("bundler", () => { }, }); itBundledDevAndProd("jsx/ClassicPragma", { - todo: true, files: { "/index.jsx": /* js*/ ` // @jsx fn @@ -298,7 +296,6 @@ describe("bundler", () => { `, }); itBundledDevAndProd("jsx/Factory", { - todo: true, files: { "/index.jsx": /* js*/ ` const h = () => 'hello' @@ -322,7 +319,6 @@ describe("bundler", () => { }, }); itBundledDevAndProd("jsx/FactoryImport", { - todo: false, files: { "/index.jsx": /* js*/ ` import { h, fragment } from './jsx.ts'; @@ -353,7 +349,6 @@ describe("bundler", () => { }, }); itBundledDevAndProd("jsx/FactoryImportExplicitReactDefault", { - todo: false, files: { "/index.jsx": /* js*/ ` import { print } from 'bun-test-helpers' @@ -374,7 +369,6 @@ describe("bundler", () => { }, }); itBundledDevAndProd("jsx/FactoryImportExplicitReactDefaultExternal", { - todo: false, files: { "/index.jsx": /* js*/ ` import { print } from 'bun-test-helpers' @@ -397,4 +391,24 @@ describe("bundler", () => { expect(file).toContain('import * as React from "react"'); }, }); + itBundled("jsx/jsxImportSource pragma works", { + files: { + "/index.jsx": /* jsx */ ` + // @jsxImportSource hello + console.log(
<div>Hello World</div>
); + `, + "/node_modules/hello/jsx-dev-runtime.js": /* js */ ` + export function jsxDEV(type, props, key) { + return { + $$typeof: Symbol("hello_jsxDEV"), type, props, key + } + } + `, + }, + outdir: "/out", + target: "browser", + run: { + stdout: `{\n $$typeof: Symbol(hello_jsxDEV),\n type: \"div\",\n props: {\n children: \"Hello World\",\n },\n key: undefined,\n}`, + }, + }); }); diff --git a/test/internal/ban-words.test.ts b/test/internal/ban-words.test.ts index 5cfd66da65..818105959b 100644 --- a/test/internal/ban-words.test.ts +++ b/test/internal/ban-words.test.ts @@ -12,7 +12,7 @@ const words: Record "std.debug.assert": { reason: "Use bun.assert instead", limit: 26 }, "std.debug.dumpStackTrace": { reason: "Use bun.handleErrorReturnTrace or bun.crash_handler.dumpStackTrace instead" }, "std.debug.print": { reason: "Don't let this be committed", limit: 0 }, - "std.mem.indexOfAny(u8": { reason: "Use bun.strings.indexOfAny", limit: 3 }, + "std.mem.indexOfAny(u8": { reason: "Use bun.strings.indexOfAny", limit: 2 }, "std.StringArrayHashMapUnmanaged(": { reason: "bun.StringArrayHashMapUnmanaged has a faster `eql`", limit: 12 }, "std.StringArrayHashMap(": { reason: "bun.StringArrayHashMap has a faster `eql`", limit: 1 }, "std.StringHashMapUnmanaged(": { reason: "bun.StringHashMapUnmanaged has a faster `eql`" },
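A minimal usage sketch of the new `CodepointIterator.skip` helper added in src/string_immutable.zig above: the SIMD callback is a best-effort candidate finder that may over-approximate (here `indexOfNewlineOrNonASCII`, which also fires on non-ASCII bytes), and `skip` re-decodes each candidate and confirms it with the exact scalar predicate before reporting `.found`. The names `isLineBreak`, `lineBreakCandidate`, and `findLineBreak`, the `i32` codepoint type, and the `@import("bun")` path are assumptions for illustration, not part of the patch.

    const bun = @import("bun"); // import path assumed; wire up however the tree expects
    const strings = bun.strings;

    /// Exact scalar predicate on a decoded codepoint.
    fn isLineBreak(c: i32) bool {
        return c == '\n' or c == '\r' or c == 0x2028 or c == 0x2029;
    }

    /// Best-effort SIMD candidate finder: may flag positions the scalar check
    /// rejects (e.g. any non-ASCII byte), but never skips past a real line break.
    fn lineBreakCandidate(input: []const u8) ?usize {
        return strings.indexOfNewlineOrNonASCII(input, 0);
    }

    /// Returns the byte index of the first line break in `source`, or null.
    fn findLineBreak(source: []const u8) ?usize {
        const iter = strings.CodepointIterator.init(source);
        var cursor = strings.CodepointIterator.Cursor{};
        return switch (iter.skip(&cursor, &lineBreakCandidate, &isLineBreak)) {
            .found => @as(usize, cursor.i),
            else => null,
        };
    }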