From f8a049e9f2f0525b5ee2e2b95f25028bb4b3e25c Mon Sep 17 00:00:00 2001 From: SUZUKI Sosuke Date: Mon, 19 Jan 2026 06:33:04 +0900 Subject: [PATCH] perf(buffer): optimize swap16/swap64 with __builtin_bswap (#26190) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Optimize `Buffer.swap16()` and `Buffer.swap64()` by replacing byte-by-byte swapping loops with `__builtin_bswap16/64` compiler intrinsics. ## Problem `Buffer.swap16` and `Buffer.swap64` were significantly slower than Node.js due to inefficient byte-level operations: - **swap16**: Swapped bytes one at a time in a loop - **swap64**: Used a nested loop with 4 byte swaps per 8-byte element ## Solution Replace the manual byte swapping with `__builtin_bswap16/64` intrinsics, which compile to single CPU instructions (`BSWAP` on x86, `REV` on ARM). Use `memcpy` for loading/storing values to handle potentially unaligned buffers safely. ## Benchmark Results (64KB buffer, Apple M4 Max) | Operation | Bun 1.3.6 | Node.js 24 | This PR | Improvement | |-----------|-----------|------------|---------|-------------| | swap16 | 1.00 µs | 0.57 µs | 0.56 µs | **1.79x faster** | | swap32 | 0.55 µs | 0.77 µs | 0.54 µs | (no change, already fast) | | swap64 | 2.02 µs | 0.58 µs | 0.56 µs | **3.6x faster** | Bun now matches or exceeds Node.js performance for all swap operations. 
## Notes - `swap32` was not modified as the compiler already optimizes the 4-byte swap pattern - All existing tests pass --- src/bun.js/bindings/JSBuffer.cpp | 39 ++++++++++++++------------------ 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/src/bun.js/bindings/JSBuffer.cpp b/src/bun.js/bindings/JSBuffer.cpp index fbc4887698..a39b7b65ff 100644 --- a/src/bun.js/bindings/JSBuffer.cpp +++ b/src/bun.js/bindings/JSBuffer.cpp @@ -1649,8 +1649,8 @@ static JSC::EncodedJSValue jsBufferPrototypeFunction_swap16Body(JSC::JSGlobalObj auto& vm = JSC::getVM(lexicalGlobalObject); auto scope = DECLARE_THROW_SCOPE(vm); - constexpr int elemSize = 2; - int64_t length = static_cast<int64_t>(castedThis->byteLength()); + constexpr size_t elemSize = 2; + size_t length = castedThis->byteLength(); if (length % elemSize != 0) { throwNodeRangeError(lexicalGlobalObject, scope, "Buffer size must be a multiple of 16-bits"_s); return {}; @@ -1661,14 +1661,14 @@ static JSC::EncodedJSValue jsBufferPrototypeFunction_swap16Body(JSC::JSGlobalObj return {}; } - uint8_t* typedVector = castedThis->typedVector(); + uint8_t* data = castedThis->typedVector(); + size_t count = length / elemSize; - for (size_t elem = 0; elem < length; elem += elemSize) { - const size_t right = elem + 1; - - uint8_t temp = typedVector[elem]; - typedVector[elem] = typedVector[right]; - typedVector[right] = temp; + for (size_t i = 0; i < count; i++) { + uint16_t val; + memcpy(&val, data + i * elemSize, sizeof(val)); + val = __builtin_bswap16(val); + memcpy(data + i * elemSize, &val, sizeof(val)); } return JSC::JSValue::encode(castedThis); @@ -1715,7 +1715,7 @@ static JSC::EncodedJSValue jsBufferPrototypeFunction_swap64Body(JSC::JSGlobalObj auto scope = DECLARE_THROW_SCOPE(vm); constexpr size_t elemSize = 8; - int64_t length = static_cast<int64_t>(castedThis->byteLength()); + size_t length = castedThis->byteLength(); if (length % elemSize != 0) { throwNodeRangeError(lexicalGlobalObject, scope, "Buffer size must be a multiple of 
64-bits"_s); return {}; @@ -1726,19 +1726,14 @@ static JSC::EncodedJSValue jsBufferPrototypeFunction_swap64Body(JSC::JSGlobalObj return {}; } - uint8_t* typedVector = castedThis->typedVector(); + uint8_t* data = castedThis->typedVector(); + size_t count = length / elemSize; - constexpr size_t swaps = elemSize / 2; - for (size_t elem = 0; elem < length; elem += elemSize) { - const size_t right = elem + elemSize - 1; - for (size_t k = 0; k < swaps; k++) { - const size_t i = right - k; - const size_t j = elem + k; - - uint8_t temp = typedVector[i]; - typedVector[i] = typedVector[j]; - typedVector[j] = temp; - } + for (size_t i = 0; i < count; i++) { + uint64_t val; + memcpy(&val, data + i * elemSize, sizeof(val)); + val = __builtin_bswap64(val); + memcpy(data + i * elemSize, &val, sizeof(val)); } return JSC::JSValue::encode(castedThis);