perf(buffer): optimize swap16/swap64 with __builtin_bswap (#26190)

## Summary

Optimize `Buffer.swap16()` and `Buffer.swap64()` by replacing byte-by-byte swapping loops with `__builtin_bswap16/64` compiler intrinsics.

## Problem

`Buffer.swap16` and `Buffer.swap64` were significantly slower than Node.js due to inefficient byte-level operations:

- **swap16**: Swapped bytes one at a time in a loop
- **swap64**: Used a nested loop with 4 byte swaps per 8-byte element

## Solution

Replace the manual byte swapping with `__builtin_bswap16/64` intrinsics, which compile to single CPU instructions (`BSWAP` on x86, `REV` on ARM). Use `memcpy` for loading/storing values to handle potentially unaligned buffers safely.

## Benchmark Results (64KB buffer, Apple M4 Max)

| Operation | Bun 1.3.6 | Node.js 24 | This PR | Improvement |
|-----------|-----------|------------|---------|-------------|
| swap16 | 1.00 µs | 0.57 µs | 0.56 µs | **1.79x faster** |
| swap32 | 0.55 µs | 0.77 µs | 0.54 µs | (no change, already fast) |
| swap64 | 2.02 µs | 0.58 µs | 0.56 µs | **3.6x faster** |

Bun now matches or exceeds Node.js performance for all swap operations.

## Notes

- `swap32` was not modified as the compiler already optimizes the 4-byte swap pattern
- All existing tests pass
2026-02-02 15:08:46 +00:00 · 2026-01-19 06:33:04 +09:00
parent 12a45b7cbf
commit f8a049e9f2
1 changed file with 17 additions and 22 deletions
--- a/src/bun.js/bindings/JSBuffer.cpp
+++ b/src/bun.js/bindings/JSBuffer.cpp
@@ -1649,8 +1649,8 @@ static JSC::EncodedJSValue jsBufferPrototypeFunction_swap16Body(JSC::JSGlobalObj
    auto& vm = JSC::getVM(lexicalGlobalObject);
    auto scope = DECLARE_THROW_SCOPE(vm);

-    constexpr int elemSize = 2;
-    int64_t length = static_cast<int64_t>(castedThis->byteLength());
+    constexpr size_t elemSize = 2;
+    size_t length = castedThis->byteLength();
    if (length % elemSize != 0) {
        throwNodeRangeError(lexicalGlobalObject, scope, "Buffer size must be a multiple of 16-bits"_s);
        return {};
@@ -1661,14 +1661,14 @@ static JSC::EncodedJSValue jsBufferPrototypeFunction_swap16Body(JSC::JSGlobalObj
        return {};
    }

-    uint8_t* typedVector = castedThis->typedVector();
+    uint8_t* data = castedThis->typedVector();
+    size_t count = length / elemSize;

-    for (size_t elem = 0; elem < length; elem += elemSize) {
-        const size_t right = elem + 1;
-
-        uint8_t temp = typedVector[elem];
-        typedVector[elem] = typedVector[right];
-        typedVector[right] = temp;
+    for (size_t i = 0; i < count; i++) {
+        uint16_t val;
+        memcpy(&val, data + i * elemSize, sizeof(val));
+        val = __builtin_bswap16(val);
+        memcpy(data + i * elemSize, &val, sizeof(val));
    }

    return JSC::JSValue::encode(castedThis);
@@ -1715,7 +1715,7 @@ static JSC::EncodedJSValue jsBufferPrototypeFunction_swap64Body(JSC::JSGlobalObj
    auto scope = DECLARE_THROW_SCOPE(vm);

    constexpr size_t elemSize = 8;
-    int64_t length = static_cast<int64_t>(castedThis->byteLength());
+    size_t length = castedThis->byteLength();
    if (length % elemSize != 0) {
        throwNodeRangeError(lexicalGlobalObject, scope, "Buffer size must be a multiple of 64-bits"_s);
        return {};
@@ -1726,19 +1726,14 @@ static JSC::EncodedJSValue jsBufferPrototypeFunction_swap64Body(JSC::JSGlobalObj
        return {};
    }

-    uint8_t* typedVector = castedThis->typedVector();
+    uint8_t* data = castedThis->typedVector();
+    size_t count = length / elemSize;

-    constexpr size_t swaps = elemSize / 2;
-    for (size_t elem = 0; elem < length; elem += elemSize) {
-        const size_t right = elem + elemSize - 1;
-        for (size_t k = 0; k < swaps; k++) {
-            const size_t i = right - k;
-            const size_t j = elem + k;
-
-            uint8_t temp = typedVector[i];
-            typedVector[i] = typedVector[j];
-            typedVector[j] = temp;
-        }
+    for (size_t i = 0; i < count; i++) {
+        uint64_t val;
+        memcpy(&val, data + i * elemSize, sizeof(val));
+        val = __builtin_bswap64(val);
+        memcpy(data + i * elemSize, &val, sizeof(val));
    }

    return JSC::JSValue::encode(castedThis);