Compare commits

...

1 Commit

Author SHA1 Message Date
Claude Bot
7183dc224e fix: implement buffer.transcode() from node:buffer
Implements the `transcode` function which was previously stubbed
with a "Not implemented" error. Supports all encoding pairs that
Node.js supports: utf8, ascii, latin1, and ucs2/utf16le.

Uses simdutf for fast SIMD-accelerated conversions where possible,
with custom paths for ASCII/Latin-1 substitution (replacing
unmappable characters with '?').

Closes #24235

Co-Authored-By: Claude <noreply@anthropic.com>
2026-02-20 04:45:22 +00:00
3 changed files with 448 additions and 6 deletions

View File

@@ -3,6 +3,7 @@
#include "root.h"
#include "../bindings/JSBuffer.h"
#include "../bindings/JSBufferEncodingType.h"
#include "ErrorCode.h"
#include "JavaScriptCore/PageCount.h"
#include "NodeValidator.h"
@@ -125,6 +126,349 @@ JSC_DEFINE_HOST_FUNCTION(jsBufferConstructorFunction_isAscii,
BUN_DECLARE_HOST_FUNCTION(jsFunctionResolveObjectURL);
// The encodings buffer.transcode() can convert between. This is deliberately
// limited to the four that Node.js transcode() supports.
// Enumerator values are spelled out so the numbering is visible at a glance.
enum class TranscodeEncoding : uint8_t {
    ASCII = 0,
    LATIN1 = 1,
    UTF8 = 2,
    UCS2 = 3, // UTF-16LE
};
// Maps a JS encoding argument onto the subset of encodings transcode() handles.
//
// Returns std::nullopt when parseEnumeration<BufferEncodingType> yields no
// value, and also when it yields an encoding other than ascii, latin1, utf8,
// ucs2 or utf16le. "ucs2" and "utf16le" both map to TranscodeEncoding::UCS2.
static std::optional<TranscodeEncoding> parseTranscodeEncoding(JSC::JSGlobalObject& globalObject, JSValue value)
{
    const auto parsed = parseEnumeration<BufferEncodingType>(globalObject, value);
    if (!parsed)
        return std::nullopt;

    const BufferEncodingType bufferEncoding = *parsed;
    if (bufferEncoding == BufferEncodingType::ascii)
        return TranscodeEncoding::ASCII;
    if (bufferEncoding == BufferEncodingType::latin1)
        return TranscodeEncoding::LATIN1;
    if (bufferEncoding == BufferEncodingType::utf8)
        return TranscodeEncoding::UTF8;
    if (bufferEncoding == BufferEncodingType::ucs2 || bufferEncoding == BufferEncodingType::utf16le)
        return TranscodeEncoding::UCS2;

    // Any other Buffer encoding is not supported by transcode().
    return std::nullopt;
}
// Decodes the UTF-8 sequence starting at bytes[0] and returns how many input
// bytes it occupies. `remaining` is the number of readable bytes and must be
// at least 1.
//
// A malformed sequence is reported as U+FFFD with a length of 1, so the caller
// always makes progress and resynchronises on the following byte. Malformed
// means: a stray continuation byte, an invalid lead byte, a sequence truncated
// by the end of input, a non-continuation byte where one is required, an
// overlong form, a surrogate, or a value above U+10FFFF.
static size_t transcodeDecodeUtf8Lossy(const uint8_t* bytes, size_t remaining, uint32_t& codepoint)
{
    const uint8_t lead = bytes[0];
    if (lead < 0x80) {
        codepoint = lead;
        return 1;
    }

    size_t length = 0;
    uint32_t value = 0;
    uint32_t minimum = 0; // smallest code point that legitimately needs `length` bytes
    if ((lead & 0xE0) == 0xC0) {
        length = 2;
        value = lead & 0x1F;
        minimum = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
        length = 3;
        value = lead & 0x0F;
        minimum = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
        length = 4;
        value = lead & 0x07;
        minimum = 0x10000;
    }

    // length == 0 means the byte cannot start a sequence (continuation byte or 0xF8..0xFF).
    bool valid = length != 0 && length <= remaining;
    for (size_t i = 1; valid && i < length; i++) {
        const uint8_t next = bytes[i];
        valid = (next & 0xC0) == 0x80;
        value = (value << 6) | (next & 0x3F);
    }
    if (valid)
        valid = value >= minimum && value <= 0x10FFFF && (value < 0xD800 || value > 0xDFFF);

    if (!valid) {
        codepoint = 0xFFFD;
        return 1;
    }
    codepoint = value;
    return length;
}

// Shared implementation for UTF-8 → single-byte encodings. Emits one output
// byte per decoded code point: the code point itself when it is
// <= maxCodepoint, otherwise '?'.
//
// The output length is computed with the same decoder that later fills the
// buffer. This guarantees every byte of the uninitialized allocation is
// written, even for malformed input (a length estimate that assumes valid
// UTF-8 can disagree with the decoder and leave trailing bytes unwritten).
//
// Returns nullptr when the buffer allocation fails.
static JSC::JSUint8Array* transcodeUtf8ToSingleByte(JSC::JSGlobalObject* globalObject, const char* source, size_t sourceLength, uint32_t maxCodepoint)
{
    const auto* bytes = reinterpret_cast<const uint8_t*>(source);

    // Pass 1: count code points (== output bytes).
    size_t outputLength = 0;
    for (size_t offset = 0; offset < sourceLength; outputLength++) {
        uint32_t ignored = 0;
        offset += transcodeDecodeUtf8Lossy(bytes + offset, sourceLength - offset, ignored);
    }

    auto* result = WebCore::createUninitializedBuffer(globalObject, outputLength);
    if (!result)
        return nullptr;

    // Pass 2: decode again and write the substituted bytes.
    auto* out = result->typedVector();
    size_t written = 0;
    for (size_t offset = 0; offset < sourceLength;) {
        uint32_t codepoint = 0;
        offset += transcodeDecodeUtf8Lossy(bytes + offset, sourceLength - offset, codepoint);
        out[written++] = (codepoint <= maxCodepoint) ? static_cast<uint8_t>(codepoint) : '?';
    }
    return result;
}

// Transcode UTF-8 to ASCII: decode each codepoint; if > 0x7F, replace with '?'.
// Malformed input bytes also become '?'. Returns nullptr if allocation fails.
static JSC::JSUint8Array* transcodeUtf8ToAscii(JSC::JSGlobalObject* globalObject, const char* source, size_t sourceLength)
{
    return transcodeUtf8ToSingleByte(globalObject, source, sourceLength, 0x7F);
}

// Transcode UTF-8 to Latin-1: decode each codepoint; if > 0xFF, replace with '?'.
// Malformed input bytes also become '?'. Returns nullptr if allocation fails.
static JSC::JSUint8Array* transcodeUtf8ToLatin1(JSC::JSGlobalObject* globalObject, const char* source, size_t sourceLength)
{
    return transcodeUtf8ToSingleByte(globalObject, source, sourceLength, 0xFF);
}
// UCS-2 → ASCII. Produces exactly one output byte per 16-bit input unit:
// units in the ASCII range (<= 0x7F) are copied, everything else is written
// as '?'. `charLength` counts 16-bit units, not bytes.
// Returns nullptr when the output buffer could not be allocated.
static JSC::JSUint8Array* transcodeUcs2ToAscii(JSC::JSGlobalObject* globalObject, const char16_t* source, size_t charLength)
{
    auto* buffer = WebCore::createUninitializedBuffer(globalObject, charLength);
    if (!buffer)
        return nullptr;

    auto* dest = buffer->typedVector();
    size_t index = 0;
    while (index < charLength) {
        const char16_t unit = source[index];
        if (unit <= 0x7F)
            dest[index] = static_cast<uint8_t>(unit);
        else
            dest[index] = '?';
        ++index;
    }
    return buffer;
}
// UCS-2 → Latin-1. Produces exactly one output byte per 16-bit input unit:
// units that fit in a byte (<= 0xFF) are copied, everything else is written
// as '?'. `charLength` counts 16-bit units, not bytes.
// Returns nullptr when the output buffer could not be allocated.
static JSC::JSUint8Array* transcodeUcs2ToLatin1(JSC::JSGlobalObject* globalObject, const char16_t* source, size_t charLength)
{
    auto* buffer = WebCore::createUninitializedBuffer(globalObject, charLength);
    if (!buffer)
        return nullptr;

    auto* dest = buffer->typedVector();
    size_t index = 0;
    while (index < charLength) {
        const char16_t unit = source[index];
        if (unit <= 0xFF)
            dest[index] = static_cast<uint8_t>(unit);
        else
            dest[index] = '?';
        ++index;
    }
    return buffer;
}
// Native implementation of `buffer.transcode(source, fromEnc, toEnc)`.
//
// Arguments (from the call frame):
//   0: source  - must be a Buffer / Uint8Array, otherwise an INVALID_ARG_TYPE
//                error is raised.
//   1: fromEnc - encoding of `source`.
//   2: toEnc   - encoding of the returned Buffer.
//
// Returns a new Buffer holding the re-encoded bytes. An empty source yields an
// empty Buffer before the encodings are inspected. Encodings outside
// ascii/latin1/utf8/ucs2/utf16le raise "Unable to transcode Buffer
// [U_ILLEGAL_ARGUMENT_ERROR]". For UTF-8 <-> UCS-2, a conversion that reports
// zero output units raises "[U_INVALID_CHAR_FOUND]".
JSC_DEFINE_HOST_FUNCTION(jsFunction_transcode,
    (JSGlobalObject * globalObject,
        CallFrame* callFrame))
{
    VM& vm = globalObject->vm();
    auto scope = DECLARE_THROW_SCOPE(vm);

    JSValue sourceValue = callFrame->argument(0);

    // Validate source is Buffer or Uint8Array. Casting to JSUint8Array (rather
    // than any JSArrayBufferView) rejects other typed arrays and DataView,
    // which matches the types named in the error below.
    auto* sourceView = JSC::jsDynamicCast<JSC::JSUint8Array*>(sourceValue);
    if (!sourceView) {
        Bun::ERR::INVALID_ARG_TYPE_INSTANCE(scope, globalObject,
            "source"_s, "Buffer"_s, "Uint8Array"_s, sourceValue);
        return {};
    }

    // Empty input → empty Buffer (checked before the encodings are parsed).
    if (sourceView->byteLength() == 0) {
        return JSValue::encode(WebCore::createEmptyBuffer(globalObject));
    }

    // Parse encodings
    auto fromEncoding = parseTranscodeEncoding(*globalObject, callFrame->argument(1));
    RETURN_IF_EXCEPTION(scope, {});
    auto toEncoding = parseTranscodeEncoding(*globalObject, callFrame->argument(2));
    RETURN_IF_EXCEPTION(scope, {});

    if (!fromEncoding.has_value() || !toEncoding.has_value()) {
        throwException(globalObject, scope,
            createError(globalObject, "Unable to transcode Buffer [U_ILLEGAL_ARGUMENT_ERROR]"_s));
        return {};
    }

    // Capture the data pointer and length only now. Parsing the encodings can
    // throw, so it may execute script (NOTE(review): presumably via string
    // coercion of the argument - confirm). Such script could detach or resize
    // the source, so values read earlier might be stale.
    const char* sourceData = reinterpret_cast<const char*>(sourceView->vector());
    size_t sourceLength = sourceView->byteLength();
    if (!sourceData || sourceLength == 0) {
        return JSValue::encode(WebCore::createEmptyBuffer(globalObject));
    }

    auto from = fromEncoding.value();
    auto to = toEncoding.value();

    JSC::JSUint8Array* resultBuffer = nullptr;

    // Same encoding → copy
    if (from == to) {
        resultBuffer = WebCore::createBuffer(globalObject, reinterpret_cast<const uint8_t*>(sourceData), sourceLength);
        RETURN_IF_EXCEPTION(scope, {});
        return JSValue::encode(resultBuffer);
    }

    switch (from) {
    // ASCII input is handled exactly like Latin-1 input: one byte per character.
    case TranscodeEncoding::ASCII:
    case TranscodeEncoding::LATIN1: {
        switch (to) {
        case TranscodeEncoding::UCS2: {
            // Latin1/ASCII → UCS-2: use simdutf. Every input byte becomes one 16-bit unit.
            auto* result = WebCore::createUninitializedBuffer(globalObject, sourceLength * 2);
            if (!result) {
                RETURN_IF_EXCEPTION(scope, {});
                return {};
            }
            (void)simdutf::convert_latin1_to_utf16le(sourceData, sourceLength,
                reinterpret_cast<char16_t*>(result->typedVector()));
            resultBuffer = result;
            break;
        }
        case TranscodeEncoding::UTF8: {
            // Latin1 → UTF-8: use simdutf
            size_t utf8Length = simdutf::utf8_length_from_latin1(sourceData, sourceLength);
            auto* result = WebCore::createUninitializedBuffer(globalObject, utf8Length);
            if (!result) {
                RETURN_IF_EXCEPTION(scope, {});
                return {};
            }
            (void)simdutf::convert_latin1_to_utf8(sourceData, sourceLength,
                reinterpret_cast<char*>(result->typedVector()));
            resultBuffer = result;
            break;
        }
        case TranscodeEncoding::ASCII: {
            // Latin1 → ASCII: clamp bytes > 0x7F to '?'
            auto* result = WebCore::createUninitializedBuffer(globalObject, sourceLength);
            if (!result) {
                RETURN_IF_EXCEPTION(scope, {});
                return {};
            }
            auto* out = result->typedVector();
            for (size_t i = 0; i < sourceLength; i++) {
                uint8_t byte = static_cast<uint8_t>(sourceData[i]);
                out[i] = (byte <= 0x7F) ? byte : '?';
            }
            resultBuffer = result;
            break;
        }
        case TranscodeEncoding::LATIN1: {
            // ASCII → Latin1: just copy (ASCII is a subset of Latin1)
            resultBuffer = WebCore::createBuffer(globalObject, reinterpret_cast<const uint8_t*>(sourceData), sourceLength);
            break;
        }
        }
        break;
    }
    case TranscodeEncoding::UTF8: {
        switch (to) {
        case TranscodeEncoding::UCS2: {
            // UTF-8 → UCS-2: use simdutf
            size_t utf16Length = simdutf::utf16_length_from_utf8(sourceData, sourceLength);
            auto* result = WebCore::createUninitializedBuffer(globalObject, utf16Length * sizeof(char16_t));
            if (!result) {
                RETURN_IF_EXCEPTION(scope, {});
                return {};
            }
            size_t actual = simdutf::convert_utf8_to_utf16le(sourceData, sourceLength,
                reinterpret_cast<char16_t*>(result->typedVector()));
            // Zero units written for non-empty input is treated as a failed conversion.
            if (actual == 0 && sourceLength > 0) {
                throwException(globalObject, scope,
                    createError(globalObject, "Unable to transcode Buffer [U_INVALID_CHAR_FOUND]"_s));
                return {};
            }
            resultBuffer = result;
            break;
        }
        case TranscodeEncoding::ASCII: {
            resultBuffer = transcodeUtf8ToAscii(globalObject, sourceData, sourceLength);
            break;
        }
        case TranscodeEncoding::LATIN1: {
            resultBuffer = transcodeUtf8ToLatin1(globalObject, sourceData, sourceLength);
            break;
        }
        default:
            break;
        }
        break;
    }
    case TranscodeEncoding::UCS2: {
        // Treat the bytes as 16-bit units; a trailing odd byte is ignored by the division.
        const char16_t* utf16Data = reinterpret_cast<const char16_t*>(sourceData);
        size_t charLength = sourceLength / sizeof(char16_t);
        switch (to) {
        case TranscodeEncoding::UTF8: {
            // UCS-2 → UTF-8: use simdutf
            size_t utf8Length = simdutf::utf8_length_from_utf16le(utf16Data, charLength);
            auto* result = WebCore::createUninitializedBuffer(globalObject, utf8Length);
            if (!result) {
                RETURN_IF_EXCEPTION(scope, {});
                return {};
            }
            size_t actual = simdutf::convert_utf16le_to_utf8(utf16Data, charLength,
                reinterpret_cast<char*>(result->typedVector()));
            // Zero bytes written for non-empty input is treated as a failed conversion.
            if (actual == 0 && charLength > 0) {
                throwException(globalObject, scope,
                    createError(globalObject, "Unable to transcode Buffer [U_INVALID_CHAR_FOUND]"_s));
                return {};
            }
            resultBuffer = result;
            break;
        }
        case TranscodeEncoding::ASCII: {
            resultBuffer = transcodeUcs2ToAscii(globalObject, utf16Data, charLength);
            break;
        }
        case TranscodeEncoding::LATIN1: {
            resultBuffer = transcodeUcs2ToLatin1(globalObject, utf16Data, charLength);
            break;
        }
        default:
            break;
        }
        break;
    }
    }

    // No buffer produced: propagate a pending exception (e.g. from a failed
    // allocation in a helper), otherwise report the pair as untranscodable.
    if (!resultBuffer) {
        RETURN_IF_EXCEPTION(scope, {});
        throwException(globalObject, scope,
            createError(globalObject, "Unable to transcode Buffer [U_ILLEGAL_ARGUMENT_ERROR]"_s));
        return {};
    }

    RETURN_IF_EXCEPTION(scope, {});
    return JSValue::encode(resultBuffer);
}
JSC_DEFINE_HOST_FUNCTION(jsFunctionNotImplemented,
(JSGlobalObject * globalObject,
CallFrame* callFrame))
@@ -203,9 +547,7 @@ DEFINE_NATIVE_MODULE(NodeBuffer)
put(atobI, atobV);
put(btoaI, btoaV);
auto* transcode = InternalFunction::createFunctionThatMasqueradesAsUndefined(vm, globalObject, 1, "transcode"_s, jsFunctionNotImplemented);
put(JSC::Identifier::fromString(vm, "transcode"_s), transcode);
put(JSC::Identifier::fromString(vm, "transcode"_s), JSC::JSFunction::create(vm, globalObject, 3, "transcode"_s, jsFunction_transcode, ImplementationVisibility::Public, NoIntrinsic, jsFunction_transcode));
auto* resolveObjectURL = JSC::JSFunction::create(vm, globalObject, 1, "resolveObjectURL"_s, jsFunctionResolveObjectURL, ImplementationVisibility::Public, NoIntrinsic, jsFunctionResolveObjectURL);

View File

@@ -2188,10 +2188,39 @@ for (let withOverridenBufferWrite of [false, true]) {
});
it("transcode", () => {
  // transcode is implemented as a real function. The earlier expectations that
  // it reported typeof "undefined" and threw "Not implemented" no longer hold
  // and contradicted the assertion below, so they were dropped.
  expect(typeof BufferModule.transcode).toBe("function");

  // Basic UTF-8 to ASCII
  const euroUtf8 = Buffer.from("€", "utf8");
  const asciiResult = BufferModule.transcode(euroUtf8, "utf8", "ascii");
  expect(asciiResult.toString("ascii")).toBe("?");

  // UTF-8 to Latin-1: ě (U+011B) and ☕ (U+2615) are above U+00FF, so both become "?" (0x3f)
  const orig = Buffer.from("těst ☕", "utf8");
  const latin1 = BufferModule.transcode(orig, "utf8", "latin1");
  expect(Array.from(latin1)).toEqual([0x74, 0x3f, 0x73, 0x74, 0x20, 0x3f]);

  // UTF-8 to UCS-2
  const ucs2 = BufferModule.transcode(orig, "utf8", "ucs2");
  expect(Array.from(ucs2)).toEqual([0x74, 0x00, 0x1b, 0x01, 0x73, 0x00, 0x74, 0x00, 0x20, 0x00, 0x15, 0x26]);

  // Round-trip UCS-2 → UTF-8
  const backToUtf8 = BufferModule.transcode(Buffer.from(ucs2), "ucs2", "utf8");
  expect(backToUtf8.toString()).toBe(orig.toString());

  // Empty input
  const empty = BufferModule.transcode(new Uint8Array(), "utf8", "latin1");
  expect(empty.length).toBe(0);

  // Invalid source type
  expect(() => BufferModule.transcode(null, "utf8", "ascii")).toThrow();

  // Invalid encoding
  expect(() => BufferModule.transcode(Buffer.from("a"), "b", "utf8")).toThrow(/U_ILLEGAL_ARGUMENT_ERROR/);

  // Uint8Array support
  const uint8arr = new Uint8Array([...Buffer.from("hä", "latin1")]);
  expect(BufferModule.transcode(uint8arr, "latin1", "utf16le")).toEqual(Buffer.from("hä", "utf16le"));
});
it("Buffer.from (Node.js test/test-buffer-from.js)", () => {

View File

@@ -0,0 +1,71 @@
import { expect, test } from "bun:test";
import * as buffer from "node:buffer";
// transcode must be exposed as a genuine callable, not a value whose typeof is "undefined".
test("buffer.transcode is a function, not undefined", () => {
  const kind = typeof buffer.transcode;
  expect(kind).toBe("function");
});
// "€" has no ASCII representation, so the transcoded output reads back as "?".
test("buffer.transcode converts UTF-8 to ASCII with ? substitution", () => {
  const euro = Buffer.from("€");
  const transcoded = buffer.transcode(euro, "utf8", "ascii");
  const text = transcoded.toString("ascii");
  expect(text).toBe("?");
});
test("buffer.transcode converts UTF-8 to Latin-1 with ? substitution", () => {
  const source = Buffer.from("těst ☕", "utf8");
  const bytes = Array.from(buffer.transcode(source, "utf8", "latin1"));
  // ě (U+011B) and ☕ (U+2615) are both above U+00FF, so neither fits in
  // Latin-1 and each is replaced by "?" (0x3f). The remaining characters are
  // plain ASCII and pass through unchanged.
  const expected = [0x74, 0x3f, 0x73, 0x74, 0x20, 0x3f];
  expect(bytes).toEqual(expected);
});
// Each character becomes one little-endian 16-bit unit (low byte first).
test("buffer.transcode converts UTF-8 to UCS-2", () => {
  const source = Buffer.from("těst ☕", "utf8");
  const bytes = Array.from(buffer.transcode(source, "utf8", "ucs2"));
  const expected = [0x74, 0x00, 0x1b, 0x01, 0x73, 0x00, 0x74, 0x00, 0x20, 0x00, 0x15, 0x26];
  expect(bytes).toEqual(expected);
});
// UTF-8 → UCS-2 → UTF-8 must reproduce the original text.
test("buffer.transcode round-trips UCS-2 to UTF-8", () => {
  const source = Buffer.from("těst ☕", "utf8");
  const asUcs2 = buffer.transcode(source, "utf8", "ucs2");
  const roundTripped = buffer.transcode(Buffer.from(asUcs2), "ucs2", "utf8");
  const expectedText = source.toString();
  expect(roundTripped.toString()).toBe(expectedText);
});
// Converts 4000 euro signs in both directions and compares each result with
// the buffer produced directly by Buffer.from in the target encoding.
test("buffer.transcode handles large data", () => {
  const text = "€".repeat(4000);
  const asUtf8 = Buffer.from(text, "utf8");
  const asUcs2 = Buffer.from(text, "ucs2");

  const utf8ToUcs2 = buffer.transcode(asUtf8, "utf8", "ucs2");
  const ucs2ToUtf8 = buffer.transcode(asUcs2, "ucs2", "utf8");

  expect(Buffer.compare(asUtf8, ucs2ToUtf8)).toBe(0);
  expect(Buffer.compare(asUcs2, utf8ToUcs2)).toBe(0);
});
// A null source is neither a Buffer nor a Uint8Array and must be rejected.
test("buffer.transcode throws on invalid source type", () => {
  const callWithNullSource = () => buffer.transcode(null as any, "utf8", "ascii");
  expect(callWithNullSource).toThrow();
});
// Unknown encoding names on either side must surface U_ILLEGAL_ARGUMENT_ERROR.
test("buffer.transcode throws on unsupported encoding", () => {
  const illegalArgument = /U_ILLEGAL_ARGUMENT_ERROR/;

  const badFrom = () => buffer.transcode(Buffer.from("a"), "b" as any, "utf8");
  expect(badFrom).toThrow(illegalArgument);

  const badBoth = () => buffer.transcode(Buffer.from("a"), "uf8" as any, "b" as any);
  expect(badBoth).toThrow(illegalArgument);
});
// Each case transcodes single-byte text to UTF-16LE and compares it with the
// buffer Buffer.from produces for the same text in utf16le.
test("buffer.transcode ASCII/Latin-1 to UTF-16LE", () => {
  const cases = [
    ["hi", "ascii"],
    ["hi", "latin1"],
    ["hä", "latin1"],
  ] as const;

  for (const [text, encoding] of cases) {
    const actual = buffer.transcode(Buffer.from(text, encoding), encoding, "utf16le");
    expect(actual).toEqual(Buffer.from(text, "utf16le"));
  }
});
// A plain Uint8Array (not a Buffer) is a valid source.
test("buffer.transcode accepts Uint8Array", () => {
  const latin1Bytes = Buffer.from("hä", "latin1");
  const plainArray = new Uint8Array([...latin1Bytes]);
  const actual = buffer.transcode(plainArray, "latin1", "utf16le");
  expect(actual).toEqual(Buffer.from("hä", "utf16le"));
});
// A zero-length source produces a zero-length result.
test("buffer.transcode empty input", () => {
  const emptySource = new Uint8Array();
  const result = buffer.transcode(emptySource, "utf8", "latin1");
  expect(result.length).toBe(0);
});
// Smoke test: transcoding a one-byte buffer from allocUnsafeSlow must not
// throw or crash. allocUnsafeSlow is a factory function, so it is called
// directly rather than with `new`; that also removes the need for `as any`.
test("buffer.transcode doesn't crash with allocUnsafeSlow", () => {
  const slow = buffer.Buffer.allocUnsafeSlow(1);
  expect(() => buffer.transcode(slow, "utf16le", "ucs2")).not.toThrow();
});