diff --git a/src/bun.js/bindings/ErrorCode.cpp b/src/bun.js/bindings/ErrorCode.cpp index 26d7664ef5..4f2d6eb32f 100644 --- a/src/bun.js/bindings/ErrorCode.cpp +++ b/src/bun.js/bindings/ErrorCode.cpp @@ -302,16 +302,9 @@ WTF::String ERR_OUT_OF_RANGE(JSC::ThrowScope& scope, JSC::JSGlobalObject* global namespace ERR { -JSC::EncodedJSValue INVALID_ARG_TYPE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, const WTF::String& val_arg_name, const WTF::String& val_expected_type, JSC::JSValue val_actual_value) +JSC::EncodedJSValue INVALID_ARG_TYPE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, WTF::ASCIILiteral arg_name, WTF::ASCIILiteral expected_type, JSC::JSValue val_actual_value) { - auto arg_name = val_arg_name.span8(); - ASSERT(WTF::charactersAreAllASCII(arg_name)); - auto arg_kind = String(arg_name).startsWith("options."_s) ? "property"_s : "argument"_s; - - auto expected_type = val_expected_type.span8(); - ASSERT(WTF::charactersAreAllASCII(expected_type)); - auto ty_first_char = expected_type[0]; auto ty_kind = ty_first_char >= 'A' && ty_first_char <= 'Z' ? "an instance of"_s : "of type"_s; @@ -322,15 +315,11 @@ JSC::EncodedJSValue INVALID_ARG_TYPE(JSC::ThrowScope& throwScope, JSC::JSGlobalO throwScope.throwException(globalObject, createError(globalObject, ErrorCode::ERR_INVALID_ARG_TYPE, message)); return {}; } -JSC::EncodedJSValue INVALID_ARG_TYPE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, JSC::JSValue val_arg_name, const WTF::String& val_expected_type, JSC::JSValue val_actual_value) +JSC::EncodedJSValue INVALID_ARG_TYPE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, JSC::JSValue val_arg_name, WTF::ASCIILiteral expected_type, JSC::JSValue val_actual_value) { auto arg_name = val_arg_name.toWTFString(globalObject); RETURN_IF_EXCEPTION(throwScope, {}); - - auto arg_kind = String(arg_name).startsWith("options."_s) ? 
"property"_s : "argument"_s; - - auto expected_type = val_expected_type.span8(); - ASSERT(WTF::charactersAreAllASCII(expected_type)); + auto arg_kind = arg_name.startsWith("options."_s) ? "property"_s : "argument"_s; auto ty_first_char = expected_type[0]; auto ty_kind = ty_first_char >= 'A' && ty_first_char <= 'Z' ? "an instance of"_s : "of type"_s; @@ -408,15 +397,9 @@ JSC::EncodedJSValue OUT_OF_RANGE(JSC::ThrowScope& throwScope, JSC::JSGlobalObjec return {}; } -JSC::EncodedJSValue INVALID_ARG_VALUE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, const WTF::String& name, JSC::JSValue value, const WTF::String& reason) +JSC::EncodedJSValue INVALID_ARG_VALUE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, WTF::ASCIILiteral name, JSC::JSValue value, const WTF::String& reason) { - ASCIILiteral type; - { - auto sp = name.span8(); - auto str = std::string_view((const char*)(sp.data()), sp.size()); - auto has = str.find('.') == std::string::npos; - type = has ? "property"_s : "argument"_s; - } + ASCIILiteral type = String(name).find('.') != notFound ? 
"property"_s : "argument"_s; auto value_string = JSValueToStringSafe(globalObject, value); RETURN_IF_EXCEPTION(throwScope, {}); diff --git a/src/bun.js/bindings/ErrorCode.h b/src/bun.js/bindings/ErrorCode.h index 39c1d0f963..addcd371ba 100644 --- a/src/bun.js/bindings/ErrorCode.h +++ b/src/bun.js/bindings/ErrorCode.h @@ -75,14 +75,14 @@ enum Bound { namespace ERR { -JSC::EncodedJSValue INVALID_ARG_TYPE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, const WTF::String& val_arg_name, const WTF::String& val_expected_type, JSC::JSValue val_actual_value); -JSC::EncodedJSValue INVALID_ARG_TYPE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, JSC::JSValue val_arg_name, const WTF::String& val_expected_type, JSC::JSValue val_actual_value); +JSC::EncodedJSValue INVALID_ARG_TYPE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, WTF::ASCIILiteral arg_name, WTF::ASCIILiteral expected_type, JSC::JSValue val_actual_value); /* literals are taken by value, identical to the definition in ErrorCode.cpp */ +JSC::EncodedJSValue INVALID_ARG_TYPE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, JSC::JSValue arg_name, WTF::ASCIILiteral expected_type, JSC::JSValue val_actual_value); JSC::EncodedJSValue OUT_OF_RANGE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, const WTF::String& arg_name, size_t lower, size_t upper, JSC::JSValue actual); JSC::EncodedJSValue OUT_OF_RANGE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, JSC::JSValue arg_name, size_t lower, size_t upper, JSC::JSValue actual); JSC::EncodedJSValue OUT_OF_RANGE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, JSC::JSValue arg_name_val, size_t bound_num, Bound bound, JSC::JSValue actual); JSC::EncodedJSValue OUT_OF_RANGE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, JSC::JSValue arg_name_val, const WTF::String& msg, JSC::JSValue actual); JSC::EncodedJSValue OUT_OF_RANGE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, const WTF::String& arg_name_val, const 
WTF::String& msg, JSC::JSValue actual); -JSC::EncodedJSValue INVALID_ARG_VALUE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, const WTF::String& name, JSC::JSValue value, const WTF::String& reason = "is invalid"_s); +JSC::EncodedJSValue INVALID_ARG_VALUE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, WTF::ASCIILiteral name, JSC::JSValue value, const WTF::String& reason = "is invalid"_s); JSC::EncodedJSValue INVALID_ARG_VALUE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, JSC::JSValue name, JSC::JSValue value, const WTF::String& reason = "is invalid"_s); JSC::EncodedJSValue UNKNOWN_ENCODING(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, const WTF::String& encoding); JSC::EncodedJSValue INVALID_STATE(JSC::ThrowScope& throwScope, JSC::JSGlobalObject* globalObject, const WTF::String& statemsg); diff --git a/src/bun.js/bindings/JSStringDecoder.cpp b/src/bun.js/bindings/JSStringDecoder.cpp index 95200ce646..ff6a0c18c0 100644 --- a/src/bun.js/bindings/JSStringDecoder.cpp +++ b/src/bun.js/bindings/JSStringDecoder.cpp @@ -2,6 +2,7 @@ #include "JSBuffer.h" #include #include +#include "JavaScriptCore/ExceptionScope.h" #include "ZigGlobalObject.h" #include "JSDOMOperation.h" #include "JSDOMAttribute.h" @@ -9,8 +10,11 @@ #include "JSDOMConvertEnumeration.h" #include #include "BunClientData.h" +#include "wtf/text/ASCIILiteral.h" #include "wtf/text/StringImpl.h" #include "wtf/unicode/CharacterNames.h" +#include "wtf/SIMDUTF.h" +#include "ErrorCode.h" namespace WebCore { @@ -23,51 +27,80 @@ static JSC_DECLARE_HOST_FUNCTION(jsStringDecoderPrototypeFunction_text); static JSC_DECLARE_CUSTOM_GETTER(jsStringDecoder_lastChar); static JSC_DECLARE_CUSTOM_GETTER(jsStringDecoder_lastNeed); static JSC_DECLARE_CUSTOM_GETTER(jsStringDecoder_lastTotal); +static JSC_DECLARE_CUSTOM_GETTER(jsStringDecoder_encoding); static WTF::String replacementString() { - return WTF::String(std::span { u"\uFFFD", 1 }); } - -static inline 
JSC::EncodedJSValue jsStringDecoderCast(JSGlobalObject* globalObject, JSValue stringDecoderValue) +static WTF::String replacementString2() { - if (LIKELY(jsDynamicCast(stringDecoderValue))) - return JSValue::encode(stringDecoderValue); + return WTF::String(std::span { u"\uFFFD\uFFFD", 2 }); +} +static WTF::String replacementString3() +{ + return WTF::String(std::span { u"\uFFFD\uFFFD\uFFFD", 3 }); +} + +// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a +// continuation byte. +// 0 1 2 3 4 5 6 7 8 9 A B C D E F +// 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +// 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +// 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +// 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +// 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +// 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +// 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +// 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +// 8 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +// 9 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +// A -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +// B -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 +// C 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 +// D 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 +// E 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 +// F 4 4 4 4 4 4 4 4 -2 -2 -2 -2 -2 -2 -2 -2 +int8_t utf8CheckByte(uint8_t byte) +{ + if (byte <= 0x7F) + return 0; // ASCII + else if ((byte >> 5) == 0x06) + return 2; // 2-byte Start + else if ((byte >> 4) == 0x0E) + return 3; // 3-byte Start + else if ((byte >> 3) == 0x1E) + return 4; // 4-byte Start + return (byte >> 6) == 0x02 + ? 
-1 // Continuation + : -2; // Invalid +} + +ALWAYS_INLINE bool isContinuation(uint8_t byte) +{ + return (byte & 0xC0) == 0x80; +} + +static inline JSStringDecoder* jsStringDecoderCast(JSGlobalObject* globalObject, JSValue stringDecoderValue, WTF::ASCIILiteral functionName) +{ + ASSERT(stringDecoderValue); + if (auto cast = jsDynamicCast(stringDecoderValue); LIKELY(cast)) + return cast; auto& vm = globalObject->vm(); auto throwScope = DECLARE_THROW_SCOPE(vm); - if (stringDecoderValue.isEmpty() || stringDecoderValue.isUndefinedOrNull()) { - return JSC::JSValue::encode(jsUndefined()); - } - - if (!stringDecoderValue.isObject()) { - return throwThisTypeError(*globalObject, throwScope, JSStringDecoder::info()->className, "write"); - } - - JSC::JSObject* thisObject = JSC::asObject(stringDecoderValue); - JSStringDecoder* castedThis = nullptr; - auto clientData = WebCore::clientData(vm); - if (JSValue existingDecoderValue = thisObject->getIfPropertyExists(globalObject, clientData->builtinNames().decodePrivateName())) { - castedThis = jsDynamicCast(existingDecoderValue); - } - - if (!castedThis) { - BufferEncodingType encoding = BufferEncodingType::utf8; - if (JSValue encodingValue = thisObject->getIfPropertyExists(globalObject, clientData->builtinNames().encodingPrivateName())) { - if (encodingValue.isString()) { - std::optional opt = parseEnumeration(*globalObject, encodingValue); - if (opt.has_value()) { - encoding = opt.value(); - } - } + if (JSC::JSObject* thisObject = stringDecoderValue.getObject()) { + auto clientData = WebCore::clientData(vm); + JSValue existingDecoderValue = thisObject->getIfPropertyExists(globalObject, clientData->builtinNames().decodePrivateName()); + if (LIKELY(existingDecoderValue)) { + if (auto cast = jsDynamicCast(existingDecoderValue); LIKELY(cast)) + return cast; } - castedThis = JSStringDecoder::create(globalObject->vm(), globalObject, reinterpret_cast(globalObject)->JSStringDecoderStructure(), encoding); - thisObject->putDirect(vm, 
clientData->builtinNames().decodePrivateName(), castedThis, 0); } - return JSValue::encode(castedThis); + throwThisTypeError(*globalObject, throwScope, JSStringDecoder::info()->className, functionName); + return nullptr; } void JSStringDecoder::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObject) @@ -75,54 +108,6 @@ void JSStringDecoder::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObj Base::finishCreation(vm); } -JSC::JSValue JSStringDecoder::fillLast(JSC::VM& vm, JSC::JSGlobalObject* globalObject, uint8_t* bufPtr, uint32_t length) -{ - auto throwScope = DECLARE_THROW_SCOPE(vm); - - if (m_encoding == BufferEncodingType::utf8) { - // utf8CheckExtraBytes - if ((bufPtr[0] & 0xC0) != 0x80) { - m_lastNeed = 0; - RELEASE_AND_RETURN(throwScope, JSC::jsString(vm, replacementString())); - } - if (m_lastNeed > 1 && length > 1) { - if ((bufPtr[1] & 0xC0) != 0x80) { - m_lastNeed = 1; - RELEASE_AND_RETURN(throwScope, JSC::jsString(vm, replacementString())); - } - if (m_lastNeed > 2 && length > 2) { - if ((bufPtr[2] & 0xC0) != 0x80) { - m_lastNeed = 2; - RELEASE_AND_RETURN(throwScope, JSC::jsString(vm, replacementString())); - } - } - } - } - - if (m_lastNeed <= length) { - memmove(m_lastChar + m_lastTotal - m_lastNeed, bufPtr, m_lastNeed); - RELEASE_AND_RETURN(throwScope, JSC::JSValue::decode(Bun__encoding__toString(m_lastChar, m_lastTotal, globalObject, static_cast(m_encoding)))); - } - memmove(m_lastChar + m_lastTotal - m_lastNeed, bufPtr, length); - m_lastNeed -= length; - RELEASE_AND_RETURN(throwScope, JSC::jsEmptyString(vm)); -} - -// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a -// continuation byte. If an invalid byte is detected, -2 is returned. -int8_t utf8CheckByte(uint8_t byte) -{ - if (byte <= 0x7F) - return 0; - else if ((byte >> 5) == 0x06) - return 2; - else if ((byte >> 4) == 0x0E) - return 3; - else if ((byte >> 3) == 0x1E) - return 4; - return (byte >> 6) == 0x02 ? 
-1 : -2; -} - // Checks at most 3 bytes at the end of a Buffer in order to detect an // incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4) // needed to complete the UTF-8 character (if applicable) are returned. @@ -160,6 +145,60 @@ uint8_t JSStringDecoder::utf8CheckIncomplete(uint8_t* bufPtr, uint32_t length, u return 0; } +JSC::JSValue JSStringDecoder::fillLast(JSC::VM& vm, JSC::JSGlobalObject* globalObject, uint8_t* bufPtr, uint32_t length) +{ + auto throwScope = DECLARE_THROW_SCOPE(vm); + + if (m_encoding == BufferEncodingType::utf8) { + // Check if the start has a failing UTF-8 code point. This is checking + // for situations where the a new character starts instead of a + // continuation byte. In this case, lastNeed (offset for decoding the + // rest of bufPtr) needs to be set less than the number of codepoints + // from what lastChar[0] requests since a new character starts. + // Example: + // [ 0xcc ] + [ 0xcc, 0x8c ] + // The first byte is not known to be an error until the second chunk + // comes in, to which the error is just the first 0xcc, and then + // the second two bytes are seen as the valid code point. + uint32_t max = std::min(length, m_lastNeed); + for (uint32_t i = 0; i < max; i++) { + if (!isContinuation(bufPtr[i])) { + // copy the continuation bytes to lastChar, then run it through + // originally this had an abridged version of the utf-8 decoder, + // but doing that is going to be more error prone. 
+ // Example: [ 0xf2, 0x90 ] + [ 0xD0 ] -> '' + '\uFFFD' + '\uFFFD' + // ~~~~~~~~~~ ~~~~ two total errors + // Example: [ 0xf6, 0x90 ] + [ 0xD0 ] -> '' + '\uFFFD\uFFFD' + '\uFFFD' + // ~~~~ ~~~~ ~~~~ three total errors + // ^ 0xF6 is an invalid start byte + uint32_t chars = m_lastTotal - m_lastNeed + i; + memmove(m_lastChar + m_lastTotal - m_lastNeed, bufPtr, i); + m_lastNeed = i; + RELEASE_AND_RETURN(throwScope, JSC::JSValue::decode(Bun__encoding__toString(m_lastChar, chars, globalObject, static_cast(m_encoding)))); + } + } + } + if (m_lastNeed <= length) { + memmove(m_lastChar + m_lastTotal - m_lastNeed, bufPtr, m_lastNeed); + RELEASE_AND_RETURN(throwScope, JSC::JSValue::decode(Bun__encoding__toString(m_lastChar, m_lastTotal, globalObject, static_cast(m_encoding)))); + } + + memmove(m_lastChar + m_lastTotal - m_lastNeed, bufPtr, length); + if (m_encoding == BufferEncodingType::utf8) { + uint32_t lastLastNeed = m_lastNeed; + uint32_t total = utf8CheckIncomplete(m_lastChar, m_lastTotal - lastLastNeed + length, 0); + if (total == 0) { + uint32_t len = m_lastTotal - m_lastNeed + length; + m_lastNeed = length; + RELEASE_AND_RETURN(throwScope, JSC::JSValue::decode(Bun__encoding__toString(m_lastChar, len, globalObject, static_cast(m_encoding)))); + } + m_lastNeed = lastLastNeed; + } + + m_lastNeed -= length; + RELEASE_AND_RETURN(throwScope, JSC::jsEmptyString(vm)); +} + // This is not the exposed text JSC::JSValue JSStringDecoder::text(JSC::VM& vm, JSC::JSGlobalObject* globalObject, uint8_t* bufPtr, uint32_t length, uint32_t offset) { @@ -241,6 +280,8 @@ JSC::JSValue JSStringDecoder::write(JSC::VM& vm, JSC::JSGlobalObject* globalObje RELEASE_AND_RETURN(throwScope, firstHalf); offset = m_lastNeed; m_lastNeed = 0; + if (offset == length) + RELEASE_AND_RETURN(throwScope, firstHalf); JSString* secondHalf = text(vm, globalObject, bufPtr, length, offset).toString(globalObject); RETURN_IF_EXCEPTION(throwScope, JSC::jsUndefined()); @@ -305,11 +346,17 @@ 
JSStringDecoder::end(JSC::VM& vm, JSC::JSGlobalObject* globalObject, uint8_t* bu } case BufferEncodingType::utf8: { if (length == 0) { - RELEASE_AND_RETURN(throwScope, m_lastNeed ? JSC::jsString(vm, replacementString()) : JSC::jsEmptyString(vm)); + RELEASE_AND_RETURN(throwScope, m_lastNeed ? JSC::JSValue::decode(Bun__encoding__toString(m_lastChar, m_lastTotal - m_lastNeed, globalObject, static_cast(m_encoding))) : JSC::jsEmptyString(vm)); } JSString* firstHalf = write(vm, globalObject, bufPtr, length).toString(globalObject); RETURN_IF_EXCEPTION(throwScope, JSC::jsUndefined()); - RELEASE_AND_RETURN(throwScope, m_lastNeed ? JSC::jsString(globalObject, firstHalf, replacementString()) : firstHalf); + RELEASE_AND_RETURN(throwScope, + m_lastNeed + ? JSC::jsString( + globalObject, + firstHalf, + jsCast(JSC::JSValue::decode(Bun__encoding__toString(m_lastChar, m_lastTotal - m_lastNeed, globalObject, static_cast(m_encoding))))) + : firstHalf); } case BufferEncodingType::base64: case BufferEncodingType::base64url: { @@ -373,8 +420,7 @@ static inline JSC::EncodedJSValue jsStringDecoderPrototypeFunction_writeBody(JSC return JSC::JSValue::encode(buffer); } - throwVMTypeError(lexicalGlobalObject, throwScope, "Expected Uint8Array"_s); - return {}; + return Bun::ERR::INVALID_ARG_TYPE(throwScope, lexicalGlobalObject, "buf"_s, "Buffer, TypedArray, or DataView"_s, buffer); } RELEASE_AND_RETURN(throwScope, JSC::JSValue::encode(castedThis->write(vm, lexicalGlobalObject, reinterpret_cast(view->vector()), view->byteLength()))); } @@ -420,70 +466,63 @@ static inline JSC::EncodedJSValue jsStringDecoderPrototypeFunction_textBody(JSC: JSC_DEFINE_HOST_FUNCTION(jsStringDecoderPrototypeFunction_write, (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame)) { - JSValue stringDecoderValue = JSValue::decode(jsStringDecoderCast(globalObject, callFrame->thisValue())); - if (stringDecoderValue.isEmpty() || !stringDecoderValue.isCell()) { - return JSValue::encode(stringDecoderValue); - } - 
JSStringDecoder* castedThis = jsCast(stringDecoderValue); + auto scope = DECLARE_THROW_SCOPE(globalObject->vm()); + JSStringDecoder* castedThis = jsStringDecoderCast(globalObject, callFrame->thisValue(), "write"_s); + RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode({})); return jsStringDecoderPrototypeFunction_writeBody(globalObject, callFrame, castedThis); } JSC_DEFINE_HOST_FUNCTION(jsStringDecoderPrototypeFunction_end, (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame)) { - JSValue stringDecoderValue = JSValue::decode(jsStringDecoderCast(globalObject, callFrame->thisValue())); - if (stringDecoderValue.isEmpty() || !stringDecoderValue.isCell()) { - return JSValue::encode(stringDecoderValue); - } - JSStringDecoder* castedThis = jsCast(stringDecoderValue); + auto scope = DECLARE_THROW_SCOPE(globalObject->vm()); + JSStringDecoder* castedThis = jsStringDecoderCast(globalObject, callFrame->thisValue(), "end"_s); + RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode({})); return jsStringDecoderPrototypeFunction_endBody(globalObject, callFrame, castedThis); } JSC_DEFINE_HOST_FUNCTION(jsStringDecoderPrototypeFunction_text, (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame)) { - JSValue stringDecoderValue = JSValue::decode(jsStringDecoderCast(globalObject, callFrame->thisValue())); - if (stringDecoderValue.isEmpty() || !stringDecoderValue.isCell()) { - return JSValue::encode(stringDecoderValue); - } - JSStringDecoder* castedThis = jsCast(stringDecoderValue); - + auto scope = DECLARE_THROW_SCOPE(globalObject->vm()); + JSStringDecoder* castedThis = jsStringDecoderCast(globalObject, callFrame->thisValue(), "text"_s); + RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode({})); return jsStringDecoderPrototypeFunction_textBody(globalObject, callFrame, castedThis); } static JSC_DEFINE_CUSTOM_GETTER(jsStringDecoder_lastChar, (JSGlobalObject * lexicalGlobalObject, JSC::EncodedJSValue thisValue, PropertyName attributeName)) { auto& vm = 
JSC::getVM(lexicalGlobalObject); - JSValue stringDecoderValue = JSValue::decode(jsStringDecoderCast(lexicalGlobalObject, JSValue::decode(thisValue))); - if (stringDecoderValue.isEmpty() || !stringDecoderValue.isCell()) { - return JSValue::encode(stringDecoderValue); - } - JSStringDecoder* thisObject = jsCast(stringDecoderValue); - auto throwScope = DECLARE_THROW_SCOPE(vm); - auto buffer = ArrayBuffer::create({ thisObject->m_lastChar, 4 }); + auto scope = DECLARE_THROW_SCOPE(vm); + JSStringDecoder* castedThis = jsStringDecoderCast(lexicalGlobalObject, JSC::JSValue::decode(thisValue), "lastChar"_s); /* member name forwarded to throwThisTypeError when `this` is not a decoder */ + RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode({})); + auto buffer = ArrayBuffer::create({ castedThis->m_lastChar, 4 }); auto* globalObject = reinterpret_cast(lexicalGlobalObject); JSC::JSUint8Array* uint8Array = JSC::JSUint8Array::create(lexicalGlobalObject, globalObject->JSBufferSubclassStructure(), WTFMove(buffer), 0, 4); - RELEASE_AND_RETURN(throwScope, JSC::JSValue::encode(uint8Array)); + RELEASE_AND_RETURN(scope, JSC::JSValue::encode(uint8Array)); } static JSC_DEFINE_CUSTOM_GETTER(jsStringDecoder_lastNeed, (JSGlobalObject * lexicalGlobalObject, JSC::EncodedJSValue thisValue, PropertyName attributeName)) { auto& vm = JSC::getVM(lexicalGlobalObject); - JSValue stringDecoderValue = JSValue::decode(jsStringDecoderCast(lexicalGlobalObject, JSValue::decode(thisValue))); - if (stringDecoderValue.isEmpty() || !stringDecoderValue.isCell()) { - return JSValue::encode(stringDecoderValue); - } - JSStringDecoder* thisObject = jsCast(stringDecoderValue); - auto throwScope = DECLARE_THROW_SCOPE(vm); - RELEASE_AND_RETURN(throwScope, JSC::JSValue::encode(JSC::jsNumber(thisObject->m_lastNeed))); + auto scope = DECLARE_THROW_SCOPE(vm); + JSStringDecoder* castedThis = jsStringDecoderCast(lexicalGlobalObject, JSC::JSValue::decode(thisValue), "lastNeed"_s); + RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode({})); + RELEASE_AND_RETURN(scope, 
JSC::JSValue::encode(JSC::jsNumber(castedThis->m_lastNeed))); } static JSC_DEFINE_CUSTOM_GETTER(jsStringDecoder_lastTotal, (JSGlobalObject * lexicalGlobalObject, JSC::EncodedJSValue thisValue, PropertyName attributeName)) { auto& vm = JSC::getVM(lexicalGlobalObject); - JSValue stringDecoderValue = JSValue::decode(jsStringDecoderCast(lexicalGlobalObject, JSValue::decode(thisValue))); - if (stringDecoderValue.isEmpty() || !stringDecoderValue.isCell()) { - return JSValue::encode(stringDecoderValue); - } - JSStringDecoder* thisObject = jsCast(stringDecoderValue); - auto throwScope = DECLARE_THROW_SCOPE(vm); - RELEASE_AND_RETURN(throwScope, JSC::JSValue::encode(JSC::jsNumber(thisObject->m_lastTotal))); + auto scope = DECLARE_THROW_SCOPE(vm); + JSStringDecoder* castedThis = jsStringDecoderCast(lexicalGlobalObject, JSC::JSValue::decode(thisValue), "lastTotal"_s); + RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode({})); + RELEASE_AND_RETURN(scope, JSC::JSValue::encode(JSC::jsNumber(castedThis->m_lastTotal))); +} + +static JSC_DEFINE_CUSTOM_GETTER(jsStringDecoder_encoding, (JSGlobalObject * lexicalGlobalObject, JSC::EncodedJSValue thisValue, PropertyName attributeName)) +{ + auto& vm = JSC::getVM(lexicalGlobalObject); + auto scope = DECLARE_THROW_SCOPE(vm); + JSStringDecoder* castedThis = jsStringDecoderCast(lexicalGlobalObject, JSC::JSValue::decode(thisValue), "encoding"_s); /* member name forwarded to throwThisTypeError when `this` is not a decoder */ + RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode({})); + return JSC::JSValue::encode(WebCore::convertEnumerationToJS(*lexicalGlobalObject, castedThis->m_encoding)); } /* Hash table for prototype */ @@ -492,6 +531,7 @@ static const HashTableValue JSStringDecoderPrototypeTableValues[] { "lastChar"_s, static_cast(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute), NoIntrinsic, { HashTableValue::GetterSetterType, jsStringDecoder_lastChar, 0 } }, { "lastNeed"_s, static_cast(JSC::PropertyAttribute::DontDelete 
| JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute), NoIntrinsic, { HashTableValue::GetterSetterType, jsStringDecoder_lastNeed, 0 } }, { "lastTotal"_s, static_cast(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute), NoIntrinsic, { HashTableValue::GetterSetterType, jsStringDecoder_lastTotal, 0 } }, + { "encoding"_s, static_cast(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute), NoIntrinsic, { HashTableValue::GetterSetterType, jsStringDecoder_encoding, 0 } }, { "write"_s, static_cast(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsStringDecoderPrototypeFunction_write, 1 } }, { "end"_s, static_cast(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsStringDecoderPrototypeFunction_end, 1 } }, { "text"_s, static_cast(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsStringDecoderPrototypeFunction_text, 2 } }, @@ -523,15 +563,17 @@ JSStringDecoderConstructor* JSStringDecoderConstructor::create(JSC::VM& vm, JSC: JSC::EncodedJSValue JSStringDecoderConstructor::construct(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame) { JSC::VM& vm = lexicalGlobalObject->vm(); + auto throwScope = DECLARE_THROW_SCOPE(vm); auto encoding = BufferEncodingType::utf8; - if (callFrame->argumentCount() > 0) { - - auto encoding_ = callFrame->argument(0); - if (encoding_.isString()) { - std::optional opt = parseEnumeration(*lexicalGlobalObject, encoding_); - if (opt.has_value()) { - encoding = opt.value(); - } + auto jsEncoding = callFrame->argument(0); + if (!jsEncoding.isUndefinedOrNull()) { + std::optional opt = parseEnumeration(*lexicalGlobalObject, jsEncoding); + if (opt.has_value()) { + encoding = 
opt.value(); + } else { + WTF::String encodingString = jsEncoding.toWTFString(lexicalGlobalObject); + RETURN_IF_EXCEPTION(throwScope, {}); + return Bun::ERR::UNKNOWN_ENCODING(throwScope, lexicalGlobalObject, encodingString); } } JSValue thisValue = callFrame->newTarget(); @@ -552,6 +594,7 @@ JSC::EncodedJSValue JSStringDecoderConstructor::construct(JSC::JSGlobalObject* l JSObject* thisObject = asObject(callFrame->thisValue()); thisObject->putDirect(vm, clientData->builtinNames().decodePrivateName(), jsObject, JSC::PropertyAttribute::DontEnum | 0); + thisObject->putDirect(vm, clientData->builtinNames().encodingPublicName(), convertEnumerationToJS(*lexicalGlobalObject, encoding), JSC::PropertyAttribute::DontEnum | 0); return JSC::JSValue::encode(thisObject); } diff --git a/src/bun.js/bindings/JSStringDecoder.h b/src/bun.js/bindings/JSStringDecoder.h index ebbfccf0aa..58f1b270f1 100644 --- a/src/bun.js/bindings/JSStringDecoder.h +++ b/src/bun.js/bindings/JSStringDecoder.h @@ -54,16 +54,15 @@ public: JSC::JSValue write(JSC::VM&, JSC::JSGlobalObject*, uint8_t*, uint32_t); JSC::JSValue end(JSC::VM&, JSC::JSGlobalObject*, uint8_t*, uint32_t); - uint8_t m_lastNeed; - uint8_t m_lastTotal; - uint8_t m_lastChar[4]; + uint8_t m_lastNeed = 0; + uint8_t m_lastTotal = 0; + uint8_t m_lastChar[4] = { 0, 0, 0, 0 }; + BufferEncodingType m_encoding = BufferEncodingType::utf8; private: JSC::JSValue fillLast(JSC::VM&, JSC::JSGlobalObject*, uint8_t*, uint32_t); JSC::JSValue text(JSC::VM&, JSC::JSGlobalObject*, uint8_t*, uint32_t, uint32_t); uint8_t utf8CheckIncomplete(uint8_t*, uint32_t, uint32_t); - - BufferEncodingType m_encoding; }; class JSStringDecoderPrototype : public JSC::JSNonFinalObject { diff --git a/src/http/websocket_http_client.zig b/src/http/websocket_http_client.zig index 5918f68078..806b3ba117 100644 --- a/src/http/websocket_http_client.zig +++ b/src/http/websocket_http_client.zig @@ -1131,7 +1131,7 @@ pub fn NewWebSocketClient(comptime ssl: bool) type { // this function 
encodes to UTF-16 if > 127 // so we don't need to worry about latin1 non-ascii code points // we avoid trim since we wanna keep the utf8 validation intact - const utf16_bytes_ = strings.toUTF16AllocNoTrim(bun.default_allocator, data_, true, false) catch { + const utf16_bytes_ = strings.toUTF16Alloc(bun.default_allocator, data_, true, false) catch { this.terminate(ErrorCode.invalid_utf8); return; }; diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 2b1f084d04..286ac906b7 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -1559,20 +1559,14 @@ pub fn nonASCIISequenceLength(first_byte: u8) u3 { pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool, comptime sentinel: bool) !if (sentinel) ?[:0]u16 else ?[]u16 { if (strings.firstNonASCII(bytes)) |i| { const output_: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: { - const trimmed = bun.simdutf.trim.utf8(bytes); - - if (trimmed.len == 0) - break :simd null; - - const out_length = bun.simdutf.length.utf16.from.utf8(trimmed); - + const out_length = bun.simdutf.length.utf16.from.utf8(bytes); if (out_length == 0) break :simd null; var out = try allocator.alloc(u16, out_length + if (sentinel) 1 else 0); log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, out_length }); - const res = bun.simdutf.convert.utf8.to.utf16.with_errors.le(trimmed, out); + const res = bun.simdutf.convert.utf8.to.utf16.with_errors.le(bytes, out); if (res.status == .success) { if (comptime sentinel) { out[out_length] = 0; @@ -1778,104 +1772,6 @@ pub fn toUTF16AllocMaybeBuffered( return .{ output.items, .{0} ** 3, 0 }; } -pub fn toUTF16AllocNoTrim(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool, comptime _: bool) !?[]u16 { - if (strings.firstNonASCII(bytes)) |i| { - const output_: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: { - const out_length = bun.simdutf.length.utf16.from.utf8(bytes); 
- - if (out_length == 0) - break :simd null; - - var out = try allocator.alloc(u16, out_length); - log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, out_length }); - - const res = bun.simdutf.convert.utf8.to.utf16.with_errors.le(bytes, out); - if (res.status == .success) { - return out; - } - - if (comptime fail_if_invalid) { - allocator.free(out); - return error.InvalidByteSequence; - } - - break :simd .{ - .items = out[0..i], - .capacity = out.len, - .allocator = allocator, - }; - } else null; - var output = output_ orelse fallback: { - var list = try std.ArrayList(u16).initCapacity(allocator, i + 2); - list.items.len = i; - strings.copyU8IntoU16(list.items, bytes[0..i]); - break :fallback list; - }; - errdefer output.deinit(); - - var remaining = bytes[i..]; - - { - const replacement = strings.convertUTF8BytesIntoUTF16(remaining); - if (comptime fail_if_invalid) { - if (replacement.fail) { - if (comptime Environment.allow_assert) assert(replacement.code_point == unicode_replacement); - return error.InvalidByteSequence; - } - } - remaining = remaining[@max(replacement.len, 1)..]; - - //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) - switch (replacement.code_point) { - 0...0xffff => |c| { - try output.append(@as(u16, @intCast(c))); - }, - else => |c| { - try output.appendSlice(&[_]u16{ strings.u16Lead(c), strings.u16Trail(c) }); - }, - } - } - - while (strings.firstNonASCII(remaining)) |j| { - const end = output.items.len; - try output.ensureUnusedCapacity(j); - output.items.len += j; - strings.copyU8IntoU16(output.items[end..][0..j], remaining[0..j]); - remaining = remaining[j..]; - - const replacement = strings.convertUTF8BytesIntoUTF16(remaining); - if (comptime fail_if_invalid) { - if (replacement.fail) { - if (comptime Environment.allow_assert) assert(replacement.code_point == unicode_replacement); - return error.InvalidByteSequence; - } - } - remaining = remaining[@max(replacement.len, 1)..]; - - //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 
1 : 2) - switch (replacement.code_point) { - 0...0xffff => |c| { - try output.append(@as(u16, @intCast(c))); - }, - else => |c| { - try output.appendSlice(&[_]u16{ strings.u16Lead(c), strings.u16Trail(c) }); - }, - } - } - - if (remaining.len > 0) { - try output.ensureTotalCapacityPrecise(output.items.len + remaining.len); - - output.items.len += remaining.len; - strings.copyU8IntoU16(output.items[output.items.len - remaining.len ..], remaining); - } - - return output.items; - } - - return null; -} - pub fn utf16CodepointWithFFFD(comptime Type: type, input: Type) UTF16Replacement { return utf16CodepointWithFFFDAndFirstInputChar(Type, input[0], input); } diff --git a/test/js/node/buffer.test.js b/test/js/node/buffer.test.js index 645d29ac9e..08830c4bfc 100644 --- a/test/js/node/buffer.test.js +++ b/test/js/node/buffer.test.js @@ -2971,3 +2971,12 @@ describe("serialization", () => { expect(JSON.parse(string, receiver)).toEqual(buffer); }); }); + +it("should not trim utf-8 start bytes at end of string", () => { + // always worked + const buf1 = Buffer.from("e136e1", "hex"); + expect(buf1.toString("utf-8")).toEqual("\uFFFD6\uFFFD"); + // bugged + const buf2 = Buffer.from("36e1", "hex"); + expect(buf2.toString("utf-8")).toEqual("6\uFFFD"); +}); diff --git a/test/js/node/string_decoder/string-decoder.test.js b/test/js/node/string_decoder/string-decoder.test.js index 5e8463955c..1b4e6ab787 100644 --- a/test/js/node/string_decoder/string-decoder.test.js +++ b/test/js/node/string_decoder/string-decoder.test.js @@ -260,3 +260,21 @@ it("decoding latin1, issue #3738", () => { output += decoder.end(); expect(output).toStrictEqual("ÝYÞ"); }); + +it("invalid utf-8 at end of stream can sometimes produce more than one replacement character", () => { + let decoder = new RealStringDecoder("utf-8"); + expect(decoder.write(Buffer.from("36f59c", "hex"))).toEqual("6"); + expect(decoder.end()).toEqual("\uFFFD\uFFFD"); + decoder = new RealStringDecoder("utf-8"); + 
expect(decoder.write(Buffer.from("36f5", "hex"))).toEqual("6"); + expect(decoder.end(Buffer.from("9c", "hex"))).toEqual("\uFFFD\uFFFD"); +}); + +it("invalid utf-8 at end of stream can sometimes produce more than one replacement character", () => { + let decoder = new RealStringDecoder("utf-8"); + expect(decoder.write(Buffer.from("36f59c", "hex"))).toEqual("6"); + expect(decoder.end()).toEqual("\uFFFD\uFFFD"); + decoder = new RealStringDecoder("utf-8"); + expect(decoder.write(Buffer.from("36f5", "hex"))).toEqual("6"); + expect(decoder.end(Buffer.from("9c", "hex"))).toEqual("\uFFFD\uFFFD"); +}); diff --git a/test/js/node/test/parallel/test-string-decoder-fuzz.js b/test/js/node/test/parallel/test-string-decoder-fuzz.js new file mode 100644 index 0000000000..542876e96e --- /dev/null +++ b/test/js/node/test/parallel/test-string-decoder-fuzz.js @@ -0,0 +1,49 @@ +'use strict'; +require('../common'); +const { StringDecoder } = require('string_decoder'); +const util = require('util'); +const assert = require('assert'); + +// Tests that, for random sequences of bytes, our StringDecoder gives the +// same result as a direct conversion using Buffer.toString(). +// In particular, it checks that StringDecoder aligns with V8’s own output. 
+ +function rand(max) { + return Math.floor(Math.random() * max); +} + +function randBuf(maxLen) { + const buf = Buffer.allocUnsafe(rand(maxLen)); + for (let i = 0; i < buf.length; i++) + buf[i] = rand(256); + return buf; +} + +const encodings = [ + 'utf16le', 'utf8', 'ascii', 'hex', 'base64', 'latin1', 'base64url', +]; + +function runSingleFuzzTest() { + const enc = encodings[rand(encodings.length)]; + const sd = new StringDecoder(enc); + const bufs = []; + const strings = []; + + const N = rand(10); + for (let i = 0; i < N; ++i) { + const buf = randBuf(50); + bufs.push(buf); + strings.push(sd.write(buf)); + } + strings.push(sd.end()); + + assert.strictEqual(strings.join(''), Buffer.concat(bufs).toString(enc), + `Mismatch:\n${util.inspect(strings)}\n` + + util.inspect(bufs.map((buf) => buf.toString('hex'))) + + `\nfor encoding ${enc}`); +} + +const start = Date.now(); +// Run this for 1 second +while (Date.now() - start < 1000) + runSingleFuzzTest(); diff --git a/test/js/node/test/parallel/test-string-decoder.js b/test/js/node/test/parallel/test-string-decoder.js new file mode 100644 index 0000000000..d82a149bf2 --- /dev/null +++ b/test/js/node/test/parallel/test-string-decoder.js @@ -0,0 +1,287 @@ +// Copyright Joyent, Inc. and other Node contributors. +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to permit +// persons to whom the Software is furnished to do so, subject to the +// following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN +// NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE +// USE OR OTHER DEALINGS IN THE SOFTWARE. + +'use strict'; +const common = require('../common'); +const assert = require('assert'); +const inspect = require('util').inspect; +const StringDecoder = require('string_decoder').StringDecoder; + +// Test default encoding +let decoder = new StringDecoder(); +assert.strictEqual(decoder.encoding, 'utf8'); + +// Should work without 'new' keyword +const decoder2 = {}; +StringDecoder.call(decoder2); +assert.strictEqual(decoder2.encoding, 'utf8'); + +// UTF-8 +test('utf-8', Buffer.from('$', 'utf-8'), '$'); +test('utf-8', Buffer.from('¢', 'utf-8'), '¢'); +test('utf-8', Buffer.from('€', 'utf-8'), '€'); +test('utf-8', Buffer.from('𤭢', 'utf-8'), '𤭢'); +// A mixed ascii and non-ascii string +// Test stolen from deps/v8/test/cctest/test-strings.cc +// U+02E4 -> CB A4 +// U+0064 -> 64 +// U+12E4 -> E1 8B A4 +// U+0030 -> 30 +// U+3045 -> E3 81 85 +test( + 'utf-8', + Buffer.from([0xCB, 0xA4, 0x64, 0xE1, 0x8B, 0xA4, 0x30, 0xE3, 0x81, 0x85]), + '\u02e4\u0064\u12e4\u0030\u3045' +); + +// Some invalid input, known to have caused trouble with chunking +// in https://github.com/nodejs/node/pull/7310#issuecomment-226445923 +// 00: |00000000 ASCII +// 41: |01000001 ASCII +// B8: 10|111000 continuation +// CC: 110|01100 two-byte head +// E2: 1110|0010 three-byte head +// F0: 11110|000 four-byte head +// F1: 11110|001'another four-byte head +// FB: 111110|11 "five-byte head", not UTF-8 +test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA'); +test('utf-8', Buffer.from('E2', 'hex'), '\ufffd'); 
+test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA'); +test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338'); +test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffdA'); +test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338'); +test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0'); +test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38'); +test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\u0338'); +test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001'); +test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379'); +// CESU-8 of U+1D40D + +// V8 has changed their invalid UTF-8 handling, see +// https://chromium-review.googlesource.com/c/v8/v8/+/671020 for more info. +test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), + '\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd'); + +// UCS-2 +test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc'); + +// UTF-16LE +test('utf16le', Buffer.from('3DD84DDC', 'hex'), '\ud83d\udc4d'); // thumbs up + +// Additional UTF-8 tests +decoder = new StringDecoder('utf8'); +assert.strictEqual(decoder.write(Buffer.from('E1', 'hex')), ''); + +// A quick test for lastChar, lastNeed & lastTotal which are undocumented. 
+assert(decoder.lastChar.equals(new Uint8Array([0xe1, 0, 0, 0]))); +assert.strictEqual(decoder.lastNeed, 2); +assert.strictEqual(decoder.lastTotal, 3); + +assert.strictEqual(decoder.end(), '\ufffd'); + +// ArrayBufferView tests +const arrayBufferViewStr = 'String for ArrayBufferView tests\n'; +const inputBuffer = Buffer.from(arrayBufferViewStr.repeat(8), 'utf8'); +for (const expectView of common.getArrayBufferViews(inputBuffer)) { + assert.strictEqual( + decoder.write(expectView), + inputBuffer.toString('utf8') + ); + assert.strictEqual(decoder.end(), ''); +} + +decoder = new StringDecoder('utf8'); +assert.strictEqual(decoder.write(Buffer.from('E18B', 'hex')), ''); +assert.strictEqual(decoder.end(), '\ufffd'); + +decoder = new StringDecoder('utf8'); +assert.strictEqual(decoder.write(Buffer.from('\ufffd')), '\ufffd'); +assert.strictEqual(decoder.end(), ''); + +decoder = new StringDecoder('utf8'); +assert.strictEqual(decoder.write(Buffer.from('\ufffd\ufffd\ufffd')), + '\ufffd\ufffd\ufffd'); +assert.strictEqual(decoder.end(), ''); + +decoder = new StringDecoder('utf8'); +assert.strictEqual(decoder.write(Buffer.from('EFBFBDE2', 'hex')), '\ufffd'); +assert.strictEqual(decoder.end(), '\ufffd'); + +decoder = new StringDecoder('utf8'); +assert.strictEqual(decoder.write(Buffer.from('F1', 'hex')), ''); +assert.strictEqual(decoder.write(Buffer.from('41F2', 'hex')), '\ufffdA'); +assert.strictEqual(decoder.end(), '\ufffd'); + +// Additional utf8Text test +decoder = new StringDecoder('utf8'); +assert.strictEqual(decoder.text(Buffer.from([0x41]), 2), ''); + +// Additional UTF-16LE surrogate pair tests +decoder = new StringDecoder('utf16le'); +assert.strictEqual(decoder.write(Buffer.from('3DD8', 'hex')), ''); +assert.strictEqual(decoder.write(Buffer.from('4D', 'hex')), ''); +assert.strictEqual(decoder.write(Buffer.from('DC', 'hex')), '\ud83d\udc4d'); +assert.strictEqual(decoder.end(), ''); + +decoder = new StringDecoder('utf16le'); 
+assert.strictEqual(decoder.write(Buffer.from('3DD8', 'hex')), ''); +assert.strictEqual(decoder.end(), '\ud83d'); + +decoder = new StringDecoder('utf16le'); +assert.strictEqual(decoder.write(Buffer.from('3DD8', 'hex')), ''); +assert.strictEqual(decoder.write(Buffer.from('4D', 'hex')), ''); +assert.strictEqual(decoder.end(), '\ud83d'); + +decoder = new StringDecoder('utf16le'); +assert.strictEqual(decoder.write(Buffer.from('3DD84D', 'hex')), '\ud83d'); +assert.strictEqual(decoder.end(), ''); + +// Regression test for https://github.com/nodejs/node/issues/22358 +// (unaligned UTF-16 access). +decoder = new StringDecoder('utf16le'); +assert.strictEqual(decoder.write(Buffer.alloc(1)), ''); +assert.strictEqual(decoder.write(Buffer.alloc(20)), '\0'.repeat(10)); +assert.strictEqual(decoder.write(Buffer.alloc(48)), '\0'.repeat(24)); +assert.strictEqual(decoder.end(), ''); + +// Regression tests for https://github.com/nodejs/node/issues/22626 +// (not enough replacement chars when having seen more than one byte of an +// incomplete multibyte characters). +decoder = new StringDecoder('utf8'); +assert.strictEqual(decoder.write(Buffer.from('f69b', 'hex')), ''); +assert.strictEqual(decoder.write(Buffer.from('d1', 'hex')), '\ufffd\ufffd'); +assert.strictEqual(decoder.end(), '\ufffd'); +assert.strictEqual(decoder.write(Buffer.from('f4', 'hex')), ''); +assert.strictEqual(decoder.write(Buffer.from('bde5', 'hex')), '\ufffd\ufffd'); +assert.strictEqual(decoder.end(), '\ufffd'); + +assert.throws( + () => new StringDecoder(1), + { + code: 'ERR_UNKNOWN_ENCODING', + name: 'TypeError', + message: 'Unknown encoding: 1' + } +); + +assert.throws( + () => new StringDecoder('test'), + { + code: 'ERR_UNKNOWN_ENCODING', + name: 'TypeError', + message: 'Unknown encoding: test' + } +); + +assert.throws( + () => new StringDecoder('utf8').write(null), + { + code: 'ERR_INVALID_ARG_TYPE', + name: 'TypeError', + message: 'The "buf" argument must be an instance of Buffer, TypedArray,' + + ' or DataView. 
Received null' + } +); + +// Skipped in Bun: JSC supports much larger strings, so it is extremely hard to +// actually produce this exception. +// if (common.enoughTestMem) { +// assert.throws( +// () => new StringDecoder().write(Buffer.alloc((process.arch === 'ia32' ? 0x18ffffe8 : 0x1fffffe8) + 1).fill('a')), +// { +// code: 'ERR_STRING_TOO_LONG', +// } +// ); +// } + +assert.throws( + () => new StringDecoder('utf8').__proto__.write(Buffer.from('abc')), // eslint-disable-line no-proto + { + code: 'ERR_INVALID_THIS', + } +); + +// Test verifies that StringDecoder will correctly decode the given input +// buffer with the given encoding to the expected output. It will attempt all +// possible ways to write() the input buffer, see writeSequences(). The +// singleSequence allows for easy debugging of a specific sequence which is +// useful in case of test failures. +function test(encoding, input, expected, singleSequence) { + let sequences; + if (!singleSequence) { + sequences = writeSequences(input.length); + } else { + sequences = [singleSequence]; + } + const hexNumberRE = /.{2}/g; + sequences.forEach((sequence) => { + const decoder = new StringDecoder(encoding); + let output = ''; + sequence.forEach((write) => { + output += decoder.write(input.slice(write[0], write[1])); + }); + output += decoder.end(); + if (output !== expected) { + const message = + `Expected "${unicodeEscape(expected)}", ` + + `but got "${unicodeEscape(output)}"\n` + + `input: ${input.toString('hex').match(hexNumberRE)}\n` + + `Write sequence: ${JSON.stringify(sequence)}\n` + + `Full Decoder State: ${inspect(decoder)}`; + assert.fail(message); + } + }); +} + +// unicodeEscape prints the str contents as unicode escape codes. 
+function unicodeEscape(str) { + let r = ''; + for (let i = 0; i < str.length; i++) { + r += `\\u${str.charCodeAt(i).toString(16)}`; + } + return r; +} + +// writeSequences returns an array of arrays that describes all possible ways a +// buffer of the given length could be split up and passed to sequential write +// calls. +// +// e.g. writeSequences(3) will return: [ +// [ [ 0, 3 ] ], +// [ [ 0, 2 ], [ 2, 3 ] ], +// [ [ 0, 1 ], [ 1, 3 ] ], +// [ [ 0, 1 ], [ 1, 2 ], [ 2, 3 ] ] +// ] +function writeSequences(length, start, sequence) { + if (start === undefined) { + start = 0; + sequence = []; + } else if (start === length) { + return [sequence]; + } + let sequences = []; + for (let end = length; end > start; end--) { + const subSequence = sequence.concat([[start, end]]); + const subSequences = writeSequences(length, end, subSequence, sequences); + sequences = sequences.concat(subSequences); + } + return sequences; +}