mirror of
https://github.com/oven-sh/bun
synced 2026-02-16 22:01:47 +00:00
Support all standard text encodings in TextDecoder
This commit is contained in:
@@ -7,6 +7,7 @@
|
||||
#include "_libusockets.h"
|
||||
#include "BunClientData.h"
|
||||
#include "EventLoopTask.h"
|
||||
#include "TextCodecICU.h"
|
||||
|
||||
extern "C" void Bun__startLoop(us_loop_t* loop);
|
||||
|
||||
@@ -17,6 +18,24 @@ static std::atomic<unsigned> lastUniqueIdentifier = 0;
|
||||
WTF_MAKE_ISO_ALLOCATED_IMPL(EventLoopTask);
|
||||
WTF_MAKE_ISO_ALLOCATED_IMPL(ScriptExecutionContext);
|
||||
|
||||
// Constructs a context for the given VM and global object without a caller-supplied identifier.
// The identifier member starts at 0 and regenerateIdentifier() is then called from the body.
// A fresh broadcast channel registry is created via BunBroadcastChannelRegistry::create().
ScriptExecutionContext::ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject)
|
||||
: m_vm(vm)
|
||||
, m_globalObject(globalObject)
|
||||
, m_identifier(0)
|
||||
, m_broadcastChannelRegistry(BunBroadcastChannelRegistry::create())
|
||||
{
|
||||
// NOTE(review): presumably assigns a unique identifier and registers this context; confirm in regenerateIdentifier().
regenerateIdentifier();
|
||||
}
|
||||
|
||||
// Constructs a context for the given VM and global object using a caller-supplied identifier.
// The identifier is stored as-is and addToContextsMap() is then called from the body.
// A fresh broadcast channel registry is created via BunBroadcastChannelRegistry::create().
ScriptExecutionContext::ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject, ScriptExecutionContextIdentifier identifier)
|
||||
: m_vm(vm)
|
||||
, m_globalObject(globalObject)
|
||||
, m_identifier(identifier)
|
||||
, m_broadcastChannelRegistry(BunBroadcastChannelRegistry::create())
|
||||
{
|
||||
// NOTE(review): presumably registers this context under m_identifier; confirm in addToContextsMap().
addToContextsMap();
|
||||
}
|
||||
|
||||
static Lock allScriptExecutionContextsMapLock;
|
||||
static HashMap<ScriptExecutionContextIdentifier, ScriptExecutionContext*>& allScriptExecutionContextsMap() WTF_REQUIRES_LOCK(allScriptExecutionContextsMapLock)
|
||||
{
|
||||
@@ -31,6 +50,15 @@ ScriptExecutionContext* ScriptExecutionContext::getScriptExecutionContext(Script
|
||||
return allScriptExecutionContextsMap().get(identifier);
|
||||
}
|
||||
|
||||
// Returns this context's ICU converter wrapper, creating it on first use.
// The wrapper is owned by m_cachedConverterICU, so the returned reference
// stays valid for as long as that member is not reset.
PAL::ICUConverterWrapper& ScriptExecutionContext::cachedConverterICU()
{
    if (m_cachedConverterICU)
        return *m_cachedConverterICU;

    // First request: allocate the wrapper lazily.
    m_cachedConverterICU = makeUnique<PAL::ICUConverterWrapper>();
    return *m_cachedConverterICU;
}
|
||||
|
||||
template<bool SSL, bool isServer>
|
||||
static void registerHTTPContextForWebSocket(ScriptExecutionContext* script, us_socket_context_t* ctx, us_loop_t* loop)
|
||||
{
|
||||
|
||||
@@ -14,6 +14,10 @@
|
||||
#include "CachedScript.h"
|
||||
#include <wtf/URL.h>
|
||||
|
||||
namespace PAL {
|
||||
class ICUConverterWrapper;
|
||||
}
|
||||
|
||||
namespace uWS {
|
||||
template<bool isServer, bool isClient, typename UserData>
|
||||
struct WebSocketContext;
|
||||
@@ -37,24 +41,8 @@ class ScriptExecutionContext : public CanMakeWeakPtr<ScriptExecutionContext> {
|
||||
WTF_MAKE_ISO_ALLOCATED(ScriptExecutionContext);
|
||||
|
||||
public:
|
||||
ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject)
|
||||
: m_vm(vm)
|
||||
, m_globalObject(globalObject)
|
||||
, m_identifier(0)
|
||||
, m_broadcastChannelRegistry(BunBroadcastChannelRegistry::create())
|
||||
{
|
||||
regenerateIdentifier();
|
||||
}
|
||||
|
||||
ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject, ScriptExecutionContextIdentifier identifier)
|
||||
: m_vm(vm)
|
||||
, m_globalObject(globalObject)
|
||||
, m_identifier(identifier)
|
||||
, m_broadcastChannelRegistry(BunBroadcastChannelRegistry::create())
|
||||
{
|
||||
addToContextsMap();
|
||||
}
|
||||
|
||||
ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject);
|
||||
ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject, ScriptExecutionContextIdentifier identifier);
|
||||
~ScriptExecutionContext();
|
||||
|
||||
static ScriptExecutionContextIdentifier generateIdentifier();
|
||||
@@ -160,6 +148,8 @@ public:
|
||||
|
||||
static ScriptExecutionContext* getMainThreadScriptExecutionContext();
|
||||
|
||||
PAL::ICUConverterWrapper& cachedConverterICU();
|
||||
|
||||
private:
|
||||
JSC::VM* m_vm = nullptr;
|
||||
JSC::JSGlobalObject* m_globalObject = nullptr;
|
||||
@@ -184,6 +174,8 @@ private:
|
||||
us_socket_context_t* m_connected_ssl_client_websockets_ctx = nullptr;
|
||||
us_socket_context_t* m_connected_client_websockets_ctx = nullptr;
|
||||
|
||||
std::unique_ptr<PAL::ICUConverterWrapper> m_cachedConverterICU = { nullptr };
|
||||
|
||||
public:
|
||||
template<bool isSSL, bool isServer>
|
||||
us_socket_context_t* connectedWebSocketContext()
|
||||
|
||||
@@ -1058,3 +1058,21 @@ fn findPathInner(
|
||||
);
|
||||
return errorable.unwrap() catch null;
|
||||
}
|
||||
|
||||
/// Opaque handle to a text codec whose implementation lives behind the
/// `WebKitTextCodec__*` extern symbols.
pub const WebKitTextCodec = opaque {
    extern fn WebKitTextCodec__create(encoding_label: [*]const u8, len: usize) *WebKitTextCodec;
    extern fn WebKitTextCodec__deinit(this: *WebKitTextCodec) void;
    extern fn WebKitTextCodec__decode(this: *WebKitTextCodec, ptr: [*]const u8, len: usize, flush: bool, stop_on_error: *bool) bun.String;
    extern fn WebKitTextCodec__stripByteOrderMark(this: *WebKitTextCodec) void;

    /// Creates a codec for `encoding_label`. The enum's tag name is passed to
    /// the extern constructor as the label bytes.
    /// NOTE(review): presumably the returned codec must be released with `deinit`; confirm.
    pub fn init(encoding_label: JSC.WebCore.EncodingLabel) *WebKitTextCodec {
        const label: []const u8 = @tagName(encoding_label);
        return WebKitTextCodec__create(label.ptr, label.len);
    }

    /// Alias of the extern `WebKitTextCodec__deinit`.
    pub const deinit = WebKitTextCodec__deinit;

    /// Decodes `input` and returns the result as a `bun.String`.
    /// `flush` and `stop_on_error` are forwarded unchanged to the extern decoder.
    /// NOTE(review): `stop_on_error` is a pointer, so the callee may write to it; confirm its in/out contract.
    pub fn decode(this: *WebKitTextCodec, input: []const u8, flush: bool, stop_on_error: *bool) bun.String {
        const bytes = input.ptr;
        const byte_count = input.len;
        return WebKitTextCodec__decode(this, bytes, byte_count, flush, stop_on_error);
    }

    /// Alias of the extern `WebKitTextCodec__stripByteOrderMark`.
    pub const stripByteOrderMark = WebKitTextCodec__stripByteOrderMark;
};
|
||||
|
||||
@@ -73,6 +73,11 @@
|
||||
#define WEBCORE_EXPORT JS_EXPORT_PRIVATE
|
||||
#endif
|
||||
|
||||
#if OS(DARWIN)
|
||||
// Disable ICU's version-suffixed symbol renaming (see unicode/urename.h) so ICU symbol names stay unsuffixed.
|
||||
#define U_DISABLE_RENAMING 1
|
||||
#endif
|
||||
|
||||
#include <wtf/PlatformCallingConventions.h>
|
||||
#include <JavaScriptCore/JSCJSValue.h>
|
||||
#include <wtf/text/MakeString.h>
|
||||
|
||||
189
src/bun.js/bindings/webcore/DecodeEscapeSequences.h
Normal file
189
src/bun.js/bindings/webcore/DecodeEscapeSequences.h
Normal file
@@ -0,0 +1,189 @@
|
||||
/*
|
||||
* Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved.
|
||||
* Copyright (c) 2012 Google, inc. All Rights Reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of Google Inc. nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextEncoding.h"
|
||||
#include <wtf/ASCIICType.h>
|
||||
#include <wtf/Assertions.h>
|
||||
#include <wtf/text/StringBuilder.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// See <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>.
|
||||
struct Unicode16BitEscapeSequence {
|
||||
enum { SequenceSize = 6 }; // e.g. %u26C4
|
||||
static size_t findInString(StringView string, size_t startPosition) { return string.find("%u"_s, startPosition); }
|
||||
static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
|
||||
{
|
||||
size_t runEnd = startPosition;
|
||||
while (endPosition - runEnd >= SequenceSize && string[runEnd] == '%' && string[runEnd + 1] == 'u'
|
||||
&& isASCIIHexDigit(string[runEnd + 2]) && isASCIIHexDigit(string[runEnd + 3])
|
||||
&& isASCIIHexDigit(string[runEnd + 4]) && isASCIIHexDigit(string[runEnd + 5])) {
|
||||
runEnd += SequenceSize;
|
||||
}
|
||||
return runEnd;
|
||||
}
|
||||
static String decodeRun(StringView run, const TextEncoding&)
|
||||
{
|
||||
// Each %u-escape sequence represents a UTF-16 code unit.
|
||||
// See <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>.
|
||||
// For 16-bit escape sequences, we know that findEndOfRun() has given us a contiguous run of sequences
|
||||
// without any intervening characters, so decode the run without additional checks.
|
||||
auto numberOfSequences = run.length() / SequenceSize;
|
||||
StringBuilder builder;
|
||||
builder.reserveCapacity(numberOfSequences);
|
||||
while (numberOfSequences--) {
|
||||
UChar codeUnit = (toASCIIHexValue(run[2]) << 12) | (toASCIIHexValue(run[3]) << 8) | (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]);
|
||||
builder.append(codeUnit);
|
||||
run = run.substring(SequenceSize);
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
};
|
||||
|
||||
struct URLEscapeSequence {
|
||||
enum { SequenceSize = 3 }; // e.g. %41
|
||||
static size_t findInString(StringView string, size_t startPosition) { return string.find('%', startPosition); }
|
||||
static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
|
||||
{
|
||||
// Make the simplifying assumption that supported encodings may have up to two unescaped characters
|
||||
// in the range 0x40 - 0x7F as the trailing bytes of their sequences which need to be passed into the
|
||||
// decoder as part of the run. In other words, we end the run at the first value outside of the
|
||||
// 0x40 - 0x7F range, after two values in this range, or at a %-sign that does not introduce a valid
|
||||
// escape sequence.
|
||||
size_t runEnd = startPosition;
|
||||
int numberOfTrailingCharacters = 0;
|
||||
while (runEnd < endPosition) {
|
||||
if (string[runEnd] == '%') {
|
||||
if (endPosition - runEnd >= SequenceSize && isASCIIHexDigit(string[runEnd + 1]) && isASCIIHexDigit(string[runEnd + 2])) {
|
||||
runEnd += SequenceSize;
|
||||
numberOfTrailingCharacters = 0;
|
||||
} else
|
||||
break;
|
||||
} else if (string[runEnd] >= 0x40 && string[runEnd] <= 0x7F && numberOfTrailingCharacters < 2) {
|
||||
runEnd += 1;
|
||||
numberOfTrailingCharacters += 1;
|
||||
} else
|
||||
break;
|
||||
}
|
||||
return runEnd;
|
||||
}
|
||||
|
||||
static Vector<uint8_t, 512> decodeRun(StringView run)
|
||||
{
|
||||
// For URL escape sequences, we know that findEndOfRun() has given us a run where every %-sign introduces
|
||||
// a valid escape sequence, but there may be characters between the sequences.
|
||||
Vector<uint8_t, 512> buffer;
|
||||
buffer.grow(run.length()); // Unescaping hex sequences only makes the length smaller.
|
||||
uint8_t* p = buffer.data();
|
||||
while (!run.isEmpty()) {
|
||||
if (run[0] == '%') {
|
||||
*p++ = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]);
|
||||
run = run.substring(SequenceSize);
|
||||
} else {
|
||||
*p++ = run[0];
|
||||
run = run.substring(1);
|
||||
}
|
||||
}
|
||||
ASSERT(buffer.size() >= static_cast<size_t>(p - buffer.data())); // Prove buffer not overrun.
|
||||
buffer.shrink(p - buffer.data());
|
||||
return buffer;
|
||||
}
|
||||
|
||||
static String decodeRun(StringView run, const TextEncoding& encoding)
|
||||
{
|
||||
auto buffer = decodeRun(run);
|
||||
if (!encoding.isValid())
|
||||
return PAL::UTF8Encoding().decode(buffer.span());
|
||||
return encoding.decode(buffer.span());
|
||||
}
|
||||
};
|
||||
|
||||
template<typename EscapeSequence>
|
||||
String decodeEscapeSequences(StringView string, const TextEncoding& encoding)
|
||||
{
|
||||
StringBuilder result;
|
||||
size_t length = string.length();
|
||||
size_t decodedPosition = 0;
|
||||
size_t searchPosition = 0;
|
||||
size_t encodedRunPosition;
|
||||
while ((encodedRunPosition = EscapeSequence::findInString(string, searchPosition)) != notFound) {
|
||||
size_t encodedRunEnd = EscapeSequence::findEndOfRun(string, encodedRunPosition, length);
|
||||
searchPosition = encodedRunEnd;
|
||||
if (encodedRunEnd == encodedRunPosition) {
|
||||
++searchPosition;
|
||||
continue;
|
||||
}
|
||||
|
||||
String decoded = EscapeSequence::decodeRun(string.substring(encodedRunPosition, encodedRunEnd - encodedRunPosition), encoding);
|
||||
if (decoded.isEmpty())
|
||||
continue;
|
||||
|
||||
result.append(string.substring(decodedPosition, encodedRunPosition - decodedPosition), decoded);
|
||||
decodedPosition = encodedRunEnd;
|
||||
}
|
||||
result.append(string.substring(decodedPosition, length - decodedPosition));
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
// Converts the string to bytes: text outside "%XX" runs is encoded with UTF-8 via
// encodeForURLParsing(), while the bytes inside escape runs are passed through as-is.
inline Vector<uint8_t> decodeURLEscapeSequencesAsData(StringView string)
{
    Vector<uint8_t> result;
    size_t copiedUpTo = 0; // Everything before this index is already in result.
    size_t searchFrom = 0;
    for (;;) {
        const size_t runStart = URLEscapeSequence::findInString(string, searchFrom);
        size_t runEnd = 0;
        const bool foundRun = runStart != notFound;
        if (foundRun) {
            runEnd = URLEscapeSequence::findEndOfRun(string, runStart, string.length());
            if (runEnd == runStart) {
                // A '%' that does not start a valid escape; resume just past it.
                searchFrom = runEnd + 1;
                continue;
            }
            searchFrom = runEnd;
        }

        // Strings are encoded as requested. When no run was found, runStart is notFound
        // and the substring length is clamped to the rest of the string.
        auto literalText = string.substring(copiedUpTo, runStart - copiedUpTo);
        result.appendVector(PAL::UTF8Encoding().encodeForURLParsing(literalText));

        if (!foundRun)
            return result;

        // Bytes go through as-is.
        auto runBytes = URLEscapeSequence::decodeRun(string.substring(runStart, runEnd - runStart));
        ASSERT(!runBytes.isEmpty());
        result.appendVector(runBytes);

        copiedUpTo = runEnd;
    }
}
|
||||
|
||||
} // namespace PAL
|
||||
|
||||
8688
src/bun.js/bindings/webcore/EncodingTables.cpp
Normal file
8688
src/bun.js/bindings/webcore/EncodingTables.cpp
Normal file
File diff suppressed because it is too large
Load Diff
135
src/bun.js/bindings/webcore/EncodingTables.h
Normal file
135
src/bun.js/bindings/webcore/EncodingTables.h
Normal file
@@ -0,0 +1,135 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <iterator>
|
||||
#include <optional>
|
||||
#include <unicode/umachine.h>
|
||||
#include <utility>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
const std::array<std::pair<uint16_t, UChar>, 7724>& jis0208();
|
||||
const std::array<std::pair<uint16_t, UChar>, 6067>& jis0212();
|
||||
const std::array<std::pair<uint16_t, char32_t>, 18590>& big5();
|
||||
const std::array<std::pair<uint16_t, UChar>, 17048>& eucKR();
|
||||
const std::array<UChar, 23940>& gb18030();
|
||||
|
||||
void checkEncodingTableInvariants();
|
||||
|
||||
// Functions for using sorted arrays of pairs as a map.
|
||||
// FIXME: Consider moving these functions to StdLibExtras.h for uses other than encoding tables.
|
||||
template<typename CollectionType> void sortByFirst(CollectionType&);
|
||||
template<typename CollectionType> void stableSortByFirst(CollectionType&);
|
||||
template<typename CollectionType> bool isSortedByFirst(const CollectionType&);
|
||||
template<typename CollectionType> bool sortedFirstsAreUnique(const CollectionType&);
|
||||
template<typename CollectionType, typename KeyType> static auto findFirstInSortedPairs(const CollectionType& sortedPairsCollection, const KeyType&) -> std::optional<decltype(std::begin(sortedPairsCollection)->second)>;
|
||||
template<typename CollectionType, typename KeyType> static auto findInSortedPairs(const CollectionType& sortedPairsCollection, const KeyType&) -> std::pair<decltype(std::begin(sortedPairsCollection)), decltype(std::begin(sortedPairsCollection))>;
|
||||
|
||||
#if !ASSERT_ENABLED
|
||||
inline void checkEncodingTableInvariants() { }
|
||||
#endif
|
||||
|
||||
// Strict-weak-ordering functor comparing two pair-like values by their `first` member.
// The operands may have different types as long as `a.first < b.first` is well-formed.
struct CompareFirst {
    // const-qualified so the functor can be invoked through a const object or reference.
    template<typename TypeA, typename TypeB> bool operator()(const TypeA& a, const TypeB& b) const
    {
        return a.first < b.first;
    }
};
|
||||
|
||||
// Predicate functor reporting whether two pair-like values have equal `first` members.
struct EqualFirst {
    // const-qualified so the functor can be invoked through a const object or reference.
    template<typename TypeA, typename TypeB> bool operator()(const TypeA& a, const TypeB& b) const
    {
        return a.first == b.first;
    }
};
|
||||
|
||||
// Strict-weak-ordering functor comparing two pair-like values by their `second` member.
struct CompareSecond {
    // const-qualified so the functor can be invoked through a const object or reference.
    template<typename TypeA, typename TypeB> bool operator()(const TypeA& a, const TypeB& b) const
    {
        return a.second < b.second;
    }
};
|
||||
|
||||
// Wraps a bare key so it exposes a `first` member, letting it be compared against
// pair-like elements. Holds a reference: the adapter must not outlive the wrapped value.
template<typename T> struct FirstAdapter {
    const T& first;
};

// Builds a FirstAdapter referring to `value`.
template<typename T> FirstAdapter<T> makeFirstAdapter(const T& value)
{
    return FirstAdapter<T> { value };
}
|
||||
|
||||
// Wraps a bare value so it exposes a `second` member, letting it be compared against
// pair-like elements. Holds a reference: the adapter must not outlive the wrapped value.
template<typename T> struct SecondAdapter {
    const T& second;
};

// Builds a SecondAdapter referring to `value`.
template<typename T> SecondAdapter<T> makeSecondAdapter(const T& value)
{
    return SecondAdapter<T> { value };
}
|
||||
|
||||
template<typename CollectionType> void sortByFirst(CollectionType& collection)
|
||||
{
|
||||
std::sort(std::begin(collection), std::end(collection), CompareFirst { });
|
||||
}
|
||||
|
||||
template<typename CollectionType> void stableSortByFirst(CollectionType& collection)
|
||||
{
|
||||
std::stable_sort(std::begin(collection), std::end(collection), CompareFirst { });
|
||||
}
|
||||
|
||||
template<typename CollectionType> bool isSortedByFirst(const CollectionType& collection)
|
||||
{
|
||||
return std::is_sorted(std::begin(collection), std::end(collection), CompareFirst { });
|
||||
}
|
||||
|
||||
template<typename CollectionType> bool sortedFirstsAreUnique(const CollectionType& collection)
|
||||
{
|
||||
return std::adjacent_find(std::begin(collection), std::end(collection), EqualFirst { }) == std::end(collection);
|
||||
}
|
||||
|
||||
template<typename CollectionType, typename KeyType> static auto findFirstInSortedPairs(const CollectionType& collection, const KeyType& key) -> std::optional<decltype(std::begin(collection)->second)>
|
||||
{
|
||||
if constexpr (std::is_integral_v<KeyType>) {
|
||||
if (key != decltype(std::begin(collection)->first)(key))
|
||||
return std::nullopt;
|
||||
}
|
||||
auto iterator = std::lower_bound(std::begin(collection), std::end(collection), makeFirstAdapter(key), CompareFirst { });
|
||||
if (iterator == std::end(collection) || key < iterator->first)
|
||||
return std::nullopt;
|
||||
return iterator->second;
|
||||
}
|
||||
|
||||
template<typename CollectionType, typename KeyType> static auto findInSortedPairs(const CollectionType& collection, const KeyType& key) -> std::pair<decltype(std::begin(collection)), decltype(std::begin(collection))> {
|
||||
if constexpr (std::is_integral_v<KeyType>) {
|
||||
if (key != decltype(std::begin(collection)->first)(key))
|
||||
return { std::end(collection), std::end(collection) };
|
||||
}
|
||||
return std::equal_range(std::begin(collection), std::end(collection), makeFirstAdapter(key), CompareFirst { });
|
||||
}
|
||||
|
||||
}
|
||||
59
src/bun.js/bindings/webcore/KillRing.cpp
Normal file
59
src/bun.js/bindings/webcore/KillRing.cpp
Normal file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
* Copyright (C) 2010 Google Inc. All Rights Reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "KillRing.h"
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
|
||||
#if !PLATFORM(MAC)
|
||||
|
||||
namespace PAL {
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(KillRing);
|
||||
|
||||
// Stub implementations of KillRing: every method below is a no-op and the
// string argument, where present, is ignored.

// Does nothing.
void KillRing::append(const String&)
|
||||
{
|
||||
}
|
||||
|
||||
// Does nothing.
void KillRing::prepend(const String&)
|
||||
{
|
||||
}
|
||||
|
||||
// Always returns a default-constructed String.
String KillRing::yank()
|
||||
{
|
||||
return String();
|
||||
}
|
||||
|
||||
// Does nothing.
void KillRing::startNewSequence()
|
||||
{
|
||||
}
|
||||
|
||||
// Does nothing.
void KillRing::setToYankedState()
|
||||
{
|
||||
}
|
||||
|
||||
} // namespace PAL
|
||||
|
||||
#endif // !PLATFORM(MAC)
|
||||
44
src/bun.js/bindings/webcore/KillRing.h
Normal file
44
src/bun.js/bindings/webcore/KillRing.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright (C) 2010 Google Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
#include <wtf/text/WTFString.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// Declares the KillRing interface. Only declarations are visible here; the behavior
// of each operation is determined by the implementation file that defines them.
// NOTE(review): the names suggest an Emacs-style kill/yank text buffer; confirm against the implementation.
class KillRing {
|
||||
WTF_MAKE_TZONE_ALLOCATED_EXPORT(KillRing, );
|
||||
|
||||
public:
|
||||
void append(const String&);
|
||||
void prepend(const String&);
|
||||
// Returns a String by value.
String yank();
|
||||
void startNewSequence();
|
||||
void setToYankedState();
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
59
src/bun.js/bindings/webcore/TextCodec.cpp
Normal file
59
src/bun.js/bindings/webcore/TextCodec.cpp
Normal file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodec.h"
|
||||
#include <unicode/uchar.h>
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
#include <wtf/text/WTFString.h>
|
||||
#include <wtf/unicode/CharacterNames.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodec);
|
||||
|
||||
int TextCodec::getUnencodableReplacement(char32_t codePoint, UnencodableHandling handling, UnencodableReplacementArray& replacement)
|
||||
{
|
||||
ASSERT(!(codePoint > UCHAR_MAX_VALUE));
|
||||
|
||||
// The Encoding Standard doesn't have surrogate code points in the input, but that would require
|
||||
// scanning and potentially manipulating inputs ahead of time. Instead handle them at the last
|
||||
// possible point.
|
||||
if (U_IS_SURROGATE(codePoint))
|
||||
codePoint = replacementCharacter;
|
||||
|
||||
switch (handling) {
|
||||
case UnencodableHandling::Entities:
|
||||
return snprintf(replacement.data(), sizeof(UnencodableReplacementArray), "&#%u;", static_cast<unsigned>(codePoint));
|
||||
case UnencodableHandling::URLEncodedEntities:
|
||||
return snprintf(replacement.data(), sizeof(UnencodableReplacementArray), "%%26%%23%u%%3B", static_cast<unsigned>(codePoint));
|
||||
}
|
||||
ASSERT_NOT_REACHED();
|
||||
replacement.data()[0] = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace PAL
|
||||
70
src/bun.js/bindings/webcore/TextCodec.h
Normal file
70
src/bun.js/bindings/webcore/TextCodec.h
Normal file
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2020 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "UnencodableHandling.h"
|
||||
#include <array>
|
||||
#include <memory>
|
||||
#include <span>
|
||||
#include <unicode/umachine.h>
|
||||
#include <wtf/Forward.h>
|
||||
#include <wtf/Noncopyable.h>
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextEncoding;
|
||||
|
||||
using UnencodableReplacementArray = std::array<char, 32>;
|
||||
|
||||
// Abstract base class for text codecs. Subclasses must implement decode() and encode();
// stripByteOrderMark() has an empty default implementation. Instances are non-copyable.
class TextCodec {
|
||||
WTF_MAKE_TZONE_ALLOCATED(TextCodec);
|
||||
WTF_MAKE_NONCOPYABLE(TextCodec);
|
||||
|
||||
public:
|
||||
TextCodec() = default;
|
||||
virtual ~TextCodec() = default;
|
||||
|
||||
// Default implementation does nothing; subclasses may override.
virtual void stripByteOrderMark() {}
|
||||
// Decodes `data` into a String. `sawError` is an output parameter.
// NOTE(review): presumably `flush` marks the final chunk and `stopOnError` aborts at the first invalid sequence; confirm in the subclasses.
virtual String decode(std::span<const uint8_t> data, bool flush, bool stopOnError, bool& sawError) = 0;
|
||||
|
||||
// Encodes the string into bytes, using the given handling for unencodable characters.
virtual Vector<uint8_t> encode(StringView, UnencodableHandling) const = 0;
|
||||
|
||||
// Fills a null-terminated string representation of the given
|
||||
// unencodable character into the given replacement buffer.
|
||||
// The length of the string (not including the null) will be returned.
|
||||
static int getUnencodableReplacement(char32_t, UnencodableHandling, UnencodableReplacementArray&);
|
||||
};
|
||||
|
||||
Function<void(char32_t, Vector<uint8_t>&)> unencodableHandler(UnencodableHandling);
|
||||
|
||||
using EncodingNameRegistrar = void (*)(ASCIILiteral alias, ASCIILiteral name);
|
||||
|
||||
using NewTextCodecFunction = Function<std::unique_ptr<TextCodec>()>;
|
||||
using TextCodecRegistrar = void (*)(ASCIILiteral name, NewTextCodecFunction&&);
|
||||
|
||||
} // namespace PAL
|
||||
77
src/bun.js/bindings/webcore/TextCodecASCIIFastPath.h
Normal file
77
src/bun.js/bindings/webcore/TextCodecASCIIFastPath.h
Normal file
@@ -0,0 +1,77 @@
|
||||
/*
|
||||
* Copyright (C) 2011 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <wtf/text/ASCIIFastPath.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
template<size_t size> struct UCharByteFiller;
|
||||
template<> struct UCharByteFiller<4> {
|
||||
static void copy(LChar* destination, const uint8_t* source)
|
||||
{
|
||||
memcpy(destination, source, 4);
|
||||
}
|
||||
|
||||
static void copy(UChar* destination, const uint8_t* source)
|
||||
{
|
||||
destination[0] = source[0];
|
||||
destination[1] = source[1];
|
||||
destination[2] = source[2];
|
||||
destination[3] = source[3];
|
||||
}
|
||||
};
|
||||
template<> struct UCharByteFiller<8> {
|
||||
static void copy(LChar* destination, const uint8_t* source)
|
||||
{
|
||||
memcpy(destination, source, 8);
|
||||
}
|
||||
|
||||
static void copy(UChar* destination, const uint8_t* source)
|
||||
{
|
||||
destination[0] = source[0];
|
||||
destination[1] = source[1];
|
||||
destination[2] = source[2];
|
||||
destination[3] = source[3];
|
||||
destination[4] = source[4];
|
||||
destination[5] = source[5];
|
||||
destination[6] = source[6];
|
||||
destination[7] = source[7];
|
||||
}
|
||||
};
|
||||
|
||||
// Copies sizeof(WTF::MachineWord) source bytes into an 8-bit character buffer.
inline void copyASCIIMachineWord(LChar* destination, const uint8_t* source)
{
    using WordFiller = UCharByteFiller<sizeof(WTF::MachineWord)>;
    WordFiller::copy(destination, source);
}
|
||||
|
||||
// Copies sizeof(WTF::MachineWord) source bytes into a 16-bit character buffer,
// widening each byte to one UChar.
inline void copyASCIIMachineWord(UChar* destination, const uint8_t* source)
{
    using WordFiller = UCharByteFiller<sizeof(WTF::MachineWord)>;
    WordFiller::copy(destination, source);
}
|
||||
|
||||
} // namespace PAL
|
||||
1239
src/bun.js/bindings/webcore/TextCodecCJK.cpp
Normal file
1239
src/bun.js/bindings/webcore/TextCodecCJK.cpp
Normal file
File diff suppressed because it is too large
Load Diff
76
src/bun.js/bindings/webcore/TextCodecCJK.h
Normal file
76
src/bun.js/bindings/webcore/TextCodecCJK.h
Normal file
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include <optional>
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextCodecCJK final : public TextCodec {
|
||||
WTF_MAKE_TZONE_ALLOCATED(TextCodecCJK);
|
||||
public:
|
||||
static void registerEncodingNames(EncodingNameRegistrar);
|
||||
static void registerCodecs(TextCodecRegistrar);
|
||||
|
||||
enum class Encoding : uint8_t;
|
||||
explicit TextCodecCJK(Encoding);
|
||||
|
||||
private:
|
||||
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
|
||||
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
|
||||
|
||||
enum class SawError : bool { No, Yes };
|
||||
String decodeCommon(std::span<const uint8_t>, bool, bool, bool&, const Function<SawError(uint8_t, StringBuilder&)>&);
|
||||
|
||||
String eucJPDecode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
String iso2022JPDecode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
String shiftJISDecode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
String eucKRDecode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
String big5Decode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
String gbkDecode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
String gb18030Decode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
|
||||
const Encoding m_encoding;
|
||||
|
||||
bool m_jis0212 { false };
|
||||
|
||||
enum class ISO2022JPDecoderState : uint8_t { ASCII, Roman, Katakana, LeadByte, TrailByte, EscapeStart, Escape };
|
||||
ISO2022JPDecoderState m_iso2022JPDecoderState { ISO2022JPDecoderState::ASCII };
|
||||
ISO2022JPDecoderState m_iso2022JPDecoderOutputState { ISO2022JPDecoderState::ASCII };
|
||||
bool m_iso2022JPOutput { false };
|
||||
std::optional<uint8_t> m_iso2022JPSecondPrependedByte;
|
||||
|
||||
uint8_t m_gb18030First { 0x00 };
|
||||
uint8_t m_gb18030Second { 0x00 };
|
||||
uint8_t m_gb18030Third { 0x00 };
|
||||
|
||||
uint8_t m_lead { 0x00 };
|
||||
std::optional<uint8_t> m_prependedByte;
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
330
src/bun.js/bindings/webcore/TextCodecICU.cpp
Normal file
330
src/bun.js/bindings/webcore/TextCodecICU.cpp
Normal file
@@ -0,0 +1,330 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodecICU.h"
|
||||
|
||||
#include "TextEncoding.h"
|
||||
#include "TextEncodingRegistry.h"
|
||||
#include <array>
|
||||
#include <unicode/ucnv_cb.h>
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
#include <wtf/Threading.h>
|
||||
#include <wtf/text/CString.h>
|
||||
#include <wtf/text/StringBuilder.h>
|
||||
#include <wtf/unicode/CharacterNames.h>
|
||||
#include <wtf/unicode/icu/ICUHelpers.h>
|
||||
#include "ScriptExecutionContext.h"
|
||||
#include "ZigGlobalObject.h"
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// Returns the ICU converter cache held by the default global object's script
// execution context.
static ICUConverterWrapper& cachedConverterICU()
{
    auto&& context = defaultGlobalObject()->scriptExecutionContext();
    return context->cachedConverterICU();
}
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecICU);
|
||||
|
||||
const size_t ConversionBufferSize = 16384;
|
||||
|
||||
#define DECLARE_ALIASES(encoding, ...) \
|
||||
static constexpr ASCIILiteral encoding##_aliases[] { __VA_ARGS__ }
|
||||
|
||||
// From https://encoding.spec.whatwg.org. Plus a few extra aliases that macOS had historically from TEC.
|
||||
DECLARE_ALIASES(ISO_8859_2, "csisolatin2"_s, "iso-ir-101"_s, "iso8859-2"_s, "iso88592"_s, "iso_8859-2"_s, "iso_8859-2:1987"_s, "l2"_s, "latin2"_s);
|
||||
DECLARE_ALIASES(ISO_8859_4, "csisolatin4"_s, "iso-ir-110"_s, "iso8859-4"_s, "iso88594"_s, "iso_8859-4"_s, "iso_8859-4:1988"_s, "l4"_s, "latin4"_s);
|
||||
DECLARE_ALIASES(ISO_8859_5, "csisolatincyrillic"_s, "cyrillic"_s, "iso-ir-144"_s, "iso8859-5"_s, "iso88595"_s, "iso_8859-5"_s, "iso_8859-5:1988"_s);
|
||||
DECLARE_ALIASES(ISO_8859_10, "csisolatin6"_s, "iso-ir-157"_s, "iso8859-10"_s, "iso885910"_s, "l6"_s, "latin6"_s, "iso8859101992"_s, "isoir157"_s);
|
||||
DECLARE_ALIASES(ISO_8859_13, "iso8859-13"_s, "iso885913"_s);
|
||||
DECLARE_ALIASES(ISO_8859_14, "iso8859-14"_s, "iso885914"_s, "isoceltic"_s, "iso8859141998"_s, "isoir199"_s, "latin8"_s, "l8"_s);
|
||||
DECLARE_ALIASES(ISO_8859_15, "csisolatin9"_s, "iso8859-15"_s, "iso885915"_s, "iso_8859-15"_s, "l9"_s);
|
||||
DECLARE_ALIASES(ISO_8859_16, "isoir226"_s, "iso8859162001"_s, "l10"_s, "latin10"_s);
|
||||
DECLARE_ALIASES(KOI8_R, "cskoi8r"_s, "koi"_s, "koi8"_s, "koi8_r"_s);
|
||||
DECLARE_ALIASES(macintosh, "csmacintosh"_s, "mac"_s, "x-mac-roman"_s, "macroman"_s, "x-macroman"_s);
|
||||
DECLARE_ALIASES(windows_1250, "cp1250"_s, "x-cp1250"_s, "winlatin2"_s);
|
||||
DECLARE_ALIASES(windows_1251, "cp1251"_s, "wincyrillic"_s, "x-cp1251"_s);
|
||||
DECLARE_ALIASES(windows_1254, "winturkish"_s, "cp1254"_s, "csisolatin5"_s, "iso-8859-9"_s, "iso-ir-148"_s, "iso8859-9"_s, "iso88599"_s, "iso_8859-9"_s, "iso_8859-9:1989"_s, "l5"_s, "latin5"_s, "x-cp1254"_s);
|
||||
DECLARE_ALIASES(windows_1256, "winarabic"_s, "cp1256"_s, "x-cp1256"_s);
|
||||
DECLARE_ALIASES(windows_1258, "winvietnamese"_s, "cp1258"_s, "x-cp1258"_s);
|
||||
DECLARE_ALIASES(x_mac_cyrillic, "maccyrillic"_s, "x-mac-ukrainian"_s, "windows-10007"_s, "mac-cyrillic"_s, "maccy"_s, "x-MacCyrillic"_s, "x-MacUkraine"_s);
|
||||
// Encodings below are not in the standard.
|
||||
DECLARE_ALIASES(x_mac_greek, "windows-10006"_s, "macgr"_s, "x-MacGreek"_s);
|
||||
DECLARE_ALIASES(x_mac_centraleurroman, "windows-10029"_s, "x-mac-ce"_s, "macce"_s, "maccentraleurope"_s, "x-MacCentralEurope"_s);
|
||||
DECLARE_ALIASES(x_mac_turkish, "windows-10081"_s, "mactr"_s, "x-MacTurkish"_s);
|
||||
|
||||
#define DECLARE_ENCODING_NAME(encoding, alias_array) \
|
||||
{ encoding, std::size(alias_array##_aliases), alias_array##_aliases }
|
||||
|
||||
#define DECLARE_ENCODING_NAME_NO_ALIASES(encoding) \
|
||||
{ encoding, 0, nullptr }
|
||||
|
||||
static const struct EncodingName {
|
||||
ASCIILiteral name;
|
||||
unsigned aliasCount;
|
||||
const ASCIILiteral* aliases;
|
||||
} encodingNames[] = {
|
||||
DECLARE_ENCODING_NAME("ISO-8859-2"_s, ISO_8859_2),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-4"_s, ISO_8859_4),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-5"_s, ISO_8859_5),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-10"_s, ISO_8859_10),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-13"_s, ISO_8859_13),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-14"_s, ISO_8859_14),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-15"_s, ISO_8859_15),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-16"_s, ISO_8859_16),
|
||||
DECLARE_ENCODING_NAME("KOI8-R"_s, KOI8_R),
|
||||
DECLARE_ENCODING_NAME("macintosh"_s, macintosh),
|
||||
DECLARE_ENCODING_NAME("windows-1250"_s, windows_1250),
|
||||
DECLARE_ENCODING_NAME("windows-1251"_s, windows_1251),
|
||||
DECLARE_ENCODING_NAME("windows-1254"_s, windows_1254),
|
||||
DECLARE_ENCODING_NAME("windows-1256"_s, windows_1256),
|
||||
DECLARE_ENCODING_NAME("windows-1258"_s, windows_1258),
|
||||
DECLARE_ENCODING_NAME("x-mac-cyrillic"_s, x_mac_cyrillic),
|
||||
// Encodings below are not in the standard.
|
||||
DECLARE_ENCODING_NAME("x-mac-greek"_s, x_mac_greek),
|
||||
DECLARE_ENCODING_NAME("x-mac-centraleurroman"_s, x_mac_centraleurroman),
|
||||
DECLARE_ENCODING_NAME("x-mac-turkish"_s, x_mac_turkish),
|
||||
DECLARE_ENCODING_NAME_NO_ALIASES("EUC-TW"_s),
|
||||
};
|
||||
|
||||
// Registers each canonical encoding name as an alias of itself, then every
// alias from its table, in table order.
void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
{
    for (auto& entry : encodingNames) {
        registrar(entry.name, entry.name);

        // aliases is nullptr with aliasCount 0 for alias-free encodings, which
        // yields an empty span.
        for (auto& alias : std::span { entry.aliases, entry.aliasCount })
            registrar(alias, entry.name);
    }
}
|
||||
|
||||
// Registers a TextCodecICU factory for every entry of encodingNames, resolving
// the ICU converter name once up front.
void TextCodecICU::registerCodecs(TextCodecRegistrar registrar)
{
    for (auto& entry : encodingNames) {
        ASCIILiteral encoding = entry.name;

        UErrorCode status = U_ZERO_ERROR;
        const char* converterName = ucnv_getCanonicalName(encoding, "IANA", &status);
        ASSERT(U_SUCCESS(status));

        if (!converterName) {
            // No IANA canonical name: open a converter and ask it for its name.
            // NOTE(review): this converter is closed at the end of this scope
            // while the name pointer is kept; confirm ICU keeps that string
            // alive after ucnv_close().
            auto probe = ICUConverterPtr { ucnv_open(encoding, &status) };
            ASSERT(U_SUCCESS(status));
            converterName = ucnv_getName(probe.get(), &status);
            ASSERT(U_SUCCESS(status));
        }

        if (!converterName) {
            ASSERT_NOT_REACHED();
            continue;
        }

        registrar(encoding, [encoding, converterName] {
            // ucnv_getCanonicalName() returns a static string owned by libicu so the call to
            // ASCIILiteral::fromLiteralUnsafe() should be safe.
            return makeUnique<TextCodecICU>(encoding, ASCIILiteral::fromLiteralUnsafe(converterName));
        });
    }
}
|
||||
|
||||
TextCodecICU::TextCodecICU(ASCIILiteral encoding, ASCIILiteral canonicalConverterName)
|
||||
: m_encodingName(encoding)
|
||||
, m_canonicalConverterName(canonicalConverterName)
|
||||
{
|
||||
ASSERT(!m_canonicalConverterName.isNull());
|
||||
}
|
||||
|
||||
// Hands a live converter back to the per-context cache (after resetting it)
// so a later codec can take it over.
TextCodecICU::~TextCodecICU()
{
    if (!m_converter)
        return;

    ucnv_reset(m_converter.get());
    cachedConverterICU().converter = WTFMove(m_converter);
}
|
||||
|
||||
void TextCodecICU::createICUConverter() const
|
||||
{
|
||||
ASSERT(!m_converter);
|
||||
|
||||
auto& cachedConverter = cachedConverterICU().converter;
|
||||
if (cachedConverter) {
|
||||
UErrorCode error = U_ZERO_ERROR;
|
||||
const char* cachedConverterName = ucnv_getName(cachedConverter.get(), &error);
|
||||
if (U_SUCCESS(error) && !strcmp(m_canonicalConverterName, cachedConverterName)) {
|
||||
m_converter = WTFMove(cachedConverter);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
UErrorCode error = U_ZERO_ERROR;
|
||||
m_converter = ICUConverterPtr { ucnv_open(m_canonicalConverterName, &error) };
|
||||
if (m_converter)
|
||||
ucnv_setFallback(m_converter.get(), true);
|
||||
}
|
||||
|
||||
// Runs one ucnv_toUnicode() pass into [target, targetLimit). `source` is
// advanced past the consumed bytes, `error` is reset and then receives ICU's
// status; the return value is the number of UChars written.
int TextCodecICU::decodeToBuffer(UChar* target, UChar* targetLimit, const char*& source, const char* sourceLimit, int32_t* offsets, bool flush, UErrorCode& error)
{
    error = U_ZERO_ERROR;

    UChar* writeCursor = target;
    ucnv_toUnicode(m_converter.get(), &writeCursor, targetLimit, &source, sourceLimit, offsets, flush, &error);
    return writeCursor - target;
}
|
||||
|
||||
class ErrorCallbackSetter {
|
||||
public:
|
||||
ErrorCallbackSetter(UConverter& converter, bool stopOnError)
|
||||
: m_converter(converter)
|
||||
, m_shouldStopOnEncodingErrors(stopOnError)
|
||||
{
|
||||
if (m_shouldStopOnEncodingErrors) {
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
ucnv_setToUCallBack(&m_converter, UCNV_TO_U_CALLBACK_SUBSTITUTE, UCNV_SUB_STOP_ON_ILLEGAL, &m_savedAction, &m_savedContext, &err);
|
||||
ASSERT(U_SUCCESS(err));
|
||||
}
|
||||
}
|
||||
~ErrorCallbackSetter()
|
||||
{
|
||||
if (m_shouldStopOnEncodingErrors) {
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
const void* oldContext;
|
||||
UConverterToUCallback oldAction;
|
||||
ucnv_setToUCallBack(&m_converter, m_savedAction, m_savedContext, &oldAction, &oldContext, &err);
|
||||
ASSERT(oldAction == UCNV_TO_U_CALLBACK_SUBSTITUTE);
|
||||
ASSERT(!strcmp(static_cast<const char*>(oldContext), UCNV_SUB_STOP_ON_ILLEGAL));
|
||||
ASSERT(U_SUCCESS(err));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
UConverter& m_converter;
|
||||
bool m_shouldStopOnEncodingErrors;
|
||||
const void* m_savedContext { nullptr };
|
||||
UConverterToUCallback m_savedAction { nullptr };
|
||||
};
|
||||
|
||||
// Decodes `bytes` with the ICU converter, creating the converter on first use.
// Output is produced in ConversionBufferSize chunks. On an ICU failure the
// rest of the input is drained so the converter can be reused, and sawError is
// set; the text decoded so far is still returned.
String TextCodecICU::decode(std::span<const uint8_t> bytes, bool flush, bool stopOnError, bool& sawError)
{
    if (!m_converter)
        createICUConverter();
    if (!m_converter) {
        LOG_ERROR("error creating ICU encoder even though encoding was in table");
        sawError = true;
        return {};
    }

    ErrorCallbackSetter callbackSetter(*m_converter, stopOnError);

    StringBuilder output;
    UChar chunk[ConversionBufferSize];
    UChar* const chunkEnd = chunk + ConversionBufferSize;
    const char* cursor = byteCast<char>(bytes.data());
    const char* const inputEnd = cursor + bytes.size();
    int32_t* const noOffsets = nullptr;
    UErrorCode status = U_ZERO_ERROR;

    // Keep converting while ICU reports that the output chunk filled up.
    do {
        size_t produced = decodeToBuffer(chunk, chunkEnd, cursor, inputEnd, noOffsets, flush, status);
        output.append(std::span { chunk, produced });
    } while (needsToGrowToProduceBuffer(status));

    if (U_FAILURE(status)) {
        // flush the converter so it can be reused, and not be bothered by this error.
        do {
            decodeToBuffer(chunk, chunkEnd, cursor, inputEnd, noOffsets, true, status);
        } while (cursor < inputEnd);
        sawError = true;
    }

    return output.toString();
}
|
||||
|
||||
// Invalid character handler when writing escaped entities for unrepresentable
|
||||
// characters. See the declaration of TextCodec::encode for more.
|
||||
static void urlEscapedEntityCallback(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,
|
||||
UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* error)
|
||||
{
|
||||
if (reason == UCNV_UNASSIGNED) {
|
||||
*error = U_ZERO_ERROR;
|
||||
UnencodableReplacementArray entity;
|
||||
int entityLen = TextCodec::getUnencodableReplacement(codePoint, UnencodableHandling::URLEncodedEntities, entity);
|
||||
ucnv_cbFromUWriteBytes(fromUArgs, entity.data(), entityLen, 0, error);
|
||||
} else
|
||||
UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, error);
|
||||
}
|
||||
|
||||
// Encodes `string` with the ICU converter, creating it on first use. Returns
// an empty vector for empty input, when no converter can be created, or when
// installing the unencodable-character callback fails.
Vector<uint8_t> TextCodecICU::encode(StringView string, UnencodableHandling handling) const
{
    if (string.isEmpty())
        return {};

    if (!m_converter)
        createICUConverter();
    if (!m_converter)
        return {};

    // FIXME: We should see if there is "force ASCII range" mode in ICU;
    // until then, we change the backslash into a yen sign.
    // Encoding will change the yen sign back into a backslash.
    String yenSubstituted;
    if (shouldShowBackslashAsCurrencySymbolIn(m_encodingName)) {
        yenSubstituted = makeStringByReplacingAll(string, '\\', yenSign);
        string = yenSubstituted;
    }

    // Pick the callback ICU runs for characters the target encoding cannot represent.
    UErrorCode status;
    switch (handling) {
    case UnencodableHandling::Entities:
        status = U_ZERO_ERROR;
        ucnv_setFromUCallBack(m_converter.get(), UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, nullptr, nullptr, &status);
        if (U_FAILURE(status))
            return {};
        break;
    case UnencodableHandling::URLEncodedEntities:
        status = U_ZERO_ERROR;
        ucnv_setFromUCallBack(m_converter.get(), urlEscapedEntityCallback, nullptr, nullptr, nullptr, &status);
        if (U_FAILURE(status))
            return {};
        break;
    }

    auto upconvertedCharacters = string.upconvertedCharacters();
    auto* cursor = upconvertedCharacters.get();
    auto* inputEnd = cursor + string.length();

    // Convert in ConversionBufferSize chunks while ICU reports a full output buffer.
    Vector<uint8_t> encoded;
    do {
        char chunk[ConversionBufferSize];
        char* writeCursor = chunk;
        char* chunkEnd = writeCursor + ConversionBufferSize;
        status = U_ZERO_ERROR;
        ucnv_fromUnicode(m_converter.get(), &writeCursor, chunkEnd, &cursor, inputEnd, nullptr, true, &status);
        encoded.append(std::span(byteCast<uint8_t>(&chunk[0]), writeCursor - chunk));
    } while (needsToGrowToProduceBuffer(status));
    return encoded;
}
|
||||
|
||||
} // namespace PAL
|
||||
69
src/bun.js/bindings/webcore/TextCodecICU.h
Normal file
69
src/bun.js/bindings/webcore/TextCodecICU.h
Normal file
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include <wtf/unicode/icu/unicode/ucnv.h>
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
#include <wtf/text/ASCIILiteral.h>
|
||||
#include <wtf/unicode/icu/ICUHelpers.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
using ICUConverterPtr = std::unique_ptr<UConverter, ICUDeleter<ucnv_close>>;
|
||||
|
||||
class TextCodecICU final : public TextCodec {
|
||||
WTF_MAKE_TZONE_ALLOCATED(TextCodecICU);
|
||||
|
||||
public:
|
||||
static void registerEncodingNames(EncodingNameRegistrar);
|
||||
static void registerCodecs(TextCodecRegistrar);
|
||||
|
||||
explicit TextCodecICU(ASCIILiteral encoding, ASCIILiteral canonicalConverterName);
|
||||
virtual ~TextCodecICU();
|
||||
|
||||
private:
|
||||
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
|
||||
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
|
||||
|
||||
void createICUConverter() const;
|
||||
void releaseICUConverter() const;
|
||||
|
||||
int decodeToBuffer(UChar* buffer, UChar* bufferLimit, const char*& source, const char* sourceLimit, int32_t* offsets, bool flush, UErrorCode&);
|
||||
|
||||
ASCIILiteral m_encodingName;
|
||||
ASCIILiteral const m_canonicalConverterName;
|
||||
mutable ICUConverterPtr m_converter;
|
||||
};
|
||||
|
||||
// Holder for a single cached ICU converter. TextCodecICU moves its converter
// into `converter` on destruction and takes it back out when a later codec
// needs a converter with the same name.
struct ICUConverterWrapper {
|
||||
WTF_MAKE_STRUCT_FAST_ALLOCATED;
|
||||
|
||||
// Null while no converter is cached.
ICUConverterPtr converter;
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
254
src/bun.js/bindings/webcore/TextCodecLatin1.cpp
Normal file
254
src/bun.js/bindings/webcore/TextCodecLatin1.cpp
Normal file
@@ -0,0 +1,254 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodecLatin1.h"
|
||||
|
||||
#include "TextCodecASCIIFastPath.h"
|
||||
#include <array>
|
||||
#include <wtf/text/CString.h>
|
||||
#include <wtf/text/WTFString.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// Byte-to-UTF-16 lookup table for TextCodecLatin1 (registered below as
// "windows-1252"). Every byte maps to the code unit of the same value except
// in rows 80-9F, where most entries are code points above U+00FF.
static constexpr UChar latin1ConversionTable[256] = {
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, // 00-07
    0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, // 10-17
    0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, // 18-1F
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 20-27
    0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // 28-2F
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 30-37
    0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // 38-3F
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 40-47
    0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // 48-4F
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 50-57
    0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // 58-5F
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 60-67
    0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // 68-6F
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 70-77
    0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // 78-7F
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
    0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, // A0-A7
    0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, // A8-AF
    0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, // B0-B7
    0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, // B8-BF
    0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, // C0-C7
    0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, // C8-CF
    0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, // D0-D7
    0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, // D8-DF
    0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, // E0-E7
    0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, // E8-EF
    0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, // F0-F7
    0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF // F8-FF
};
|
||||
|
||||
// Registers "windows-1252" and all of its labels; every label resolves to the
// canonical name "windows-1252".
void TextCodecLatin1::registerEncodingNames(EncodingNameRegistrar registrar)
{
    // From https://encoding.spec.whatwg.org. The canonical name comes first.
    static constexpr ASCIILiteral labels[] {
        "windows-1252"_s,
        "ansi_x3.4-1968"_s,
        "ascii"_s,
        "cp1252"_s,
        "cp819"_s,
        "csisolatin1"_s,
        "ibm819"_s,
        "iso-8859-1"_s,
        "iso-ir-100"_s,
        "iso8859-1"_s,
        "iso88591"_s,
        "iso_8859-1"_s,
        "iso_8859-1:1987"_s,
        "l1"_s,
        "latin1"_s,
        "us-ascii"_s,
        "x-cp1252"_s,
    };

    for (auto label : labels)
        registrar(label, "windows-1252"_s);
}
|
||||
|
||||
// Registers the factory that creates a TextCodecLatin1 for "windows-1252".
void TextCodecLatin1::registerCodecs(TextCodecRegistrar registrar)
{
    registrar("windows-1252"_s, []() -> std::unique_ptr<TextCodec> {
        return makeUnique<TextCodecLatin1>();
    });
}
|
||||
|
||||
// Decodes `bytes` through latin1ConversionTable into a String.
//
// The result is built as an 8-bit string for as long as every mapped character
// satisfies isLatin1(). The first character that does not triggers a switch to
// a 16-bit string: the already-decoded 8-bit prefix is zero-extended into the
// new buffer and decoding continues there.
//
// In both phases, runs of ASCII are copied one machine word at a time whenever
// the read position is word-aligned.
//
// The two unnamed bool parameters are ignored. `sawException` is only written
// (set to true) when the input is longer than an `unsigned` can describe, in
// which case an empty string is returned.
String TextCodecLatin1::decode(std::span<const uint8_t> bytes, bool, bool, bool& sawException)
{
    if (bytes.empty())
        return emptyString();

    if (UNLIKELY(bytes.size() > std::numeric_limits<unsigned>::max())) {
        ASSERT_NOT_REACHED();
        sawException = true;
        return emptyString();
    }

    constexpr auto wordSize = sizeof(WTF::MachineWord);

    LChar* buffer8;
    String narrowResult = String::createUninitialized(bytes.size(), buffer8);

    const uint8_t* cursor = bytes.data();
    const uint8_t* const limit = bytes.data() + bytes.size();
    const uint8_t* const wordLimit = WTF::alignToMachineWord(limit);
    LChar* out8 = buffer8;
    bool needsWideBuffer = false;

    // Phase 1: fill the 8-bit buffer.
    for (; cursor < limit; ++cursor, ++out8) {
        bool consultTable = !isASCII(*cursor);

        if (!consultTable && WTF::isAlignedToMachineWord(cursor)) {
            // Bulk-copy whole words for as long as they contain only ASCII.
            for (; cursor < wordLimit; cursor += wordSize, out8 += wordSize) {
                const auto word = *reinterpret_cast_ptr<const WTF::MachineWord*>(cursor);
                if (!WTF::containsOnlyASCII<LChar>(word)) {
                    // Something in this word is not ASCII; handle the byte at
                    // the cursor through the lookup table.
                    consultTable = true;
                    break;
                }
                copyASCIIMachineWord(out8, cursor);
            }

            if (!consultTable) {
                if (cursor == limit)
                    break;
                // The cursor moved during the bulk copy, so the byte it now
                // points at has to be re-examined.
                consultTable = !isASCII(*cursor);
            }
        }

        if (!consultTable) {
            *out8 = *cursor;
            continue;
        }

        const auto mapped = latin1ConversionTable[*cursor];
        if (!isLatin1(mapped)) {
            // Leave cursor/out8 untouched: phase 2 resumes from exactly here.
            needsWideBuffer = true;
            break;
        }
        *out8 = mapped;
    }

    if (!needsWideBuffer)
        return narrowResult;

    // Phase 2: restart in a 16-bit buffer.
    UChar* buffer16;
    String wideResult = String::createUninitialized(bytes.size(), buffer16);
    UChar* out16 = buffer16;

    // Zero-extend and copy the 8-bit data decoded so far.
    for (const LChar* decoded = buffer8; decoded < out8; ++decoded)
        *out16++ = *decoded;

    // Emit the character that forced the switch to 16 bits.
    *out16++ = latin1ConversionTable[*cursor++];

    for (; cursor < limit; ++cursor, ++out16) {
        bool consultTable = !isASCII(*cursor);

        if (!consultTable && WTF::isAlignedToMachineWord(cursor)) {
            for (; cursor < wordLimit; cursor += wordSize, out16 += wordSize) {
                const auto word = *reinterpret_cast_ptr<const WTF::MachineWord*>(cursor);
                if (!WTF::containsOnlyASCII<LChar>(word)) {
                    consultTable = true;
                    break;
                }
                copyASCIIMachineWord(out16, cursor);
            }

            if (!consultTable) {
                if (cursor == limit)
                    break;
                consultTable = !isASCII(*cursor);
            }
        }

        if (consultTable)
            *out16 = latin1ConversionTable[*cursor];
        else
            *out16 = *cursor;
    }

    return wideResult;
}
|
||||
|
||||
// Slow-path encoder used when the input is not pure ASCII.
//
// Code points in 00-7F and A0-FF are emitted as the identical byte. Anything
// else is searched for in entries 0x80-0x9F of latin1ConversionTable; when no
// entry matches, the replacement produced by
// TextCodec::getUnencodableReplacement() for `handling` is appended instead.
static Vector<uint8_t> encodeComplexWindowsLatin1(StringView string, UnencodableHandling handling)
{
    Vector<uint8_t> encoded;

    for (auto codePoint : string.codePoints()) {
        // Fits in one byte and lies outside 80-9F: the byte is the code point.
        if (codePoint <= 0xFF && (codePoint & 0xE0) != 0x80) {
            encoded.append(static_cast<uint8_t>(codePoint));
            continue;
        }

        // Look for a byte in 80-9F whose table entry is this code point.
        bool foundByte = false;
        for (unsigned candidate = 0x80; candidate < 0xA0; ++candidate) {
            if (latin1ConversionTable[candidate] == codePoint) {
                encoded.append(static_cast<uint8_t>(candidate));
                foundByte = true;
                break;
            }
        }
        if (foundByte)
            continue;

        // No way to encode this character with Windows Latin-1.
        UnencodableReplacementArray replacement;
        int replacementLength = TextCodec::getUnencodableReplacement(codePoint, handling, replacement);
        encoded.append(std::span(replacement.data(), replacementLength));
    }

    return encoded;
}
|
||||
|
||||
// Encodes `string` to bytes.
//
// Optimistically truncates every code unit to a byte while OR-ing all code
// units together. If the accumulated bits show nothing outside 00-7F, that
// truncated copy is the answer. Otherwise it is discarded and the string is
// re-encoded by encodeComplexWindowsLatin1().
Vector<uint8_t> TextCodecLatin1::encode(StringView string, UnencodableHandling handling) const
{
    {
        // Scoped so the speculative buffer is released before the slow path runs.
        Vector<uint8_t> truncated(string.length());
        auto* out = truncated.data();
        UChar seenBits = 0;

        for (auto codeUnit : string.codeUnits()) {
            seenBits |= codeUnit;
            *out++ = codeUnit;
        }

        const bool onlyASCII = !(seenBits & 0xFF80);
        if (onlyASCII)
            return truncated;
    }

    // Not all ASCII: defer to the function that handles the more complex cases.
    return encodeComplexWindowsLatin1(string, handling);
}
|
||||
|
||||
} // namespace PAL
|
||||
42
src/bun.js/bindings/webcore/TextCodecLatin1.h
Normal file
42
src/bun.js/bindings/webcore/TextCodecLatin1.h
Normal file
@@ -0,0 +1,42 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodec.h"
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// TextCodec implementation whose factory is registered under the encoding
// name "windows-1252".
class TextCodecLatin1 final : public TextCodec {
public:
    // Registers the encoding labels that resolve to this codec.
    static void registerEncodingNames(EncodingNameRegistrar);
    // Registers the factory that creates instances of this codec.
    static void registerCodecs(TextCodecRegistrar);

private:
    // Converts bytes to a String. The definition leaves the `flush` and
    // `stopOnError` parameters unnamed, i.e. it does not consult them.
    String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
    // Converts a string to bytes, applying the given UnencodableHandling to
    // characters that have no byte representation.
    Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
};
|
||||
|
||||
} // namespace PAL
|
||||
70
src/bun.js/bindings/webcore/TextCodecReplacement.cpp
Normal file
70
src/bun.js/bindings/webcore/TextCodecReplacement.cpp
Normal file
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (C) 2016-2017 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
||||
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
|
||||
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||
* THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodecReplacement.h"
|
||||
|
||||
#include <wtf/Function.h>
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
#include <wtf/text/WTFString.h>
|
||||
#include <wtf/unicode/CharacterNames.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecReplacement);
|
||||
|
||||
// Maps the canonical name "replacement" and each of its legacy labels to the
// "replacement" encoding.
void TextCodecReplacement::registerEncodingNames(EncodingNameRegistrar registrar)
{
    const auto canonicalName = "replacement"_s;

    // The canonical name comes first, followed by its aliases.
    for (auto label : {
             "replacement"_s,
             "csiso2022kr"_s,
             "hz-gb-2312"_s,
             "iso-2022-cn"_s,
             "iso-2022-cn-ext"_s,
             "iso-2022-kr"_s }) {
        registrar(label, canonicalName);
    }
}
|
||||
|
||||
// Hands the registrar a factory that builds a TextCodecReplacement for the
// encoding name "replacement".
void TextCodecReplacement::registerCodecs(TextCodecRegistrar registrar)
{
    registrar("replacement"_s, []() -> std::unique_ptr<TextCodec> {
        return makeUnique<TextCodecReplacement>();
    });
}
|
||||
|
||||
// Ignores the input entirely. Every call reports an error through `sawError`.
// The first call on a codec instance returns a single replacement character;
// every later call returns the empty string.
String TextCodecReplacement::decode(std::span<const uint8_t>, bool, bool, bool& sawError)
{
    sawError = true;

    if (!m_sentEOF) {
        // First call: emit the one and only replacement character.
        m_sentEOF = true;
        return span(replacementCharacter);
    }

    return emptyString();
}
|
||||
|
||||
// Always produces an empty byte vector: neither the string nor the
// unencodable-handling mode is consulted. Both parameters are left unnamed so
// the definition does not trigger unused-parameter warnings and matches the
// declaration in the header.
Vector<uint8_t> TextCodecReplacement::encode(StringView, UnencodableHandling) const
{
    return {};
}
|
||||
|
||||
} // namespace PAL
|
||||
47
src/bun.js/bindings/webcore/TextCodecReplacement.h
Normal file
47
src/bun.js/bindings/webcore/TextCodecReplacement.h
Normal file
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Copyright (C) 2016-2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
||||
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
|
||||
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||
* THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// TextCodec implementation whose factory is registered under the encoding
// name "replacement".
class TextCodecReplacement final : public TextCodec {
    WTF_MAKE_TZONE_ALLOCATED(TextCodecReplacement);

public:
    // Registers the encoding labels that resolve to this codec.
    static void registerEncodingNames(EncodingNameRegistrar);
    // Registers the factory that creates instances of this codec.
    static void registerCodecs(TextCodecRegistrar);

private:
    // Always sets sawError; returns one replacement character on the first
    // call and the empty string afterwards.
    String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
    // Always returns an empty vector.
    Vector<uint8_t> encode(StringView, UnencodableHandling) const final;

    // Set by the first decode() call, once the replacement character has been
    // emitted.
    bool m_sentEOF { false };
};
|
||||
|
||||
} // namespace PAL
|
||||
466
src/bun.js/bindings/webcore/TextCodecSingleByte.cpp
Normal file
466
src/bun.js/bindings/webcore/TextCodecSingleByte.cpp
Normal file
@@ -0,0 +1,466 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodecSingleByte.h"
|
||||
|
||||
#include "EncodingTables.h"
|
||||
#include <mutex>
|
||||
#include <wtf/IteratorRange.h>
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
#include <wtf/text/CodePointIterator.h>
|
||||
#include <wtf/text/StringBuilder.h>
|
||||
#include <wtf/unicode/CharacterNames.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecSingleByte);
|
||||
|
||||
// Identifies which single-byte encoding a TextCodecSingleByte instance
// implements. Each enumerator selects one of the decode tables defined in this
// file through tableForDecoding() / tableForEncoding().
enum class TextCodecSingleByte::Encoding : uint8_t {
    ISO_8859_3,
    ISO_8859_6,
    ISO_8859_7,
    ISO_8859_8,
    Windows_874,
    Windows_1253,
    Windows_1255,
    Windows_1257,
    IBM866,
    KOI8U,
};
|
||||
|
||||
// Decode table for the upper half of a single-byte encoding: 128 UTF-16 code
// units, where entry [byte - 0x80] is the character for that byte.
using SingleByteDecodeTable = std::array<UChar, 128>;
// One encode mapping: { UTF-16 code unit, byte that encodes it }.
using SingleByteEncodeTableEntry = std::pair<UChar, uint8_t>;
// Range over encode entries; built sorted by code unit so it can be searched
// with findFirstInSortedPairs().
using SingleByteEncodeTable = IteratorRange<const SingleByteEncodeTableEntry*>;
|
||||
|
||||
// From https://encoding.spec.whatwg.org/index-iso-8859-3.txt with 0xFFFD filling the gaps
// Entry [byte - 0x80] is the character for bytes 0x80-0xFF; each row below covers 16 bytes.
// Entries equal to the replacement character are treated as unmapped by the code in this file.
static constexpr SingleByteDecodeTable iso88593 {
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
    0x00A0, 0x0126, 0x02D8, 0x00A3, 0x00A4, 0xFFFD, 0x0124, 0x00A7, 0x00A8, 0x0130, 0x015E, 0x011E, 0x0134, 0x00AD, 0xFFFD, 0x017B,
    0x00B0, 0x0127, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x0125, 0x00B7, 0x00B8, 0x0131, 0x015F, 0x011F, 0x0135, 0x00BD, 0xFFFD, 0x017C,
    0x00C0, 0x00C1, 0x00C2, 0xFFFD, 0x00C4, 0x010A, 0x0108, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
    0xFFFD, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x0120, 0x00D6, 0x00D7, 0x011C, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x016C, 0x015C, 0x00DF,
    0x00E0, 0x00E1, 0x00E2, 0xFFFD, 0x00E4, 0x010B, 0x0109, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
    0xFFFD, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x0121, 0x00F6, 0x00F7, 0x011D, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x016D, 0x015D, 0x02D9
};
|
||||
|
||||
// From https://encoding.spec.whatwg.org/index-iso-8859-6.txt with 0xFFFD filling the gaps
// Entry [byte - 0x80] is the character for bytes 0x80-0xFF; each row below covers 16 bytes.
// Entries equal to the replacement character are treated as unmapped by the code in this file.
static constexpr SingleByteDecodeTable iso88596 {
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
    0x00A0, 0xFFFD, 0xFFFD, 0xFFFD, 0x00A4, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x060C, 0x00AD, 0xFFFD, 0xFFFD,
    0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x061B, 0xFFFD, 0xFFFD, 0xFFFD, 0x061F,
    0xFFFD, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627, 0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E, 0x062F,
    0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637, 0x0638, 0x0639, 0x063A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
    0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, 0x0648, 0x0649, 0x064A, 0x064B, 0x064C, 0x064D, 0x064E, 0x064F,
    0x0650, 0x0651, 0x0652, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD
};
|
||||
|
||||
// From https://encoding.spec.whatwg.org/index-iso-8859-7.txt with 0xFFFD filling the gaps
// Entry [byte - 0x80] is the character for bytes 0x80-0xFF; each row below covers 16 bytes.
// Entries equal to the replacement character are treated as unmapped by the code in this file.
static constexpr SingleByteDecodeTable iso88597 {
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
    0x00A0, 0x2018, 0x2019, 0x00A3, 0x20AC, 0x20AF, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x037A, 0x00AB, 0x00AC, 0x00AD, 0xFFFD, 0x2015,
    0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x0385, 0x0386, 0x00B7, 0x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E, 0x038F,
    0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
    0x03A0, 0x03A1, 0xFFFD, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
    0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
    0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0xFFFD
};
|
||||
|
||||
// From https://encoding.spec.whatwg.org/index-iso-8859-8.txt with 0xFFFD filling the gaps
// Entry [byte - 0x80] is the character for bytes 0x80-0xFF; each row below covers 16 bytes.
// Entries equal to the replacement character are treated as unmapped by the code in this file.
static constexpr SingleByteDecodeTable iso88598 {
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
    0x00A0, 0xFFFD, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
    0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0xFFFD,
    0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
    0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x2017,
    0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
    0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA, 0xFFFD, 0xFFFD, 0x200E, 0x200F, 0xFFFD
};
|
||||
|
||||
// From https://encoding.spec.whatwg.org/index-windows-874.txt with 0xFFFD filling the gaps
// Entry [byte - 0x80] is the character for bytes 0x80-0xFF; each row below covers 16 bytes.
// Entries equal to the replacement character are treated as unmapped by the code in this file.
static constexpr SingleByteDecodeTable windows874 {
    0x20AC, 0x0081, 0x0082, 0x0083, 0x0084, 0x2026, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
    0x00A0, 0x0E01, 0x0E02, 0x0E03, 0x0E04, 0x0E05, 0x0E06, 0x0E07, 0x0E08, 0x0E09, 0x0E0A, 0x0E0B, 0x0E0C, 0x0E0D, 0x0E0E, 0x0E0F,
    0x0E10, 0x0E11, 0x0E12, 0x0E13, 0x0E14, 0x0E15, 0x0E16, 0x0E17, 0x0E18, 0x0E19, 0x0E1A, 0x0E1B, 0x0E1C, 0x0E1D, 0x0E1E, 0x0E1F,
    0x0E20, 0x0E21, 0x0E22, 0x0E23, 0x0E24, 0x0E25, 0x0E26, 0x0E27, 0x0E28, 0x0E29, 0x0E2A, 0x0E2B, 0x0E2C, 0x0E2D, 0x0E2E, 0x0E2F,
    0x0E30, 0x0E31, 0x0E32, 0x0E33, 0x0E34, 0x0E35, 0x0E36, 0x0E37, 0x0E38, 0x0E39, 0x0E3A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x0E3F,
    0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0x0E45, 0x0E46, 0x0E47, 0x0E48, 0x0E49, 0x0E4A, 0x0E4B, 0x0E4C, 0x0E4D, 0x0E4E, 0x0E4F,
    0x0E50, 0x0E51, 0x0E52, 0x0E53, 0x0E54, 0x0E55, 0x0E56, 0x0E57, 0x0E58, 0x0E59, 0x0E5A, 0x0E5B, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD
};
|
||||
|
||||
// From https://encoding.spec.whatwg.org/index-windows-1253.txt with 0xFFFD filling the gaps
// Entry [byte - 0x80] is the character for bytes 0x80-0xFF; each row below covers 16 bytes.
// Entries equal to the replacement character are treated as unmapped by the code in this file.
static constexpr SingleByteDecodeTable windows1253 {
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x0088, 0x2030, 0x008A, 0x2039, 0x008C, 0x008D, 0x008E, 0x008F,
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x0098, 0x2122, 0x009A, 0x203A, 0x009C, 0x009D, 0x009E, 0x009F,
    0x00A0, 0x0385, 0x0386, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0xFFFD, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x2015,
    0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x00B5, 0x00B6, 0x00B7, 0x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E, 0x038F,
    0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
    0x03A0, 0x03A1, 0xFFFD, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
    0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
    0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0xFFFD
};
|
||||
|
||||
// NOTE(review): unlike its sibling tables this one carries no provenance comment. It is presumably
// taken from https://encoding.spec.whatwg.org/index-windows-1255.txt with 0xFFFD filling the gaps - confirm.
// Entry [byte - 0x80] is the character for bytes 0x80-0xFF; each row below covers 16 bytes.
// Entries equal to the replacement character are treated as unmapped by the code in this file.
static constexpr SingleByteDecodeTable windows1255 {
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030, 0x008A, 0x2039, 0x008C, 0x008D, 0x008E, 0x008F,
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x02DC, 0x2122, 0x009A, 0x203A, 0x009C, 0x009D, 0x009E, 0x009F,
    0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x20AA, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
    0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
    0x05B0, 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, 0x05B8, 0x05B9, 0x05BA, 0x05BB, 0x05BC, 0x05BD, 0x05BE, 0x05BF,
    0x05C0, 0x05C1, 0x05C2, 0x05C3, 0x05F0, 0x05F1, 0x05F2, 0x05F3, 0x05F4, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
    0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
    0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA, 0xFFFD, 0xFFFD, 0x200E, 0x200F, 0xFFFD
};
|
||||
|
||||
// From https://encoding.spec.whatwg.org/index-windows-1257.txt with 0xFFFD filling the gaps
// Entry [byte - 0x80] is the character for bytes 0x80-0xFF; each row below covers 16 bytes.
// Entries equal to the replacement character are treated as unmapped by the code in this file.
static constexpr SingleByteDecodeTable windows1257 {
    0x20AC, 0x0081, 0x201A, 0x0083, 0x201E, 0x2026, 0x2020, 0x2021, 0x0088, 0x2030, 0x008A, 0x2039, 0x008C, 0x00A8, 0x02C7, 0x00B8,
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x0098, 0x2122, 0x009A, 0x203A, 0x009C, 0x00AF, 0x02DB, 0x009F,
    0x00A0, 0xFFFD, 0x00A2, 0x00A3, 0x00A4, 0xFFFD, 0x00A6, 0x00A7, 0x00D8, 0x00A9, 0x0156, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00C6,
    0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00F8, 0x00B9, 0x0157, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00E6,
    0x0104, 0x012E, 0x0100, 0x0106, 0x00C4, 0x00C5, 0x0118, 0x0112, 0x010C, 0x00C9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012A, 0x013B,
    0x0160, 0x0143, 0x0145, 0x00D3, 0x014C, 0x00D5, 0x00D6, 0x00D7, 0x0172, 0x0141, 0x015A, 0x016A, 0x00DC, 0x017B, 0x017D, 0x00DF,
    0x0105, 0x012F, 0x0101, 0x0107, 0x00E4, 0x00E5, 0x0119, 0x0113, 0x010D, 0x00E9, 0x017A, 0x0117, 0x0123, 0x0137, 0x012B, 0x013C,
    0x0161, 0x0144, 0x0146, 0x00F3, 0x014D, 0x00F5, 0x00F6, 0x00F7, 0x0173, 0x0142, 0x015B, 0x016B, 0x00FC, 0x017C, 0x017E, 0x02D9
};
|
||||
|
||||
// From https://encoding.spec.whatwg.org/index-koi8-u.txt
// Entry [byte - 0x80] is the character for bytes 0x80-0xFF; each row below covers 16 bytes.
// This table contains no 0xFFFD entries, so every upper-half byte has a mapping.
static constexpr SingleByteDecodeTable koi8u {
    0x2500, 0x2502, 0x250C, 0x2510, 0x2514, 0x2518, 0x251C, 0x2524, 0x252C, 0x2534, 0x253C, 0x2580, 0x2584, 0x2588, 0x258C, 0x2590,
    0x2591, 0x2592, 0x2593, 0x2320, 0x25A0, 0x2219, 0x221A, 0x2248, 0x2264, 0x2265, 0x00A0, 0x2321, 0x00B0, 0x00B2, 0x00B7, 0x00F7,
    0x2550, 0x2551, 0x2552, 0x0451, 0x0454, 0x2554, 0x0456, 0x0457, 0x2557, 0x2558, 0x2559, 0x255A, 0x255B, 0x0491, 0x045E, 0x255E,
    0x255F, 0x2560, 0x2561, 0x0401, 0x0404, 0x2563, 0x0406, 0x0407, 0x2566, 0x2567, 0x2568, 0x2569, 0x256A, 0x0490, 0x040E, 0x00A9,
    0x044E, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, 0x0445, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E,
    0x043F, 0x044F, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, 0x044C, 0x044B, 0x0437, 0x0448, 0x044D, 0x0449, 0x0447, 0x044A,
    0x042E, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, 0x0425, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E,
    0x041F, 0x042F, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, 0x042C, 0x042B, 0x0417, 0x0428, 0x042D, 0x0429, 0x0427, 0x042A
};
|
||||
|
||||
// From https://encoding.spec.whatwg.org/index-ibm866.txt
// Entry [byte - 0x80] is the character for bytes 0x80-0xFF; each row below covers 16 bytes.
// This table contains no 0xFFFD entries, so every upper-half byte has a mapping.
static constexpr SingleByteDecodeTable ibm866 {
    0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
    0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
    0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
    0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
    0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
    0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
    0x0401, 0x0451, 0x0404, 0x0454, 0x0407, 0x0457, 0x040E, 0x045E, 0x00B0, 0x2219, 0x00B7, 0x221A, 0x2116, 0x00A4, 0x25A0, 0x00A0
};
|
||||
|
||||
// Returns the encode table (code unit -> byte, sorted by code unit) derived
// from `decodeTable`. The table is built on first use, once per decode table,
// and the allocation is kept for the rest of the process.
template<const SingleByteDecodeTable& decodeTable> SingleByteEncodeTable tableForEncoding()
{
    // Allocate this at runtime because building it at compile time would make the binary much larger and this is often not used.
    // FIXME: With the C++20 version of std::count, we should be able to change this from const to constexpr and compute the size at compile time.
    static const auto size = std::size(decodeTable) - std::count(std::begin(decodeTable), std::end(decodeTable), replacementCharacter);
    static const SingleByteEncodeTableEntry* entries;
    static std::once_flag once;

    std::call_once(once, [&] {
        auto* storage = new SingleByteEncodeTableEntry[size];
        auto* nextEntry = storage;

        // Walk the decode table in byte order; the first entry belongs to byte 0x80.
        uint8_t byte = 0x80;
        for (UChar codeUnit : decodeTable) {
            // Replacement-character entries are gaps and get no encode mapping.
            if (codeUnit != replacementCharacter)
                *nextEntry++ = { codeUnit, byte };
            ++byte;
        }
        ASSERT(static_cast<size_t>(nextEntry - storage) == size);

        auto collection = WTF::makeIteratorRange(storage, storage + size);
        sortByFirst(collection);
        ASSERT(sortedFirstsAreUnique(collection));

        entries = storage;
    });

    return WTF::makeIteratorRange(entries, entries + size);
}
|
||||
|
||||
// Selects the lazily built encode table that corresponds to `encoding`.
static SingleByteEncodeTable tableForEncoding(TextCodecSingleByte::Encoding encoding)
{
    using Encoding = TextCodecSingleByte::Encoding;

    switch (encoding) {
    case Encoding::ISO_8859_3:
        return tableForEncoding<iso88593>();
    case Encoding::ISO_8859_6:
        return tableForEncoding<iso88596>();
    case Encoding::ISO_8859_7:
        return tableForEncoding<iso88597>();
    case Encoding::ISO_8859_8:
        return tableForEncoding<iso88598>();
    case Encoding::Windows_874:
        return tableForEncoding<windows874>();
    case Encoding::Windows_1253:
        return tableForEncoding<windows1253>();
    case Encoding::Windows_1255:
        return tableForEncoding<windows1255>();
    case Encoding::Windows_1257:
        return tableForEncoding<windows1257>();
    case Encoding::IBM866:
        return tableForEncoding<ibm866>();
    case Encoding::KOI8U:
        return tableForEncoding<koi8u>();
    }

    // Every enumerator returns above; reaching this point means `encoding`
    // holds a value outside the enum.
    RELEASE_ASSERT_NOT_REACHED();
}
|
||||
|
||||
// Selects the constant decode table that corresponds to `encoding`.
static const SingleByteDecodeTable& tableForDecoding(TextCodecSingleByte::Encoding encoding)
{
    using Encoding = TextCodecSingleByte::Encoding;

    switch (encoding) {
    case Encoding::ISO_8859_3:
        return iso88593;
    case Encoding::ISO_8859_6:
        return iso88596;
    case Encoding::ISO_8859_7:
        return iso88597;
    case Encoding::ISO_8859_8:
        return iso88598;
    case Encoding::Windows_874:
        return windows874;
    case Encoding::Windows_1253:
        return windows1253;
    case Encoding::Windows_1255:
        return windows1255;
    case Encoding::Windows_1257:
        return windows1257;
    case Encoding::IBM866:
        return ibm866;
    case Encoding::KOI8U:
        return koi8u;
    }

    // Every enumerator returns above; reaching this point means `encoding`
    // holds a value outside the enum.
    RELEASE_ASSERT_NOT_REACHED();
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#single-byte-encoder
|
||||
static Vector<uint8_t> encode(const SingleByteEncodeTable& table, StringView string, Function<void(char32_t, Vector<uint8_t>&)>&& unencodableHandler)
|
||||
{
|
||||
// FIXME: Consider adding an ASCII fast path like the one in TextCodecLatin1::decode.
|
||||
Vector<uint8_t> result;
|
||||
result.reserveInitialCapacity(string.length());
|
||||
for (auto codePoint : string.codePoints()) {
|
||||
if (isASCII(codePoint)) {
|
||||
result.append(codePoint);
|
||||
continue;
|
||||
}
|
||||
auto byte = findFirstInSortedPairs(table, codePoint);
|
||||
if (!byte) {
|
||||
unencodableHandler(codePoint, result);
|
||||
continue;
|
||||
}
|
||||
result.append(*byte);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#single-byte-decoder
|
||||
static String decode(const SingleByteDecodeTable& table, std::span<const uint8_t> bytes, bool, bool stopOnError, bool& sawError)
|
||||
{
|
||||
StringBuilder result;
|
||||
result.reserveCapacity(bytes.size());
|
||||
auto parseByte = [&] (uint8_t byte) {
|
||||
if (isASCII(byte)) {
|
||||
result.append(byte);
|
||||
return;
|
||||
}
|
||||
UChar codePoint = table[byte - 0x80];
|
||||
if (codePoint == replacementCharacter)
|
||||
sawError = true;
|
||||
result.append(codePoint);
|
||||
};
|
||||
if (stopOnError) {
|
||||
for (auto byte : bytes) {
|
||||
parseByte(byte);
|
||||
if (sawError)
|
||||
return result.toString();
|
||||
}
|
||||
} else {
|
||||
for (auto byte : bytes)
|
||||
parseByte(byte);
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
// Encodes `string` with the encode table belonging to this codec's encoding,
// routing unencodable code points through the handler chosen by `handling`.
Vector<uint8_t> TextCodecSingleByte::encode(StringView string, UnencodableHandling handling) const
{
    const SingleByteEncodeTable encodeTable = tableForEncoding(m_encoding);
    return PAL::encode(encodeTable, string, unencodableHandler(handling));
}
|
||||
|
||||
// Decodes `bytes` with the decode table belonging to this codec's encoding;
// all remaining arguments are forwarded unchanged to the shared decoder.
String TextCodecSingleByte::decode(std::span<const uint8_t> bytes, bool flush, bool stopOnError, bool& sawError)
{
    const SingleByteDecodeTable& decodeTable = tableForDecoding(m_encoding);
    return PAL::decode(decodeTable, bytes, flush, stopOnError, sawError);
}
|
||||
|
||||
TextCodecSingleByte::TextCodecSingleByte(Encoding encoding)
|
||||
: m_encoding(encoding)
|
||||
{
|
||||
}
|
||||
|
||||
// Registers every label handled by this codec with |registrar|.
// Each group below starts with the canonical encoding name; every entry of the
// group (the canonical name included) is registered as an alias of it.
void TextCodecSingleByte::registerEncodingNames(EncodingNameRegistrar registrar)
{
    // https://encoding.spec.whatwg.org/#names-and-labels
    auto registerGroup = [&](std::initializer_list<ASCIILiteral> names) {
        const auto& canonicalName = *names.begin();
        for (auto& name : names)
            registrar(name, canonicalName);
    };

    registerGroup({ "ISO-8859-3"_s,
        "csisolatin3"_s,
        "iso-ir-109"_s,
        "iso8859-3"_s,
        "iso88593"_s,
        "iso_8859-3"_s,
        "iso_8859-3:1988"_s,
        "l3"_s,
        "latin3"_s });

    registerGroup({ "ISO-8859-6"_s,
        "arabic"_s,
        "asmo-708"_s,
        "csiso88596e"_s,
        "csiso88596i"_s,
        "csisolatinarabic"_s,
        "ecma-114"_s,
        "iso-8859-6-e"_s,
        "iso-8859-6-i"_s,
        "iso-ir-127"_s,
        "iso8859-6"_s,
        "iso88596"_s,
        "iso_8859-6"_s,
        "iso_8859-6:1987"_s });

    registerGroup({ "ISO-8859-7"_s,
        "csisolatingreek"_s,
        "ecma-118"_s,
        "elot_928"_s,
        "greek"_s,
        "greek8"_s,
        "iso-ir-126"_s,
        "iso8859-7"_s,
        "iso88597"_s,
        "iso_8859-7"_s,
        "iso_8859-7:1987"_s,
        "sun_eu_greek"_s });

    registerGroup({ "ISO-8859-8"_s,
        "csiso88598e"_s,
        "csisolatinhebrew"_s,
        "hebrew"_s,
        "iso-8859-8-e"_s,
        "iso-ir-138"_s,
        "iso8859-8"_s,
        "iso88598"_s,
        "iso_8859-8"_s,
        "iso_8859-8:1988"_s,
        "visual"_s });

    registerGroup({ "ISO-8859-8-I"_s,
        "csiso88598i"_s,
        "logical"_s });

    registerGroup({ "windows-874"_s,
        "dos-874"_s,
        "iso-8859-11"_s,
        "iso8859-11"_s,
        "iso885911"_s,
        "tis-620"_s });

    registerGroup({ "windows-1253"_s,
        "cp1253"_s,
        "x-cp1253"_s });

    registerGroup({ "windows-1255"_s,
        "cp1255"_s,
        "x-cp1255"_s });

    registerGroup({ "windows-1257"_s,
        "cp1257"_s,
        "x-cp1257"_s });

    registerGroup({ "KOI8-U"_s,
        "koi8-ru"_s });

    registerGroup({ "IBM866"_s,
        "866"_s,
        "cp866"_s,
        "csibm866"_s });
}
|
||||
|
||||
// Registers one codec factory per canonical encoding name handled by this
// class. Each factory creates a TextCodecSingleByte for the matching Encoding.
void TextCodecSingleByte::registerCodecs(TextCodecRegistrar registrar)
{
    // ISO-8859 family.
    registrar("ISO-8859-3"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_3); });
    registrar("ISO-8859-6"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_6); });
    registrar("ISO-8859-7"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_7); });
    registrar("ISO-8859-8"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_8); });
    // ISO-8859-8-I is registered with the same Encoding value as ISO-8859-8.
    registrar("ISO-8859-8-I"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_8); });

    // Windows code pages.
    registrar("windows-874"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::Windows_874); });
    registrar("windows-1253"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::Windows_1253); });
    registrar("windows-1255"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::Windows_1255); });
    registrar("windows-1257"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::Windows_1257); });

    // Cyrillic encodings.
    registrar("KOI8-U"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::KOI8U); });
    registrar("IBM866"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::IBM866); });
}
|
||||
|
||||
}
|
||||
49
src/bun.js/bindings/webcore/TextCodecSingleByte.h
Normal file
49
src/bun.js/bindings/webcore/TextCodecSingleByte.h
Normal file
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextCodecSingleByte final : public TextCodec {
|
||||
WTF_MAKE_TZONE_ALLOCATED(TextCodecSingleByte);
|
||||
public:
|
||||
static void registerEncodingNames(EncodingNameRegistrar);
|
||||
static void registerCodecs(TextCodecRegistrar);
|
||||
|
||||
enum class Encoding : uint8_t;
|
||||
explicit TextCodecSingleByte(Encoding);
|
||||
|
||||
private:
|
||||
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
|
||||
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
|
||||
|
||||
const Encoding m_encoding;
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
100
src/bun.js/bindings/webcore/TextCodecUserDefined.cpp
Normal file
100
src/bun.js/bindings/webcore/TextCodecUserDefined.cpp
Normal file
@@ -0,0 +1,100 @@
|
||||
/*
|
||||
* Copyright (C) 2007-2017 Apple, Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodecUserDefined.h"
|
||||
|
||||
#include <array>
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
#include <wtf/text/CString.h>
|
||||
#include <wtf/text/StringBuilder.h>
|
||||
#include <wtf/text/WTFString.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecUserDefined);
|
||||
|
||||
// Registers the single label of this codec; the label is its own canonical name.
void TextCodecUserDefined::registerEncodingNames(EncodingNameRegistrar registrar)
{
    auto userDefinedName = "x-user-defined"_s;
    registrar(userDefinedName, userDefinedName);
}
|
||||
|
||||
// Registers the factory that creates a TextCodecUserDefined for "x-user-defined".
void TextCodecUserDefined::registerCodecs(TextCodecRegistrar registrar)
{
    registrar("x-user-defined"_s, [] { return makeUnique<TextCodecUserDefined>(); });
}
|
||||
|
||||
// Decodes x-user-defined bytes: 0x00-0x7F map to the same code unit and
// 0x80-0xFF map to U+F780-U+F7FF. The flush, stopOnError and sawError
// arguments are unused because every byte value is decodable.
String TextCodecUserDefined::decode(std::span<const uint8_t> bytes, bool, bool, bool&)
{
    StringBuilder result;
    result.reserveCapacity(bytes.size());
    for (uint8_t byte : bytes) {
        // The mapping depends on sign extension: a negative 8-bit value
        // widened to int and masked with 0xF7FF yields 0xF780 + (byte - 0x80).
        // Go through int8_t explicitly instead of plain `char`, whose
        // signedness is implementation-defined (it is unsigned on some ABIs,
        // which would silently produce U+0080-U+00FF instead).
        result.append(static_cast<UChar>(static_cast<int8_t>(byte) & 0xF7FF));
    }
    return result.toString();
}
|
||||
|
||||
// Slow path of x-user-defined encoding, used when the input is not pure ASCII.
// A code point is encodable when narrowing it to a signed byte and masking the
// sign-extended value with 0xF7FF reproduces the code point; anything else is
// replaced according to |handling|.
static Vector<uint8_t> encodeComplexUserDefined(StringView string, UnencodableHandling handling)
{
    Vector<uint8_t> encoded;

    for (auto codePoint : string.codePoints()) {
        int8_t candidate = codePoint;
        if ((candidate & 0xF7FF) != codePoint) {
            // No way to encode this character with x-user-defined.
            UnencodableReplacementArray replacement;
            int length = TextCodec::getUnencodableReplacement(codePoint, handling, replacement);
            encoded.append(std::span(replacement.data(), length));
            continue;
        }
        encoded.append(candidate);
    }

    return encoded;
}
|
||||
|
||||
// Encodes |string| as x-user-defined. First tries a fast path that narrows
// every code unit to a byte; if all code units turn out to be ASCII that
// buffer is the result, otherwise the input is re-encoded by the slow path.
Vector<uint8_t> TextCodecUserDefined::encode(StringView string, UnencodableHandling handling) const
{
    {
        Vector<uint8_t> narrowed(string.length());
        auto* out = narrowed.data();

        // Narrow each code unit while OR-ing them together, so one test at the
        // end tells whether any non-ASCII bit was ever set.
        UChar seenBits = 0;
        for (auto unit : string.codeUnits()) {
            *out++ = unit;
            seenBits |= unit;
        }

        if (!(seenBits & 0xFF80))
            return narrowed;
    }

    // Not all ASCII: fall back to the function that handles more-complex cases.
    return encodeComplexUserDefined(string, handling);
}
|
||||
|
||||
} // namespace PAL
|
||||
44
src/bun.js/bindings/webcore/TextCodecUserDefined.h
Normal file
44
src/bun.js/bindings/webcore/TextCodecUserDefined.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright (C) 2007-2017 Apple, Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextCodecUserDefined final : public TextCodec {
|
||||
WTF_MAKE_TZONE_ALLOCATED(TextCodecUserDefined);
|
||||
public:
|
||||
static void registerEncodingNames(EncodingNameRegistrar);
|
||||
static void registerCodecs(TextCodecRegistrar);
|
||||
|
||||
private:
|
||||
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
|
||||
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
197
src/bun.js/bindings/webcore/TextEncoding.cpp
Normal file
197
src/bun.js/bindings/webcore/TextEncoding.cpp
Normal file
@@ -0,0 +1,197 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2019 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
|
||||
* Copyright (C) 2007-2009 Torch Mobile, Inc.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextEncoding.h"
|
||||
|
||||
#include "DecodeEscapeSequences.h"
|
||||
#include "TextCodec.h"
|
||||
#include "TextEncodingRegistry.h"
|
||||
#include <wtf/NeverDestroyed.h>
|
||||
#include <wtf/StdLibExtras.h>
|
||||
#include <wtf/text/StringView.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// Lazily created, never-destroyed TextEncoding for the name "UTF-7".
static const TextEncoding& UTF7Encoding()
{
    static NeverDestroyed<TextEncoding> utf7("UTF-7");
    return utf7.get();
}
|
||||
|
||||
TextEncoding::TextEncoding(const char* name)
|
||||
: m_name(atomCanonicalTextEncodingName(name))
|
||||
, m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
|
||||
{
|
||||
}
|
||||
|
||||
TextEncoding::TextEncoding(StringView name)
|
||||
: m_name(atomCanonicalTextEncodingName(name))
|
||||
, m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
|
||||
{
|
||||
}
|
||||
|
||||
// Convenience overload: delegates to the StringView constructor.
TextEncoding::TextEncoding(const String& name)
    : TextEncoding(StringView { name })
{
}
|
||||
|
||||
// Decodes |data| with a freshly created codec for this encoding, always
// flushing. Returns a default-constructed String when the encoding has no name.
String TextEncoding::decode(std::span<const uint8_t> data, bool stopOnError, bool& sawError) const
{
    if (!isValid())
        return String();

    auto codec = newTextCodec(*this);
    return codec->decode(data, true, stopOnError, sawError);
}
|
||||
|
||||
// Encodes |string| with a freshly created codec for this encoding. Returns an
// empty vector when the encoding has no name or the string is empty. When
// |normalize| is NFCNormalize::Yes the string is NFC-normalized before encoding.
Vector<uint8_t> TextEncoding::encode(StringView string, PAL::UnencodableHandling handling, NFCNormalize normalize) const
{
    if (m_name.isNull() || string.isEmpty())
        return { };

    auto codec = newTextCodec(*this);

    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.
    if (normalize == NFCNormalize::Yes)
        return codec->encode(normalizedNFC(string).view, handling);

    return codec->encode(string, handling);
}
|
||||
|
||||
// Returns the encoding name to expose via the DOM. This is m_name, except that
// windows-949 is reported as "EUC-KR".
ASCIILiteral TextEncoding::domName() const
{
    if (!noExtendedTextEncodingNameUsed()) {
        // We treat EUC-KR as windows-949 (its superset), but need to expose
        // the name 'EUC-KR' because the name 'windows-949' is not recognized by
        // most Korean web servers even though they do use the encoding
        // 'windows-949' with the name 'EUC-KR'.
        // FIXME: This is not thread-safe. At the moment, this function is
        // only accessed in a single thread, but eventually has to be made
        // thread-safe along with usesVisualOrdering().
        static const ASCIILiteral windows949 = atomCanonicalTextEncodingName("windows-949");
        if (m_name == windows949)
            return "EUC-KR"_s;
    }

    return m_name;
}
|
||||
|
||||
bool TextEncoding::usesVisualOrdering() const
|
||||
{
|
||||
if (noExtendedTextEncodingNameUsed())
|
||||
return false;
|
||||
|
||||
static const ASCIILiteral iso88598 = atomCanonicalTextEncodingName("ISO-8859-8");
|
||||
return m_name == iso88598;
|
||||
}
|
||||
|
||||
bool TextEncoding::isJapanese() const
|
||||
{
|
||||
return isJapaneseEncoding(m_name);
|
||||
}
|
||||
|
||||
// Returns the code unit used to display a backslash in this encoding:
// U+00A5 when the registry asks for a currency symbol, a plain backslash
// otherwise.
UChar TextEncoding::backslashAsCurrencySymbol() const
{
    if (shouldShowBackslashAsCurrencySymbolIn(m_name))
        return 0x00A5;
    return '\\';
}
|
||||
|
||||
bool TextEncoding::isNonByteBasedEncoding() const
|
||||
{
|
||||
return *this == UTF16LittleEndianEncoding() || *this == UTF16BigEndianEncoding();
|
||||
}
|
||||
|
||||
bool TextEncoding::isUTF7Encoding() const
|
||||
{
|
||||
if (noExtendedTextEncodingNameUsed())
|
||||
return false;
|
||||
|
||||
return *this == UTF7Encoding();
|
||||
}
|
||||
|
||||
// Returns UTF-8 in place of a non-byte-based (UTF-16) encoding, and this
// encoding itself otherwise.
const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
{
    return isNonByteBasedEncoding() ? UTF8Encoding() : *this;
}
|
||||
|
||||
// HTML5 specifies that UTF-8 be used in form submission when a form is
|
||||
// is a part of a document in UTF-16 probably because UTF-16 is not a
|
||||
// byte-based encoding and can contain 0x00. By extension, the same
|
||||
// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
|
||||
// but it's fraught with problems and we'd rather steer clear of it.
|
||||
const TextEncoding& TextEncoding::encodingForFormSubmissionOrURLParsing() const
|
||||
{
|
||||
if (isNonByteBasedEncoding() || isUTF7Encoding())
|
||||
return UTF8Encoding();
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Lazily created, never-destroyed TextEncoding for the label "ASCII".
const TextEncoding& ASCIIEncoding()
{
    static NeverDestroyed<TextEncoding> ascii("ASCII");
    return ascii.get();
}
|
||||
|
||||
// Lazily created, never-destroyed TextEncoding for the label "latin1".
const TextEncoding& Latin1Encoding()
{
    static NeverDestroyed<TextEncoding> latin1("latin1");
    return latin1.get();
}
|
||||
|
||||
// Lazily created, never-destroyed TextEncoding for the label "UTF-16BE".
const TextEncoding& UTF16BigEndianEncoding()
{
    static NeverDestroyed<TextEncoding> utf16BigEndian("UTF-16BE");
    return utf16BigEndian.get();
}
|
||||
|
||||
// Lazily created, never-destroyed TextEncoding for the label "UTF-16LE".
const TextEncoding& UTF16LittleEndianEncoding()
{
    static NeverDestroyed<TextEncoding> utf16LittleEndian("UTF-16LE");
    return utf16LittleEndian.get();
}
|
||||
|
||||
// Lazily created, never-destroyed TextEncoding for the label "UTF-8".
// Asserts that the label resolved to a valid encoding.
const TextEncoding& UTF8Encoding()
{
    static NeverDestroyed<TextEncoding> utf8("UTF-8");
    ASSERT(utf8.get().isValid());
    return utf8.get();
}
|
||||
|
||||
// Lazily created, never-destroyed TextEncoding for the label "WinLatin-1".
const TextEncoding& WindowsLatin1Encoding()
{
    static NeverDestroyed<TextEncoding> windowsLatin1("WinLatin-1");
    return windowsLatin1.get();
}
|
||||
|
||||
// Unescapes URL escape sequences in |string|, interpreting the escaped bytes
// with |encoding|. An empty input is returned as-is without decoding.
String decodeURLEscapeSequences(StringView string, const TextEncoding& encoding)
{
    if (!string.isEmpty())
        return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
    return string.toString();
}
|
||||
|
||||
} // namespace PAL
|
||||
89
src/bun.js/bindings/webcore/TextEncoding.h
Normal file
89
src/bun.js/bindings/webcore/TextEncoding.h
Normal file
@@ -0,0 +1,89 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "UnencodableHandling.h"
|
||||
#include <wtf/URL.h>
|
||||
#include <wtf/text/StringView.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// Whether TextEncoding::encode() should NFC-normalize its input before encoding.
enum class NFCNormalize : bool {
    No,
    Yes
};
|
||||
|
||||
class TextEncoding : public WTF::URLTextEncoding {
|
||||
public:
|
||||
TextEncoding() = default;
|
||||
TextEncoding(const char* name);
|
||||
TextEncoding(StringView name);
|
||||
TextEncoding(const String& name);
|
||||
|
||||
bool isValid() const { return !m_name.isNull(); }
|
||||
ASCIILiteral name() const { return m_name; }
|
||||
ASCIILiteral domName() const; // name exposed via DOM
|
||||
bool usesVisualOrdering() const;
|
||||
bool isJapanese() const;
|
||||
|
||||
const TextEncoding& closestByteBasedEquivalent() const;
|
||||
const TextEncoding& encodingForFormSubmissionOrURLParsing() const;
|
||||
|
||||
String decode(std::span<const uint8_t>, bool stopOnError, bool& sawError) const;
|
||||
String decode(std::span<const uint8_t>) const;
|
||||
Vector<uint8_t> encode(StringView, PAL::UnencodableHandling, NFCNormalize = NFCNormalize::Yes) const;
|
||||
Vector<uint8_t> encodeForURLParsing(StringView string) const final { return encode(string, PAL::UnencodableHandling::URLEncodedEntities, NFCNormalize::No); }
|
||||
|
||||
UChar backslashAsCurrencySymbol() const;
|
||||
bool isByteBasedEncoding() const { return !isNonByteBasedEncoding(); }
|
||||
|
||||
private:
|
||||
bool isNonByteBasedEncoding() const;
|
||||
bool isUTF7Encoding() const;
|
||||
|
||||
ASCIILiteral m_name;
|
||||
UChar m_backslashAsCurrencySymbol;
|
||||
};
|
||||
|
||||
// Two encodings are equal when their canonical names are equal.
inline bool operator==(const TextEncoding& a, const TextEncoding& b)
{
    return a.name() == b.name();
}
|
||||
|
||||
const TextEncoding& ASCIIEncoding();
|
||||
const TextEncoding& Latin1Encoding();
|
||||
const TextEncoding& UTF16BigEndianEncoding();
|
||||
const TextEncoding& UTF16LittleEndianEncoding();
|
||||
const TextEncoding& UTF8Encoding();
|
||||
const TextEncoding& WindowsLatin1Encoding();
|
||||
|
||||
// Unescapes the given string using URL escaping rules.
|
||||
// DANGER: If the URL has "%00" in it,
|
||||
// the resulting string will have embedded null characters!
|
||||
String decodeURLEscapeSequences(StringView, const TextEncoding& = UTF8Encoding());
|
||||
|
||||
// Convenience overload: decodes |characters| without stopping on errors and
// discards the error flag.
inline String TextEncoding::decode(std::span<const uint8_t> characters) const
{
    // Passed by reference to the codec, which may read it as well as set it,
    // so it must start from a defined value.
    bool ignored = false;
    return decode(characters, false, ignored);
}
|
||||
|
||||
} // namespace PAL
|
||||
45
src/bun.js/bindings/webcore/TextEncodingDetector.h
Normal file
45
src/bun.js/bindings/webcore/TextEncodingDetector.h
Normal file
@@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Copyright (C) 2009 Google Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above
|
||||
* copyright notice, this list of conditions and the following disclaimer
|
||||
* in the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* * Neither the name of Google Inc. nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <span>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextEncoding;
|
||||
|
||||
// Given a sequence of bytes in |data| and an optional
|
||||
// hintEncodingName, detect the most likely character encoding.
|
||||
// The way hintEncodingName is used is up to an implementation.
|
||||
// Currently, the only caller sets it to the parent frame encoding.
|
||||
bool detectTextEncoding(std::span<const uint8_t> data, const char* hintEncodingName, TextEncoding* detectedEncoding);
|
||||
|
||||
} // namespace PAL
|
||||
115
src/bun.js/bindings/webcore/TextEncodingDetectorICU.cpp
Normal file
115
src/bun.js/bindings/webcore/TextEncodingDetectorICU.cpp
Normal file
@@ -0,0 +1,115 @@
|
||||
/*
|
||||
* Copyright (C) 2008, 2009 Google Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above
|
||||
* copyright notice, this list of conditions and the following disclaimer
|
||||
* in the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* * Neither the name of Google Inc. nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextEncodingDetector.h"
|
||||
|
||||
#include "TextEncoding.h"
|
||||
#include <wtf/unicode/icu/unicode/ucnv.h>
|
||||
#include <wtf/unicode/icu/unicode/ucsdet.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// Runs ICU's charset detector over |data| and stores the chosen encoding in
// |*detectedEncoding| (which is first reset to a default TextEncoding).
// When |hintEncodingName| is non-null, the first sufficiently confident match
// that equals the hint wins; otherwise the top match is used.
// Returns true only when an encoding name was obtained; the ICU detector is
// released on every path after it has been opened.
bool detectTextEncoding(std::span<const uint8_t> data, const char* hintEncodingName, TextEncoding* detectedEncoding)
{
    *detectedEncoding = TextEncoding();
    int matchesCount = 0;
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector* detector = ucsdet_open(&status);
    if (U_FAILURE(status))
        return false;
    ucsdet_enableInputFilter(detector, true);
    ucsdet_setText(detector, byteCast<char>(data.data()), static_cast<int32_t>(data.size()), &status);
    if (U_FAILURE(status)) {
        // The detector was opened successfully above, so it has to be closed
        // here as well; returning directly would leak it.
        ucsdet_close(detector);
        return false;
    }

    // FIXME: A few things we can do other than improving
    // the ICU detector itself.
    // 1. Use ucsdet_detectAll and pick the most likely one given
    // "the context" (parent-encoding, referrer encoding, etc).
    // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
    // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
    // encoding with a highest confidence among the detector-specific
    // limited set of candidate encodings.
    // Below is a partial implementation of the first part of what's outlined
    // above.
    const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
    if (U_FAILURE(status)) {
        ucsdet_close(detector);
        return false;
    }

    const char* encoding = nullptr;
    if (hintEncodingName) {
        TextEncoding hintEncoding(hintEncodingName);
        // 10 is the minimum confidence value consistent with the codepoint
        // allocation in a given encoding. The size of a chunk passed to
        // us varies even for the same html file (apparently depending on
        // the network load). When we're given a rather short chunk, we
        // don't have a sufficiently reliable signal other than the fact that
        // the chunk is consistent with a set of encodings. So, instead of
        // setting an arbitrary threshold, we have to scan all the encodings
        // consistent with the data.
        const int32_t kThreshold = 10;
        for (int i = 0; i < matchesCount; ++i) {
            int32_t confidence = ucsdet_getConfidence(matches[i], &status);
            if (U_FAILURE(status)) {
                status = U_ZERO_ERROR;
                continue;
            }
            if (confidence < kThreshold)
                break;
            const char* matchEncoding = ucsdet_getName(matches[i], &status);
            if (U_FAILURE(status)) {
                status = U_ZERO_ERROR;
                continue;
            }
            if (TextEncoding(matchEncoding) == hintEncoding) {
                encoding = hintEncodingName;
                break;
            }
        }
    }
    // If no match is found so far, just pick the top match.
    // This can happen, say, when a parent frame in EUC-JP refers to
    // a child frame in Shift_JIS and both frames do NOT specify the encoding
    // making us resort to auto-detection (when it IS turned on).
    if (!encoding && matchesCount > 0)
        encoding = ucsdet_getName(matches[0], &status);

    // With no matches at all |encoding| is still null: report "nothing
    // detected" instead of building a TextEncoding from a null name.
    const bool detected = U_SUCCESS(status) && encoding;
    if (detected) {
        // Construct the result before closing the detector: the name may
        // have been returned by the detector's match objects.
        *detectedEncoding = TextEncoding(encoding);
    }
    ucsdet_close(detector);
    return detected;
}
|
||||
|
||||
}
|
||||
350
src/bun.js/bindings/webcore/TextEncodingRegistry.cpp
Normal file
350
src/bun.js/bindings/webcore/TextEncodingRegistry.cpp
Normal file
@@ -0,0 +1,350 @@
|
||||
/*
|
||||
* Copyright (C) 2006-2017 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2007-2009 Torch Mobile, Inc.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextEncodingRegistry.h"
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include "TextCodecCJK.h"
|
||||
#include "TextCodecICU.h"
|
||||
#include "TextCodecLatin1.h"
|
||||
#include "TextCodecReplacement.h"
|
||||
#include "TextCodecSingleByte.h"
|
||||
#include "TextCodecUserDefined.h"
|
||||
#include "TextEncoding.h"
|
||||
#include <mutex>
|
||||
#include <wtf/ASCIICType.h>
|
||||
#include <wtf/CheckedArithmetic.h>
|
||||
#include <wtf/HashMap.h>
|
||||
#include <wtf/HashSet.h>
|
||||
#include <wtf/Lock.h>
|
||||
#include <wtf/MainThread.h>
|
||||
#include <wtf/StdLibExtras.h>
|
||||
#include <wtf/text/CString.h>
|
||||
#include <wtf/text/StringHash.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
const size_t maxEncodingNameLength = 63;
|
||||
|
||||
// Hash for all-ASCII strings that does case folding.
|
||||
struct TextEncodingNameHash {
|
||||
static bool equal(const char* s1, const char* s2)
|
||||
{
|
||||
char c1;
|
||||
char c2;
|
||||
do {
|
||||
c1 = *s1++;
|
||||
c2 = *s2++;
|
||||
if (toASCIILower(c1) != toASCIILower(c2))
|
||||
return false;
|
||||
} while (c1 && c2);
|
||||
return !c1 && !c2;
|
||||
}
|
||||
|
||||
// This algorithm is the one-at-a-time hash from:
|
||||
// http://burtleburtle.net/bob/hash/hashfaq.html
|
||||
// http://burtleburtle.net/bob/hash/doobs.html
|
||||
static unsigned hash(const char* s)
|
||||
{
|
||||
unsigned h = WTF::stringHashingStartValue;
|
||||
for (;;) {
|
||||
char c = *s++;
|
||||
if (!c) {
|
||||
h += (h << 3);
|
||||
h ^= (h >> 11);
|
||||
h += (h << 15);
|
||||
return h;
|
||||
}
|
||||
h += toASCIILower(c);
|
||||
h += (h << 10);
|
||||
h ^= (h >> 6);
|
||||
}
|
||||
}
|
||||
|
||||
static const bool safeToCompareToEmptyOrDeleted = false;
|
||||
};
|
||||
|
||||
struct HashTranslatorTextEncodingName {
|
||||
static unsigned hash(const char* literal)
|
||||
{
|
||||
return TextEncodingNameHash::hash(literal);
|
||||
}
|
||||
|
||||
static bool equal(const ASCIILiteral& a, const char* b)
|
||||
{
|
||||
return TextEncodingNameHash::equal(a.characters(), b);
|
||||
}
|
||||
};
|
||||
|
||||
using TextEncodingNameMap = HashMap<ASCIILiteral, ASCIILiteral, TextEncodingNameHash>;
|
||||
using TextCodecMap = HashMap<ASCIILiteral, NewTextCodecFunction>;
|
||||
|
||||
static Lock encodingRegistryLock;
|
||||
|
||||
static TextEncodingNameMap* textEncodingNameMap WTF_GUARDED_BY_LOCK(encodingRegistryLock);
|
||||
static TextCodecMap* textCodecMap WTF_GUARDED_BY_LOCK(encodingRegistryLock);
|
||||
static bool didExtendTextCodecMaps;
|
||||
static HashSet<ASCIILiteral>* japaneseEncodings;
|
||||
static HashSet<ASCIILiteral>* nonBackslashEncodings;
|
||||
|
||||
static constexpr ASCIILiteral textEncodingNameBlocklist[] = { "UTF-7"_s, "BOCU-1"_s, "SCSU"_s };
|
||||
|
||||
// Returns true for aliases that must not be registered in the name map.
static bool isUndesiredAlias(ASCIILiteral alias)
{
    // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
    const bool hasComma = strchr(alias.characters(), ',');

    // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
    // problem, see bug 43554.
    return hasComma || alias == "8859_1"_s;
}
|
||||
|
||||
// Registers `alias` as a name for the encoding `name` in textEncodingNameMap.
// Undesired aliases (see isUndesiredAlias) are silently ignored. The stored
// value is whatever `name` already maps to, or `name` itself when it has not
// been registered yet. Caller must hold encodingRegistryLock.
static void addToTextEncodingNameMap(ASCIILiteral alias, ASCIILiteral name) WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    ASSERT(strlen(alias) <= maxEncodingNameLength);
    if (isUndesiredAlias(alias))
        return;

    ASCIILiteral registeredName = textEncodingNameMap->get(name);
    // A canonical name must be registered (as its own alias) before any other alias of it.
    ASSERT((alias == name) || !registeredName.isNull());
    ASCIILiteral atomName = registeredName.isNull() ? name : registeredName;

    ASSERT_WITH_MESSAGE(textEncodingNameMap->get(alias).isNull(), "Duplicate text encoding name %s for %s (previously registered as %s)", alias.characters(), atomName.characters(), textEncodingNameMap->get(alias).characters());

    textEncodingNameMap->add(alias, atomName);
}
|
||||
|
||||
// Stores the codec factory `function` in textCodecMap under the name that
// `name` resolves to in textEncodingNameMap; the name is asserted to be
// registered already. Caller must hold encodingRegistryLock.
static void addToTextCodecMap(ASCIILiteral name, NewTextCodecFunction&& function) WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    ASCIILiteral canonicalName = textEncodingNameMap->get(name);
    ASSERT(!canonicalName.isNull());
    textCodecMap->add(canonicalName, WTFMove(function));
}
|
||||
|
||||
// Removes every encoding listed in textEncodingNameBlocklist from both maps:
// all names that resolve to the blocked encoding are dropped from
// textEncodingNameMap, and its factory is dropped from textCodecMap.
// Caller must hold encodingRegistryLock.
static void pruneBlocklistedCodecs() WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    for (auto& blockedName : textEncodingNameBlocklist) {
        ASCIILiteral canonicalName = textEncodingNameMap->get(blockedName);
        if (!canonicalName.isNull()) {
            // Collect the keys first; the map is not modified while it is being iterated.
            Vector<ASCIILiteral> aliases;
            for (auto& entry : *textEncodingNameMap) {
                if (entry.value == canonicalName)
                    aliases.append(entry.key);
            }

            for (auto& alias : aliases)
                textEncodingNameMap->remove(alias);

            textCodecMap->remove(canonicalName);
        }
    }
}
|
||||
|
||||
// Allocates both registry maps and fills them with the Latin-1 and
// user-defined codecs. Asserts that neither map exists yet.
// Caller must hold encodingRegistryLock.
static void buildBaseTextCodecMaps() WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    ASSERT(!textCodecMap);
    ASSERT(!textEncodingNameMap);

    textCodecMap = new TextCodecMap;
    textEncodingNameMap = new TextEncodingNameMap;

    // Names are registered before codecs: addToTextCodecMap looks the name up in the name map.
    TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecLatin1::registerCodecs(addToTextCodecMap);

    TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUserDefined::registerCodecs(addToTextCodecMap);
}
|
||||
|
||||
// Adds the registered name that `name` resolves to into `set`; names that are
// not present in textEncodingNameMap are skipped.
// Caller must hold encodingRegistryLock.
static void addEncodingName(HashSet<ASCIILiteral>& set, ASCIILiteral name) WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    // We must not use atomCanonicalTextEncodingName() because this function is called in it.
    ASCIILiteral canonicalName = textEncodingNameMap->get(name);
    if (canonicalName.isNull())
        return;
    set.add(canonicalName);
}
|
||||
|
||||
// Allocates and fills the two quirk sets consulted by isJapaneseEncoding()
// and shouldShowBackslashAsCurrencySymbolIn(). Asserts that neither set
// exists yet. Caller must hold encodingRegistryLock.
static void buildQuirksSets() WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn()
    // and initializing the sets for them in TextEncodingRegistry.cpp look strange.

    ASSERT(!japaneseEncodings);
    ASSERT(!nonBackslashEncodings);

    static constexpr ASCIILiteral japaneseEncodingNames[] = {
        "EUC-JP"_s,
        "ISO-2022-JP"_s,
        "ISO-2022-JP-1"_s,
        "ISO-2022-JP-2"_s,
        "ISO-2022-JP-3"_s,
        "JIS_C6226-1978"_s,
        "JIS_X0201"_s,
        "JIS_X0208-1983"_s,
        "JIS_X0208-1990"_s,
        "JIS_X0212-1990"_s,
        "Shift_JIS"_s,
        "Shift_JIS_X0213-2000"_s,
        "cp932"_s,
        "x-mac-japanese"_s,
    };

    // The text encodings below treat backslash as a currency symbol for IE compatibility.
    // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
    static constexpr ASCIILiteral nonBackslashEncodingNames[] = {
        "x-mac-japanese"_s,
        "ISO-2022-JP"_s,
        "EUC-JP"_s,
        // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them.
        "Shift_JIS"_s,
        "Shift_JIS_X0213-2000"_s,
    };

    japaneseEncodings = new HashSet<ASCIILiteral>;
    for (auto encodingName : japaneseEncodingNames)
        addEncodingName(*japaneseEncodings, encodingName);

    nonBackslashEncodings = new HashSet<ASCIILiteral>;
    for (auto encodingName : nonBackslashEncodingNames)
        addEncodingName(*nonBackslashEncodings, encodingName);
}
|
||||
|
||||
// Returns true when `canonicalEncodingName` is in the japaneseEncodings set.
// A null name, or a set that has not been built yet, yields false.
bool isJapaneseEncoding(ASCIILiteral canonicalEncodingName)
{
    if (canonicalEncodingName.isNull() || !japaneseEncodings)
        return false;
    return japaneseEncodings->contains(canonicalEncodingName);
}
|
||||
|
||||
// Returns true when `canonicalEncodingName` is in the nonBackslashEncodings
// set. A null name, or a set that has not been built yet, yields false.
bool shouldShowBackslashAsCurrencySymbolIn(ASCIILiteral canonicalEncodingName)
{
    if (canonicalEncodingName.isNull() || !nonBackslashEncodings)
        return false;
    return nonBackslashEncodings->contains(canonicalEncodingName);
}
|
||||
|
||||
// Registers the remaining codec families (replacement, ICU, CJK,
// single-byte), then prunes blocklisted encodings and builds the quirk sets.
// Caller must hold encodingRegistryLock.
static void extendTextCodecMaps() WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    // For each family, names go in before codecs: addToTextCodecMap looks the name up in the name map.
    TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecReplacement::registerCodecs(addToTextCodecMap);

    TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecICU::registerCodecs(addToTextCodecMap);

    TextCodecCJK::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecCJK::registerCodecs(addToTextCodecMap);

    TextCodecSingleByte::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecSingleByte::registerCodecs(addToTextCodecMap);

    // Pruning runs after all registrations and before the quirk sets are derived from the name map.
    pruneBlocklistedCodecs();
    buildQuirksSets();
}
|
||||
|
||||
std::unique_ptr<TextCodec> newTextCodec(const TextEncoding& encoding)
|
||||
{
|
||||
Locker locker { encodingRegistryLock };
|
||||
|
||||
ASSERT(textCodecMap);
|
||||
if (!encoding.isValid()) {
|
||||
RELEASE_ASSERT_NOT_REACHED();
|
||||
}
|
||||
auto result = textCodecMap->find(encoding.name());
|
||||
if (result == textCodecMap->end()) {
|
||||
RELEASE_ASSERT_NOT_REACHED();
|
||||
}
|
||||
if (!result->value) {
|
||||
RELEASE_LOG_ERROR(TextEncoding, "Codec for encoding %" PUBLIC_LOG_STRING " is null. Will default to UTF-8", encoding.name().characters());
|
||||
RELEASE_ASSERT_NOT_REACHED();
|
||||
}
|
||||
|
||||
return result->value();
|
||||
}
|
||||
|
||||
// Resolves an encoding label (compared ASCII case-insensitively) to the name
// stored for it in textEncodingNameMap. Returns a null ASCIILiteral for a
// null or empty `name`, or when the label is unknown.
//
// The base maps are built on first use. The extended codec families are
// registered only the first time a label misses in the current map.
ASCIILiteral atomCanonicalTextEncodingName(const char* name)
{
    if (!name || !name[0])
        return {};

    Locker locker { encodingRegistryLock };

    if (!textEncodingNameMap)
        buildBaseTextCodecMaps();

    ASCIILiteral canonicalName = textEncodingNameMap->get<HashTranslatorTextEncodingName>(name);
    if (canonicalName.isNull() && !didExtendTextCodecMaps) {
        // First miss: register the extended codecs once, then retry the lookup.
        extendTextCodecMaps();
        didExtendTextCodecMaps = true;
        canonicalName = textEncodingNameMap->get<HashTranslatorTextEncodingName>(name);
    }
    return canonicalName;
}
|
||||
|
||||
// Copies `characters` into a NUL-terminated char buffer and forwards to the
// const char* overload. Inputs longer than maxEncodingNameLength resolve to
// a null ASCIILiteral without any lookup.
template<typename CharacterType> static ASCIILiteral atomCanonicalTextEncodingName(std::span<const CharacterType> characters)
{
    const size_t length = characters.size();
    if (length > maxEncodingNameLength)
        return {};

    char buffer[maxEncodingNameLength + 1];
    for (size_t i = 0; i < length; ++i)
        buffer[i] = characters[i];
    buffer[length] = 0;

    return atomCanonicalTextEncodingName(buffer);
}
|
||||
|
||||
// StringView entry point. Empty or non-ASCII aliases resolve to a null
// ASCIILiteral; otherwise the 8-bit or 16-bit span is forwarded to the
// span overload.
ASCIILiteral atomCanonicalTextEncodingName(StringView alias)
{
    if (alias.isEmpty() || !alias.containsOnlyASCII())
        return {};

    return alias.is8Bit()
        ? atomCanonicalTextEncodingName(alias.span8())
        : atomCanonicalTextEncodingName(alias.span16());
}
|
||||
|
||||
bool noExtendedTextEncodingNameUsed()
|
||||
{
|
||||
// If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
|
||||
return !didExtendTextCodecMaps;
|
||||
}
|
||||
|
||||
// Returns the default encoding name for the system language. On Cocoa
// platforms this is derived from webDefaultCFStringEncoding(), with two names
// remapped; on every other platform it is always "ISO-8859-1".
String defaultTextEncodingNameForSystemLanguage()
{
#if PLATFORM(COCOA)
    String systemEncodingName = CFStringConvertEncodingToIANACharSetName(webDefaultCFStringEncoding());

    // CFStringConvertEncodingToIANACharSetName() returns cp949 for kTextEncodingDOSKorean AKA "extended EUC-KR" AKA windows-949.
    // ICU uses this name for a different encoding, so we need to change the name to a value that actually gives us windows-949.
    // In addition, this value must match what is used in Safari, see <rdar://problem/5579292>.
    // On some OS versions, the result is CP949 (uppercase).
    if (equalLettersIgnoringASCIICase(systemEncodingName, "cp949"_s))
        return "ks_c_5601-1987"_s;

    // CFStringConvertEncodingToIANACharSetName() returns cp874 for kTextEncodingDOSThai, AKA windows-874.
    // Since "cp874" alias is not standard (https://encoding.spec.whatwg.org/#names-and-labels), map to
    // "dos-874" instead.
    if (equalLettersIgnoringASCIICase(systemEncodingName, "cp874"_s))
        return "dos-874"_s;

    return systemEncodingName;
#else
    return "ISO-8859-1"_s;
#endif
}
|
||||
|
||||
} // namespace PAL
|
||||
57
src/bun.js/bindings/webcore/TextEncodingRegistry.h
Normal file
57
src/bun.js/bindings/webcore/TextEncodingRegistry.h
Normal file
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (C) 2006-2017 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <wtf/Forward.h>
|
||||
|
||||
#if PLATFORM(COCOA)
|
||||
#include <CoreFoundation/CoreFoundation.h>
|
||||
#endif
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextCodec;
|
||||
class TextEncoding;
|
||||
|
||||
// Use TextResourceDecoder::decode to decode resources, since it handles BOMs.
|
||||
// Use TextEncoding::encode to encode, since it takes care of normalization.
|
||||
std::unique_ptr<TextCodec> newTextCodec(const TextEncoding&);
|
||||
|
||||
// Only TextEncoding should use the following functions directly.
|
||||
ASCIILiteral atomCanonicalTextEncodingName(const char* alias);
|
||||
ASCIILiteral atomCanonicalTextEncodingName(StringView);
|
||||
bool noExtendedTextEncodingNameUsed();
|
||||
bool isJapaneseEncoding(ASCIILiteral canonicalEncodingName);
|
||||
bool shouldShowBackslashAsCurrencySymbolIn(ASCIILiteral canonicalEncodingName);
|
||||
|
||||
String defaultTextEncodingNameForSystemLanguage();
|
||||
|
||||
#if PLATFORM(COCOA)
|
||||
CFStringEncoding webDefaultCFStringEncoding();
|
||||
#endif
|
||||
|
||||
} // namespace PAL
|
||||
43
src/bun.js/bindings/webcore/UnencodableHandling.h
Normal file
43
src/bun.js/bindings/webcore/UnencodableHandling.h
Normal file
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// Specifies what will happen when a character is encountered that is
|
||||
// not encodable in the character set.
|
||||
enum class UnencodableHandling: bool {
|
||||
// Encodes the character as an XML entity. For example, U+06DE
|
||||
// would be "&#1758;" (0x6DE = 1758 in decimal).
|
||||
Entities,
|
||||
|
||||
// Encodes the character as an entity as above, but escapes
|
||||
// non-alphanumeric characters. This is used in URLs.
|
||||
// For example, U+6DE would be "%26%231758%3B".
|
||||
URLEncodedEntities
|
||||
};
|
||||
|
||||
}
|
||||
48
src/bun.js/bindings/webcore/WebkitTextCodec.cpp
Normal file
48
src/bun.js/bindings/webcore/WebkitTextCodec.cpp
Normal file
@@ -0,0 +1,48 @@
|
||||
#include "root.h"
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include "TextEncodingRegistry.h"
|
||||
#include "TextEncoding.h"
|
||||
#include "headers-handwritten.h"
|
||||
namespace Bun {
|
||||
|
||||
using namespace PAL;
|
||||
using namespace WTF;
|
||||
|
||||
class WebKitTextCodec {
|
||||
WTF_MAKE_FAST_ALLOCATED;
|
||||
|
||||
public:
|
||||
std::unique_ptr<TextCodec> codec;
|
||||
|
||||
static WebKitTextCodec* create(std::span<const LChar> encodingLabel)
|
||||
{
|
||||
return new WebKitTextCodec(newTextCodec(TextEncoding(StringView(encodingLabel))));
|
||||
}
|
||||
};
|
||||
|
||||
// C entry point: creates a WebKitTextCodec for the encoding label given as
// the `len` characters starting at `ptr`.
extern "C" WebKitTextCodec* WebKitTextCodec__create(const LChar* ptr, size_t len)
{
    return WebKitTextCodec::create(std::span<const LChar>(ptr, len));
}
|
||||
|
||||
// C entry point: deletes `codec`, which also destroys the TextCodec it owns
// through its unique_ptr member.
extern "C" void WebKitTextCodec__deinit(WebKitTextCodec* codec)
{
    delete codec;
}
|
||||
|
||||
// C entry point: decodes `input_len` bytes at `input_ptr` with the wrapped
// codec and returns the result as a BunString.
//
// `stopOnError` is an in/out flag:
//   in  - whether decoding should stop at the first invalid sequence;
//   out - whether the codec reported an error during this call.
//
// The previous code passed the pointer itself as the by-value flag (so it was
// always true) and reused the caller's storage as the error output, so a
// caller that passed `true` in would get `true` back even for valid input.
// NOTE(review): assumes TextCodec::decode is
// (data, flush, bool stopOnError, bool& sawError), as in WebKit's PAL;
// confirm against TextCodec.h.
extern "C" BunString WebKitTextCodec__decode(WebKitTextCodec* code, const uint8_t* input_ptr, size_t input_len, bool flush, bool* stopOnError)
{
    const std::span<const uint8_t> data = { input_ptr, input_len };

    const bool shouldStopOnError = *stopOnError;
    bool sawError = false;
    auto str = code->codec->decode(data, flush, shouldStopOnError, sawError);
    *stopOnError = sawError;

    return Bun::toStringRef(str);
}
|
||||
|
||||
// C entry point: forwards to TextCodec::stripByteOrderMark() on the wrapped codec.
extern "C" void WebKitTextCodec__stripByteOrderMark(WebKitTextCodec* code)
{
    auto& wrapped = *code->codec;
    wrapped.stripByteOrderMark();
}
|
||||
|
||||
}
|
||||
@@ -29,10 +29,6 @@
|
||||
#include <wtf/Vector.h>
|
||||
#include <wtf/text/WTFString.h>
|
||||
|
||||
#ifndef PAL_EXPORT
|
||||
#define PAL_EXPORT
|
||||
#endif
|
||||
|
||||
namespace PAL {
|
||||
|
||||
struct CryptoDigestContext;
|
||||
@@ -48,12 +44,12 @@ public:
|
||||
SHA_384,
|
||||
SHA_512,
|
||||
};
|
||||
PAL_EXPORT static std::unique_ptr<CryptoDigest> create(Algorithm);
|
||||
PAL_EXPORT ~CryptoDigest();
|
||||
static std::unique_ptr<CryptoDigest> create(Algorithm);
|
||||
~CryptoDigest();
|
||||
|
||||
PAL_EXPORT void addBytes(const void* input, size_t length);
|
||||
PAL_EXPORT Vector<uint8_t> computeHash();
|
||||
PAL_EXPORT String toHexString();
|
||||
void addBytes(const void* input, size_t length);
|
||||
Vector<uint8_t> computeHash();
|
||||
String toHexString();
|
||||
|
||||
private:
|
||||
CryptoDigest();
|
||||
|
||||
@@ -273,9 +273,6 @@ pub const EncodingLabel = enum {
|
||||
@"windows-874",
|
||||
@"windows-1250",
|
||||
@"windows-1251",
|
||||
/// Also known as
|
||||
/// - ASCII
|
||||
/// - latin1
|
||||
@"windows-1252",
|
||||
@"windows-1253",
|
||||
@"windows-1254",
|
||||
@@ -294,13 +291,6 @@ pub const EncodingLabel = enum {
|
||||
@"x-user-defined",
|
||||
|
||||
pub const Map = std.enums.EnumMap(EncodingLabel, string);
|
||||
pub const label: Map = brk: {
|
||||
var map = Map.initFull("");
|
||||
map.put(EncodingLabel.@"UTF-8", "utf-8");
|
||||
map.put(EncodingLabel.@"UTF-16LE", "utf-16le");
|
||||
map.put(EncodingLabel.@"windows-1252", "windows-1252");
|
||||
break :brk map;
|
||||
};
|
||||
|
||||
const utf16_names = [_]string{
|
||||
"ucs-2",
|
||||
@@ -343,66 +333,78 @@ pub const EncodingLabel = enum {
|
||||
|
||||
pub const latin1 = EncodingLabel.@"windows-1252";
|
||||
|
||||
const map = bun.ComptimeStringMap(EncodingLabel, .{
|
||||
.{ "ansi_x3.4-1968", latin1 },
|
||||
.{ "ascii", latin1 },
|
||||
.{ "big5", EncodingLabel.Big5 },
|
||||
.{ "cp1252", latin1 },
|
||||
.{ "cp819", latin1 },
|
||||
.{ "csisolatin1", latin1 },
|
||||
.{ "csunicode", EncodingLabel.@"UTF-16LE" },
|
||||
.{ "euc-jp", EncodingLabel.@"EUC-JP" },
|
||||
.{ "euc-kr", EncodingLabel.@"EUC-KR" },
|
||||
.{ "ibm819", latin1 },
|
||||
.{ "ibm866", EncodingLabel.IBM866 },
|
||||
.{ "iso_8859-1:1987", latin1 },
|
||||
.{ "iso_8859-1", latin1 },
|
||||
.{ "iso-10646-ucs-2", EncodingLabel.@"UTF-16LE" },
|
||||
.{ "iso-2022-jp", EncodingLabel.@"ISO-2022-JP" },
|
||||
.{ "iso-8859-1", latin1 },
|
||||
.{ "iso-8859-10", EncodingLabel.@"ISO-8859-10" },
|
||||
.{ "iso-8859-13", EncodingLabel.@"ISO-8859-13" },
|
||||
.{ "iso-8859-14", EncodingLabel.@"ISO-8859-14" },
|
||||
.{ "iso-8859-15", EncodingLabel.@"ISO-8859-15" },
|
||||
.{ "iso-8859-16", EncodingLabel.@"ISO-8859-16" },
|
||||
.{ "iso-8859-2", EncodingLabel.@"ISO-8859-2" },
|
||||
.{ "iso-8859-3", EncodingLabel.@"ISO-8859-3" },
|
||||
.{ "iso-8859-4", EncodingLabel.@"ISO-8859-4" },
|
||||
.{ "iso-8859-5", EncodingLabel.@"ISO-8859-5" },
|
||||
.{ "iso-8859-6", EncodingLabel.@"ISO-8859-6" },
|
||||
.{ "iso-8859-7", EncodingLabel.@"ISO-8859-7" },
|
||||
.{ "iso-8859-8-i", EncodingLabel.@"ISO-8859-8-I" },
|
||||
.{ "iso-8859-8", EncodingLabel.@"ISO-8859-8" },
|
||||
.{ "iso-ir-100", latin1 },
|
||||
.{ "iso8859-1", latin1 },
|
||||
.{ "iso88591", latin1 },
|
||||
.{ "koi8-r", EncodingLabel.@"KOI8-R" },
|
||||
.{ "koi8-u", EncodingLabel.@"KOI8-U" },
|
||||
.{ "l1", latin1 },
|
||||
.{ "latin1", latin1 },
|
||||
.{ "macintosh", EncodingLabel.macintosh },
|
||||
.{ "shift_jis", EncodingLabel.Shift_JIS },
|
||||
.{ "ucs-2", EncodingLabel.@"UTF-16LE" },
|
||||
.{ "unicode-1-1-utf-8", EncodingLabel.@"UTF-8" },
|
||||
.{ "unicode", EncodingLabel.@"UTF-16LE" },
|
||||
.{ "unicode11utf8", EncodingLabel.@"UTF-8" },
|
||||
.{ "unicode20utf8", EncodingLabel.@"UTF-8" },
|
||||
.{ "unicodefeff", EncodingLabel.@"UTF-16LE" },
|
||||
.{ "us-ascii", latin1 },
|
||||
.{ "utf-16", EncodingLabel.@"UTF-16LE" },
|
||||
.{ "utf-16be", EncodingLabel.@"UTF-16BE" },
|
||||
.{ "utf-16le", EncodingLabel.@"UTF-16LE" },
|
||||
.{ "utf-8", EncodingLabel.@"UTF-8" },
|
||||
.{ "utf8", EncodingLabel.@"UTF-8" },
|
||||
.{ "windows-1250", EncodingLabel.@"windows-1250" },
|
||||
.{ "windows-1251", EncodingLabel.@"windows-1251" },
|
||||
.{ "windows-1252", EncodingLabel.@"windows-1252" },
|
||||
.{ "windows-1252", latin1 },
|
||||
.{ "windows-1253", EncodingLabel.@"windows-1253" },
|
||||
.{ "windows-1254", EncodingLabel.@"windows-1254" },
|
||||
.{ "windows-1255", EncodingLabel.@"windows-1255" },
|
||||
.{ "windows-1256", EncodingLabel.@"windows-1256" },
|
||||
.{ "windows-1257", EncodingLabel.@"windows-1257" },
|
||||
.{ "windows-1258", EncodingLabel.@"windows-1258" },
|
||||
.{ "windows-874", EncodingLabel.@"windows-874" },
|
||||
.{ "x-cp1252", latin1 },
|
||||
.{ "x-mac-cyrillic", EncodingLabel.@"x-mac-cyrillic" },
|
||||
.{ "x-unicode20utf8", EncodingLabel.@"UTF-8" },
|
||||
.{ "x-user-defined", EncodingLabel.@"x-user-defined" },
|
||||
});
|
||||
|
||||
pub fn which(input_: string) ?EncodingLabel {
|
||||
const input = strings.trim(input_, " \t\r\n");
|
||||
const ExactMatcher = strings.ExactSizeMatcher;
|
||||
const Eight = ExactMatcher(8);
|
||||
const Sixteen = ExactMatcher(16);
|
||||
return switch (input.len) {
|
||||
1, 0 => null,
|
||||
2...8 => switch (Eight.matchLower(input)) {
|
||||
Eight.case("l1"),
|
||||
Eight.case("ascii"),
|
||||
Eight.case("cp819"),
|
||||
Eight.case("cp1252"),
|
||||
Eight.case("ibm819"),
|
||||
Eight.case("latin1"),
|
||||
Eight.case("iso88591"),
|
||||
Eight.case("us-ascii"),
|
||||
Eight.case("x-cp1252"),
|
||||
=> EncodingLabel.latin1,
|
||||
|
||||
Eight.case("ucs-2"),
|
||||
Eight.case("utf-16"),
|
||||
Eight.case("unicode"),
|
||||
Eight.case("utf-16le"),
|
||||
=> EncodingLabel.@"UTF-16LE",
|
||||
|
||||
Eight.case("utf-16be"),
|
||||
=> EncodingLabel.@"UTF-16BE",
|
||||
|
||||
Eight.case("utf8"), Eight.case("utf-8") => EncodingLabel.@"UTF-8",
|
||||
else => null,
|
||||
},
|
||||
|
||||
9...16 => switch (Sixteen.matchLower(input)) {
|
||||
Sixteen.case("iso8859-1"),
|
||||
Sixteen.case("iso_8859-1"),
|
||||
Sixteen.case("iso-8859-1"),
|
||||
Sixteen.case("iso-ir-100"),
|
||||
Sixteen.case("csisolatin1"),
|
||||
Sixteen.case("windows-1252"),
|
||||
Sixteen.case("ansi_x3.4-1968"),
|
||||
Sixteen.case("iso_8859-1:1987"),
|
||||
=> EncodingLabel.latin1,
|
||||
|
||||
Sixteen.case("unicode11utf8"),
|
||||
Sixteen.case("unicode20utf8"),
|
||||
Sixteen.case("x-unicode20utf8"),
|
||||
=> EncodingLabel.@"UTF-8",
|
||||
|
||||
Sixteen.case("csunicode"),
|
||||
Sixteen.case("unicodefeff"),
|
||||
Sixteen.case("iso-10646-ucs-2"),
|
||||
=> EncodingLabel.@"UTF-16LE",
|
||||
|
||||
else => null,
|
||||
},
|
||||
else => if (strings.eqlCaseInsensitiveASCII(input, "unicode-1-1-utf-8", true))
|
||||
EncodingLabel.@"UTF-8"
|
||||
else
|
||||
null,
|
||||
};
|
||||
return strings.inMapCaseInsensitive(input, EncodingLabel.map);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -627,10 +629,14 @@ pub const TextDecoder = struct {
|
||||
ignore_bom: bool = false,
|
||||
fatal: bool = false,
|
||||
encoding: EncodingLabel = EncodingLabel.@"UTF-8",
|
||||
webkit_text_codec: ?*JSC.WebKitTextCodec = null,
|
||||
|
||||
pub usingnamespace bun.New(TextDecoder);
|
||||
|
||||
pub fn finalize(this: *TextDecoder) void {
|
||||
if (this.webkit_text_codec) |codec| {
|
||||
codec.deinit();
|
||||
}
|
||||
this.destroy();
|
||||
}
|
||||
|
||||
@@ -672,7 +678,7 @@ pub const TextDecoder = struct {
|
||||
this: *TextDecoder,
|
||||
globalThis: *JSC.JSGlobalObject,
|
||||
) JSC.JSValue {
|
||||
return ZigString.init(EncodingLabel.label.get(this.encoding).?).toJS(globalThis);
|
||||
return ZigString.init(@tagName(this.encoding)).toJS(globalThis);
|
||||
}
|
||||
const Vector16 = std.meta.Vector(16, u16);
|
||||
const max_16_ascii: Vector16 = @splat(@as(u16, 127));
|
||||
@@ -917,8 +923,20 @@ pub const TextDecoder = struct {
|
||||
return output.toJS(globalThis);
|
||||
},
|
||||
else => {
|
||||
globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{});
|
||||
return .zero;
|
||||
if (this.webkit_text_codec == null) {
|
||||
this.webkit_text_codec = JSC.WebKitTextCodec.init(this.encoding);
|
||||
}
|
||||
|
||||
const codec = this.webkit_text_codec.?;
|
||||
var did_stop_on_error = this.fatal;
|
||||
var str = codec.decode(buffer_slice, flush, &did_stop_on_error);
|
||||
defer str.deref();
|
||||
if (did_stop_on_error and this.fatal) {
|
||||
globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("The encoded data was not valid {s} data", .{@tagName(this.encoding)}).throw();
|
||||
return .zero;
|
||||
}
|
||||
|
||||
return str.toJS(globalThis);
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
9
test/js/web/encoding/text-decoder-shiftjis.test.ts
Normal file
9
test/js/web/encoding/text-decoder-shiftjis.test.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
import { test, expect } from "bun:test";
|
||||
|
||||
// Decodes three two-byte shift_jis sequences and expects the text "日本語".
test("shift_jis", () => {
  const encoded = Uint8Array.from([147, 250, 150, 123, 140, 234]);
  const decoded = new TextDecoder("shift_jis").decode(encoded);

  expect(decoded).toEqual("日本語");
});
|
||||
Reference in New Issue
Block a user