mirror of
https://github.com/oven-sh/bun
synced 2026-02-16 13:51:47 +00:00
Compare commits
5 Commits
claude/fix
...
jarred/tex
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6edb3f4845 | ||
|
|
38c9de8516 | ||
|
|
93fdf30a38 | ||
|
|
f9c23b6426 | ||
|
|
237c191033 |
@@ -518,10 +518,15 @@ set(BUN_ZIG_OUTPUT ${BUILD_PATH}/bun-zig.o)
|
||||
if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|ARM|arm64|ARM64|aarch64|AARCH64")
|
||||
if(APPLE)
|
||||
set(ZIG_CPU "apple_m1")
|
||||
set(HOMEBREW_PREFIX "/opt/homebrew")
|
||||
else()
|
||||
set(ZIG_CPU "native")
|
||||
endif()
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|X86_64|x64|X64|amd64|AMD64")
|
||||
if(APPLE)
|
||||
set(HOMEBREW_PREFIX "/usr/local")
|
||||
endif()
|
||||
|
||||
if(ENABLE_BASELINE)
|
||||
set(ZIG_CPU "nehalem")
|
||||
else()
|
||||
@@ -1010,6 +1015,8 @@ include_directories(${WEBKIT_INCLUDE_PATH})
|
||||
|
||||
# Pick an extra include directory depending on the WebKit flavour and host OS:
#  - non-local WebKit on non-Apple hosts adds WebKit's wtf/unicode directory;
#  - prebuilt WebKit on Apple hosts adds the icu4c headers under HOMEBREW_PREFIX.
if(NOT WEBKIT_LOCAL AND NOT APPLE)
  include_directories(${WEBKIT_INCLUDE_PATH}/wtf/unicode)
elseif(WEBKIT_PREBUILT AND APPLE)
  include_directories(${HOMEBREW_PREFIX}/opt/icu4c/include)
endif()
|
||||
|
||||
# --- Dependencies ---
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "_libusockets.h"
|
||||
#include "BunClientData.h"
|
||||
#include "EventLoopTask.h"
|
||||
#include "TextCodecICU.h"
|
||||
|
||||
extern "C" void Bun__startLoop(us_loop_t* loop);
|
||||
|
||||
@@ -17,6 +18,24 @@ static std::atomic<unsigned> lastUniqueIdentifier = 0;
|
||||
WTF_MAKE_ISO_ALLOCATED_IMPL(EventLoopTask);
|
||||
WTF_MAKE_ISO_ALLOCATED_IMPL(ScriptExecutionContext);
|
||||
|
||||
ScriptExecutionContext::ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject)
|
||||
: m_vm(vm)
|
||||
, m_globalObject(globalObject)
|
||||
, m_identifier(0)
|
||||
, m_broadcastChannelRegistry(BunBroadcastChannelRegistry::create())
|
||||
{
|
||||
regenerateIdentifier();
|
||||
}
|
||||
|
||||
ScriptExecutionContext::ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject, ScriptExecutionContextIdentifier identifier)
|
||||
: m_vm(vm)
|
||||
, m_globalObject(globalObject)
|
||||
, m_identifier(identifier)
|
||||
, m_broadcastChannelRegistry(BunBroadcastChannelRegistry::create())
|
||||
{
|
||||
addToContextsMap();
|
||||
}
|
||||
|
||||
static Lock allScriptExecutionContextsMapLock;
|
||||
static HashMap<ScriptExecutionContextIdentifier, ScriptExecutionContext*>& allScriptExecutionContextsMap() WTF_REQUIRES_LOCK(allScriptExecutionContextsMapLock)
|
||||
{
|
||||
@@ -31,6 +50,15 @@ ScriptExecutionContext* ScriptExecutionContext::getScriptExecutionContext(Script
|
||||
return allScriptExecutionContextsMap().get(identifier);
|
||||
}
|
||||
|
||||
// Returns this context's ICU converter wrapper, creating it on first use.
PAL::ICUConverterWrapper& ScriptExecutionContext::cachedConverterICU()
{
    if (m_cachedConverterICU)
        return *m_cachedConverterICU;

    // First request: allocate the wrapper and keep it for later calls.
    m_cachedConverterICU = makeUnique<PAL::ICUConverterWrapper>();
    return *m_cachedConverterICU;
}
|
||||
|
||||
template<bool SSL, bool isServer>
|
||||
static void registerHTTPContextForWebSocket(ScriptExecutionContext* script, us_socket_context_t* ctx, us_loop_t* loop)
|
||||
{
|
||||
|
||||
@@ -14,6 +14,10 @@
|
||||
#include "CachedScript.h"
|
||||
#include <wtf/URL.h>
|
||||
|
||||
// Forward declaration only: lets this header name PAL::ICUConverterWrapper
// without including its definition.
namespace PAL {
class ICUConverterWrapper;
} // namespace PAL
|
||||
|
||||
namespace uWS {
|
||||
template<bool isServer, bool isClient, typename UserData>
|
||||
struct WebSocketContext;
|
||||
@@ -37,24 +41,8 @@ class ScriptExecutionContext : public CanMakeWeakPtr<ScriptExecutionContext> {
|
||||
WTF_MAKE_ISO_ALLOCATED(ScriptExecutionContext);
|
||||
|
||||
public:
|
||||
ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject)
|
||||
: m_vm(vm)
|
||||
, m_globalObject(globalObject)
|
||||
, m_identifier(0)
|
||||
, m_broadcastChannelRegistry(BunBroadcastChannelRegistry::create())
|
||||
{
|
||||
regenerateIdentifier();
|
||||
}
|
||||
|
||||
ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject, ScriptExecutionContextIdentifier identifier)
|
||||
: m_vm(vm)
|
||||
, m_globalObject(globalObject)
|
||||
, m_identifier(identifier)
|
||||
, m_broadcastChannelRegistry(BunBroadcastChannelRegistry::create())
|
||||
{
|
||||
addToContextsMap();
|
||||
}
|
||||
|
||||
ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject);
|
||||
ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject, ScriptExecutionContextIdentifier identifier);
|
||||
~ScriptExecutionContext();
|
||||
|
||||
static ScriptExecutionContextIdentifier generateIdentifier();
|
||||
@@ -160,6 +148,8 @@ public:
|
||||
|
||||
static ScriptExecutionContext* getMainThreadScriptExecutionContext();
|
||||
|
||||
PAL::ICUConverterWrapper& cachedConverterICU();
|
||||
|
||||
private:
|
||||
JSC::VM* m_vm = nullptr;
|
||||
JSC::JSGlobalObject* m_globalObject = nullptr;
|
||||
@@ -184,6 +174,8 @@ private:
|
||||
us_socket_context_t* m_connected_ssl_client_websockets_ctx = nullptr;
|
||||
us_socket_context_t* m_connected_client_websockets_ctx = nullptr;
|
||||
|
||||
std::unique_ptr<PAL::ICUConverterWrapper> m_cachedConverterICU = { nullptr };
|
||||
|
||||
public:
|
||||
template<bool isSSL, bool isServer>
|
||||
us_socket_context_t* connectedWebSocketContext()
|
||||
|
||||
@@ -1092,3 +1092,24 @@ fn findPathInner(
|
||||
);
|
||||
return errorable.unwrap() catch null;
|
||||
}
|
||||
|
||||
/// Opaque handle to a text codec implemented behind the `WebKitTextCodec__*`
/// extern functions. Instances are obtained from `init` and released with `deinit`.
pub const WebKitTextCodec = opaque {
    extern fn WebKitTextCodec__create(label_ptr: [*]const u8, label_len: usize) ?*WebKitTextCodec;
    extern fn WebKitTextCodec__deinit(this: *WebKitTextCodec) void;
    extern fn WebKitTextCodec__decode(this: *WebKitTextCodec, input_ptr: [*]const u8, input_len: usize, flush: bool, stop_on_error: *bool) bun.String;
    extern fn WebKitTextCodec__stripByteOrderMark(this: *WebKitTextCodec) void;
    extern fn WebKitTextCodec__name(this: *WebKitTextCodec) bun.String;

    /// Creates a codec for `encoding_label`; returns null when the extern
    /// constructor yields no codec.
    pub fn init(encoding_label: []const u8) ?*WebKitTextCodec {
        const label_ptr = encoding_label.ptr;
        const label_len = encoding_label.len;
        return WebKitTextCodec__create(label_ptr, label_len);
    }

    /// Direct alias of the extern `WebKitTextCodec__name`.
    pub const name = WebKitTextCodec__name;

    /// Direct alias of the extern `WebKitTextCodec__deinit`.
    pub const deinit = WebKitTextCodec__deinit;

    /// Forwards `input` as a pointer/length pair, together with `flush` and
    /// `stop_on_error`, to the extern decoder and returns its result.
    pub fn decode(this: *WebKitTextCodec, input: []const u8, flush: bool, stop_on_error: *bool) bun.String {
        const input_ptr = input.ptr;
        const input_len = input.len;
        return WebKitTextCodec__decode(this, input_ptr, input_len, flush, stop_on_error);
    }

    /// Direct alias of the extern `WebKitTextCodec__stripByteOrderMark`.
    pub const stripByteOrderMark = WebKitTextCodec__stripByteOrderMark;
};
|
||||
|
||||
@@ -73,6 +73,11 @@
|
||||
#define WEBCORE_EXPORT JS_EXPORT_PRIVATE
|
||||
#endif
|
||||
|
||||
#if OS(DARWIN)
|
||||
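// On Darwin, turn off ICU's symbol renaming (the version suffix ICU appends to its function names) so ICU calls use the plain, unsuffixed symbol names.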
|
||||
#define U_DISABLE_RENAMING 1
|
||||
#endif
|
||||
|
||||
#include <wtf/PlatformCallingConventions.h>
|
||||
#include <JavaScriptCore/JSCJSValue.h>
|
||||
#include <wtf/text/MakeString.h>
|
||||
|
||||
187
src/bun.js/bindings/webcore/DecodeEscapeSequences.h
Normal file
187
src/bun.js/bindings/webcore/DecodeEscapeSequences.h
Normal file
@@ -0,0 +1,187 @@
|
||||
/*
|
||||
* Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved.
|
||||
* Copyright (c) 2012 Google, inc. All Rights Reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of Google Inc. nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextEncoding.h"
|
||||
#include <wtf/ASCIICType.h>
|
||||
#include <wtf/Assertions.h>
|
||||
#include <wtf/text/StringBuilder.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// See <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>.
|
||||
struct Unicode16BitEscapeSequence {
|
||||
enum { SequenceSize = 6 }; // e.g. %u26C4
|
||||
static size_t findInString(StringView string, size_t startPosition) { return string.find("%u"_s, startPosition); }
|
||||
static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
|
||||
{
|
||||
size_t runEnd = startPosition;
|
||||
while (endPosition - runEnd >= SequenceSize && string[runEnd] == '%' && string[runEnd + 1] == 'u'
|
||||
&& isASCIIHexDigit(string[runEnd + 2]) && isASCIIHexDigit(string[runEnd + 3])
|
||||
&& isASCIIHexDigit(string[runEnd + 4]) && isASCIIHexDigit(string[runEnd + 5])) {
|
||||
runEnd += SequenceSize;
|
||||
}
|
||||
return runEnd;
|
||||
}
|
||||
static String decodeRun(StringView run, const TextEncoding&)
|
||||
{
|
||||
// Each %u-escape sequence represents a UTF-16 code unit.
|
||||
// See <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>.
|
||||
// For 16-bit escape sequences, we know that findEndOfRun() has given us a contiguous run of sequences
|
||||
// without any intervening characters, so decode the run without additional checks.
|
||||
auto numberOfSequences = run.length() / SequenceSize;
|
||||
StringBuilder builder;
|
||||
builder.reserveCapacity(numberOfSequences);
|
||||
while (numberOfSequences--) {
|
||||
UChar codeUnit = (toASCIIHexValue(run[2]) << 12) | (toASCIIHexValue(run[3]) << 8) | (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]);
|
||||
builder.append(codeUnit);
|
||||
run = run.substring(SequenceSize);
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
};
|
||||
|
||||
struct URLEscapeSequence {
|
||||
enum { SequenceSize = 3 }; // e.g. %41
|
||||
static size_t findInString(StringView string, size_t startPosition) { return string.find('%', startPosition); }
|
||||
static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
|
||||
{
|
||||
// Make the simplifying assumption that supported encodings may have up to two unescaped characters
|
||||
// in the range 0x40 - 0x7F as the trailing bytes of their sequences which need to be passed into the
|
||||
// decoder as part of the run. In other words, we end the run at the first value outside of the
|
||||
// 0x40 - 0x7F range, after two values in this range, or at a %-sign that does not introduce a valid
|
||||
// escape sequence.
|
||||
size_t runEnd = startPosition;
|
||||
int numberOfTrailingCharacters = 0;
|
||||
while (runEnd < endPosition) {
|
||||
if (string[runEnd] == '%') {
|
||||
if (endPosition - runEnd >= SequenceSize && isASCIIHexDigit(string[runEnd + 1]) && isASCIIHexDigit(string[runEnd + 2])) {
|
||||
runEnd += SequenceSize;
|
||||
numberOfTrailingCharacters = 0;
|
||||
} else
|
||||
break;
|
||||
} else if (string[runEnd] >= 0x40 && string[runEnd] <= 0x7F && numberOfTrailingCharacters < 2) {
|
||||
runEnd += 1;
|
||||
numberOfTrailingCharacters += 1;
|
||||
} else
|
||||
break;
|
||||
}
|
||||
return runEnd;
|
||||
}
|
||||
|
||||
static Vector<uint8_t, 512> decodeRun(StringView run)
|
||||
{
|
||||
// For URL escape sequences, we know that findEndOfRun() has given us a run where every %-sign introduces
|
||||
// a valid escape sequence, but there may be characters between the sequences.
|
||||
Vector<uint8_t, 512> buffer;
|
||||
buffer.grow(run.length()); // Unescaping hex sequences only makes the length smaller.
|
||||
size_t bufferIndex = 0;
|
||||
while (!run.isEmpty()) {
|
||||
if (run[0] == '%') {
|
||||
buffer[bufferIndex++] = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]);
|
||||
run = run.substring(SequenceSize);
|
||||
} else {
|
||||
buffer[bufferIndex++] = run[0];
|
||||
run = run.substring(1);
|
||||
}
|
||||
}
|
||||
buffer.shrink(bufferIndex);
|
||||
return buffer;
|
||||
}
|
||||
|
||||
static String decodeRun(StringView run, const TextEncoding& encoding)
|
||||
{
|
||||
auto buffer = decodeRun(run);
|
||||
if (!encoding.isValid())
|
||||
return PAL::UTF8Encoding().decode(buffer.span());
|
||||
return encoding.decode(buffer.span());
|
||||
}
|
||||
};
|
||||
|
||||
template<typename EscapeSequence>
|
||||
String decodeEscapeSequences(StringView string, const TextEncoding& encoding)
|
||||
{
|
||||
StringBuilder result;
|
||||
size_t length = string.length();
|
||||
size_t decodedPosition = 0;
|
||||
size_t searchPosition = 0;
|
||||
size_t encodedRunPosition;
|
||||
while ((encodedRunPosition = EscapeSequence::findInString(string, searchPosition)) != notFound) {
|
||||
size_t encodedRunEnd = EscapeSequence::findEndOfRun(string, encodedRunPosition, length);
|
||||
searchPosition = encodedRunEnd;
|
||||
if (encodedRunEnd == encodedRunPosition) {
|
||||
++searchPosition;
|
||||
continue;
|
||||
}
|
||||
|
||||
String decoded = EscapeSequence::decodeRun(string.substring(encodedRunPosition, encodedRunEnd - encodedRunPosition), encoding);
|
||||
if (decoded.isEmpty())
|
||||
continue;
|
||||
|
||||
result.append(string.substring(decodedPosition, encodedRunPosition - decodedPosition), decoded);
|
||||
decodedPosition = encodedRunEnd;
|
||||
}
|
||||
result.append(string.substring(decodedPosition, length - decodedPosition));
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
// Converts `string` to bytes: %XX runs are unescaped to their raw byte values,
// and the text between runs is encoded with UTF8Encoding().encodeForURLParsing().
inline Vector<uint8_t> decodeURLEscapeSequencesAsData(StringView string)
{
    Vector<uint8_t> result;
    size_t decodedPosition = 0; // Everything before this index is already in `result`.
    size_t searchPosition = 0;

    for (;;) {
        const size_t runStart = URLEscapeSequence::findInString(string, searchPosition);
        const bool foundRun = runStart != notFound;
        size_t runEnd = 0;
        if (foundRun) {
            runEnd = URLEscapeSequence::findEndOfRun(string, runStart, string.length());
            if (runEnd == runStart) {
                // A '%' that does not start a valid escape; step past it.
                searchPosition = runEnd + 1;
                continue;
            }
            searchPosition = runEnd;
        }

        // Strings are encoded as requested.
        result.appendVector(PAL::UTF8Encoding().encodeForURLParsing(string.substring(decodedPosition, runStart - decodedPosition)));

        if (!foundRun)
            return result;

        // Bytes go through as-is.
        auto runBytes = URLEscapeSequence::decodeRun(string.substring(runStart, runEnd - runStart));
        ASSERT(!runBytes.isEmpty());
        result.appendVector(runBytes);

        decodedPosition = runEnd;
    }
}
|
||||
|
||||
} // namespace PAL
|
||||
39856
src/bun.js/bindings/webcore/EncodingTables.cpp
Normal file
39856
src/bun.js/bindings/webcore/EncodingTables.cpp
Normal file
File diff suppressed because it is too large
Load Diff
136
src/bun.js/bindings/webcore/EncodingTables.h
Normal file
136
src/bun.js/bindings/webcore/EncodingTables.h
Normal file
@@ -0,0 +1,136 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <iterator>
|
||||
#include <optional>
|
||||
#include <unicode/umachine.h>
|
||||
#include <utility>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
const std::array<std::pair<uint16_t, UChar>, 7724>& jis0208();
|
||||
const std::array<std::pair<uint16_t, UChar>, 6067>& jis0212();
|
||||
const std::array<std::pair<uint16_t, char32_t>, 18590>& big5();
|
||||
const std::array<std::pair<uint16_t, UChar>, 17048>& eucKR();
|
||||
const std::array<UChar, 23940>& gb18030();
|
||||
|
||||
void checkEncodingTableInvariants();
|
||||
|
||||
// Functions for using sorted arrays of pairs as a map.
|
||||
// FIXME: Consider moving these functions to StdLibExtras.h for uses other than encoding tables.
|
||||
template<typename CollectionType> void sortByFirst(CollectionType&);
|
||||
template<typename CollectionType> void stableSortByFirst(CollectionType&);
|
||||
template<typename CollectionType> bool isSortedByFirst(const CollectionType&);
|
||||
template<typename CollectionType> bool sortedFirstsAreUnique(const CollectionType&);
|
||||
template<typename CollectionType, typename KeyType> static auto findFirstInSortedPairs(const CollectionType& sortedPairsCollection, const KeyType&) -> std::optional<decltype(std::begin(sortedPairsCollection)->second)>;
|
||||
template<typename CollectionType, typename KeyType> static auto findInSortedPairs(const CollectionType& sortedPairsCollection, const KeyType&) -> std::pair<decltype(std::begin(sortedPairsCollection)), decltype(std::begin(sortedPairsCollection))>;
|
||||
|
||||
#if !ASSERT_ENABLED
// With assertions disabled the invariant check is an empty inline function.
inline void checkEncodingTableInvariants() { }
#endif
|
||||
|
||||
// Comparator that orders two objects by their `first` member using operator<.
// The operands may be of different types (e.g. a std::pair and a FirstAdapter).
struct CompareFirst {
    // Returns true when a.first < b.first.
    // Declared const so the comparator can be invoked through a const object or
    // const reference; it holds no state.
    template<typename TypeA, typename TypeB> bool operator()(const TypeA& a, const TypeB& b) const
    {
        return a.first < b.first;
    }
};
|
||||
|
||||
// Predicate that reports whether two objects have equal `first` members.
struct EqualFirst {
    // Returns true when a.first == b.first.
    // Declared const so the predicate can be invoked through a const object or
    // const reference; it holds no state.
    template<typename TypeA, typename TypeB> bool operator()(const TypeA& a, const TypeB& b) const
    {
        return a.first == b.first;
    }
};
|
||||
|
||||
// Comparator that orders two objects by their `second` member using operator<.
struct CompareSecond {
    // Returns true when a.second < b.second.
    // Declared const so the comparator can be invoked through a const object or
    // const reference; it holds no state.
    template<typename TypeA, typename TypeB> bool operator()(const TypeA& a, const TypeB& b) const
    {
        return a.second < b.second;
    }
};
|
||||
|
||||
// Wraps a value so it can be read through a `.first` member, which lets a bare
// key be compared against pairs by CompareFirst-style functors.
// Holds a reference: the wrapped value must outlive the adapter.
template<typename T> struct FirstAdapter {
    const T& first;
};

// Builds a FirstAdapter referring to `value`.
template<typename T> FirstAdapter<T> makeFirstAdapter(const T& value)
{
    return FirstAdapter<T> { value };
}
|
||||
|
||||
// Wraps a value so it can be read through a `.second` member.
// Holds a reference: the wrapped value must outlive the adapter.
template<typename T> struct SecondAdapter {
    const T& second;
};

// Builds a SecondAdapter referring to `value`.
template<typename T> SecondAdapter<T> makeSecondAdapter(const T& value)
{
    return SecondAdapter<T> { value };
}
|
||||
|
||||
template<typename CollectionType> void sortByFirst(CollectionType& collection)
|
||||
{
|
||||
std::sort(std::begin(collection), std::end(collection), CompareFirst {});
|
||||
}
|
||||
|
||||
template<typename CollectionType> void stableSortByFirst(CollectionType& collection)
|
||||
{
|
||||
std::stable_sort(std::begin(collection), std::end(collection), CompareFirst {});
|
||||
}
|
||||
|
||||
template<typename CollectionType> bool isSortedByFirst(const CollectionType& collection)
|
||||
{
|
||||
return std::is_sorted(std::begin(collection), std::end(collection), CompareFirst {});
|
||||
}
|
||||
|
||||
template<typename CollectionType> bool sortedFirstsAreUnique(const CollectionType& collection)
|
||||
{
|
||||
return std::adjacent_find(std::begin(collection), std::end(collection), EqualFirst {}) == std::end(collection);
|
||||
}
|
||||
|
||||
template<typename CollectionType, typename KeyType> static auto findFirstInSortedPairs(const CollectionType& collection, const KeyType& key) -> std::optional<decltype(std::begin(collection)->second)>
|
||||
{
|
||||
if constexpr (std::is_integral_v<KeyType>) {
|
||||
if (key != decltype(std::begin(collection)->first)(key))
|
||||
return std::nullopt;
|
||||
}
|
||||
auto iterator = std::lower_bound(std::begin(collection), std::end(collection), makeFirstAdapter(key), CompareFirst {});
|
||||
if (iterator == std::end(collection) || key < iterator->first)
|
||||
return std::nullopt;
|
||||
return iterator->second;
|
||||
}
|
||||
|
||||
template<typename CollectionType, typename KeyType> static auto findInSortedPairs(const CollectionType& collection, const KeyType& key) -> std::pair<decltype(std::begin(collection)), decltype(std::begin(collection))>
|
||||
{
|
||||
if constexpr (std::is_integral_v<KeyType>) {
|
||||
if (key != decltype(std::begin(collection)->first)(key))
|
||||
return { std::end(collection), std::end(collection) };
|
||||
}
|
||||
return std::equal_range(std::begin(collection), std::end(collection), makeFirstAdapter(key), CompareFirst {});
|
||||
}
|
||||
|
||||
}
|
||||
59
src/bun.js/bindings/webcore/KillRing.cpp
Normal file
59
src/bun.js/bindings/webcore/KillRing.cpp
Normal file
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
* Copyright (C) 2010 Google Inc. All Rights Reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "KillRing.h"
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
|
||||
#if !PLATFORM(MAC)
|
||||
|
||||
namespace PAL {
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(KillRing);
|
||||
|
||||
void KillRing::append(const String&)
|
||||
{
|
||||
}
|
||||
|
||||
void KillRing::prepend(const String&)
|
||||
{
|
||||
}
|
||||
|
||||
String KillRing::yank()
|
||||
{
|
||||
return String();
|
||||
}
|
||||
|
||||
void KillRing::startNewSequence()
|
||||
{
|
||||
}
|
||||
|
||||
void KillRing::setToYankedState()
|
||||
{
|
||||
}
|
||||
|
||||
} // namespace PAL
|
||||
|
||||
#endif // !PLATFORM(MAC)
|
||||
44
src/bun.js/bindings/webcore/KillRing.h
Normal file
44
src/bun.js/bindings/webcore/KillRing.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright (C) 2010 Google Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
#include <wtf/text/WTFString.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class KillRing {
|
||||
WTF_MAKE_TZONE_ALLOCATED_EXPORT(KillRing, );
|
||||
|
||||
public:
|
||||
void append(const String&);
|
||||
void prepend(const String&);
|
||||
String yank();
|
||||
void startNewSequence();
|
||||
void setToYankedState();
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
@@ -0,0 +1,286 @@
|
||||
/*
|
||||
* Copyright (C) 2013 Google Inc. All rights reserved.
|
||||
* Copyright (C) 2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above
|
||||
* copyright notice, this list of conditions and the following disclaimer
|
||||
* in the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* * Neither the name of Google Inc. nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <wtf/StdLibExtras.h>
|
||||
#include <wtf/text/StringCommon.h>
|
||||
#include <wtf/text/StringParsingBuffer.h>
|
||||
|
||||
namespace WTF {
|
||||
|
||||
// Logical negation of isUnicodeCompatibleASCIIWhitespace(c).
template<typename CharacterType> inline bool isNotASCIISpace(CharacterType c)
{
    const bool isWhitespace = isUnicodeCompatibleASCIIWhitespace(c);
    return !isWhitespace;
}
|
||||
|
||||
// Drops the first `amountToSkip` elements from the front of `data` by rebinding
// it to the remaining subspan.
template<typename T> void skip(std::span<T>& data, size_t amountToSkip)
{
    auto remainder = data.subspan(amountToSkip);
    data = remainder;
}
|
||||
|
||||
// Advances `position` by one when it is before `end` and points at `delimiter`.
// Returns true when it advanced; otherwise `position` is left unchanged.
template<typename CharacterType, typename DelimiterType> bool skipExactly(const CharacterType*& position, const CharacterType* end, DelimiterType delimiter)
{
    if (!(position < end))
        return false;
    if (!(*position == delimiter))
        return false;
    ++position;
    return true;
}
|
||||
|
||||
template<typename CharacterType, typename DelimiterType> bool skipExactly(std::span<const CharacterType>& data, DelimiterType delimiter)
|
||||
{
|
||||
if (!data.empty() && data.front() == delimiter) {
|
||||
skip(data, 1);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template<typename CharacterType, typename DelimiterType> bool skipExactly(StringParsingBuffer<CharacterType>& buffer, DelimiterType delimiter)
|
||||
{
|
||||
if (buffer.hasCharactersRemaining() && *buffer == delimiter) {
|
||||
++buffer;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template<bool characterPredicate(LChar)> bool skipExactly(StringParsingBuffer<LChar>& buffer)
|
||||
{
|
||||
if (buffer.hasCharactersRemaining() && characterPredicate(*buffer)) {
|
||||
++buffer;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template<bool characterPredicate(UChar)> bool skipExactly(StringParsingBuffer<UChar>& buffer)
|
||||
{
|
||||
if (buffer.hasCharactersRemaining() && characterPredicate(*buffer)) {
|
||||
++buffer;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template<bool characterPredicate(LChar)> bool skipExactly(std::span<const LChar>& buffer)
|
||||
{
|
||||
if (!buffer.empty() && characterPredicate(buffer[0])) {
|
||||
skip(buffer, 1);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template<bool characterPredicate(UChar)> bool skipExactly(std::span<const UChar>& buffer)
|
||||
{
|
||||
if (!buffer.empty() && characterPredicate(buffer[0])) {
|
||||
skip(buffer, 1);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Advances `buffer` until it is exhausted or its current character equals `delimiter`.
template<typename CharacterType, typename DelimiterType> void skipUntil(StringParsingBuffer<CharacterType>& buffer, DelimiterType delimiter)
{
    for (; buffer.hasCharactersRemaining() && *buffer != delimiter; ++buffer) { }
}
|
||||
|
||||
// Drops leading elements of `buffer` up to (not including) the first one equal
// to `delimiter`; drops everything when no element matches.
template<typename CharacterType, typename DelimiterType> void skipUntil(std::span<const CharacterType>& buffer, DelimiterType delimiter)
{
    size_t prefixLength = 0;
    for (; prefixLength < buffer.size() && buffer[prefixLength] != delimiter; ++prefixLength) { }
    skip(buffer, prefixLength);
}
|
||||
|
||||
// Drops leading elements of `data` up to (not including) the first one that
// satisfies `characterPredicate`; drops everything when none does.
template<bool characterPredicate(LChar)> void skipUntil(std::span<const LChar>& data)
{
    size_t prefixLength = 0;
    for (; prefixLength < data.size() && !characterPredicate(data[prefixLength]); ++prefixLength) { }
    skip(data, prefixLength);
}
|
||||
|
||||
// Drops leading elements of `data` up to (not including) the first one that
// satisfies `characterPredicate`; drops everything when none does.
template<bool characterPredicate(UChar)> void skipUntil(std::span<const UChar>& data)
{
    size_t prefixLength = 0;
    for (; prefixLength < data.size() && !characterPredicate(data[prefixLength]); ++prefixLength) { }
    skip(data, prefixLength);
}
|
||||
|
||||
// Advances `buffer` until it is exhausted or its current character satisfies
// `characterPredicate`.
template<bool characterPredicate(LChar)> void skipUntil(StringParsingBuffer<LChar>& buffer)
{
    for (; buffer.hasCharactersRemaining() && !characterPredicate(*buffer); ++buffer) { }
}
|
||||
|
||||
// Advances |buffer| until its current UChar satisfies characterPredicate or
// no characters remain.
template<bool characterPredicate(UChar)> void skipUntil(StringParsingBuffer<UChar>& buffer)
{
    for (; buffer.hasCharactersRemaining() && !characterPredicate(*buffer); ++buffer) { }
}
|
||||
|
||||
// Advances |buffer| past every consecutive character equal to |delimiter|,
// stopping at the first different character or at the end.
template<typename CharacterType, typename DelimiterType> void skipWhile(StringParsingBuffer<CharacterType>& buffer, DelimiterType delimiter)
{
    for (; buffer.hasCharactersRemaining() && *buffer == delimiter; ++buffer) { }
}
|
||||
|
||||
// Counts the leading elements of |buffer| equal to |delimiter| and passes
// that count to skip().
template<typename CharacterType, typename DelimiterType> void skipWhile(std::span<const CharacterType>& buffer, DelimiterType delimiter)
{
    size_t runLength = 0;
    for (; runLength < buffer.size() && buffer[runLength] == delimiter; ++runLength) { }
    skip(buffer, runLength);
}
|
||||
|
||||
// Counts the leading LChars of |data| that satisfy characterPredicate and
// passes that count to skip().
template<bool characterPredicate(LChar)> void skipWhile(std::span<const LChar>& data)
{
    size_t runLength = 0;
    for (; runLength < data.size() && characterPredicate(data[runLength]); ++runLength) { }
    skip(data, runLength);
}
|
||||
|
||||
// Counts the leading UChars of |data| that satisfy characterPredicate and
// passes that count to skip().
template<bool characterPredicate(UChar)> void skipWhile(std::span<const UChar>& data)
{
    size_t runLength = 0;
    for (; runLength < data.size() && characterPredicate(data[runLength]); ++runLength) { }
    skip(data, runLength);
}
|
||||
|
||||
// Advances |buffer| for as long as its current LChar satisfies
// characterPredicate and characters remain.
template<bool characterPredicate(LChar)> void skipWhile(StringParsingBuffer<LChar>& buffer)
{
    for (; buffer.hasCharactersRemaining() && characterPredicate(*buffer); ++buffer) { }
}
|
||||
|
||||
// Advances |buffer| for as long as its current UChar satisfies
// characterPredicate and characters remain.
template<bool characterPredicate(UChar)> void skipWhile(StringParsingBuffer<UChar>& buffer)
{
    for (; buffer.hasCharactersRemaining() && characterPredicate(*buffer); ++buffer) { }
}
|
||||
|
||||
template<typename CharacterType> bool skipExactlyIgnoringASCIICase(StringParsingBuffer<CharacterType>& buffer, ASCIILiteral literal)
|
||||
{
|
||||
auto literalLength = literal.length();
|
||||
|
||||
if (buffer.lengthRemaining() < literalLength)
|
||||
return false;
|
||||
if (!equalLettersIgnoringASCIICaseWithLength(buffer.span(), literal.span8(), literalLength))
|
||||
return false;
|
||||
buffer += literalLength;
|
||||
return true;
|
||||
}
|
||||
|
||||
template<typename CharacterType, std::size_t Extent> bool skipLettersExactlyIgnoringASCIICase(StringParsingBuffer<CharacterType>& buffer, std::span<const CharacterType, Extent> letters)
|
||||
{
|
||||
if (buffer.lengthRemaining() < letters.size())
|
||||
return false;
|
||||
for (unsigned i = 0; i < letters.size(); ++i) {
|
||||
ASSERT(isASCIIAlpha(letters[i]));
|
||||
if (!isASCIIAlphaCaselessEqual(buffer[i], static_cast<char>(letters[i])))
|
||||
return false;
|
||||
}
|
||||
buffer += letters.size();
|
||||
return true;
|
||||
}
|
||||
|
||||
template<typename CharacterType, std::size_t Extent> bool skipLettersExactlyIgnoringASCIICase(std::span<const CharacterType>& buffer, std::span<const CharacterType, Extent> letters)
|
||||
{
|
||||
if (buffer.size() < letters.size())
|
||||
return false;
|
||||
if (!equalLettersIgnoringASCIICaseWithLength(buffer, letters, letters.size()))
|
||||
return false;
|
||||
skip(buffer, letters.size());
|
||||
return true;
|
||||
}
|
||||
|
||||
template<typename CharacterType, std::size_t Extent> constexpr bool skipCharactersExactly(StringParsingBuffer<CharacterType>& buffer, std::span<const CharacterType, Extent> string)
|
||||
{
|
||||
if (!spanHasPrefix(buffer.span(), string))
|
||||
return false;
|
||||
buffer += string.size();
|
||||
return true;
|
||||
}
|
||||
|
||||
template<typename CharacterType, std::size_t Extent> constexpr bool skipCharactersExactly(std::span<const CharacterType>& buffer, std::span<const CharacterType, Extent> string)
|
||||
{
|
||||
if (!spanHasPrefix(buffer, string))
|
||||
return false;
|
||||
skip(buffer, string.size());
|
||||
return true;
|
||||
}
|
||||
|
||||
// Splits |amountToConsume| elements off the front of |data|: returns a span
// over those elements and passes the same count to skip() for |data|.
// No bounds check is performed here beyond what span::first() and skip() do.
template<typename T> std::span<T> consumeSpan(std::span<T>& data, size_t amountToConsume)
{
    std::span<T> head = data.first(amountToConsume);
    skip(data, amountToConsume);
    return head;
}
|
||||
|
||||
template<typename T> T& consume(std::span<T>& data)
|
||||
{
|
||||
T& value = data[0];
|
||||
skip(data, 1);
|
||||
return value;
|
||||
}
|
||||
|
||||
template<typename DestinationType, typename SourceType>
|
||||
match_constness_t<SourceType, DestinationType>& consumeAndCastTo(std::span<SourceType>& data)
|
||||
requires(sizeof(SourceType) == 1)
|
||||
{
|
||||
return spanReinterpretCast<match_constness_t<SourceType, DestinationType>>(consumeSpan(data, sizeof(DestinationType)))[0];
|
||||
}
|
||||
|
||||
// Adapt a UChar-predicate to an LChar-predicate.
|
||||
template<bool characterPredicate(UChar)>
|
||||
static inline bool LCharPredicateAdapter(LChar c) { return characterPredicate(c); }
|
||||
|
||||
} // namespace WTF
|
||||
|
||||
using WTF::consume;
|
||||
using WTF::consumeAndCastTo;
|
||||
using WTF::consumeSpan;
|
||||
using WTF::isNotASCIISpace;
|
||||
using WTF::LCharPredicateAdapter;
|
||||
using WTF::skip;
|
||||
using WTF::skipCharactersExactly;
|
||||
using WTF::skipExactly;
|
||||
using WTF::skipExactlyIgnoringASCIICase;
|
||||
using WTF::skipLettersExactlyIgnoringASCIICase;
|
||||
using WTF::skipUntil;
|
||||
using WTF::skipWhile;
|
||||
68
src/bun.js/bindings/webcore/TextCodec.cpp
Normal file
68
src/bun.js/bindings/webcore/TextCodec.cpp
Normal file
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodec.h"
|
||||
#include <unicode/uchar.h>
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
#include <wtf/text/WTFString.h>
|
||||
#include <wtf/unicode/CharacterNames.h>
|
||||
|
||||
#include <array>
|
||||
#include <cstdio>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodec);
|
||||
|
||||
// Formats the replacement text for an unencodable code point into
// |replacement| and returns a span over the characters written (the
// terminating NUL that snprintf adds is not part of the span).
//
// - UnencodableHandling::Entities produces "&#N;".
// - UnencodableHandling::URLEncodedEntities produces "%26%23N%3B".
//
// where N is the decimal code point. Surrogate code points are replaced with
// replacementCharacter first.
std::span<char> TextCodec::getUnencodableReplacement(char32_t codePoint, UnencodableHandling handling, UnencodableReplacementArray& replacement)
{
    ASSERT(!(codePoint > UCHAR_MAX_VALUE));

    // The Encoding Standard doesn't have surrogate code points in the input, but that would require
    // scanning and potentially manipulating inputs ahead of time. Instead handle them at the last
    // possible point.
    if (U_IS_SURROGATE(codePoint))
        codePoint = replacementCharacter;

    // snprintf returns the length it *wanted* to write, which exceeds the
    // buffer when the output was truncated, and a negative value on error.
    // Clamp to what is actually stored so span::first() never receives an
    // out-of-range count.
    auto writtenPrefix = [&replacement](int count) -> std::span<char> {
        ASSERT(count >= 0);
        if (count <= 0)
            return std::span { replacement }.first(0);
        size_t length = std::min<size_t>(static_cast<size_t>(count), replacement.size() - 1);
        return std::span { replacement }.first(length);
    };

    switch (handling) {
    case UnencodableHandling::Entities:
        return writtenPrefix(snprintf(replacement.data(), replacement.size(), "&#%u;", static_cast<unsigned>(codePoint)));
    case UnencodableHandling::URLEncodedEntities:
        return writtenPrefix(snprintf(replacement.data(), replacement.size(), "%%26%%23%u%%3B", static_cast<unsigned>(codePoint)));
    }

    ASSERT_NOT_REACHED();
    replacement[0] = '\0';
    return std::span { replacement }.first(0);
}
|
||||
|
||||
} // namespace PAL
|
||||
70
src/bun.js/bindings/webcore/TextCodec.h
Normal file
70
src/bun.js/bindings/webcore/TextCodec.h
Normal file
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2020 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "UnencodableHandling.h"
|
||||
#include <array>
|
||||
#include <memory>
|
||||
#include <span>
|
||||
#include <unicode/umachine.h>
|
||||
#include <wtf/Forward.h>
|
||||
#include <wtf/Noncopyable.h>
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextEncoding;
|
||||
|
||||
using UnencodableReplacementArray = std::array<char, 32>;
|
||||
|
||||
class TextCodec {
|
||||
WTF_MAKE_TZONE_ALLOCATED(TextCodec);
|
||||
WTF_MAKE_NONCOPYABLE(TextCodec);
|
||||
|
||||
public:
|
||||
TextCodec() = default;
|
||||
virtual ~TextCodec() = default;
|
||||
|
||||
virtual void stripByteOrderMark() {}
|
||||
virtual String decode(std::span<const uint8_t> data, bool flush, bool stopOnError, bool& sawError) = 0;
|
||||
|
||||
virtual Vector<uint8_t> encode(StringView, UnencodableHandling) const = 0;
|
||||
|
||||
// Fills a null-terminated string representation of the given
|
||||
// unencodable character into the given replacement buffer.
|
||||
// The length of the string (not including the null) will be returned.
|
||||
static std::span<char> getUnencodableReplacement(char32_t, UnencodableHandling, UnencodableReplacementArray& LIFETIME_BOUND);
|
||||
};
|
||||
|
||||
Function<void(char32_t, Vector<uint8_t>&)> unencodableHandler(UnencodableHandling);
|
||||
|
||||
using EncodingNameRegistrar = void (*)(ASCIILiteral alias, ASCIILiteral name);
|
||||
|
||||
using NewTextCodecFunction = Function<std::unique_ptr<TextCodec>()>;
|
||||
using TextCodecRegistrar = void (*)(ASCIILiteral name, NewTextCodecFunction&&);
|
||||
|
||||
} // namespace PAL
|
||||
78
src/bun.js/bindings/webcore/TextCodecASCIIFastPath.h
Normal file
78
src/bun.js/bindings/webcore/TextCodecASCIIFastPath.h
Normal file
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
* Copyright (C) 2011 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <wtf/StdLibExtras.h>
|
||||
#include <wtf/text/ASCIIFastPath.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
template<size_t size> struct UCharByteFiller;
|
||||
template<> struct UCharByteFiller<4> {
|
||||
static void copy(std::span<LChar> destination, std::span<const uint8_t> source)
|
||||
{
|
||||
memcpySpan(destination, source.first(4));
|
||||
}
|
||||
|
||||
static void copy(std::span<UChar> destination, std::span<const uint8_t> source)
|
||||
{
|
||||
destination[0] = source[0];
|
||||
destination[1] = source[1];
|
||||
destination[2] = source[2];
|
||||
destination[3] = source[3];
|
||||
}
|
||||
};
|
||||
template<> struct UCharByteFiller<8> {
|
||||
static void copy(std::span<LChar> destination, std::span<const uint8_t> source)
|
||||
{
|
||||
memcpySpan(destination, source.first(8));
|
||||
}
|
||||
|
||||
static void copy(std::span<UChar> destination, std::span<const uint8_t> source)
|
||||
{
|
||||
destination[0] = source[0];
|
||||
destination[1] = source[1];
|
||||
destination[2] = source[2];
|
||||
destination[3] = source[3];
|
||||
destination[4] = source[4];
|
||||
destination[5] = source[5];
|
||||
destination[6] = source[6];
|
||||
destination[7] = source[7];
|
||||
}
|
||||
};
|
||||
|
||||
// Copies sizeof(WTF::MachineWord) bytes from |source| into an LChar
// destination using the matching UCharByteFiller specialization.
inline void copyASCIIMachineWord(std::span<LChar> destination, std::span<const uint8_t> source)
{
    using WordFiller = UCharByteFiller<sizeof(WTF::MachineWord)>;
    WordFiller::copy(destination, source);
}
|
||||
|
||||
// Copies sizeof(WTF::MachineWord) bytes from |source| into a UChar
// destination using the matching UCharByteFiller specialization.
inline void copyASCIIMachineWord(std::span<UChar> destination, std::span<const uint8_t> source)
{
    using WordFiller = UCharByteFiller<sizeof(WTF::MachineWord)>;
    WordFiller::copy(destination, source);
}
|
||||
|
||||
} // namespace PAL
|
||||
1219
src/bun.js/bindings/webcore/TextCodecCJK.cpp
Normal file
1219
src/bun.js/bindings/webcore/TextCodecCJK.cpp
Normal file
File diff suppressed because it is too large
Load Diff
76
src/bun.js/bindings/webcore/TextCodecCJK.h
Normal file
76
src/bun.js/bindings/webcore/TextCodecCJK.h
Normal file
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include <optional>
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextCodecCJK final : public TextCodec {
|
||||
WTF_MAKE_TZONE_ALLOCATED(TextCodecCJK);
|
||||
public:
|
||||
static void registerEncodingNames(EncodingNameRegistrar);
|
||||
static void registerCodecs(TextCodecRegistrar);
|
||||
|
||||
enum class Encoding : uint8_t;
|
||||
explicit TextCodecCJK(Encoding);
|
||||
|
||||
private:
|
||||
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
|
||||
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
|
||||
|
||||
enum class SawError : bool { No, Yes };
|
||||
String decodeCommon(std::span<const uint8_t>, bool, bool, bool&, const Function<SawError(uint8_t, StringBuilder&)>&);
|
||||
|
||||
String eucJPDecode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
String iso2022JPDecode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
String shiftJISDecode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
String eucKRDecode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
String big5Decode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
String gbkDecode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
String gb18030Decode(std::span<const uint8_t>, bool, bool, bool&);
|
||||
|
||||
const Encoding m_encoding;
|
||||
|
||||
bool m_jis0212 { false };
|
||||
|
||||
enum class ISO2022JPDecoderState : uint8_t { ASCII, Roman, Katakana, LeadByte, TrailByte, EscapeStart, Escape };
|
||||
ISO2022JPDecoderState m_iso2022JPDecoderState { ISO2022JPDecoderState::ASCII };
|
||||
ISO2022JPDecoderState m_iso2022JPDecoderOutputState { ISO2022JPDecoderState::ASCII };
|
||||
bool m_iso2022JPOutput { false };
|
||||
std::optional<uint8_t> m_iso2022JPSecondPrependedByte;
|
||||
|
||||
uint8_t m_gb18030First { 0x00 };
|
||||
uint8_t m_gb18030Second { 0x00 };
|
||||
uint8_t m_gb18030Third { 0x00 };
|
||||
|
||||
uint8_t m_lead { 0x00 };
|
||||
std::optional<uint8_t> m_prependedByte;
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
337
src/bun.js/bindings/webcore/TextCodecICU.cpp
Normal file
337
src/bun.js/bindings/webcore/TextCodecICU.cpp
Normal file
@@ -0,0 +1,337 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodecICU.h"
|
||||
#include "ZigGlobalObject.h"
|
||||
#include "TextEncoding.h"
|
||||
#include "TextEncodingRegistry.h"
|
||||
// #include "ThreadGlobalData.h"
|
||||
#include <array>
|
||||
#include <unicode-ucnv_cb.h>
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
#include <wtf/Threading.h>
|
||||
#include <wtf/text/CString.h>
|
||||
#include "ParsingUtilities-removeAfterWebKitUpgrade.h"
|
||||
#include <wtf/text/StringBuilder.h>
|
||||
#include <wtf/unicode/CharacterNames.h>
|
||||
#include <wtf/unicode/icu/ICUHelpers.h>
|
||||
#include "ScriptExecutionContext.h"
|
||||
|
||||
namespace PAL {
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecICU);
|
||||
|
||||
const size_t ConversionBufferSize = 16384;
|
||||
|
||||
// Returns the ICUConverterWrapper held by the default global object's
// ScriptExecutionContext; it is used below as a one-slot converter cache.
static ICUConverterWrapper& cachedConverterICU()
{
    auto&& globalObject = defaultGlobalObject();
    return globalObject->scriptExecutionContext()->cachedConverterICU();
}
|
||||
|
||||
#define DECLARE_ALIASES(encoding, ...) \
|
||||
static constexpr ASCIILiteral encoding##_aliases[] { __VA_ARGS__ }
|
||||
|
||||
// From https://encoding.spec.whatwg.org. Plus a few extra aliases that macOS had historically from TEC.
|
||||
DECLARE_ALIASES(ISO_8859_2, "csisolatin2"_s, "iso-ir-101"_s, "iso8859-2"_s, "iso88592"_s, "iso_8859-2"_s, "iso_8859-2:1987"_s, "l2"_s, "latin2"_s);
|
||||
DECLARE_ALIASES(ISO_8859_4, "csisolatin4"_s, "iso-ir-110"_s, "iso8859-4"_s, "iso88594"_s, "iso_8859-4"_s, "iso_8859-4:1988"_s, "l4"_s, "latin4"_s);
|
||||
DECLARE_ALIASES(ISO_8859_5, "csisolatincyrillic"_s, "cyrillic"_s, "iso-ir-144"_s, "iso8859-5"_s, "iso88595"_s, "iso_8859-5"_s, "iso_8859-5:1988"_s);
|
||||
DECLARE_ALIASES(ISO_8859_10, "csisolatin6"_s, "iso-ir-157"_s, "iso8859-10"_s, "iso885910"_s, "l6"_s, "latin6"_s, "iso8859101992"_s, "isoir157"_s);
|
||||
DECLARE_ALIASES(ISO_8859_13, "iso8859-13"_s, "iso885913"_s);
|
||||
DECLARE_ALIASES(ISO_8859_14, "iso8859-14"_s, "iso885914"_s, "isoceltic"_s, "iso8859141998"_s, "isoir199"_s, "latin8"_s, "l8"_s);
|
||||
DECLARE_ALIASES(ISO_8859_15, "csisolatin9"_s, "iso8859-15"_s, "iso885915"_s, "iso_8859-15"_s, "l9"_s);
|
||||
DECLARE_ALIASES(ISO_8859_16, "isoir226"_s, "iso8859162001"_s, "l10"_s, "latin10"_s);
|
||||
DECLARE_ALIASES(KOI8_R, "cskoi8r"_s, "koi"_s, "koi8"_s, "koi8_r"_s);
|
||||
DECLARE_ALIASES(macintosh, "csmacintosh"_s, "mac"_s, "x-mac-roman"_s, "macroman"_s, "x-macroman"_s);
|
||||
DECLARE_ALIASES(windows_1250, "cp1250"_s, "x-cp1250"_s, "winlatin2"_s);
|
||||
DECLARE_ALIASES(windows_1251, "cp1251"_s, "wincyrillic"_s, "x-cp1251"_s);
|
||||
DECLARE_ALIASES(windows_1254, "winturkish"_s, "cp1254"_s, "csisolatin5"_s, "iso-8859-9"_s, "iso-ir-148"_s, "iso8859-9"_s, "iso88599"_s, "iso_8859-9"_s, "iso_8859-9:1989"_s, "l5"_s, "latin5"_s, "x-cp1254"_s);
|
||||
DECLARE_ALIASES(windows_1256, "winarabic"_s, "cp1256"_s, "x-cp1256"_s);
|
||||
DECLARE_ALIASES(windows_1258, "winvietnamese"_s, "cp1258"_s, "x-cp1258"_s);
|
||||
DECLARE_ALIASES(x_mac_cyrillic, "maccyrillic"_s, "x-mac-ukrainian"_s, "windows-10007"_s, "mac-cyrillic"_s, "maccy"_s, "x-MacCyrillic"_s, "x-MacUkraine"_s);
|
||||
// Encodings below are not in the standard.
|
||||
DECLARE_ALIASES(x_mac_greek, "windows-10006"_s, "macgr"_s, "x-MacGreek"_s);
|
||||
DECLARE_ALIASES(x_mac_centraleurroman, "windows-10029"_s, "x-mac-ce"_s, "macce"_s, "maccentraleurope"_s, "x-MacCentralEurope"_s);
|
||||
DECLARE_ALIASES(x_mac_turkish, "windows-10081"_s, "mactr"_s, "x-MacTurkish"_s);
|
||||
|
||||
#define DECLARE_ENCODING_NAME(encoding, alias_array) \
|
||||
{ \
|
||||
encoding, std::span { alias_array##_aliases } \
|
||||
}
|
||||
|
||||
#define DECLARE_ENCODING_NAME_NO_ALIASES(encoding) \
|
||||
{ \
|
||||
encoding, {} \
|
||||
}
|
||||
|
||||
static const struct EncodingName {
|
||||
ASCIILiteral name;
|
||||
std::span<const ASCIILiteral> aliases;
|
||||
} encodingNames[] = {
|
||||
DECLARE_ENCODING_NAME("ISO-8859-2"_s, ISO_8859_2),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-4"_s, ISO_8859_4),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-5"_s, ISO_8859_5),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-10"_s, ISO_8859_10),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-13"_s, ISO_8859_13),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-14"_s, ISO_8859_14),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-15"_s, ISO_8859_15),
|
||||
DECLARE_ENCODING_NAME("ISO-8859-16"_s, ISO_8859_16),
|
||||
DECLARE_ENCODING_NAME("KOI8-R"_s, KOI8_R),
|
||||
DECLARE_ENCODING_NAME("macintosh"_s, macintosh),
|
||||
DECLARE_ENCODING_NAME("windows-1250"_s, windows_1250),
|
||||
DECLARE_ENCODING_NAME("windows-1251"_s, windows_1251),
|
||||
DECLARE_ENCODING_NAME("windows-1254"_s, windows_1254),
|
||||
DECLARE_ENCODING_NAME("windows-1256"_s, windows_1256),
|
||||
DECLARE_ENCODING_NAME("windows-1258"_s, windows_1258),
|
||||
DECLARE_ENCODING_NAME("x-mac-cyrillic"_s, x_mac_cyrillic),
|
||||
// Encodings below are not in the standard.
|
||||
DECLARE_ENCODING_NAME("x-mac-greek"_s, x_mac_greek),
|
||||
DECLARE_ENCODING_NAME("x-mac-centraleurroman"_s, x_mac_centraleurroman),
|
||||
DECLARE_ENCODING_NAME("x-mac-turkish"_s, x_mac_turkish),
|
||||
DECLARE_ENCODING_NAME_NO_ALIASES("EUC-TW"_s),
|
||||
};
|
||||
|
||||
// Registers every entry of encodingNames with |registrar|: first the
// canonical name mapped to itself, then each alias mapped to the canonical
// name.
void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
{
    for (const auto& [canonicalName, aliases] : encodingNames) {
        registrar(canonicalName, canonicalName);
        for (const auto& alias : aliases)
            registrar(alias, canonicalName);
    }
}
|
||||
|
||||
// Registers a codec factory with |registrar| for every entry of
// encodingNames. The ICU converter name is looked up via
// ucnv_getCanonicalName() under the "IANA" standard; when that yields
// nothing, a converter is opened by name and asked for its own name.
// Entries for which no converter name can be found are skipped.
void TextCodecICU::registerCodecs(TextCodecRegistrar registrar)
{
    for (auto& entry : encodingNames) {
        ASCIILiteral encoding = entry.name;

        UErrorCode status = U_ZERO_ERROR;
        const char* converterName = ucnv_getCanonicalName(encoding, "IANA", &status);
        ASSERT(U_SUCCESS(status));

        if (!converterName) {
            auto probe = ICUConverterPtr { ucnv_open(encoding, &status) };
            ASSERT(U_SUCCESS(status));
            converterName = ucnv_getName(probe.get(), &status);
            ASSERT(U_SUCCESS(status));
            if (!converterName) {
                ASSERT_NOT_REACHED();
                continue;
            }
        }

        registrar(encoding, [encoding, converterName] {
            // ucnv_getCanonicalName() returns a static string owned by libicu so the call to
            // ASCIILiteral::fromLiteralUnsafe() should be safe.
            return makeUnique<TextCodecICU>(encoding, ASCIILiteral::fromLiteralUnsafe(converterName));
        });
    }
}
|
||||
|
||||
TextCodecICU::TextCodecICU(ASCIILiteral encoding, ASCIILiteral canonicalConverterName)
|
||||
: m_encodingName(encoding)
|
||||
, m_canonicalConverterName(canonicalConverterName)
|
||||
{
|
||||
ASSERT(!m_canonicalConverterName.isNull());
|
||||
}
|
||||
|
||||
// If a converter was created, resets it and moves it into the cached
// converter slot so a later codec can reuse it.
TextCodecICU::~TextCodecICU()
{
    if (!m_converter)
        return;

    ucnv_reset(m_converter.get());
    cachedConverterICU().converter = WTFMove(m_converter);
}
|
||||
|
||||
void TextCodecICU::createICUConverter() const
|
||||
{
|
||||
ASSERT(!m_converter);
|
||||
|
||||
auto& cachedConverter = cachedConverterICU().converter;
|
||||
if (cachedConverter) {
|
||||
UErrorCode error = U_ZERO_ERROR;
|
||||
const char* cachedConverterName = ucnv_getName(cachedConverter.get(), &error);
|
||||
if (U_SUCCESS(error) && !strcmp(m_canonicalConverterName, cachedConverterName)) {
|
||||
m_converter = WTFMove(cachedConverter);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
UErrorCode error = U_ZERO_ERROR;
|
||||
m_converter = ICUConverterPtr { ucnv_open(m_canonicalConverterName, &error) };
|
||||
if (m_converter)
|
||||
ucnv_setFallback(m_converter.get(), true);
|
||||
}
|
||||
|
||||
// Runs one ucnv_toUnicode() pass from |sourceSpan| into |targetSpan|.
// |error| is reset to U_ZERO_ERROR before the call and receives ICU's status.
// The number of source bytes ICU consumed is passed to skip() for
// |sourceSpan|. Returns the number of UChars written to |targetSpan|.
int TextCodecICU::decodeToBuffer(std::span<UChar> targetSpan, std::span<const uint8_t>& sourceSpan, int32_t* offsets, bool flush, UErrorCode& error)
{
    error = U_ZERO_ERROR;

    UChar* const targetBegin = targetSpan.data();
    auto* targetCursor = targetSpan.data();
    auto* targetEnd = std::to_address(targetSpan.end());

    auto* sourceCursor = byteCast<char>(sourceSpan.data());
    auto* sourceEnd = byteCast<char>(std::to_address(sourceSpan.end()));

    ucnv_toUnicode(m_converter.get(), &targetCursor, targetEnd, &sourceCursor, sourceEnd, offsets, flush, &error);

    skip(sourceSpan, byteCast<uint8_t>(sourceCursor) - sourceSpan.data());
    return targetCursor - targetBegin;
}
|
||||
|
||||
class ErrorCallbackSetter {
|
||||
public:
|
||||
ErrorCallbackSetter(UConverter& converter, bool stopOnError)
|
||||
: m_converter(converter)
|
||||
, m_shouldStopOnEncodingErrors(stopOnError)
|
||||
{
|
||||
if (m_shouldStopOnEncodingErrors) {
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
ucnv_setToUCallBack(&m_converter, UCNV_TO_U_CALLBACK_SUBSTITUTE, UCNV_SUB_STOP_ON_ILLEGAL, &m_savedAction, &m_savedContext, &err);
|
||||
ASSERT(U_SUCCESS(err));
|
||||
}
|
||||
}
|
||||
~ErrorCallbackSetter()
|
||||
{
|
||||
if (m_shouldStopOnEncodingErrors) {
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
const void* oldContext;
|
||||
UConverterToUCallback oldAction;
|
||||
ucnv_setToUCallBack(&m_converter, m_savedAction, m_savedContext, &oldAction, &oldContext, &err);
|
||||
ASSERT(oldAction == UCNV_TO_U_CALLBACK_SUBSTITUTE);
|
||||
ASSERT(!strcmp(static_cast<const char*>(oldContext), UCNV_SUB_STOP_ON_ILLEGAL));
|
||||
ASSERT(U_SUCCESS(err));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
UConverter& m_converter;
|
||||
bool m_shouldStopOnEncodingErrors;
|
||||
const void* m_savedContext { nullptr };
|
||||
UConverterToUCallback m_savedAction { nullptr };
|
||||
};
|
||||
|
||||
// Decodes |source| with the ICU converter for this codec, creating the
// converter on first use. Output is produced in chunks of at most
// ConversionBufferSize UChars and accumulated in a StringBuilder.
// When ICU reports a failure, the converter is flushed over the remaining
// input so it can be reused, and |sawError| is set to true. |sawError| is
// also set (and a null String returned) when no converter can be created.
String TextCodecICU::decode(std::span<const uint8_t> source, bool flush, bool stopOnError, bool& sawError)
{
    // Get a converter for the passed-in encoding.
    if (!m_converter)
        createICUConverter();
    if (!m_converter) {
        LOG_ERROR("error creating ICU encoder even though encoding was in table");
        sawError = true;
        return {};
    }

    ErrorCallbackSetter callbackSetter(*m_converter, stopOnError);

    StringBuilder builder;

    std::array<UChar, ConversionBufferSize> scratch;
    auto scratchSpan = std::span { scratch };
    int32_t* offsets = nullptr;
    UErrorCode status = U_ZERO_ERROR;

    // Keep converting while ICU reports that the output chunk was too small.
    do {
        size_t decodedLength = decodeToBuffer(scratchSpan, source, offsets, flush, status);
        builder.append(scratchSpan.first(decodedLength));
    } while (needsToGrowToProduceBuffer(status));

    if (U_FAILURE(status)) {
        // flush the converter so it can be reused, and not be bothered by this error.
        do {
            decodeToBuffer(scratchSpan, source, offsets, true, status);
        } while (!source.empty());
        sawError = true;
    }

    return builder.toString();
}
|
||||
|
||||
// Invalid character handler when writing escaped entities for unrepresentable
|
||||
// characters. See the declaration of TextCodec::encode for more.
|
||||
static void urlEscapedEntityCallback(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,
|
||||
UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* error)
|
||||
{
|
||||
if (reason == UCNV_UNASSIGNED) {
|
||||
*error = U_ZERO_ERROR;
|
||||
UnencodableReplacementArray entity;
|
||||
auto span = TextCodec::getUnencodableReplacement(codePoint, UnencodableHandling::URLEncodedEntities, entity);
|
||||
ucnv_cbFromUWriteBytes(fromUArgs, span.data(), span.size(), 0, error);
|
||||
} else
|
||||
UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, error);
|
||||
}
|
||||
|
||||
// Encodes `string` with this codec's ICU converter and returns the resulting bytes.
//
// Returns an empty vector for an empty string, when no converter can be created, or when
// installing the unencodable-character callback fails. `handling` selects which callback
// ICU uses for characters the target encoding cannot represent.
Vector<uint8_t> TextCodecICU::encode(StringView string, UnencodableHandling handling) const
{
    if (string.isEmpty())
        return {};

    // The converter is created on first use.
    if (!m_converter)
        createICUConverter();
    if (!m_converter)
        return {};

    // FIXME: We should see if there is "force ASCII range" mode in ICU;
    // until then, we change the backslash into a yen sign.
    // Encoding will change the yen sign back into a backslash.
    String yenSubstituted;
    if (shouldShowBackslashAsCurrencySymbolIn(m_encodingName)) {
        yenSubstituted = makeStringByReplacingAll(string, '\\', yenSign);
        string = yenSubstituted;
    }

    UErrorCode status = U_ZERO_ERROR;
    switch (handling) {
    case UnencodableHandling::Entities:
        ucnv_setFromUCallBack(m_converter.get(), UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, nullptr, nullptr, &status);
        if (U_FAILURE(status))
            return {};
        break;
    case UnencodableHandling::URLEncodedEntities:
        ucnv_setFromUCallBack(m_converter.get(), urlEscapedEntityCallback, nullptr, nullptr, nullptr, &status);
        if (U_FAILURE(status))
            return {};
        break;
    }

    auto upconverted = string.upconvertedCharacters();
    auto sourceCursor = upconverted.span().data();
    auto* sourceEnd = std::to_address(upconverted.span().end());

    // Convert in fixed-size chunks, appending each chunk's output, for as long as the
    // status says more output room is needed.
    Vector<uint8_t> encoded;
    for (;;) {
        std::array<char, ConversionBufferSize> chunk;
        char* writeCursor = chunk.data();
        char* chunkEnd = std::to_address(std::span { chunk }.end());
        status = U_ZERO_ERROR;
        ucnv_fromUnicode(m_converter.get(), &writeCursor, chunkEnd, &sourceCursor, sourceEnd, nullptr, true, &status);
        encoded.append(byteCast<uint8_t>(std::span(chunk)).first(writeCursor - chunk.data()));
        if (!needsToGrowToProduceBuffer(status))
            break;
    }
    return encoded;
}
|
||||
|
||||
} // namespace PAL
|
||||
69
src/bun.js/bindings/webcore/TextCodecICU.h
Normal file
69
src/bun.js/bindings/webcore/TextCodecICU.h
Normal file
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include "unicode-ucnv.h"
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
#include <wtf/text/ASCIILiteral.h>
|
||||
#include <wtf/unicode/icu/ICUHelpers.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
using ICUConverterPtr = std::unique_ptr<UConverter, ICUDeleter<ucnv_close>>;
|
||||
|
||||
class TextCodecICU final : public TextCodec {
|
||||
WTF_MAKE_TZONE_ALLOCATED(TextCodecICU);
|
||||
|
||||
public:
|
||||
static void registerEncodingNames(EncodingNameRegistrar);
|
||||
static void registerCodecs(TextCodecRegistrar);
|
||||
|
||||
explicit TextCodecICU(ASCIILiteral encoding, ASCIILiteral canonicalConverterName);
|
||||
virtual ~TextCodecICU();
|
||||
|
||||
private:
|
||||
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
|
||||
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
|
||||
|
||||
void createICUConverter() const;
|
||||
void releaseICUConverter() const;
|
||||
|
||||
int decodeToBuffer(std::span<UChar> buffer, std::span<const uint8_t>& source, int32_t* offsets, bool flush, UErrorCode&);
|
||||
|
||||
ASCIILiteral m_encodingName;
|
||||
ASCIILiteral const m_canonicalConverterName;
|
||||
mutable ICUConverterPtr m_converter;
|
||||
};
|
||||
|
||||
struct ICUConverterWrapper {
|
||||
WTF_MAKE_STRUCT_FAST_ALLOCATED;
|
||||
|
||||
ICUConverterPtr converter;
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
256
src/bun.js/bindings/webcore/TextCodecLatin1.cpp
Normal file
256
src/bun.js/bindings/webcore/TextCodecLatin1.cpp
Normal file
@@ -0,0 +1,256 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodecLatin1.h"
|
||||
|
||||
#include "TextCodecASCIIFastPath.h"
|
||||
#include <array>
|
||||
#include <wtf/text/CString.h>
|
||||
#include "ParsingUtilities-removeAfterWebKitUpgrade.h"
|
||||
#include <wtf/text/WTFString.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// Byte-to-UTF-16 lookup table used by the codec registered below as "windows-1252".
// Indexed by the raw byte value (256 entries). Rows 00-7F and A0-FF map each byte to the
// code unit with the same value; rows 80-9F are the only ones containing values above
// 0x00FF, which is what forces TextCodecLatin1::decode onto its 16-bit path.
static constexpr std::array<UChar, 256> latin1ConversionTable = {
|
||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, // 00-07
|
||||
0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F
|
||||
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, // 10-17
|
||||
0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, // 18-1F
|
||||
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 20-27
|
||||
0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // 28-2F
|
||||
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 30-37
|
||||
0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // 38-3F
|
||||
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 40-47
|
||||
0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // 48-4F
|
||||
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 50-57
|
||||
0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // 58-5F
|
||||
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 60-67
|
||||
0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // 68-6F
|
||||
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 70-77
|
||||
0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // 78-7F
|
||||
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
|
||||
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
|
||||
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
|
||||
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
|
||||
0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, // A0-A7
|
||||
0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, // A8-AF
|
||||
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, // B0-B7
|
||||
0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, // B8-BF
|
||||
0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, // C0-C7
|
||||
0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, // C8-CF
|
||||
0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, // D0-D7
|
||||
0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, // D8-DF
|
||||
0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, // E0-E7
|
||||
0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, // E8-EF
|
||||
0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, // F0-F7
|
||||
0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF // F8-FF
|
||||
};
|
||||
|
||||
// Registers every label this codec answers to, each mapped to the canonical name
// "windows-1252". The canonical name itself is registered first, and the order below
// matches the original sequence of calls.
void TextCodecLatin1::registerEncodingNames(EncodingNameRegistrar registrar)
{
    // From https://encoding.spec.whatwg.org.
    for (auto label : {
             "windows-1252"_s,
             "ansi_x3.4-1968"_s,
             "ascii"_s,
             "cp1252"_s,
             "cp819"_s,
             "csisolatin1"_s,
             "ibm819"_s,
             "iso-8859-1"_s,
             "iso-ir-100"_s,
             "iso8859-1"_s,
             "iso88591"_s,
             "iso_8859-1"_s,
             "iso_8859-1:1987"_s,
             "l1"_s,
             "latin1"_s,
             "us-ascii"_s,
             "x-cp1252"_s,
         })
        registrar(label, "windows-1252"_s);
}
|
||||
|
||||
// Registers the factory that builds a TextCodecLatin1 for the canonical name "windows-1252".
void TextCodecLatin1::registerCodecs(TextCodecRegistrar registrar)
{
    registrar("windows-1252"_s, []() {
        return makeUnique<TextCodecLatin1>();
    });
}
|
||||
|
||||
WTF_ALLOW_UNSAFE_BUFFER_USAGE_BEGIN
|
||||
|
||||
// Decodes `bytes` through latin1ConversionTable and returns the resulting String.
//
// The flush and stop-on-error flags are ignored. `sawException` is set to true only when
// the input is longer than an `unsigned` can describe; an empty String is returned then.
//
// Decoding starts into an 8-bit string. The first byte whose table entry does not fit in
// Latin-1 switches to a 16-bit string: what was already decoded is zero-extended into it
// and the rest of the input is decoded straight to 16 bits. Both phases copy
// machine-word-sized runs of pure ASCII at a time when the input pointer is word aligned.
String TextCodecLatin1::decode(std::span<const uint8_t> bytes, bool, bool, bool& sawException)
{
    std::span<LChar> narrowBuffer;
    if (bytes.empty())
        return emptyString();
    if (UNLIKELY(bytes.size() > std::numeric_limits<unsigned>::max())) {
        ASSERT_NOT_REACHED();
        sawException = true;
        return emptyString();
    }
    String narrowResult = String::createUninitialized(bytes.size(), narrowBuffer);

    auto remaining = bytes;
    const uint8_t* wordAlignedEnd = WTF::alignToMachineWord(std::to_address(remaining.end()));
    auto narrowOut = narrowBuffer;

    while (!remaining.empty()) {
        if (isASCII(remaining[0])) {
            // Fast path for ASCII. Most Latin-1 text will be ASCII.
            if (WTF::isAlignedToMachineWord(remaining.data())) {
                while (remaining.data() < wordAlignedEnd) {
                    auto word = reinterpretCastSpanStartTo<WTF::MachineWord>(remaining);

                    if (!WTF::containsOnlyASCII<LChar>(word))
                        goto lookupNarrow;

                    copyASCIIMachineWord(narrowOut, remaining);
                    skip(remaining, sizeof(WTF::MachineWord));
                    skip(narrowOut, sizeof(WTF::MachineWord));
                }

                if (remaining.empty())
                    break;

                // The word loop may have advanced the input, so the current byte has to be re-checked.
                if (!isASCII(remaining[0]))
                    goto lookupNarrow;
            }
            narrowOut[0] = remaining[0];
        } else {
        lookupNarrow:
            auto inputByte = remaining[0];
            if (!isLatin1(latin1ConversionTable[inputByte]))
                goto widenTo16Bit;

            narrowOut[0] = latin1ConversionTable[inputByte];
        }

        skip(remaining, 1);
        skip(narrowOut, 1);
    }

    return narrowResult;

widenTo16Bit:
    std::span<UChar> wideBuffer;
    String wideResult = String::createUninitialized(bytes.size(), wideBuffer);

    auto wideOut = wideBuffer;

    // Zero extend and copy the 8-bit data decoded so far.
    LChar* copyCursor = narrowBuffer.data();
    LChar* copyEnd = narrowOut.data();

    while (copyCursor < copyEnd)
        consume(wideOut) = *copyCursor++;

    // Handle the byte that triggered the switch to 16 bits.
    consume(wideOut) = latin1ConversionTable[consume(remaining)];

    while (!remaining.empty()) {
        if (isASCII(remaining[0])) {
            // Fast path for ASCII. Most Latin-1 text will be ASCII.
            if (WTF::isAlignedToMachineWord(remaining.data())) {
                while (remaining.data() < wordAlignedEnd) {
                    auto word = reinterpretCastSpanStartTo<WTF::MachineWord>(remaining);

                    if (!WTF::containsOnlyASCII<LChar>(word))
                        goto lookupWide;

                    copyASCIIMachineWord(wideOut, remaining);
                    skip(remaining, sizeof(WTF::MachineWord));
                    skip(wideOut, sizeof(WTF::MachineWord));
                }

                if (remaining.empty())
                    break;

                // The word loop may have advanced the input, so the current byte has to be re-checked.
                if (!isASCII(remaining[0]))
                    goto lookupWide;
            }
            wideOut[0] = remaining[0];
        } else {
        lookupWide:
            wideOut[0] = latin1ConversionTable[remaining[0]];
        }

        skip(remaining, 1);
        skip(wideOut, 1);
    }

    return wideResult;
}
|
||||
|
||||
WTF_ALLOW_UNSAFE_BUFFER_USAGE_END
|
||||
|
||||
// Slow-path encoder for strings that are not pure ASCII.
//
// Each code point is handled in one of three ways: code points in 00-7F or A0-FF are
// emitted as the byte with the same value; otherwise rows 80-9F of latin1ConversionTable
// are searched for a byte that decodes to the code point; failing that, the replacement
// produced by TextCodec::getUnencodableReplacement() for `handling` is appended.
static Vector<uint8_t> encodeComplexWindowsLatin1(StringView string, UnencodableHandling handling)
{
    Vector<uint8_t> encoded;

    for (auto codePoint : string.codePoints()) {
        uint8_t byte = codePoint;

        // Do an efficient check to detect characters other than 00-7F and A0-FF.
        const bool isIdentityMapped = byte == codePoint && (codePoint & 0xE0) != 0x80;
        if (isIdentityMapped) {
            encoded.append(byte);
            continue;
        }

        // Look for a way to encode this with Windows Latin-1.
        bool foundByte = false;
        for (byte = 0x80; byte < 0xA0; ++byte) {
            if (latin1ConversionTable[byte] == codePoint) {
                foundByte = true;
                break;
            }
        }
        if (foundByte) {
            encoded.append(byte);
            continue;
        }

        // No way to encode this character with Windows Latin-1.
        UnencodableReplacementArray replacement;
        encoded.append(TextCodec::getUnencodableReplacement(codePoint, handling, replacement));
    }

    return encoded;
}
|
||||
|
||||
// Encodes `string` into bytes.
//
// A first pass narrows every code unit to a byte while OR-ing all code units together. If
// no bit at or above 0x80 was seen, the input was pure ASCII and the narrowed bytes are
// returned as they are. Otherwise that buffer is discarded and encodeComplexWindowsLatin1()
// re-encodes the whole string using `handling`.
Vector<uint8_t> TextCodecLatin1::encode(StringView string, UnencodableHandling handling) const
{
    {
        Vector<uint8_t> narrowed(string.length());
        UChar combinedBits = 0;
        size_t writeIndex = 0;

        // Convert and simultaneously do a check to see if it's all ASCII.
        for (auto codeUnit : string.codeUnits()) {
            narrowed[writeIndex] = codeUnit;
            ++writeIndex;
            combinedBits |= codeUnit;
        }

        const bool isAllASCII = !(combinedBits & 0xFF80);
        if (isAllASCII)
            return narrowed;
    }

    // If it wasn't all ASCII, call the function that handles more-complex cases.
    return encodeComplexWindowsLatin1(string, handling);
}
|
||||
|
||||
} // namespace PAL
|
||||
42
src/bun.js/bindings/webcore/TextCodecLatin1.h
Normal file
42
src/bun.js/bindings/webcore/TextCodecLatin1.h
Normal file
@@ -0,0 +1,42 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodec.h"
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextCodecLatin1 final : public TextCodec {
|
||||
public:
|
||||
static void registerEncodingNames(EncodingNameRegistrar);
|
||||
static void registerCodecs(TextCodecRegistrar);
|
||||
|
||||
private:
|
||||
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
|
||||
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
70
src/bun.js/bindings/webcore/TextCodecReplacement.cpp
Normal file
70
src/bun.js/bindings/webcore/TextCodecReplacement.cpp
Normal file
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (C) 2016-2017 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
||||
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
|
||||
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||
* THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodecReplacement.h"
|
||||
|
||||
#include <wtf/Function.h>
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
#include <wtf/text/WTFString.h>
|
||||
#include <wtf/unicode/CharacterNames.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecReplacement);
|
||||
|
||||
// Registers the canonical name "replacement" first, then the labels that map to it.
// The order below matches the original sequence of calls.
void TextCodecReplacement::registerEncodingNames(EncodingNameRegistrar registrar)
{
    for (auto label : {
             "replacement"_s,
             "csiso2022kr"_s,
             "hz-gb-2312"_s,
             "iso-2022-cn"_s,
             "iso-2022-cn-ext"_s,
             "iso-2022-kr"_s,
         })
        registrar(label, "replacement"_s);
}
|
||||
|
||||
// Registers the factory that builds a TextCodecReplacement for the name "replacement".
void TextCodecReplacement::registerCodecs(TextCodecRegistrar registrar)
{
    registrar("replacement"_s, []() {
        return makeUnique<TextCodecReplacement>();
    });
}
|
||||
|
||||
// Ignores the input bytes. Every call reports an error through `sawError`. The first call
// on an instance returns a string holding the single replacementCharacter; every later
// call returns the empty string.
String TextCodecReplacement::decode(std::span<const uint8_t>, bool, bool, bool& sawError)
{
    sawError = true;

    if (!m_sentEOF) {
        m_sentEOF = true;
        return span(replacementCharacter);
    }

    return emptyString();
}
|
||||
|
||||
// Encoding delegates entirely to TextCodecUTF8::encodeUTF8; the unencodable-handling
// argument is ignored.
Vector<uint8_t> TextCodecReplacement::encode(StringView string, UnencodableHandling) const
{
    auto utf8Bytes = TextCodecUTF8::encodeUTF8(string);
    return utf8Bytes;
}
|
||||
|
||||
} // namespace PAL
|
||||
46
src/bun.js/bindings/webcore/TextCodecReplacement.h
Normal file
46
src/bun.js/bindings/webcore/TextCodecReplacement.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Copyright (C) 2016-2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
||||
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
|
||||
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||
* THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodecUTF8.h"
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextCodecReplacement final : public TextCodec {
|
||||
WTF_MAKE_TZONE_ALLOCATED(TextCodecReplacement);
|
||||
public:
|
||||
static void registerEncodingNames(EncodingNameRegistrar);
|
||||
static void registerCodecs(TextCodecRegistrar);
|
||||
|
||||
private:
|
||||
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
|
||||
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
|
||||
|
||||
bool m_sentEOF { false };
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
467
src/bun.js/bindings/webcore/TextCodecSingleByte.cpp
Normal file
467
src/bun.js/bindings/webcore/TextCodecSingleByte.cpp
Normal file
@@ -0,0 +1,467 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodecSingleByte.h"
|
||||
|
||||
#include "EncodingTables.h"
|
||||
#include <array>
|
||||
#include <mutex>
|
||||
#include <wtf/IteratorRange.h>
|
||||
#include <wtf/NeverDestroyed.h>
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
#include <wtf/text/CodePointIterator.h>
|
||||
#include <wtf/text/StringBuilder.h>
|
||||
#include <wtf/unicode/CharacterNames.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecSingleByte);
|
||||
|
||||
// Selects which single-byte encoding a TextCodecSingleByte instance implements.
// Stored in one byte. The enumerators have implicit values, so their order determines
// the underlying numbers.
enum class TextCodecSingleByte::Encoding : uint8_t {
    ISO_8859_3,
    ISO_8859_6,
    ISO_8859_7,
    ISO_8859_8,
    Windows_874,
    Windows_1253,
    Windows_1255,
    Windows_1257,
    IBM866,
    KOI8U,
};
|
||||
|
||||
using SingleByteDecodeTable = std::array<UChar, 128>;
|
||||
using SingleByteEncodeTableEntry = std::pair<UChar, uint8_t>;
|
||||
using SingleByteEncodeTable = std::span<const SingleByteEncodeTableEntry>;
|
||||
|
||||
// ISO-8859-3 decode table: 128 UTF-16 code units.
// From https://encoding.spec.whatwg.org/index-iso-8859-3.txt with 0xFFFD filling the gaps
// (slots the index leaves unassigned).
// NOTE(review): entry i presumably corresponds to input byte 0x80 + i; the code that indexes
// this table is outside this view, so confirm there.
|
||||
static constexpr SingleByteDecodeTable iso88593 {
|
||||
0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
|
||||
0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
|
||||
0x00A0, 0x0126, 0x02D8, 0x00A3, 0x00A4, 0xFFFD, 0x0124, 0x00A7, 0x00A8, 0x0130, 0x015E, 0x011E, 0x0134, 0x00AD, 0xFFFD, 0x017B,
|
||||
0x00B0, 0x0127, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x0125, 0x00B7, 0x00B8, 0x0131, 0x015F, 0x011F, 0x0135, 0x00BD, 0xFFFD, 0x017C,
|
||||
0x00C0, 0x00C1, 0x00C2, 0xFFFD, 0x00C4, 0x010A, 0x0108, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
|
||||
0xFFFD, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x0120, 0x00D6, 0x00D7, 0x011C, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x016C, 0x015C, 0x00DF,
|
||||
0x00E0, 0x00E1, 0x00E2, 0xFFFD, 0x00E4, 0x010B, 0x0109, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
|
||||
0xFFFD, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x0121, 0x00F6, 0x00F7, 0x011D, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x016D, 0x015D, 0x02D9
|
||||
};
|
||||
|
||||
// ISO-8859-6 decode table: 128 UTF-16 code units.
// From https://encoding.spec.whatwg.org/index-iso-8859-6.txt with 0xFFFD filling the gaps
// (slots the index leaves unassigned).
// NOTE(review): entry i presumably corresponds to input byte 0x80 + i; the code that indexes
// this table is outside this view, so confirm there.
|
||||
static constexpr SingleByteDecodeTable iso88596 {
|
||||
0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
|
||||
0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
|
||||
0x00A0, 0xFFFD, 0xFFFD, 0xFFFD, 0x00A4, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x060C, 0x00AD, 0xFFFD, 0xFFFD,
|
||||
0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x061B, 0xFFFD, 0xFFFD, 0xFFFD, 0x061F,
|
||||
0xFFFD, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627, 0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E, 0x062F,
|
||||
0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637, 0x0638, 0x0639, 0x063A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
|
||||
0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, 0x0648, 0x0649, 0x064A, 0x064B, 0x064C, 0x064D, 0x064E, 0x064F,
|
||||
0x0650, 0x0651, 0x0652, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD
|
||||
};
|
||||
|
||||
// ISO-8859-7 decode table: 128 UTF-16 code units.
// From https://encoding.spec.whatwg.org/index-iso-8859-7.txt with 0xFFFD filling the gaps
// (slots the index leaves unassigned).
// NOTE(review): entry i presumably corresponds to input byte 0x80 + i; the code that indexes
// this table is outside this view, so confirm there.
|
||||
static constexpr SingleByteDecodeTable iso88597 {
|
||||
0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
|
||||
0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
|
||||
0x00A0, 0x2018, 0x2019, 0x00A3, 0x20AC, 0x20AF, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x037A, 0x00AB, 0x00AC, 0x00AD, 0xFFFD, 0x2015,
|
||||
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x0385, 0x0386, 0x00B7, 0x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E, 0x038F,
|
||||
0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
|
||||
0x03A0, 0x03A1, 0xFFFD, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
|
||||
0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
|
||||
0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0xFFFD
|
||||
};
|
||||
|
||||
// ISO-8859-8 decode table: 128 UTF-16 code units.
// From https://encoding.spec.whatwg.org/index-iso-8859-8.txt with 0xFFFD filling the gaps
// (slots the index leaves unassigned).
// NOTE(review): entry i presumably corresponds to input byte 0x80 + i; the code that indexes
// this table is outside this view, so confirm there.
|
||||
static constexpr SingleByteDecodeTable iso88598 {
|
||||
0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
|
||||
0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
|
||||
0x00A0, 0xFFFD, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
|
||||
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0xFFFD,
|
||||
0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
|
||||
0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x2017,
|
||||
0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
|
||||
0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA, 0xFFFD, 0xFFFD, 0x200E, 0x200F, 0xFFFD
|
||||
};
|
||||
|
||||
// windows-874 decode table: 128 UTF-16 code units.
// From https://encoding.spec.whatwg.org/index-windows-874.txt with 0xFFFD filling the gaps
// (slots the index leaves unassigned).
// NOTE(review): entry i presumably corresponds to input byte 0x80 + i; the code that indexes
// this table is outside this view, so confirm there.
|
||||
static constexpr SingleByteDecodeTable windows874 {
|
||||
0x20AC, 0x0081, 0x0082, 0x0083, 0x0084, 0x2026, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
|
||||
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
|
||||
0x00A0, 0x0E01, 0x0E02, 0x0E03, 0x0E04, 0x0E05, 0x0E06, 0x0E07, 0x0E08, 0x0E09, 0x0E0A, 0x0E0B, 0x0E0C, 0x0E0D, 0x0E0E, 0x0E0F,
|
||||
0x0E10, 0x0E11, 0x0E12, 0x0E13, 0x0E14, 0x0E15, 0x0E16, 0x0E17, 0x0E18, 0x0E19, 0x0E1A, 0x0E1B, 0x0E1C, 0x0E1D, 0x0E1E, 0x0E1F,
|
||||
0x0E20, 0x0E21, 0x0E22, 0x0E23, 0x0E24, 0x0E25, 0x0E26, 0x0E27, 0x0E28, 0x0E29, 0x0E2A, 0x0E2B, 0x0E2C, 0x0E2D, 0x0E2E, 0x0E2F,
|
||||
0x0E30, 0x0E31, 0x0E32, 0x0E33, 0x0E34, 0x0E35, 0x0E36, 0x0E37, 0x0E38, 0x0E39, 0x0E3A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x0E3F,
|
||||
0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0x0E45, 0x0E46, 0x0E47, 0x0E48, 0x0E49, 0x0E4A, 0x0E4B, 0x0E4C, 0x0E4D, 0x0E4E, 0x0E4F,
|
||||
0x0E50, 0x0E51, 0x0E52, 0x0E53, 0x0E54, 0x0E55, 0x0E56, 0x0E57, 0x0E58, 0x0E59, 0x0E5A, 0x0E5B, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD
|
||||
};
|
||||
|
||||
// From https://encoding.spec.whatwg.org/index-windows-1253.txt with 0xFFFD filling the gaps
|
||||
static constexpr SingleByteDecodeTable windows1253 {
|
||||
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x0088, 0x2030, 0x008A, 0x2039, 0x008C, 0x008D, 0x008E, 0x008F,
|
||||
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x0098, 0x2122, 0x009A, 0x203A, 0x009C, 0x009D, 0x009E, 0x009F,
|
||||
0x00A0, 0x0385, 0x0386, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0xFFFD, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x2015,
|
||||
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x00B5, 0x00B6, 0x00B7, 0x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E, 0x038F,
|
||||
0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
|
||||
0x03A0, 0x03A1, 0xFFFD, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
|
||||
0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
|
||||
0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0xFFFD
|
||||
};
|
||||
|
||||
// windows-1255 decode table. Entry N holds the code point for byte 0x80 + N; each row
// below covers 16 consecutive byte values, starting at the offset shown on its left.
// NOTE(review): the original carries no source comment; presumably taken from
// https://encoding.spec.whatwg.org/index-windows-1255.txt with 0xFFFD filling the gaps,
// like the other tables in this file - confirm.
static constexpr SingleByteDecodeTable windows1255 {
    /* 0x80 */ 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030, 0x008A, 0x2039, 0x008C, 0x008D, 0x008E, 0x008F,
    /* 0x90 */ 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x02DC, 0x2122, 0x009A, 0x203A, 0x009C, 0x009D, 0x009E, 0x009F,
    /* 0xA0 */ 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x20AA, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
    /* 0xB0 */ 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
    /* 0xC0 */ 0x05B0, 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, 0x05B8, 0x05B9, 0x05BA, 0x05BB, 0x05BC, 0x05BD, 0x05BE, 0x05BF,
    /* 0xD0 */ 0x05C0, 0x05C1, 0x05C2, 0x05C3, 0x05F0, 0x05F1, 0x05F2, 0x05F3, 0x05F4, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
    /* 0xE0 */ 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
    /* 0xF0 */ 0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA, 0xFFFD, 0xFFFD, 0x200E, 0x200F, 0xFFFD
};
|
||||
|
||||
// From https://encoding.spec.whatwg.org/index-windows-1257.txt with 0xFFFD filling the gaps
|
||||
static constexpr SingleByteDecodeTable windows1257 {
|
||||
0x20AC, 0x0081, 0x201A, 0x0083, 0x201E, 0x2026, 0x2020, 0x2021, 0x0088, 0x2030, 0x008A, 0x2039, 0x008C, 0x00A8, 0x02C7, 0x00B8,
|
||||
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x0098, 0x2122, 0x009A, 0x203A, 0x009C, 0x00AF, 0x02DB, 0x009F,
|
||||
0x00A0, 0xFFFD, 0x00A2, 0x00A3, 0x00A4, 0xFFFD, 0x00A6, 0x00A7, 0x00D8, 0x00A9, 0x0156, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00C6,
|
||||
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00F8, 0x00B9, 0x0157, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00E6,
|
||||
0x0104, 0x012E, 0x0100, 0x0106, 0x00C4, 0x00C5, 0x0118, 0x0112, 0x010C, 0x00C9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012A, 0x013B,
|
||||
0x0160, 0x0143, 0x0145, 0x00D3, 0x014C, 0x00D5, 0x00D6, 0x00D7, 0x0172, 0x0141, 0x015A, 0x016A, 0x00DC, 0x017B, 0x017D, 0x00DF,
|
||||
0x0105, 0x012F, 0x0101, 0x0107, 0x00E4, 0x00E5, 0x0119, 0x0113, 0x010D, 0x00E9, 0x017A, 0x0117, 0x0123, 0x0137, 0x012B, 0x013C,
|
||||
0x0161, 0x0144, 0x0146, 0x00F3, 0x014D, 0x00F5, 0x00F6, 0x00F7, 0x0173, 0x0142, 0x015B, 0x016B, 0x00FC, 0x017C, 0x017E, 0x02D9
|
||||
};
|
||||
|
||||
// From https://encoding.spec.whatwg.org/index-koi8-u.txt
|
||||
static constexpr SingleByteDecodeTable koi8u {
|
||||
0x2500, 0x2502, 0x250C, 0x2510, 0x2514, 0x2518, 0x251C, 0x2524, 0x252C, 0x2534, 0x253C, 0x2580, 0x2584, 0x2588, 0x258C, 0x2590,
|
||||
0x2591, 0x2592, 0x2593, 0x2320, 0x25A0, 0x2219, 0x221A, 0x2248, 0x2264, 0x2265, 0x00A0, 0x2321, 0x00B0, 0x00B2, 0x00B7, 0x00F7,
|
||||
0x2550, 0x2551, 0x2552, 0x0451, 0x0454, 0x2554, 0x0456, 0x0457, 0x2557, 0x2558, 0x2559, 0x255A, 0x255B, 0x0491, 0x045E, 0x255E,
|
||||
0x255F, 0x2560, 0x2561, 0x0401, 0x0404, 0x2563, 0x0406, 0x0407, 0x2566, 0x2567, 0x2568, 0x2569, 0x256A, 0x0490, 0x040E, 0x00A9,
|
||||
0x044E, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, 0x0445, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E,
|
||||
0x043F, 0x044F, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, 0x044C, 0x044B, 0x0437, 0x0448, 0x044D, 0x0449, 0x0447, 0x044A,
|
||||
0x042E, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, 0x0425, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E,
|
||||
0x041F, 0x042F, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, 0x042C, 0x042B, 0x0417, 0x0428, 0x042D, 0x0429, 0x0427, 0x042A
|
||||
};
|
||||
|
||||
// From https://encoding.spec.whatwg.org/index-ibm866.txt
|
||||
static constexpr SingleByteDecodeTable ibm866 {
|
||||
0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
|
||||
0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
|
||||
0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
|
||||
0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
|
||||
0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
|
||||
0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
|
||||
0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
|
||||
0x0401, 0x0451, 0x0404, 0x0454, 0x0407, 0x0457, 0x040E, 0x045E, 0x00B0, 0x2219, 0x00B7, 0x221A, 0x2116, 0x00A4, 0x25A0, 0x00A0
|
||||
};
|
||||
|
||||
// Returns the encode table derived from `decodeTable`: one (code point, byte) entry for
// every decode-table slot that is not replacementCharacter, sorted by code point.
// The table is built on the first call for each instantiation and never freed.
template<const SingleByteDecodeTable& decodeTable> SingleByteEncodeTable tableForEncoding()
{
    // Allocate this at runtime because building it at compile time would make the binary much larger and this is often not used.
    static constexpr auto size = std::size(decodeTable) - std::count(std::begin(decodeTable), std::end(decodeTable), replacementCharacter);
    using EntryArray = std::array<SingleByteEncodeTableEntry, size>;

    static const EntryArray* entries;
    static std::once_flag once;
    std::call_once(once, [&] {
        auto* pending = new EntryArray();
        size_t filled = 0;
        for (size_t slot = 0; slot < std::size(decodeTable); ++slot) {
            if (decodeTable[slot] == replacementCharacter)
                continue; // Unmapped byte: nothing encodes to it.
            // Slot N of the decode table corresponds to byte 0x80 + N.
            (*pending)[filled++] = { decodeTable[slot], slot + 0x80 };
        }
        ASSERT(filled == size);

        auto collection = std::span { *pending };
        sortByFirst(collection);
        ASSERT(sortedFirstsAreUnique(collection));

        // Publish only after the table is fully populated and sorted.
        entries = pending;
    });
    return std::span { *entries };
}
|
||||
|
||||
// Maps an Encoding value to the lazily built encode table of the matching decode table.
// Hitting the release assertion means `encoding` holds a value not listed below.
static SingleByteEncodeTable tableForEncoding(TextCodecSingleByte::Encoding encoding)
{
    using Encoding = TextCodecSingleByte::Encoding;

    switch (encoding) {
    case Encoding::ISO_8859_3:
        return tableForEncoding<iso88593>();
    case Encoding::ISO_8859_6:
        return tableForEncoding<iso88596>();
    case Encoding::ISO_8859_7:
        return tableForEncoding<iso88597>();
    case Encoding::ISO_8859_8:
        return tableForEncoding<iso88598>();
    case Encoding::Windows_874:
        return tableForEncoding<windows874>();
    case Encoding::Windows_1253:
        return tableForEncoding<windows1253>();
    case Encoding::Windows_1255:
        return tableForEncoding<windows1255>();
    case Encoding::Windows_1257:
        return tableForEncoding<windows1257>();
    case Encoding::IBM866:
        return tableForEncoding<ibm866>();
    case Encoding::KOI8U:
        return tableForEncoding<koi8u>();
    }
    RELEASE_ASSERT_NOT_REACHED();
}
|
||||
|
||||
// Maps an Encoding value to its constant decode table.
// Hitting the release assertion means `encoding` holds a value not listed below.
static const SingleByteDecodeTable& tableForDecoding(TextCodecSingleByte::Encoding encoding)
{
    using Encoding = TextCodecSingleByte::Encoding;

    switch (encoding) {
    case Encoding::ISO_8859_3:
        return iso88593;
    case Encoding::ISO_8859_6:
        return iso88596;
    case Encoding::ISO_8859_7:
        return iso88597;
    case Encoding::ISO_8859_8:
        return iso88598;
    case Encoding::Windows_874:
        return windows874;
    case Encoding::Windows_1253:
        return windows1253;
    case Encoding::Windows_1255:
        return windows1255;
    case Encoding::Windows_1257:
        return windows1257;
    case Encoding::IBM866:
        return ibm866;
    case Encoding::KOI8U:
        return koi8u;
    }
    RELEASE_ASSERT_NOT_REACHED();
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#single-byte-encoder
|
||||
static Vector<uint8_t> encode(const SingleByteEncodeTable& table, StringView string, Function<void(char32_t, Vector<uint8_t>&)>&& unencodableHandler)
|
||||
{
|
||||
// FIXME: Consider adding an ASCII fast path like the one in TextCodecLatin1::decode.
|
||||
Vector<uint8_t> result;
|
||||
result.reserveInitialCapacity(string.length());
|
||||
for (auto codePoint : string.codePoints()) {
|
||||
if (isASCII(codePoint)) {
|
||||
result.append(codePoint);
|
||||
continue;
|
||||
}
|
||||
auto byte = findFirstInSortedPairs(table, codePoint);
|
||||
if (!byte) {
|
||||
unencodableHandler(codePoint, result);
|
||||
continue;
|
||||
}
|
||||
result.append(*byte);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#single-byte-decoder
|
||||
static String decode(const SingleByteDecodeTable& table, std::span<const uint8_t> bytes, bool, bool stopOnError, bool& sawError)
|
||||
{
|
||||
StringBuilder result;
|
||||
result.reserveCapacity(bytes.size());
|
||||
auto parseByte = [&] (uint8_t byte) {
|
||||
if (isASCII(byte)) {
|
||||
result.append(byte);
|
||||
return;
|
||||
}
|
||||
UChar codePoint = table[byte - 0x80];
|
||||
if (codePoint == replacementCharacter)
|
||||
sawError = true;
|
||||
result.append(codePoint);
|
||||
};
|
||||
if (stopOnError) {
|
||||
for (auto byte : bytes) {
|
||||
parseByte(byte);
|
||||
if (sawError)
|
||||
return result.toString();
|
||||
}
|
||||
} else {
|
||||
for (auto byte : bytes)
|
||||
parseByte(byte);
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
// Encodes `string` with the encode table for m_encoding; unencodable code points are
// dispatched to the handler selected by `handling`.
Vector<uint8_t> TextCodecSingleByte::encode(StringView string, UnencodableHandling handling) const
{
    auto encodeTable = tableForEncoding(m_encoding);
    return PAL::encode(encodeTable, string, unencodableHandler(handling));
}
|
||||
|
||||
// Decodes `bytes` with the decode table for m_encoding, forwarding the flush,
// stopOnError and sawError arguments unchanged.
String TextCodecSingleByte::decode(std::span<const uint8_t> bytes, bool flush, bool stopOnError, bool& sawError)
{
    const auto& decodeTable = tableForDecoding(m_encoding);
    return PAL::decode(decodeTable, bytes, flush, stopOnError, sawError);
}
|
||||
|
||||
TextCodecSingleByte::TextCodecSingleByte(Encoding encoding)
|
||||
: m_encoding(encoding)
|
||||
{
|
||||
}
|
||||
|
||||
// Registers every label of the single-byte encodings handled by this codec.
// For each encoding the canonical name is registered for itself first, followed by its
// other labels in the order listed.
void TextCodecSingleByte::registerEncodingNames(EncodingNameRegistrar registrar)
{
    // https://encoding.spec.whatwg.org/#names-and-labels
    auto registerLabels = [&] (ASCIILiteral canonicalName, std::initializer_list<ASCIILiteral> labels) {
        registrar(canonicalName, canonicalName);
        for (auto& label : labels)
            registrar(label, canonicalName);
    };

    registerLabels("ISO-8859-3"_s, {
        "csisolatin3"_s,
        "iso-ir-109"_s,
        "iso8859-3"_s,
        "iso88593"_s,
        "iso_8859-3"_s,
        "iso_8859-3:1988"_s,
        "l3"_s,
        "latin3"_s
    });

    registerLabels("ISO-8859-6"_s, {
        "arabic"_s,
        "asmo-708"_s,
        "csiso88596e"_s,
        "csiso88596i"_s,
        "csisolatinarabic"_s,
        "ecma-114"_s,
        "iso-8859-6-e"_s,
        "iso-8859-6-i"_s,
        "iso-ir-127"_s,
        "iso8859-6"_s,
        "iso88596"_s,
        "iso_8859-6"_s,
        "iso_8859-6:1987"_s
    });

    registerLabels("ISO-8859-7"_s, {
        "csisolatingreek"_s,
        "ecma-118"_s,
        "elot_928"_s,
        "greek"_s,
        "greek8"_s,
        "iso-ir-126"_s,
        "iso8859-7"_s,
        "iso88597"_s,
        "iso_8859-7"_s,
        "iso_8859-7:1987"_s,
        "sun_eu_greek"_s
    });

    registerLabels("ISO-8859-8"_s, {
        "csiso88598e"_s,
        "csisolatinhebrew"_s,
        "hebrew"_s,
        "iso-8859-8-e"_s,
        "iso-ir-138"_s,
        "iso8859-8"_s,
        "iso88598"_s,
        "iso_8859-8"_s,
        "iso_8859-8:1988"_s,
        "visual"_s
    });

    registerLabels("ISO-8859-8-I"_s, {
        "csiso88598i"_s,
        "logical"_s
    });

    registerLabels("windows-874"_s, {
        "dos-874"_s,
        "iso-8859-11"_s,
        "iso8859-11"_s,
        "iso885911"_s,
        "tis-620"_s
    });

    registerLabels("windows-1253"_s, {
        "cp1253"_s,
        "x-cp1253"_s
    });

    registerLabels("windows-1255"_s, {
        "cp1255"_s,
        "x-cp1255"_s
    });

    registerLabels("windows-1257"_s, {
        "cp1257"_s,
        "x-cp1257"_s
    });

    registerLabels("KOI8-U"_s, {
        "koi8-ru"_s
    });

    registerLabels("IBM866"_s, {
        "866"_s,
        "cp866"_s,
        "csibm866"_s
    });
}
|
||||
|
||||
// Registers one codec factory per canonical encoding name. Each factory creates a
// TextCodecSingleByte bound to the matching Encoding value.
void TextCodecSingleByte::registerCodecs(TextCodecRegistrar registrar)
{
    registrar("ISO-8859-3"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_3); });
    registrar("ISO-8859-6"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_6); });
    registrar("ISO-8859-7"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_7); });
    registrar("ISO-8859-8"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_8); });
    // ISO-8859-8-I is registered under its own name but uses the ISO-8859-8 tables.
    registrar("ISO-8859-8-I"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_8); });
    registrar("windows-874"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::Windows_874); });
    registrar("windows-1253"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::Windows_1253); });
    registrar("windows-1255"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::Windows_1255); });
    registrar("windows-1257"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::Windows_1257); });
    registrar("KOI8-U"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::KOI8U); });
    registrar("IBM866"_s, [] { return makeUnique<TextCodecSingleByte>(Encoding::IBM866); });
}
|
||||
|
||||
}
|
||||
49
src/bun.js/bindings/webcore/TextCodecSingleByte.h
Normal file
49
src/bun.js/bindings/webcore/TextCodecSingleByte.h
Normal file
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextCodecSingleByte final : public TextCodec {
|
||||
WTF_MAKE_TZONE_ALLOCATED(TextCodecSingleByte);
|
||||
public:
|
||||
static void registerEncodingNames(EncodingNameRegistrar);
|
||||
static void registerCodecs(TextCodecRegistrar);
|
||||
|
||||
enum class Encoding : uint8_t;
|
||||
explicit TextCodecSingleByte(Encoding);
|
||||
|
||||
private:
|
||||
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
|
||||
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
|
||||
|
||||
const Encoding m_encoding;
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
166
src/bun.js/bindings/webcore/TextCodecUTF16.cpp
Normal file
166
src/bun.js/bindings/webcore/TextCodecUTF16.cpp
Normal file
@@ -0,0 +1,166 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodecUTF16.h"
|
||||
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
#include <wtf/text/CString.h>
|
||||
#include <wtf/text/StringBuilder.h>
|
||||
#include <wtf/text/WTFString.h>
|
||||
#include <wtf/unicode/CharacterNames.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecUTF16);
|
||||
|
||||
// Creates a UTF-16 codec; `littleEndian` selects the byte order used by decode() and
// encode().
//
// Deliberately not marked `inline`: the constructor is public and declared without
// `inline` in TextCodecUTF16.h, so it needs exactly one ordinary out-of-line definition.
// An `inline` definition in this .cpp would make other translation units that include
// the header ill-formed (no diagnostic required) and would not guarantee that a symbol
// is emitted for them to link against.
TextCodecUTF16::TextCodecUTF16(bool littleEndian)
    : m_littleEndian(littleEndian)
{
}
|
||||
|
||||
// Registers the two canonical UTF-16 names, then the aliases: every alias except
// "unicodeFFFE" resolves to UTF-16LE; "unicodeFFFE" resolves to UTF-16BE.
void TextCodecUTF16::registerEncodingNames(EncodingNameRegistrar registrar)
{
    auto registerLittleEndianAlias = [&] (ASCIILiteral alias) {
        registrar(alias, "UTF-16LE"_s);
    };

    registrar("UTF-16LE"_s, "UTF-16LE"_s);
    registrar("UTF-16BE"_s, "UTF-16BE"_s);

    registerLittleEndianAlias("ISO-10646-UCS-2"_s);
    registerLittleEndianAlias("UCS-2"_s);
    registerLittleEndianAlias("UTF-16"_s);
    registerLittleEndianAlias("Unicode"_s);
    registerLittleEndianAlias("csUnicode"_s);
    registerLittleEndianAlias("unicodeFEFF"_s);

    registrar("unicodeFFFE"_s, "UTF-16BE"_s);
}
|
||||
|
||||
// Registers the factories for the two canonical UTF-16 names; the boolean handed to the
// constructor is its littleEndian flag.
void TextCodecUTF16::registerCodecs(TextCodecRegistrar registrar)
{
    registrar("UTF-16LE"_s, [] { return makeUnique<TextCodecUTF16>(/* littleEndian */ true); });
    registrar("UTF-16BE"_s, [] { return makeUnique<TextCodecUTF16>(/* littleEndian */ false); });
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#shared-utf-16-decoder
|
||||
String TextCodecUTF16::decode(std::span<const uint8_t> bytes, bool flush, bool, bool& sawError)
|
||||
{
|
||||
size_t index = 0;
|
||||
size_t lengthMinusOne = bytes.size() - 1;
|
||||
|
||||
StringBuilder result;
|
||||
result.reserveCapacity(bytes.size() / 2);
|
||||
|
||||
auto processCodeUnit = [&] (UChar codeUnit) {
|
||||
if (std::exchange(m_shouldStripByteOrderMark, false) && codeUnit == byteOrderMark)
|
||||
return;
|
||||
if (m_leadSurrogate) {
|
||||
auto leadSurrogate = *std::exchange(m_leadSurrogate, std::nullopt);
|
||||
if (U16_IS_TRAIL(codeUnit)) {
|
||||
char32_t codePoint = U16_GET_SUPPLEMENTARY(leadSurrogate, codeUnit);
|
||||
result.append(codePoint);
|
||||
return;
|
||||
}
|
||||
sawError = true;
|
||||
result.append(replacementCharacter);
|
||||
}
|
||||
if (U16_IS_LEAD(codeUnit)) {
|
||||
m_leadSurrogate = codeUnit;
|
||||
return;
|
||||
}
|
||||
if (U16_IS_TRAIL(codeUnit)) {
|
||||
sawError = true;
|
||||
result.append(replacementCharacter);
|
||||
return;
|
||||
}
|
||||
result.append(codeUnit);
|
||||
};
|
||||
auto processBytesLE = [&] (uint8_t first, uint8_t second) {
|
||||
processCodeUnit(first | (second << 8));
|
||||
};
|
||||
auto processBytesBE = [&] (uint8_t first, uint8_t second) {
|
||||
processCodeUnit((first << 8) | second);
|
||||
};
|
||||
|
||||
if (!bytes.empty()) {
|
||||
if (m_leadByte && index < bytes.size()) {
|
||||
auto leadByte = *std::exchange(m_leadByte, std::nullopt);
|
||||
auto trailByte = bytes[index++];
|
||||
if (m_littleEndian)
|
||||
processBytesLE(leadByte, trailByte);
|
||||
else
|
||||
processBytesBE(leadByte, trailByte);
|
||||
}
|
||||
if (m_littleEndian) {
|
||||
for (; index < lengthMinusOne; index += 2)
|
||||
processBytesLE(bytes[index], bytes[index + 1]);
|
||||
} else {
|
||||
for (; index < lengthMinusOne; index += 2)
|
||||
processBytesBE(bytes[index], bytes[index + 1]);
|
||||
}
|
||||
|
||||
if (index == lengthMinusOne) {
|
||||
ASSERT(!m_leadByte);
|
||||
m_leadByte = bytes[index];
|
||||
} else
|
||||
ASSERT(index == bytes.size());
|
||||
}
|
||||
|
||||
if (flush) {
|
||||
m_shouldStripByteOrderMark = false;
|
||||
if (m_leadByte || m_leadSurrogate) {
|
||||
m_leadByte = std::nullopt;
|
||||
m_leadSurrogate = std::nullopt;
|
||||
sawError = true;
|
||||
result.append(replacementCharacter);
|
||||
}
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
// Serializes every UTF-16 code unit of `string` as two bytes in the byte order chosen
// at construction. The UnencodableHandling argument is unused.
Vector<uint8_t> TextCodecUTF16::encode(StringView string, UnencodableHandling) const
{
    Vector<uint8_t> result(WTF::checkedProduct<size_t>(string.length(), 2));
    size_t index = 0;

    for (auto codeUnit : string.codeUnits()) {
        const auto lowByte = static_cast<uint8_t>(codeUnit);
        const auto highByte = static_cast<uint8_t>(codeUnit >> 8);
        result[index++] = m_littleEndian ? lowByte : highByte;
        result[index++] = m_littleEndian ? highByte : lowByte;
    }

    return result;
}
|
||||
|
||||
} // namespace PAL
|
||||
53
src/bun.js/bindings/webcore/TextCodecUTF16.h
Normal file
53
src/bun.js/bindings/webcore/TextCodecUTF16.h
Normal file
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include <optional>
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextCodecUTF16 final : public TextCodec {
|
||||
WTF_MAKE_TZONE_ALLOCATED(TextCodecUTF16);
|
||||
public:
|
||||
static void registerEncodingNames(EncodingNameRegistrar);
|
||||
static void registerCodecs(TextCodecRegistrar);
|
||||
|
||||
explicit TextCodecUTF16(bool littleEndian);
|
||||
|
||||
private:
|
||||
void stripByteOrderMark() final { m_shouldStripByteOrderMark = true; }
|
||||
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
|
||||
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
|
||||
|
||||
bool m_littleEndian;
|
||||
std::optional<uint8_t> m_leadByte;
|
||||
std::optional<UChar> m_leadSurrogate;
|
||||
bool m_shouldStripByteOrderMark { false };
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
490
src/bun.js/bindings/webcore/TextCodecUTF8.cpp
Normal file
490
src/bun.js/bindings/webcore/TextCodecUTF8.cpp
Normal file
@@ -0,0 +1,490 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodecUTF8.h"
|
||||
|
||||
#include "TextCodecASCIIFastPath.h"
|
||||
#include <wtf/StdLibExtras.h>
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
#include <wtf/text/CString.h>
|
||||
#include "ParsingUtilities-removeAfterWebKitUpgrade.h"
|
||||
#include <wtf/text/StringBuffer.h>
|
||||
#include <wtf/text/WTFString.h>
|
||||
#include <wtf/unicode/CharacterNames.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecUTF8);
|
||||
|
||||
using namespace WTF::Unicode;
|
||||
|
||||
// Sentinel returned in place of a decoded value when a byte sequence is invalid.
constexpr int nonCharacter = -1;
|
||||
|
||||
// Registers "UTF-8" and its aliases; every name resolves to "UTF-8".
void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
{
    auto registerUTF8Name = [&] (ASCIILiteral name) {
        registrar(name, "UTF-8"_s);
    };

    // From https://encoding.spec.whatwg.org.
    registerUTF8Name("UTF-8"_s);
    registerUTF8Name("utf8"_s);
    registerUTF8Name("unicode-1-1-utf-8"_s);

    // Additional aliases that originally were present in the encoding
    // table in WebKit on Macintosh, and subsequently added by
    // TextCodecICU. Perhaps we can prove some are not used on the web
    // and remove them.
    registerUTF8Name("unicode11utf8"_s);
    registerUTF8Name("unicode20utf8"_s);
    registerUTF8Name("x-unicode20utf8"_s);
}
|
||||
|
||||
std::unique_ptr<TextCodecUTF8> TextCodecUTF8::codec()
|
||||
{
|
||||
return makeUnique<TextCodecUTF8>();
|
||||
}
|
||||
|
||||
// Registers the factory for the canonical "UTF-8" name; it hands out codec() instances.
void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
{
    registrar("UTF-8"_s, [] { return codec(); });
}
|
||||
|
||||
// Returns the sequence length announced by `firstByte`:
// 2 for 0xC2-0xDF, 3 for 0xE0-0xEF, 4 for 0xF0-0xF4 and 0 for every other value.
static inline uint8_t nonASCIISequenceLength(uint8_t firstByte)
{
    // The 256-entry lookup table is generated at compile time from the three ranges.
    static constexpr std::array<uint8_t, 256> lengths = [] {
        std::array<uint8_t, 256> table {};
        for (unsigned lead = 0xC2; lead <= 0xDF; ++lead)
            table[lead] = 2;
        for (unsigned lead = 0xE0; lead <= 0xEF; ++lead)
            table[lead] = 3;
        for (unsigned lead = 0xF0; lead <= 0xF4; ++lead)
            table[lead] = 4;
        return table;
    }();
    return lengths[firstByte];
}
|
||||
|
||||
// Decodes the multi-byte UTF-8 sequence at the start of `sequence`.
//
// `length` is the expected sequence length (2, 3 or 4) on entry. On success
// the decoded code point is returned and `length` is left unchanged. On
// failure `nonCharacter` is returned and `length` is overwritten with the
// number of leading bytes that were acceptable (1, 2 or 3), i.e. the index
// of the first offending byte.
//
// The allowed range of the second byte depends on the lead byte: 0xE0 and
// 0xF0 raise the lower bound, 0xED and 0xF4 lower the upper bound.
static inline int decodeNonASCIISequence(std::span<const uint8_t> sequence, uint8_t& length)
{
    ASSERT(!isASCII(sequence[0]));

    auto isWithin = [](uint8_t byte, uint8_t low, uint8_t high) {
        return byte >= low && byte <= high;
    };

    const uint8_t lead = sequence[0];

    if (length == 2) {
        ASSERT(lead >= 0xC2);
        ASSERT(lead <= 0xDF);
        if (!isWithin(sequence[1], 0x80, 0xBF)) {
            length = 1;
            return nonCharacter;
        }
        return ((lead << 6) + sequence[1]) - 0x00003080;
    }

    if (length == 3) {
        ASSERT(lead >= 0xE0);
        ASSERT(lead <= 0xEF);
        uint8_t secondLow = 0x80;
        uint8_t secondHigh = 0xBF;
        if (lead == 0xE0)
            secondLow = 0xA0;
        else if (lead == 0xED)
            secondHigh = 0x9F;
        if (!isWithin(sequence[1], secondLow, secondHigh)) {
            length = 1;
            return nonCharacter;
        }
        if (!isWithin(sequence[2], 0x80, 0xBF)) {
            length = 2;
            return nonCharacter;
        }
        return ((lead << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
    }

    // Any remaining length is handled as a four-byte sequence.
    ASSERT(length == 4);
    ASSERT(lead >= 0xF0);
    ASSERT(lead <= 0xF4);
    uint8_t secondLow = 0x80;
    uint8_t secondHigh = 0xBF;
    if (lead == 0xF0)
        secondLow = 0x90;
    else if (lead == 0xF4)
        secondHigh = 0x8F;
    if (!isWithin(sequence[1], secondLow, secondHigh)) {
        length = 1;
        return nonCharacter;
    }
    if (!isWithin(sequence[2], 0x80, 0xBF)) {
        length = 2;
        return nonCharacter;
    }
    if (!isWithin(sequence[3], 0x80, 0xBF)) {
        length = 3;
        return nonCharacter;
    }
    return ((lead << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
}
|
||||
|
||||
// Writes `character` to the front of `destination` as UTF-16 and returns the
// remaining, still unwritten part of the span. A BMP code point takes one
// code unit; anything else is written as a lead/trail surrogate pair.
static inline std::span<UChar> appendCharacter(std::span<UChar> destination, int character)
{
    ASSERT(character != nonCharacter);
    ASSERT(!U_IS_SURROGATE(character));

    if (U_IS_BMP(character)) {
        consume(destination) = character;
        return destination;
    }

    destination[0] = U16_LEAD(character);
    destination[1] = U16_TRAIL(character);
    skip(destination, 2);
    return destination;
}
|
||||
|
||||
void TextCodecUTF8::consumePartialSequenceByte()
|
||||
{
|
||||
--m_partialSequenceSize;
|
||||
memmoveSpan(std::span { m_partialSequence }, std::span { m_partialSequence }.subspan(1, m_partialSequenceSize));
|
||||
}
|
||||
|
||||
// 8-bit variant: drains the bytes buffered in m_partialSequence into a Latin-1
// destination, pulling missing continuation bytes from `source`.
//
// Returns true when the caller has to switch to the 16-bit path: the buffered
// lead byte is invalid, or the decoded value is not Latin-1 (which includes
// the nonCharacter error marker). Returns false when the buffer has been
// drained, or when the buffered prefix is valid but still incomplete and
// `flush` is not set (more input is needed).
bool TextCodecUTF8::handlePartialSequence(std::span<LChar>& destination, std::span<const uint8_t>& source, bool flush)
{
    ASSERT(m_partialSequenceSize);
    do {
        const uint8_t leadByte = m_partialSequence[0];
        if (isASCII(leadByte)) {
            consume(destination) = leadByte;
            consumePartialSequenceByte();
            continue;
        }

        auto sequenceLength = nonASCIISequenceLength(leadByte);
        if (!sequenceLength)
            return true;

        // Top up the buffer from `source` until it holds a whole sequence or
        // `source` runs out.
        if (sequenceLength > m_partialSequenceSize && !source.empty()) {
            size_t bytesToCopy = std::min<size_t>(sequenceLength - m_partialSequenceSize, source.size());
            memcpySpan(std::span { m_partialSequence }.subspan(m_partialSequenceSize), consumeSpan(source, bytesToCopy));
            m_partialSequenceSize += bytesToCopy;
        }

        // Still short: pad the missing tail with zeros so that
        // decodeNonASCIISequence can validate the bytes we do have. The
        // padding does not belong to the partial sequence, hence
        // m_partialSequenceSize is not increased.
        const bool isTruncated = sequenceLength > m_partialSequenceSize;
        if (isTruncated)
            zeroSpan(std::span { m_partialSequence }.subspan(m_partialSequenceSize, sequenceLength - m_partialSequenceSize));

        int codePoint = decodeNonASCIISequence(std::span { m_partialSequence }, sequenceLength);
        if (isTruncated) {
            ASSERT(codePoint == nonCharacter);
            ASSERT(sequenceLength <= m_partialSequenceSize);
            // Not at the end of the stream and everything buffered so far is
            // valid: the sequence is merely incomplete, which is not an error.
            if (!flush && sequenceLength == m_partialSequenceSize)
                return false;
        }

        if (!isLatin1(codePoint))
            return true;

        m_partialSequenceSize -= sequenceLength;
        consume(destination) = codePoint;
    } while (m_partialSequenceSize);

    return false;
}
|
||||
|
||||
// 16-bit variant: drains the bytes buffered in m_partialSequence into a UTF-16
// destination, pulling missing continuation bytes from `source`.
//
// Malformed input sets `sawError`; the function then either returns at once
// (`stopOnError`) or emits replacementCharacter, discards the offending bytes
// and carries on. It also returns early, keeping the buffer, when the buffered
// prefix is valid but incomplete and `flush` is not set.
void TextCodecUTF8::handlePartialSequence(std::span<UChar>& destination, std::span<const uint8_t>& source, bool flush, bool stopOnError, bool& sawError)
{
    ASSERT(m_partialSequenceSize);
    do {
        const uint8_t leadByte = m_partialSequence[0];
        if (isASCII(leadByte)) {
            consume(destination) = leadByte;
            consumePartialSequenceByte();
            continue;
        }

        auto sequenceLength = nonASCIISequenceLength(leadByte);
        if (!sequenceLength) {
            // Not a valid lead byte at all.
            sawError = true;
            if (stopOnError)
                return;
            consume(destination) = replacementCharacter;
            consumePartialSequenceByte();
            continue;
        }

        // Top up the buffer from `source` until it holds a whole sequence or
        // `source` runs out.
        if (sequenceLength > m_partialSequenceSize && !source.empty()) {
            size_t bytesToCopy = std::min<size_t>(sequenceLength - m_partialSequenceSize, source.size());
            memcpySpan(std::span { m_partialSequence }.subspan(m_partialSequenceSize), consumeSpan(source, bytesToCopy));
            m_partialSequenceSize += bytesToCopy;
        }

        // Still short: pad the missing tail with zeros so that
        // decodeNonASCIISequence can validate the bytes we do have. The
        // padding does not belong to the partial sequence, hence
        // m_partialSequenceSize is not increased.
        const bool isTruncated = sequenceLength > m_partialSequenceSize;
        if (isTruncated)
            zeroSpan(std::span { m_partialSequence }.subspan(m_partialSequenceSize, sequenceLength - m_partialSequenceSize));

        int codePoint = decodeNonASCIISequence(std::span { m_partialSequence }, sequenceLength);
        if (isTruncated) {
            ASSERT(codePoint == nonCharacter);
            ASSERT(sequenceLength <= m_partialSequenceSize);
            // Not at the end of the stream and everything buffered so far is
            // valid: the sequence is merely incomplete, which is not an error.
            if (!flush && sequenceLength == m_partialSequenceSize)
                return;
        }

        if (codePoint == nonCharacter) {
            sawError = true;
            if (stopOnError)
                return;
            consume(destination) = replacementCharacter;
            // decodeNonASCIISequence shrank sequenceLength to the bytes that
            // were consumed by the error; drop them and keep the remainder.
            m_partialSequenceSize -= sequenceLength;
            memmoveSpan(std::span { m_partialSequence }, std::span { m_partialSequence }.subspan(sequenceLength, m_partialSequenceSize));
            continue;
        }

        m_partialSequenceSize -= sequenceLength;
        // The strip-BOM request is cleared by the first decoded character,
        // whether or not that character is a byte order mark.
        if (std::exchange(m_shouldStripByteOrderMark, false) && codePoint == byteOrderMark)
            continue;
        destination = appendCharacter(destination, codePoint);
    } while (m_partialSequenceSize);
}
|
||||
|
||||
// Decodes one chunk of a UTF-8 byte stream into a String.
//
// Decoding starts optimistically into an 8-bit (Latin-1) buffer. As soon as
// a code point above Latin-1 or a decoding error that must yield a
// replacement character is met, the characters produced so far are widened
// into a 16-bit buffer and decoding continues there.
//
// bytes       - the next chunk of input.
// flush       - true when no further input follows; any buffered partial
//               sequence is then resolved and the buffer is cleared.
// stopOnError - when true, decoding stops at the first malformed sequence.
// sawError    - set to true when malformed input (or an oversized request)
//               is encountered; never reset to false here.
//
// A sequence truncated by the end of `bytes` is kept in m_partialSequence for
// the next call.
String TextCodecUTF8::decode(std::span<const uint8_t> bytes, bool flush, bool stopOnError, bool& sawError)
{
    // Every input byte may become one character; that includes the bytes in
    // the partial-sequence buffer, because each byte of an invalid sequence
    // turns into a replacement character.
    size_t bufferSize = m_partialSequenceSize + bytes.size();
    if (bufferSize > std::numeric_limits<unsigned>::max()) {
        sawError = true;
        return {};
    }
    StringBuffer<LChar> buffer(bufferSize);

    auto source = bytes;
    auto* alignedEnd = WTF::alignToMachineWord(std::to_address(source.end()));
    auto destination = buffer.span();

    do {
        if (m_partialSequenceSize) {
            // Hand a copy of the destination span to the helper rather than
            // the local itself; taking the address of the local may disable
            // optimizations in some compilers.
            auto partialDestination = destination;
            if (handlePartialSequence(partialDestination, source, flush))
                goto upConvertTo16Bit;
            destination = partialDestination;
            if (m_partialSequenceSize)
                break;
        }

        while (!source.empty()) {
            if (isASCII(source[0])) {
                // ASCII fast path: most UTF-8 text is ASCII, so copy whole
                // machine words while they contain nothing else.
                if (WTF::isAlignedToMachineWord(source.data())) {
                    while (source.data() < alignedEnd) {
                        auto chunk = reinterpretCastSpanStartTo<const WTF::MachineWord>(source);
                        if (!WTF::containsOnlyASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination, source);
                        skip(source, sizeof(WTF::MachineWord));
                        skip(destination, sizeof(WTF::MachineWord));
                    }
                    if (source.empty())
                        break;
                    if (!isASCII(source[0]))
                        continue;
                }
                consume(destination) = consume(source);
                continue;
            }

            auto sequenceLength = nonASCIISequenceLength(source[0]);
            int codePoint = nonCharacter;
            if (sequenceLength) {
                if (sequenceLength > source.size()) {
                    // The chunk ends in the middle of a sequence: stash the
                    // tail for the next call.
                    RELEASE_ASSERT_WITH_SECURITY_IMPLICATION(source.size() < m_partialSequence.size());
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = source.size();
                    memcpySpan(std::span { m_partialSequence }, source.first(m_partialSequenceSize));
                    source = {};
                    break;
                }
                codePoint = decodeNonASCIISequence(source, sequenceLength);
            }
            if (codePoint == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;

                // The replacement character is not Latin-1, so the 16-bit
                // path has to take over; it decodes this sequence again.
                goto upConvertTo16Bit;
            }
            if (!isLatin1(codePoint))
                goto upConvertTo16Bit;

            skip(source, sequenceLength);
            consume(destination) = codePoint;
        }
    } while (m_partialSequenceSize);

    buffer.shrink(destination.data() - buffer.characters());
    if (flush)
        m_partialSequenceSize = 0;
    if (flush || buffer.length())
        m_shouldStripByteOrderMark = false;
    return String::adopt(WTFMove(buffer));

upConvertTo16Bit:
    StringBuffer<UChar> buffer16(bufferSize);

    auto destination16 = buffer16.span();

    // Widen the characters that were already decoded into the 8-bit buffer.
    auto latin1Prefix = buffer.span();
    size_t convertedLength = destination.data() - buffer.characters();
    for (size_t i = 0; i < convertedLength; ++i)
        destination16[i] = latin1Prefix[i];
    skip(destination16, convertedLength);

    do {
        if (m_partialSequenceSize) {
            // Same copy-in / copy-out dance as in the 8-bit loop above.
            auto partialDestination = destination16;
            handlePartialSequence(partialDestination, source, flush, stopOnError, sawError);
            destination16 = partialDestination;
            if (m_partialSequenceSize)
                break;
        }

        while (!source.empty()) {
            if (isASCII(source[0])) {
                // ASCII fast path: most UTF-8 text is ASCII, so copy whole
                // machine words while they contain nothing else.
                if (WTF::isAlignedToMachineWord(source.data())) {
                    while (source.data() < alignedEnd) {
                        auto chunk = reinterpretCastSpanStartTo<const WTF::MachineWord>(source);
                        if (!WTF::containsOnlyASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination16, source);
                        skip(source, sizeof(WTF::MachineWord));
                        skip(destination16, sizeof(WTF::MachineWord));
                    }
                    if (source.empty())
                        break;
                    if (!isASCII(source[0]))
                        continue;
                }
                consume(destination16) = consume(source);
                continue;
            }

            auto sequenceLength = nonASCIISequenceLength(source[0]);
            int codePoint = nonCharacter;
            if (sequenceLength) {
                if (sequenceLength > source.size()) {
                    // The chunk ends in the middle of a sequence: stash the
                    // tail for the next call.
                    RELEASE_ASSERT_WITH_SECURITY_IMPLICATION(source.size() < m_partialSequence.size());
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = source.size();
                    memcpySpan(std::span { m_partialSequence }, source.first(m_partialSequenceSize));
                    source = {};
                    break;
                }
                codePoint = decodeNonASCIISequence(source, sequenceLength);
            }
            if (codePoint == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;
                consume(destination16) = replacementCharacter;
                // Skip the bytes consumed by the error; an invalid lead byte
                // (length 0) is skipped on its own.
                skip(source, sequenceLength ? sequenceLength : 1);
                continue;
            }
            skip(source, sequenceLength);
            // A byte order mark is dropped only when nothing has been written
            // yet and stripping was requested.
            if (codePoint == byteOrderMark && destination16.data() == buffer16.characters() && std::exchange(m_shouldStripByteOrderMark, false))
                continue;
            destination16 = appendCharacter(destination16, codePoint);
        }
    } while (m_partialSequenceSize);

    buffer16.shrink(destination16.data() - buffer16.characters());
    if (flush)
        m_partialSequenceSize = 0;
    if (flush || buffer16.length())
        m_shouldStripByteOrderMark = false;
    return String::adopt(WTFMove(buffer16));
}
|
||||
|
||||
// Encodes `string` as UTF-8 and returns the bytes.
//
// The output is sized up front for the worst case of 3 bytes per UTF-16 code
// unit: a BMP character is one code unit and at most 3 bytes (3x), a non-BMP
// character is two code units and at most 4 bytes (2x). The size computation
// is overflow-checked; the vector is trimmed to the bytes actually written.
Vector<uint8_t> TextCodecUTF8::encodeUTF8(StringView string)
{
    Vector<uint8_t> encoded(WTF::checkedProduct<size_t>(string.length(), 3));

    size_t encodedLength = 0;
    for (auto codePoint : string.codePoints()) {
        U8_APPEND_UNSAFE(encoded, encodedLength, codePoint);
    }

    encoded.shrink(encodedLength);
    return encoded;
}
|
||||
|
||||
// TextCodec entry point for encoding. The UnencodableHandling argument is
// ignored and the work is delegated to the static encodeUTF8().
Vector<uint8_t> TextCodecUTF8::encode(StringView string, UnencodableHandling) const
{
    return TextCodecUTF8::encodeUTF8(string);
}
|
||||
|
||||
} // namespace PAL
|
||||
58
src/bun.js/bindings/webcore/TextCodecUTF8.h
Normal file
58
src/bun.js/bindings/webcore/TextCodecUTF8.h
Normal file
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
* Copyright (C) 2011-2020 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include <unicode/utf8.h>
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
#include <wtf/text/LChar.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextCodecUTF8 final : public TextCodec {
|
||||
WTF_MAKE_TZONE_ALLOCATED(TextCodecUTF8);
|
||||
public:
|
||||
static void registerEncodingNames(EncodingNameRegistrar);
|
||||
static void registerCodecs(TextCodecRegistrar);
|
||||
|
||||
static Vector<uint8_t> encodeUTF8(StringView);
|
||||
static std::unique_ptr<TextCodecUTF8> codec();
|
||||
|
||||
private:
|
||||
void stripByteOrderMark() final { m_shouldStripByteOrderMark = true; }
|
||||
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
|
||||
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
|
||||
|
||||
bool handlePartialSequence(std::span<LChar>& destination, std::span<const uint8_t>& source, bool flush);
|
||||
void handlePartialSequence(std::span<UChar>& destination, std::span<const uint8_t>& source, bool flush, bool stopOnError, bool& sawError);
|
||||
void consumePartialSequenceByte();
|
||||
|
||||
int m_partialSequenceSize { 0 };
|
||||
std::array<uint8_t, U8_MAX_LENGTH> m_partialSequence;
|
||||
bool m_shouldStripByteOrderMark { false };
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
99
src/bun.js/bindings/webcore/TextCodecUserDefined.cpp
Normal file
99
src/bun.js/bindings/webcore/TextCodecUserDefined.cpp
Normal file
@@ -0,0 +1,99 @@
|
||||
/*
|
||||
* Copyright (C) 2007-2017 Apple, Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextCodecUserDefined.h"
|
||||
|
||||
#include <array>
|
||||
#include <wtf/TZoneMallocInlines.h>
|
||||
#include <wtf/text/CString.h>
|
||||
#include <wtf/text/StringBuilder.h>
|
||||
#include <wtf/text/WTFString.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecUserDefined);
|
||||
|
||||
// Registers the single label of this codec; the label and the canonical name
// are both "x-user-defined".
void TextCodecUserDefined::registerEncodingNames(EncodingNameRegistrar registrar)
{
    const ASCIILiteral name = "x-user-defined"_s;
    registrar(name, name);
}
|
||||
|
||||
// Registers the factory that creates an x-user-defined codec.
void TextCodecUserDefined::registerCodecs(TextCodecRegistrar registrar)
{
    registrar("x-user-defined"_s, [] { return makeUnique<TextCodecUserDefined>(); });
}
|
||||
|
||||
// Decodes x-user-defined bytes: 0x00..0x7F map to U+0000..U+007F and
// 0x80..0xFF map to U+F780..U+F7FF. The flush / stopOnError / sawError
// arguments are unused because every byte decodes on its own.
String TextCodecUserDefined::decode(std::span<const uint8_t> bytes, bool, bool, bool&)
{
    StringBuilder result;
    result.reserveCapacity(bytes.size());
    for (uint8_t byte : bytes) {
        // The mask trick below depends on sign extension: a high byte must
        // become a negative value (0xFF80..0xFFFF in the low 16 bits) so that
        // `& 0xF7FF` folds it to 0xF780..0xF7FF. Plain `char` is unsigned on
        // some targets, which would leave high bytes as U+0080..U+00FF, so
        // the signed type is spelled out, as in encodeComplexUserDefined().
        int8_t signedByte = static_cast<int8_t>(byte);
        result.append(static_cast<UChar>(signedByte & 0xF7FF));
    }
    return result.toString();
}
|
||||
|
||||
// Slow path of the x-user-defined encoder, used when the input is not pure
// ASCII. A code point is encodable when sign-extending its low byte and
// masking with 0xF7FF reproduces the code point; that byte is then emitted.
// Anything else is replaced according to `handling`.
static Vector<uint8_t> encodeComplexUserDefined(StringView string, UnencodableHandling handling)
{
    Vector<uint8_t> encoded;

    for (auto codePoint : string.codePoints()) {
        int8_t lowByte = codePoint;
        if ((lowByte & 0xF7FF) != codePoint) {
            // No way to encode this character with x-user-defined.
            UnencodableReplacementArray replacement;
            encoded.append(TextCodec::getUnencodableReplacement(codePoint, handling, replacement));
            continue;
        }
        encoded.append(lowByte);
    }

    return encoded;
}
|
||||
|
||||
// Encodes `string` as x-user-defined. A fast path truncates every code unit
// to a byte while OR-ing all code units together; if no bit outside the
// ASCII range was seen, that result is returned as is. Otherwise the work is
// redone by encodeComplexUserDefined().
Vector<uint8_t> TextCodecUserDefined::encode(StringView string, UnencodableHandling handling) const
{
    {
        // Scoped so the fast-path buffer is released before the slow path runs.
        Vector<uint8_t> fastResult(string.length());
        size_t writeIndex = 0;

        // Convert and, in the same pass, find out whether it is all ASCII.
        UChar combinedBits = 0;
        for (auto codeUnit : string.codeUnits()) {
            fastResult[writeIndex++] = codeUnit;
            combinedBits |= codeUnit;
        }

        const bool isAllASCII = !(combinedBits & 0xFF80);
        if (isAllASCII)
            return fastResult;
    }

    // Not all ASCII: use the function that handles the more complex cases.
    return encodeComplexUserDefined(string, handling);
}
|
||||
|
||||
} // namespace PAL
|
||||
44
src/bun.js/bindings/webcore/TextCodecUserDefined.h
Normal file
44
src/bun.js/bindings/webcore/TextCodecUserDefined.h
Normal file
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright (C) 2007-2017 Apple, Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include <wtf/TZoneMalloc.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextCodecUserDefined final : public TextCodec {
|
||||
WTF_MAKE_TZONE_ALLOCATED(TextCodecUserDefined);
|
||||
public:
|
||||
static void registerEncodingNames(EncodingNameRegistrar);
|
||||
static void registerCodecs(TextCodecRegistrar);
|
||||
|
||||
private:
|
||||
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
|
||||
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
|
||||
};
|
||||
|
||||
} // namespace PAL
|
||||
197
src/bun.js/bindings/webcore/TextEncoding.cpp
Normal file
197
src/bun.js/bindings/webcore/TextEncoding.cpp
Normal file
@@ -0,0 +1,197 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2019 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
|
||||
* Copyright (C) 2007-2009 Torch Mobile, Inc.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextEncoding.h"
|
||||
|
||||
#include "DecodeEscapeSequences.h"
|
||||
#include "TextCodec.h"
|
||||
#include "TextEncodingRegistry.h"
|
||||
#include <wtf/NeverDestroyed.h>
|
||||
#include <wtf/StdLibExtras.h>
|
||||
#include <wtf/text/StringView.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// File-local accessor for the lazily created, never destroyed UTF-7 encoding.
static const TextEncoding& UTF7Encoding()
{
    static NeverDestroyed<TextEncoding> encoding("UTF-7"_s);
    return encoding.get();
}
|
||||
|
||||
// Builds an encoding from a literal label. The label is canonicalized with
// atomCanonicalTextEncodingName(), and the backslash/currency symbol is then
// cached. backslashAsCurrencySymbol() reads m_name, so this relies on m_name
// being initialized first (member order is declared in the header).
TextEncoding::TextEncoding(ASCIILiteral name)
    : m_name(atomCanonicalTextEncodingName(name)),
      m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
{
}
|
||||
|
||||
// Builds an encoding from a label held in a StringView. The label is
// canonicalized with atomCanonicalTextEncodingName(), and the
// backslash/currency symbol is then cached. backslashAsCurrencySymbol() reads
// m_name, so this relies on m_name being initialized first (member order is
// declared in the header).
TextEncoding::TextEncoding(StringView name)
    : m_name(atomCanonicalTextEncodingName(name)),
      m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
{
}
|
||||
|
||||
// Convenience overload: views the String and delegates to the StringView
// constructor.
TextEncoding::TextEncoding(const String& name)
    : TextEncoding(StringView(name))
{
}
|
||||
|
||||
// One-shot decode of `data` with a freshly created codec for this encoding.
// The codec is always called with flush = true. Returns a null String when
// this encoding has no name.
String TextEncoding::decode(std::span<const uint8_t> data, bool stopOnError, bool& sawError) const
{
    if (m_name.isNull())
        return String();

    constexpr bool flush = true;
    auto codec = newTextCodec(*this);
    return codec->decode(data, flush, stopOnError, sawError);
}
|
||||
|
||||
// Encodes `string` with a freshly created codec for this encoding, optionally
// NFC-normalizing it first. Returns an empty vector when this encoding has no
// name or the input is empty.
Vector<uint8_t> TextEncoding::encode(StringView string, PAL::UnencodableHandling handling, NFCNormalize normalize) const
{
    if (m_name.isNull())
        return {};
    if (string.isEmpty())
        return {};

    auto codec = newTextCodec(*this);

    // FIXME: What's the right place to do normalization?
    // Doing it inside the encode function is a little strange; perhaps it
    // should be an explicit step performed before calling encode.
    if (normalize == NFCNormalize::Yes) {
        auto normalized = normalizedNFC(string);
        return codec->encode(normalized.view, handling);
    }
    return codec->encode(string, handling);
}
|
||||
|
||||
// Returns the encoding name to expose to the DOM. This is m_name, except
// that "windows-949" is reported as "EUC-KR".
ASCIILiteral TextEncoding::domName() const
{
    if (!noExtendedTextEncodingNameUsed()) {
        // EUC-KR is treated as windows-949 (its superset), but the name
        // 'EUC-KR' has to be exposed: most Korean web servers do not
        // recognize 'windows-949', even though they use that encoding under
        // the name 'EUC-KR'.
        // FIXME: This is not thread-safe. At the moment this function is
        // only accessed in a single thread, but eventually it has to be made
        // thread-safe along with usesVisualOrdering().
        static const ASCIILiteral windows949 = atomCanonicalTextEncodingName("windows-949"_s);
        if (m_name == windows949)
            return "EUC-KR"_s;
    }
    return m_name;
}
|
||||
|
||||
bool TextEncoding::usesVisualOrdering() const
|
||||
{
|
||||
if (noExtendedTextEncodingNameUsed())
|
||||
return false;
|
||||
|
||||
static const ASCIILiteral iso88598 = atomCanonicalTextEncodingName("ISO-8859-8"_s);
|
||||
return m_name == iso88598;
|
||||
}
|
||||
|
||||
bool TextEncoding::isJapanese() const
|
||||
{
|
||||
return isJapaneseEncoding(m_name);
|
||||
}
|
||||
|
||||
// Returns the character to show in place of a backslash: U+00A5 when
// shouldShowBackslashAsCurrencySymbolIn() says so for this encoding,
// otherwise the backslash itself.
UChar TextEncoding::backslashAsCurrencySymbol() const
{
    if (shouldShowBackslashAsCurrencySymbolIn(m_name))
        return 0x00A5;
    return '\\';
}
|
||||
|
||||
bool TextEncoding::isNonByteBasedEncoding() const
|
||||
{
|
||||
return *this == UTF16LittleEndianEncoding() || *this == UTF16BigEndianEncoding();
|
||||
}
|
||||
|
||||
bool TextEncoding::isUTF7Encoding() const
|
||||
{
|
||||
if (noExtendedTextEncodingNameUsed())
|
||||
return false;
|
||||
|
||||
return *this == UTF7Encoding();
|
||||
}
|
||||
|
||||
// Returns UTF-8 for the non-byte-based (UTF-16) encodings and this encoding
// itself for everything else.
const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
{
    const bool needsByteBasedSubstitute = isNonByteBasedEncoding();
    if (!needsByteBasedSubstitute)
        return *this;
    return UTF8Encoding();
}
|
||||
|
||||
// HTML5 specifies that UTF-8 be used in form submission when a form is
|
||||
// a part of a document in UTF-16 probably because UTF-16 is not a
|
||||
// byte-based encoding and can contain 0x00. By extension, the same
|
||||
// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
|
||||
// but it's fraught with problems and we'd rather steer clear of it.
|
||||
// Returns the encoding to use for form submission and URL parsing: UTF-8
// when this encoding is non-byte-based (UTF-16) or UTF-7, otherwise this
// encoding itself.
const TextEncoding& TextEncoding::encodingForFormSubmissionOrURLParsing() const
{
    const bool mustFallBackToUTF8 = isNonByteBasedEncoding() || isUTF7Encoding();
    if (!mustFallBackToUTF8)
        return *this;
    return UTF8Encoding();
}
|
||||
|
||||
// Shared, lazily created and never destroyed encoding for the label "ASCII".
const TextEncoding& ASCIIEncoding()
{
    static NeverDestroyed<TextEncoding> encoding("ASCII"_s);
    return encoding.get();
}
|
||||
|
||||
// Shared, lazily created and never destroyed encoding for the label "latin1".
const TextEncoding& Latin1Encoding()
{
    static NeverDestroyed<TextEncoding> encoding("latin1"_s);
    return encoding.get();
}
|
||||
|
||||
// Shared, lazily created and never destroyed encoding for the label "UTF-16BE".
const TextEncoding& UTF16BigEndianEncoding()
{
    static NeverDestroyed<TextEncoding> encoding("UTF-16BE"_s);
    return encoding.get();
}
|
||||
|
||||
// Shared, lazily created and never destroyed encoding for the label "UTF-16LE".
const TextEncoding& UTF16LittleEndianEncoding()
{
    static NeverDestroyed<TextEncoding> encoding("UTF-16LE"_s);
    return encoding.get();
}
|
||||
|
||||
// Shared, lazily created and never destroyed encoding for the label "UTF-8".
// Debug builds assert that the resulting encoding is valid.
const TextEncoding& UTF8Encoding()
{
    static NeverDestroyed<TextEncoding> encoding("UTF-8"_s);
    ASSERT(encoding.get().isValid());
    return encoding.get();
}
|
||||
|
||||
// Shared, lazily created and never destroyed encoding for the label
// "WinLatin-1".
const TextEncoding& WindowsLatin1Encoding()
{
    static NeverDestroyed<TextEncoding> encoding("WinLatin-1"_s);
    return encoding.get();
}
|
||||
|
||||
// Decodes the URL escape sequences in `string`, interpreting the unescaped
// bytes with `encoding`. An empty input is converted to a String directly,
// without invoking the decoder.
String decodeURLEscapeSequences(StringView string, const TextEncoding& encoding)
{
    if (!string.isEmpty())
        return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
    return string.toString();
}
|
||||
|
||||
} // namespace PAL
|
||||
91
src/bun.js/bindings/webcore/TextEncoding.h
Normal file
91
src/bun.js/bindings/webcore/TextEncoding.h
Normal file
@@ -0,0 +1,91 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "root.h"
|
||||
|
||||
#include "UnencodableHandling.h"
|
||||
#include <wtf/URL.h>
|
||||
#include <wtf/text/StringView.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// Flag type accepted by TextEncoding::encode (which defaults it to Yes).
enum class NFCNormalize : bool {
    No,
    Yes
};
|
||||
|
||||
class TextEncoding : public WTF::URLTextEncoding {
|
||||
public:
|
||||
TextEncoding() = default;
|
||||
TextEncoding(ASCIILiteral name);
|
||||
TextEncoding(StringView name);
|
||||
TextEncoding(const String& name);
|
||||
|
||||
bool isValid() const { return !m_name.isNull(); }
|
||||
ASCIILiteral name() const { return m_name; }
|
||||
ASCIILiteral domName() const; // name exposed via DOM
|
||||
bool usesVisualOrdering() const;
|
||||
bool isJapanese() const;
|
||||
|
||||
const TextEncoding& closestByteBasedEquivalent() const;
|
||||
const TextEncoding& encodingForFormSubmissionOrURLParsing() const;
|
||||
|
||||
String decode(std::span<const uint8_t>, bool stopOnError, bool& sawError) const;
|
||||
String decode(std::span<const uint8_t>) const;
|
||||
Vector<uint8_t> encode(StringView, PAL::UnencodableHandling, NFCNormalize = NFCNormalize::Yes) const;
|
||||
Vector<uint8_t> encodeForURLParsing(StringView string) const final { return encode(string, PAL::UnencodableHandling::URLEncodedEntities, NFCNormalize::No); }
|
||||
|
||||
UChar backslashAsCurrencySymbol() const;
|
||||
bool isByteBasedEncoding() const { return !isNonByteBasedEncoding(); }
|
||||
|
||||
private:
|
||||
bool isNonByteBasedEncoding() const;
|
||||
bool isUTF7Encoding() const;
|
||||
|
||||
ASCIILiteral m_name;
|
||||
UChar m_backslashAsCurrencySymbol;
|
||||
};
|
||||
|
||||
// Two TextEncoding objects compare equal when name() matches.
inline bool operator==(const TextEncoding& a, const TextEncoding& b)
{
    return a.name() == b.name();
}
|
||||
|
||||
const TextEncoding& ASCIIEncoding();
|
||||
const TextEncoding& Latin1Encoding();
|
||||
const TextEncoding& UTF16BigEndianEncoding();
|
||||
const TextEncoding& UTF16LittleEndianEncoding();
|
||||
const TextEncoding& UTF8Encoding();
|
||||
const TextEncoding& WindowsLatin1Encoding();
|
||||
|
||||
// Unescapes the given string using URL escaping rules.
|
||||
// DANGER: If the URL has "%00" in it,
|
||||
// the resulting string will have embedded null characters!
|
||||
String decodeURLEscapeSequences(StringView, const TextEncoding& = UTF8Encoding());
|
||||
|
||||
// Convenience overload: decodes with stopOnError == false and discards the
// sawError result of the three-argument decode().
inline String TextEncoding::decode(std::span<const uint8_t> characters) const
{
    bool sawErrorUnused;
    return decode(characters, /* stopOnError */ false, sawErrorUnused);
}
|
||||
|
||||
} // namespace PAL
|
||||
46
src/bun.js/bindings/webcore/TextEncodingDetector.h
Normal file
46
src/bun.js/bindings/webcore/TextEncodingDetector.h
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Copyright (C) 2009 Google Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above
|
||||
* copyright notice, this list of conditions and the following disclaimer
|
||||
* in the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* * Neither the name of Google Inc. nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <span>
|
||||
#include <wtf/text/ASCIILiteral.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextEncoding;
|
||||
|
||||
// Given a sequence of bytes in |data| and an optional
|
||||
// hintEncodingName, detect the most likely character encoding.
|
||||
// The way hintEncodingName is used is up to an implementation.
|
||||
// Currently, the only caller sets it to the parent frame encoding.
|
||||
bool detectTextEncoding(std::span<const uint8_t> data, ASCIILiteral hintEncodingName, TextEncoding* detectedEncoding);
|
||||
|
||||
} // namespace PAL
|
||||
125
src/bun.js/bindings/webcore/TextEncodingDetectorICU.cpp
Normal file
125
src/bun.js/bindings/webcore/TextEncodingDetectorICU.cpp
Normal file
@@ -0,0 +1,125 @@
|
||||
/*
|
||||
* Copyright (C) 2008, 2009 Google Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above
|
||||
* copyright notice, this list of conditions and the following disclaimer
|
||||
* in the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* * Neither the name of Google Inc. nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextEncodingDetector.h"
|
||||
|
||||
#include "TextEncoding.h"
|
||||
#include "unicode-ucnv.h"
|
||||
#include "unicode-ucsdet.h"
|
||||
|
||||
#include <span>
|
||||
#include "unicode-ucsdet.h"
|
||||
|
||||
namespace WTF {
|
||||
|
||||
WTF_EXPORT_PRIVATE std::span<const UCharsetMatch*> ucsdet_detectAll_span(UCharsetDetector*, UErrorCode* status);
|
||||
|
||||
} // namespace WTF
|
||||
|
||||
using WTF::ucsdet_detectAll_span;
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// Runs the ICU charset detector over |data| and stores the selected encoding
// in |*detectedEncoding|, which is first reset to a default-constructed
// TextEncoding. When |hintEncodingName| is non-null, a candidate equal to the
// hint is preferred over the top-ranked match.
//
// Returns true when an encoding name was obtained and stored; returns false
// when ICU reports an error or the detector produced no candidate at all.
// The detector is closed on every path after it has been opened.
bool detectTextEncoding(std::span<const uint8_t> data, ASCIILiteral hintEncodingName, TextEncoding* detectedEncoding)
{
    *detectedEncoding = TextEncoding();
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector* detector = ucsdet_open(&status);
    if (U_FAILURE(status))
        return false;
    ucsdet_enableInputFilter(detector, true);
    ucsdet_setText(detector, byteCast<char>(data.data()), static_cast<int32_t>(data.size()), &status);
    if (U_FAILURE(status)) {
        // The detector was opened successfully above, so it has to be
        // released here as well; returning directly would leak it.
        ucsdet_close(detector);
        return false;
    }

    // FIXME: A few things we can do other than improving
    // the ICU detector itself.
    // 1. Use ucsdet_detectAll and pick the most likely one given
    // "the context" (parent-encoding, referrer encoding, etc).
    // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
    // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
    // encoding with a highest confidence among the detector-specific
    // limited set of candidate encodings.
    // Below is a partial implementation of the first part of what's outlined
    // above.
    auto matches = ucsdet_detectAll_span(detector, &status);
    if (U_FAILURE(status)) {
        ucsdet_close(detector);
        return false;
    }

    const char* encoding = nullptr;
    if (!hintEncodingName.isNull()) {
        TextEncoding hintEncoding(hintEncodingName);
        // 10 is the minimum confidence value consistent with the codepoint
        // allocation in a given encoding. The size of a chunk passed to
        // us varies even for the same html file (apparently depending on
        // the network load). When we're given a rather short chunk, we
        // don't have a sufficiently reliable signal other than the fact that
        // the chunk is consistent with a set of encodings. So, instead of
        // setting an arbitrary threshold, we have to scan all the encodings
        // consistent with the data.
        const int32_t kThreshold = 10;
        for (auto* match : matches) {
            int32_t confidence = ucsdet_getConfidence(match, &status);
            if (U_FAILURE(status)) {
                // Skip this candidate but keep scanning the remaining ones.
                status = U_ZERO_ERROR;
                continue;
            }
            if (confidence < kThreshold)
                break;
            const char* matchEncoding = ucsdet_getName(match, &status);
            if (U_FAILURE(status)) {
                status = U_ZERO_ERROR;
                continue;
            }
            if (TextEncoding(StringView::fromLatin1(matchEncoding)) == hintEncoding) {
                encoding = hintEncodingName;
                break;
            }
        }
    }
    // If no match is found so far, just pick the top match.
    // This can happen, say, when a parent frame in EUC-JP refers to
    // a child frame in Shift_JIS and both frames do NOT specify the encoding
    // making us resort to auto-detection (when it IS turned on).
    if (!encoding && !matches.empty())
        encoding = ucsdet_getName(matches[0], &status);

    // |encoding| stays null when the detector returned no candidates; that
    // case is reported as "nothing detected" rather than building a
    // StringView from a null pointer. The TextEncoding is constructed before
    // the detector is closed because a name returned by ucsdet_getName() is
    // owned by the detector's match object.
    bool detected = false;
    if (U_SUCCESS(status) && encoding) {
        *detectedEncoding = TextEncoding(StringView::fromLatin1(encoding));
        detected = true;
    }
    ucsdet_close(detector);
    return detected;
}
|
||||
|
||||
}
|
||||
368
src/bun.js/bindings/webcore/TextEncodingRegistry.cpp
Normal file
368
src/bun.js/bindings/webcore/TextEncodingRegistry.cpp
Normal file
@@ -0,0 +1,368 @@
|
||||
/*
|
||||
* Copyright (C) 2006-2017 Apple Inc. All rights reserved.
|
||||
* Copyright (C) 2007-2009 Torch Mobile, Inc.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include "TextEncodingRegistry.h"
|
||||
|
||||
#include "TextCodecCJK.h"
|
||||
#include "TextCodecICU.h"
|
||||
#include "TextCodecLatin1.h"
|
||||
#include "TextCodecReplacement.h"
|
||||
#include "TextCodecSingleByte.h"
|
||||
#include "TextCodecUTF16.h"
|
||||
#include "TextCodecUTF8.h"
|
||||
#include "TextCodecUserDefined.h"
|
||||
#include "TextEncoding.h"
|
||||
#include <mutex>
|
||||
#include <wtf/ASCIICType.h>
|
||||
#include <wtf/CheckedArithmetic.h>
|
||||
#include <wtf/HashMap.h>
|
||||
#include <wtf/HashSet.h>
|
||||
#include <wtf/Lock.h>
|
||||
#include <wtf/MainThread.h>
|
||||
#include <wtf/StdLibExtras.h>
|
||||
#include <wtf/text/CString.h>
|
||||
#include <wtf/text/StringHash.h>
|
||||
|
||||
namespace PAL {
|
||||
|
||||
constexpr size_t maxEncodingNameLength = 63;
|
||||
|
||||
// Hash for all-ASCII strings that does case folding.
|
||||
struct TextEncodingNameHash {
|
||||
static bool equal(std::span<const LChar> s1, std::span<const LChar> s2)
|
||||
{
|
||||
if (s1.size() != s2.size())
|
||||
return false;
|
||||
|
||||
for (size_t i = 0; i < s1.size(); ++i) {
|
||||
if (toASCIILower(s1[i]) != toASCIILower(s2[i]))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool equal(ASCIILiteral s1, ASCIILiteral s2)
|
||||
{
|
||||
return equal(s1.span8(), s2.span8());
|
||||
}
|
||||
|
||||
// This algorithm is the one-at-a-time hash from:
|
||||
// http://burtleburtle.net/bob/hash/hashfaq.html
|
||||
// http://burtleburtle.net/bob/hash/doobs.html
|
||||
static unsigned hash(std::span<const LChar> s)
|
||||
{
|
||||
unsigned h = WTF::stringHashingStartValue;
|
||||
for (char c : s) {
|
||||
h += toASCIILower(c);
|
||||
h += (h << 10);
|
||||
h ^= (h >> 6);
|
||||
}
|
||||
h += (h << 3);
|
||||
h ^= (h >> 11);
|
||||
h += (h << 15);
|
||||
return h;
|
||||
}
|
||||
|
||||
static unsigned hash(ASCIILiteral s)
|
||||
{
|
||||
return hash(s.span8());
|
||||
}
|
||||
|
||||
static const bool safeToCompareToEmptyOrDeleted = false;
|
||||
};
|
||||
|
||||
struct HashTranslatorTextEncodingName {
|
||||
static unsigned hash(std::span<const LChar> literal)
|
||||
{
|
||||
return TextEncodingNameHash::hash(literal);
|
||||
}
|
||||
|
||||
static bool equal(const ASCIILiteral& a, std::span<const LChar> b)
|
||||
{
|
||||
return TextEncodingNameHash::equal(a.span8(), b);
|
||||
}
|
||||
};
|
||||
|
||||
using TextEncodingNameMap = HashMap<ASCIILiteral, ASCIILiteral, TextEncodingNameHash>;
|
||||
using TextCodecMap = HashMap<ASCIILiteral, NewTextCodecFunction>;
|
||||
|
||||
static Lock encodingRegistryLock;
|
||||
|
||||
static TextEncodingNameMap* textEncodingNameMap WTF_GUARDED_BY_LOCK(encodingRegistryLock);
|
||||
static TextCodecMap* textCodecMap WTF_GUARDED_BY_LOCK(encodingRegistryLock);
|
||||
static bool didExtendTextCodecMaps;
|
||||
static HashSet<ASCIILiteral>* japaneseEncodings;
|
||||
static HashSet<ASCIILiteral>* nonBackslashEncodings;
|
||||
|
||||
static constexpr ASCIILiteral textEncodingNameBlocklist[] = { "UTF-7"_s, "BOCU-1"_s, "SCSU"_s };
|
||||
|
||||
// Returns true for aliases that must not be registered in the name map.
static bool isUndesiredAlias(ASCIILiteral alias)
{
    // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
    const bool containsComma = strchr(alias.characters(), ',') != nullptr;

    // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
    // problem, see bug 43554.
    return containsComma || alias == "8859_1"_s;
}
|
||||
|
||||
// Registers |alias| in textEncodingNameMap, mapped to the name already stored
// for |name| (or to |name| itself when it has no entry yet). Undesired
// aliases are ignored.
static void addToTextEncodingNameMap(ASCIILiteral alias, ASCIILiteral name) WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    ASSERT(strlen(alias) <= maxEncodingNameLength);
    if (isUndesiredAlias(alias))
        return;

    ASCIILiteral canonicalName = textEncodingNameMap->get(name);
    // Only a self-registration (alias == name) may refer to a name that is
    // not in the map yet.
    ASSERT((alias == name) || !canonicalName.isNull());
    if (canonicalName.isNull())
        canonicalName = name;

    ASSERT_WITH_MESSAGE(textEncodingNameMap->get(alias).isNull(), "Duplicate text encoding name %s for %s (previously registered as %s)", alias.characters(), canonicalName.characters(), textEncodingNameMap->get(alias).characters());

    textEncodingNameMap->add(alias, canonicalName);
}
|
||||
|
||||
// Stores the codec factory |function| in textCodecMap under the name that
// textEncodingNameMap holds for |name|.
static void addToTextCodecMap(ASCIILiteral name, NewTextCodecFunction&& function) WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    ASCIILiteral canonicalName = textEncodingNameMap->get(name);
    ASSERT(!canonicalName.isNull());
    textCodecMap->add(canonicalName, WTFMove(function));
}
|
||||
|
||||
// For each entry in textEncodingNameBlocklist that is present in the name
// map, removes every alias mapped to the same name and the codec itself.
static void pruneBlocklistedCodecs() WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    for (auto& blocklistedName : textEncodingNameBlocklist) {
        ASCIILiteral canonicalName = textEncodingNameMap->get(blocklistedName);
        if (canonicalName.isNull())
            continue;

        // Collect the keys first, then remove them, so the map is not
        // modified while it is being iterated.
        Vector<ASCIILiteral> aliasesToRemove;
        for (auto& entry : *textEncodingNameMap) {
            if (entry.value == canonicalName)
                aliasesToRemove.append(entry.key);
        }

        for (auto& aliasToRemove : aliasesToRemove)
            textEncodingNameMap->remove(aliasToRemove);

        textCodecMap->remove(canonicalName);
    }
}
|
||||
|
||||
// Allocates both registry maps and populates them with the Latin-1, UTF-8,
// UTF-16 and user-defined codecs. Must only run while the maps are unset.
static void buildBaseTextCodecMaps() WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    ASSERT(!textCodecMap);
    ASSERT(!textEncodingNameMap);

    textCodecMap = new TextCodecMap;
    textEncodingNameMap = new TextEncodingNameMap;

    // For each codec the names are registered before the factory:
    // addToTextCodecMap() looks the name up in textEncodingNameMap.
    TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecLatin1::registerCodecs(addToTextCodecMap);

    TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF8::registerCodecs(addToTextCodecMap);

    TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF16::registerCodecs(addToTextCodecMap);

    TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUserDefined::registerCodecs(addToTextCodecMap);
}
|
||||
|
||||
// Adds to |set| the name that textEncodingNameMap holds for |name|; does
// nothing when |name| is not registered.
static void addEncodingName(HashSet<ASCIILiteral>& set, ASCIILiteral name) WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    // We must not use atomCanonicalTextEncodingName() because this function is called in it.
    ASCIILiteral canonicalName = textEncodingNameMap->get(name);
    if (canonicalName.isNull())
        return;
    set.add(canonicalName);
}
|
||||
|
||||
// Allocates and fills the japaneseEncodings and nonBackslashEncodings sets
// from fixed name lists. Must only run while both sets are unset.
static void buildQuirksSets() WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn()
    // and initializing the sets for them in TextEncodingRegistry.cpp look strange.

    ASSERT(!japaneseEncodings);
    ASSERT(!nonBackslashEncodings);

    static constexpr ASCIILiteral japaneseEncodingNames[] = {
        "EUC-JP"_s,
        "ISO-2022-JP"_s,
        "ISO-2022-JP-1"_s,
        "ISO-2022-JP-2"_s,
        "ISO-2022-JP-3"_s,
        "JIS_C6226-1978"_s,
        "JIS_X0201"_s,
        "JIS_X0208-1983"_s,
        "JIS_X0208-1990"_s,
        "JIS_X0212-1990"_s,
        "Shift_JIS"_s,
        "Shift_JIS_X0213-2000"_s,
        "cp932"_s,
        "x-mac-japanese"_s,
    };

    // The text encodings below treat backslash as a currency symbol for IE compatibility.
    // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
    // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them.
    static constexpr ASCIILiteral nonBackslashEncodingNames[] = {
        "x-mac-japanese"_s,
        "ISO-2022-JP"_s,
        "EUC-JP"_s,
        "Shift_JIS"_s,
        "Shift_JIS_X0213-2000"_s,
    };

    japaneseEncodings = new HashSet<ASCIILiteral>;
    for (auto encodingName : japaneseEncodingNames)
        addEncodingName(*japaneseEncodings, encodingName);

    nonBackslashEncodings = new HashSet<ASCIILiteral>;
    for (auto encodingName : nonBackslashEncodingNames)
        addEncodingName(*nonBackslashEncodings, encodingName);
}
|
||||
|
||||
// True when |canonicalEncodingName| is non-null, the japaneseEncodings set
// has been built, and the set contains the name.
bool isJapaneseEncoding(ASCIILiteral canonicalEncodingName)
{
    if (canonicalEncodingName.isNull() || !japaneseEncodings)
        return false;
    return japaneseEncodings->contains(canonicalEncodingName);
}
|
||||
|
||||
// True when |canonicalEncodingName| is non-null, the nonBackslashEncodings
// set has been built, and the set contains the name.
bool shouldShowBackslashAsCurrencySymbolIn(ASCIILiteral canonicalEncodingName)
{
    if (canonicalEncodingName.isNull() || !nonBackslashEncodings)
        return false;
    return nonBackslashEncodings->contains(canonicalEncodingName);
}
|
||||
|
||||
// Registers the replacement, ICU, CJK and single-byte codecs on top of the
// base maps, then prunes blocklisted codecs and builds the quirks sets.
static void extendTextCodecMaps() WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    // Names are registered before factories for each codec family, because
    // addToTextCodecMap() looks the name up in textEncodingNameMap.
    TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecReplacement::registerCodecs(addToTextCodecMap);

    TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecICU::registerCodecs(addToTextCodecMap);

    TextCodecCJK::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecCJK::registerCodecs(addToTextCodecMap);

    TextCodecSingleByte::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecSingleByte::registerCodecs(addToTextCodecMap);

    // Both steps below read the fully populated name map.
    pruneBlocklistedCodecs();
    buildQuirksSets();
}
|
||||
|
||||
std::unique_ptr<TextCodec> newTextCodec(const TextEncoding& encoding)
|
||||
{
|
||||
Locker locker { encodingRegistryLock };
|
||||
|
||||
ASSERT(textCodecMap);
|
||||
if (!encoding.isValid()) {
|
||||
return nullptr;
|
||||
}
|
||||
auto result = textCodecMap->find(encoding.name());
|
||||
if (result == textCodecMap->end()) {
|
||||
return nullptr;
|
||||
}
|
||||
if (!result->value) {
|
||||
// RELEASE_LOG_ERROR(TextEncoding, "Codec for encoding %" PUBLIC_LOG_STRING " is null. Will default to UTF-8", encoding.name().characters());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return result->value();
|
||||
}
|
||||
|
||||
// Looks |name| up in the encoding name map under encodingRegistryLock.
// The base maps are built on first use; the extended maps are built the
// first time a lookup misses, after which the lookup is retried once.
// Returns a null literal for an empty or unknown name.
static ASCIILiteral atomCanonicalTextEncodingName(std::span<const LChar> name)
{
    if (name.empty())
        return {};

    Locker locker { encodingRegistryLock };

    if (!textEncodingNameMap)
        buildBaseTextCodecMaps();

    if (ASCIILiteral knownName = textEncodingNameMap->get<HashTranslatorTextEncodingName>(name))
        return knownName;

    if (!didExtendTextCodecMaps) {
        extendTextCodecMaps();
        didExtendTextCodecMaps = true;
        return textEncodingNameMap->get<HashTranslatorTextEncodingName>(name);
    }

    // The extended maps were already consulted by the lookup above.
    return {};
}
|
||||
|
||||
// 16-bit overload: copies |characters| into an LChar buffer and forwards to
// the 8-bit lookup. Inputs longer than maxEncodingNameLength yield a null
// literal.
static ASCIILiteral atomCanonicalTextEncodingName(std::span<const UChar> characters)
{
    const size_t length = characters.size();
    if (length > maxEncodingNameLength)
        return {};

    std::array<LChar, maxEncodingNameLength> narrowed;
    size_t index = 0;
    for (UChar character : characters)
        narrowed[index++] = character;

    return atomCanonicalTextEncodingName(std::span { narrowed }.first(length));
}
|
||||
|
||||
// ASCIILiteral overload: forwards the literal's 8-bit span to the span-based
// lookup.
ASCIILiteral atomCanonicalTextEncodingName(ASCIILiteral name)
{
    auto characters = name.span8();
    return atomCanonicalTextEncodingName(characters);
}
|
||||
|
||||
// StringView overload: returns a null literal for empty or non-ASCII input,
// otherwise dispatches to the 8-bit or 16-bit lookup depending on how the
// view stores its characters.
ASCIILiteral atomCanonicalTextEncodingName(StringView alias)
{
    if (alias.isEmpty() || !alias.containsOnlyASCII())
        return {};

    return alias.is8Bit()
        ? atomCanonicalTextEncodingName(alias.span8())
        : atomCanonicalTextEncodingName(alias.span16());
}
|
||||
|
||||
bool noExtendedTextEncodingNameUsed()
|
||||
{
|
||||
// If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
|
||||
return !didExtendTextCodecMaps;
|
||||
}
|
||||
|
||||
// Returns the default encoding name for the system language. On Cocoa the
// name comes from CoreFoundation with two aliases remapped; elsewhere it is
// always "ISO-8859-1".
String defaultTextEncodingNameForSystemLanguage()
{
#if PLATFORM(COCOA)
    String encodingName = CFStringConvertEncodingToIANACharSetName(webDefaultCFStringEncoding());

    if (equalLettersIgnoringASCIICase(encodingName, "cp949"_s)) {
        // CFStringConvertEncodingToIANACharSetName() returns cp949 for kTextEncodingDOSKorean AKA "extended EUC-KR" AKA windows-949.
        // ICU uses this name for a different encoding, so we need to change the name to a value that actually gives us windows-949.
        // In addition, this value must match what is used in Safari, see <rdar://problem/5579292>.
        // On some OS versions, the result is CP949 (uppercase).
        encodingName = "ks_c_5601-1987"_s;
    } else if (equalLettersIgnoringASCIICase(encodingName, "cp874"_s)) {
        // CFStringConvertEncodingToIANACharSetName() returns cp874 for kTextEncodingDOSThai, AKA windows-874.
        // Since "cp874" alias is not standard (https://encoding.spec.whatwg.org/#names-and-labels), map to
        // "dos-874" instead.
        encodingName = "dos-874"_s;
    }

    return encodingName;
#else
    return "ISO-8859-1"_s;
#endif
}
|
||||
|
||||
} // namespace PAL
|
||||
57
src/bun.js/bindings/webcore/TextEncodingRegistry.h
Normal file
57
src/bun.js/bindings/webcore/TextEncodingRegistry.h
Normal file
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (C) 2006-2017 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <wtf/Forward.h>
|
||||
|
||||
#if PLATFORM(COCOA)
|
||||
#include <CoreFoundation/CoreFoundation.h>
|
||||
#endif
|
||||
|
||||
namespace PAL {
|
||||
|
||||
class TextCodec;
|
||||
class TextEncoding;
|
||||
|
||||
// Use TextResourceDecoder::decode to decode resources, since it handles BOMs.
|
||||
// Use TextEncoding::encode to encode, since it takes care of normalization.
|
||||
std::unique_ptr<TextCodec> newTextCodec(const TextEncoding&);
|
||||
|
||||
// Only TextEncoding should use the following functions directly.
|
||||
ASCIILiteral atomCanonicalTextEncodingName(ASCIILiteral alias);
|
||||
ASCIILiteral atomCanonicalTextEncodingName(StringView);
|
||||
bool noExtendedTextEncodingNameUsed();
|
||||
bool isJapaneseEncoding(ASCIILiteral canonicalEncodingName);
|
||||
bool shouldShowBackslashAsCurrencySymbolIn(ASCIILiteral canonicalEncodingName);
|
||||
|
||||
String defaultTextEncodingNameForSystemLanguage();
|
||||
|
||||
#if PLATFORM(COCOA)
|
||||
CFStringEncoding webDefaultCFStringEncoding();
|
||||
#endif
|
||||
|
||||
} // namespace PAL
|
||||
43
src/bun.js/bindings/webcore/UnencodableHandling.h
Normal file
43
src/bun.js/bindings/webcore/UnencodableHandling.h
Normal file
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace PAL {
|
||||
|
||||
// Specifies what will happen when a character is encountered that is
|
||||
// not encodable in the character set.
|
||||
enum class UnencodableHandling: bool {
|
||||
// Encodes the character as an XML entity. For example, U+06DE
|
||||
// would be "&#1758;" (0x6DE = 1758 in decimal).
|
||||
Entities,
|
||||
|
||||
// Encodes the character as an entity as above, but escapes
|
||||
// non-alphanumeric characters. This is used in URLs.
|
||||
// For example, U+6DE would be "%26%231758%3B".
|
||||
URLEncodedEntities
|
||||
};
|
||||
|
||||
}
|
||||
63
src/bun.js/bindings/webcore/WebkitTextCodec.cpp
Normal file
63
src/bun.js/bindings/webcore/WebkitTextCodec.cpp
Normal file
@@ -0,0 +1,63 @@
|
||||
#include "root.h"
|
||||
|
||||
#include "TextCodec.h"
|
||||
#include "TextEncodingRegistry.h"
|
||||
#include "TextEncoding.h"
|
||||
#include "headers-handwritten.h"
|
||||
#include <JavaScriptCore/JSGlobalObject.h>
|
||||
namespace Bun {
|
||||
|
||||
using namespace PAL;
|
||||
using namespace WTF;
|
||||
|
||||
class WebKitTextCodec {
|
||||
WTF_MAKE_FAST_ALLOCATED;
|
||||
|
||||
public:
|
||||
std::unique_ptr<TextCodec> codec;
|
||||
TextEncoding encoding;
|
||||
|
||||
static WebKitTextCodec* create(std::span<const LChar> encodingLabel)
|
||||
{
|
||||
const auto encoding = TextEncoding(String(encodingLabel));
|
||||
auto codec = newTextCodec(encoding);
|
||||
if (codec) {
|
||||
return new WebKitTextCodec(WTFMove(codec), encoding);
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
// C ABI entry point: wraps the `len` LChar units at `ptr` in a span and forwards
// them to WebKitTextCodec::create. Returns whatever create() returns, i.e. a
// heap-allocated codec or nullptr when no codec could be created.
extern "C" WebKitTextCodec* WebKitTextCodec__create(const LChar* ptr, size_t len)
{
    return WebKitTextCodec::create(std::span<const LChar>(ptr, len));
}
|
||||
|
||||
// C ABI entry point: destroys a WebKitTextCodec that was allocated with `new`
// by WebKitTextCodec::create (reached through WebKitTextCodec__create).
// The pointer must not be used after this call.
extern "C" void WebKitTextCodec__deinit(WebKitTextCodec* codec)
|
||||
{
|
||||
delete codec;
|
||||
}
|
||||
|
||||
// C ABI entry point: decodes `input_len` bytes starting at `input_ptr` with the
// wrapped TextCodec and returns the decoded text via Bun::toStringRef().
//
// `flush` is forwarded to TextCodec::decode unchanged.
// `stopOnError` is an in/out parameter and must not be null:
//   - on entry, *stopOnError says whether decoding should stop at the first error;
//   - on return, *stopOnError holds the flag that TextCodec::decode received as
//     its fourth argument (reset to false before the call). NOTE(review): this
//     is presumably the codec's "saw error" out-parameter; confirm against
//     TextCodec.h.
extern "C" BunString WebKitTextCodec__decode(WebKitTextCodec* code, const uint8_t* input_ptr, size_t input_len, bool flush, bool* stopOnError)
{
    const std::span<const uint8_t> data = { input_ptr, input_len };

    // Read the caller's flag THROUGH the pointer. Converting the pointer itself
    // to bool yields true for every non-null pointer, which would force
    // stop-on-error regardless of what the caller asked for.
    const bool shouldStop = *stopOnError;

    // Reuse the same storage as the output flag; clear it before decoding.
    *stopOnError = false;

    auto str = code->codec->decode(data, flush, shouldStop, *stopOnError);
    return Bun::toStringRef(str);
}
|
||||
|
||||
// C ABI entry point: returns the name of the encoding stored alongside the
// codec, converted with Bun::toStringRef().
extern "C" BunString WebKitTextCodec__name(WebKitTextCodec* code)
{
    auto& storedEncoding = code->encoding;
    return Bun::toStringRef(storedEncoding.name());
}
|
||||
|
||||
// C ABI entry point: forwards to TextCodec::stripByteOrderMark() on the
// wrapped codec.
extern "C" void WebKitTextCodec__stripByteOrderMark(WebKitTextCodec* code)
{
    auto& wrappedCodec = *code->codec;
    wrappedCodec.stripByteOrderMark();
}
|
||||
|
||||
}
|
||||
2049
src/bun.js/bindings/webcore/unicode-ucnv.h
Normal file
2049
src/bun.js/bindings/webcore/unicode-ucnv.h
Normal file
File diff suppressed because it is too large
Load Diff
164
src/bun.js/bindings/webcore/unicode-ucnv_cb.h
Normal file
164
src/bun.js/bindings/webcore/unicode-ucnv_cb.h
Normal file
@@ -0,0 +1,164 @@
|
||||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2000-2004, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* ucnv_cb.h:
|
||||
* External APIs for the ICU's codeset conversion library
|
||||
* Helena Shih
|
||||
*
|
||||
* Modification History:
|
||||
*
|
||||
* Date Name Description
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C UConverter functions to aid the writers of callbacks
|
||||
*
|
||||
* <h2> Callback API for UConverter </h2>
|
||||
*
|
||||
* These functions are provided here for the convenience of the callback
|
||||
* writer. If you are just looking for callback functions to use, please
|
||||
* see ucnv_err.h. DO NOT call these functions directly when you are
|
||||
* working with converters, unless your code has been called as a callback
|
||||
* via ucnv_setFromUCallback or ucnv_setToUCallback !!
|
||||
*
|
||||
* A note about error codes and overflow. Unlike other ICU functions,
|
||||
* these functions do not expect the error status to be U_ZERO_ERROR.
|
||||
* Callbacks must be much more careful about their error codes.
|
||||
* The error codes used here are in/out parameters, which should be passed
|
||||
* back in the callback's error parameter.
|
||||
*
|
||||
* For example, if you call ucnv_cbFromUWriteBytes to write data out
|
||||
* to the output codepage, it may return U_BUFFER_OVERFLOW_ERROR if
|
||||
* the data did not fit in the target. But this isn't a failing error,
|
||||
* in fact, ucnv_cbFromUWriteBytes may be called AGAIN with the error
|
||||
* status still U_BUFFER_OVERFLOW_ERROR to attempt to write further bytes,
|
||||
* which will also go into the internal overflow buffers.
|
||||
*
|
||||
* Concerning offsets, the 'offset' parameters here are relative to the start
|
||||
* of SOURCE. For example, Suppose the string "ABCD" was being converted
|
||||
* from Unicode into a codepage which doesn't have a mapping for 'B'.
|
||||
* 'A' will be written out correctly, but
|
||||
* The FromU Callback will be called on an unassigned character for 'B'.
|
||||
* At this point, this is the state of the world:
|
||||
* Target: A [..] [points after A]
|
||||
* Source: A B [C] D [points to C - B has been consumed]
|
||||
* 0 1 2 3
|
||||
* codePoint = "B" [the unassigned codepoint]
|
||||
*
|
||||
* Now, suppose a callback wants to write the substitution character '?' to
|
||||
* the target. It calls ucnv_cbFromUWriteBytes() to write the ?.
|
||||
* It should pass ZERO as the offset, because the offset as far as the
|
||||
* callback is concerned is relative to the SOURCE pointer [which points
|
||||
* before 'C'.] If the callback goes into the args and consumes 'C' also,
|
||||
* it would call FromUWriteBytes with an offset of 1 (and advance the source
|
||||
* pointer).
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef UCNV_CB_H
|
||||
#define UCNV_CB_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_CONVERSION
|
||||
|
||||
#include "unicode-ucnv.h"
|
||||
#include "unicode-ucnv_err.h"
|
||||
|
||||
/**
|
||||
* ONLY used by FromU callback functions.
|
||||
* Writes out the specified byte output bytes to the target byte buffer or to converter internal buffers.
|
||||
*
|
||||
* @param args callback fromUnicode arguments
|
||||
* @param source source bytes to write
|
||||
* @param length length of bytes to write
|
||||
* @param offsetIndex the relative offset index from callback.
|
||||
* @param err error status. If <TT>U_BUFFER_OVERFLOW</TT> is returned, then U_BUFFER_OVERFLOW <STRONG>must</STRONG>
|
||||
* be returned to the user, because it means that not all data could be written into the target buffer, and some is
|
||||
* in the converter error buffer.
|
||||
* @see ucnv_cbFromUWriteSub
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
ucnv_cbFromUWriteBytes(UConverterFromUnicodeArgs* args,
|
||||
const char* source,
|
||||
int32_t length,
|
||||
int32_t offsetIndex,
|
||||
UErrorCode* err);
|
||||
|
||||
/**
|
||||
* ONLY used by FromU callback functions.
|
||||
* This function will write out the correct substitution character sequence
|
||||
* to the target.
|
||||
*
|
||||
* @param args callback fromUnicode arguments
|
||||
* @param offsetIndex the relative offset index from the current source pointer to be used
|
||||
* @param err error status. If <TT>U_BUFFER_OVERFLOW</TT> is returned, then U_BUFFER_OVERFLOW <STRONG>must</STRONG>
|
||||
* be returned to the user, because it means that not all data could be written into the target buffer, and some is
|
||||
* in the converter error buffer.
|
||||
* @see ucnv_cbFromUWriteBytes
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
ucnv_cbFromUWriteSub(UConverterFromUnicodeArgs* args,
|
||||
int32_t offsetIndex,
|
||||
UErrorCode* err);
|
||||
|
||||
/**
|
||||
* ONLY used by fromU callback functions.
|
||||
* This function will write out the error character(s) to the target UChar buffer.
|
||||
*
|
||||
* @param args callback fromUnicode arguments
|
||||
* @param source pointer to pointer to first UChar to write [on exit: 1 after last UChar processed]
|
||||
* @param sourceLimit pointer after last UChar to write
|
||||
* @param offsetIndex the relative offset index from callback which will be set
|
||||
* @param err error status <TT>U_BUFFER_OVERFLOW</TT>
|
||||
* @see ucnv_cbToUWriteSub
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 ucnv_cbFromUWriteUChars(UConverterFromUnicodeArgs* args,
|
||||
const UChar** source,
|
||||
const UChar* sourceLimit,
|
||||
int32_t offsetIndex,
|
||||
UErrorCode* err);
|
||||
|
||||
/**
|
||||
* ONLY used by ToU callback functions.
|
||||
* This function will write out the specified characters to the target
|
||||
* UChar buffer.
|
||||
*
|
||||
* @param args callback toUnicode arguments
|
||||
* @param source source string to write
|
||||
* @param length the length of source string
|
||||
* @param offsetIndex the relative offset index which will be written.
|
||||
* @param err error status <TT>U_BUFFER_OVERFLOW</TT>
|
||||
* @see ucnv_cbToUWriteSub
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 ucnv_cbToUWriteUChars(UConverterToUnicodeArgs* args,
|
||||
const UChar* source,
|
||||
int32_t length,
|
||||
int32_t offsetIndex,
|
||||
UErrorCode* err);
|
||||
|
||||
/**
|
||||
* ONLY used by ToU callback functions.
|
||||
* This function will write out the Unicode substitution character (U+FFFD).
|
||||
*
|
||||
* @param args callback toUnicode arguments
|
||||
* @param offsetIndex the relative offset index from callback.
|
||||
* @param err error status <TT>U_BUFFER_OVERFLOW</TT>
|
||||
* @see ucnv_cbToUWriteUChars
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 ucnv_cbToUWriteSub(UConverterToUnicodeArgs* args,
|
||||
int32_t offsetIndex,
|
||||
UErrorCode* err);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
465
src/bun.js/bindings/webcore/unicode-ucnv_err.h
Normal file
465
src/bun.js/bindings/webcore/unicode-ucnv_err.h
Normal file
@@ -0,0 +1,465 @@
|
||||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
*
|
||||
* ucnv_err.h:
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C UConverter predefined error callbacks
|
||||
*
|
||||
* <h2>Error Behaviour Functions</h2>
|
||||
* Defines some error behaviour functions called by ucnv_{from,to}Unicode
|
||||
* These are provided as part of ICU and many are stable, but they
|
||||
* can also be considered only as an example of what can be done with
|
||||
* callbacks. You may of course write your own.
|
||||
*
|
||||
* If you want to write your own, you may also find the functions from
|
||||
* ucnv_cb.h useful when writing your own callbacks.
|
||||
*
|
||||
* These functions, although public, should NEVER be called directly.
|
||||
* They should be used as parameters to the ucnv_setFromUCallback
|
||||
* and ucnv_setToUCallback functions, to set the behaviour of a converter
|
||||
* when it encounters ILLEGAL/UNMAPPED/INVALID sequences.
|
||||
*
|
||||
* usage example: 'STOP' doesn't need any context, but newContext
|
||||
* could be set to something other than 'NULL' if needed. The available
|
||||
* contexts in this header can modify the default behavior of the callback.
|
||||
*
|
||||
* \code
|
||||
* UErrorCode err = U_ZERO_ERROR;
|
||||
* UConverter *myConverter = ucnv_open("ibm-949", &err);
|
||||
* const void *oldContext;
|
||||
* UConverterFromUCallback oldAction;
|
||||
*
|
||||
*
|
||||
* if (U_SUCCESS(err))
|
||||
* {
|
||||
* ucnv_setFromUCallBack(myConverter,
|
||||
* UCNV_FROM_U_CALLBACK_STOP,
|
||||
* NULL,
|
||||
* &oldAction,
|
||||
* &oldContext,
|
||||
* &err);
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
* The code above tells "myConverter" to stop when it encounters an
|
||||
* ILLEGAL/TRUNCATED/INVALID sequences when it is used to convert from
|
||||
* Unicode -> Codepage. The behavior from Codepage to Unicode is not changed,
|
||||
* and ucnv_setToUCallBack would need to be called in order to change
|
||||
* that behavior too.
|
||||
*
|
||||
* Here is an example with a context:
|
||||
*
|
||||
* \code
|
||||
* UErrorCode err = U_ZERO_ERROR;
|
||||
* UConverter *myConverter = ucnv_open("ibm-949", &err);
|
||||
* const void *oldContext;
|
||||
* UConverterFromUCallback oldAction;
|
||||
*
|
||||
*
|
||||
* if (U_SUCCESS(err))
|
||||
* {
|
||||
* ucnv_setToUCallBack(myConverter,
|
||||
* UCNV_TO_U_CALLBACK_SUBSTITUTE,
|
||||
* UCNV_SUB_STOP_ON_ILLEGAL,
|
||||
* &oldAction,
|
||||
* &oldContext,
|
||||
* &err);
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
* The code above tells "myConverter" to stop when it encounters an
|
||||
* ILLEGAL/TRUNCATED/INVALID sequences when it is used to convert from
|
||||
* Codepage -> Unicode. Any unmapped and legal characters will be
|
||||
* substituted to be the default substitution character.
|
||||
*/
|
||||
|
||||
#ifndef UCNV_ERR_H
|
||||
#define UCNV_ERR_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_CONVERSION
|
||||
|
||||
/** Forward declaring the UConverter structure. @stable ICU 2.0 */
|
||||
struct UConverter;
|
||||
|
||||
/** @stable ICU 2.0 */
|
||||
typedef struct UConverter UConverter;
|
||||
|
||||
/**
|
||||
* FROM_U, TO_U context options for sub callback
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
#define UCNV_SUB_STOP_ON_ILLEGAL "i"
|
||||
|
||||
/**
|
||||
* FROM_U, TO_U context options for skip callback
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
#define UCNV_SKIP_STOP_ON_ILLEGAL "i"
|
||||
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX)
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
#define UCNV_ESCAPE_ICU NULL
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX)
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
#define UCNV_ESCAPE_JAVA "J"
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
|
||||
* TO_U_CALLBACK_ESCAPE option to escape the character value according to C (\\xXXXX)
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
#define UCNV_ESCAPE_C "C"
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly
|
||||
* TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
#define UCNV_ESCAPE_XML_DEC "D"
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly
|
||||
* TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
#define UCNV_ESCAPE_XML_HEX "X"
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
#define UCNV_ESCAPE_UNICODE "U"
|
||||
|
||||
/**
|
||||
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to CSS2 conventions (\\HH..H<space>, that is,
|
||||
* a backslash, 1..6 hex digits, and a space)
|
||||
* @stable ICU 4.0
|
||||
*/
|
||||
#define UCNV_ESCAPE_CSS2 "S"
|
||||
|
||||
/**
|
||||
* The process condition code to be used with the callbacks.
|
||||
* Codes which are greater than UCNV_IRREGULAR should be
|
||||
* passed on to any chained callbacks.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
typedef enum {
|
||||
UCNV_UNASSIGNED = 0, /**< The code point is unassigned.
|
||||
The error code U_INVALID_CHAR_FOUND will be set. */
|
||||
UCNV_ILLEGAL = 1, /**< The code point is illegal. For example,
|
||||
\\x81\\x2E is illegal in SJIS because \\x2E
|
||||
is not a valid trail byte for the \\x81
|
||||
lead byte.
|
||||
Also, starting with Unicode 3.0.1, non-shortest byte sequences
|
||||
in UTF-8 (like \\xC1\\xA1 instead of \\x61 for U+0061)
|
||||
are also illegal, not just irregular.
|
||||
The error code U_ILLEGAL_CHAR_FOUND will be set. */
|
||||
UCNV_IRREGULAR = 2, /**< The codepoint is not a regular sequence in
|
||||
the encoding. For example, \\xED\\xA0\\x80..\\xED\\xBF\\xBF
|
||||
are irregular UTF-8 byte sequences for single surrogate
|
||||
code points.
|
||||
The error code U_INVALID_CHAR_FOUND will be set. */
|
||||
UCNV_RESET = 3, /**< The callback is called with this reason when a
|
||||
'reset' has occurred. Callback should reset all
|
||||
state. */
|
||||
UCNV_CLOSE = 4, /**< Called when the converter is closed. The
|
||||
callback should release any allocated memory.*/
|
||||
UCNV_CLONE = 5 /**< Called when ucnv_safeClone() is called on the
|
||||
converter. the pointer available as the
|
||||
'context' is an alias to the original converters'
|
||||
context pointer. If the context must be owned
|
||||
by the new converter, the callback must clone
|
||||
the data and call ucnv_setFromUCallback
|
||||
(or setToUCallback) with the correct pointer.
|
||||
@stable ICU 2.2
|
||||
*/
|
||||
} UConverterCallbackReason;
|
||||
|
||||
|
||||
/**
|
||||
* The structure for the fromUnicode callback function parameter.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
typedef struct {
|
||||
uint16_t size; /**< The size of this struct. @stable ICU 2.0 */
|
||||
UBool flush; /**< The internal state of converter will be reset and data flushed if set to true. @stable ICU 2.0 */
|
||||
UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */
|
||||
const UChar *source; /**< Pointer to the source buffer. @stable ICU 2.0 */
|
||||
const UChar *sourceLimit; /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0 */
|
||||
char *target; /**< Pointer to the target buffer. @stable ICU 2.0 */
|
||||
const char *targetLimit; /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0 */
|
||||
int32_t *offsets; /**< Pointer to the buffer that receives the offsets. *offset = blah ; offset++;. @stable ICU 2.0 */
|
||||
} UConverterFromUnicodeArgs;
|
||||
|
||||
|
||||
/**
|
||||
* The structure for the toUnicode callback function parameter.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
typedef struct {
|
||||
uint16_t size; /**< The size of this struct @stable ICU 2.0 */
|
||||
UBool flush; /**< The internal state of converter will be reset and data flushed if set to true. @stable ICU 2.0 */
|
||||
UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */
|
||||
const char *source; /**< Pointer to the source buffer. @stable ICU 2.0 */
|
||||
const char *sourceLimit; /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0 */
|
||||
UChar *target; /**< Pointer to the target buffer. @stable ICU 2.0 */
|
||||
const UChar *targetLimit; /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0 */
|
||||
int32_t *offsets; /**< Pointer to the buffer that receives the offsets. *offset = blah ; offset++;. @stable ICU 2.0 */
|
||||
} UConverterToUnicodeArgs;
|
||||
|
||||
|
||||
/**
|
||||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This From Unicode callback STOPS at the ILLEGAL_SEQUENCE,
|
||||
* returning the error code back to the caller immediately.
|
||||
*
|
||||
* @param context Pointer to the callback's private data
|
||||
* @param fromUArgs Information about the conversion in progress
|
||||
* @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
|
||||
* @param length Size (in bytes) of the concerned codepage sequence
|
||||
* @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint.
|
||||
* @param reason Defines the reason the callback was invoked
|
||||
* @param err This should always be set to a failure status prior to calling.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_STOP (
|
||||
const void *context,
|
||||
UConverterFromUnicodeArgs *fromUArgs,
|
||||
const UChar* codeUnits,
|
||||
int32_t length,
|
||||
UChar32 codePoint,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode * err);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This To Unicode callback STOPS at the ILLEGAL_SEQUENCE,
|
||||
* returning the error code back to the caller immediately.
|
||||
*
|
||||
* @param context Pointer to the callback's private data
|
||||
* @param toUArgs Information about the conversion in progress
|
||||
* @param codeUnits Points to 'length' bytes of the concerned codepage sequence
|
||||
* @param length Size (in bytes) of the concerned codepage sequence
|
||||
* @param reason Defines the reason the callback was invoked
|
||||
* @param err This should always be set to a failure status prior to calling.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP (
|
||||
const void *context,
|
||||
UConverterToUnicodeArgs *toUArgs,
|
||||
const char* codeUnits,
|
||||
int32_t length,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode * err);
|
||||
|
||||
/**
|
||||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This From Unicode callback skips any ILLEGAL_SEQUENCE, or
|
||||
* skips only UNASSIGNED_SEQUENCE depending on the context parameter
|
||||
* simply ignoring those characters.
|
||||
*
|
||||
* @param context The function currently recognizes the callback options:
|
||||
* UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
|
||||
* returning the error code back to the caller immediately.
|
||||
* NULL: Skips any ILLEGAL_SEQUENCE
|
||||
* @param fromUArgs Information about the conversion in progress
|
||||
* @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
|
||||
* @param length Size (in bytes) of the concerned codepage sequence
|
||||
* @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint.
|
||||
* @param reason Defines the reason the callback was invoked
|
||||
* @param err Return value will be set to success if the callback was handled,
|
||||
* otherwise this value will be set to a failure status.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP (
|
||||
const void *context,
|
||||
UConverterFromUnicodeArgs *fromUArgs,
|
||||
const UChar* codeUnits,
|
||||
int32_t length,
|
||||
UChar32 codePoint,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode * err);
|
||||
|
||||
/**
|
||||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This From Unicode callback will Substitute the ILLEGAL SEQUENCE, or
|
||||
* UNASSIGNED_SEQUENCE depending on context parameter, with the
|
||||
* current substitution string for the converter. This is the default
|
||||
* callback.
|
||||
*
|
||||
* @param context The function currently recognizes the callback options:
|
||||
* UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
|
||||
* returning the error code back to the caller immediately.
|
||||
* NULL: Substitutes any ILLEGAL_SEQUENCE
|
||||
* @param fromUArgs Information about the conversion in progress
|
||||
* @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
|
||||
* @param length Size (in bytes) of the concerned codepage sequence
|
||||
* @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint.
|
||||
* @param reason Defines the reason the callback was invoked
|
||||
* @param err Return value will be set to success if the callback was handled,
|
||||
* otherwise this value will be set to a failure status.
|
||||
* @see ucnv_setSubstChars
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
|
||||
const void *context,
|
||||
UConverterFromUnicodeArgs *fromUArgs,
|
||||
const UChar* codeUnits,
|
||||
int32_t length,
|
||||
UChar32 codePoint,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode * err);
|
||||
|
||||
/**
|
||||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This From Unicode callback will Substitute the ILLEGAL SEQUENCE with the
|
||||
* hexadecimal representation of the illegal codepoints
|
||||
*
|
||||
* @param context The function currently recognizes the callback options:
|
||||
* <ul>
|
||||
* <li>UCNV_ESCAPE_ICU: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
|
||||
* representation in the format %UXXXX, e.g. "%uFFFE%u00AC%uC8FE").
|
||||
* In the Event the converter doesn't support the characters {%,U}[A-F][0-9],
|
||||
* it will substitute the illegal sequence with the substitution characters.
|
||||
* Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as
|
||||
* %UD84D%UDC56</li>
|
||||
* <li>UCNV_ESCAPE_JAVA: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
|
||||
* representation in the format \\uXXXX, e.g. "\\uFFFE\\u00AC\\uC8FE").
|
||||
* In the Event the converter doesn't support the characters {\,u}[A-F][0-9],
|
||||
* it will substitute the illegal sequence with the substitution characters.
|
||||
* Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as
|
||||
* \\uD84D\\uDC56</li>
|
||||
* <li>UCNV_ESCAPE_C: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
|
||||
* representation in the format \\uXXXX, e.g. "\\uFFFE\\u00AC\\uC8FE").
|
||||
* In the Event the converter doesn't support the characters {\,u,U}[A-F][0-9],
|
||||
* it will substitute the illegal sequence with the substitution characters.
|
||||
* Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as
|
||||
* \\U00023456</li>
|
||||
* <li>UCNV_ESCAPE_XML_DEC: Substitutes the ILLEGAL SEQUENCE with the decimal
|
||||
* representation in the format \htmlonly&#DDDDDDDD;, e.g. "&#65534;&#172;&#51454;")\endhtmlonly.
|
||||
* In the Event the converter doesn't support the characters {&,#}[0-9],
|
||||
* it will substitute the illegal sequence with the substitution characters.
|
||||
* Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as
|
||||
* &#144470; and Zero padding is ignored.</li>
|
||||
* <li>UCNV_ESCAPE_XML_HEX: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
|
||||
* representation in the format \htmlonly&#xXXXX; e.g. "&#xFFFE;&#x00AC;&#xC8FE;")\endhtmlonly.
|
||||
* In the Event the converter doesn't support the characters {&,#,x}[0-9],
|
||||
* it will substitute the illegal sequence with the substitution characters.
|
||||
* Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as
|
||||
* \htmlonly&#x23456;\endhtmlonly</li>
|
||||
* </ul>
|
||||
* @param fromUArgs Information about the conversion in progress
|
||||
* @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
|
||||
* @param length Size (in bytes) of the concerned codepage sequence
|
||||
* @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint.
|
||||
* @param reason Defines the reason the callback was invoked
|
||||
* @param err Return value will be set to success if the callback was handled,
|
||||
* otherwise this value will be set to a failure status.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_ESCAPE (
|
||||
const void *context,
|
||||
UConverterFromUnicodeArgs *fromUArgs,
|
||||
const UChar* codeUnits,
|
||||
int32_t length,
|
||||
UChar32 codePoint,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode * err);
|
||||
|
||||
|
||||
/**
|
||||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This To Unicode callback skips any ILLEGAL_SEQUENCE, or
|
||||
* skips only UNASSIGNED_SEQUENCE depending on the context parameter
|
||||
* simply ignoring those characters.
|
||||
*
|
||||
* @param context The function currently recognizes the callback options:
|
||||
* UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
|
||||
* returning the error code back to the caller immediately.
|
||||
* NULL: Skips any ILLEGAL_SEQUENCE
|
||||
* @param toUArgs Information about the conversion in progress
|
||||
* @param codeUnits Points to 'length' bytes of the concerned codepage sequence
|
||||
* @param length Size (in bytes) of the concerned codepage sequence
|
||||
* @param reason Defines the reason the callback was invoked
|
||||
* @param err Return value will be set to success if the callback was handled,
|
||||
* otherwise this value will be set to a failure status.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP (
|
||||
const void *context,
|
||||
UConverterToUnicodeArgs *toUArgs,
|
||||
const char* codeUnits,
|
||||
int32_t length,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode * err);
|
||||
|
||||
/**
|
||||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This To Unicode callback will Substitute the ILLEGAL SEQUENCE,or
|
||||
* UNASSIGNED_SEQUENCE depending on context parameter, with the
|
||||
* Unicode substitution character, U+FFFD.
|
||||
*
|
||||
* @param context The function currently recognizes the callback options:
|
||||
* UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
|
||||
* returning the error code back to the caller immediately.
|
||||
* NULL: Substitutes any ILLEGAL_SEQUENCE
|
||||
* @param toUArgs Information about the conversion in progress
|
||||
* @param codeUnits Points to 'length' bytes of the concerned codepage sequence
|
||||
* @param length Size (in bytes) of the concerned codepage sequence
|
||||
* @param reason Defines the reason the callback was invoked
|
||||
* @param err Return value will be set to success if the callback was handled,
|
||||
* otherwise this value will be set to a failure status.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SUBSTITUTE (
|
||||
const void *context,
|
||||
UConverterToUnicodeArgs *toUArgs,
|
||||
const char* codeUnits,
|
||||
int32_t length,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode * err);
|
||||
|
||||
/**
|
||||
* DO NOT CALL THIS FUNCTION DIRECTLY!
|
||||
* This To Unicode callback will Substitute the ILLEGAL SEQUENCE with the
|
||||
* hexadecimal representation of the illegal bytes
|
||||
* (in the format %XNN, e.g. "%XFF%X0A%XC8%X03").
|
||||
*
|
||||
* @param context This function currently recognizes the callback options:
|
||||
* UCNV_ESCAPE_ICU, UCNV_ESCAPE_JAVA, UCNV_ESCAPE_C, UCNV_ESCAPE_XML_DEC,
|
||||
* UCNV_ESCAPE_XML_HEX and UCNV_ESCAPE_UNICODE.
|
||||
* @param toUArgs Information about the conversion in progress
|
||||
* @param codeUnits Points to 'length' bytes of the concerned codepage sequence
|
||||
* @param length Size (in bytes) of the concerned codepage sequence
|
||||
* @param reason Defines the reason the callback was invoked
|
||||
* @param err Return value will be set to success if the callback was handled,
|
||||
* otherwise this value will be set to a failure status.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
|
||||
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_ESCAPE (
|
||||
const void *context,
|
||||
UConverterToUnicodeArgs *toUArgs,
|
||||
const char* codeUnits,
|
||||
int32_t length,
|
||||
UConverterCallbackReason reason,
|
||||
UErrorCode * err);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/*UCNV_ERR_H*/
|
||||
410
src/bun.js/bindings/webcore/unicode-ucsdet.h
Normal file
410
src/bun.js/bindings/webcore/unicode-ucsdet.h
Normal file
@@ -0,0 +1,410 @@
|
||||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2005-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: ucsdet.h
|
||||
* encoding: UTF-8
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2005Aug04
|
||||
* created by: Andy Heninger
|
||||
*
|
||||
* ICU Character Set Detection, API for C
|
||||
*
|
||||
* Draft version 18 Oct 2005
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __UCSDET_H
|
||||
#define __UCSDET_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_CONVERSION
|
||||
|
||||
#include "unicode-uenum.h"
|
||||
|
||||
#if U_SHOW_CPLUSPLUS_API
|
||||
#include "unicode/localpointer.h"
|
||||
#endif // U_SHOW_CPLUSPLUS_API
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C API: Charset Detection API
|
||||
*
|
||||
* This API provides a facility for detecting the
|
||||
* charset or encoding of character data in an unknown text format.
|
||||
* The input data can be from an array of bytes.
|
||||
* <p>
|
||||
* Character set detection is at best an imprecise operation. The detection
|
||||
* process will attempt to identify the charset that best matches the characteristics
|
||||
* of the byte data, but the process is partly statistical in nature, and
|
||||
* the results can not be guaranteed to always be correct.
|
||||
* <p>
|
||||
* For best accuracy in charset detection, the input data should be primarily
|
||||
* in a single language, and a minimum of a few hundred bytes worth of plain text
|
||||
* in the language are needed. The detection process will attempt to
|
||||
* ignore html or xml style markup that could otherwise obscure the content.
|
||||
* <p>
|
||||
* An alternative to the ICU Charset Detector is the
|
||||
* Compact Encoding Detector, https://github.com/google/compact_enc_det.
|
||||
* It often gives more accurate results, especially with short input samples.
|
||||
*/
|
||||
|
||||
struct UCharsetDetector;
|
||||
/**
|
||||
* Structure representing a charset detector
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
typedef struct UCharsetDetector UCharsetDetector;
|
||||
|
||||
struct UCharsetMatch;
|
||||
/**
|
||||
* Opaque structure representing a match that was identified
|
||||
* from a charset detection operation.
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
typedef struct UCharsetMatch UCharsetMatch;
|
||||
|
||||
/**
|
||||
* Open a charset detector.
|
||||
*
|
||||
* @param status Any error conditions occurring during the open
|
||||
* operation are reported back in this variable.
|
||||
* @return the newly opened charset detector.
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
U_CAPI UCharsetDetector* U_EXPORT2
|
||||
ucsdet_open(UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Close a charset detector. All storage and any other resources
|
||||
* owned by this charset detector will be released. Failure to
|
||||
* close a charset detector when finished with it can result in
|
||||
* memory leaks in the application.
|
||||
*
|
||||
* @param ucsd The charset detector to be closed.
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
ucsdet_close(UCharsetDetector* ucsd);
|
||||
|
||||
#if U_SHOW_CPLUSPLUS_API
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* \class LocalUCharsetDetectorPointer
|
||||
* "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
|
||||
* For most methods see the LocalPointerBase base class.
|
||||
*
|
||||
* @see LocalPointerBase
|
||||
* @see LocalPointer
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Set the input byte data whose charset is to be detected.
|
||||
*
|
||||
* Ownership of the input text byte array remains with the caller.
|
||||
* The input string must not be altered or deleted until the charset
|
||||
* detector is either closed or reset to refer to different input text.
|
||||
*
|
||||
* @param ucsd the charset detector to be used.
|
||||
* @param textIn the input text of unknown encoding.
|
||||
* @param len the length of the input text, or -1 if the text
|
||||
* is NUL terminated.
|
||||
* @param status any error conditions are reported back in this variable.
|
||||
*
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
ucsdet_setText(UCharsetDetector* ucsd, const char* textIn, int32_t len, UErrorCode* status);
|
||||
|
||||
/** Set the declared encoding for charset detection.
|
||||
* The declared encoding of an input text is an encoding obtained
|
||||
* by the user from an http header or xml declaration or similar source that
|
||||
* can be provided as an additional hint to the charset detector.
|
||||
*
|
||||
* How and whether the declared encoding will be used during the
|
||||
* detection process is TBD.
|
||||
*
|
||||
* @param ucsd the charset detector to be used.
|
||||
* @param encoding an encoding for the current data obtained from
|
||||
* a header or declaration or other source outside
|
||||
* of the byte data itself.
|
||||
* @param length the length of the encoding name, or -1 if the name string
|
||||
* is NUL terminated.
|
||||
* @param status any error conditions are reported back in this variable.
|
||||
*
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
ucsdet_setDeclaredEncoding(UCharsetDetector* ucsd, const char* encoding, int32_t length, UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Return the charset that best matches the supplied input data.
|
||||
*
|
||||
* Note though, that because the detection
|
||||
* only looks at the start of the input data,
|
||||
* there is a possibility that the returned charset will fail to handle
|
||||
* the full set of input data.
|
||||
* <p>
|
||||
* The returned UCharsetMatch object is owned by the UCharsetDetector.
|
||||
* It will remain valid until the detector input is reset, or until
|
||||
* the detector is closed.
|
||||
* <p>
|
||||
* The function will fail if
|
||||
* <ul>
|
||||
* <li>no charset appears to match the data.</li>
|
||||
* <li>no input text has been provided</li>
|
||||
* </ul>
|
||||
*
|
||||
* @param ucsd the charset detector to be used.
|
||||
* @param status any error conditions are reported back in this variable.
|
||||
* @return a UCharsetMatch representing the best matching charset,
|
||||
* or NULL if no charset matches the byte data.
|
||||
*
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
U_CAPI const UCharsetMatch* U_EXPORT2
|
||||
ucsdet_detect(UCharsetDetector* ucsd, UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Find all charset matches that appear to be consistent with the input,
|
||||
* returning an array of results. The results are ordered with the
|
||||
* best quality match first.
|
||||
*
|
||||
* Because the detection only looks at a limited amount of the
|
||||
* input byte data, some of the returned charsets may fail to handle
|
||||
* all of the input data.
|
||||
* <p>
|
||||
* The returned UCharsetMatch objects are owned by the UCharsetDetector.
|
||||
* They will remain valid until the detector is closed or modified.
|
||||
*
|
||||
* <p>
|
||||
* Return an error if
|
||||
* <ul>
|
||||
* <li>no charsets appear to match the input data.</li>
|
||||
* <li>no input text has been provided</li>
|
||||
* </ul>
|
||||
*
|
||||
* @param ucsd the charset detector to be used.
|
||||
* @param matchesFound pointer to a variable that will be set to the
|
||||
* number of charsets identified that are consistent with
|
||||
* the input data. Output only.
|
||||
* @param status any error conditions are reported back in this variable.
|
||||
* @return A pointer to an array of pointers to UCharsetMatch objects.
|
||||
* This array, and the UCharsetMatch instances to which it refers,
|
||||
* are owned by the UCharsetDetector, and will remain valid until
|
||||
* the detector is closed or modified.
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
U_CAPI const UCharsetMatch** U_EXPORT2
|
||||
ucsdet_detectAll(UCharsetDetector* ucsd, int32_t* matchesFound, UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Get the name of the charset represented by a UCharsetMatch.
|
||||
*
|
||||
* The storage for the returned name string is owned by the
|
||||
* UCharsetMatch, and will remain valid while the UCharsetMatch
|
||||
* is valid.
|
||||
*
|
||||
* The name returned is suitable for use with the ICU conversion APIs.
|
||||
*
|
||||
* @param ucsm The charset match object.
|
||||
* @param status Any error conditions are reported back in this variable.
|
||||
* @return The name of the matching charset.
|
||||
*
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
U_CAPI const char* U_EXPORT2
|
||||
ucsdet_getName(const UCharsetMatch* ucsm, UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Get a confidence number for the quality of the match of the byte
|
||||
* data with the charset. Confidence numbers range from zero to 100,
|
||||
* with 100 representing complete confidence and zero representing
|
||||
* no confidence.
|
||||
*
|
||||
* The confidence values are somewhat arbitrary. They define an
|
||||
* ordering within the results for any single detection operation
|
||||
* but are not generally comparable between the results for different input.
|
||||
*
|
||||
* A confidence value of ten does have a general meaning - it is used
|
||||
* for charsets that can represent the input data, but for which there
|
||||
* is no other indication that suggests that the charset is the correct one.
|
||||
* Pure 7 bit ASCII data, for example, is compatible with a
|
||||
* great many charsets, most of which will appear as possible matches
|
||||
* with a confidence of 10.
|
||||
*
|
||||
* @param ucsm The charset match object.
|
||||
* @param status Any error conditions are reported back in this variable.
|
||||
* @return A confidence number for the charset match.
|
||||
*
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucsdet_getConfidence(const UCharsetMatch* ucsm, UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Get the RFC 3066 code for the language of the input data.
|
||||
*
|
||||
* The Charset Detection service is intended primarily for detecting
|
||||
* charsets, not language. For some, but not all, charsets, a language is
|
||||
* identified as a byproduct of the detection process, and that is what
|
||||
* is returned by this function.
|
||||
*
|
||||
* CAUTION:
|
||||
* 1. Language information is not available for input data encoded in
|
||||
* all charsets. In particular, no language is identified
|
||||
* for UTF-8 input data.
|
||||
*
|
||||
* 2. Closely related languages may sometimes be confused.
|
||||
*
|
||||
* If more accurate language detection is required, a linguistic
|
||||
* analysis package should be used.
|
||||
*
|
||||
* The storage for the returned name string is owned by the
|
||||
* UCharsetMatch, and will remain valid while the UCharsetMatch
|
||||
* is valid.
|
||||
*
|
||||
* @param ucsm The charset match object.
|
||||
* @param status Any error conditions are reported back in this variable.
|
||||
* @return The RFC 3066 code for the language of the input data, or
|
||||
* an empty string if the language could not be determined.
|
||||
*
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
U_CAPI const char* U_EXPORT2
|
||||
ucsdet_getLanguage(const UCharsetMatch* ucsm, UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Get the entire input text as a UChar string, placing it into
|
||||
* a caller-supplied buffer. A terminating
|
||||
* NUL character will be appended to the buffer if space is available.
|
||||
*
|
||||
* The number of UChars in the output string, not including the terminating
|
||||
* NUL, is returned.
|
||||
*
|
||||
* If the supplied buffer is smaller than required to hold the output,
|
||||
* the contents of the buffer are undefined. The full output string length
|
||||
* (in UChars) is returned as always, and can be used to allocate a buffer
|
||||
* of the correct size.
|
||||
*
|
||||
*
|
||||
* @param ucsm The charset match object.
|
||||
* @param buf A UChar buffer to be filled with the converted text data.
|
||||
* @param cap The capacity of the buffer in UChars.
|
||||
* @param status Any error conditions are reported back in this variable.
|
||||
* @return The number of UChars in the output string.
|
||||
*
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucsdet_getUChars(const UCharsetMatch* ucsm,
|
||||
UChar* buf, int32_t cap, UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Get an iterator over the set of all detectable charsets -
|
||||
* over the charsets that are known to the charset detection
|
||||
* service.
|
||||
*
|
||||
* The returned UEnumeration provides access to the names of
|
||||
* the charsets.
|
||||
*
|
||||
* <p>
|
||||
* The state of the Charset detector that is passed in does not
|
||||
* affect the result of this function, but requiring a valid, open
|
||||
* charset detector as a parameter ensures that the charset detection
|
||||
* service has been safely initialized and that the required detection
|
||||
* data is available.
|
||||
*
|
||||
* <p>
|
||||
* <b>Note:</b> Multiple different charset encodings in the same family may use
|
||||
* a single shared name in this implementation. For example, this method returns
|
||||
* an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
|
||||
* (Windows Latin 1). However, actual detection result could be "windows-1252"
|
||||
* when the input data matches Latin 1 code points with any points only available
|
||||
* in "windows-1252".
|
||||
*
|
||||
* @param ucsd a Charset detector.
|
||||
* @param status Any error conditions are reported back in this variable.
|
||||
* @return an iterator providing access to the detectable charset names.
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
U_CAPI UEnumeration* U_EXPORT2
|
||||
ucsdet_getAllDetectableCharsets(const UCharsetDetector* ucsd, UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Test whether input filtering is enabled for this charset detector.
|
||||
* Input filtering removes text that appears to be HTML or xml
|
||||
* markup from the input before applying the code page detection
|
||||
* heuristics.
|
||||
*
|
||||
* @param ucsd The charset detector to check.
|
||||
* @return true if filtering is enabled.
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucsdet_isInputFilterEnabled(const UCharsetDetector* ucsd);
|
||||
|
||||
/**
|
||||
* Enable filtering of input text. If filtering is enabled,
|
||||
* text within angle brackets ("<" and ">") will be removed
|
||||
* before detection, which will remove most HTML or xml markup.
|
||||
*
|
||||
* @param ucsd the charset detector to be modified.
|
||||
* @param filter <code>true</code> to enable input text filtering.
|
||||
* @return The previous setting.
|
||||
*
|
||||
* @stable ICU 3.6
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucsdet_enableInputFilter(UCharsetDetector* ucsd, UBool filter);
|
||||
|
||||
#ifndef U_HIDE_INTERNAL_API
|
||||
/**
|
||||
* Get an iterator over the set of detectable charsets -
|
||||
* over the charsets that are enabled by the specified charset detector.
|
||||
*
|
||||
* The returned UEnumeration provides access to the names of
|
||||
* the charsets.
|
||||
*
|
||||
* @param ucsd a Charset detector.
|
||||
* @param status Any error conditions are reported back in this variable.
|
||||
* @return an iterator providing access to the detectable charset names by
|
||||
* the specified charset detector.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UEnumeration* U_EXPORT2
|
||||
ucsdet_getDetectableCharsets(const UCharsetDetector* ucsd, UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Enable or disable individual charset encoding.
|
||||
* The name of the charset encoding must be included in the names returned by
|
||||
* {@link #ucsdet_getAllDetectableCharsets()}.
|
||||
*
|
||||
* @param ucsd a Charset detector.
|
||||
* @param encoding the name of the charset encoding.
|
||||
* @param enabled <code>true</code> to enable, or <code>false</code> to disable the
|
||||
* charset encoding.
|
||||
* @param status receives the return status. When the name of charset encoding
|
||||
* is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
ucsdet_setDetectableCharset(UCharsetDetector* ucsd, const char* encoding, UBool enabled, UErrorCode* status);
|
||||
#endif /* U_HIDE_INTERNAL_API */
|
||||
|
||||
#endif
|
||||
#endif /* __UCSDET_H */
|
||||
209
src/bun.js/bindings/webcore/unicode-uenum.h
Normal file
209
src/bun.js/bindings/webcore/unicode-uenum.h
Normal file
@@ -0,0 +1,209 @@
|
||||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: uenum.h
|
||||
* encoding: UTF-8
|
||||
* tab size: 8 (not used)
|
||||
* indentation:2
|
||||
*
|
||||
* created on: 2002jul08
|
||||
* created by: Vladimir Weinstein
|
||||
*/
|
||||
|
||||
#ifndef __UENUM_H
|
||||
#define __UENUM_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if U_SHOW_CPLUSPLUS_API
|
||||
#include "unicode/localpointer.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
class StringEnumeration;
|
||||
U_NAMESPACE_END
|
||||
#endif // U_SHOW_CPLUSPLUS_API
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C API: String Enumeration
|
||||
*/
|
||||
|
||||
/**
|
||||
* An enumeration object.
|
||||
* For usage in C programs.
|
||||
* @stable ICU 2.2
|
||||
*/
|
||||
struct UEnumeration;
|
||||
/** structure representing an enumeration object instance @stable ICU 2.2 */
|
||||
typedef struct UEnumeration UEnumeration;
|
||||
|
||||
/**
|
||||
* Disposes of resources in use by the iterator. If en is NULL,
|
||||
* does nothing. After this call, any char* or UChar* pointer
|
||||
* returned by uenum_unext() or uenum_next() is invalid.
|
||||
* @param en UEnumeration structure pointer
|
||||
* @stable ICU 2.2
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uenum_close(UEnumeration* en);
|
||||
|
||||
#if U_SHOW_CPLUSPLUS_API
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* \class LocalUEnumerationPointer
|
||||
* "Smart pointer" class, closes a UEnumeration via uenum_close().
|
||||
* For most methods see the LocalPointerBase base class.
|
||||
*
|
||||
* @see LocalPointerBase
|
||||
* @see LocalPointer
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
U_DEFINE_LOCAL_OPEN_POINTER(LocalUEnumerationPointer, UEnumeration, uenum_close);
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Returns the number of elements that the iterator traverses. If
|
||||
* the iterator is out-of-sync with its service, status is set to
|
||||
* U_ENUM_OUT_OF_SYNC_ERROR.
|
||||
* This is a convenience function. It can end up being very
|
||||
* expensive as all the items might have to be pre-fetched (depending
|
||||
* on the type of data being traversed). Use with caution and only
|
||||
* when necessary.
|
||||
* @param en UEnumeration structure pointer
|
||||
* @param status error code, can be U_ENUM_OUT_OF_SYNC_ERROR if the
|
||||
* iterator is out of sync.
|
||||
* @return number of elements in the iterator
|
||||
* @stable ICU 2.2
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uenum_count(UEnumeration* en, UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Returns the next element in the iterator's list. If there are
|
||||
* no more elements, returns NULL. If the iterator is out-of-sync
|
||||
* with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and
|
||||
* NULL is returned. If the native service string is a char* string,
|
||||
* it is converted to UChar* with the invariant converter.
|
||||
* The result is terminated by (UChar)0.
|
||||
* @param en the iterator object
|
||||
* @param resultLength pointer to receive the length of the result
|
||||
* (not including the terminating \\0).
|
||||
* If the pointer is NULL it is ignored.
|
||||
* @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
|
||||
* the iterator is out of sync with its service.
|
||||
* @return a pointer to the string. The string will be
|
||||
* zero-terminated. The return pointer is owned by this iterator
|
||||
* and must not be deleted by the caller. The pointer is valid
|
||||
* until the next call to any uenum_... method, including
|
||||
* uenum_next() or uenum_unext(). When all strings have been
|
||||
* traversed, returns NULL.
|
||||
* @stable ICU 2.2
|
||||
*/
|
||||
U_CAPI const UChar* U_EXPORT2
|
||||
uenum_unext(UEnumeration* en,
|
||||
int32_t* resultLength,
|
||||
UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Returns the next element in the iterator's list. If there are
|
||||
* no more elements, returns NULL. If the iterator is out-of-sync
|
||||
* with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and
|
||||
* NULL is returned. If the native service string is a UChar*
|
||||
* string, it is converted to char* with the invariant converter.
|
||||
* The result is terminated by (char)0. If the conversion fails
|
||||
* (because a character cannot be converted) then status is set to
|
||||
* U_INVARIANT_CONVERSION_ERROR and the return value is undefined
|
||||
* (but non-NULL).
|
||||
* @param en the iterator object
|
||||
* @param resultLength pointer to receive the length of the result
|
||||
* (not including the terminating \\0).
|
||||
* If the pointer is NULL it is ignored.
|
||||
* @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
|
||||
* the iterator is out of sync with its service. Set to
|
||||
* U_INVARIANT_CONVERSION_ERROR if the underlying native string is
|
||||
* UChar* and conversion to char* with the invariant converter
|
||||
* fails. This error pertains only to current string, so iteration
|
||||
* might be able to continue successfully.
|
||||
* @return a pointer to the string. The string will be
|
||||
* zero-terminated. The return pointer is owned by this iterator
|
||||
* and must not be deleted by the caller. The pointer is valid
|
||||
* until the next call to any uenum_... method, including
|
||||
* uenum_next() or uenum_unext(). When all strings have been
|
||||
* traversed, returns NULL.
|
||||
* @stable ICU 2.2
|
||||
*/
|
||||
U_CAPI const char* U_EXPORT2
|
||||
uenum_next(UEnumeration* en,
|
||||
int32_t* resultLength,
|
||||
UErrorCode* status);
|
||||
|
||||
/**
|
||||
* Resets the iterator to the current list of service IDs. This
|
||||
* re-establishes sync with the service and rewinds the iterator
|
||||
* to start at the first element.
|
||||
* @param en the iterator object
|
||||
* @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
|
||||
* the iterator is out of sync with its service.
|
||||
* @stable ICU 2.2
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uenum_reset(UEnumeration* en, UErrorCode* status);
|
||||
|
||||
#if U_SHOW_CPLUSPLUS_API
|
||||
|
||||
/**
|
||||
* Given a StringEnumeration, wrap it in a UEnumeration. The
|
||||
* StringEnumeration is adopted; after this call, the caller must not
|
||||
* delete it (regardless of error status).
|
||||
* @param adopted the C++ StringEnumeration to be wrapped in a UEnumeration.
|
||||
* @param ec the error code.
|
||||
* @return a UEnumeration wrapping the adopted StringEnumeration.
|
||||
* @stable ICU 4.2
|
||||
*/
|
||||
U_CAPI UEnumeration* U_EXPORT2
|
||||
uenum_openFromStringEnumeration(icu::StringEnumeration* adopted, UErrorCode* ec);
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Given an array of const UChar* strings, return a UEnumeration. String pointers from 0..count-1 must not be null.
|
||||
* Do not free or modify either the string array or the characters it points to until this object has been destroyed with uenum_close.
|
||||
* \snippet test/cintltst/uenumtst.c uenum_openUCharStringsEnumeration
|
||||
* @param strings array of const UChar* strings (each null terminated). All storage is owned by the caller.
|
||||
* @param count length of the array
|
||||
* @param ec error code
|
||||
* @return the new UEnumeration object. Caller is responsible for calling uenum_close to free memory.
|
||||
* @see uenum_close
|
||||
* @stable ICU 50
|
||||
*/
|
||||
U_CAPI UEnumeration* U_EXPORT2
|
||||
uenum_openUCharStringsEnumeration(const UChar* const strings[], int32_t count,
|
||||
UErrorCode* ec);
|
||||
|
||||
/**
|
||||
* Given an array of const char* strings (invariant chars only), return a UEnumeration. String pointers from 0..count-1 must not be null.
|
||||
* Do not free or modify either the string array or the characters it points to until this object has been destroyed with uenum_close.
|
||||
* \snippet test/cintltst/uenumtst.c uenum_openCharStringsEnumeration
|
||||
* @param strings array of char* strings (each null terminated). All storage is owned by the caller.
|
||||
* @param count length of the array
|
||||
* @param ec error code
|
||||
* @return the new UEnumeration object. Caller is responsible for calling uenum_close to free memory.
|
||||
* @see uenum_close
|
||||
* @stable ICU 50
|
||||
*/
|
||||
U_CAPI UEnumeration* U_EXPORT2
|
||||
uenum_openCharStringsEnumeration(const char* const strings[], int32_t count,
|
||||
UErrorCode* ec);
|
||||
|
||||
#endif
|
||||
@@ -29,10 +29,6 @@
|
||||
#include <wtf/Vector.h>
|
||||
#include <wtf/text/WTFString.h>
|
||||
|
||||
#ifndef PAL_EXPORT
|
||||
#define PAL_EXPORT
|
||||
#endif
|
||||
|
||||
namespace PAL {
|
||||
|
||||
struct CryptoDigestContext;
|
||||
@@ -48,12 +44,12 @@ public:
|
||||
SHA_384,
|
||||
SHA_512,
|
||||
};
|
||||
PAL_EXPORT static std::unique_ptr<CryptoDigest> create(Algorithm);
|
||||
PAL_EXPORT ~CryptoDigest();
|
||||
static std::unique_ptr<CryptoDigest> create(Algorithm);
|
||||
~CryptoDigest();
|
||||
|
||||
PAL_EXPORT void addBytes(const void* input, size_t length);
|
||||
PAL_EXPORT Vector<uint8_t> computeHash();
|
||||
PAL_EXPORT String toHexString();
|
||||
void addBytes(const void* input, size_t length);
|
||||
Vector<uint8_t> computeHash();
|
||||
String toHexString();
|
||||
|
||||
private:
|
||||
CryptoDigest();
|
||||
|
||||
@@ -250,157 +250,50 @@ comptime {
|
||||
|
||||
/// https://encoding.spec.whatwg.org/encodings.json
|
||||
pub const EncodingLabel = enum {
|
||||
@"UTF-8",
|
||||
IBM866,
|
||||
@"ISO-8859-2",
|
||||
@"ISO-8859-3",
|
||||
@"ISO-8859-4",
|
||||
@"ISO-8859-5",
|
||||
@"ISO-8859-6",
|
||||
@"ISO-8859-7",
|
||||
@"ISO-8859-8",
|
||||
@"ISO-8859-8-I",
|
||||
@"ISO-8859-10",
|
||||
@"ISO-8859-13",
|
||||
@"ISO-8859-14",
|
||||
@"ISO-8859-15",
|
||||
@"ISO-8859-16",
|
||||
@"KOI8-R",
|
||||
@"KOI8-U",
|
||||
macintosh,
|
||||
@"windows-874",
|
||||
@"windows-1250",
|
||||
@"windows-1251",
|
||||
/// Also known as
|
||||
/// - ASCII
|
||||
/// - latin1
|
||||
@"utf-8",
|
||||
@"windows-1252",
|
||||
@"windows-1253",
|
||||
@"windows-1254",
|
||||
@"windows-1255",
|
||||
@"windows-1256",
|
||||
@"windows-1257",
|
||||
@"windows-1258",
|
||||
@"x-mac-cyrillic",
|
||||
Big5,
|
||||
@"EUC-JP",
|
||||
@"ISO-2022-JP",
|
||||
Shift_JIS,
|
||||
@"EUC-KR",
|
||||
@"UTF-16BE",
|
||||
@"UTF-16LE",
|
||||
@"x-user-defined",
|
||||
@"utf-16be",
|
||||
@"utf-16le",
|
||||
|
||||
pub const Map = std.enums.EnumMap(EncodingLabel, string);
|
||||
pub const label: Map = brk: {
|
||||
var map = Map.initFull("");
|
||||
map.put(EncodingLabel.@"UTF-8", "utf-8");
|
||||
map.put(EncodingLabel.@"UTF-16LE", "utf-16le");
|
||||
map.put(EncodingLabel.@"windows-1252", "windows-1252");
|
||||
break :brk map;
|
||||
};
|
||||
|
||||
const utf16_names = [_]string{
|
||||
"ucs-2",
|
||||
"utf-16",
|
||||
"unicode",
|
||||
"utf-16le",
|
||||
"csunicode",
|
||||
"unicodefeff",
|
||||
"iso-10646-ucs-2",
|
||||
};
|
||||
|
||||
const utf8_names = [_]string{
|
||||
"utf8",
|
||||
"utf-8",
|
||||
"unicode11utf8",
|
||||
"unicode20utf8",
|
||||
"x-unicode20utf8",
|
||||
"unicode-1-1-utf-8",
|
||||
};
|
||||
|
||||
const latin1_names = [_]string{
|
||||
"l1",
|
||||
"ascii",
|
||||
"cp819",
|
||||
"cp1252",
|
||||
"ibm819",
|
||||
"latin1",
|
||||
"iso88591",
|
||||
"us-ascii",
|
||||
"x-cp1252",
|
||||
"iso8859-1",
|
||||
"iso_8859-1",
|
||||
"iso-8859-1",
|
||||
"iso-ir-100",
|
||||
"csisolatin1",
|
||||
"windows-1252",
|
||||
"ansi_x3.4-1968",
|
||||
"iso_8859-1:1987",
|
||||
};
|
||||
|
||||
pub const latin1 = EncodingLabel.@"windows-1252";
|
||||
|
||||
const map = bun.ComptimeStringMap(EncodingLabel, .{
|
||||
.{ "ansi_x3.4-1968", latin1 },
|
||||
.{ "ascii", latin1 },
|
||||
.{ "cp1252", latin1 },
|
||||
.{ "cp819", latin1 },
|
||||
.{ "csisolatin1", latin1 },
|
||||
.{ "csunicode", EncodingLabel.@"utf-16le" },
|
||||
.{ "ibm819", latin1 },
|
||||
.{ "iso_8859-1:1987", latin1 },
|
||||
.{ "iso_8859-1", latin1 },
|
||||
.{ "iso-10646-ucs-2", EncodingLabel.@"utf-16le" },
|
||||
.{ "iso-8859-1", latin1 },
|
||||
.{ "iso-ir-100", latin1 },
|
||||
.{ "iso8859-1", latin1 },
|
||||
.{ "iso88591", latin1 },
|
||||
.{ "l1", latin1 },
|
||||
.{ "latin1", latin1 },
|
||||
.{ "ucs-2", EncodingLabel.@"utf-16le" },
|
||||
.{ "unicode-1-1-utf-8", EncodingLabel.@"utf-8" },
|
||||
.{ "unicode", EncodingLabel.@"utf-16le" },
|
||||
.{ "unicode11utf8", EncodingLabel.@"utf-8" },
|
||||
.{ "unicode20utf8", EncodingLabel.@"utf-8" },
|
||||
.{ "unicodefeff", EncodingLabel.@"utf-16le" },
|
||||
.{ "us-ascii", latin1 },
|
||||
.{ "utf-16", EncodingLabel.@"utf-16le" },
|
||||
.{ "utf-16be", EncodingLabel.@"utf-16be" },
|
||||
.{ "utf-16le", EncodingLabel.@"utf-16le" },
|
||||
.{ "utf-8", EncodingLabel.@"utf-8" },
|
||||
.{ "utf8", EncodingLabel.@"utf-8" },
|
||||
});
|
||||
|
||||
pub fn which(input_: string) ?EncodingLabel {
|
||||
const input = strings.trim(input_, " \t\r\n");
|
||||
const ExactMatcher = strings.ExactSizeMatcher;
|
||||
const Eight = ExactMatcher(8);
|
||||
const Sixteen = ExactMatcher(16);
|
||||
return switch (input.len) {
|
||||
1, 0 => null,
|
||||
2...8 => switch (Eight.matchLower(input)) {
|
||||
Eight.case("l1"),
|
||||
Eight.case("ascii"),
|
||||
Eight.case("cp819"),
|
||||
Eight.case("cp1252"),
|
||||
Eight.case("ibm819"),
|
||||
Eight.case("latin1"),
|
||||
Eight.case("iso88591"),
|
||||
Eight.case("us-ascii"),
|
||||
Eight.case("x-cp1252"),
|
||||
=> EncodingLabel.latin1,
|
||||
|
||||
Eight.case("ucs-2"),
|
||||
Eight.case("utf-16"),
|
||||
Eight.case("unicode"),
|
||||
Eight.case("utf-16le"),
|
||||
=> EncodingLabel.@"UTF-16LE",
|
||||
|
||||
Eight.case("utf-16be"),
|
||||
=> EncodingLabel.@"UTF-16BE",
|
||||
|
||||
Eight.case("utf8"), Eight.case("utf-8") => EncodingLabel.@"UTF-8",
|
||||
else => null,
|
||||
},
|
||||
|
||||
9...16 => switch (Sixteen.matchLower(input)) {
|
||||
Sixteen.case("iso8859-1"),
|
||||
Sixteen.case("iso_8859-1"),
|
||||
Sixteen.case("iso-8859-1"),
|
||||
Sixteen.case("iso-ir-100"),
|
||||
Sixteen.case("csisolatin1"),
|
||||
Sixteen.case("windows-1252"),
|
||||
Sixteen.case("ansi_x3.4-1968"),
|
||||
Sixteen.case("iso_8859-1:1987"),
|
||||
=> EncodingLabel.latin1,
|
||||
|
||||
Sixteen.case("unicode11utf8"),
|
||||
Sixteen.case("unicode20utf8"),
|
||||
Sixteen.case("x-unicode20utf8"),
|
||||
=> EncodingLabel.@"UTF-8",
|
||||
|
||||
Sixteen.case("csunicode"),
|
||||
Sixteen.case("unicodefeff"),
|
||||
Sixteen.case("iso-10646-ucs-2"),
|
||||
=> EncodingLabel.@"UTF-16LE",
|
||||
|
||||
else => null,
|
||||
},
|
||||
else => if (strings.eqlCaseInsensitiveASCII(input, "unicode-1-1-utf-8", true))
|
||||
EncodingLabel.@"UTF-8"
|
||||
else
|
||||
null,
|
||||
};
|
||||
return map.getASCIIICaseInsensitive(input);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -618,11 +511,47 @@ pub const TextDecoder = struct {
|
||||
|
||||
ignore_bom: bool = false,
|
||||
fatal: bool = false,
|
||||
encoding: EncodingLabel = EncodingLabel.@"UTF-8",
|
||||
encoding: Encoding = .{ .@"utf-8" = {} },
|
||||
|
||||
const Encoding = union(Tag) {
|
||||
@"utf-8": void,
|
||||
@"windows-1252": void,
|
||||
@"utf-16be": void,
|
||||
@"utf-16le": void,
|
||||
other: *JSC.WebKitTextCodec,
|
||||
|
||||
pub const Tag = enum {
|
||||
@"utf-8",
|
||||
@"windows-1252",
|
||||
@"utf-16be",
|
||||
@"utf-16le",
|
||||
other,
|
||||
};
|
||||
|
||||
pub fn from(input: []const u8) ?Encoding {
|
||||
if (EncodingLabel.which(input)) |label| {
|
||||
return switch (label) {
|
||||
.@"utf-8" => .{ .@"utf-8" = {} },
|
||||
.@"utf-16le" => .{ .@"utf-16le" = {} },
|
||||
.@"utf-16be" => .{ .@"utf-16be" = {} },
|
||||
.@"windows-1252" => .{ .@"windows-1252" = {} },
|
||||
};
|
||||
}
|
||||
|
||||
return .{ .other = JSC.WebKitTextCodec.init(input) orelse return null };
|
||||
}
|
||||
|
||||
pub fn deinit(this: *@This()) void {
|
||||
if (this.* == .other) {
|
||||
this.other.deinit();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
pub usingnamespace bun.New(TextDecoder);
|
||||
|
||||
pub fn finalize(this: *TextDecoder) void {
|
||||
this.encoding.deinit();
|
||||
this.destroy();
|
||||
}
|
||||
|
||||
@@ -646,7 +575,15 @@ pub const TextDecoder = struct {
|
||||
this: *TextDecoder,
|
||||
globalThis: *JSC.JSGlobalObject,
|
||||
) JSC.JSValue {
|
||||
return ZigString.init(EncodingLabel.label.get(this.encoding).?).toJS(globalThis);
|
||||
switch (this.encoding) {
|
||||
.other => |codec| {
|
||||
var name = codec.name();
|
||||
return name.transferToJS(globalThis);
|
||||
},
|
||||
else => {
|
||||
return ZigString.init(@tagName(this.encoding)).toJS(globalThis);
|
||||
},
|
||||
}
|
||||
}
|
||||
const Vector16 = std.meta.Vector(16, u16);
|
||||
const max_16_ascii: Vector16 = @splat(@as(u16, 127));
|
||||
@@ -793,8 +730,8 @@ pub const TextDecoder = struct {
|
||||
}
|
||||
|
||||
fn decodeSlice(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, buffer_slice: []const u8, comptime flush: bool) bun.JSError!JSValue {
|
||||
switch (this.encoding) {
|
||||
EncodingLabel.latin1 => {
|
||||
switch (@as(Encoding.Tag, this.encoding)) {
|
||||
.@"windows-1252" => {
|
||||
if (strings.isAllASCII(buffer_slice)) {
|
||||
return ZigString.init(buffer_slice).toJS(globalThis);
|
||||
}
|
||||
@@ -809,7 +746,7 @@ pub const TextDecoder = struct {
|
||||
const out = strings.copyLatin1IntoUTF16([]u16, bytes, []const u8, buffer_slice);
|
||||
return ZigString.toExternalU16(bytes.ptr, out.written, globalThis);
|
||||
},
|
||||
EncodingLabel.@"UTF-8" => {
|
||||
.@"utf-8" => {
|
||||
const input, const deinit = input: {
|
||||
const maybe_without_bom = if (!this.ignore_bom and strings.hasPrefixComptime(buffer_slice, "\xef\xbb\xbf"))
|
||||
buffer_slice[3..]
|
||||
@@ -860,25 +797,35 @@ pub const TextDecoder = struct {
|
||||
return ZigString.init(input).toJS(globalThis);
|
||||
},
|
||||
|
||||
inline .@"UTF-16LE", .@"UTF-16BE" => |utf16_encoding| {
|
||||
const bom = if (comptime utf16_encoding == .@"UTF-16LE") "\xff\xfe" else "\xfe\xff";
|
||||
inline .@"utf-16le", .@"utf-16be" => |encoding| {
|
||||
const bom = comptime if (encoding == .@"utf-16le") "\xff\xfe" else "\xfe\xff";
|
||||
const input = if (!this.ignore_bom and strings.hasPrefixComptime(buffer_slice, bom))
|
||||
buffer_slice[2..]
|
||||
else
|
||||
buffer_slice;
|
||||
|
||||
var decoded, const saw_error = try this.decodeUTF16(input, utf16_encoding == .@"UTF-16BE", flush);
|
||||
var decoded, const saw_error = try this.decodeUTF16(input, encoding == .@"utf-16be", flush);
|
||||
|
||||
if (saw_error and this.fatal) {
|
||||
decoded.deinit(bun.default_allocator);
|
||||
return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("The encoded data was not valid {s} data", .{@tagName(utf16_encoding)}).throw();
|
||||
return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("The encoded data was not valid {s} data", .{@tagName(encoding)}).throw();
|
||||
}
|
||||
|
||||
var output = bun.String.fromUTF16(decoded.items);
|
||||
return output.toJS(globalThis);
|
||||
},
|
||||
else => {
|
||||
return globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{});
|
||||
.other => {
|
||||
const codec = this.encoding.other;
|
||||
var did_stop_on_error = this.fatal;
|
||||
var str = codec.decode(buffer_slice, flush, &did_stop_on_error);
|
||||
defer str.deref();
|
||||
if (did_stop_on_error and this.fatal) {
|
||||
var name = codec.name();
|
||||
defer name.deref();
|
||||
return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("The encoded data was not valid {} data", .{name}).throw();
|
||||
}
|
||||
|
||||
return str.toJS(globalThis);
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -895,14 +842,14 @@ pub const TextDecoder = struct {
|
||||
var str = arguments[0].toSlice(globalThis, bun.default_allocator);
|
||||
defer if (str.isAllocated()) str.deinit();
|
||||
|
||||
if (EncodingLabel.which(str.slice())) |label| {
|
||||
if (Encoding.from(str.slice())) |label| {
|
||||
decoder.encoding = label;
|
||||
} else {
|
||||
return globalThis.throwInvalidArguments("Unsupported encoding label \"{s}\"", .{str.slice()});
|
||||
}
|
||||
} else if (arguments[0].isUndefined()) {
|
||||
// default to utf-8
|
||||
decoder.encoding = EncodingLabel.@"UTF-8";
|
||||
decoder.encoding = .@"utf-8";
|
||||
} else {
|
||||
return globalThis.throwInvalidArguments("TextDecoder(encoding) label is invalid", .{});
|
||||
}
|
||||
|
||||
@@ -204,17 +204,18 @@ pub fn ComptimeStringMapWithKeyType(comptime KeyType: type, comptime V: type, co
|
||||
return null;
|
||||
|
||||
comptime var i: usize = precomputed.min_len;
|
||||
var shared_lowercased_buf: [precomputed.max_len]u8 = undefined;
|
||||
|
||||
inline while (i <= precomputed.max_len) : (i += 1) {
|
||||
if (length == i) {
|
||||
const lowerbuf: [i]u8 = brk: {
|
||||
var buf: [i]u8 = undefined;
|
||||
for (input, &buf) |c, *j| {
|
||||
const lowerbuf: *const [i]u8 = brk: {
|
||||
for (input, shared_lowercased_buf[0..i]) |c, *j| {
|
||||
j.* = std.ascii.toLower(c);
|
||||
}
|
||||
break :brk buf;
|
||||
break :brk shared_lowercased_buf[0..i];
|
||||
};
|
||||
|
||||
return getWithLengthAndEql(&lowerbuf, i, eql);
|
||||
return getWithLengthAndEql(lowerbuf, i, eql);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1228,6 +1228,10 @@ pub const String = extern struct {
|
||||
}
|
||||
|
||||
const u16_bytes = this.utf16();
|
||||
if (comptime values.len == 0) {
|
||||
@compileError("values.len must be > 0");
|
||||
}
|
||||
|
||||
const buffer: [values[0].len]u8 = brk: {
|
||||
var bytes: [values[0].len]u8 = undefined;
|
||||
for (&bytes, u16_bytes) |*byte, uchar| {
|
||||
|
||||
17
test/js/web/encoding/text-decoder-shiftjis.test.ts
Normal file
17
test/js/web/encoding/text-decoder-shiftjis.test.ts
Normal file
@@ -0,0 +1,17 @@
|
||||
import { test, expect } from "bun:test";
|
||||
|
||||
test("shift_jis", () => {
|
||||
const bytes = [147, 250, 150, 123, 140, 234];
|
||||
const decoder = new TextDecoder("shift_jis");
|
||||
const data = decoder.decode(Uint8Array.from(bytes));
|
||||
expect(data).toEqual("日本語");
|
||||
expect(decoder.encoding).toBe("Shift_JIS");
|
||||
expect(new TextDecoder().decode(Uint8Array.from(bytes))).not.toBe("日本語");
|
||||
|
||||
bytes.push(255);
|
||||
expect(() => new TextDecoder("shift_jis", { fatal: true }).decode(Uint8Array.from(bytes))).toThrow();
|
||||
});
|
||||
|
||||
test("unknown encoding throws", () => {
|
||||
expect(() => new TextDecoder("pooop")).toThrow();
|
||||
});
|
||||
Reference in New Issue
Block a user