Compare commits

...

5 Commits

Author SHA1 Message Date
Jarred Sumner
6edb3f4845 revive pr 2025-01-08 17:39:22 -08:00
Jarred Sumner
38c9de8516 Merge branch 'main' into jarred/text-encodings 2025-01-08 16:48:42 -08:00
Jarred Sumner
93fdf30a38 Set homebrew prefix path 2024-09-14 02:02:43 -04:00
Jarred Sumner
f9c23b6426 Finish it 2024-09-13 18:02:05 -07:00
Jarred Sumner
237c191033 Support all standard text encodings in TextDecoder 2024-09-13 16:13:58 -07:00
48 changed files with 48817 additions and 187 deletions

View File

@@ -518,10 +518,15 @@ set(BUN_ZIG_OUTPUT ${BUILD_PATH}/bun-zig.o)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|ARM|arm64|ARM64|aarch64|AARCH64")
if(APPLE)
set(ZIG_CPU "apple_m1")
set(HOMEBREW_PREFIX "/opt/homebrew")
else()
set(ZIG_CPU "native")
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|X86_64|x64|X64|amd64|AMD64")
if(APPLE)
set(HOMEBREW_PREFIX "/usr/local")
endif()
if(ENABLE_BASELINE)
set(ZIG_CPU "nehalem")
else()
@@ -1010,6 +1015,8 @@ include_directories(${WEBKIT_INCLUDE_PATH})
# Add extra include directories depending on how WebKit is provided and on the platform.
if(NOT WEBKIT_LOCAL AND NOT APPLE)
# Non-Apple build without a local WebKit: add WebKit's wtf/unicode directory.
include_directories(${WEBKIT_INCLUDE_PATH}/wtf/unicode)
elseif(WEBKIT_PREBUILT AND APPLE)
# Prebuilt WebKit on Apple: add the icu4c include directory under HOMEBREW_PREFIX.
# NOTE(review): HOMEBREW_PREFIX is set elsewhere in this file from CMAKE_SYSTEM_PROCESSOR;
# confirm it is defined for every Apple configuration that reaches this branch.
include_directories(${HOMEBREW_PREFIX}/opt/icu4c/include)
endif()
# --- Dependencies ---

View File

@@ -7,6 +7,7 @@
#include "_libusockets.h"
#include "BunClientData.h"
#include "EventLoopTask.h"
#include "TextCodecICU.h"
extern "C" void Bun__startLoop(us_loop_t* loop);
@@ -17,6 +18,24 @@ static std::atomic<unsigned> lastUniqueIdentifier = 0;
WTF_MAKE_ISO_ALLOCATED_IMPL(EventLoopTask);
WTF_MAKE_ISO_ALLOCATED_IMPL(ScriptExecutionContext);
// Constructs a context bound to the given VM and global object. m_identifier is
// initialized to 0 and regenerateIdentifier() is then called from the body.
// NOTE(review): presumably regenerateIdentifier() assigns a fresh identifier; its
// definition is not visible here, so confirm before relying on that.
ScriptExecutionContext::ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject)
: m_vm(vm)
, m_globalObject(globalObject)
, m_identifier(0)
, m_broadcastChannelRegistry(BunBroadcastChannelRegistry::create())
{
regenerateIdentifier();
}
// Constructs a context bound to the given VM and global object using a caller-supplied
// identifier, then calls addToContextsMap().
// NOTE(review): presumably addToContextsMap() registers this context under m_identifier;
// its definition is not visible here, so confirm before relying on that.
ScriptExecutionContext::ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject, ScriptExecutionContextIdentifier identifier)
: m_vm(vm)
, m_globalObject(globalObject)
, m_identifier(identifier)
, m_broadcastChannelRegistry(BunBroadcastChannelRegistry::create())
{
addToContextsMap();
}
static Lock allScriptExecutionContextsMapLock;
static HashMap<ScriptExecutionContextIdentifier, ScriptExecutionContext*>& allScriptExecutionContextsMap() WTF_REQUIRES_LOCK(allScriptExecutionContextsMapLock)
{
@@ -31,6 +50,15 @@ ScriptExecutionContext* ScriptExecutionContext::getScriptExecutionContext(Script
return allScriptExecutionContextsMap().get(identifier);
}
// Returns this context's ICU converter wrapper, allocating it the first time it is requested.
PAL::ICUConverterWrapper& ScriptExecutionContext::cachedConverterICU()
{
    if (m_cachedConverterICU)
        return *m_cachedConverterICU;

    m_cachedConverterICU = makeUnique<PAL::ICUConverterWrapper>();
    return *m_cachedConverterICU;
}
template<bool SSL, bool isServer>
static void registerHTTPContextForWebSocket(ScriptExecutionContext* script, us_socket_context_t* ctx, us_loop_t* loop)
{

View File

@@ -14,6 +14,10 @@
#include "CachedScript.h"
#include <wtf/URL.h>
namespace PAL {
class ICUConverterWrapper;
}
namespace uWS {
template<bool isServer, bool isClient, typename UserData>
struct WebSocketContext;
@@ -37,24 +41,8 @@ class ScriptExecutionContext : public CanMakeWeakPtr<ScriptExecutionContext> {
WTF_MAKE_ISO_ALLOCATED(ScriptExecutionContext);
public:
ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject)
: m_vm(vm)
, m_globalObject(globalObject)
, m_identifier(0)
, m_broadcastChannelRegistry(BunBroadcastChannelRegistry::create())
{
regenerateIdentifier();
}
ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject, ScriptExecutionContextIdentifier identifier)
: m_vm(vm)
, m_globalObject(globalObject)
, m_identifier(identifier)
, m_broadcastChannelRegistry(BunBroadcastChannelRegistry::create())
{
addToContextsMap();
}
ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject);
ScriptExecutionContext(JSC::VM* vm, JSC::JSGlobalObject* globalObject, ScriptExecutionContextIdentifier identifier);
~ScriptExecutionContext();
static ScriptExecutionContextIdentifier generateIdentifier();
@@ -160,6 +148,8 @@ public:
static ScriptExecutionContext* getMainThreadScriptExecutionContext();
PAL::ICUConverterWrapper& cachedConverterICU();
private:
JSC::VM* m_vm = nullptr;
JSC::JSGlobalObject* m_globalObject = nullptr;
@@ -184,6 +174,8 @@ private:
us_socket_context_t* m_connected_ssl_client_websockets_ctx = nullptr;
us_socket_context_t* m_connected_client_websockets_ctx = nullptr;
std::unique_ptr<PAL::ICUConverterWrapper> m_cachedConverterICU = { nullptr };
public:
template<bool isSSL, bool isServer>
us_socket_context_t* connectedWebSocketContext()

View File

@@ -1092,3 +1092,24 @@ fn findPathInner(
);
return errorable.unwrap() catch null;
}
/// Opaque handle to a text codec implemented outside of Zig; every operation is forwarded
/// to one of the `WebKitTextCodec__*` extern functions declared below.
pub const WebKitTextCodec = opaque {
extern fn WebKitTextCodec__create(encoding_label: [*]const u8, len: usize) ?*WebKitTextCodec;
extern fn WebKitTextCodec__deinit(this: *WebKitTextCodec) void;
extern fn WebKitTextCodec__decode(this: *WebKitTextCodec, ptr: [*]const u8, len: usize, flush: bool, stopOnError: *bool) bun.String;
extern fn WebKitTextCodec__stripByteOrderMark(this: *WebKitTextCodec) void;
extern fn WebKitTextCodec__name(this: *WebKitTextCodec) bun.String;
/// Creates a codec from an encoding label passed as pointer + length.
/// Returns null when the extern constructor returns null.
/// NOTE(review): presumably null means the label is unknown; confirm in the native implementation.
pub fn init(encoding_label: []const u8) ?*WebKitTextCodec {
return WebKitTextCodec__create(encoding_label.ptr, encoding_label.len);
}
/// Alias of the extern `WebKitTextCodec__name`; returns a `bun.String`.
pub const name = WebKitTextCodec__name;
/// Alias of the extern `WebKitTextCodec__deinit`.
pub const deinit = WebKitTextCodec__deinit;
/// Decodes `input` by forwarding its pointer and length, together with `flush` and
/// `stop_on_error`, to the extern decode function and returns the resulting `bun.String`.
/// NOTE(review): `stop_on_error` is passed by pointer, so it may be both read and written
/// by the native side; confirm the exact contract there.
pub fn decode(this: *WebKitTextCodec, input: []const u8, flush: bool, stop_on_error: *bool) bun.String {
return WebKitTextCodec__decode(this, input.ptr, input.len, flush, stop_on_error);
}
/// Alias of the extern `WebKitTextCodec__stripByteOrderMark`.
pub const stripByteOrderMark = WebKitTextCodec__stripByteOrderMark;
};

View File

@@ -73,6 +73,11 @@
#define WEBCORE_EXPORT JS_EXPORT_PRIVATE
#endif
#if OS(DARWIN)
// Disable ICU's symbol renaming (version-suffixed names such as ucnv_open_74) to prevent symbol names from causing issues.
#define U_DISABLE_RENAMING 1
#endif
#include <wtf/PlatformCallingConventions.h>
#include <JavaScriptCore/JSCJSValue.h>
#include <wtf/text/MakeString.h>

View File

@@ -0,0 +1,187 @@
/*
* Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved.
* Copyright (c) 2012 Google, inc. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "TextEncoding.h"
#include <wtf/ASCIICType.h>
#include <wtf/Assertions.h>
#include <wtf/text/StringBuilder.h>
namespace PAL {
// See <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>.
struct Unicode16BitEscapeSequence {
    enum { SequenceSize = 6 }; // Length of one escape, e.g. %u26C4

    // Searches for "%u" starting at startPosition and returns whatever StringView::find reports.
    static size_t findInString(StringView string, size_t startPosition)
    {
        return string.find("%u"_s, startPosition);
    }

    // Starting at startPosition, steps over consecutive well-formed %uXXXX sequences and
    // returns the position just past the last one (startPosition itself if there is none).
    static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
    {
        const auto startsSequenceAt = [&](size_t position) {
            return endPosition - position >= SequenceSize
                && string[position] == '%' && string[position + 1] == 'u'
                && isASCIIHexDigit(string[position + 2]) && isASCIIHexDigit(string[position + 3])
                && isASCIIHexDigit(string[position + 4]) && isASCIIHexDigit(string[position + 5]);
        };

        size_t position = startPosition;
        while (startsSequenceAt(position))
            position += SequenceSize;
        return position;
    }

    // Converts a run produced by findEndOfRun() into a string. The encoding argument is unused.
    static String decodeRun(StringView run, const TextEncoding&)
    {
        // Every %u-escape sequence stands for exactly one UTF-16 code unit.
        // See <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>.
        // findEndOfRun() only hands us back-to-back sequences with nothing in between,
        // so each one can be decoded without re-validating it.
        const auto sequenceCount = run.length() / SequenceSize;
        StringBuilder builder;
        builder.reserveCapacity(sequenceCount);
        for (auto remaining = sequenceCount; remaining; --remaining) {
            UChar codeUnit = (toASCIIHexValue(run[2]) << 12)
                | (toASCIIHexValue(run[3]) << 8)
                | (toASCIIHexValue(run[4]) << 4)
                | toASCIIHexValue(run[5]);
            builder.append(codeUnit);
            run = run.substring(SequenceSize);
        }
        return builder.toString();
    }
};
struct URLEscapeSequence {
    enum { SequenceSize = 3 }; // Length of one escape, e.g. %41

    // Searches for '%' starting at startPosition and returns whatever StringView::find reports.
    static size_t findInString(StringView string, size_t startPosition)
    {
        return string.find('%', startPosition);
    }

    // Returns the end of the escaped run that begins at startPosition.
    static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
    {
        // Simplifying assumption: a supported encoding may use up to two unescaped characters
        // in the range 0x40 - 0x7F as trailing bytes of a sequence, and those must be handed to
        // the decoder together with the escapes. The run therefore stops at the first value
        // outside 0x40 - 0x7F, after two consecutive values inside that range, or at a %-sign
        // that does not start a valid escape sequence.
        size_t position = startPosition;
        int trailingCharacters = 0;
        while (position < endPosition) {
            if (string[position] == '%') {
                const bool isEscape = endPosition - position >= SequenceSize
                    && isASCIIHexDigit(string[position + 1]) && isASCIIHexDigit(string[position + 2]);
                if (!isEscape)
                    break;
                position += SequenceSize;
                trailingCharacters = 0;
                continue;
            }

            const bool isTrailingByte = string[position] >= 0x40 && string[position] <= 0x7F && trailingCharacters < 2;
            if (!isTrailingByte)
                break;
            position += 1;
            trailingCharacters += 1;
        }
        return position;
    }

    // Unescapes a run into raw bytes: %XX becomes one byte, any other character is copied.
    static Vector<uint8_t, 512> decodeRun(StringView run)
    {
        // findEndOfRun() guarantees that every %-sign in the run starts a valid escape
        // sequence, although other characters may sit between the sequences.
        Vector<uint8_t, 512> bytes;
        bytes.grow(run.length()); // Unescaping can only shrink the data, never grow it.
        size_t written = 0;
        while (!run.isEmpty()) {
            if (run[0] != '%') {
                bytes[written++] = run[0];
                run = run.substring(1);
                continue;
            }
            bytes[written++] = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]);
            run = run.substring(SequenceSize);
        }
        bytes.shrink(written);
        return bytes;
    }

    // Unescapes a run and decodes the bytes with encoding, or with UTF-8 when encoding is invalid.
    static String decodeRun(StringView run, const TextEncoding& encoding)
    {
        auto bytes = decodeRun(run);
        if (encoding.isValid())
            return encoding.decode(bytes.span());
        return PAL::UTF8Encoding().decode(bytes.span());
    }
};
// Replaces every run of escape sequences recognized by EscapeSequence with its decoded text
// and returns the resulting string. Runs that decode to an empty string are left as-is.
template<typename EscapeSequence>
String decodeEscapeSequences(StringView string, const TextEncoding& encoding)
{
    StringBuilder result;
    const size_t length = string.length();
    size_t decodedPosition = 0; // Everything before this offset has already been appended.
    size_t searchPosition = 0;

    for (;;) {
        const size_t runStart = EscapeSequence::findInString(string, searchPosition);
        if (runStart == notFound)
            break;

        const size_t runEnd = EscapeSequence::findEndOfRun(string, runStart, length);
        if (runEnd == runStart) {
            // The introducer did not start a valid sequence; resume one character later.
            searchPosition = runEnd + 1;
            continue;
        }
        searchPosition = runEnd;

        String decoded = EscapeSequence::decodeRun(string.substring(runStart, runEnd - runStart), encoding);
        if (!decoded.isEmpty()) {
            result.append(string.substring(decodedPosition, runStart - decodedPosition), decoded);
            decodedPosition = runEnd;
        }
    }

    result.append(string.substring(decodedPosition, length - decodedPosition));
    return result.toString();
}
// Converts a string containing %XX escapes into bytes: unescaped text is encoded through
// UTF8Encoding().encodeForURLParsing(), while escaped runs contribute their raw bytes.
inline Vector<uint8_t> decodeURLEscapeSequencesAsData(StringView string)
{
    Vector<uint8_t> result;
    size_t decodedPosition = 0; // Everything before this offset has already been emitted.
    size_t searchPosition = 0;

    for (;;) {
        const size_t runStart = URLEscapeSequence::findInString(string, searchPosition);
        const bool foundRun = runStart != notFound;
        size_t runEnd = 0;
        if (foundRun) {
            runEnd = URLEscapeSequence::findEndOfRun(string, runStart, string.length());
            searchPosition = runEnd;
            if (runEnd == runStart) {
                // A '%' that does not start a valid escape; keep searching after it.
                ++searchPosition;
                continue;
            }
        }

        // Strings are encoded as requested.
        result.appendVector(PAL::UTF8Encoding().encodeForURLParsing(string.substring(decodedPosition, runStart - decodedPosition)));

        if (!foundRun)
            return result;

        // Bytes go through as-is.
        auto escapedBytes = URLEscapeSequence::decodeRun(string.substring(runStart, runEnd - runStart));
        ASSERT(!escapedBytes.isEmpty());
        result.appendVector(escapedBytes);
        decodedPosition = runEnd;
    }
}
} // namespace PAL

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,136 @@
/*
* Copyright (C) 2020 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <algorithm>
#include <array>
#include <iterator>
#include <optional>
#include <unicode/umachine.h>
#include <utility>
namespace PAL {
const std::array<std::pair<uint16_t, UChar>, 7724>& jis0208();
const std::array<std::pair<uint16_t, UChar>, 6067>& jis0212();
const std::array<std::pair<uint16_t, char32_t>, 18590>& big5();
const std::array<std::pair<uint16_t, UChar>, 17048>& eucKR();
const std::array<UChar, 23940>& gb18030();
void checkEncodingTableInvariants();
// Functions for using sorted arrays of pairs as a map.
// FIXME: Consider moving these functions to StdLibExtras.h for uses other than encoding tables.
template<typename CollectionType> void sortByFirst(CollectionType&);
template<typename CollectionType> void stableSortByFirst(CollectionType&);
template<typename CollectionType> bool isSortedByFirst(const CollectionType&);
template<typename CollectionType> bool sortedFirstsAreUnique(const CollectionType&);
template<typename CollectionType, typename KeyType> static auto findFirstInSortedPairs(const CollectionType& sortedPairsCollection, const KeyType&) -> std::optional<decltype(std::begin(sortedPairsCollection)->second)>;
template<typename CollectionType, typename KeyType> static auto findInSortedPairs(const CollectionType& sortedPairsCollection, const KeyType&) -> std::pair<decltype(std::begin(sortedPairsCollection)), decltype(std::begin(sortedPairsCollection))>;
// When assertions are disabled, checkEncodingTableInvariants() is an inline no-op.
// NOTE(review): the assertion-enabled definition is not visible here; presumably it lives
// in the matching implementation file.
#if !ASSERT_ENABLED
inline void checkEncodingTableInvariants() {}
#endif
// Stateless comparison functors for pair-like values (anything exposing `first` / `second`).
// The call operators are const so the functors can be invoked through const objects and
// const references, which is how callers and library code may hold a comparator.

// Strict-weak ordering on `first`.
struct CompareFirst {
    template<typename TypeA, typename TypeB> bool operator()(const TypeA& a, const TypeB& b) const
    {
        return a.first < b.first;
    }
};

// Equality on `first`.
struct EqualFirst {
    template<typename TypeA, typename TypeB> bool operator()(const TypeA& a, const TypeB& b) const
    {
        return a.first == b.first;
    }
};

// Strict-weak ordering on `second`.
struct CompareSecond {
    template<typename TypeA, typename TypeB> bool operator()(const TypeA& a, const TypeB& b) const
    {
        return a.second < b.second;
    }
};
// Wraps a reference to a bare key so it exposes a `first` member, letting it be compared
// against pair-like elements by functors that read `.first`.
template<typename T> struct FirstAdapter {
    const T& first;
};

// Builds a FirstAdapter that refers to value (no copy is made).
template<typename T> FirstAdapter<T> makeFirstAdapter(const T& value)
{
    return FirstAdapter<T> { value };
}
// Wraps a reference to a bare value so it exposes a `second` member, letting it be compared
// against pair-like elements by functors that read `.second`.
template<typename T> struct SecondAdapter {
    const T& second;
};

// Builds a SecondAdapter that refers to value (no copy is made).
template<typename T> SecondAdapter<T> makeSecondAdapter(const T& value)
{
    return SecondAdapter<T> { value };
}
// Sorts collection in place into ascending order of each element's `first`, using std::sort.
template<typename CollectionType> void sortByFirst(CollectionType& collection)
{
    const auto first = std::begin(collection);
    const auto last = std::end(collection);
    std::sort(first, last, CompareFirst {});
}
// Sorts collection in place into ascending order of each element's `first`, using
// std::stable_sort so elements with equal `first` keep their relative order.
template<typename CollectionType> void stableSortByFirst(CollectionType& collection)
{
    const auto first = std::begin(collection);
    const auto last = std::end(collection);
    std::stable_sort(first, last, CompareFirst {});
}
// Reports whether collection is already in non-descending order of `first`.
template<typename CollectionType> bool isSortedByFirst(const CollectionType& collection)
{
    const auto first = std::begin(collection);
    const auto last = std::end(collection);
    return std::is_sorted(first, last, CompareFirst {});
}
// Reports whether no two adjacent elements share the same `first`. For a collection sorted
// by `first`, that means every `first` is unique.
template<typename CollectionType> bool sortedFirstsAreUnique(const CollectionType& collection)
{
    const auto last = std::end(collection);
    const auto duplicate = std::adjacent_find(std::begin(collection), last, EqualFirst {});
    return duplicate == last;
}
// Binary-searches a collection sorted by `first` and returns the `second` of the first
// element whose `first` equals key, or std::nullopt when there is no such element.
template<typename CollectionType, typename KeyType> static auto findFirstInSortedPairs(const CollectionType& collection, const KeyType& key) -> std::optional<decltype(std::begin(collection)->second)>
{
    if constexpr (std::is_integral_v<KeyType>) {
        // A key that changes value when converted to the stored key type cannot be present.
        using StoredKeyType = decltype(std::begin(collection)->first);
        if (key != StoredKeyType(key))
            return std::nullopt;
    }

    const auto last = std::end(collection);
    const auto candidate = std::lower_bound(std::begin(collection), last, makeFirstAdapter(key), CompareFirst {});
    if (candidate != last && !(key < candidate->first))
        return candidate->second;
    return std::nullopt;
}
// Returns the [begin, end) iterator range of elements whose `first` equals key in a
// collection sorted by `first`. The range is empty when nothing matches.
template<typename CollectionType, typename KeyType> static auto findInSortedPairs(const CollectionType& collection, const KeyType& key) -> std::pair<decltype(std::begin(collection)), decltype(std::begin(collection))>
{
    const auto first = std::begin(collection);
    const auto last = std::end(collection);

    if constexpr (std::is_integral_v<KeyType>) {
        // A key that changes value when converted to the stored key type cannot be present.
        using StoredKeyType = decltype(std::begin(collection)->first);
        if (key != StoredKeyType(key))
            return { last, last };
    }

    return std::equal_range(first, last, makeFirstAdapter(key), CompareFirst {});
}
}

View File

@@ -0,0 +1,59 @@
/*
* Copyright (C) 2010 Google Inc. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "KillRing.h"
#include <wtf/TZoneMallocInlines.h>
#if !PLATFORM(MAC)
namespace PAL {
WTF_MAKE_TZONE_ALLOCATED_IMPL(KillRing);
// Stub KillRing implementation, compiled only when !PLATFORM(MAC) (see the enclosing #if).
// Every mutator below does nothing and ignores its argument.
void KillRing::append(const String&)
{
}
void KillRing::prepend(const String&)
{
}
// Always returns a default-constructed String.
String KillRing::yank()
{
return String();
}
void KillRing::startNewSequence()
{
}
void KillRing::setToYankedState()
{
}
} // namespace PAL
#endif // !PLATFORM(MAC)

View File

@@ -0,0 +1,44 @@
/*
* Copyright (C) 2010 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <wtf/TZoneMalloc.h>
#include <wtf/text/WTFString.h>
namespace PAL {
// Interface for a kill ring holding String values. Only declarations live here; the class
// has no data members in this header.
// NOTE(review): the names follow Emacs-style kill/yank terminology; confirm the intended
// semantics against the platform implementation.
class KillRing {
WTF_MAKE_TZONE_ALLOCATED_EXPORT(KillRing, );
public:
// Takes a String; no return value.
void append(const String&);
// Takes a String; no return value.
void prepend(const String&);
// Returns a String by value.
String yank();
void startNewSequence();
void setToYankedState();
};
} // namespace PAL

View File

@@ -0,0 +1,286 @@
/*
* Copyright (C) 2013 Google Inc. All rights reserved.
* Copyright (C) 2020 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <wtf/StdLibExtras.h>
#include <wtf/text/StringCommon.h>
#include <wtf/text/StringParsingBuffer.h>
namespace WTF {
// Negation of isUnicodeCompatibleASCIIWhitespace(): true when c is not such whitespace.
template<typename CharacterType> inline bool isNotASCIISpace(CharacterType c)
{
    const bool isWhitespace = isUnicodeCompatibleASCIIWhitespace(c);
    return !isWhitespace;
}
// Drops the first amountToSkip elements from data by rebinding it to the remaining subspan.
// The underlying elements are untouched; only the caller's span is narrowed.
template<typename T> void skip(std::span<T>& data, size_t amountToSkip)
{
data = data.subspan(amountToSkip);
}
// If position has not reached end and the character it points at equals delimiter, advances
// position by one and returns true. Otherwise leaves position unchanged and returns false.
template<typename CharacterType, typename DelimiterType> bool skipExactly(const CharacterType*& position, const CharacterType* end, DelimiterType delimiter)
{
    const bool matches = position < end && *position == delimiter;
    if (matches)
        ++position;
    return matches;
}
// skipExactly overloads for spans and StringParsingBuffer. Each one consumes a single leading
// character when it matches (a delimiter or a predicate) and returns whether it did so.
// Nothing is consumed when the input is empty or the character does not match.

template<typename CharacterType, typename DelimiterType> bool skipExactly(std::span<const CharacterType>& data, DelimiterType delimiter)
{
    if (data.empty() || !(data.front() == delimiter))
        return false;
    skip(data, 1);
    return true;
}

template<typename CharacterType, typename DelimiterType> bool skipExactly(StringParsingBuffer<CharacterType>& buffer, DelimiterType delimiter)
{
    if (!buffer.hasCharactersRemaining() || !(*buffer == delimiter))
        return false;
    ++buffer;
    return true;
}

template<bool characterPredicate(LChar)> bool skipExactly(StringParsingBuffer<LChar>& buffer)
{
    if (!buffer.hasCharactersRemaining() || !characterPredicate(*buffer))
        return false;
    ++buffer;
    return true;
}

template<bool characterPredicate(UChar)> bool skipExactly(StringParsingBuffer<UChar>& buffer)
{
    if (!buffer.hasCharactersRemaining() || !characterPredicate(*buffer))
        return false;
    ++buffer;
    return true;
}

template<bool characterPredicate(LChar)> bool skipExactly(std::span<const LChar>& buffer)
{
    if (buffer.empty() || !characterPredicate(buffer[0]))
        return false;
    skip(buffer, 1);
    return true;
}

template<bool characterPredicate(UChar)> bool skipExactly(std::span<const UChar>& buffer)
{
    if (buffer.empty() || !characterPredicate(buffer[0]))
        return false;
    skip(buffer, 1);
    return true;
}
// skipUntil overloads. Each one advances past leading characters until the input is exhausted
// or the current character matches (equals the delimiter, or satisfies the predicate).
// The matching character itself is not consumed.

template<typename CharacterType, typename DelimiterType> void skipUntil(StringParsingBuffer<CharacterType>& buffer, DelimiterType delimiter)
{
    for (; buffer.hasCharactersRemaining() && *buffer != delimiter; ++buffer) { }
}

template<typename CharacterType, typename DelimiterType> void skipUntil(std::span<const CharacterType>& buffer, DelimiterType delimiter)
{
    size_t prefixLength = 0;
    for (; prefixLength < buffer.size() && buffer[prefixLength] != delimiter; ++prefixLength) { }
    skip(buffer, prefixLength);
}

template<bool characterPredicate(LChar)> void skipUntil(std::span<const LChar>& data)
{
    size_t prefixLength = 0;
    for (; prefixLength < data.size() && !characterPredicate(data[prefixLength]); ++prefixLength) { }
    skip(data, prefixLength);
}

template<bool characterPredicate(UChar)> void skipUntil(std::span<const UChar>& data)
{
    size_t prefixLength = 0;
    for (; prefixLength < data.size() && !characterPredicate(data[prefixLength]); ++prefixLength) { }
    skip(data, prefixLength);
}

template<bool characterPredicate(LChar)> void skipUntil(StringParsingBuffer<LChar>& buffer)
{
    for (; buffer.hasCharactersRemaining() && !characterPredicate(*buffer); ++buffer) { }
}

template<bool characterPredicate(UChar)> void skipUntil(StringParsingBuffer<UChar>& buffer)
{
    for (; buffer.hasCharactersRemaining() && !characterPredicate(*buffer); ++buffer) { }
}
// skipWhile overloads. Each one advances past leading characters for as long as they match
// (equal the delimiter, or satisfy the predicate), stopping at the first non-matching
// character or at the end of the input.

template<typename CharacterType, typename DelimiterType> void skipWhile(StringParsingBuffer<CharacterType>& buffer, DelimiterType delimiter)
{
    for (; buffer.hasCharactersRemaining() && *buffer == delimiter; ++buffer) { }
}

template<typename CharacterType, typename DelimiterType> void skipWhile(std::span<const CharacterType>& buffer, DelimiterType delimiter)
{
    size_t prefixLength = 0;
    for (; prefixLength < buffer.size() && buffer[prefixLength] == delimiter; ++prefixLength) { }
    skip(buffer, prefixLength);
}

template<bool characterPredicate(LChar)> void skipWhile(std::span<const LChar>& data)
{
    size_t prefixLength = 0;
    for (; prefixLength < data.size() && characterPredicate(data[prefixLength]); ++prefixLength) { }
    skip(data, prefixLength);
}

template<bool characterPredicate(UChar)> void skipWhile(std::span<const UChar>& data)
{
    size_t prefixLength = 0;
    for (; prefixLength < data.size() && characterPredicate(data[prefixLength]); ++prefixLength) { }
    skip(data, prefixLength);
}

template<bool characterPredicate(LChar)> void skipWhile(StringParsingBuffer<LChar>& buffer)
{
    for (; buffer.hasCharactersRemaining() && characterPredicate(*buffer); ++buffer) { }
}

template<bool characterPredicate(UChar)> void skipWhile(StringParsingBuffer<UChar>& buffer)
{
    for (; buffer.hasCharactersRemaining() && characterPredicate(*buffer); ++buffer) { }
}
// If the buffer has at least literal.length() characters left and they match literal according
// to equalLettersIgnoringASCIICaseWithLength(), advances the buffer past them and returns true.
// Otherwise the buffer is left unchanged and false is returned.
template<typename CharacterType> bool skipExactlyIgnoringASCIICase(StringParsingBuffer<CharacterType>& buffer, ASCIILiteral literal)
{
    auto length = literal.length();
    const bool matches = buffer.lengthRemaining() >= length
        && equalLettersIgnoringASCIICaseWithLength(buffer.span(), literal.span8(), length);
    if (matches)
        buffer += length;
    return matches;
}
// If the next letters.size() characters of the buffer match letters (compared one by one with
// isASCIIAlphaCaselessEqual()), advances the buffer past them and returns true. Otherwise the
// buffer is left unchanged and false is returned. Every element of letters is asserted to be
// an ASCII letter.
template<typename CharacterType, std::size_t Extent> bool skipLettersExactlyIgnoringASCIICase(StringParsingBuffer<CharacterType>& buffer, std::span<const CharacterType, Extent> letters)
{
    const auto letterCount = letters.size();
    if (buffer.lengthRemaining() < letterCount)
        return false;

    for (unsigned i = 0; i < letterCount; ++i) {
        ASSERT(isASCIIAlpha(letters[i]));
        const bool sameLetter = isASCIIAlphaCaselessEqual(buffer[i], static_cast<char>(letters[i]));
        if (!sameLetter)
            return false;
    }

    buffer += letterCount;
    return true;
}

// Span flavour: if buffer starts with letters according to
// equalLettersIgnoringASCIICaseWithLength(), drops that prefix from buffer and returns true.
// Otherwise buffer is left unchanged and false is returned.
template<typename CharacterType, std::size_t Extent> bool skipLettersExactlyIgnoringASCIICase(std::span<const CharacterType>& buffer, std::span<const CharacterType, Extent> letters)
{
    const bool matches = buffer.size() >= letters.size()
        && equalLettersIgnoringASCIICaseWithLength(buffer, letters, letters.size());
    if (matches)
        skip(buffer, letters.size());
    return matches;
}
// If the remaining buffer contents start with string (per spanHasPrefix()), advances the
// buffer past it and returns true. Otherwise the buffer is left unchanged and false is returned.
template<typename CharacterType, std::size_t Extent> constexpr bool skipCharactersExactly(StringParsingBuffer<CharacterType>& buffer, std::span<const CharacterType, Extent> string)
{
    if (spanHasPrefix(buffer.span(), string)) {
        buffer += string.size();
        return true;
    }
    return false;
}

// Span flavour: if buffer starts with string (per spanHasPrefix()), drops that prefix from
// buffer and returns true. Otherwise buffer is left unchanged and false is returned.
template<typename CharacterType, std::size_t Extent> constexpr bool skipCharactersExactly(std::span<const CharacterType>& buffer, std::span<const CharacterType, Extent> string)
{
    if (spanHasPrefix(buffer, string)) {
        skip(buffer, string.size());
        return true;
    }
    return false;
}
// Splits the first amountToConsume elements off the front of data: returns them as a span and
// narrows data to what remains.
template<typename T> std::span<T> consumeSpan(std::span<T>& data, size_t amountToConsume)
{
    std::span<T> head = data.first(amountToConsume);
    skip(data, amountToConsume);
    return head;
}
// Returns a reference to the first element of data and narrows data to exclude it.
// The reference is taken before the span is narrowed; it refers to the underlying element.
template<typename T> T& consume(std::span<T>& data)
{
    T& front = data[0];
    skip(data, 1);
    return front;
}
// Consumes sizeof(DestinationType) elements from the front of a span of 1-byte elements and
// returns them reinterpreted as a single DestinationType, carrying over SourceType's constness
// through match_constness_t.
template<typename DestinationType, typename SourceType>
match_constness_t<SourceType, DestinationType>& consumeAndCastTo(std::span<SourceType>& data)
    requires(sizeof(SourceType) == 1)
{
    using ResultType = match_constness_t<SourceType, DestinationType>;
    return spanReinterpretCast<ResultType>(consumeSpan(data, sizeof(DestinationType)))[0];
}
// Adapt a UChar-predicate to an LChar-predicate: the LChar argument is passed straight to
// characterPredicate, which takes a UChar, and its result is returned unchanged.
template<bool characterPredicate(UChar)>
static inline bool LCharPredicateAdapter(LChar c) { return characterPredicate(c); }
} // namespace WTF
using WTF::consume;
using WTF::consumeAndCastTo;
using WTF::consumeSpan;
using WTF::isNotASCIISpace;
using WTF::LCharPredicateAdapter;
using WTF::skip;
using WTF::skipCharactersExactly;
using WTF::skipExactly;
using WTF::skipExactlyIgnoringASCIICase;
using WTF::skipLettersExactlyIgnoringASCIICase;
using WTF::skipUntil;
using WTF::skipWhile;

View File

@@ -0,0 +1,68 @@
/*
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "TextCodec.h"
#include <unicode/uchar.h>
#include <wtf/TZoneMallocInlines.h>
#include <wtf/text/WTFString.h>
#include <wtf/unicode/CharacterNames.h>
#include <array>
#include <cstdio>
namespace PAL {
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodec);
// Formats the replacement text for a code point that the target encoding cannot represent.
// The text is written (null-terminated by snprintf) into `replacement`, and the returned span
// covers the written characters inside that same buffer, without the terminating null.
// `handling` selects the format: "&#N;" for Entities, "%26%23N%3B" for URLEncodedEntities,
// where N is the decimal code point.
std::span<char> TextCodec::getUnencodableReplacement(char32_t codePoint, UnencodableHandling handling, UnencodableReplacementArray& replacement)
{
    ASSERT(!(codePoint > UCHAR_MAX_VALUE));

    // The Encoding Standard doesn't have surrogate code points in the input, but that would require
    // scanning and potentially manipulating inputs ahead of time. Instead handle them at the last
    // possible point.
    if (U_IS_SURROGATE(codePoint))
        codePoint = replacementCharacter;

    // Turns the snprintf result into the span handed back to the caller; a negative
    // result is clamped to an empty span.
    auto writtenSpan = [&replacement](int count) {
        ASSERT(count >= 0);
        return std::span { replacement }.first(std::max<int>(0, count));
    };

    const auto scalarValue = static_cast<unsigned>(codePoint);
    char* const output = replacement.data();
    constexpr size_t capacity = sizeof(UnencodableReplacementArray);

    switch (handling) {
    case UnencodableHandling::Entities:
        return writtenSpan(snprintf(output, capacity, "&#%u;", scalarValue));
    case UnencodableHandling::URLEncodedEntities:
        return writtenSpan(snprintf(output, capacity, "%%26%%23%u%%3B", scalarValue));
    }

    // Unknown handling value: hand back an empty, null-terminated replacement.
    ASSERT_NOT_REACHED();
    replacement[0] = '\0';
    return std::span { replacement }.first(0);
}
} // namespace PAL

View File

@@ -0,0 +1,70 @@
/*
* Copyright (C) 2004-2020 Apple Inc. All rights reserved.
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "UnencodableHandling.h"
#include <array>
#include <memory>
#include <span>
#include <unicode/umachine.h>
#include <wtf/Forward.h>
#include <wtf/Noncopyable.h>
#include <wtf/TZoneMalloc.h>
namespace PAL {
class TextEncoding;
using UnencodableReplacementArray = std::array<char, 32>;
// Abstract base class for the text codecs in PAL. Concrete codecs implement decode()
// (bytes to String) and encode() (StringView to bytes). Instances are non-copyable.
class TextCodec {
WTF_MAKE_TZONE_ALLOCATED(TextCodec);
WTF_MAKE_NONCOPYABLE(TextCodec);
public:
TextCodec() = default;
virtual ~TextCodec() = default;
// Optional hook; the base implementation does nothing.
virtual void stripByteOrderMark() {}
// Decodes `data` and returns the resulting String. `sawError` is an out-flag that
// implementations set to true when decoding fails; the exact meaning of `flush` and
// `stopOnError` is defined by each subclass.
virtual String decode(std::span<const uint8_t> data, bool flush, bool stopOnError, bool& sawError) = 0;
// Encodes the given text; the UnencodableHandling argument selects how characters
// that the encoding cannot represent are replaced.
virtual Vector<uint8_t> encode(StringView, UnencodableHandling) const = 0;
// Writes a null-terminated string representation of the given
// unencodable character into the given replacement buffer.
// Returns a span over the written characters (not including the null);
// the span points into the replacement buffer passed in.
static std::span<char> getUnencodableReplacement(char32_t, UnencodableHandling, UnencodableReplacementArray& LIFETIME_BOUND);
};
// Returns a callable taking a code point and an output byte vector for the given handling mode.
// Declaration only; the definition is not part of this header.
Function<void(char32_t, Vector<uint8_t>&)> unencodableHandler(UnencodableHandling);
// Callback used by codecs to register an encoding label (`alias`) for a canonical encoding `name`.
using EncodingNameRegistrar = void (*)(ASCIILiteral alias, ASCIILiteral name);
// Factory that creates a fresh codec instance.
using NewTextCodecFunction = Function<std::unique_ptr<TextCodec>()>;
// Callback used by codecs to register the factory for a canonical encoding `name`.
using TextCodecRegistrar = void (*)(ASCIILiteral name, NewTextCodecFunction&&);
} // namespace PAL

View File

@@ -0,0 +1,78 @@
/*
* Copyright (C) 2011 Apple Inc. All rights reserved.
* Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <wtf/StdLibExtras.h>
#include <wtf/text/ASCIIFastPath.h>
namespace PAL {
template<size_t size> struct UCharByteFiller;
// Copies exactly four source bytes into the destination. The LChar overload is a straight
// byte copy; the UChar overload widens each byte to a 16-bit code unit.
template<> struct UCharByteFiller<4> {
    static void copy(std::span<LChar> destination, std::span<const uint8_t> source)
    {
        constexpr size_t byteCount = 4;
        memcpySpan(destination, source.first(byteCount));
    }

    static void copy(std::span<UChar> destination, std::span<const uint8_t> source)
    {
        // Fixed trip count of four; each element is zero-extended on assignment.
        for (size_t index = 0; index < 4; ++index)
            destination[index] = source[index];
    }
};
// Copies exactly eight source bytes into the destination. The LChar overload is a straight
// byte copy; the UChar overload widens each byte to a 16-bit code unit.
template<> struct UCharByteFiller<8> {
    static void copy(std::span<LChar> destination, std::span<const uint8_t> source)
    {
        constexpr size_t byteCount = 8;
        memcpySpan(destination, source.first(byteCount));
    }

    static void copy(std::span<UChar> destination, std::span<const uint8_t> source)
    {
        // Fixed trip count of eight; each element is zero-extended on assignment.
        for (size_t index = 0; index < 8; ++index)
            destination[index] = source[index];
    }
};
// Copies one machine word's worth of bytes from `source` into an 8-bit destination,
// dispatching to the UCharByteFiller specialization that matches sizeof(WTF::MachineWord).
inline void copyASCIIMachineWord(std::span<LChar> destination, std::span<const uint8_t> source)
{
    using WordFiller = UCharByteFiller<sizeof(WTF::MachineWord)>;
    WordFiller::copy(destination, source);
}
// Copies one machine word's worth of bytes from `source` into a 16-bit destination,
// widening each byte, via the UCharByteFiller specialization for sizeof(WTF::MachineWord).
inline void copyASCIIMachineWord(std::span<UChar> destination, std::span<const uint8_t> source)
{
    using WordFiller = UCharByteFiller<sizeof(WTF::MachineWord)>;
    WordFiller::copy(destination, source);
}
} // namespace PAL

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,76 @@
/*
* Copyright (C) 2020 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "TextCodec.h"
#include <optional>
#include <wtf/TZoneMalloc.h>
namespace PAL {
// TextCodec implementation selected by an Encoding value passed at construction.
// The per-encoding decode helpers declared below cover EUC-JP, ISO-2022-JP, Shift_JIS,
// EUC-KR, Big5, GBK and gb18030 (as indicated by their names); the implementation
// file is not part of this header.
class TextCodecCJK final : public TextCodec {
WTF_MAKE_TZONE_ALLOCATED(TextCodecCJK);
public:
// Static registration entry points, mirroring the other codecs' interface.
static void registerEncodingNames(EncodingNameRegistrar);
static void registerCodecs(TextCodecRegistrar);
// Opaque enum (one-byte underlying type); enumerators are defined elsewhere.
enum class Encoding : uint8_t;
explicit TextCodecCJK(Encoding);
private:
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
enum class SawError : bool { No, Yes };
// Shared decode driver taking a per-byte callback that appends to a StringBuilder
// and reports whether an error was seen.
String decodeCommon(std::span<const uint8_t>, bool, bool, bool&, const Function<SawError(uint8_t, StringBuilder&)>&);
String eucJPDecode(std::span<const uint8_t>, bool, bool, bool&);
String iso2022JPDecode(std::span<const uint8_t>, bool, bool, bool&);
String shiftJISDecode(std::span<const uint8_t>, bool, bool, bool&);
String eucKRDecode(std::span<const uint8_t>, bool, bool, bool&);
String big5Decode(std::span<const uint8_t>, bool, bool, bool&);
String gbkDecode(std::span<const uint8_t>, bool, bool, bool&);
String gb18030Decode(std::span<const uint8_t>, bool, bool, bool&);
// Encoding chosen at construction; never changes afterwards.
const Encoding m_encoding;
// Decoder state carried between decode() calls.
// NOTE(review): the roles below are inferred from the member names only — confirm against TextCodecCJK.cpp.
bool m_jis0212 { false };
enum class ISO2022JPDecoderState : uint8_t { ASCII, Roman, Katakana, LeadByte, TrailByte, EscapeStart, Escape };
ISO2022JPDecoderState m_iso2022JPDecoderState { ISO2022JPDecoderState::ASCII };
ISO2022JPDecoderState m_iso2022JPDecoderOutputState { ISO2022JPDecoderState::ASCII };
bool m_iso2022JPOutput { false };
std::optional<uint8_t> m_iso2022JPSecondPrependedByte;
uint8_t m_gb18030First { 0x00 };
uint8_t m_gb18030Second { 0x00 };
uint8_t m_gb18030Third { 0x00 };
uint8_t m_lead { 0x00 };
std::optional<uint8_t> m_prependedByte;
};
} // namespace PAL

View File

@@ -0,0 +1,337 @@
/*
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "TextCodecICU.h"
#include "ZigGlobalObject.h"
#include "TextEncoding.h"
#include "TextEncodingRegistry.h"
// #include "ThreadGlobalData.h"
#include <array>
#include <unicode-ucnv_cb.h>
#include <wtf/TZoneMallocInlines.h>
#include <wtf/Threading.h>
#include <wtf/text/CString.h>
#include "ParsingUtilities-removeAfterWebKitUpgrade.h"
#include <wtf/text/StringBuilder.h>
#include <wtf/unicode/CharacterNames.h>
#include <wtf/unicode/icu/ICUHelpers.h>
#include "ScriptExecutionContext.h"
namespace PAL {
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecICU);
// Element count of the stack buffers used per conversion pass: UChar units in decode(),
// bytes in encode().
const size_t ConversionBufferSize = 16384;
// Returns the converter cache slot owned by the default global object's script execution context.
static ICUConverterWrapper& cachedConverterICU()
{
    auto&& context = defaultGlobalObject()->scriptExecutionContext();
    return context->cachedConverterICU();
}
// Declares a constexpr array named <encoding>_aliases holding the alias labels for one encoding.
// The arrays are referenced by the encodingNames table through DECLARE_ENCODING_NAME.
#define DECLARE_ALIASES(encoding, ...) \
static constexpr ASCIILiteral encoding##_aliases[] { __VA_ARGS__ }
// From https://encoding.spec.whatwg.org. Plus a few extra aliases that macOS had historically from TEC.
DECLARE_ALIASES(ISO_8859_2, "csisolatin2"_s, "iso-ir-101"_s, "iso8859-2"_s, "iso88592"_s, "iso_8859-2"_s, "iso_8859-2:1987"_s, "l2"_s, "latin2"_s);
DECLARE_ALIASES(ISO_8859_4, "csisolatin4"_s, "iso-ir-110"_s, "iso8859-4"_s, "iso88594"_s, "iso_8859-4"_s, "iso_8859-4:1988"_s, "l4"_s, "latin4"_s);
DECLARE_ALIASES(ISO_8859_5, "csisolatincyrillic"_s, "cyrillic"_s, "iso-ir-144"_s, "iso8859-5"_s, "iso88595"_s, "iso_8859-5"_s, "iso_8859-5:1988"_s);
DECLARE_ALIASES(ISO_8859_10, "csisolatin6"_s, "iso-ir-157"_s, "iso8859-10"_s, "iso885910"_s, "l6"_s, "latin6"_s, "iso8859101992"_s, "isoir157"_s);
DECLARE_ALIASES(ISO_8859_13, "iso8859-13"_s, "iso885913"_s);
DECLARE_ALIASES(ISO_8859_14, "iso8859-14"_s, "iso885914"_s, "isoceltic"_s, "iso8859141998"_s, "isoir199"_s, "latin8"_s, "l8"_s);
DECLARE_ALIASES(ISO_8859_15, "csisolatin9"_s, "iso8859-15"_s, "iso885915"_s, "iso_8859-15"_s, "l9"_s);
DECLARE_ALIASES(ISO_8859_16, "isoir226"_s, "iso8859162001"_s, "l10"_s, "latin10"_s);
DECLARE_ALIASES(KOI8_R, "cskoi8r"_s, "koi"_s, "koi8"_s, "koi8_r"_s);
DECLARE_ALIASES(macintosh, "csmacintosh"_s, "mac"_s, "x-mac-roman"_s, "macroman"_s, "x-macroman"_s);
DECLARE_ALIASES(windows_1250, "cp1250"_s, "x-cp1250"_s, "winlatin2"_s);
DECLARE_ALIASES(windows_1251, "cp1251"_s, "wincyrillic"_s, "x-cp1251"_s);
DECLARE_ALIASES(windows_1254, "winturkish"_s, "cp1254"_s, "csisolatin5"_s, "iso-8859-9"_s, "iso-ir-148"_s, "iso8859-9"_s, "iso88599"_s, "iso_8859-9"_s, "iso_8859-9:1989"_s, "l5"_s, "latin5"_s, "x-cp1254"_s);
DECLARE_ALIASES(windows_1256, "winarabic"_s, "cp1256"_s, "x-cp1256"_s);
DECLARE_ALIASES(windows_1258, "winvietnamese"_s, "cp1258"_s, "x-cp1258"_s);
DECLARE_ALIASES(x_mac_cyrillic, "maccyrillic"_s, "x-mac-ukrainian"_s, "windows-10007"_s, "mac-cyrillic"_s, "maccy"_s, "x-MacCyrillic"_s, "x-MacUkraine"_s);
// Encodings below are not in the standard.
DECLARE_ALIASES(x_mac_greek, "windows-10006"_s, "macgr"_s, "x-MacGreek"_s);
DECLARE_ALIASES(x_mac_centraleurroman, "windows-10029"_s, "x-mac-ce"_s, "macce"_s, "maccentraleurope"_s, "x-MacCentralEurope"_s);
DECLARE_ALIASES(x_mac_turkish, "windows-10081"_s, "mactr"_s, "x-MacTurkish"_s);
// Builds one encodingNames entry: the canonical name plus a span over its <alias_array>_aliases array.
#define DECLARE_ENCODING_NAME(encoding, alias_array) \
{ \
encoding, std::span { alias_array##_aliases } \
}
// Builds one encodingNames entry for an encoding that has no aliases (empty span).
#define DECLARE_ENCODING_NAME_NO_ALIASES(encoding) \
{ \
encoding, {} \
}
// Table of every encoding handled by TextCodecICU. Both registerEncodingNames() and
// registerCodecs() iterate over it.
static const struct EncodingName {
ASCIILiteral name;
std::span<const ASCIILiteral> aliases;
} encodingNames[] = {
DECLARE_ENCODING_NAME("ISO-8859-2"_s, ISO_8859_2),
DECLARE_ENCODING_NAME("ISO-8859-4"_s, ISO_8859_4),
DECLARE_ENCODING_NAME("ISO-8859-5"_s, ISO_8859_5),
DECLARE_ENCODING_NAME("ISO-8859-10"_s, ISO_8859_10),
DECLARE_ENCODING_NAME("ISO-8859-13"_s, ISO_8859_13),
DECLARE_ENCODING_NAME("ISO-8859-14"_s, ISO_8859_14),
DECLARE_ENCODING_NAME("ISO-8859-15"_s, ISO_8859_15),
DECLARE_ENCODING_NAME("ISO-8859-16"_s, ISO_8859_16),
DECLARE_ENCODING_NAME("KOI8-R"_s, KOI8_R),
DECLARE_ENCODING_NAME("macintosh"_s, macintosh),
DECLARE_ENCODING_NAME("windows-1250"_s, windows_1250),
DECLARE_ENCODING_NAME("windows-1251"_s, windows_1251),
DECLARE_ENCODING_NAME("windows-1254"_s, windows_1254),
DECLARE_ENCODING_NAME("windows-1256"_s, windows_1256),
DECLARE_ENCODING_NAME("windows-1258"_s, windows_1258),
DECLARE_ENCODING_NAME("x-mac-cyrillic"_s, x_mac_cyrillic),
// Encodings below are not in the standard.
DECLARE_ENCODING_NAME("x-mac-greek"_s, x_mac_greek),
DECLARE_ENCODING_NAME("x-mac-centraleurroman"_s, x_mac_centraleurroman),
DECLARE_ENCODING_NAME("x-mac-turkish"_s, x_mac_turkish),
DECLARE_ENCODING_NAME_NO_ALIASES("EUC-TW"_s),
};
// Reports every encoding in encodingNames to `registrar`: first the canonical name mapped
// to itself, then each alias mapped to that canonical name.
void TextCodecICU::registerEncodingNames(EncodingNameRegistrar registrar)
{
    for (const auto& [canonicalName, aliases] : encodingNames) {
        registrar(canonicalName, canonicalName);
        for (const auto& alias : aliases)
            registrar(alias, canonicalName);
    }
}
// Registers a TextCodecICU factory with `registrar` for every entry in encodingNames.
// The ICU converter name for each encoding is resolved here, once, and captured by the
// factory lambda so that codec construction does not need to look it up again.
void TextCodecICU::registerCodecs(TextCodecRegistrar registrar)
{
for (auto& encodingName : encodingNames) {
ASCIILiteral name = encodingName.name;
UErrorCode error = U_ZERO_ERROR;
// First choice: the canonical converter name ICU associates with this name under the "IANA" standard.
const char* canonicalConverterName = ucnv_getCanonicalName(name, "IANA", &error);
ASSERT(U_SUCCESS(error));
if (!canonicalConverterName) {
// Fallback: open a converter by name and use the name that converter reports.
// NOTE(review): `converter` is closed at the end of this scope while the pointer obtained
// from ucnv_getName() is kept — confirm that the string outlives the converter.
auto converter = ICUConverterPtr { ucnv_open(name, &error) };
ASSERT(U_SUCCESS(error));
canonicalConverterName = ucnv_getName(converter.get(), &error);
ASSERT(U_SUCCESS(error));
if (!canonicalConverterName) {
// No usable converter name: skip this encoding (asserts in debug builds).
ASSERT_NOT_REACHED();
continue;
}
}
registrar(name, [name, canonicalConverterName] {
// ucnv_getCanonicalName() returns a static string owned by libicu so the call to
// ASCIILiteral::fromLiteralUnsafe() should be safe.
return makeUnique<TextCodecICU>(name, ASCIILiteral::fromLiteralUnsafe(canonicalConverterName));
});
}
}
// Stores the encoding name and the ICU converter name. No converter is opened here;
// decode() and encode() create one on first use through createICUConverter().
TextCodecICU::TextCodecICU(ASCIILiteral encoding, ASCIILiteral canonicalConverterName)
: m_encodingName(encoding)
, m_canonicalConverterName(canonicalConverterName)
{
ASSERT(!m_canonicalConverterName.isNull());
}
// If this codec opened a converter, resets it and parks it in the shared cache slot so
// that createICUConverter() can pick it up again later.
TextCodecICU::~TextCodecICU()
{
    if (!m_converter)
        return;

    ucnv_reset(m_converter.get());
    cachedConverterICU().converter = WTFMove(m_converter);
}
void TextCodecICU::createICUConverter() const
{
ASSERT(!m_converter);
auto& cachedConverter = cachedConverterICU().converter;
if (cachedConverter) {
UErrorCode error = U_ZERO_ERROR;
const char* cachedConverterName = ucnv_getName(cachedConverter.get(), &error);
if (U_SUCCESS(error) && !strcmp(m_canonicalConverterName, cachedConverterName)) {
m_converter = WTFMove(cachedConverter);
return;
}
}
UErrorCode error = U_ZERO_ERROR;
m_converter = ICUConverterPtr { ucnv_open(m_canonicalConverterName, &error) };
if (m_converter)
ucnv_setFallback(m_converter.get(), true);
}
// Runs one ucnv_toUnicode() pass from `sourceSpan` into `targetSpan`.
// `error` is reset to U_ZERO_ERROR before the call and carries ICU's status afterwards.
// On return `sourceSpan` has been advanced past the bytes ICU consumed.
// Returns the number of UChars written to the start of `targetSpan`.
int TextCodecICU::decodeToBuffer(std::span<UChar> targetSpan, std::span<const uint8_t>& sourceSpan, int32_t* offsets, bool flush, UErrorCode& error)
{
    error = U_ZERO_ERROR;

    UChar* const outputBegin = targetSpan.data();
    UChar* outputCursor = outputBegin;
    UChar* const outputEnd = std::to_address(targetSpan.end());

    const char* inputCursor = byteCast<char>(sourceSpan.data());
    const char* const inputEnd = byteCast<char>(std::to_address(sourceSpan.end()));

    ucnv_toUnicode(m_converter.get(), &outputCursor, outputEnd, &inputCursor, inputEnd, offsets, flush, &error);

    // ICU moved inputCursor forward; drop the same number of bytes from the caller's span.
    skip(sourceSpan, byteCast<uint8_t>(inputCursor) - sourceSpan.data());
    return outputCursor - outputBegin;
}
class ErrorCallbackSetter {
public:
ErrorCallbackSetter(UConverter& converter, bool stopOnError)
: m_converter(converter)
, m_shouldStopOnEncodingErrors(stopOnError)
{
if (m_shouldStopOnEncodingErrors) {
UErrorCode err = U_ZERO_ERROR;
ucnv_setToUCallBack(&m_converter, UCNV_TO_U_CALLBACK_SUBSTITUTE, UCNV_SUB_STOP_ON_ILLEGAL, &m_savedAction, &m_savedContext, &err);
ASSERT(U_SUCCESS(err));
}
}
~ErrorCallbackSetter()
{
if (m_shouldStopOnEncodingErrors) {
UErrorCode err = U_ZERO_ERROR;
const void* oldContext;
UConverterToUCallback oldAction;
ucnv_setToUCallBack(&m_converter, m_savedAction, m_savedContext, &oldAction, &oldContext, &err);
ASSERT(oldAction == UCNV_TO_U_CALLBACK_SUBSTITUTE);
ASSERT(!strcmp(static_cast<const char*>(oldContext), UCNV_SUB_STOP_ON_ILLEGAL));
ASSERT(U_SUCCESS(err));
}
}
private:
UConverter& m_converter;
bool m_shouldStopOnEncodingErrors;
const void* m_savedContext { nullptr };
UConverterToUCallback m_savedAction { nullptr };
};
// Decodes `source` with the ICU converter for this codec and returns the decoded text.
// The converter is created on first use; if that fails, `sawError` is set and a null
// String is returned. `flush` is forwarded to ICU, and `stopOnError` installs the
// stop-on-illegal callback for the duration of the call. When ICU reports a failure,
// the converter is drained with flush enabled and `sawError` is set to true.
String TextCodecICU::decode(std::span<const uint8_t> source, bool flush, bool stopOnError, bool& sawError)
{
    // Get a converter for the passed-in encoding.
    if (!m_converter)
        createICUConverter();
    if (!m_converter) {
        LOG_ERROR("error creating ICU encoder even though encoding was in table");
        sawError = true;
        return {};
    }

    ErrorCallbackSetter scopedCallback(*m_converter, stopOnError);

    StringBuilder decoded;
    std::array<UChar, ConversionBufferSize> scratch;
    const auto scratchSpan = std::span { scratch };
    UErrorCode status = U_ZERO_ERROR;

    // Convert one scratch buffer at a time until ICU no longer asks for more output room.
    for (;;) {
        size_t producedCount = decodeToBuffer(scratchSpan, source, nullptr, flush, status);
        decoded.append(scratchSpan.first(producedCount));
        if (!needsToGrowToProduceBuffer(status))
            break;
    }

    if (U_FAILURE(status)) {
        // flush the converter so it can be reused, and not be bothered by this error.
        do {
            decodeToBuffer(scratchSpan, source, nullptr, true, status);
        } while (!source.empty());
        sawError = true;
    }

    return decoded.toString();
}
// Invalid character handler when writing escaped entities for unrepresentable
// characters. See the declaration of TextCodec::encode for more.
static void urlEscapedEntityCallback(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length,
UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* error)
{
if (reason == UCNV_UNASSIGNED) {
*error = U_ZERO_ERROR;
UnencodableReplacementArray entity;
auto span = TextCodec::getUnencodableReplacement(codePoint, UnencodableHandling::URLEncodedEntities, entity);
ucnv_cbFromUWriteBytes(fromUArgs, span.data(), span.size(), 0, error);
} else
UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, error);
}
// Encodes `string` with the ICU converter for this codec and returns the resulting bytes.
// Returns an empty vector when the input is empty, when no converter can be created, or
// when installing the from-Unicode callback fails. `handling` selects how unencodable
// characters are written: XML decimal entities, or URL-encoded entities via
// urlEscapedEntityCallback.
Vector<uint8_t> TextCodecICU::encode(StringView string, UnencodableHandling handling) const
{
if (string.isEmpty())
return {};
// Create the converter lazily, as decode() does.
if (!m_converter) {
createICUConverter();
if (!m_converter)
return {};
}
// FIXME: We should see if there is "force ASCII range" mode in ICU;
// until then, we change the backslash into a yen sign.
// Encoding will change the yen sign back into a backslash.
// `copy` owns the modified text; `string` is re-pointed at it for the rest of the function.
String copy;
if (shouldShowBackslashAsCurrencySymbolIn(m_encodingName)) {
copy = makeStringByReplacingAll(string, '\\', yenSign);
string = copy;
}
// `error` is assigned in each case below before it is read.
UErrorCode error;
switch (handling) {
case UnencodableHandling::Entities:
error = U_ZERO_ERROR;
ucnv_setFromUCallBack(m_converter.get(), UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &error);
if (U_FAILURE(error))
return {};
break;
case UnencodableHandling::URLEncodedEntities:
error = U_ZERO_ERROR;
ucnv_setFromUCallBack(m_converter.get(), urlEscapedEntityCallback, 0, 0, 0, &error);
if (U_FAILURE(error))
return {};
break;
}
// ICU consumes UTF-16, so obtain 16-bit characters for the whole input.
auto upconvertedCharacters = string.upconvertedCharacters();
auto source = upconvertedCharacters.span().data();
auto* sourceLimit = std::to_address(upconvertedCharacters.span().end());
Vector<uint8_t> result;
// Convert through a fixed-size stack buffer, appending each filled portion, until ICU
// no longer asks for more output room. The final status is not inspected after the loop.
do {
std::array<char, ConversionBufferSize> buffer;
char* target = buffer.data();
char* targetLimit = std::to_address(std::span { buffer }.end());
error = U_ZERO_ERROR;
ucnv_fromUnicode(m_converter.get(), &target, targetLimit, &source, sourceLimit, 0, true, &error);
result.append(byteCast<uint8_t>(std::span(buffer)).first(target - buffer.data()));
} while (needsToGrowToProduceBuffer(error));
return result;
}
} // namespace PAL

View File

@@ -0,0 +1,69 @@
/*
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "TextCodec.h"
#include "unicode-ucnv.h"
#include <wtf/TZoneMalloc.h>
#include <wtf/text/ASCIILiteral.h>
#include <wtf/unicode/icu/ICUHelpers.h>
namespace PAL {
using ICUConverterPtr = std::unique_ptr<UConverter, ICUDeleter<ucnv_close>>;
// TextCodec implementation backed by an ICU UConverter. The converter is opened lazily
// and, when the codec is destroyed, handed to a cache slot for reuse.
class TextCodecICU final : public TextCodec {
WTF_MAKE_TZONE_ALLOCATED(TextCodecICU);
public:
// Static registration entry points for the encodings this codec handles.
static void registerEncodingNames(EncodingNameRegistrar);
static void registerCodecs(TextCodecRegistrar);
// `encoding` is the registered encoding name; `canonicalConverterName` is the name used to open the ICU converter.
explicit TextCodecICU(ASCIILiteral encoding, ASCIILiteral canonicalConverterName);
virtual ~TextCodecICU();
private:
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
// Fills m_converter, either from the cache slot or by opening a new converter.
void createICUConverter() const;
// NOTE(review): no definition of releaseICUConverter() appears in TextCodecICU.cpp — confirm whether this declaration is still needed.
void releaseICUConverter() const;
// Single ucnv_toUnicode() pass; advances `source` and returns the number of UChars written to `buffer`.
int decodeToBuffer(std::span<UChar> buffer, std::span<const uint8_t>& source, int32_t* offsets, bool flush, UErrorCode&);
ASCIILiteral m_encodingName;
ASCIILiteral const m_canonicalConverterName;
// Mutable because the const members createICUConverter() and encode() populate it on demand.
mutable ICUConverterPtr m_converter;
};
// Holder for a single ICU converter parked for reuse: ~TextCodecICU() moves its converter
// in, and TextCodecICU::createICUConverter() takes it back out when the names match.
struct ICUConverterWrapper {
WTF_MAKE_STRUCT_FAST_ALLOCATED;
ICUConverterPtr converter;
};
} // namespace PAL

View File

@@ -0,0 +1,256 @@
/*
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "TextCodecLatin1.h"
#include "TextCodecASCIIFastPath.h"
#include <array>
#include <wtf/text/CString.h>
#include "ParsingUtilities-removeAfterWebKitUpgrade.h"
#include <wtf/text/WTFString.h>
namespace PAL {
// Maps each windows-1252 byte value (the array index) to its UTF-16 code unit.
// Bytes 00-7F and A0-FF map to themselves. Within 80-9F most entries map to code points
// above U+00FF, which is why decode() may have to switch to a 16-bit string; 81, 8D, 8F,
// 90 and 9D map to themselves.
static constexpr std::array<UChar, 256> latin1ConversionTable = {
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, // 00-07
0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, // 10-17
0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, // 18-1F
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 20-27
0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // 28-2F
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 30-37
0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // 38-3F
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 40-47
0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // 48-4F
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 50-57
0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // 58-5F
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 60-67
0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // 68-6F
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 70-77
0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // 78-7F
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, // A0-A7
0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, // A8-AF
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, // B0-B7
0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, // B8-BF
0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, // C0-C7
0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, // C8-CF
0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, // D0-D7
0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, // D8-DF
0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, // E0-E7
0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, // E8-EF
0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, // F0-F7
0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF // F8-FF
};
// Registers "windows-1252" and all of its labels with `registrar`; every label maps to
// the canonical name "windows-1252". Labels are reported in the order listed below.
void TextCodecLatin1::registerEncodingNames(EncodingNameRegistrar registrar)
{
    constexpr ASCIILiteral canonicalName = "windows-1252"_s;

    // From https://encoding.spec.whatwg.org.
    static constexpr ASCIILiteral labels[] = {
        "windows-1252"_s,
        "ansi_x3.4-1968"_s,
        "ascii"_s,
        "cp1252"_s,
        "cp819"_s,
        "csisolatin1"_s,
        "ibm819"_s,
        "iso-8859-1"_s,
        "iso-ir-100"_s,
        "iso8859-1"_s,
        "iso88591"_s,
        "iso_8859-1"_s,
        "iso_8859-1:1987"_s,
        "l1"_s,
        "latin1"_s,
        "us-ascii"_s,
        "x-cp1252"_s,
    };

    for (auto label : labels)
        registrar(label, canonicalName);
}
// Registers the factory that creates TextCodecLatin1 instances for "windows-1252".
void TextCodecLatin1::registerCodecs(TextCodecRegistrar registrar)
{
    NewTextCodecFunction makeCodec([]() -> std::unique_ptr<TextCodec> {
        return makeUnique<TextCodecLatin1>();
    });
    registrar("windows-1252"_s, WTFMove(makeCodec));
}
WTF_ALLOW_UNSAFE_BUFFER_USAGE_BEGIN
// Decodes windows-1252 bytes into a String; the flush and stopOnError arguments are ignored.
// Decoding starts into an 8-bit string. As soon as a byte maps (via latin1ConversionTable)
// to a code unit that is not Latin-1, the work done so far is copied into a 16-bit string
// and decoding continues there. Runs of ASCII are copied a machine word at a time when
// the source pointer is word-aligned.
// Returns the empty string for empty input. Input longer than the unsigned range sets
// `sawException` and also returns the empty string.
String TextCodecLatin1::decode(std::span<const uint8_t> bytes, bool, bool, bool& sawException)
{
std::span<LChar> characters;
if (bytes.empty())
return emptyString();
if (UNLIKELY(bytes.size() > std::numeric_limits<unsigned>::max())) {
ASSERT_NOT_REACHED();
sawException = true;
return emptyString();
}
// 8-bit result; `characters` receives the span over its uninitialized storage.
String result = String::createUninitialized(bytes.size(), characters);
auto source = bytes;
// Bound for the word-at-a-time loop, derived from the end of the source by alignToMachineWord.
const uint8_t* alignedEnd = WTF::alignToMachineWord(std::to_address(source.end()));
auto destination = characters;
while (!source.empty()) {
if (isASCII(source[0])) {
// Fast path for ASCII. Most Latin-1 text will be ASCII.
if (WTF::isAlignedToMachineWord(source.data())) {
while (source.data() < alignedEnd) {
auto chunk = reinterpretCastSpanStartTo<WTF::MachineWord>(source);
// A non-ASCII byte somewhere in this word: handle the current byte through the table.
if (!WTF::containsOnlyASCII<LChar>(chunk))
goto useLookupTable;
copyASCIIMachineWord(destination, source);
skip(source, sizeof(WTF::MachineWord));
skip(destination, sizeof(WTF::MachineWord));
}
if (source.empty())
break;
// *source may not be ASCII anymore if source moves inside the loop of the fast code path
if (!isASCII(source[0]))
goto useLookupTable;
}
destination[0] = source[0];
} else {
useLookupTable:
auto sourceCharacter = source[0];
// The mapped value does not fit in 8 bits: restart output as a 16-bit string.
if (!isLatin1(latin1ConversionTable[sourceCharacter]))
goto upConvertTo16Bit;
destination[0] = latin1ConversionTable[sourceCharacter];
}
skip(source, 1);
skip(destination, 1);
}
return result;
upConvertTo16Bit:
// 16-bit result of the same length; everything decoded so far is carried over below.
std::span<UChar> characters16;
String result16 = String::createUninitialized(bytes.size(), characters16);
auto destination16 = characters16;
// Zero extend and copy already processed 8 bit data
LChar* ptr8 = characters.data();
LChar* endPtr8 = destination.data();
while (ptr8 < endPtr8)
consume(destination16) = *ptr8++;
// Handle the character that triggered the 16 bit path
consume(destination16) = latin1ConversionTable[consume(source)];
// Same loop as above, writing 16-bit code units; no further up-conversion is needed.
while (!source.empty()) {
if (isASCII(source[0])) {
// Fast path for ASCII. Most Latin-1 text will be ASCII.
if (WTF::isAlignedToMachineWord(source.data())) {
while (source.data() < alignedEnd) {
auto chunk = reinterpretCastSpanStartTo<WTF::MachineWord>(source);
if (!WTF::containsOnlyASCII<LChar>(chunk))
goto useLookupTable16;
copyASCIIMachineWord(destination16, source);
skip(source, sizeof(WTF::MachineWord));
skip(destination16, sizeof(WTF::MachineWord));
}
if (source.empty())
break;
// *source may not be ASCII anymore if source moves inside the loop of the fast code path
if (!isASCII(source[0]))
goto useLookupTable16;
}
destination16[0] = source[0];
} else {
useLookupTable16:
destination16[0] = latin1ConversionTable[source[0]];
}
skip(source, 1);
skip(destination16, 1);
}
return result16;
}
WTF_ALLOW_UNSAFE_BUFFER_USAGE_END
// Slow-path encoder used when the input is not pure ASCII.
//
// For each code point of `string`:
//   - 0x00-0x7F and 0xA0-0xFF are emitted as the identical byte value;
//   - anything else is searched for in latin1ConversionTable entries 0x80-0x9F
//     and, if found, emitted as that table index;
//   - otherwise the replacement produced by TextCodec::getUnencodableReplacement()
//     for `handling` is appended instead.
static Vector<uint8_t> encodeComplexWindowsLatin1(StringView string, UnencodableHandling handling)
{
    Vector<uint8_t> encoded;
    for (auto codePoint : string.codePoints()) {
        uint8_t byte = codePoint;
        // True when the code point survives truncation to a byte and is outside 0x80-0x9F.
        const bool mapsToItself = byte == codePoint && (codePoint & 0xE0) != 0x80;
        if (!mapsToItself) {
            // Look for a byte in 0x80-0x9F whose Windows Latin-1 mapping is this code point.
            bool foundInTable = false;
            for (byte = 0x80; byte < 0xA0; ++byte) {
                if (latin1ConversionTable[byte] == codePoint) {
                    foundInTable = true;
                    break;
                }
            }
            if (!foundInTable) {
                // No way to encode this character with Windows Latin-1.
                UnencodableReplacementArray replacement;
                encoded.append(TextCodec::getUnencodableReplacement(codePoint, handling, replacement));
                continue;
            }
        }
        encoded.append(byte);
    }
    return encoded;
}
// Encodes `string` to bytes.
//
// A speculative first pass truncates every code unit to a byte while OR-ing all
// code units together. If no bit above 0x7F was ever set, the input was pure
// ASCII and the truncated bytes are returned directly. Otherwise the
// speculative buffer is discarded and encodeComplexWindowsLatin1() does the work.
Vector<uint8_t> TextCodecLatin1::encode(StringView string, UnencodableHandling handling) const
{
    {
        // Inner scope: the speculative buffer is destroyed before the fallback is called.
        Vector<uint8_t> asciiBytes(string.length());
        UChar seenBits = 0;
        size_t writeIndex = 0;
        for (auto codeUnit : string.codeUnits()) {
            seenBits |= codeUnit;
            asciiBytes[writeIndex++] = codeUnit;
        }
        const bool allASCII = !(seenBits & 0xFF80);
        if (allASCII)
            return asciiBytes;
    }
    // Not all ASCII: use the function that handles the more complex cases.
    return encodeComplexWindowsLatin1(string, handling);
}
} // namespace PAL

View File

@@ -0,0 +1,42 @@
/*
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "TextCodec.h"
namespace PAL {
// TextCodec implementation whose factory is registered under the canonical
// name "windows-1252" (see registerCodecs in the implementation file).
class TextCodecLatin1 final : public TextCodec {
public:
// Registers the labels handled by this codec with the given registrar.
static void registerEncodingNames(EncodingNameRegistrar);
// Registers the factory that creates TextCodecLatin1 instances.
static void registerCodecs(TextCodecRegistrar);
private:
// The implementation ignores flush and stopOnError.
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
};
} // namespace PAL

View File

@@ -0,0 +1,70 @@
/*
* Copyright (C) 2016-2017 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "TextCodecReplacement.h"
#include <wtf/Function.h>
#include <wtf/TZoneMallocInlines.h>
#include <wtf/text/WTFString.h>
#include <wtf/unicode/CharacterNames.h>
namespace PAL {
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecReplacement);
// Registers every label that resolves to the "replacement" encoding.
// The canonical name itself is registered first, followed by its aliases.
void TextCodecReplacement::registerEncodingNames(EncodingNameRegistrar registrar)
{
    const ASCIILiteral canonicalName = "replacement"_s;
    const ASCIILiteral labels[] = {
        "replacement"_s,
        "csiso2022kr"_s,
        "hz-gb-2312"_s,
        "iso-2022-cn"_s,
        "iso-2022-cn-ext"_s,
        "iso-2022-kr"_s,
    };
    for (auto label : labels)
        registrar(label, canonicalName);
}
// Registers the factory for the canonical codec name "replacement".
// Each invocation of the factory returns a newly allocated TextCodecReplacement,
// so the per-instance m_sentEOF flag starts out false for every codec.
void TextCodecReplacement::registerCodecs(TextCodecRegistrar registrar)
{
registrar("replacement"_s, [] {
return makeUnique<TextCodecReplacement>();
});
}
// Ignores the input bytes and both bool flags.
//
// Every call sets `sawError` to true. The first call on a given instance
// returns a string holding only replacementCharacter; every later call returns
// the empty string.
// NOTE(review): sawError is raised even for calls after the first and
// regardless of whether any bytes were supplied - confirm callers expect that.
String TextCodecReplacement::decode(std::span<const uint8_t>, bool, bool, bool& sawError)
{
    sawError = true;
    if (!m_sentEOF) {
        m_sentEOF = true;
        return span(replacementCharacter);
    }
    return emptyString();
}
// Encoding with the replacement codec is delegated unchanged to
// TextCodecUTF8::encodeUTF8(); the UnencodableHandling argument is ignored.
Vector<uint8_t> TextCodecReplacement::encode(StringView string, UnencodableHandling) const
{
return TextCodecUTF8::encodeUTF8(string);
}
} // namespace PAL

View File

@@ -0,0 +1,46 @@
/*
* Copyright (C) 2016-2020 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "TextCodecUTF8.h"
#include <wtf/TZoneMalloc.h>
namespace PAL {
// TextCodec registered under the canonical name "replacement".
// Its decode() never looks at the input bytes, and its encode() forwards to
// the UTF-8 encoder.
class TextCodecReplacement final : public TextCodec {
WTF_MAKE_TZONE_ALLOCATED(TextCodecReplacement);
public:
// Registers "replacement" and its aliases with the given registrar.
static void registerEncodingNames(EncodingNameRegistrar);
// Registers the factory that creates TextCodecReplacement instances.
static void registerCodecs(TextCodecRegistrar);
private:
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
// Set by the first decode() call; once true, decode() returns the empty string.
bool m_sentEOF { false };
};
} // namespace PAL

View File

@@ -0,0 +1,467 @@
/*
* Copyright (C) 2020 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "TextCodecSingleByte.h"
#include "EncodingTables.h"
#include <array>
#include <mutex>
#include <wtf/IteratorRange.h>
#include <wtf/NeverDestroyed.h>
#include <wtf/TZoneMallocInlines.h>
#include <wtf/text/CodePointIterator.h>
#include <wtf/text/StringBuilder.h>
#include <wtf/unicode/CharacterNames.h>
namespace PAL {
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecSingleByte);
// Identifies which single-byte mapping a TextCodecSingleByte instance uses.
// tableForDecoding() and tableForEncoding() switch on this value to pick the
// matching table, so a new enumerator needs a case in both.
enum class TextCodecSingleByte::Encoding : uint8_t {
ISO_8859_3,
ISO_8859_6,
ISO_8859_7,
ISO_8859_8,
Windows_874,
Windows_1253,
Windows_1255,
Windows_1257,
IBM866,
KOI8U,
};
// Decode direction: 128 UTF-16 code units, indexed by (byte - 0x80), i.e. one
// entry for each non-ASCII byte value 0x80-0xFF.
using SingleByteDecodeTable = std::array<UChar, 128>;
// Encode direction: one (code unit, byte) pair per mapped byte.
using SingleByteEncodeTableEntry = std::pair<UChar, uint8_t>;
// Non-owning view over encode entries; the tables built in this file are sorted by code unit.
using SingleByteEncodeTable = std::span<const SingleByteEncodeTableEntry>;
// From https://encoding.spec.whatwg.org/index-iso-8859-3.txt with 0xFFFD filling the gaps
// Selected by Encoding::ISO_8859_3. Entry i is the code unit for byte 0x80 + i
// (one row per high nibble 0x8_ .. 0xF_); decoding a 0xFFFD entry sets sawError.
static constexpr SingleByteDecodeTable iso88593 {
0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
0x00A0, 0x0126, 0x02D8, 0x00A3, 0x00A4, 0xFFFD, 0x0124, 0x00A7, 0x00A8, 0x0130, 0x015E, 0x011E, 0x0134, 0x00AD, 0xFFFD, 0x017B,
0x00B0, 0x0127, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x0125, 0x00B7, 0x00B8, 0x0131, 0x015F, 0x011F, 0x0135, 0x00BD, 0xFFFD, 0x017C,
0x00C0, 0x00C1, 0x00C2, 0xFFFD, 0x00C4, 0x010A, 0x0108, 0x00C7, 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
0xFFFD, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x0120, 0x00D6, 0x00D7, 0x011C, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x016C, 0x015C, 0x00DF,
0x00E0, 0x00E1, 0x00E2, 0xFFFD, 0x00E4, 0x010B, 0x0109, 0x00E7, 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
0xFFFD, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x0121, 0x00F6, 0x00F7, 0x011D, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x016D, 0x015D, 0x02D9
};
// From https://encoding.spec.whatwg.org/index-iso-8859-6.txt with 0xFFFD filling the gaps
// Selected by Encoding::ISO_8859_6. Entry i is the code unit for byte 0x80 + i;
// 0xFFFD entries are bytes with no mapping and are left out of the encode table.
static constexpr SingleByteDecodeTable iso88596 {
0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
0x00A0, 0xFFFD, 0xFFFD, 0xFFFD, 0x00A4, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x060C, 0x00AD, 0xFFFD, 0xFFFD,
0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x061B, 0xFFFD, 0xFFFD, 0xFFFD, 0x061F,
0xFFFD, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627, 0x0628, 0x0629, 0x062A, 0x062B, 0x062C, 0x062D, 0x062E, 0x062F,
0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637, 0x0638, 0x0639, 0x063A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, 0x0648, 0x0649, 0x064A, 0x064B, 0x064C, 0x064D, 0x064E, 0x064F,
0x0650, 0x0651, 0x0652, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD
};
// From https://encoding.spec.whatwg.org/index-iso-8859-7.txt with 0xFFFD filling the gaps
// Selected by Encoding::ISO_8859_7. Entry i is the code unit for byte 0x80 + i;
// 0xFFFD marks a byte with no mapping.
static constexpr SingleByteDecodeTable iso88597 {
0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
0x00A0, 0x2018, 0x2019, 0x00A3, 0x20AC, 0x20AF, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x037A, 0x00AB, 0x00AC, 0x00AD, 0xFFFD, 0x2015,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x0385, 0x0386, 0x00B7, 0x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E, 0x038F,
0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
0x03A0, 0x03A1, 0xFFFD, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0xFFFD
};
// From https://encoding.spec.whatwg.org/index-iso-8859-8.txt with 0xFFFD filling the gaps
// Selected by Encoding::ISO_8859_8, which registerCodecs() uses for both the
// "ISO-8859-8" and "ISO-8859-8-I" codec names. Entry i is the code unit for
// byte 0x80 + i; 0xFFFD marks a byte with no mapping.
static constexpr SingleByteDecodeTable iso88598 {
0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
0x00A0, 0xFFFD, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0xFFFD,
0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x2017,
0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA, 0xFFFD, 0xFFFD, 0x200E, 0x200F, 0xFFFD
};
// From https://encoding.spec.whatwg.org/index-windows-874.txt with 0xFFFD filling the gaps
// Selected by Encoding::Windows_874. Entry i is the code unit for byte 0x80 + i;
// 0xFFFD marks a byte with no mapping.
static constexpr SingleByteDecodeTable windows874 {
0x20AC, 0x0081, 0x0082, 0x0083, 0x0084, 0x2026, 0x0086, 0x0087, 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
0x00A0, 0x0E01, 0x0E02, 0x0E03, 0x0E04, 0x0E05, 0x0E06, 0x0E07, 0x0E08, 0x0E09, 0x0E0A, 0x0E0B, 0x0E0C, 0x0E0D, 0x0E0E, 0x0E0F,
0x0E10, 0x0E11, 0x0E12, 0x0E13, 0x0E14, 0x0E15, 0x0E16, 0x0E17, 0x0E18, 0x0E19, 0x0E1A, 0x0E1B, 0x0E1C, 0x0E1D, 0x0E1E, 0x0E1F,
0x0E20, 0x0E21, 0x0E22, 0x0E23, 0x0E24, 0x0E25, 0x0E26, 0x0E27, 0x0E28, 0x0E29, 0x0E2A, 0x0E2B, 0x0E2C, 0x0E2D, 0x0E2E, 0x0E2F,
0x0E30, 0x0E31, 0x0E32, 0x0E33, 0x0E34, 0x0E35, 0x0E36, 0x0E37, 0x0E38, 0x0E39, 0x0E3A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x0E3F,
0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0x0E45, 0x0E46, 0x0E47, 0x0E48, 0x0E49, 0x0E4A, 0x0E4B, 0x0E4C, 0x0E4D, 0x0E4E, 0x0E4F,
0x0E50, 0x0E51, 0x0E52, 0x0E53, 0x0E54, 0x0E55, 0x0E56, 0x0E57, 0x0E58, 0x0E59, 0x0E5A, 0x0E5B, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD
};
// From https://encoding.spec.whatwg.org/index-windows-1253.txt with 0xFFFD filling the gaps
// Selected by Encoding::Windows_1253. Entry i is the code unit for byte 0x80 + i;
// 0xFFFD marks a byte with no mapping.
static constexpr SingleByteDecodeTable windows1253 {
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x0088, 0x2030, 0x008A, 0x2039, 0x008C, 0x008D, 0x008E, 0x008F,
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x0098, 0x2122, 0x009A, 0x203A, 0x009C, 0x009D, 0x009E, 0x009F,
0x00A0, 0x0385, 0x0386, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0xFFFD, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x2015,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x0384, 0x00B5, 0x00B6, 0x00B7, 0x0388, 0x0389, 0x038A, 0x00BB, 0x038C, 0x00BD, 0x038E, 0x038F,
0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x0398, 0x0399, 0x039A, 0x039B, 0x039C, 0x039D, 0x039E, 0x039F,
0x03A0, 0x03A1, 0xFFFD, 0x03A3, 0x03A4, 0x03A5, 0x03A6, 0x03A7, 0x03A8, 0x03A9, 0x03AA, 0x03AB, 0x03AC, 0x03AD, 0x03AE, 0x03AF,
0x03B0, 0x03B1, 0x03B2, 0x03B3, 0x03B4, 0x03B5, 0x03B6, 0x03B7, 0x03B8, 0x03B9, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BE, 0x03BF,
0x03C0, 0x03C1, 0x03C2, 0x03C3, 0x03C4, 0x03C5, 0x03C6, 0x03C7, 0x03C8, 0x03C9, 0x03CA, 0x03CB, 0x03CC, 0x03CD, 0x03CE, 0xFFFD
};
// From https://encoding.spec.whatwg.org/index-windows-1255.txt with 0xFFFD filling the gaps
// Selected by Encoding::Windows_1255. Entry i is the code unit for byte 0x80 + i.
static constexpr SingleByteDecodeTable windows1255 {
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x02C6, 0x2030, 0x008A, 0x2039, 0x008C, 0x008D, 0x008E, 0x008F,
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x02DC, 0x2122, 0x009A, 0x203A, 0x009C, 0x009D, 0x009E, 0x009F,
0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x20AA, 0x00A5, 0x00A6, 0x00A7, 0x00A8, 0x00A9, 0x00D7, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00B8, 0x00B9, 0x00F7, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF,
0x05B0, 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, 0x05B8, 0x05B9, 0x05BA, 0x05BB, 0x05BC, 0x05BD, 0x05BE, 0x05BF,
0x05C0, 0x05C1, 0x05C2, 0x05C3, 0x05F0, 0x05F1, 0x05F2, 0x05F3, 0x05F4, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5, 0x05D6, 0x05D7, 0x05D8, 0x05D9, 0x05DA, 0x05DB, 0x05DC, 0x05DD, 0x05DE, 0x05DF,
0x05E0, 0x05E1, 0x05E2, 0x05E3, 0x05E4, 0x05E5, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA, 0xFFFD, 0xFFFD, 0x200E, 0x200F, 0xFFFD
};
// From https://encoding.spec.whatwg.org/index-windows-1257.txt with 0xFFFD filling the gaps
// Selected by Encoding::Windows_1257. Entry i is the code unit for byte 0x80 + i;
// 0xFFFD marks a byte with no mapping.
static constexpr SingleByteDecodeTable windows1257 {
0x20AC, 0x0081, 0x201A, 0x0083, 0x201E, 0x2026, 0x2020, 0x2021, 0x0088, 0x2030, 0x008A, 0x2039, 0x008C, 0x00A8, 0x02C7, 0x00B8,
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x0098, 0x2122, 0x009A, 0x203A, 0x009C, 0x00AF, 0x02DB, 0x009F,
0x00A0, 0xFFFD, 0x00A2, 0x00A3, 0x00A4, 0xFFFD, 0x00A6, 0x00A7, 0x00D8, 0x00A9, 0x0156, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00C6,
0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, 0x00F8, 0x00B9, 0x0157, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00E6,
0x0104, 0x012E, 0x0100, 0x0106, 0x00C4, 0x00C5, 0x0118, 0x0112, 0x010C, 0x00C9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012A, 0x013B,
0x0160, 0x0143, 0x0145, 0x00D3, 0x014C, 0x00D5, 0x00D6, 0x00D7, 0x0172, 0x0141, 0x015A, 0x016A, 0x00DC, 0x017B, 0x017D, 0x00DF,
0x0105, 0x012F, 0x0101, 0x0107, 0x00E4, 0x00E5, 0x0119, 0x0113, 0x010D, 0x00E9, 0x017A, 0x0117, 0x0123, 0x0137, 0x012B, 0x013C,
0x0161, 0x0144, 0x0146, 0x00F3, 0x014D, 0x00F5, 0x00F6, 0x00F7, 0x0173, 0x0142, 0x015B, 0x016B, 0x00FC, 0x017C, 0x017E, 0x02D9
};
// From https://encoding.spec.whatwg.org/index-koi8-u.txt
// Selected by Encoding::KOI8U. Entry i is the code unit for byte 0x80 + i.
// Every byte is mapped: the table contains no 0xFFFD entries.
static constexpr SingleByteDecodeTable koi8u {
0x2500, 0x2502, 0x250C, 0x2510, 0x2514, 0x2518, 0x251C, 0x2524, 0x252C, 0x2534, 0x253C, 0x2580, 0x2584, 0x2588, 0x258C, 0x2590,
0x2591, 0x2592, 0x2593, 0x2320, 0x25A0, 0x2219, 0x221A, 0x2248, 0x2264, 0x2265, 0x00A0, 0x2321, 0x00B0, 0x00B2, 0x00B7, 0x00F7,
0x2550, 0x2551, 0x2552, 0x0451, 0x0454, 0x2554, 0x0456, 0x0457, 0x2557, 0x2558, 0x2559, 0x255A, 0x255B, 0x0491, 0x045E, 0x255E,
0x255F, 0x2560, 0x2561, 0x0401, 0x0404, 0x2563, 0x0406, 0x0407, 0x2566, 0x2567, 0x2568, 0x2569, 0x256A, 0x0490, 0x040E, 0x00A9,
0x044E, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, 0x0445, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E,
0x043F, 0x044F, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, 0x044C, 0x044B, 0x0437, 0x0448, 0x044D, 0x0449, 0x0447, 0x044A,
0x042E, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, 0x0425, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E,
0x041F, 0x042F, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, 0x042C, 0x042B, 0x0417, 0x0428, 0x042D, 0x0429, 0x0427, 0x042A
};
// From https://encoding.spec.whatwg.org/index-ibm866.txt
// Selected by Encoding::IBM866. Entry i is the code unit for byte 0x80 + i.
// Every byte is mapped: the table contains no 0xFFFD entries.
static constexpr SingleByteDecodeTable ibm866 {
0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, 0x041F,
0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042A, 0x042B, 0x042C, 0x042D, 0x042E, 0x042F,
0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, 0x043F,
0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F, 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B, 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044A, 0x044B, 0x044C, 0x044D, 0x044E, 0x044F,
0x0401, 0x0451, 0x0404, 0x0454, 0x0407, 0x0457, 0x040E, 0x045E, 0x00B0, 0x2219, 0x00B7, 0x221A, 0x2116, 0x00A4, 0x25A0, 0x00A0
};
// Returns the encode table (code unit -> byte) that inverts `decodeTable`.
//
// One table exists per template instantiation. It is built on the first call,
// guarded by std::call_once, and is never freed. Entries equal to
// replacementCharacter in the decode table are omitted, and the result is
// sorted by code unit so it can be searched.
template<const SingleByteDecodeTable& decodeTable> SingleByteEncodeTable tableForEncoding()
{
    // Allocate this at runtime because building it at compile time would make the binary much larger and this is often not used.
    static constexpr auto size = std::size(decodeTable) - std::count(std::begin(decodeTable), std::end(decodeTable), replacementCharacter);
    using EntryArray = std::array<SingleByteEncodeTableEntry, size>;
    static const EntryArray* entries;
    static std::once_flag once;
    std::call_once(once, [&] {
        auto* built = new EntryArray();
        size_t used = 0;
        for (size_t offset = 0; offset < std::size(decodeTable); ++offset) {
            // Gaps in the decode table have no byte to encode to.
            if (decodeTable[offset] == replacementCharacter)
                continue;
            // Decode-table index `offset` corresponds to byte value 0x80 + offset.
            (*built)[used++] = { decodeTable[offset], offset + 0x80 };
        }
        ASSERT(used == size);
        auto collection = std::span { *built };
        sortByFirst(collection);
        ASSERT(sortedFirstsAreUnique(collection));
        entries = built;
    });
    return std::span { *entries };
}
// Maps a runtime Encoding value to the encode table built from the matching
// decode table. Each case instantiates the tableForEncoding<> template with
// the same table that tableForDecoding() returns for that enumerator.
static SingleByteEncodeTable tableForEncoding(TextCodecSingleByte::Encoding encoding)
{
switch (encoding) {
case TextCodecSingleByte::Encoding::ISO_8859_3:
return tableForEncoding<iso88593>();
case TextCodecSingleByte::Encoding::ISO_8859_6:
return tableForEncoding<iso88596>();
case TextCodecSingleByte::Encoding::ISO_8859_7:
return tableForEncoding<iso88597>();
case TextCodecSingleByte::Encoding::ISO_8859_8:
return tableForEncoding<iso88598>();
case TextCodecSingleByte::Encoding::Windows_874:
return tableForEncoding<windows874>();
case TextCodecSingleByte::Encoding::Windows_1253:
return tableForEncoding<windows1253>();
case TextCodecSingleByte::Encoding::Windows_1255:
return tableForEncoding<windows1255>();
case TextCodecSingleByte::Encoding::Windows_1257:
return tableForEncoding<windows1257>();
case TextCodecSingleByte::Encoding::IBM866:
return tableForEncoding<ibm866>();
case TextCodecSingleByte::Encoding::KOI8U:
return tableForEncoding<koi8u>();
}
// Reached only if `encoding` holds a value that is not one of the enumerators above.
RELEASE_ASSERT_NOT_REACHED();
}
// Maps a runtime Encoding value to its constexpr decode table.
// The returned reference is to one of the file-level tables defined above.
static const SingleByteDecodeTable& tableForDecoding(TextCodecSingleByte::Encoding encoding)
{
switch (encoding) {
case TextCodecSingleByte::Encoding::ISO_8859_3:
return iso88593;
case TextCodecSingleByte::Encoding::ISO_8859_6:
return iso88596;
case TextCodecSingleByte::Encoding::ISO_8859_7:
return iso88597;
case TextCodecSingleByte::Encoding::ISO_8859_8:
return iso88598;
case TextCodecSingleByte::Encoding::Windows_874:
return windows874;
case TextCodecSingleByte::Encoding::Windows_1253:
return windows1253;
case TextCodecSingleByte::Encoding::Windows_1255:
return windows1255;
case TextCodecSingleByte::Encoding::Windows_1257:
return windows1257;
case TextCodecSingleByte::Encoding::IBM866:
return ibm866;
case TextCodecSingleByte::Encoding::KOI8U:
return koi8u;
}
// Reached only if `encoding` holds a value that is not one of the enumerators above.
RELEASE_ASSERT_NOT_REACHED();
}
// https://encoding.spec.whatwg.org/#single-byte-encoder
// Encodes `string` one code point at a time:
//   - ASCII code points are emitted as-is;
//   - other code points are looked up in `table` and emitted as the mapped byte;
//   - code points missing from `table` are handed to `unencodableHandler`, which
//     receives the output vector and decides what (if anything) to append.
static Vector<uint8_t> encode(const SingleByteEncodeTable& table, StringView string, Function<void(char32_t, Vector<uint8_t>&)>&& unencodableHandler)
{
    // FIXME: Consider adding an ASCII fast path like the one in TextCodecLatin1::decode.
    Vector<uint8_t> encoded;
    encoded.reserveInitialCapacity(string.length());
    for (auto codePoint : string.codePoints()) {
        if (isASCII(codePoint))
            encoded.append(codePoint);
        else if (auto mappedByte = findFirstInSortedPairs(table, codePoint))
            encoded.append(*mappedByte);
        else
            unencodableHandler(codePoint, encoded);
    }
    return encoded;
}
// https://encoding.spec.whatwg.org/#single-byte-decoder
// Decodes `bytes` with `table`:
//   - ASCII bytes are appended unchanged;
//   - other bytes are replaced by table[byte - 0x80]; when that entry is
//     replacementCharacter, `sawError` is set to true (the replacement
//     character is still appended).
// When `stopOnError` is true, decoding stops after the first byte that leaves
// `sawError` set and the text produced so far is returned. The unnamed bool
// parameter is ignored.
static String decode(const SingleByteDecodeTable& table, std::span<const uint8_t> bytes, bool, bool stopOnError, bool& sawError)
{
    StringBuilder decoded;
    decoded.reserveCapacity(bytes.size());
    for (auto byte : bytes) {
        if (isASCII(byte))
            decoded.append(byte);
        else {
            UChar codeUnit = table[byte - 0x80];
            if (codeUnit == replacementCharacter)
                sawError = true;
            decoded.append(codeUnit);
        }
        if (stopOnError && sawError)
            break;
    }
    return decoded.toString();
}
// Encodes `string` using the encode table selected by m_encoding.
// Unencodable code points are passed to the handler returned by unencodableHandler(handling).
Vector<uint8_t> TextCodecSingleByte::encode(StringView string, UnencodableHandling handling) const
{
return PAL::encode(tableForEncoding(m_encoding), string, unencodableHandler(handling));
}
// Decodes `bytes` using the decode table selected by m_encoding.
// All flags are forwarded to the file-local PAL::decode helper, which ignores `flush`.
String TextCodecSingleByte::decode(std::span<const uint8_t> bytes, bool flush, bool stopOnError, bool& sawError)
{
return PAL::decode(tableForDecoding(m_encoding), bytes, flush, stopOnError, sawError);
}
// Stores which single-byte mapping this codec instance uses for encode() and decode().
TextCodecSingleByte::TextCodecSingleByte(Encoding encoding)
: m_encoding(encoding)
{
}
// Registers every label handled by this file with the given registrar.
// Labels are grouped per encoding; in each group the FIRST entry is the
// canonical name and the remaining entries are its aliases.
void TextCodecSingleByte::registerEncodingNames(EncodingNameRegistrar registrar)
{
// https://encoding.spec.whatwg.org/#names-and-labels
// Registers each entry of `list` - including the first one itself - as an
// alias of the first entry.
auto registerAliases = [&] (std::initializer_list<ASCIILiteral> list) {
for (auto& alias : list)
registrar(alias, *list.begin());
};
registerAliases({
"ISO-8859-3"_s,
"csisolatin3"_s,
"iso-ir-109"_s,
"iso8859-3"_s,
"iso88593"_s,
"iso_8859-3"_s,
"iso_8859-3:1988"_s,
"l3"_s,
"latin3"_s
});
registerAliases({
"ISO-8859-6"_s,
"arabic"_s,
"asmo-708"_s,
"csiso88596e"_s,
"csiso88596i"_s,
"csisolatinarabic"_s,
"ecma-114"_s,
"iso-8859-6-e"_s,
"iso-8859-6-i"_s,
"iso-ir-127"_s,
"iso8859-6"_s,
"iso88596"_s,
"iso_8859-6"_s,
"iso_8859-6:1987"_s
});
registerAliases({
"ISO-8859-7"_s,
"csisolatingreek"_s,
"ecma-118"_s,
"elot_928"_s,
"greek"_s,
"greek8"_s,
"iso-ir-126"_s,
"iso8859-7"_s,
"iso88597"_s,
"iso_8859-7"_s,
"iso_8859-7:1987"_s,
"sun_eu_greek"_s
});
registerAliases({
"ISO-8859-8"_s,
"csiso88598e"_s,
"csisolatinhebrew"_s,
"hebrew"_s,
"iso-8859-8-e"_s,
"iso-ir-138"_s,
"iso8859-8"_s,
"iso88598"_s,
"iso_8859-8"_s,
"iso_8859-8:1988"_s,
"visual"_s
});
// ISO-8859-8-I is a separate canonical name; registerCodecs() backs it with
// the same Encoding::ISO_8859_8 table as ISO-8859-8.
registerAliases({
"ISO-8859-8-I"_s,
"csiso88598i"_s,
"logical"_s
});
registerAliases({
"windows-874"_s,
"dos-874"_s,
"iso-8859-11"_s,
"iso8859-11"_s,
"iso885911"_s,
"tis-620"_s
});
registerAliases({
"windows-1253"_s,
"cp1253"_s,
"x-cp1253"_s
});
registerAliases({
"windows-1255"_s,
"cp1255"_s,
"x-cp1255"_s
});
registerAliases({
"windows-1257"_s,
"cp1257"_s,
"x-cp1257"_s
});
registerAliases({
"KOI8-U"_s,
"koi8-ru"_s
});
registerAliases({
"IBM866"_s,
"866"_s,
"cp866"_s,
"csibm866"_s
});
}
// Registers one factory per canonical name handled by this file. Each factory
// returns a new TextCodecSingleByte bound to the Encoding value whose tables
// implement that name.
void TextCodecSingleByte::registerCodecs(TextCodecRegistrar registrar)
{
registrar("ISO-8859-3"_s, [] {
return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_3);
});
registrar("ISO-8859-6"_s, [] {
return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_6);
});
registrar("ISO-8859-7"_s, [] {
return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_7);
});
registrar("ISO-8859-8"_s, [] {
return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_8);
});
// ISO-8859-8-I has no enumerator of its own: it reuses the ISO-8859-8 tables.
registrar("ISO-8859-8-I"_s, [] {
return makeUnique<TextCodecSingleByte>(Encoding::ISO_8859_8);
});
registrar("windows-874"_s, [] {
return makeUnique<TextCodecSingleByte>(Encoding::Windows_874);
});
registrar("windows-1253"_s, [] {
return makeUnique<TextCodecSingleByte>(Encoding::Windows_1253);
});
registrar("windows-1255"_s, [] {
return makeUnique<TextCodecSingleByte>(Encoding::Windows_1255);
});
registrar("windows-1257"_s, [] {
return makeUnique<TextCodecSingleByte>(Encoding::Windows_1257);
});
registrar("KOI8-U"_s, [] {
return makeUnique<TextCodecSingleByte>(Encoding::KOI8U);
});
registrar("IBM866"_s, [] {
return makeUnique<TextCodecSingleByte>(Encoding::IBM866);
});
}
}

View File

@@ -0,0 +1,49 @@
/*
* Copyright (C) 2020 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "TextCodec.h"
#include <wtf/TZoneMalloc.h>
namespace PAL {
// Table-driven TextCodec shared by several single-byte encodings. Which
// encoding an instance implements is fixed at construction.
class TextCodecSingleByte final : public TextCodec {
WTF_MAKE_TZONE_ALLOCATED(TextCodecSingleByte);
public:
// Registers the canonical names and aliases handled by this codec.
static void registerEncodingNames(EncodingNameRegistrar);
// Registers one factory per canonical name.
static void registerCodecs(TextCodecRegistrar);
// Declared opaquely here; the enumerators are defined in the implementation file.
enum class Encoding : uint8_t;
explicit TextCodecSingleByte(Encoding);
private:
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
// Selects the decode/encode tables; never changes after construction.
const Encoding m_encoding;
};
} // namespace PAL

View File

@@ -0,0 +1,166 @@
/*
* Copyright (C) 2004-2020 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "TextCodecUTF16.h"
#include <wtf/TZoneMallocInlines.h>
#include <wtf/text/CString.h>
#include <wtf/text/StringBuilder.h>
#include <wtf/text/WTFString.h>
#include <wtf/unicode/CharacterNames.h>
namespace PAL {
// Implementation-file counterpart of the WTF_MAKE_TZONE_ALLOCATED(TextCodecUTF16)
// line inside the class declaration in TextCodecUTF16.h.
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecUTF16);
inline TextCodecUTF16::TextCodecUTF16(bool littleEndian)
: m_littleEndian(littleEndian)
{
}
// Announces the two canonical UTF-16 names and every alias that resolves to
// one of them. The first argument of each registrar call is the alias, the
// second the canonical name it maps to.
void TextCodecUTF16::registerEncodingNames(EncodingNameRegistrar registrar)
{
    const auto littleEndianName = "UTF-16LE"_s;
    const auto bigEndianName = "UTF-16BE"_s;

    // Canonical names map to themselves.
    registrar(littleEndianName, littleEndianName);
    registrar(bigEndianName, bigEndianName);

    // Aliases without an explicit byte order resolve to little endian.
    registrar("ISO-10646-UCS-2"_s, littleEndianName);
    registrar("UCS-2"_s, littleEndianName);
    registrar("UTF-16"_s, littleEndianName);
    registrar("Unicode"_s, littleEndianName);
    registrar("csUnicode"_s, littleEndianName);
    registrar("unicodeFEFF"_s, littleEndianName);

    // The only alias that resolves to big endian.
    registrar("unicodeFFFE"_s, bigEndianName);
}
// Registers one factory per canonical UTF-16 name. Each factory creates a
// fresh codec configured for the matching byte order.
void TextCodecUTF16::registerCodecs(TextCodecRegistrar registrar)
{
    const auto littleEndianName = "UTF-16LE"_s;
    registrar(littleEndianName, [] {
        // true => little endian
        return makeUnique<TextCodecUTF16>(true);
    });

    const auto bigEndianName = "UTF-16BE"_s;
    registrar(bigEndianName, [] {
        // false => big endian
        return makeUnique<TextCodecUTF16>(false);
    });
}
// https://encoding.spec.whatwg.org/#shared-utf-16-decoder
//
// Streaming UTF-16 decoder. Consumes `bytes` two at a time in the byte order
// chosen at construction and returns the decoded text for this chunk. State
// that must survive between calls lives in m_leadByte (an odd trailing byte),
// m_leadSurrogate (a lead surrogate still waiting for its trail) and
// m_shouldStripByteOrderMark.
//
// bytes     - next chunk of encoded input; may be empty.
// flush     - true when no more input follows; any dangling lead byte or lead
//             surrogate is then reported as one replacementCharacter and the
//             pending state is cleared.
// (unnamed) - the stopOnError argument is ignored; decoding always continues
//             past malformed input.
// sawError  - set to true whenever malformed input is seen; never reset here.
String TextCodecUTF16::decode(std::span<const uint8_t> bytes, bool flush, bool, bool& sawError)
{
size_t index = 0;
// For an empty span this subtraction wraps around; the value is only read
// inside the !bytes.empty() branch below.
size_t lengthMinusOne = bytes.size() - 1;
StringBuilder result;
result.reserveCapacity(bytes.size() / 2);
// Handles one assembled 16-bit code unit, pairing surrogates across calls.
auto processCodeUnit = [&] (UChar codeUnit) {
// The strip flag is cleared by the first code unit processed, whether or not
// that code unit is a byte order mark.
if (std::exchange(m_shouldStripByteOrderMark, false) && codeUnit == byteOrderMark)
return;
if (m_leadSurrogate) {
auto leadSurrogate = *std::exchange(m_leadSurrogate, std::nullopt);
if (U16_IS_TRAIL(codeUnit)) {
char32_t codePoint = U16_GET_SUPPLEMENTARY(leadSurrogate, codeUnit);
result.append(codePoint);
return;
}
// Unpaired lead surrogate: emit a replacement for it, then fall through so
// the current code unit is still processed on its own.
sawError = true;
result.append(replacementCharacter);
}
if (U16_IS_LEAD(codeUnit)) {
// Hold the lead surrogate until the next code unit arrives.
m_leadSurrogate = codeUnit;
return;
}
if (U16_IS_TRAIL(codeUnit)) {
// Trail surrogate with no preceding lead.
sawError = true;
result.append(replacementCharacter);
return;
}
result.append(codeUnit);
};
// Byte-pair assembly for each byte order.
auto processBytesLE = [&] (uint8_t first, uint8_t second) {
processCodeUnit(first | (second << 8));
};
auto processBytesBE = [&] (uint8_t first, uint8_t second) {
processCodeUnit((first << 8) | second);
};
if (!bytes.empty()) {
// Complete the code unit whose first byte was left over from the previous call.
if (m_leadByte && index < bytes.size()) {
auto leadByte = *std::exchange(m_leadByte, std::nullopt);
auto trailByte = bytes[index++];
if (m_littleEndian)
processBytesLE(leadByte, trailByte);
else
processBytesBE(leadByte, trailByte);
}
// Main loop: stops while fewer than two bytes remain.
if (m_littleEndian) {
for (; index < lengthMinusOne; index += 2)
processBytesLE(bytes[index], bytes[index + 1]);
} else {
for (; index < lengthMinusOne; index += 2)
processBytesBE(bytes[index], bytes[index + 1]);
}
// Exactly one byte left over: stash it for the next call.
if (index == lengthMinusOne) {
ASSERT(!m_leadByte);
m_leadByte = bytes[index];
} else
ASSERT(index == bytes.size());
}
if (flush) {
m_shouldStripByteOrderMark = false;
// A dangling byte and/or lead surrogate at end of input yields a single
// replacement character, even when both are pending.
if (m_leadByte || m_leadSurrogate) {
m_leadByte = std::nullopt;
m_leadSurrogate = std::nullopt;
sawError = true;
result.append(replacementCharacter);
}
}
return result.toString();
}
// Serializes every UTF-16 code unit of `string` as two bytes in the codec's
// byte order. The UnencodableHandling argument is unused because every code
// unit is written out verbatim.
Vector<uint8_t> TextCodecUTF16::encode(StringView string, UnencodableHandling) const
{
    Vector<uint8_t> encoded(WTF::checkedProduct<size_t>(string.length(), 2));

    // Position of the low-order and high-order byte inside each two-byte unit.
    const size_t lowByteOffset = m_littleEndian ? 0 : 1;
    const size_t highByteOffset = 1 - lowByteOffset;

    size_t unitStart = 0;
    for (auto codeUnit : string.codeUnits()) {
        encoded[unitStart + lowByteOffset] = codeUnit;
        encoded[unitStart + highByteOffset] = codeUnit >> 8;
        unitStart += 2;
    }
    return encoded;
}
} // namespace PAL

View File

@@ -0,0 +1,53 @@
/*
* Copyright (C) 2004-2020 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "TextCodec.h"
#include <optional>
#include <wtf/TZoneMalloc.h>
namespace PAL {
// TextCodec for UTF-16 in a fixed byte order. Decoding is streaming: the
// optional members below carry state from one decode() call to the next.
class TextCodecUTF16 final : public TextCodec {
WTF_MAKE_TZONE_ALLOCATED(TextCodecUTF16);
public:
// Registration entry points; each receives a registrar callback to invoke.
static void registerEncodingNames(EncodingNameRegistrar);
static void registerCodecs(TextCodecRegistrar);
// littleEndian: true for UTF-16LE, false for UTF-16BE.
explicit TextCodecUTF16(bool littleEndian);
private:
// Requests that a leading byte order mark be dropped by decode().
void stripByteOrderMark() final { m_shouldStripByteOrderMark = true; }
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
// Byte order selected at construction.
bool m_littleEndian;
// First byte of a code unit whose second byte has not arrived yet.
std::optional<uint8_t> m_leadByte;
// Lead surrogate still waiting for its trail surrogate.
std::optional<UChar> m_leadSurrogate;
// Set by stripByteOrderMark(); cleared by decode().
bool m_shouldStripByteOrderMark { false };
};
} // namespace PAL

View File

@@ -0,0 +1,490 @@
/*
* Copyright (C) 2004-2020 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "TextCodecUTF8.h"
#include "TextCodecASCIIFastPath.h"
#include <wtf/StdLibExtras.h>
#include <wtf/TZoneMallocInlines.h>
#include <wtf/text/CString.h>
#include "ParsingUtilities-removeAfterWebKitUpgrade.h"
#include <wtf/text/StringBuffer.h>
#include <wtf/text/WTFString.h>
#include <wtf/unicode/CharacterNames.h>
namespace PAL {
// Implementation-file counterpart of the WTF_MAKE_TZONE_ALLOCATED(TextCodecUTF8)
// line inside the class declaration in TextCodecUTF8.h.
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecUTF8);
using namespace WTF::Unicode;
// Sentinel returned by decodeNonASCIISequence() for a malformed sequence. It is
// negative, so it can never collide with a decoded code point.
const int nonCharacter = -1;
// Announces the canonical "UTF-8" name and all aliases that resolve to it.
// The first argument of each registrar call is the alias, the second the
// canonical name it maps to.
void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
{
    const auto canonicalName = "UTF-8"_s;

    // From https://encoding.spec.whatwg.org.
    registrar(canonicalName, canonicalName);
    registrar("utf8"_s, canonicalName);
    registrar("unicode-1-1-utf-8"_s, canonicalName);

    // Additional aliases that originally were present in the encoding
    // table in WebKit on Macintosh, and subsequently added by
    // TextCodecICU. Perhaps we can prove some are not used on the web
    // and remove them.
    registrar("unicode11utf8"_s, canonicalName);
    registrar("unicode20utf8"_s, canonicalName);
    registrar("x-unicode20utf8"_s, canonicalName);
}
// Factory: returns a newly allocated UTF-8 codec with no pending decoder state.
std::unique_ptr<TextCodecUTF8> TextCodecUTF8::codec()
{
    auto instance = makeUnique<TextCodecUTF8>();
    return instance;
}
// Registers the factory that produces codecs for the canonical "UTF-8" name.
void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
{
    const auto canonicalName = "UTF-8"_s;
    registrar(canonicalName, [] {
        return codec();
    });
}
// Maps a UTF-8 lead byte to the total length of the sequence it starts:
//   0xC2..0xDF -> 2, 0xE0..0xEF -> 3, 0xF0..0xF4 -> 4.
// Every other byte yields 0, meaning it cannot start a multi-byte sequence:
// ASCII, continuation bytes, the overlong leads 0xC0/0xC1, and 0xF5..0xFF.
static inline uint8_t nonASCIISequenceLength(uint8_t firstByte)
{
    // The 256-entry lookup table is generated at compile time, so the
    // per-call cost is still a single indexed load.
    static constexpr auto lengthByLeadByte = [] {
        std::array<uint8_t, 256> table {};
        for (unsigned lead = 0xC2; lead <= 0xDF; ++lead)
            table[lead] = 2;
        for (unsigned lead = 0xE0; lead <= 0xEF; ++lead)
            table[lead] = 3;
        for (unsigned lead = 0xF0; lead <= 0xF4; ++lead)
            table[lead] = 4;
        return table;
    }();
    return lengthByLeadByte[firstByte];
}
// Decodes the multi-byte UTF-8 sequence at the start of `sequence`.
//
// On entry `length` is the sequence length implied by the lead byte (2, 3 or
// 4) and `sequence` must provide at least that many bytes. On success the
// decoded code point is returned and `length` is left untouched. On failure
// `nonCharacter` is returned and `length` is overwritten with the number of
// leading bytes that were still well-formed (always at least 1).
static inline int decodeNonASCIISequence(std::span<const uint8_t> sequence, uint8_t& length)
{
    ASSERT(!isASCII(sequence[0]));

    const auto outside = [](uint8_t byte, uint8_t lowest, uint8_t highest) {
        return byte < lowest || byte > highest;
    };
    const auto reject = [&length](uint8_t wellFormedBytes) {
        length = wellFormedBytes;
        return nonCharacter;
    };

    if (length == 2) {
        ASSERT(sequence[0] >= 0xC2);
        ASSERT(sequence[0] <= 0xDF);
        if (outside(sequence[1], 0x80, 0xBF))
            return reject(1);
        return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
    }

    if (length == 3) {
        ASSERT(sequence[0] >= 0xE0);
        ASSERT(sequence[0] <= 0xEF);
        // The permitted range of the second byte is narrower for the lead
        // bytes 0xE0 and 0xED than for the other three-byte leads.
        uint8_t secondLowest = 0x80;
        uint8_t secondHighest = 0xBF;
        if (sequence[0] == 0xE0)
            secondLowest = 0xA0;
        else if (sequence[0] == 0xED)
            secondHighest = 0x9F;
        if (outside(sequence[1], secondLowest, secondHighest))
            return reject(1);
        if (outside(sequence[2], 0x80, 0xBF))
            return reject(2);
        return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
    }

    ASSERT(length == 4);
    ASSERT(sequence[0] >= 0xF0);
    ASSERT(sequence[0] <= 0xF4);
    // Likewise, 0xF0 and 0xF4 restrict the second byte more than 0xF1..0xF3.
    uint8_t secondLowest = 0x80;
    uint8_t secondHighest = 0xBF;
    if (sequence[0] == 0xF0)
        secondLowest = 0x90;
    else if (sequence[0] == 0xF4)
        secondHighest = 0x8F;
    if (outside(sequence[1], secondLowest, secondHighest))
        return reject(1);
    if (outside(sequence[2], 0x80, 0xBF))
        return reject(2);
    if (outside(sequence[3], 0x80, 0xBF))
        return reject(3);
    return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
}
// Writes `character` at the front of `destination` as UTF-16 (one code unit
// for a BMP character, a surrogate pair otherwise) and returns the span
// advanced past what was written. `character` must be a decoded code point:
// neither `nonCharacter` nor a surrogate.
static inline std::span<UChar> appendCharacter(std::span<UChar> destination, int character)
{
    ASSERT(character != nonCharacter);
    ASSERT(!U_IS_SURROGATE(character));

    if (!U_IS_BMP(character)) {
        // Supplementary plane: emit lead + trail surrogate.
        destination[0] = U16_LEAD(character);
        destination[1] = U16_TRAIL(character);
        skip(destination, 2);
        return destination;
    }

    consume(destination) = character;
    return destination;
}
void TextCodecUTF8::consumePartialSequenceByte()
{
--m_partialSequenceSize;
memmoveSpan(std::span { m_partialSequence }, std::span { m_partialSequence }.subspan(1, m_partialSequenceSize));
}
// 8-bit (Latin-1 output) variant: tries to finish the sequence buffered in
// m_partialSequence using bytes taken from the front of `source`.
//
// destination - output cursor; advanced past each character written.
// source      - remaining input; bytes moved into the buffer are consumed.
// flush       - true when no further input will follow.
//
// Returns true when the caller must switch to 16-bit decoding (the buffered
// lead byte is invalid, the sequence is malformed, or the decoded character is
// not Latin-1). In that case the buffered bytes are left in place. Returns
// false when the buffer was drained, or when it holds a so-far-valid but
// incomplete sequence and more input is expected.
bool TextCodecUTF8::handlePartialSequence(std::span<LChar>& destination, std::span<const uint8_t>& source, bool flush)
{
ASSERT(m_partialSequenceSize);
do {
// ASCII at the head of the buffer is copied straight through.
if (isASCII(m_partialSequence[0])) {
consume(destination) = m_partialSequence[0];
consumePartialSequenceByte();
continue;
}
auto count = nonASCIISequenceLength(m_partialSequence[0]);
// Invalid lead byte: leave error handling to the 16-bit path.
if (!count)
return true;
// Copy from `source` until we have `count` bytes.
if (count > m_partialSequenceSize && !source.empty()) {
size_t additionalBytes = std::min<size_t>(count - m_partialSequenceSize, source.size());
memcpySpan(std::span { m_partialSequence }.subspan(m_partialSequenceSize), consumeSpan(source, additionalBytes));
m_partialSequenceSize += additionalBytes;
}
// If we still don't have `count` bytes, fill the rest with zeros (any
// other lead byte would do), so we can run `decodeNonASCIISequence` to
// tell if the chunk that we have is valid. These bytes are not part of
// the partial sequence, so don't increment `m_partialSequenceSize`.
bool partialSequenceIsTooShort = false;
if (count > m_partialSequenceSize) {
partialSequenceIsTooShort = true;
zeroSpan(std::span { m_partialSequence }.subspan(m_partialSequenceSize, count - m_partialSequenceSize));
}
// On failure `count` is rewritten to the number of leading bytes that were valid.
int character = decodeNonASCIISequence(std::span { m_partialSequence }, count);
if (partialSequenceIsTooShort) {
ASSERT(character == nonCharacter);
ASSERT(count <= m_partialSequenceSize);
// If we're not at the end, and the partial sequence that we have is
// incomplete but otherwise valid, a non-character is not an error.
if (!flush && count == m_partialSequenceSize)
return false;
}
// Covers both real non-Latin-1 characters and the nonCharacter sentinel.
if (!isLatin1(character))
return true;
m_partialSequenceSize -= count;
consume(destination) = character;
} while (m_partialSequenceSize);
return false;
}
// 16-bit (UTF-16 output) variant: tries to finish the sequence buffered in
// m_partialSequence using bytes taken from the front of `source`.
//
// destination - output cursor; advanced past each code unit written.
// source      - remaining input; bytes moved into the buffer are consumed.
// flush       - true when no further input will follow.
// stopOnError - when true, return at the first malformed sequence without
//               emitting anything for it (the buffered bytes stay in place).
// sawError    - set to true when malformed input is found; never reset here.
//
// Returns with m_partialSequenceSize == 0 when the buffer was drained, or
// non-zero when it still holds an incomplete sequence or stopped on an error.
void TextCodecUTF8::handlePartialSequence(std::span<UChar>& destination, std::span<const uint8_t>& source, bool flush, bool stopOnError, bool& sawError)
{
ASSERT(m_partialSequenceSize);
do {
// ASCII at the head of the buffer is copied straight through.
if (isASCII(m_partialSequence[0])) {
consume(destination) = m_partialSequence[0];
consumePartialSequenceByte();
continue;
}
auto count = nonASCIISequenceLength(m_partialSequence[0]);
// Invalid lead byte: replace that one byte and keep going.
if (!count) {
sawError = true;
if (stopOnError)
return;
consume(destination) = replacementCharacter;
consumePartialSequenceByte();
continue;
}
// Copy from `source` until we have `count` bytes.
if (count > m_partialSequenceSize && !source.empty()) {
size_t additionalBytes = std::min<size_t>(count - m_partialSequenceSize, source.size());
memcpySpan(std::span { m_partialSequence }.subspan(m_partialSequenceSize), consumeSpan(source, additionalBytes));
m_partialSequenceSize += additionalBytes;
}
// If we still don't have `count` bytes, fill the rest with zeros (any
// other lead byte would do), so we can run `decodeNonASCIISequence` to
// tell if the chunk that we have is valid. These bytes are not part of
// the partial sequence, so don't increment `m_partialSequenceSize`.
bool partialSequenceIsTooShort = false;
if (count > m_partialSequenceSize) {
partialSequenceIsTooShort = true;
zeroSpan(std::span { m_partialSequence }.subspan(m_partialSequenceSize, count - m_partialSequenceSize));
}
// On failure `count` is rewritten to the number of leading bytes that were valid.
int character = decodeNonASCIISequence(std::span { m_partialSequence }, count);
if (partialSequenceIsTooShort) {
ASSERT(character == nonCharacter);
ASSERT(count <= m_partialSequenceSize);
// If we're not at the end, and the partial sequence that we have is
// incomplete but otherwise valid, a non-character is not an error.
if (!flush && count == m_partialSequenceSize)
return;
}
if (character == nonCharacter) {
sawError = true;
if (stopOnError)
return;
// Replace the malformed prefix with one replacement character, then shift
// the remaining buffered bytes to the front so they are re-examined.
consume(destination) = replacementCharacter;
m_partialSequenceSize -= count;
memmoveSpan(std::span { m_partialSequence }, std::span { m_partialSequence }.subspan(count, m_partialSequenceSize));
continue;
}
m_partialSequenceSize -= count;
// The strip flag is cleared by the first successfully decoded multi-byte
// character, whether or not that character is a byte order mark.
if (std::exchange(m_shouldStripByteOrderMark, false) && character == byteOrderMark)
continue;
destination = appendCharacter(destination, character);
} while (m_partialSequenceSize);
}
// Streaming UTF-8 decoder.
//
// Decoding starts optimistically into an 8-bit (LChar) buffer. As soon as a
// malformed sequence (without stopOnError) or a character outside Latin-1 is
// met, control jumps to `upConvertTo16Bit`, which copies what was decoded so
// far into a 16-bit buffer and finishes the input there. A sequence cut off
// at the end of `bytes` is stored in m_partialSequence for the next call.
//
// bytes       - next chunk of encoded input.
// flush       - true when no further input follows; buffered partial state is
//               discarded before returning.
// stopOnError - when true, decoding stops at the first malformed sequence and
//               the text decoded up to that point is returned.
// sawError    - set to true on malformed input or when the combined input
//               size does not fit in `unsigned`; never reset here.
String TextCodecUTF8::decode(std::span<const uint8_t> bytes, bool flush, bool stopOnError, bool& sawError)
{
// Each input byte might turn into a character.
// That includes all bytes in the partial-sequence buffer because
// each byte in an invalid sequence will turn into a replacement character.
size_t bufferSize = m_partialSequenceSize + bytes.size();
// Refuse input whose worst-case output would not fit in `unsigned`.
if (bufferSize > std::numeric_limits<unsigned>::max()) {
sawError = true;
return {};
}
StringBuffer<LChar> buffer(bufferSize);
auto source = bytes;
// Bound for the word-at-a-time ASCII fast path below.
auto* alignedEnd = WTF::alignToMachineWord(std::to_address(source.end()));
auto destination = buffer.span();
// ---- Phase 1: 8-bit output ----
do {
if (m_partialSequenceSize) {
// Explicitly copy destination and source pointers to avoid taking pointers to the
// local variables, which may harm code generation by disabling some optimizations
// in some compilers.
auto destinationForHandlePartialSequence = destination;
// true means the buffered sequence needs the 16-bit path; `destination`
// is deliberately not updated in that case.
if (handlePartialSequence(destinationForHandlePartialSequence, source, flush)) {
goto upConvertTo16Bit;
}
destination = destinationForHandlePartialSequence;
// Still incomplete: wait for more input.
if (m_partialSequenceSize)
break;
}
while (!source.empty()) {
if (isASCII(source[0])) {
// Fast path for ASCII. Most UTF-8 text will be ASCII.
if (WTF::isAlignedToMachineWord(source.data())) {
while (source.data() < alignedEnd) {
auto chunk = reinterpretCastSpanStartTo<const WTF::MachineWord>(source);
if (!WTF::containsOnlyASCII<LChar>(chunk))
break;
copyASCIIMachineWord(destination, source);
skip(source, sizeof(WTF::MachineWord));
skip(destination, sizeof(WTF::MachineWord));
}
if (source.empty())
break;
if (!isASCII(source[0]))
continue;
}
consume(destination) = consume(source);
continue;
}
auto count = nonASCIISequenceLength(source[0]);
int character;
if (!count)
character = nonCharacter;
else {
// Sequence is cut off by the end of the input: buffer it for the next call.
if (count > source.size()) {
RELEASE_ASSERT_WITH_SECURITY_IMPLICATION(source.size() < m_partialSequence.size());
ASSERT(!m_partialSequenceSize);
m_partialSequenceSize = source.size();
memcpySpan(std::span { m_partialSequence }, source.first(m_partialSequenceSize));
source = {};
break;
}
character = decodeNonASCIISequence(source, count);
}
if (character == nonCharacter) {
sawError = true;
if (stopOnError)
break;
// `source` is not advanced, so the 16-bit loop re-decodes this sequence
// and emits the replacement character.
goto upConvertTo16Bit;
}
// Not representable in 8 bits; `source` is not advanced here either.
if (!isLatin1(character))
goto upConvertTo16Bit;
skip(source, count);
consume(destination) = character;
}
} while (m_partialSequenceSize);
buffer.shrink(destination.data() - buffer.characters());
if (flush)
m_partialSequenceSize = 0;
if (flush || buffer.length())
m_shouldStripByteOrderMark = false;
return String::adopt(WTFMove(buffer));
// ---- Phase 2: 16-bit output ----
upConvertTo16Bit:
StringBuffer<UChar> buffer16(bufferSize);
auto destination16 = buffer16.span();
// Copy the already converted characters
auto converted8 = buffer.span();
size_t charactersToCopy = destination.data() - buffer.characters();
for (size_t i = 0; i < charactersToCopy; ++i)
destination16[i] = converted8[i];
skip(destination16, charactersToCopy);
do {
if (m_partialSequenceSize) {
// Explicitly copy destination and source pointers to avoid taking pointers to the
// local variables, which may harm code generation by disabling some optimizations
// in some compilers.
auto destinationForHandlePartialSequence = destination16;
handlePartialSequence(destinationForHandlePartialSequence, source, flush, stopOnError, sawError);
destination16 = destinationForHandlePartialSequence;
// Non-zero here means either an incomplete sequence or a stop on error.
if (m_partialSequenceSize)
break;
}
while (!source.empty()) {
if (isASCII(source[0])) {
// Fast path for ASCII. Most UTF-8 text will be ASCII.
if (WTF::isAlignedToMachineWord(source.data())) {
while (source.data() < alignedEnd) {
auto chunk = reinterpretCastSpanStartTo<const WTF::MachineWord>(source);
if (!WTF::containsOnlyASCII<LChar>(chunk))
break;
copyASCIIMachineWord(destination16, source);
skip(source, sizeof(WTF::MachineWord));
skip(destination16, sizeof(WTF::MachineWord));
}
if (source.empty())
break;
if (!isASCII(source[0]))
continue;
}
consume(destination16) = consume(source);
continue;
}
auto count = nonASCIISequenceLength(source[0]);
int character;
if (!count)
character = nonCharacter;
else {
// Sequence is cut off by the end of the input: buffer it for the next call.
if (count > source.size()) {
RELEASE_ASSERT_WITH_SECURITY_IMPLICATION(source.size() < m_partialSequence.size());
ASSERT(!m_partialSequenceSize);
m_partialSequenceSize = source.size();
memcpySpan(std::span { m_partialSequence }, source.first(m_partialSequenceSize));
source = {};
break;
}
character = decodeNonASCIISequence(source, count);
}
if (character == nonCharacter) {
sawError = true;
if (stopOnError)
break;
consume(destination16) = replacementCharacter;
// Skip the well-formed prefix reported by the decoder, or a single byte
// when the lead byte itself was invalid (count == 0).
skip(source, count ? count : 1);
continue;
}
skip(source, count);
// A byte order mark is dropped only when it would be the very first code
// unit of the output and stripping was requested.
if (character == byteOrderMark && destination16.data() == buffer16.characters() && std::exchange(m_shouldStripByteOrderMark, false))
continue;
destination16 = appendCharacter(destination16, character);
}
} while (m_partialSequenceSize);
buffer16.shrink(destination16.data() - buffer16.characters());
if (flush)
m_partialSequenceSize = 0;
if (flush || buffer16.length())
m_shouldStripByteOrderMark = false;
return String::adopt(WTFMove(buffer16));
}
// Encodes `string` as UTF-8 and returns the bytes.
Vector<uint8_t> TextCodecUTF8::encodeUTF8(StringView string)
{
    // Worst-case sizing: a UTF-16 code unit never needs more than 3 UTF-8 bytes.
    //   - BMP characters: 1 code unit -> up to 3 bytes (3x).
    //   - Non-BMP characters: 2 code units -> up to 4 bytes (2x).
    Vector<uint8_t> encoded(WTF::checkedProduct<size_t>(string.length(), 3));

    size_t writeOffset = 0;
    for (auto codePoint : string.codePoints())
        U8_APPEND_UNSAFE(encoded, writeOffset, codePoint);

    // Trim the over-allocation down to what was actually produced.
    encoded.shrink(writeOffset);
    return encoded;
}
// TextCodec override: forwards to encodeUTF8(). The UnencodableHandling
// argument is not consulted.
Vector<uint8_t> TextCodecUTF8::encode(StringView string, UnencodableHandling) const
{
    Vector<uint8_t> encoded = encodeUTF8(string);
    return encoded;
}
} // namespace PAL

View File

@@ -0,0 +1,58 @@
/*
* Copyright (C) 2011-2020 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "TextCodec.h"
#include <unicode/utf8.h>
#include <wtf/TZoneMalloc.h>
#include <wtf/text/LChar.h>
namespace PAL {
// TextCodec for UTF-8. Decoding is streaming: a multi-byte sequence cut off at
// the end of one decode() call is kept in m_partialSequence and completed by
// the next call.
class TextCodecUTF8 final : public TextCodec {
WTF_MAKE_TZONE_ALLOCATED(TextCodecUTF8);
public:
// Registration entry points; each receives a registrar callback to invoke.
static void registerEncodingNames(EncodingNameRegistrar);
static void registerCodecs(TextCodecRegistrar);
// Stateless one-shot encoder, callable without a codec instance.
static Vector<uint8_t> encodeUTF8(StringView);
// Factory returning a new codec instance.
static std::unique_ptr<TextCodecUTF8> codec();
private:
// Requests that a leading byte order mark be dropped by decode().
void stripByteOrderMark() final { m_shouldStripByteOrderMark = true; }
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
// 8-bit variant; returns true when decoding must continue in 16-bit mode.
bool handlePartialSequence(std::span<LChar>& destination, std::span<const uint8_t>& source, bool flush);
// 16-bit variant; reports malformed input through sawError.
void handlePartialSequence(std::span<UChar>& destination, std::span<const uint8_t>& source, bool flush, bool stopOnError, bool& sawError);
// Removes the first buffered byte from m_partialSequence.
void consumePartialSequenceByte();
// Number of valid bytes currently held in m_partialSequence.
int m_partialSequenceSize { 0 };
// Storage for an incomplete sequence; has no initializer, so only the first
// m_partialSequenceSize bytes are meaningful.
std::array<uint8_t, U8_MAX_LENGTH> m_partialSequence;
// Set by stripByteOrderMark(); cleared by decode().
bool m_shouldStripByteOrderMark { false };
};
} // namespace PAL

View File

@@ -0,0 +1,99 @@
/*
* Copyright (C) 2007-2017 Apple, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "TextCodecUserDefined.h"
#include <array>
#include <wtf/TZoneMallocInlines.h>
#include <wtf/text/CString.h>
#include <wtf/text/StringBuilder.h>
#include <wtf/text/WTFString.h>
namespace PAL {
// Implementation-file counterpart of the WTF_MAKE_TZONE_ALLOCATED(TextCodecUserDefined)
// line inside the class declaration in TextCodecUserDefined.h.
WTF_MAKE_TZONE_ALLOCATED_IMPL(TextCodecUserDefined);
// Announces "x-user-defined" as its own canonical name; it has no aliases.
void TextCodecUserDefined::registerEncodingNames(EncodingNameRegistrar registrar)
{
    const auto canonicalName = "x-user-defined"_s;
    registrar(canonicalName, canonicalName);
}
// Registers the factory that produces codecs for "x-user-defined".
void TextCodecUserDefined::registerCodecs(TextCodecRegistrar registrar)
{
    const auto canonicalName = "x-user-defined"_s;
    registrar(canonicalName, [] {
        return makeUnique<TextCodecUserDefined>();
    });
}
// Decodes x-user-defined bytes: ASCII bytes (0x00..0x7F) map to the same code
// point, and bytes 0x80..0xFF map to U+F780..U+F7FF.
//
// The flush, stopOnError and sawError arguments are unused: every byte value
// decodes to exactly one code unit and the decoder keeps no state.
//
// The mapping is computed from the unsigned byte value. The previous
// implementation iterated with plain `char` and relied on sign extension
// (`byte & 0xF7FF`) to produce the 0xF7xx range; the signedness of `char` is
// implementation-defined, so on targets where it is unsigned the high half
// decoded to U+0080..U+00FF instead.
String TextCodecUserDefined::decode(std::span<const uint8_t> bytes, bool, bool, bool&)
{
    StringBuilder result;
    result.reserveCapacity(bytes.size());
    for (uint8_t byte : bytes) {
        // 0xF700 | byte == 0xF780 + (byte - 0x80) for every byte >= 0x80.
        const UChar codeUnit = byte < 0x80 ? byte : static_cast<UChar>(0xF700 | byte);
        result.append(codeUnit);
    }
    return result.toString();
}
// Slow path of TextCodecUserDefined::encode(), used when the input contains
// non-ASCII code units. A code point is encodable when it is ASCII or lies in
// U+F780..U+F7FF, and is then written as its low byte. Any other code point is
// replaced by whatever TextCodec::getUnencodableReplacement() yields for
// `handling`.
static Vector<uint8_t> encodeComplexUserDefined(StringView string, UnencodableHandling handling)
{
    Vector<uint8_t> encoded;
    for (auto codePoint : string.codePoints()) {
        // Sign-extending the low byte and masking reconstructs the only code
        // point this byte can stand for; compare it with the real one.
        int8_t lowByte = codePoint;
        const bool encodable = (lowByte & 0xF7FF) == codePoint;
        if (encodable) {
            encoded.append(lowByte);
            continue;
        }

        // No way to encode this character with x-user-defined.
        UnencodableReplacementArray replacement;
        encoded.append(TextCodec::getUnencodableReplacement(codePoint, handling, replacement));
    }
    return encoded;
}
// Encodes `string` as x-user-defined. Pure-ASCII input takes a fast path that
// truncates each code unit to one byte. Anything else is handed to
// encodeComplexUserDefined(), which applies `handling` to unencodable
// characters.
Vector<uint8_t> TextCodecUserDefined::encode(StringView string, UnencodableHandling handling) const
{
    {
        // Optimistically narrow every code unit while OR-ing them together, so
        // a single mask test afterwards tells whether all of them were ASCII.
        Vector<uint8_t> narrowed(string.length());
        UChar seenBits = 0;
        size_t position = 0;
        for (auto codeUnit : string.codeUnits()) {
            seenBits |= codeUnit;
            narrowed[position++] = codeUnit;
        }

        const bool allASCII = !(seenBits & 0xFF80);
        if (allASCII)
            return narrowed;
    }

    // Not all ASCII: discard the optimistic result and take the general path.
    return encodeComplexUserDefined(string, handling);
}
} // namespace PAL

View File

@@ -0,0 +1,44 @@
/*
* Copyright (C) 2007-2017 Apple, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "TextCodec.h"
#include <wtf/TZoneMalloc.h>
namespace PAL {
// TextCodec for the "x-user-defined" encoding. The class declares no data
// members, so instances carry no state of their own between calls.
class TextCodecUserDefined final : public TextCodec {
WTF_MAKE_TZONE_ALLOCATED(TextCodecUserDefined);
public:
// Registration entry points; each receives a registrar callback to invoke.
static void registerEncodingNames(EncodingNameRegistrar);
static void registerCodecs(TextCodecRegistrar);
private:
// TextCodec overrides.
String decode(std::span<const uint8_t>, bool flush, bool stopOnError, bool& sawError) final;
Vector<uint8_t> encode(StringView, UnencodableHandling) const final;
};
} // namespace PAL

View File

@@ -0,0 +1,197 @@
/*
* Copyright (C) 2004-2019 Apple Inc. All rights reserved.
* Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
* Copyright (C) 2007-2009 Torch Mobile, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "TextEncoding.h"
#include "DecodeEscapeSequences.h"
#include "TextCodec.h"
#include "TextEncodingRegistry.h"
#include <wtf/NeverDestroyed.h>
#include <wtf/StdLibExtras.h>
#include <wtf/text/StringView.h>
namespace PAL {
// File-local accessor for the lazily constructed, never destroyed "UTF-7"
// TextEncoding instance.
static const TextEncoding& UTF7Encoding()
{
    static NeverDestroyed<TextEncoding> encoding("UTF-7"_s);
    return encoding.get();
}
TextEncoding::TextEncoding(ASCIILiteral name)
: m_name(atomCanonicalTextEncodingName(name))
, m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
{
}
TextEncoding::TextEncoding(StringView name)
: m_name(atomCanonicalTextEncodingName(name))
, m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
{
}
// Convenience overload: views `name` as a StringView and delegates to the
// StringView constructor.
TextEncoding::TextEncoding(const String& name)
    : TextEncoding(StringView(name))
{
}
// One-shot decode of `data` with a freshly created codec for this encoding.
// Returns a default-constructed String when the encoding name is null.
// `stopOnError` and `sawError` are passed through to the codec.
String TextEncoding::decode(std::span<const uint8_t> data, bool stopOnError, bool& sawError) const
{
    if (m_name.isNull())
        return String();

    // `data` is the complete input, so the codec is always told to flush.
    constexpr bool flush = true;
    auto codec = newTextCodec(*this);
    return codec->decode(data, flush, stopOnError, sawError);
}
// One-shot encode of `string` with a freshly created codec for this encoding.
// Returns an empty vector when the encoding name is null or the string is
// empty. With NFCNormalize::Yes the text is NFC-normalized before encoding.
Vector<uint8_t> TextEncoding::encode(StringView string, PAL::UnencodableHandling handling, NFCNormalize normalize) const
{
    if (m_name.isNull() || string.isEmpty())
        return {};

    auto codec = newTextCodec(*this);
    if (normalize != NFCNormalize::Yes)
        return codec->encode(string, handling);

    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.
    auto normalized = normalizedNFC(string);
    return codec->encode(normalized.view, handling);
}
// Returns the encoding name to expose to script: the canonical name, except
// that "windows-949" is reported as "EUC-KR".
ASCIILiteral TextEncoding::domName() const
{
    // The windows-949 lookup is only attempted once extended encoding names
    // have been used; otherwise the canonical name is returned unchanged.
    if (!noExtendedTextEncodingNameUsed()) {
        // We treat EUC-KR as windows-949 (its superset), but need to expose
        // the name 'EUC-KR' because the name 'windows-949' is not recognized by
        // most Korean web servers even though they do use the encoding
        // 'windows-949' with the name 'EUC-KR'.
        // FIXME: This is not thread-safe. At the moment, this function is
        // only accessed in a single thread, but eventually has to be made
        // thread-safe along with usesVisualOrdering().
        static const ASCIILiteral windows949 = atomCanonicalTextEncodingName("windows-949"_s);
        if (m_name == windows949)
            return "EUC-KR"_s;
    }
    return m_name;
}
// True only for ISO-8859-8. The canonical name is looked up lazily, and only
// once extended encoding names have been loaded into the registry.
bool TextEncoding::usesVisualOrdering() const
{
    if (!noExtendedTextEncodingNameUsed()) {
        static const ASCIILiteral iso88598 = atomCanonicalTextEncodingName("ISO-8859-8"_s);
        return m_name == iso88598;
    }
    return false;
}
// Asks the registry whether this encoding's canonical name is in its
// Japanese-encodings set.
bool TextEncoding::isJapanese() const
{
    const bool japanese = isJapaneseEncoding(m_name);
    return japanese;
}
// Returns U+00A5 when the registry says this encoding shows backslash as a
// currency symbol, and a plain backslash otherwise.
UChar TextEncoding::backslashAsCurrencySymbol() const
{
    if (shouldShowBackslashAsCurrencySymbolIn(m_name))
        return 0x00A5;
    return '\\';
}
// True when this encoding is UTF-16LE or UTF-16BE.
bool TextEncoding::isNonByteBasedEncoding() const
{
    if (*this == UTF16LittleEndianEncoding())
        return true;
    return *this == UTF16BigEndianEncoding();
}
// True when this encoding is UTF-7. The comparison is skipped entirely while
// no extended encoding names have been loaded into the registry.
bool TextEncoding::isUTF7Encoding() const
{
    return !noExtendedTextEncodingNameUsed() && *this == UTF7Encoding();
}
// Returns UTF-8 for the non-byte-based (UTF-16) encodings, and this encoding
// itself in every other case.
const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
{
    return isNonByteBasedEncoding() ? UTF8Encoding() : *this;
}
// HTML5 specifies that UTF-8 be used in form submission when a form is
// a part of a document in UTF-16, probably because UTF-16 is not a
// byte-based encoding and can contain 0x00. By extension, the same
// should be done for UTF-32. UTF-7 is a byte-based encoding, but it is
// fraught with problems and we'd rather steer clear of it.
const TextEncoding& TextEncoding::encodingForFormSubmissionOrURLParsing() const
{
    const bool mustUseUTF8 = isNonByteBasedEncoding() || isUTF7Encoding();
    return mustUseUTF8 ? UTF8Encoding() : *this;
}
// Lazily constructed, never-destroyed encoding for the label "ASCII".
const TextEncoding& ASCIIEncoding()
{
    static NeverDestroyed<TextEncoding> encoding("ASCII"_s);
    return encoding.get();
}
// Lazily constructed, never-destroyed encoding for the label "latin1".
const TextEncoding& Latin1Encoding()
{
    static NeverDestroyed<TextEncoding> encoding("latin1"_s);
    return encoding.get();
}
// Lazily constructed, never-destroyed encoding for the label "UTF-16BE".
const TextEncoding& UTF16BigEndianEncoding()
{
    static NeverDestroyed<TextEncoding> encoding("UTF-16BE"_s);
    return encoding.get();
}
// Lazily constructed, never-destroyed encoding for the label "UTF-16LE".
const TextEncoding& UTF16LittleEndianEncoding()
{
    static NeverDestroyed<TextEncoding> encoding("UTF-16LE"_s);
    return encoding.get();
}
// Lazily constructed, never-destroyed encoding for the label "UTF-8".
// Debug builds assert that the label resolved to a valid encoding.
const TextEncoding& UTF8Encoding()
{
    static NeverDestroyed<TextEncoding> encoding("UTF-8"_s);
    const TextEncoding& utf8 = encoding.get();
    ASSERT(utf8.isValid());
    return utf8;
}
// Lazily constructed, never-destroyed encoding for the label "WinLatin-1".
const TextEncoding& WindowsLatin1Encoding()
{
    static NeverDestroyed<TextEncoding> encoding("WinLatin-1"_s);
    return encoding.get();
}
// Unescapes URL escape sequences in |string|, interpreting the escaped bytes
// with |encoding|. An empty input is returned as-is without running the decoder.
String decodeURLEscapeSequences(StringView string, const TextEncoding& encoding)
{
    return string.isEmpty()
        ? string.toString()
        : decodeEscapeSequences<URLEscapeSequence>(string, encoding);
}
} // namespace PAL

View File

@@ -0,0 +1,91 @@
/*
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "root.h"
#include "UnencodableHandling.h"
#include <wtf/URL.h>
#include <wtf/text/StringView.h>
namespace PAL {
// Selects whether TextEncoding::encode() NFC-normalizes its input before
// handing it to the codec (Yes) or encodes the text as given (No).
enum class NFCNormalize : bool { No,
Yes };
// A text encoding identified by its canonical registry name.
//
// Constructing from a label resolves it through the encoding registry. A
// default-constructed instance, or one built from an unknown label, has a
// null name and reports isValid() == false.
class TextEncoding : public WTF::URLTextEncoding {
public:
    TextEncoding() = default;
    TextEncoding(ASCIILiteral name);
    TextEncoding(StringView name);
    TextEncoding(const String& name);

    // False when the label could not be resolved to a known encoding.
    bool isValid() const { return !m_name.isNull(); }
    // Canonical registry name; null for an invalid encoding.
    ASCIILiteral name() const { return m_name; }
    ASCIILiteral domName() const; // name exposed via DOM
    bool usesVisualOrdering() const;
    bool isJapanese() const;
    const TextEncoding& closestByteBasedEquivalent() const;
    const TextEncoding& encodingForFormSubmissionOrURLParsing() const;

    // Decodes a complete buffer; |sawError| is passed on to the codec.
    String decode(std::span<const uint8_t>, bool stopOnError, bool& sawError) const;
    // Decodes a complete buffer without stopping on errors and discards the error flag.
    String decode(std::span<const uint8_t>) const;

    Vector<uint8_t> encode(StringView, PAL::UnencodableHandling, NFCNormalize = NFCNormalize::Yes) const;
    // URLTextEncoding hook: encodes without normalization, using URL-encoded entities.
    Vector<uint8_t> encodeForURLParsing(StringView string) const final { return encode(string, PAL::UnencodableHandling::URLEncodedEntities, NFCNormalize::No); }

    UChar backslashAsCurrencySymbol() const;
    bool isByteBasedEncoding() const { return !isNonByteBasedEncoding(); }

private:
    bool isNonByteBasedEncoding() const;
    bool isUTF7Encoding() const;

    ASCIILiteral m_name;
    // Initialized in-class so a default-initialized TextEncoding never holds
    // an indeterminate value. '\\' is what backslashAsCurrencySymbol()
    // returns for a null name.
    UChar m_backslashAsCurrencySymbol { '\\' };
};
// Two encodings are equal when their canonical names compare equal.
inline bool operator==(const TextEncoding& a, const TextEncoding& b)
{
    return a.name() == b.name();
}
// Accessors for commonly used encodings. Each returns a reference to a
// TextEncoding built from the corresponding label.
const TextEncoding& ASCIIEncoding();
const TextEncoding& Latin1Encoding();
const TextEncoding& UTF16BigEndianEncoding();
const TextEncoding& UTF16LittleEndianEncoding();
const TextEncoding& UTF8Encoding();
const TextEncoding& WindowsLatin1Encoding();
// Unescapes the given string using URL escaping rules.
// DANGER: If the URL has "%00" in it,
// the resulting string will have embedded null characters!
String decodeURLEscapeSequences(StringView, const TextEncoding& = UTF8Encoding());
// Convenience overload: decodes |characters| without stopping on errors and
// discards the error flag.
inline String TextEncoding::decode(std::span<const uint8_t> characters) const
{
    // Initialized so the codec never observes an indeterminate value through
    // the reference, in case an implementation reads it before writing.
    bool ignored = false;
    return decode(characters, false, ignored);
}
} // namespace PAL

View File

@@ -0,0 +1,46 @@
/*
* Copyright (C) 2009 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <span>
#include <wtf/text/ASCIILiteral.h>
namespace PAL {
class TextEncoding;
// Given the sequence of bytes in |data| and an optional
// hintEncodingName, detect the most likely character encoding and store it
// in |detectedEncoding|. Returns whether detection succeeded.
// The way hintEncodingName is used is up to an implementation.
// Currently, the only caller sets it to the parent frame encoding.
bool detectTextEncoding(std::span<const uint8_t> data, ASCIILiteral hintEncodingName, TextEncoding* detectedEncoding);
} // namespace PAL

View File

@@ -0,0 +1,125 @@
/*
* Copyright (C) 2008, 2009 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "TextEncodingDetector.h"
#include "TextEncoding.h"
#include "unicode-ucnv.h"
#include "unicode-ucsdet.h"
#include <span>
#include "unicode-ucsdet.h"
namespace WTF {
// Forward declaration of a helper that is defined outside this file.
// NOTE(review): presumably a span-returning wrapper around ICU's
// ucsdet_detectAll(); confirm against the WTF sources.
WTF_EXPORT_PRIVATE std::span<const UCharsetMatch*> ucsdet_detectAll_span(UCharsetDetector*, UErrorCode* status);
} // namespace WTF
using WTF::ucsdet_detectAll_span;
namespace PAL {
// Runs ICU charset detection over |data|.
//
// data              - bytes to inspect.
// hintEncodingName  - optional hint (may be null); when one of the
//                     sufficiently confident matches resolves to the same
//                     encoding as the hint, the hint wins.
// detectedEncoding  - out parameter; always reset to an invalid encoding
//                     first, then set to the detected encoding on success.
//
// Returns true only when an encoding name was actually obtained from the
// detector (or the hint). The detector is closed on every path after it
// has been opened.
bool detectTextEncoding(std::span<const uint8_t> data, ASCIILiteral hintEncodingName, TextEncoding* detectedEncoding)
{
    *detectedEncoding = TextEncoding();

    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector* detector = ucsdet_open(&status);
    if (U_FAILURE(status))
        return false;

    ucsdet_enableInputFilter(detector, true);
    ucsdet_setText(detector, byteCast<char>(data.data()), static_cast<int32_t>(data.size()), &status);
    if (U_FAILURE(status)) {
        // The detector was opened successfully above, so it must be released here too.
        ucsdet_close(detector);
        return false;
    }

    // FIXME: A few things we can do other than improving
    // the ICU detector itself.
    // 1. Use ucsdet_detectAll and pick the most likely one given
    // "the context" (parent-encoding, referrer encoding, etc).
    // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
    // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
    // encoding with a highest confidence among the detector-specific
    // limited set of candidate encodings.
    // Below is a partial implementation of the first part of what's outlined
    // above.
    auto matches = ucsdet_detectAll_span(detector, &status);
    if (U_FAILURE(status)) {
        ucsdet_close(detector);
        return false;
    }

    const char* encoding = nullptr;
    if (!hintEncodingName.isNull()) {
        TextEncoding hintEncoding(hintEncodingName);
        // 10 is the minimum confidence value consistent with the codepoint
        // allocation in a given encoding. The size of a chunk passed to
        // us varies even for the same html file (apparently depending on
        // the network load). When we're given a rather short chunk, we
        // don't have a sufficiently reliable signal other than the fact that
        // the chunk is consistent with a set of encodings. So, instead of
        // setting an arbitrary threshold, we have to scan all the encodings
        // consistent with the data.
        const int32_t kThreshold = 10;
        for (auto* match : matches) {
            int32_t confidence = ucsdet_getConfidence(match, &status);
            if (U_FAILURE(status)) {
                status = U_ZERO_ERROR;
                continue;
            }
            if (confidence < kThreshold)
                break;
            const char* matchEncoding = ucsdet_getName(match, &status);
            if (U_FAILURE(status)) {
                status = U_ZERO_ERROR;
                continue;
            }
            if (TextEncoding(StringView::fromLatin1(matchEncoding)) == hintEncoding) {
                encoding = hintEncodingName;
                break;
            }
        }
    }

    // If no match is found so far, just pick the top match.
    // This can happen, say, when a parent frame in EUC-JP refers to
    // a child frame in Shift_JIS and both frames do NOT specify the encoding
    // making us resort to auto-detection (when it IS turned on).
    if (!encoding && !matches.empty())
        encoding = ucsdet_getName(matches[0], &status);

    // Without a name there is nothing to report: the detector may have
    // produced no matches at all. Do not build a TextEncoding from a null
    // pointer and claim success.
    const bool detected = U_SUCCESS(status) && encoding;
    if (detected) {
        // Resolve the name before closing the detector; the match names were
        // obtained from the detector and are not used after it is closed.
        *detectedEncoding = TextEncoding(StringView::fromLatin1(encoding));
    }
    ucsdet_close(detector);
    return detected;
}
}

View File

@@ -0,0 +1,368 @@
/*
* Copyright (C) 2006-2017 Apple Inc. All rights reserved.
* Copyright (C) 2007-2009 Torch Mobile, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "TextEncodingRegistry.h"
#include "TextCodecCJK.h"
#include "TextCodecICU.h"
#include "TextCodecLatin1.h"
#include "TextCodecReplacement.h"
#include "TextCodecSingleByte.h"
#include "TextCodecUTF16.h"
#include "TextCodecUTF8.h"
#include "TextCodecUserDefined.h"
#include "TextEncoding.h"
#include <mutex>
#include <wtf/ASCIICType.h>
#include <wtf/CheckedArithmetic.h>
#include <wtf/HashMap.h>
#include <wtf/HashSet.h>
#include <wtf/Lock.h>
#include <wtf/MainThread.h>
#include <wtf/StdLibExtras.h>
#include <wtf/text/CString.h>
#include <wtf/text/StringHash.h>
namespace PAL {
// Longest encoding name the registry accepts. It bounds registered aliases
// (asserted in addToTextEncodingNameMap) and sizes the stack buffer used to
// narrow 16-bit names in atomCanonicalTextEncodingName.
constexpr size_t maxEncodingNameLength = 63;
// Hash for all-ASCII strings that does case folding.
struct TextEncodingNameHash {
static bool equal(std::span<const LChar> s1, std::span<const LChar> s2)
{
if (s1.size() != s2.size())
return false;
for (size_t i = 0; i < s1.size(); ++i) {
if (toASCIILower(s1[i]) != toASCIILower(s2[i]))
return false;
}
return true;
}
static bool equal(ASCIILiteral s1, ASCIILiteral s2)
{
return equal(s1.span8(), s2.span8());
}
// This algorithm is the one-at-a-time hash from:
// http://burtleburtle.net/bob/hash/hashfaq.html
// http://burtleburtle.net/bob/hash/doobs.html
static unsigned hash(std::span<const LChar> s)
{
unsigned h = WTF::stringHashingStartValue;
for (char c : s) {
h += toASCIILower(c);
h += (h << 10);
h ^= (h >> 6);
}
h += (h << 3);
h ^= (h >> 11);
h += (h << 15);
return h;
}
static unsigned hash(ASCIILiteral s)
{
return hash(s.span8());
}
static const bool safeToCompareToEmptyOrDeleted = false;
};
struct HashTranslatorTextEncodingName {
static unsigned hash(std::span<const LChar> literal)
{
return TextEncodingNameHash::hash(literal);
}
static bool equal(const ASCIILiteral& a, std::span<const LChar> b)
{
return TextEncodingNameHash::equal(a.span8(), b);
}
};
// Alias -> canonical name. Keys are hashed and compared with ASCII case folding.
using TextEncodingNameMap = HashMap<ASCIILiteral, ASCIILiteral, TextEncodingNameHash>;
// Canonical name -> factory function that creates a codec for that encoding.
using TextCodecMap = HashMap<ASCIILiteral, NewTextCodecFunction>;
// Guards both maps below.
static Lock encodingRegistryLock;
static TextEncodingNameMap* textEncodingNameMap WTF_GUARDED_BY_LOCK(encodingRegistryLock);
static TextCodecMap* textCodecMap WTF_GUARDED_BY_LOCK(encodingRegistryLock);
// Set (under the lock) once extendTextCodecMaps() has run. It is read without
// the lock by noExtendedTextEncodingNameUsed().
static bool didExtendTextCodecMaps;
// Quirk sets, allocated by buildQuirksSets(); null until then.
static HashSet<ASCIILiteral>* japaneseEncodings;
static HashSet<ASCIILiteral>* nonBackslashEncodings;
// Encodings (and all of their aliases) that pruneBlocklistedCodecs() removes
// from the registry.
static constexpr ASCIILiteral textEncodingNameBlocklist[] = { "UTF-7"_s, "BOCU-1"_s, "SCSU"_s };
// Returns true for aliases that must never be registered.
static bool isUndesiredAlias(ASCIILiteral alias)
{
    // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
    const bool hasVersionSuffix = strchr(alias.characters(), ',');

    // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
    // problem, see bug 43554.
    return hasVersionSuffix || alias == "8859_1"_s;
}
// Registers |alias| as another spelling of |name|. When |name| is already
// registered, the alias is bound to the canonical name stored for it, so every
// alias of an encoding ends up mapping to the same canonical name.
static void addToTextEncodingNameMap(ASCIILiteral alias, ASCIILiteral name) WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    ASSERT(strlen(alias) <= maxEncodingNameLength);
    if (isUndesiredAlias(alias))
        return;

    const ASCIILiteral registeredName = textEncodingNameMap->get(name);
    // Only the self-registration (alias == name) may refer to a name that is not in the map yet.
    ASSERT((alias == name) || !registeredName.isNull());
    const ASCIILiteral canonicalName = registeredName.isNull() ? name : registeredName;

    ASSERT_WITH_MESSAGE(textEncodingNameMap->get(alias).isNull(), "Duplicate text encoding name %s for %s (previously registered as %s)", alias.characters(), canonicalName.characters(), textEncodingNameMap->get(alias).characters());
    textEncodingNameMap->add(alias, canonicalName);
}
// Registers the codec factory for |name|, keyed by the canonical name the
// name map stores for it. The name must have been registered beforehand.
static void addToTextCodecMap(ASCIILiteral name, NewTextCodecFunction&& function) WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    const ASCIILiteral canonicalName = textEncodingNameMap->get(name);
    ASSERT(!canonicalName.isNull());
    textCodecMap->add(canonicalName, WTFMove(function));
}
// Removes every blocklisted encoding from the registry: all aliases that map
// to its canonical name, and the codec factory itself.
static void pruneBlocklistedCodecs() WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    for (auto& blocked : textEncodingNameBlocklist) {
        ASCIILiteral canonicalName = textEncodingNameMap->get(blocked);
        if (canonicalName.isNull())
            continue;

        // Collect first, remove afterwards: the map must not be mutated while it is being iterated.
        Vector<ASCIILiteral> aliasesToRemove;
        for (auto& entry : *textEncodingNameMap) {
            if (entry.value == canonicalName)
                aliasesToRemove.append(entry.key);
        }
        for (auto& alias : aliasesToRemove)
            textEncodingNameMap->remove(alias);

        textCodecMap->remove(canonicalName);
    }
}
// Allocates both registry maps and registers the base codecs (Latin-1, UTF-8,
// UTF-16 and user-defined). Must run exactly once, before any lookup.
static void buildBaseTextCodecMaps() WTF_REQUIRES_LOCK(encodingRegistryLock)
{
ASSERT(!textCodecMap);
ASSERT(!textEncodingNameMap);
textCodecMap = new TextCodecMap;
textEncodingNameMap = new TextEncodingNameMap;
// For each codec the names are registered before the codec factory:
// addToTextCodecMap() looks the name up in the name map.
TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
TextCodecLatin1::registerCodecs(addToTextCodecMap);
TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
TextCodecUTF8::registerCodecs(addToTextCodecMap);
TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
TextCodecUTF16::registerCodecs(addToTextCodecMap);
TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
TextCodecUserDefined::registerCodecs(addToTextCodecMap);
}
// Adds the canonical form of |name| to |set|; names the registry does not
// know are silently skipped.
static void addEncodingName(HashSet<ASCIILiteral>& set, ASCIILiteral name) WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    // We must not use atomCanonicalTextEncodingName() because this function is called in it.
    const ASCIILiteral canonicalName = textEncodingNameMap->get(name);
    if (canonicalName.isNull())
        return;
    set.add(canonicalName);
}
// Allocates and fills the two quirk sets consulted by isJapaneseEncoding() and
// shouldShowBackslashAsCurrencySymbolIn(). Must run exactly once.
static void buildQuirksSets() WTF_REQUIRES_LOCK(encodingRegistryLock)
{
    // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn()
    // and initializing the sets for them in TextEncodingRegistry.cpp look strange.
    ASSERT(!japaneseEncodings);
    ASSERT(!nonBackslashEncodings);

    static constexpr ASCIILiteral japaneseNames[] = {
        "EUC-JP"_s,
        "ISO-2022-JP"_s,
        "ISO-2022-JP-1"_s,
        "ISO-2022-JP-2"_s,
        "ISO-2022-JP-3"_s,
        "JIS_C6226-1978"_s,
        "JIS_X0201"_s,
        "JIS_X0208-1983"_s,
        "JIS_X0208-1990"_s,
        "JIS_X0212-1990"_s,
        "Shift_JIS"_s,
        "Shift_JIS_X0213-2000"_s,
        "cp932"_s,
        "x-mac-japanese"_s,
    };
    japaneseEncodings = new HashSet<ASCIILiteral>;
    for (auto name : japaneseNames)
        addEncodingName(*japaneseEncodings, name);

    // The text encodings below treat backslash as a currency symbol for IE compatibility.
    // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
    // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them.
    static constexpr ASCIILiteral currencyBackslashNames[] = {
        "x-mac-japanese"_s,
        "ISO-2022-JP"_s,
        "EUC-JP"_s,
        "Shift_JIS"_s,
        "Shift_JIS_X0213-2000"_s,
    };
    nonBackslashEncodings = new HashSet<ASCIILiteral>;
    for (auto name : currencyBackslashNames)
        addEncodingName(*nonBackslashEncodings, name);
}
// True when |canonicalEncodingName| is in the Japanese-encodings set. A null
// name, or a set that has not been built yet, yields false.
bool isJapaneseEncoding(ASCIILiteral canonicalEncodingName)
{
    if (canonicalEncodingName.isNull() || !japaneseEncodings)
        return false;
    return japaneseEncodings->contains(canonicalEncodingName);
}
// True when |canonicalEncodingName| is in the set of encodings that show
// backslash as a currency symbol. A null name, or a set that has not been
// built yet, yields false.
bool shouldShowBackslashAsCurrencySymbolIn(ASCIILiteral canonicalEncodingName)
{
    if (canonicalEncodingName.isNull() || !nonBackslashEncodings)
        return false;
    return nonBackslashEncodings->contains(canonicalEncodingName);
}
// Registers the extended codecs (replacement, ICU, CJK and single-byte) on top
// of the base maps, then prunes blocklisted encodings and builds the quirk sets.
static void extendTextCodecMaps() WTF_REQUIRES_LOCK(encodingRegistryLock)
{
// For each codec the names are registered before the codec factory:
// addToTextCodecMap() looks the name up in the name map.
TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap);
TextCodecReplacement::registerCodecs(addToTextCodecMap);
TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
TextCodecICU::registerCodecs(addToTextCodecMap);
TextCodecCJK::registerEncodingNames(addToTextEncodingNameMap);
TextCodecCJK::registerCodecs(addToTextCodecMap);
TextCodecSingleByte::registerEncodingNames(addToTextEncodingNameMap);
TextCodecSingleByte::registerCodecs(addToTextCodecMap);
// Pruning runs after every registration so blocklisted names are gone
// before the quirk sets look names up.
pruneBlocklistedCodecs();
buildQuirksSets();
}
// Creates a new codec instance for |encoding|.
//
// Returns nullptr when the encoding is invalid, when no codec is registered
// under its canonical name, or when the registered factory is empty. Callers
// must check the result before using it.
std::unique_ptr<TextCodec> newTextCodec(const TextEncoding& encoding)
{
    // Check validity first: an invalid encoding (for example one built from an
    // empty label) can reach this function before the registry maps have ever
    // been built, and that case must yield nullptr rather than trip the
    // assertion below.
    if (!encoding.isValid())
        return nullptr;

    Locker locker { encodingRegistryLock };
    // A valid encoding got its name from the registry, so the maps should exist.
    ASSERT(textCodecMap);
    if (!textCodecMap)
        return nullptr;

    auto result = textCodecMap->find(encoding.name());
    if (result == textCodecMap->end())
        return nullptr;

    // An empty factory cannot produce a codec; there is no UTF-8 fallback here.
    if (!result->value)
        return nullptr;

    return result->value();
}
// Resolves an 8-bit label to its canonical name. The base maps are built on
// first use; the extended maps are loaded only when the base maps do not know
// the label, and at most once.
static ASCIILiteral atomCanonicalTextEncodingName(std::span<const LChar> name)
{
    if (name.empty())
        return {};

    Locker locker { encodingRegistryLock };
    if (!textEncodingNameMap)
        buildBaseTextCodecMaps();

    if (ASCIILiteral knownName = textEncodingNameMap->get<HashTranslatorTextEncodingName>(name))
        return knownName;

    if (!didExtendTextCodecMaps) {
        // First miss: load the extended encodings and try once more.
        extendTextCodecMaps();
        didExtendTextCodecMaps = true;
        return textEncodingNameMap->get<HashTranslatorTextEncodingName>(name);
    }
    return {};
}
// Resolves a 16-bit label by narrowing it into a fixed-size stack buffer and
// delegating to the 8-bit overload. Labels longer than maxEncodingNameLength
// are rejected.
static ASCIILiteral atomCanonicalTextEncodingName(std::span<const UChar> characters)
{
    const size_t length = characters.size();
    if (length > maxEncodingNameLength)
        return {};

    std::array<LChar, maxEncodingNameLength> narrowed;
    size_t index = 0;
    for (UChar character : characters)
        narrowed[index++] = character;

    return atomCanonicalTextEncodingName(std::span { narrowed }.first(length));
}
// Resolves a literal label by delegating to the 8-bit span overload.
ASCIILiteral atomCanonicalTextEncodingName(ASCIILiteral name)
{
    const auto characters = name.span8();
    return atomCanonicalTextEncodingName(characters);
}
// Resolves an arbitrary label view. Empty or non-ASCII labels are rejected;
// otherwise the 8-bit or 16-bit overload is used to match the view's storage.
ASCIILiteral atomCanonicalTextEncodingName(StringView alias)
{
    if (alias.isEmpty() || !alias.containsOnlyASCII())
        return {};
    return alias.is8Bit()
        ? atomCanonicalTextEncodingName(alias.span8())
        : atomCanonicalTextEncodingName(alias.span16());
}
// True while the extended encoding maps have not been loaded.
bool noExtendedTextEncodingNameUsed()
{
    // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
    const bool extended = didExtendTextCodecMaps;
    return !extended;
}
// Returns the default encoding name for the system language. On Cocoa it is
// derived from the system's default CFString encoding, with two names remapped;
// everywhere else it is "ISO-8859-1".
String defaultTextEncodingNameForSystemLanguage()
{
#if PLATFORM(COCOA)
    String encodingName = CFStringConvertEncodingToIANACharSetName(webDefaultCFStringEncoding());

    if (equalLettersIgnoringASCIICase(encodingName, "cp949"_s)) {
        // CFStringConvertEncodingToIANACharSetName() returns cp949 for kTextEncodingDOSKorean AKA "extended EUC-KR" AKA windows-949.
        // ICU uses this name for a different encoding, so we need to change the name to a value that actually gives us windows-949.
        // In addition, this value must match what is used in Safari, see <rdar://problem/5579292>.
        // On some OS versions, the result is CP949 (uppercase).
        encodingName = "ks_c_5601-1987"_s;
    } else if (equalLettersIgnoringASCIICase(encodingName, "cp874"_s)) {
        // CFStringConvertEncodingToIANACharSetName() returns cp874 for kTextEncodingDOSThai, AKA windows-874.
        // Since "cp874" alias is not standard (https://encoding.spec.whatwg.org/#names-and-labels), map to
        // "dos-874" instead.
        encodingName = "dos-874"_s;
    }
    return encodingName;
#else
    return "ISO-8859-1"_s;
#endif
}
} // namespace PAL

View File

@@ -0,0 +1,57 @@
/*
* Copyright (C) 2006-2017 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <memory>
#include <wtf/Forward.h>
#if PLATFORM(COCOA)
#include <CoreFoundation/CoreFoundation.h>
#endif
namespace PAL {
class TextCodec;
class TextEncoding;
// Use TextResourceDecoder::decode to decode resources, since it handles BOMs.
// Use TextEncoding::encode to encode, since it takes care of normalization.
// NOTE: the result may be null; callers must check it before use.
std::unique_ptr<TextCodec> newTextCodec(const TextEncoding&);
// Only TextEncoding should use the following functions directly.
// Both overloads return the canonical name for an alias, or a null literal
// when the alias is not known.
ASCIILiteral atomCanonicalTextEncodingName(ASCIILiteral alias);
ASCIILiteral atomCanonicalTextEncodingName(StringView);
// True while no lookup has needed the extended encoding maps.
bool noExtendedTextEncodingNameUsed();
bool isJapaneseEncoding(ASCIILiteral canonicalEncodingName);
bool shouldShowBackslashAsCurrencySymbolIn(ASCIILiteral canonicalEncodingName);
String defaultTextEncodingNameForSystemLanguage();
#if PLATFORM(COCOA)
CFStringEncoding webDefaultCFStringEncoding();
#endif
} // namespace PAL

View File

@@ -0,0 +1,43 @@
/*
* Copyright (C) 2004-2017 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
namespace PAL {
// Specifies what will happen when a character is encountered that is
// not encodable in the character set.
enum class UnencodableHandling: bool {
// Encodes the character as an XML entity. For example, U+06DE
// would be "&#1758;" (0x6DE = 1758 in decimal).
Entities,
// Encodes the character as an entity as above, but with
// non-alphanumeric characters escaped. This is used in URLs.
// For example, U+6DE would be "%26%231758%3B".
URLEncodedEntities
};
}

View File

@@ -0,0 +1,63 @@
#include "root.h"
#include "TextCodec.h"
#include "TextEncodingRegistry.h"
#include "TextEncoding.h"
#include "headers-handwritten.h"
#include <JavaScriptCore/JSGlobalObject.h>
namespace Bun {
using namespace PAL;
using namespace WTF;
class WebKitTextCodec {
WTF_MAKE_FAST_ALLOCATED;
public:
std::unique_ptr<TextCodec> codec;
TextEncoding encoding;
static WebKitTextCodec* create(std::span<const LChar> encodingLabel)
{
const auto encoding = TextEncoding(String(encodingLabel));
auto codec = newTextCodec(encoding);
if (codec) {
return new WebKitTextCodec(WTFMove(codec), encoding);
}
return nullptr;
}
};
// C entry point: creates a codec from a Latin-1 label of |len| bytes at |ptr|.
// Returns nullptr when no codec could be created for the label.
extern "C" WebKitTextCodec* WebKitTextCodec__create(const LChar* ptr, size_t len)
{
    return WebKitTextCodec::create(std::span<const LChar>(ptr, len));
}
// C entry point: destroys a codec returned by WebKitTextCodec__create.
extern "C" void WebKitTextCodec__deinit(WebKitTextCodec* codec)
{
delete codec;
}
// C entry point: decodes |input_len| bytes at |input_ptr| with |code|'s codec.
//
// flush       - forwarded to the codec's decode().
// stopOnError - in/out, must not be null. On entry it holds the caller's
//               stop-on-error flag; on exit it holds the codec's error flag
//               (it is reset to false before decoding).
//
// Returns the decoded text converted with Bun::toStringRef().
extern "C" BunString WebKitTextCodec__decode(WebKitTextCodec* code, const uint8_t* input_ptr, size_t input_len, bool flush, bool* stopOnError)
{
    const std::span<const uint8_t> data = { input_ptr, input_len };

    // Read the pointed-to flag. Converting the pointer itself to bool is true
    // for every non-null pointer, which would force stop-on-error on for all calls.
    const bool shouldStop = *stopOnError;

    // From here on the same storage carries the "saw error" result back to the caller.
    *stopOnError = false;
    auto str = code->codec->decode(data, flush, shouldStop, *stopOnError);
    return Bun::toStringRef(str);
}
// C entry point: returns the canonical name of the encoding |code| was created for.
extern "C" BunString WebKitTextCodec__name(WebKitTextCodec* code)
{
return Bun::toStringRef(code->encoding.name());
}
// C entry point: forwards to the underlying codec's stripByteOrderMark().
extern "C" void WebKitTextCodec__stripByteOrderMark(WebKitTextCodec* code)
{
code->codec->stripByteOrderMark();
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,164 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2000-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* ucnv_cb.h:
* External APIs for the ICU's codeset conversion library
* Helena Shih
*
* Modification History:
*
* Date Name Description
*/
/**
* \file
* \brief C UConverter functions to aid the writers of callbacks
*
* <h2> Callback API for UConverter </h2>
*
* These functions are provided here for the convenience of the callback
* writer. If you are just looking for callback functions to use, please
* see ucnv_err.h. DO NOT call these functions directly when you are
* working with converters, unless your code has been called as a callback
* via ucnv_setFromUCallback or ucnv_setToUCallback !!
*
* A note about error codes and overflow. Unlike other ICU functions,
* these functions do not expect the error status to be U_ZERO_ERROR.
* Callbacks must be much more careful about their error codes.
* The error codes used here are in/out parameters, which should be passed
* back in the callback's error parameter.
*
* For example, if you call ucnv_cbfromUWriteBytes to write data out
* to the output codepage, it may return U_BUFFER_OVERFLOW_ERROR if
* the data did not fit in the target. But this isn't a failing error,
* in fact, ucnv_cbfromUWriteBytes may be called AGAIN with the error
* status still U_BUFFER_OVERFLOW_ERROR to attempt to write further bytes,
* which will also go into the internal overflow buffers.
*
* Concerning offsets, the 'offset' parameters here are relative to the start
* of SOURCE. For example, Suppose the string "ABCD" was being converted
* from Unicode into a codepage which doesn't have a mapping for 'B'.
* 'A' will be written out correctly, but
* The FromU Callback will be called on an unassigned character for 'B'.
* At this point, this is the state of the world:
* Target: A [..] [points after A]
* Source: A B [C] D [points to C - B has been consumed]
* 0 1 2 3
* codePoint = "B" [the unassigned codepoint]
*
* Now, suppose a callback wants to write the substitution character '?' to
* the target. It calls ucnv_cbFromUWriteBytes() to write the ?.
* It should pass ZERO as the offset, because the offset as far as the
* callback is concerned is relative to the SOURCE pointer [which points
* before 'C'.] If the callback goes into the args and consumes 'C' also,
* it would call FromUWriteBytes with an offset of 1 (and advance the source
* pointer).
*
*/
#ifndef UCNV_CB_H
#define UCNV_CB_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
#include "unicode-ucnv.h"
#include "unicode-ucnv_err.h"
/**
* ONLY used by FromU callback functions.
* Writes the specified bytes to the target byte buffer or, if they do not
* all fit, to the converter's internal buffers.
*
* @param args callback fromUnicode arguments
* @param source source bytes to write
* @param length number of bytes to write
* @param offsetIndex the relative offset index from callback.
* @param err error status. If <TT>U_BUFFER_OVERFLOW_ERROR</TT> is returned, then U_BUFFER_OVERFLOW_ERROR <STRONG>must</STRONG>
* be returned to the user, because it means that not all data could be written into the target buffer, and some is
* in the converter error buffer.
* @see ucnv_cbFromUWriteSub
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
ucnv_cbFromUWriteBytes(UConverterFromUnicodeArgs* args,
const char* source,
int32_t length,
int32_t offsetIndex,
UErrorCode* err);
/**
* ONLY used by FromU callback functions.
* This function will write out the correct substitution character sequence
* to the target.
*
* @param args callback fromUnicode arguments
* @param offsetIndex the relative offset index from the current source pointer to be used
* @param err error status. If <TT>U_BUFFER_OVERFLOW_ERROR</TT> is returned, then U_BUFFER_OVERFLOW_ERROR <STRONG>must</STRONG>
* be returned to the user, because it means that not all data could be written into the target buffer, and some is
* in the converter error buffer.
* @see ucnv_cbFromUWriteBytes
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
ucnv_cbFromUWriteSub(UConverterFromUnicodeArgs* args,
int32_t offsetIndex,
UErrorCode* err);
/**
* ONLY used by FromU callback functions.
* This function will write out the error character(s) to the target UChar buffer.
* NOTE(review): UConverterFromUnicodeArgs declares its target as a char buffer,
* so "target UChar buffer" looks inaccurate here; presumably the UChars are
* converted through the converter before being written - confirm.
*
* @param args callback fromUnicode arguments
* @param source pointer to pointer to first UChar to write [on exit: 1 after last UChar processed]
* @param sourceLimit pointer after last UChar to write
* @param offsetIndex the relative offset index from callback which will be set
* @param err error status <TT>U_BUFFER_OVERFLOW_ERROR</TT>
* @see ucnv_cbFromUWriteSub
* @see ucnv_cbToUWriteSub
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 ucnv_cbFromUWriteUChars(UConverterFromUnicodeArgs* args,
const UChar** source,
const UChar* sourceLimit,
int32_t offsetIndex,
UErrorCode* err);
/**
* ONLY used by ToU callback functions.
* This function will write out the specified characters to the target
* UChar buffer.
*
* @param args callback toUnicode arguments
* @param source source string to write
* @param length the length of the source string
* @param offsetIndex the relative offset index which will be written.
* @param err error status <TT>U_BUFFER_OVERFLOW_ERROR</TT>
* @see ucnv_cbToUWriteSub
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 ucnv_cbToUWriteUChars(UConverterToUnicodeArgs* args,
const UChar* source,
int32_t length,
int32_t offsetIndex,
UErrorCode* err);
/**
* ONLY used by ToU callback functions.
* This function will write out the Unicode substitution character (U+FFFD).
*
* @param args callback toUnicode arguments
* @param offsetIndex the relative offset index from callback.
* @param err error status <TT>U_BUFFER_OVERFLOW_ERROR</TT>
* @see ucnv_cbToUWriteUChars
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 ucnv_cbToUWriteSub(UConverterToUnicodeArgs* args,
int32_t offsetIndex,
UErrorCode* err);
#endif
#endif

View File

@@ -0,0 +1,465 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1999-2009, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
*
* ucnv_err.h:
*/
/**
* \file
* \brief C UConverter predefined error callbacks
*
* <h2>Error Behaviour Functions</h2>
* Defines some error behaviour functions called by ucnv_{from,to}Unicode
* These are provided as part of ICU and many are stable, but they
* can also be considered only as an example of what can be done with
* callbacks. You may of course write your own.
*
* If you want to write your own, you may also find the functions from
* ucnv_cb.h useful when writing your own callbacks.
*
* These functions, although public, should NEVER be called directly.
* They should be used as parameters to the ucnv_setFromUCallBack
* and ucnv_setToUCallBack functions, to set the behaviour of a converter
* when it encounters ILLEGAL/UNMAPPED/INVALID sequences.
*
* usage example: 'STOP' doesn't need any context, but newContext
* could be set to something other than 'NULL' if needed. The available
* contexts in this header can modify the default behavior of the callback.
*
* \code
* UErrorCode err = U_ZERO_ERROR;
* UConverter *myConverter = ucnv_open("ibm-949", &err);
* const void *oldContext;
* UConverterFromUCallback oldAction;
*
*
* if (U_SUCCESS(err))
* {
* ucnv_setFromUCallBack(myConverter,
* UCNV_FROM_U_CALLBACK_STOP,
* NULL,
* &oldAction,
* &oldContext,
* &err);
* }
* \endcode
*
* The code above tells "myConverter" to stop when it encounters an
* ILLEGAL/TRUNCATED/INVALID sequence while converting from
* Unicode -> Codepage. The behavior from Codepage to Unicode is not changed,
* and ucnv_setToUCallBack would need to be called in order to change
* that behavior too.
*
* Here is an example with a context:
*
* \code
* UErrorCode err = U_ZERO_ERROR;
* UConverter *myConverter = ucnv_open("ibm-949", &err);
* const void *oldContext;
* UConverterToUCallback oldAction;
*
*
* if (U_SUCCESS(err))
* {
* ucnv_setToUCallBack(myConverter,
* UCNV_TO_U_CALLBACK_SUBSTITUTE,
* UCNV_SUB_STOP_ON_ILLEGAL,
* &oldAction,
* &oldContext,
* &err);
* }
* \endcode
*
* The code above tells "myConverter" to stop when it encounters an
* ILLEGAL/TRUNCATED/INVALID sequence while converting from
* Codepage -> Unicode. Any unmapped and legal characters will be
* replaced with the default substitution character.
*/
#ifndef UCNV_ERR_H
#define UCNV_ERR_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
/** Forward declaration of the UConverter structure; its members are not defined in this header. @stable ICU 2.0 */
struct UConverter;
/** Lets the converter type be named without the struct keyword. @stable ICU 2.0 */
typedef struct UConverter UConverter;
/**
* FROM_U, TO_U context option for the substitute callbacks.
* When passed as the callback context, the callback stops at an illegal
* sequence and returns the error to the caller instead of substituting it.
* @stable ICU 2.0
*/
#define UCNV_SUB_STOP_ON_ILLEGAL "i"
/**
* FROM_U, TO_U context option for the skip callbacks.
* When passed as the callback context, the callback stops at an illegal
* sequence and returns the error to the caller instead of skipping it.
* @stable ICU 2.0
*/
#define UCNV_SKIP_STOP_ON_ILLEGAL "i"
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX).
* This is the NULL context, i.e. the format used when no context is supplied.
* @stable ICU 2.0
*/
#define UCNV_ESCAPE_ICU NULL
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX)
* @stable ICU 2.0
*/
#define UCNV_ESCAPE_JAVA "J"
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
* TO_U_CALLBACK_ESCAPE context option to escape the character value according to C (\\xXXXX)
* @stable ICU 2.0
*/
#define UCNV_ESCAPE_C "C"
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
* TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
* @stable ICU 2.0
*/
#define UCNV_ESCAPE_XML_DEC "D"
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
* TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
* @stable ICU 2.0
*/
#define UCNV_ESCAPE_XML_HEX "X"
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
* @stable ICU 2.0
*/
#define UCNV_ESCAPE_UNICODE "U"
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to CSS2 conventions (\\HH..H<space>, that is,
* a backslash, 1..6 hex digits, and a space)
* @stable ICU 4.0
*/
#define UCNV_ESCAPE_CSS2 "S"
/**
* The process condition code to be used with the callbacks.
* It tells a callback why it is being invoked.
* Codes which are greater than UCNV_IRREGULAR should be
* passed on to any chained callbacks.
* @stable ICU 2.0
*/
typedef enum {
UCNV_UNASSIGNED = 0, /**< The code point is unassigned.
The error code U_INVALID_CHAR_FOUND will be set. */
UCNV_ILLEGAL = 1, /**< The code point is illegal. For example,
\\x81\\x2E is illegal in SJIS because \\x2E
is not a valid trail byte for the \\x81
lead byte.
Also, starting with Unicode 3.0.1, non-shortest byte sequences
in UTF-8 (like \\xC1\\xA1 instead of \\x61 for U+0061)
are also illegal, not just irregular.
The error code U_ILLEGAL_CHAR_FOUND will be set. */
UCNV_IRREGULAR = 2, /**< The code point is not a regular sequence in
the encoding. For example, \\xED\\xA0\\x80..\\xED\\xBF\\xBF
are irregular UTF-8 byte sequences for single surrogate
code points.
The error code U_INVALID_CHAR_FOUND will be set. */
UCNV_RESET = 3, /**< The callback is called with this reason when a
'reset' has occurred. The callback should reset all
state. */
UCNV_CLOSE = 4, /**< Called when the converter is closed. The
callback should release any allocated memory. */
UCNV_CLONE = 5 /**< Called when ucnv_safeClone() is called on the
converter. The pointer available as the
'context' is an alias to the original converter's
context pointer. If the context must be owned
by the new converter, the callback must clone
the data and call ucnv_setFromUCallBack
(or ucnv_setToUCallBack) with the correct pointer.
@stable ICU 2.2
*/
} UConverterCallbackReason;
/**
* The structure for the fromUnicode callback function parameter.
* The source side is UChar text and the target side is a char (byte) buffer.
* @stable ICU 2.0
*/
typedef struct {
uint16_t size; /**< The size of this struct. @stable ICU 2.0 */
UBool flush; /**< The internal state of the converter will be reset and data flushed if set to true. @stable ICU 2.0 */
UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */
const UChar *source; /**< Pointer to the source buffer. @stable ICU 2.0 */
const UChar *sourceLimit; /**< Pointer to the limit (end + 1) of the source buffer. @stable ICU 2.0 */
char *target; /**< Pointer to the target buffer. @stable ICU 2.0 */
const char *targetLimit; /**< Pointer to the limit (end + 1) of the target buffer. @stable ICU 2.0 */
int32_t *offsets; /**< Pointer to the buffer that receives the offsets, written as: *offsets = value; offsets++;. @stable ICU 2.0 */
} UConverterFromUnicodeArgs;
/**
* The structure for the toUnicode callback function parameter.
* The source side is a char (byte) buffer and the target side is UChar text.
* @stable ICU 2.0
*/
typedef struct {
uint16_t size; /**< The size of this struct. @stable ICU 2.0 */
UBool flush; /**< The internal state of the converter will be reset and data flushed if set to true. @stable ICU 2.0 */
UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */
const char *source; /**< Pointer to the source buffer. @stable ICU 2.0 */
const char *sourceLimit; /**< Pointer to the limit (end + 1) of the source buffer. @stable ICU 2.0 */
UChar *target; /**< Pointer to the target buffer. @stable ICU 2.0 */
const UChar *targetLimit; /**< Pointer to the limit (end + 1) of the target buffer. @stable ICU 2.0 */
int32_t *offsets; /**< Pointer to the buffer that receives the offsets, written as: *offsets = value; offsets++;. @stable ICU 2.0 */
} UConverterToUnicodeArgs;
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This From Unicode callback STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
*
* @param context Pointer to the callback's private data
* @param fromUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
* @param length Length of the concerned Unicode sequence, counted in UChars (see codeUnits)
* @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode code point.
* @param reason Defines the reason the callback was invoked
* @param err This should always be set to a failure status prior to calling.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_STOP (
const void *context,
UConverterFromUnicodeArgs *fromUArgs,
const UChar* codeUnits,
int32_t length,
UChar32 codePoint,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This To Unicode callback STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
*
* @param context Pointer to the callback's private data
* @param toUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' bytes of the concerned codepage sequence
* @param length Size (in bytes) of the concerned codepage sequence
* @param reason Defines the reason the callback was invoked
* @param err This should always be set to a failure status prior to calling.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP (
const void *context,
UConverterToUnicodeArgs *toUArgs,
const char* codeUnits,
int32_t length,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This From Unicode callback skips any ILLEGAL_SEQUENCE, or
* skips only UNASSIGNED_SEQUENCE depending on the context parameter,
* simply ignoring those characters.
*
* @param context The function currently recognizes the callback options:
* UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
* NULL: Skips any ILLEGAL_SEQUENCE
* @param fromUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
* @param length Length of the concerned Unicode sequence, counted in UChars (see codeUnits)
* @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode code point.
* @param reason Defines the reason the callback was invoked
* @param err Return value will be set to success if the callback was handled,
* otherwise this value will be set to a failure status.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP (
const void *context,
UConverterFromUnicodeArgs *fromUArgs,
const UChar* codeUnits,
int32_t length,
UChar32 codePoint,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This From Unicode callback will substitute the ILLEGAL SEQUENCE, or
* UNASSIGNED_SEQUENCE depending on the context parameter, with the
* current substitution string for the converter. This is the default
* callback.
*
* @param context The function currently recognizes the callback options:
* UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
* NULL: Substitutes any ILLEGAL_SEQUENCE
* @param fromUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
* @param length Length of the concerned Unicode sequence, counted in UChars (see codeUnits)
* @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode code point.
* @param reason Defines the reason the callback was invoked
* @param err Return value will be set to success if the callback was handled,
* otherwise this value will be set to a failure status.
* @see ucnv_setSubstChars
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
const void *context,
UConverterFromUnicodeArgs *fromUArgs,
const UChar* codeUnits,
int32_t length,
UChar32 codePoint,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This From Unicode callback will substitute the ILLEGAL SEQUENCE with an
* escaped numeric representation of the illegal code points; the context
* option selects the escape format.
*
* @param context The function currently recognizes the callback options:
* <ul>
* <li>UCNV_ESCAPE_ICU: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
* representation in the format %UXXXX (e.g. "%UFFFE%U00AC%UC8FE").
* In the event the converter doesn't support the characters {%,U}[A-F][0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that a 32-bit code point (e.g. one formed by a surrogate pair) is represented as
* %UD84D%UDC56</li>
* <li>UCNV_ESCAPE_JAVA: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
* representation in the format \\uXXXX (e.g. "\\uFFFE\\u00AC\\uC8FE").
* In the event the converter doesn't support the characters {\,u}[A-F][0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that a 32-bit code point (e.g. one formed by a surrogate pair) is represented as
* \\uD84D\\uDC56</li>
* <li>UCNV_ESCAPE_C: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
* representation in the format \\uXXXX (e.g. "\\uFFFE\\u00AC\\uC8FE").
* In the event the converter doesn't support the characters {\,u,U}[A-F][0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that a 32-bit code point (e.g. one formed by a surrogate pair) is represented as
* \\U00023456</li>
* <li>UCNV_ESCAPE_XML_DEC: Substitutes the ILLEGAL SEQUENCE with the decimal
* representation in the format \htmlonly&amp;#DDDDDDDD; (e.g. "&amp;#65534;&amp;#172;&amp;#51454;")\endhtmlonly.
* In the event the converter doesn't support the characters {&amp;,#}[0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that a 32-bit code point (e.g. one formed by a surrogate pair) is represented as
* &amp;#144470; and zero padding is ignored.</li>
* <li>UCNV_ESCAPE_XML_HEX: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
* representation in the format \htmlonly&amp;#xXXXX; (e.g. "&amp;#xFFFE;&amp;#x00AC;&amp;#xC8FE;")\endhtmlonly.
* In the event the converter doesn't support the characters {&amp;,#,x}[0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that a 32-bit code point (e.g. one formed by a surrogate pair) is represented as
* \htmlonly&amp;#x23456;\endhtmlonly</li>
* </ul>
* @param fromUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
* @param length Length of the concerned Unicode sequence, counted in UChars (see codeUnits)
* @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode code point.
* @param reason Defines the reason the callback was invoked
* @param err Return value will be set to success if the callback was handled,
* otherwise this value will be set to a failure status.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_ESCAPE (
const void *context,
UConverterFromUnicodeArgs *fromUArgs,
const UChar* codeUnits,
int32_t length,
UChar32 codePoint,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This To Unicode callback skips any ILLEGAL_SEQUENCE, or
* skips only UNASSIGNED_SEQUENCE depending on the context parameter,
* simply ignoring those characters.
*
* @param context The function currently recognizes the callback options:
* UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
* NULL: Skips any ILLEGAL_SEQUENCE
* @param toUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' bytes of the concerned codepage sequence
* @param length Size (in bytes) of the concerned codepage sequence
* @param reason Defines the reason the callback was invoked
* @param err Return value will be set to success if the callback was handled,
* otherwise this value will be set to a failure status.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP (
const void *context,
UConverterToUnicodeArgs *toUArgs,
const char* codeUnits,
int32_t length,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This To Unicode callback will substitute the ILLEGAL SEQUENCE, or
* UNASSIGNED_SEQUENCE depending on the context parameter, with the
* Unicode substitution character, U+FFFD.
*
* @param context The function currently recognizes the callback options:
* UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
* NULL: Substitutes any ILLEGAL_SEQUENCE
* @param toUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' bytes of the concerned codepage sequence
* @param length Size (in bytes) of the concerned codepage sequence
* @param reason Defines the reason the callback was invoked
* @param err Return value will be set to success if the callback was handled,
* otherwise this value will be set to a failure status.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SUBSTITUTE (
const void *context,
UConverterToUnicodeArgs *toUArgs,
const char* codeUnits,
int32_t length,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This To Unicode callback will substitute the ILLEGAL SEQUENCE with the
* hexadecimal representation of the illegal bytes
* (in the format %XNN, e.g. "%XFF%X0A%XC8%X03").
*
* @param context This function currently recognizes the callback options:
* UCNV_ESCAPE_ICU, UCNV_ESCAPE_JAVA, UCNV_ESCAPE_C, UCNV_ESCAPE_XML_DEC,
* UCNV_ESCAPE_XML_HEX and UCNV_ESCAPE_UNICODE.
* @param toUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' bytes of the concerned codepage sequence
* @param length Size (in bytes) of the concerned codepage sequence
* @param reason Defines the reason the callback was invoked
* @param err Return value will be set to success if the callback was handled,
* otherwise this value will be set to a failure status.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_ESCAPE (
const void *context,
UConverterToUnicodeArgs *toUArgs,
const char* codeUnits,
int32_t length,
UConverterCallbackReason reason,
UErrorCode * err);
#endif
#endif
/*UCNV_ERR_H*/

View File

@@ -0,0 +1,410 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2005-2013, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucsdet.h
* encoding: UTF-8
* indentation:4
*
* created on: 2005Aug04
* created by: Andy Heninger
*
* ICU Character Set Detection, API for C
*
* Draft version 18 Oct 2005
*
*/
#ifndef __UCSDET_H
#define __UCSDET_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
#include "unicode-uenum.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif // U_SHOW_CPLUSPLUS_API
/**
* \file
* \brief C API: Charset Detection API
*
* This API provides a facility for detecting the
* charset or encoding of character data in an unknown text format.
* The input data can be from an array of bytes.
* <p>
* Character set detection is at best an imprecise operation. The detection
* process will attempt to identify the charset that best matches the characteristics
* of the byte data, but the process is partly statistical in nature, and
* the results can not be guaranteed to always be correct.
* <p>
* For best accuracy in charset detection, the input data should be primarily
* in a single language, and a minimum of a few hundred bytes worth of plain text
* in the language are needed. The detection process will attempt to
* ignore html or xml style markup that could otherwise obscure the content.
* <p>
* An alternative to the ICU Charset Detector is the
* Compact Encoding Detector, https://github.com/google/compact_enc_det.
* It often gives more accurate results, especially with short input samples.
*/
/* Forward declaration only; the members are not defined in this header. */
struct UCharsetDetector;
/**
* Structure representing a charset detector.
* Instances are created with ucsdet_open() and released with ucsdet_close().
* @stable ICU 3.6
*/
typedef struct UCharsetDetector UCharsetDetector;
/* Forward declaration only; the members are not defined in this header. */
struct UCharsetMatch;
/**
* Opaque structure representing a match that was identified
* from a charset detection operation.
* Matches are returned by ucsdet_detect() and ucsdet_detectAll().
* @stable ICU 3.6
*/
typedef struct UCharsetMatch UCharsetMatch;
/**
* Open a charset detector.
* The detector should be released with ucsdet_close() when no longer needed.
*
* @param status Any error conditions occurring during the open
* operation are reported back in this variable.
* @return the newly opened charset detector.
* @see ucsdet_close
* @stable ICU 3.6
*/
U_CAPI UCharsetDetector* U_EXPORT2
ucsdet_open(UErrorCode* status);
/**
* Close a charset detector. All storage and any other resources
* owned by this charset detector will be released. Failure to
* close a charset detector when finished with it can result in
* memory leaks in the application.
*
* @param ucsd The charset detector to be closed.
* @see ucsdet_open
* @stable ICU 3.6
*/
U_CAPI void U_EXPORT2
ucsdet_close(UCharsetDetector* ucsd);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalUCharsetDetectorPointer
* "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
* For most methods see the LocalPointerBase base class.
* Only declared when the C++ API is shown (U_SHOW_CPLUSPLUS_API).
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 4.4
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
/**
* Set the input byte data whose charset is to be detected.
*
* Ownership of the input text byte array remains with the caller.
* The input string must not be altered or deleted until the charset
* detector is either closed or reset to refer to different input text.
*
* @param ucsd the charset detector to be used.
* @param textIn the input text of unknown encoding.
* @param len the length of the input text, or -1 if the text
* is NUL terminated.
* @param status any error conditions are reported back in this variable.
*
* @stable ICU 3.6
*/
U_CAPI void U_EXPORT2
ucsdet_setText(UCharsetDetector* ucsd, const char* textIn, int32_t len, UErrorCode* status);
/**
* Set the declared encoding for charset detection.
* The declared encoding of an input text is an encoding obtained
* by the user from an HTTP header, an XML declaration or a similar source,
* which can be provided as an additional hint to the charset detector.
*
* How and whether the declared encoding will be used during the
* detection process is TBD.
*
* @param ucsd the charset detector to be used.
* @param encoding an encoding for the current data obtained from
* a header or declaration or other source outside
* of the byte data itself.
* @param length the length of the encoding name, or -1 if the name string
* is NUL terminated.
* @param status any error conditions are reported back in this variable.
*
* @stable ICU 3.6
*/
U_CAPI void U_EXPORT2
ucsdet_setDeclaredEncoding(UCharsetDetector* ucsd, const char* encoding, int32_t length, UErrorCode* status);
/**
* Return the charset that best matches the supplied input data.
*
* Note, though, that because the detection
* only looks at the start of the input data,
* there is a possibility that the returned charset will fail to handle
* the full set of input data.
* <p>
* The returned UCharsetMatch object is owned by the UCharsetDetector.
* It will remain valid until the detector input is reset, or until
* the detector is closed.
* <p>
* The function will fail if
* <ul>
* <li>no charset appears to match the data.</li>
* <li>no input text has been provided.</li>
* </ul>
*
* @param ucsd the charset detector to be used.
* @param status any error conditions are reported back in this variable.
* @return a UCharsetMatch representing the best matching charset,
* or NULL if no charset matches the byte data.
*
* @stable ICU 3.6
*/
U_CAPI const UCharsetMatch* U_EXPORT2
ucsdet_detect(UCharsetDetector* ucsd, UErrorCode* status);
/**
* Find all charset matches that appear to be consistent with the input,
* returning an array of results. The results are ordered with the
* best quality match first.
*
* Because the detection only looks at a limited amount of the
* input byte data, some of the returned charsets may fail to handle
* all of the input data.
* <p>
* The returned UCharsetMatch objects are owned by the UCharsetDetector.
* They will remain valid until the detector is closed or modified.
*
* <p>
* Return an error if
* <ul>
* <li>no charsets appear to match the input data.</li>
* <li>no input text has been provided.</li>
* </ul>
*
* @param ucsd the charset detector to be used.
* @param matchesFound pointer to a variable that will be set to the
* number of charsets identified that are consistent with
* the input data. Output only.
* @param status any error conditions are reported back in this variable.
* @return A pointer to an array of pointers to UCharsetMatch objects.
* This array, and the UCharsetMatch instances to which it refers,
* are owned by the UCharsetDetector, and will remain valid until
* the detector is closed or modified.
* @stable ICU 3.6
*/
U_CAPI const UCharsetMatch** U_EXPORT2
ucsdet_detectAll(UCharsetDetector* ucsd, int32_t* matchesFound, UErrorCode* status);
/**
* Get the name of the charset represented by a UCharsetMatch.
*
* The storage for the returned name string is owned by the
* UCharsetMatch, and will remain valid while the UCharsetMatch
* is valid; the caller must not free it.
*
* The name returned is suitable for use with the ICU conversion APIs.
*
* @param ucsm The charset match object.
* @param status Any error conditions are reported back in this variable.
* @return The name of the matching charset.
*
* @stable ICU 3.6
*/
U_CAPI const char* U_EXPORT2
ucsdet_getName(const UCharsetMatch* ucsm, UErrorCode* status);
/**
* Get a confidence number for the quality of the match of the byte
* data with the charset. Confidence numbers range from zero to 100,
* with 100 representing complete confidence and zero representing
* no confidence.
*
* The confidence values are somewhat arbitrary. They define an
* ordering within the results for any single detection operation
* but are not generally comparable between the results for different input.
*
* A confidence value of ten does have a general meaning - it is used
* for charsets that can represent the input data, but for which there
* is no other indication that suggests that the charset is the correct one.
* Pure 7 bit ASCII data, for example, is compatible with a
* great many charsets, most of which will appear as possible matches
* with a confidence of 10.
*
* @param ucsm The charset match object.
* @param status Any error conditions are reported back in this variable.
* @return A confidence number for the charset match.
*
* @stable ICU 3.6
*/
U_CAPI int32_t U_EXPORT2
ucsdet_getConfidence(const UCharsetMatch* ucsm, UErrorCode* status);
/**
* Get the RFC 3066 code for the language of the input data.
*
* The Charset Detection service is intended primarily for detecting
* charsets, not language. For some, but not all, charsets, a language is
* identified as a byproduct of the detection process, and that is what
* is returned by this function.
*
* CAUTION:
* 1. Language information is not available for input data encoded in
* all charsets. In particular, no language is identified
* for UTF-8 input data.
*
* 2. Closely related languages may sometimes be confused.
*
* If more accurate language detection is required, a linguistic
* analysis package should be used.
*
* The storage for the returned name string is owned by the
* UCharsetMatch, and will remain valid while the UCharsetMatch
* is valid; the caller must not free it.
*
* @param ucsm The charset match object.
* @param status Any error conditions are reported back in this variable.
* @return The RFC 3066 code for the language of the input data, or
* an empty string if the language could not be determined.
*
* @stable ICU 3.6
*/
U_CAPI const char* U_EXPORT2
ucsdet_getLanguage(const UCharsetMatch* ucsm, UErrorCode* status);
/**
* Get the entire input text as a UChar string, placing it into
* a caller-supplied buffer. A terminating
* NUL character will be appended to the buffer if space is available.
*
* The number of UChars in the output string, not including the terminating
* NUL, is returned.
*
* If the supplied buffer is too small to hold the output,
* the contents of the buffer are undefined. The full output string length
* (in UChars) is still returned, and can be used to allocate a buffer
* of the correct size.
*
*
* @param ucsm The charset match object.
* @param buf A UChar buffer to be filled with the converted text data.
* @param cap The capacity of the buffer in UChars.
* @param status Any error conditions are reported back in this variable.
* @return The number of UChars in the output string.
*
* @stable ICU 3.6
*/
U_CAPI int32_t U_EXPORT2
ucsdet_getUChars(const UCharsetMatch* ucsm,
UChar* buf, int32_t cap, UErrorCode* status);
/**
* Get an iterator over the set of all detectable charsets -
* over the charsets that are known to the charset detection
* service.
*
* The returned UEnumeration provides access to the names of
* the charsets.
*
* <p>
* The state of the charset detector that is passed in does not
* affect the result of this function, but requiring a valid, open
* charset detector as a parameter ensures that the charset detection
* service has been safely initialized and that the required detection
* data is available.
*
* <p>
* <b>Note:</b> Multiple different charset encodings in the same family may use
* a single shared name in this implementation. For example, this method returns
* an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
* (Windows Latin 1). However, the actual detection result could be "windows-1252"
* when the input data matches Latin 1 code points with any points only available
* in "windows-1252".
*
* @param ucsd a Charset detector.
* @param status Any error conditions are reported back in this variable.
* @return an iterator providing access to the detectable charset names.
* @stable ICU 3.6
*/
U_CAPI UEnumeration* U_EXPORT2
ucsdet_getAllDetectableCharsets(const UCharsetDetector* ucsd, UErrorCode* status);
/**
* Test whether input filtering is enabled for this charset detector.
* Input filtering removes text that appears to be HTML or XML
* markup from the input before applying the code page detection
* heuristics.
*
* @param ucsd The charset detector to check.
* @return true if filtering is enabled.
* @see ucsdet_enableInputFilter
* @stable ICU 3.6
*/
U_CAPI UBool U_EXPORT2
ucsdet_isInputFilterEnabled(const UCharsetDetector* ucsd);
/**
* Enable or disable filtering of input text. If filtering is enabled,
* text within angle brackets ("<" and ">") will be removed
* before detection, which will remove most HTML or XML markup.
*
* @param ucsd the charset detector to be modified.
* @param filter <code>true</code> to enable input text filtering.
* @return The previous setting.
* @see ucsdet_isInputFilterEnabled
*
* @stable ICU 3.6
*/
U_CAPI UBool U_EXPORT2
ucsdet_enableInputFilter(UCharsetDetector* ucsd, UBool filter);
#ifndef U_HIDE_INTERNAL_API
/**
* Get an iterator over the set of detectable charsets -
* over the charsets that are enabled by the specified charset detector.
*
* The returned UEnumeration provides access to the names of
* the charsets.
*
* @param ucsd a Charset detector.
* @param status Any error conditions are reported back in this variable.
* @return an iterator providing access to the detectable charset names by
* the specified charset detector.
* @internal
*/
U_CAPI UEnumeration* U_EXPORT2
ucsdet_getDetectableCharsets(const UCharsetDetector* ucsd, UErrorCode* status);
/**
* Enable or disable individual charset encoding.
* A name of charset encoding must be included in the names returned by
* {@link #ucsdet_getAllDetectableCharsets()}.
*
* @param ucsd a Charset detector.
* @param encoding the name of the charset encoding.
* @param enabled <code>true</code> to enable, or <code>false</code> to disable the
* charset encoding.
* @param status receives the return status. When the name of charset encoding
* is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
* @internal
*/
U_CAPI void U_EXPORT2
ucsdet_setDetectableCharset(UCharsetDetector* ucsd, const char* encoding, UBool enabled, UErrorCode* status);
#endif /* U_HIDE_INTERNAL_API */
#endif
#endif /* __UCSDET_H */

View File

@@ -0,0 +1,209 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2002-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uenum.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:2
*
* created on: 2002jul08
* created by: Vladimir Weinstein
*/
#ifndef __UENUM_H
#define __UENUM_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
U_NAMESPACE_BEGIN
class StringEnumeration;
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
/**
* \file
* \brief C API: String Enumeration
*/
/**
* An enumeration object.
* For usage in C programs.
* @stable ICU 2.2
*/
struct UEnumeration;
/** structure representing an enumeration object instance @stable ICU 2.2 */
typedef struct UEnumeration UEnumeration;
/**
* Disposes of resources in use by the iterator. If en is NULL,
* does nothing. After this call, any char* or UChar* pointer
* returned by uenum_unext() or uenum_next() is invalid.
* @param en UEnumeration structure pointer
* @stable ICU 2.2
*/
U_CAPI void U_EXPORT2
uenum_close(UEnumeration* en);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalUEnumerationPointer
* "Smart pointer" class, closes a UEnumeration via uenum_close().
* For most methods see the LocalPointerBase base class.
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 4.4
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalUEnumerationPointer, UEnumeration, uenum_close);
U_NAMESPACE_END
#endif
/**
* Returns the number of elements that the iterator traverses. If
* the iterator is out-of-sync with its service, status is set to
* U_ENUM_OUT_OF_SYNC_ERROR.
* This is a convenience function. It can end up being very
* expensive as all the items might have to be pre-fetched (depending
* on the type of data being traversed). Use with caution and only
* when necessary.
* @param en UEnumeration structure pointer
* @param status error code, can be U_ENUM_OUT_OF_SYNC_ERROR if the
* iterator is out of sync.
* @return number of elements in the iterator
* @stable ICU 2.2
*/
U_CAPI int32_t U_EXPORT2
uenum_count(UEnumeration* en, UErrorCode* status);
/**
* Returns the next element in the iterator's list. If there are
* no more elements, returns NULL. If the iterator is out-of-sync
* with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and
* NULL is returned. If the native service string is a char* string,
* it is converted to UChar* with the invariant converter.
* The result is terminated by (UChar)0.
* @param en the iterator object
* @param resultLength pointer to receive the length of the result
* (not including the terminating \\0).
* If the pointer is NULL it is ignored.
* @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
* the iterator is out of sync with its service.
* @return a pointer to the string. The string will be
* zero-terminated. The return pointer is owned by this iterator
* and must not be deleted by the caller. The pointer is valid
* until the next call to any uenum_... method, including
* uenum_next() or uenum_unext(). When all strings have been
* traversed, returns NULL.
* @stable ICU 2.2
*/
U_CAPI const UChar* U_EXPORT2
uenum_unext(UEnumeration* en,
int32_t* resultLength,
UErrorCode* status);
/**
* Returns the next element in the iterator's list. If there are
* no more elements, returns NULL. If the iterator is out-of-sync
* with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR and
* NULL is returned. If the native service string is a UChar*
* string, it is converted to char* with the invariant converter.
* The result is terminated by (char)0. If the conversion fails
* (because a character cannot be converted) then status is set to
* U_INVARIANT_CONVERSION_ERROR and the return value is undefined
* (but non-NULL).
* @param en the iterator object
* @param resultLength pointer to receive the length of the result
* (not including the terminating \\0).
* If the pointer is NULL it is ignored.
* @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
* the iterator is out of sync with its service. Set to
* U_INVARIANT_CONVERSION_ERROR if the underlying native string is
* UChar* and conversion to char* with the invariant converter
* fails. This error pertains only to the current string, so iteration
* might be able to continue successfully.
* @return a pointer to the string. The string will be
* zero-terminated. The return pointer is owned by this iterator
* and must not be deleted by the caller. The pointer is valid
* until the next call to any uenum_... method, including
* uenum_next() or uenum_unext(). When all strings have been
* traversed, returns NULL.
* @stable ICU 2.2
*/
U_CAPI const char* U_EXPORT2
uenum_next(UEnumeration* en,
int32_t* resultLength,
UErrorCode* status);
/**
* Resets the iterator to the current list of service IDs. This
* re-establishes sync with the service and rewinds the iterator
* to start at the first element.
* @param en the iterator object
* @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
* the iterator is out of sync with its service.
* @stable ICU 2.2
*/
U_CAPI void U_EXPORT2
uenum_reset(UEnumeration* en, UErrorCode* status);
#if U_SHOW_CPLUSPLUS_API
/**
* Given a StringEnumeration, wrap it in a UEnumeration. The
* StringEnumeration is adopted; after this call, the caller must not
* delete it (regardless of error status).
* @param adopted the C++ StringEnumeration to be wrapped in a UEnumeration.
* @param ec the error code.
* @return a UEnumeration wrapping the adopted StringEnumeration.
* @stable ICU 4.2
*/
U_CAPI UEnumeration* U_EXPORT2
uenum_openFromStringEnumeration(icu::StringEnumeration* adopted, UErrorCode* ec);
#endif
/**
* Given an array of const UChar* strings, return a UEnumeration. String pointers from 0..count-1 must not be null.
* Do not free or modify either the string array or the characters it points to until this object has been destroyed with uenum_close.
* \snippet test/cintltst/uenumtst.c uenum_openUCharStringsEnumeration
* @param strings array of const UChar* strings (each null terminated). All storage is owned by the caller.
* @param count length of the array
* @param ec error code
* @return the new UEnumeration object. Caller is responsible for calling uenum_close to free memory.
* @see uenum_close
* @stable ICU 50
*/
U_CAPI UEnumeration* U_EXPORT2
uenum_openUCharStringsEnumeration(const UChar* const strings[], int32_t count,
UErrorCode* ec);
/**
* Given an array of const char* strings (invariant chars only), return a UEnumeration. String pointers from 0..count-1 must not be null.
* Do not free or modify either the string array or the characters it points to until this object has been destroyed with uenum_close.
* \snippet test/cintltst/uenumtst.c uenum_openCharStringsEnumeration
* @param strings array of char* strings (each null terminated). All storage is owned by the caller.
* @param count length of the array
* @param ec error code
* @return the new UEnumeration object. Caller is responsible for calling uenum_close to free memory
* @see uenum_close
* @stable ICU 50
*/
U_CAPI UEnumeration* U_EXPORT2
uenum_openCharStringsEnumeration(const char* const strings[], int32_t count,
UErrorCode* ec);
#endif

View File

@@ -29,10 +29,6 @@
#include <wtf/Vector.h>
#include <wtf/text/WTFString.h>
#ifndef PAL_EXPORT
#define PAL_EXPORT
#endif
namespace PAL {
struct CryptoDigestContext;
@@ -48,12 +44,12 @@ public:
SHA_384,
SHA_512,
};
PAL_EXPORT static std::unique_ptr<CryptoDigest> create(Algorithm);
PAL_EXPORT ~CryptoDigest();
static std::unique_ptr<CryptoDigest> create(Algorithm);
~CryptoDigest();
PAL_EXPORT void addBytes(const void* input, size_t length);
PAL_EXPORT Vector<uint8_t> computeHash();
PAL_EXPORT String toHexString();
void addBytes(const void* input, size_t length);
Vector<uint8_t> computeHash();
String toHexString();
private:
CryptoDigest();

View File

@@ -250,157 +250,50 @@ comptime {
/// https://encoding.spec.whatwg.org/encodings.json
pub const EncodingLabel = enum {
@"UTF-8",
IBM866,
@"ISO-8859-2",
@"ISO-8859-3",
@"ISO-8859-4",
@"ISO-8859-5",
@"ISO-8859-6",
@"ISO-8859-7",
@"ISO-8859-8",
@"ISO-8859-8-I",
@"ISO-8859-10",
@"ISO-8859-13",
@"ISO-8859-14",
@"ISO-8859-15",
@"ISO-8859-16",
@"KOI8-R",
@"KOI8-U",
macintosh,
@"windows-874",
@"windows-1250",
@"windows-1251",
/// Also known as
/// - ASCII
/// - latin1
@"utf-8",
@"windows-1252",
@"windows-1253",
@"windows-1254",
@"windows-1255",
@"windows-1256",
@"windows-1257",
@"windows-1258",
@"x-mac-cyrillic",
Big5,
@"EUC-JP",
@"ISO-2022-JP",
Shift_JIS,
@"EUC-KR",
@"UTF-16BE",
@"UTF-16LE",
@"x-user-defined",
@"utf-16be",
@"utf-16le",
pub const Map = std.enums.EnumMap(EncodingLabel, string);
pub const label: Map = brk: {
var map = Map.initFull("");
map.put(EncodingLabel.@"UTF-8", "utf-8");
map.put(EncodingLabel.@"UTF-16LE", "utf-16le");
map.put(EncodingLabel.@"windows-1252", "windows-1252");
break :brk map;
};
const utf16_names = [_]string{
"ucs-2",
"utf-16",
"unicode",
"utf-16le",
"csunicode",
"unicodefeff",
"iso-10646-ucs-2",
};
const utf8_names = [_]string{
"utf8",
"utf-8",
"unicode11utf8",
"unicode20utf8",
"x-unicode20utf8",
"unicode-1-1-utf-8",
};
const latin1_names = [_]string{
"l1",
"ascii",
"cp819",
"cp1252",
"ibm819",
"latin1",
"iso88591",
"us-ascii",
"x-cp1252",
"iso8859-1",
"iso_8859-1",
"iso-8859-1",
"iso-ir-100",
"csisolatin1",
"windows-1252",
"ansi_x3.4-1968",
"iso_8859-1:1987",
};
pub const latin1 = EncodingLabel.@"windows-1252";
/// Labels for the encodings that TextDecoder decodes natively, keyed by the
/// label text and looked up ASCII case-insensitively by `which`.
///
/// Any label that is not listed here makes `which` return null, so every
/// alias of a natively-handled encoding has to appear in this table; an alias
/// that is missing would be resolved differently from its siblings.
const map = bun.ComptimeStringMap(EncodingLabel, .{
    // windows-1252 family (`latin1` is the alias declared just above).
    .{ "ansi_x3.4-1968", latin1 },
    .{ "ascii", latin1 },
    .{ "cp1252", latin1 },
    .{ "cp819", latin1 },
    .{ "csisolatin1", latin1 },
    .{ "ibm819", latin1 },
    .{ "iso_8859-1:1987", latin1 },
    .{ "iso_8859-1", latin1 },
    .{ "iso-8859-1", latin1 },
    .{ "iso-ir-100", latin1 },
    .{ "iso8859-1", latin1 },
    .{ "iso88591", latin1 },
    .{ "l1", latin1 },
    .{ "latin1", latin1 },
    .{ "us-ascii", latin1 },
    // The canonical name itself and its `x-` alias: without these two,
    // "cp1252" and "windows-1252" would not resolve to the same encoding.
    .{ "windows-1252", latin1 },
    .{ "x-cp1252", latin1 },

    // UTF-16LE ("utf-16" without a suffix means little-endian).
    .{ "csunicode", EncodingLabel.@"utf-16le" },
    .{ "iso-10646-ucs-2", EncodingLabel.@"utf-16le" },
    .{ "ucs-2", EncodingLabel.@"utf-16le" },
    .{ "unicode", EncodingLabel.@"utf-16le" },
    .{ "unicodefeff", EncodingLabel.@"utf-16le" },
    .{ "utf-16", EncodingLabel.@"utf-16le" },
    .{ "utf-16le", EncodingLabel.@"utf-16le" },

    // UTF-16BE.
    .{ "utf-16be", EncodingLabel.@"utf-16be" },

    // UTF-8.
    .{ "unicode-1-1-utf-8", EncodingLabel.@"utf-8" },
    .{ "unicode11utf8", EncodingLabel.@"utf-8" },
    .{ "unicode20utf8", EncodingLabel.@"utf-8" },
    .{ "utf-8", EncodingLabel.@"utf-8" },
    .{ "utf8", EncodingLabel.@"utf-8" },
    .{ "x-unicode20utf8", EncodingLabel.@"utf-8" },
});
pub fn which(input_: string) ?EncodingLabel {
const input = strings.trim(input_, " \t\r\n");
const ExactMatcher = strings.ExactSizeMatcher;
const Eight = ExactMatcher(8);
const Sixteen = ExactMatcher(16);
return switch (input.len) {
1, 0 => null,
2...8 => switch (Eight.matchLower(input)) {
Eight.case("l1"),
Eight.case("ascii"),
Eight.case("cp819"),
Eight.case("cp1252"),
Eight.case("ibm819"),
Eight.case("latin1"),
Eight.case("iso88591"),
Eight.case("us-ascii"),
Eight.case("x-cp1252"),
=> EncodingLabel.latin1,
Eight.case("ucs-2"),
Eight.case("utf-16"),
Eight.case("unicode"),
Eight.case("utf-16le"),
=> EncodingLabel.@"UTF-16LE",
Eight.case("utf-16be"),
=> EncodingLabel.@"UTF-16BE",
Eight.case("utf8"), Eight.case("utf-8") => EncodingLabel.@"UTF-8",
else => null,
},
9...16 => switch (Sixteen.matchLower(input)) {
Sixteen.case("iso8859-1"),
Sixteen.case("iso_8859-1"),
Sixteen.case("iso-8859-1"),
Sixteen.case("iso-ir-100"),
Sixteen.case("csisolatin1"),
Sixteen.case("windows-1252"),
Sixteen.case("ansi_x3.4-1968"),
Sixteen.case("iso_8859-1:1987"),
=> EncodingLabel.latin1,
Sixteen.case("unicode11utf8"),
Sixteen.case("unicode20utf8"),
Sixteen.case("x-unicode20utf8"),
=> EncodingLabel.@"UTF-8",
Sixteen.case("csunicode"),
Sixteen.case("unicodefeff"),
Sixteen.case("iso-10646-ucs-2"),
=> EncodingLabel.@"UTF-16LE",
else => null,
},
else => if (strings.eqlCaseInsensitiveASCII(input, "unicode-1-1-utf-8", true))
EncodingLabel.@"UTF-8"
else
null,
};
return map.getASCIIICaseInsensitive(input);
}
};
@@ -618,11 +511,47 @@ pub const TextDecoder = struct {
ignore_bom: bool = false,
fatal: bool = false,
encoding: EncodingLabel = EncodingLabel.@"UTF-8",
encoding: Encoding = .{ .@"utf-8" = {} },
/// The encoding a TextDecoder decodes with.
///
/// The four void-payload variants are the encodings `EncodingLabel.which`
/// recognises; `other` carries a WebKit text codec for every other label and
/// is released by `deinit`.
const Encoding = union(Tag) {
    @"utf-8": void,
    @"windows-1252": void,
    @"utf-16be": void,
    @"utf-16le": void,
    other: *JSC.WebKitTextCodec,

    pub const Tag = enum {
        @"utf-8",
        @"windows-1252",
        @"utf-16be",
        @"utf-16le",
        other,
    };

    /// Resolves an encoding label.
    ///
    /// Labels known to `EncodingLabel.which` map to the matching void variant.
    /// Anything else is offered to `JSC.WebKitTextCodec.init`; returns null
    /// when that also yields no codec.
    pub fn from(input: []const u8) ?Encoding {
        if (EncodingLabel.which(input)) |label| {
            return switch (label) {
                .@"utf-8" => .{ .@"utf-8" = {} },
                .@"utf-16le" => .{ .@"utf-16le" = {} },
                .@"utf-16be" => .{ .@"utf-16be" = {} },
                .@"windows-1252" => .{ .@"windows-1252" = {} },
            };
        }

        // `which` strips surrounding ASCII whitespace before matching, so do
        // the same here; otherwise a padded label such as " shift_jis " would
        // be treated differently from a padded " utf-8 ".
        // NOTE(review): harmless if WebKitTextCodec.init already trims.
        const trimmed = strings.trim(input, " \t\r\n");
        return .{ .other = JSC.WebKitTextCodec.init(trimmed) orelse return null };
    }

    /// Releases the codec held by `other`; a no-op for every other variant.
    pub fn deinit(this: *@This()) void {
        if (this.* == .other) {
            this.other.deinit();
        }
    }
};
pub usingnamespace bun.New(TextDecoder);
/// Finalizer for a TextDecoder: first lets the encoding release whatever it
/// holds (`Encoding.deinit` frees the codec of the `other` variant), then
/// calls `destroy()` on the decoder.
pub fn finalize(this: *TextDecoder) void {
// Keep this order: `encoding` is a field of `this`, so it has to be torn
// down before `destroy()` (which presumably frees the decoder) runs.
this.encoding.deinit();
this.destroy();
}
@@ -646,7 +575,15 @@ pub const TextDecoder = struct {
this: *TextDecoder,
globalThis: *JSC.JSGlobalObject,
) JSC.JSValue {
return ZigString.init(EncodingLabel.label.get(this.encoding).?).toJS(globalThis);
switch (this.encoding) {
.other => |codec| {
var name = codec.name();
return name.transferToJS(globalThis);
},
else => {
return ZigString.init(@tagName(this.encoding)).toJS(globalThis);
},
}
}
const Vector16 = std.meta.Vector(16, u16);
const max_16_ascii: Vector16 = @splat(@as(u16, 127));
@@ -793,8 +730,8 @@ pub const TextDecoder = struct {
}
fn decodeSlice(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, buffer_slice: []const u8, comptime flush: bool) bun.JSError!JSValue {
switch (this.encoding) {
EncodingLabel.latin1 => {
switch (@as(Encoding.Tag, this.encoding)) {
.@"windows-1252" => {
if (strings.isAllASCII(buffer_slice)) {
return ZigString.init(buffer_slice).toJS(globalThis);
}
@@ -809,7 +746,7 @@ pub const TextDecoder = struct {
const out = strings.copyLatin1IntoUTF16([]u16, bytes, []const u8, buffer_slice);
return ZigString.toExternalU16(bytes.ptr, out.written, globalThis);
},
EncodingLabel.@"UTF-8" => {
.@"utf-8" => {
const input, const deinit = input: {
const maybe_without_bom = if (!this.ignore_bom and strings.hasPrefixComptime(buffer_slice, "\xef\xbb\xbf"))
buffer_slice[3..]
@@ -860,25 +797,35 @@ pub const TextDecoder = struct {
return ZigString.init(input).toJS(globalThis);
},
inline .@"UTF-16LE", .@"UTF-16BE" => |utf16_encoding| {
const bom = if (comptime utf16_encoding == .@"UTF-16LE") "\xff\xfe" else "\xfe\xff";
inline .@"utf-16le", .@"utf-16be" => |encoding| {
const bom = comptime if (encoding == .@"utf-16le") "\xff\xfe" else "\xfe\xff";
const input = if (!this.ignore_bom and strings.hasPrefixComptime(buffer_slice, bom))
buffer_slice[2..]
else
buffer_slice;
var decoded, const saw_error = try this.decodeUTF16(input, utf16_encoding == .@"UTF-16BE", flush);
var decoded, const saw_error = try this.decodeUTF16(input, encoding == .@"utf-16be", flush);
if (saw_error and this.fatal) {
decoded.deinit(bun.default_allocator);
return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("The encoded data was not valid {s} data", .{@tagName(utf16_encoding)}).throw();
return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("The encoded data was not valid {s} data", .{@tagName(encoding)}).throw();
}
var output = bun.String.fromUTF16(decoded.items);
return output.toJS(globalThis);
},
else => {
return globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{});
.other => {
const codec = this.encoding.other;
var did_stop_on_error = this.fatal;
var str = codec.decode(buffer_slice, flush, &did_stop_on_error);
defer str.deref();
if (did_stop_on_error and this.fatal) {
var name = codec.name();
defer name.deref();
return globalThis.ERR_ENCODING_INVALID_ENCODED_DATA("The encoded data was not valid {} data", .{name}).throw();
}
return str.toJS(globalThis);
},
}
}
@@ -895,14 +842,14 @@ pub const TextDecoder = struct {
var str = arguments[0].toSlice(globalThis, bun.default_allocator);
defer if (str.isAllocated()) str.deinit();
if (EncodingLabel.which(str.slice())) |label| {
if (Encoding.from(str.slice())) |label| {
decoder.encoding = label;
} else {
return globalThis.throwInvalidArguments("Unsupported encoding label \"{s}\"", .{str.slice()});
}
} else if (arguments[0].isUndefined()) {
// default to utf-8
decoder.encoding = EncodingLabel.@"UTF-8";
decoder.encoding = .@"utf-8";
} else {
return globalThis.throwInvalidArguments("TextDecoder(encoding) label is invalid", .{});
}

View File

@@ -204,17 +204,18 @@ pub fn ComptimeStringMapWithKeyType(comptime KeyType: type, comptime V: type, co
return null;
comptime var i: usize = precomputed.min_len;
var shared_lowercased_buf: [precomputed.max_len]u8 = undefined;
inline while (i <= precomputed.max_len) : (i += 1) {
if (length == i) {
const lowerbuf: [i]u8 = brk: {
var buf: [i]u8 = undefined;
for (input, &buf) |c, *j| {
const lowerbuf: *const [i]u8 = brk: {
for (input, shared_lowercased_buf[0..i]) |c, *j| {
j.* = std.ascii.toLower(c);
}
break :brk buf;
break :brk shared_lowercased_buf[0..i];
};
return getWithLengthAndEql(&lowerbuf, i, eql);
return getWithLengthAndEql(lowerbuf, i, eql);
}
}

View File

@@ -1228,6 +1228,10 @@ pub const String = extern struct {
}
const u16_bytes = this.utf16();
if (comptime values.len == 0) {
@compileError("values.len must be > 0");
}
const buffer: [values[0].len]u8 = brk: {
var bytes: [values[0].len]u8 = undefined;
for (&bytes, u16_bytes) |*byte, uchar| {

View File

@@ -0,0 +1,17 @@
import { test, expect } from "bun:test";
// Decoding through a non-native codec: Shift_JIS bytes round-trip to the
// expected text, the decoder reports its encoding name, the default decoder
// does not produce the same text, and fatal mode rejects malformed input.
test("shift_jis", () => {
  // "日本語" encoded as Shift_JIS.
  const encoded = Uint8Array.of(147, 250, 150, 123, 140, 234);

  const sjis = new TextDecoder("shift_jis");
  const text = sjis.decode(encoded);
  expect(text).toEqual("日本語");

  // NOTE(review): the WHATWG Encoding spec defines the `encoding` getter as
  // the ASCII-lowercased name ("shift_jis"); this pins the codec's own
  // casing instead. Confirm that is intended.
  expect(sjis.encoding).toBe("Shift_JIS");

  // The default decoder must not produce the same text from these bytes.
  const viaDefault = new TextDecoder().decode(encoded);
  expect(viaDefault).not.toBe("日本語");

  // A trailing 0xFF byte is expected to make the input invalid, which a
  // fatal decoder must reject. The decoder is built inside the callback so
  // a throwing constructor is covered by the same expectation.
  const malformed = Uint8Array.of(...encoded, 255);
  const decodeStrictly = () => new TextDecoder("shift_jis", { fatal: true }).decode(malformed);
  expect(decodeStrictly).toThrow();
});
// A label that no codec recognises must make the constructor throw.
test("unknown encoding throws", () => {
  const construct = () => new TextDecoder("pooop");
  expect(construct).toThrow();
});