Compare commits

...

5 Commits

Author SHA1 Message Date
Dylan Conway
1f238748f3 Merge branch 'main' into dylan/faster-text-encoder-stream-part-2 2024-08-12 12:59:59 -07:00
Dylan Conway
08f44ccc6e rare data buffer 2024-08-12 12:06:49 -07:00
Dylan Conway
0e99822556 valid simdutf functions, uint8array 2024-08-12 11:28:53 -07:00
Dylan Conway
6df1f03a6f plus 1 2024-08-10 17:57:44 -07:00
Dylan Conway
a2adeb1888 exclude pending surrogate from simdutf conversion 2024-08-10 17:35:05 -07:00
5 changed files with 194 additions and 51 deletions

View File

@@ -2,12 +2,19 @@ import { bench, run } from "./runner.mjs";
const latin1 = `hello hello hello!!!! `.repeat(10240);
function create(src) {
function split(str, chunkSize) {
const astralCharacter = "\u{1F499}"; // BLUE HEART
const leading = astralCharacter[0];
const trailing = astralCharacter[1];
async function create(src, testPendingSurrogate) {
function split(str, chunkSize, pendingLeadSurrogate) {
let chunkedHTML = [];
let html = str;
while (html.length > 0) {
chunkedHTML.push(html.slice(0, chunkSize));
pendingLeadSurrogate
? chunkedHTML.push(html.slice(0, chunkSize) + leading)
: chunkedHTML.push(html.slice(0, chunkSize));
html = html.slice(chunkSize);
}
return chunkedHTML;
@@ -29,21 +36,30 @@ function create(src) {
// if (new TextDecoder().decode(await runBench(oneKB)) !== src) {
// throw new Error("Benchmark failed");
// }
const pendingSurrogateTests = [false];
if (testPendingSurrogate) {
pendingSurrogateTests.push(true);
}
const sizes = [1024, 16 * 1024, 64 * 1024, 256 * 1024];
for (const chunkSize of sizes) {
const text = split(src, chunkSize);
bench(
`${Math.round(src.length / 1024)} KB of text in ${Math.round(chunkSize / 1024) > 0 ? Math.round(chunkSize / 1024) : (chunkSize / 1024).toFixed(2)} KB chunks`,
async () => {
await runBench(text);
},
);
for (const pendingLeadSurrogate of pendingSurrogateTests) {
const text = split(src, chunkSize, testPendingSurrogate && pendingLeadSurrogate);
bench(
`${Math.round(src.length / 1024)} KB, ${Math.round(chunkSize / 1024) > 0 ? Math.round(chunkSize / 1024) : (chunkSize / 1024).toFixed(2)} KB chunks, ${pendingLeadSurrogate ? "pending surrogate" : ""}`,
async () => {
await runBench(text);
},
);
}
}
}
create(latin1);
create(latin1, false);
create(
// bun's old readme was extremely long
await fetch("https://web.archive.org/web/20230119110956/https://github.com/oven-sh/bun").then(res => res.text()),
true,
);
await run();

View File

@@ -56,6 +56,7 @@ pub extern fn simdutf__convert_utf8_to_utf16le(buf: [*]const u8, len: usize, utf
pub extern fn simdutf__convert_utf8_to_utf16be(buf: [*]const u8, len: usize, utf16_output: [*]u16) usize;
pub extern fn simdutf__convert_utf8_to_utf16le_with_errors(buf: [*]const u8, len: usize, utf16_output: [*]u16) SIMDUTFResult;
pub extern fn simdutf__convert_utf8_to_utf16be_with_errors(buf: [*]const u8, len: usize, utf16_output: [*]u16) SIMDUTFResult;
pub extern fn simdutf__convert_valid_utf8_to_utf16le(buf: [*]const u8, len: usize, utf16_buffer: [*]u16) usize;
pub extern fn simdutf__convert_valid_utf8_to_utf16be(buf: [*]const u8, len: usize, utf16_buffer: [*]u16) usize;
pub extern fn simdutf__convert_utf8_to_utf32(buf: [*]const u8, len: usize, utf32_output: [*]u32) usize;
pub extern fn simdutf__convert_utf8_to_utf32_with_errors(buf: [*]const u8, len: usize, utf32_output: [*]u32) SIMDUTFResult;
@@ -167,13 +168,60 @@ pub const convert = struct {
};
pub fn le(input: []const u8, output: []u32) usize {
return simdutf__convert_valid_utf8_to_utf32(input.ptr, input.len, output.ptr);
return simdutf__convert_utf8_to_utf32(input.ptr, input.len, output.ptr);
}
pub fn be(input: []const u8, output: []u32) usize {
return simdutf__convert_utf8_to_utf32(input.ptr, input.len, output.ptr);
}
};
};
};
/// Conversions that dispatch to the `simdutf__convert_valid_*` externs.
///
/// Every wrapper forwards `input.ptr`, `input.len` and `output.ptr` and
/// returns the extern's `usize` result unchanged. The length of `output` is
/// never passed along, so the caller is responsible for sizing it.
///
/// NOTE(review): going by the extern names, these variants skip validation
/// and expect already well-formed input — confirm against the simdutf
/// documentation before handing them untrusted data.
pub const valid = struct {
    /// Source encoding: UTF-16 (`[]const u16`).
    pub const utf16 = struct {
        pub const to = struct {
            pub const utf8 = struct {
                /// Little-endian UTF-16 input, UTF-8 output.
                pub fn le(input: []const u16, output: []u8) usize {
                    return simdutf__convert_valid_utf16le_to_utf8(input.ptr, input.len, output.ptr);
                }

                /// Big-endian UTF-16 input, UTF-8 output.
                pub fn be(input: []const u16, output: []u8) usize {
                    return simdutf__convert_valid_utf16be_to_utf8(input.ptr, input.len, output.ptr);
                }
            };
        };
    };

    /// Source encoding: UTF-8 (`[]const u8`).
    pub const utf8 = struct {
        pub const to = struct {
            pub const utf16 = struct {
                /// UTF-8 input, little-endian UTF-16 output.
                pub fn le(input: []const u8, output: []u16) usize {
                    return simdutf__convert_valid_utf8_to_utf16le(input.ptr, input.len, output.ptr);
                }

                /// UTF-8 input, big-endian UTF-16 output.
                pub fn be(input: []const u8, output: []u16) usize {
                    return simdutf__convert_valid_utf8_to_utf16be(input.ptr, input.len, output.ptr);
                }
            };

            /// UTF-8 input, UTF-32 output.
            pub fn utf32(input: []const u8, output: []u32) usize {
                return simdutf__convert_valid_utf8_to_utf32(input.ptr, input.len, output.ptr);
            }
        };
    };

    /// Source encoding: UTF-32 (`[]const u32`).
    pub const utf32 = struct {
        pub const to = struct {
            /// UTF-32 input, UTF-8 output.
            pub fn utf8(input: []const u32, output: []u8) usize {
                return simdutf__convert_valid_utf32_to_utf8(input.ptr, input.len, output.ptr);
            }

            pub const utf16 = struct {
                /// UTF-32 input, little-endian UTF-16 output.
                pub fn le(input: []const u32, output: []u16) usize {
                    return simdutf__convert_valid_utf32_to_utf16le(input.ptr, input.len, output.ptr);
                }

                /// UTF-32 input, big-endian UTF-16 output.
                pub fn be(input: []const u32, output: []u16) usize {
                    return simdutf__convert_valid_utf32_to_utf16be(input.ptr, input.len, output.ptr);
                }
            };
        };
    };
};
pub const utf16 = struct {
@@ -189,10 +237,10 @@ pub const convert = struct {
};
pub fn le(input: []const u16, output: []u8) usize {
return simdutf__convert_valid_utf16le_to_utf8(input.ptr, input.len, output.ptr);
return simdutf__convert_utf16le_to_utf8(input.ptr, input.len, output.ptr);
}
pub fn be(input: []const u16, output: []u8) usize {
return simdutf__convert_valid_utf16be_to_utf8(input.ptr, input.len, output.ptr);
return simdutf__convert_utf16be_to_utf8(input.ptr, input.len, output.ptr);
}
};
@@ -207,10 +255,10 @@ pub const convert = struct {
};
pub fn le(input: []const u16, output: []u32) usize {
return simdutf__convert_valid_utf16le_to_utf32(input.ptr, input.len, output.ptr);
return simdutf__convert_utf16le_to_utf32(input.ptr, input.len, output.ptr);
}
pub fn be(input: []const u16, output: []u32) usize {
return simdutf__convert_valid_utf16be_to_utf32(input.ptr, input.len, output.ptr);
return simdutf__convert_utf16be_to_utf32(input.ptr, input.len, output.ptr);
}
};
};
@@ -229,10 +277,10 @@ pub const convert = struct {
};
pub fn le(input: []const u32, output: []u8) usize {
return simdutf__convert_valid_utf32_to_utf8(input.ptr, input.len, output.ptr);
return simdutf__convert_utf32_to_utf8(input.ptr, input.len, output.ptr);
}
pub fn be(input: []const u32, output: []u8) usize {
return simdutf__convert_valid_utf32_to_utf8(input.ptr, input.len, output.ptr);
return simdutf__convert_utf32_to_utf8(input.ptr, input.len, output.ptr);
}
};
@@ -247,10 +295,10 @@ pub const convert = struct {
};
pub fn le(input: []const u32, output: []u16) usize {
return simdutf__convert_valid_utf32_to_utf16le(input.ptr, input.len, output.ptr);
return simdutf__convert_utf32_to_utf16le(input.ptr, input.len, output.ptr);
}
pub fn be(input: []const u32, output: []u16) usize {
return simdutf__convert_valid_utf32_to_utf16be(input.ptr, input.len, output.ptr);
return simdutf__convert_utf32_to_utf16be(input.ptr, input.len, output.ptr);
}
};
};

View File

@@ -49,6 +49,10 @@ listening_sockets_for_watch_mode_lock: bun.Lock = .{},
temp_pipe_read_buffer: ?*PipeReadBuffer = null,
/// Lazily allocated scratch buffer handed out by `textEncoderStreamBuffer`.
/// Must default to `null` (not `undefined`): the accessor tests this field
/// with `orelse`, and reading an undefined optional is illegal behavior that
/// can skip the allocation and return a garbage pointer.
text_encoder_stream_buffer: ?*TextEncoderStreamBuffer = null,
pub const TextEncoderStreamBuffer = [256 * 1024]u8;
const PipeReadBuffer = [256 * 1024]u8;
pub fn pipeReadBuffer(this: *RareData) *PipeReadBuffer {
@@ -58,6 +62,13 @@ pub fn pipeReadBuffer(this: *RareData) *PipeReadBuffer {
};
}
/// Returns the shared `TextEncoderStreamBuffer`, allocating it with
/// `default_allocator` on first use and caching the pointer in
/// `this.text_encoder_stream_buffer` for later calls.
///
/// Allocation failure is routed to `bun.outOfMemory()`.
/// The return type names `TextEncoderStreamBuffer` rather than
/// `PipeReadBuffer`; the two aliases currently denote the same array type,
/// so existing callers are unaffected.
pub fn textEncoderStreamBuffer(this: *RareData) *TextEncoderStreamBuffer {
    if (this.text_encoder_stream_buffer) |existing| return existing;

    const created = default_allocator.create(TextEncoderStreamBuffer) catch bun.outOfMemory();
    this.text_encoder_stream_buffer = created;
    return created;
}
pub fn addListeningSocketForWatchMode(this: *RareData, socket: bun.FileDescriptor) void {
this.listening_sockets_for_watch_mode_lock.lock();
defer this.listening_sockets_for_watch_mode_lock.unlock();

View File

@@ -516,6 +516,7 @@ pub const TextEncoderStreamEncoder = struct {
len: u3,
pub const replacement: @This() = .{ .bytes = .{ 0xef, 0xbf, 0xbd, 0 }, .len = 3 };
pub const empty: @This() = .{ .bytes = .{ 0, 0, 0, 0 }, .len = 0 };
pub fn fromSequence(seq: [4]u8, length: u3) @This() {
return .{ .bytes = seq, .len = length };
@@ -524,7 +525,7 @@ pub const TextEncoderStreamEncoder = struct {
var remain = input;
const prepend: ?Prepend = prepend: {
const prepend: Prepend = prepend: {
if (this.pending_lead_surrogate) |lead| {
this.pending_lead_surrogate = null;
const maybe_trail = remain[0];
@@ -538,7 +539,7 @@ pub const TextEncoderStreamEncoder = struct {
remain = remain[1..];
if (remain.len == 0) {
return ArrayBuffer.createBuffer(
return ArrayBuffer.createUint8Array(
globalObject,
sequence[0..converted.utf8Width()],
);
@@ -549,46 +550,113 @@ pub const TextEncoderStreamEncoder = struct {
break :prepend Prepend.replacement;
}
break :prepend null;
break :prepend Prepend.empty;
};
const length = bun.simdutf.length.utf8.from.utf16.le(remain);
bun.debugAssert(remain.len != 0);
var buf = std.ArrayList(u8).initCapacity(
bun.default_allocator,
length + @as(usize, if (prepend) |pre| pre.len else 0),
) catch {
globalObject.throwOutOfMemory();
return .zero;
};
const last_token = remain[remain.len - 1];
if (strings.u16IsLead(last_token)) {
if (prepend) |*pre| {
buf.appendSliceAssumeCapacity(pre.bytes[0..pre.len]);
// if last unit is high surrogate, exclude from simdutf conversion to avoid error
remain = remain[0 .. remain.len - 1];
this.pending_lead_surrogate = last_token;
if (remain.len == 0) {
if (prepend.len == 0) {
return .undefined;
}
return ArrayBuffer.createUint8Array(
globalObject,
prepend.bytes[0..prepend.len],
);
}
}
const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(remain, buf.unusedCapacitySlice());
bun.debugAssert(remain.len != 0);
switch (result.status) {
else => {
// Slow path: there was invalid UTF-16, so we need to convert it without simdutf.
const lead_surrogate = strings.toUTF8ListWithTypeBun(&buf, []const u16, remain, true) catch {
buf.deinit();
const buf, const exact_size = buf: {
const max_length = ((remain.len + 1) * 3) + prepend.len;
if (max_length < @sizeOf(JSC.RareData.TextEncoderStreamBuffer)) {
break :buf .{
globalObject.bunVM().rareData().textEncoderStreamBuffer(),
false,
};
}
const length = bun.simdutf.length.utf8.from.utf16.le(remain) + prepend.len;
break :buf .{
bun.default_allocator.alloc(u8, length) catch {
globalObject.throwOutOfMemory();
return .zero;
},
true,
};
};
const count = bun.simdutf.convert.utf16.to.utf8.le(remain, buf);
if (count == 0) {
var arr = if (exact_size)
std.ArrayList(u8).fromOwnedSlice(bun.default_allocator, buf)
else
std.ArrayList(u8).init(bun.default_allocator);
arr.items.len = 0;
if (prepend.len != 0) {
arr.appendSlice(prepend.bytes[0..prepend.len]) catch {
globalObject.throwOutOfMemory();
return .zero;
};
}
_ = strings.toUTF8ListWithTypeBun(&arr, []const u16, remain, false) catch {
arr.deinit();
globalObject.throwOutOfMemory();
return .zero;
};
const result_bytes = result_bytes: {
// very unlikely
if (arr.capacity == arr.items.len) break :result_bytes arr.items;
defer arr.deinit();
const owned = bun.default_allocator.alloc(u8, arr.items.len) catch {
globalObject.throwOutOfMemory();
return .zero;
};
if (lead_surrogate) |pending_lead| {
this.pending_lead_surrogate = pending_lead;
if (buf.items.len == 0) return .undefined;
}
@memcpy(owned, arr.items);
return JSC.JSUint8Array.fromBytes(globalObject, buf.items);
},
.success => {
buf.items.len += result.count;
return JSC.JSUint8Array.fromBytes(globalObject, buf.items);
},
break :result_bytes owned;
};
return JSC.JSUint8Array.fromBytes(globalObject, result_bytes);
}
const result_bytes = if (exact_size) buf else result_bytes: {
const owned = bun.default_allocator.alloc(u8, count + prepend.len) catch {
globalObject.throwOutOfMemory();
return .zero;
};
var remain_owned = owned;
if (prepend.len != 0) {
@memcpy(remain_owned[0..prepend.len], prepend.bytes[0..prepend.len]);
remain_owned = remain_owned[prepend.len..];
}
@memcpy(remain_owned, buf[0..count]);
break :result_bytes owned;
};
return JSC.JSUint8Array.fromBytes(globalObject, result_bytes);
}
pub fn flush(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, _: *JSC.CallFrame) JSValue {
@@ -603,7 +671,7 @@ pub const TextEncoderStreamEncoder = struct {
return if (this.pending_lead_surrogate == null)
.undefined
else
JSC.ArrayBuffer.createBuffer(globalObject, &.{ 0xef, 0xbf, 0xbd });
JSC.ArrayBuffer.createUint8Array(globalObject, &.{ 0xef, 0xbf, 0xbd });
}
};

View File

@@ -1039,7 +1039,7 @@ pub fn GlobWalker_(
}
pub fn convertUtf8ToCodepoints(codepoints: []u32, pattern: []const u8) void {
_ = bun.simdutf.convert.utf8.to.utf32.le(pattern, codepoints);
_ = bun.simdutf.convert.valid.utf8.to.utf32(pattern, codepoints);
}
pub fn debugPatternComopnents(this: *GlobWalker) void {