mirror of
https://github.com/oven-sh/bun
synced 2026-02-12 11:59:00 +00:00
519 lines
20 KiB
Zig
519 lines
20 KiB
Zig
//! Contains helpers for C++ to do TextEncoder/Decoder like operations.
|
|
//! Also contains the code used by `bun.String.encode` and `bun.String.encodeInto`
|
|
|
|
/// C-ABI entry point: encodes `len` bytes at `input` into the `to_len`-byte
/// buffer at `to` by dispatching to `writeU8`.
///
/// `encoding` is the integer value of an `Encoding` tag. `.ucs2` is handled as
/// `.utf16le`; any tag without a prong below reaches `unreachable`.
///
/// Returns the value produced by `writeU8`, or 0 if it reports an error.
export fn Bun__encoding__writeLatin1(input: [*]const u8, len: usize, to: [*]u8, to_len: usize, encoding: u8) usize {
    const requested: Encoding = @enumFromInt(encoding);
    const outcome = switch (requested) {
        // These tags are forwarded to `writeU8` unchanged.
        inline .utf8, .latin1, .ascii, .utf16le, .base64, .base64url, .hex => |enc| writeU8(input, len, to, to_len, enc),
        .ucs2 => writeU8(input, len, to, to_len, .utf16le),
        else => unreachable,
    };
    return outcome catch 0;
}
|
|
|
|
/// C-ABI entry point: encodes `len` UTF-16 code units at `input` into the
/// `to_len`-byte buffer at `to` by dispatching to `writeU16` with partial
/// writes disabled.
///
/// `encoding` is the integer value of an `Encoding` tag. `.latin1` is written
/// through the `.ascii` path and `.ucs2` through the `.utf16le` path; any tag
/// without a prong below reaches `unreachable`.
///
/// Returns the value produced by `writeU16`, or 0 if it reports an error.
export fn Bun__encoding__writeUTF16(input: [*]const u16, len: usize, to: [*]u8, to_len: usize, encoding: u8) usize {
    const requested: Encoding = @enumFromInt(encoding);
    const outcome = switch (requested) {
        // These tags are forwarded to `writeU16` unchanged.
        inline .utf8, .ascii, .utf16le, .base64, .base64url, .hex => |enc| writeU16(input, len, to, to_len, enc, false),
        .latin1 => writeU16(input, len, to, to_len, .ascii, false),
        .ucs2 => writeU16(input, len, to, to_len, .utf16le, false),
        else => unreachable,
    };
    return outcome catch 0;
}
|
|
|
|
// TODO(@190n) handle unpaired surrogates
/// C-ABI entry point: returns `byteLengthU8(input, len, .utf8)`, i.e. the
/// length reported for `len` bytes at `input` when encoded as UTF-8.
export fn Bun__encoding__byteLengthLatin1AsUTF8(input: [*]const u8, len: usize) usize {
    const utf8_len = byteLengthU8(input, len, .utf8);
    return utf8_len;
}
|
|
|
|
// TODO(@190n) handle unpaired surrogates
/// C-ABI entry point: returns the value of
/// `strings.elementLengthUTF16IntoUTF8` for the `len` UTF-16 code units at
/// `input`.
export fn Bun__encoding__byteLengthUTF16AsUTF8(input: [*]const u16, len: usize) usize {
    const code_units: []const u16 = input[0..len];
    return strings.elementLengthUTF16IntoUTF8([]const u16, code_units);
}
|
|
|
|
/// C-ABI entry point: converts `len` bytes at `input` with `constructFromU8`
/// (allocating from `bun.default_allocator`) and wraps the resulting bytes in
/// a JS buffer via `JSC.JSValue.createBuffer`.
///
/// `encoding` is the integer value of an `Encoding` tag. `.ucs2` is handled as
/// `.utf16le`. `.latin1` is accepted as well, matching
/// `Bun__encoding__constructFromUTF16` and `Bun__encoding__writeLatin1`;
/// previously it fell through to `unreachable`. Any tag without a prong below
/// still reaches `unreachable`.
export fn Bun__encoding__constructFromLatin1(globalObject: *JSGlobalObject, input: [*]const u8, len: usize, encoding: u8) JSValue {
    const slice = switch (@as(Encoding, @enumFromInt(encoding))) {
        .hex => constructFromU8(input, len, bun.default_allocator, .hex),
        .ascii => constructFromU8(input, len, bun.default_allocator, .ascii),
        .latin1 => constructFromU8(input, len, bun.default_allocator, .latin1),
        .base64url => constructFromU8(input, len, bun.default_allocator, .base64url),
        .utf16le => constructFromU8(input, len, bun.default_allocator, .utf16le),
        .ucs2 => constructFromU8(input, len, bun.default_allocator, .utf16le),
        .utf8 => constructFromU8(input, len, bun.default_allocator, .utf8),
        .base64 => constructFromU8(input, len, bun.default_allocator, .base64),
        else => unreachable,
    };
    // NOTE(review): `slice` comes from `bun.default_allocator` while
    // `bunVM().allocator` is passed to `createBuffer` — presumably the same
    // allocator; confirm.
    return JSC.JSValue.createBuffer(globalObject, slice, globalObject.bunVM().allocator);
}
|
|
|
|
/// C-ABI entry point: converts `len` UTF-16 code units at `input` with
/// `constructFromU16` (allocating from `bun.default_allocator`) and wraps the
/// resulting bytes in a JS buffer via `JSC.JSValue.createBuffer`.
///
/// `encoding` is the integer value of an `Encoding` tag. `.ucs2` is handled as
/// `.utf16le`; any tag without a prong below reaches `unreachable`.
export fn Bun__encoding__constructFromUTF16(globalObject: *JSGlobalObject, input: [*]const u16, len: usize, encoding: u8) JSValue {
    const requested: Encoding = @enumFromInt(encoding);
    const slice = switch (requested) {
        // These tags are forwarded to `constructFromU16` unchanged.
        inline .base64, .hex, .base64url, .utf16le, .utf8, .ascii, .latin1 => |enc| constructFromU16(input, len, bun.default_allocator, enc),
        .ucs2 => constructFromU16(input, len, bun.default_allocator, .utf16le),
        else => unreachable,
    };
    const buffer_allocator = globalObject.bunVM().allocator;
    return JSC.JSValue.createBuffer(globalObject, slice, buffer_allocator);
}
|
|
|
|
// for SQL statement
/// C-ABI entry point: turns the `len` bytes at `input` into a JS string by
/// calling `toStringComptime` with the `.utf8` encoding.
export fn Bun__encoding__toStringUTF8(input: [*]const u8, len: usize, globalObject: *JSC.JSGlobalObject) JSValue {
    const bytes: []const u8 = input[0..len];
    return toStringComptime(bytes, globalObject, .utf8);
}
|
|
|
|
/// C-ABI entry point: turns the `len` bytes at `input` into a JS string using
/// `toString`. `encoding` is the integer value of an `Encoding` tag.
export fn Bun__encoding__toString(input: [*]const u8, len: usize, globalObject: *JSC.JSGlobalObject, encoding: u8) JSValue {
    const requested: Encoding = @enumFromInt(encoding);
    const bytes: []const u8 = input[0..len];
    return toString(bytes, globalObject, requested);
}
|
|
|
|
// pub fn writeUTF16AsUTF8(utf16: [*]const u16, len: usize, to: [*]u8, to_len: usize) callconv(.C) i32 {
|
|
// return @intCast(i32, strings.copyUTF16IntoUTF8(to[0..to_len], []const u16, utf16[0..len]).written);
|
|
// }
|
|
/// Converts `input` into a JS string for a runtime-known `encoding` by
/// dispatching to the matching comptime instantiation of `toStringComptime`.
pub fn toString(input: []const u8, globalObject: *JSGlobalObject, encoding: Encoding) JSValue {
    switch (encoding) {
        // `.buffer` is decoded exactly like `.utf8`; callers are expected to
        // have checked for it before constructing `Buffer` objects.
        .buffer, .utf8 => return toStringComptime(input, globalObject, .utf8),

        // Every remaining tag is passed through as-is.
        inline else => |enc| return toStringComptime(input, globalObject, enc),
    }
}
|
|
|
|
/// Converts `input`, interpreted according to `encoding`, into a `bun.String`,
/// consuming `input` in the process.
///
/// On every non-empty path `input` is either passed to
/// `bun.String.createExternalGloballyAllocated` (presumably transferring
/// ownership to the string — confirm against `bun.String`) or released with
/// `bun.default_allocator.free`. An empty `input` is returned as
/// `bun.String.empty` without being freed.
///
/// Failure cases return `bun.String.dead` (or the `.Dead`-tagged string
/// produced by `createUninitialized`).
pub fn toBunStringFromOwnedSlice(input: []u8, encoding: Encoding) bun.String {
    if (input.len == 0)
        return bun.String.empty;

    switch (encoding) {
        .ascii => {
            // Pure ASCII can back the string directly.
            if (strings.isAllASCII(input)) {
                return bun.String.createExternalGloballyAllocated(.latin1, input);
            }

            // Otherwise copy through `copyLatin1IntoASCII` into a fresh string.
            const str, const chars = bun.String.createUninitialized(.latin1, input.len);
            defer bun.default_allocator.free(input);
            if (str.tag == .Dead) {
                return str;
            }
            strings.copyLatin1IntoASCII(chars, input);
            return str;
        },
        .latin1 => {
            return bun.String.createExternalGloballyAllocated(.latin1, input);
        },
        .buffer, .utf8 => {
            const converted = strings.toUTF16Alloc(bun.default_allocator, input, false, false) catch {
                bun.default_allocator.free(input);
                return bun.String.dead;
            };

            if (converted) |utf16| {
                // The UTF-16 copy backs the string; the original bytes are no longer needed.
                defer bun.default_allocator.free(input);
                return bun.String.createExternalGloballyAllocated(.utf16, utf16);
            }

            // If we get here, it means we can safely assume the string is 100% ASCII characters
            return bun.String.createExternalGloballyAllocated(.latin1, input);
        },
        .ucs2, .utf16le => {
            // Avoid incomplete characters: fewer than two bytes hold no code unit.
            if (input.len / 2 == 0) {
                bun.default_allocator.free(input);
                return bun.String.empty;
            }

            // An odd byte count cannot be reinterpreted in place:
            // `std.mem.bytesAsSlice` divides the length with `@divExact`, which
            // is illegal behavior when a trailing byte is left over. Copy the
            // complete code units into a fresh string and drop the stray byte.
            if (input.len % 2 != 0) {
                defer bun.default_allocator.free(input);
                const str, const chars = bun.String.createUninitialized(.utf16, input.len / 2);
                if (str.tag == .Dead) {
                    return str;
                }
                const out_bytes = std.mem.sliceAsBytes(chars);
                @memcpy(out_bytes, input[0..out_bytes.len]);
                return str;
            }

            const as_u16 = std.mem.bytesAsSlice(u16, input);
            return bun.String.createExternalGloballyAllocated(.utf16, @alignCast(as_u16));
        },

        .hex => {
            defer bun.default_allocator.free(input);
            // Two output characters per input byte.
            const str, const chars = bun.String.createUninitialized(.latin1, input.len * 2);

            if (str.tag == .Dead) {
                return str;
            }

            const wrote = strings.encodeBytesToHex(chars, input);

            // Return an empty string in this case, just like node.
            if (wrote < chars.len) {
                str.deref();
                return bun.String.empty;
            }

            return str;
        },

        // TODO: this is not right. There is an issue here. But it needs to
        // be addressed separately because constructFromU8's base64url also
        // appears inconsistent with Node.js.
        .base64url => {
            defer bun.default_allocator.free(input);
            const out, const chars = bun.String.createUninitialized(.latin1, bun.base64.urlSafeEncodeLen(input));
            if (out.tag != .Dead) {
                _ = bun.base64.encodeURLSafe(chars, input);
            }
            return out;
        },

        .base64 => {
            defer bun.default_allocator.free(input);
            const to_len = bun.base64.encodeLen(input);
            const to = bun.default_allocator.alloc(u8, to_len) catch return bun.String.dead;
            const wrote = bun.base64.encode(to, input);
            return bun.String.createExternalGloballyAllocated(.latin1, to[0..wrote]);
        },
    }
}
|
|
|
|
/// Converts `input` into a JS string for a comptime-known `encoding`: builds a
/// `bun.String` with `toBunStringComptime` and returns the result of its
/// `transferToJS(global)`.
pub fn toStringComptime(input: []const u8, global: *JSGlobalObject, comptime encoding: Encoding) JSValue {
    var decoded = toBunStringComptime(input, encoding);
    const js_value = decoded.transferToJS(global);
    return js_value;
}
|
|
|
|
/// Runtime-`encoding` front end for `toBunStringComptime`: selects the
/// comptime instantiation matching `encoding` and returns its result.
pub fn toBunString(input: []const u8, encoding: Encoding) bun.String {
    switch (encoding) {
        inline else => |enc| return toBunStringComptime(input, enc),
    }
}
|
|
|
|
/// Builds a new `bun.String` from `input`, interpreted according to the
/// comptime-known `encoding`. `input` is only read here, never freed.
///
/// Returns `bun.String.empty` for empty input. When an allocation fails the
/// result is `bun.String.dead`, or the `.Dead`-tagged string returned by
/// `createUninitialized` (checked before its character buffer is touched, as
/// `toBunStringFromOwnedSlice` does).
pub fn toBunStringComptime(input: []const u8, comptime encoding: Encoding) bun.String {
    if (input.len == 0)
        return bun.String.empty;

    switch (comptime encoding) {
        .ascii => {
            const str, const chars = bun.String.createUninitialized(.latin1, input.len);
            if (str.tag == .Dead) return str;
            strings.copyLatin1IntoASCII(chars, input);
            return str;
        },
        .latin1 => {
            const str, const chars = bun.String.createUninitialized(.latin1, input.len);
            if (str.tag == .Dead) return str;
            @memcpy(chars, input);
            return str;
        },
        .buffer, .utf8 => {
            const converted = strings.toUTF16Alloc(bun.default_allocator, input, false, false) catch return bun.String.dead;
            if (converted) |utf16| {
                return bun.String.createExternalGloballyAllocated(.utf16, utf16);
            }

            // If we get here, it means we can safely assume the string is 100% ASCII characters
            // For this, we rely on WebKit to manage the memory.
            return bun.String.cloneLatin1(input);
        },
        .ucs2, .utf16le => {
            // Avoid incomplete characters
            if (input.len / 2 == 0) return bun.String.empty;

            const str, const chars = bun.String.createUninitialized(.utf16, input.len / 2);
            if (str.tag == .Dead) return str;

            // Copy only whole code units; a trailing odd byte is ignored.
            const output_bytes = std.mem.sliceAsBytes(chars);
            @memcpy(output_bytes, input[0..output_bytes.len]);
            return str;
        },

        .hex => {
            // Two output characters per input byte.
            const str, const chars = bun.String.createUninitialized(.latin1, input.len * 2);
            if (str.tag == .Dead) return str;

            const wrote = strings.encodeBytesToHex(chars, input);
            bun.assert(wrote == chars.len);
            return str;
        },

        .base64url => {
            const to_len = bun.base64.urlSafeEncodeLen(input);
            const to = bun.default_allocator.alloc(u8, to_len) catch return bun.String.dead;
            const wrote = bun.base64.encodeURLSafe(to, input);
            return bun.String.createExternalGloballyAllocated(.latin1, to[0..wrote]);
        },

        .base64 => {
            const to_len = bun.base64.encodeLen(input);
            const to = bun.default_allocator.alloc(u8, to_len) catch return bun.String.dead;
            const wrote = bun.base64.encode(to, input);
            return bun.String.createExternalGloballyAllocated(.latin1, to[0..wrote]);
        },
    }
}
|
|
|
|
/// Writes `len` bytes at `input` into the `to_len`-byte buffer at `to_ptr`,
/// converting according to the comptime-known `encoding`.
///
/// - `.buffer`, `.latin1`: copies `@min(len, to_len)` bytes verbatim.
/// - `.ascii`: same length; copied verbatim when the bytes validate as ASCII,
///   otherwise through `strings.copyLatin1IntoASCII`.
/// - `.utf8`: `strings.copyLatin1IntoUTF8`.
/// - `.ucs2`, `.utf16le`: `strings.copyLatin1IntoUTF16` into at most
///   `to_len / 2` code units.
/// - `.hex`: `strings.decodeHexToBytesTruncate`.
/// - `.base64`, `.base64url`: `bun.base64.decode`.
///
/// Returns the number of bytes written to `to_ptr`; 0 when either length is 0.
pub fn writeU8(input: [*]const u8, len: usize, to_ptr: [*]u8, to_len: usize, comptime encoding: Encoding) !usize {
    if (len == 0 or to_len == 0)
        return 0;

    // TODO: increase temporary buffer size for larger amounts of data

    switch (comptime encoding) {
        .buffer, .latin1 => {
            const written = @min(len, to_len);
            @memcpy(to_ptr[0..written], input[0..written]);

            return written;
        },
        .ascii => {
            const written = @min(len, to_len);

            const to = to_ptr[0..written];
            const remain = input[0..written];

            if (bun.simdutf.validate.ascii(remain)) {
                @memcpy(to, remain);
            } else {
                strings.copyLatin1IntoASCII(to, remain);
            }

            return written;
        },
        .utf8 => {
            // need to encode
            return strings.copyLatin1IntoUTF8(to_ptr[0..to_len], []const u8, input[0..len]).written;
        },
        // encode latin1 into UTF16
        .ucs2, .utf16le => {
            // Not even one code unit fits.
            if (to_len < 2)
                return 0;

            const buf = input[0..len];

            // The aligned path only needs the destination to satisfy the
            // alignment of `u16` itself (not of a pointer to it).
            if (std.mem.isAligned(@intFromPtr(to_ptr), @alignOf(u16))) {
                const output = @as([*]u16, @ptrCast(@alignCast(to_ptr)))[0 .. to_len / 2];
                const written = strings.copyLatin1IntoUTF16([]u16, output, []const u8, buf).written;
                // `written` is multiplied by two to report bytes.
                return written * 2;
            } else {
                const output = @as([*]align(1) u16, @ptrCast(to_ptr))[0 .. to_len / 2];
                const written = strings.copyLatin1IntoUTF16([]align(1) u16, output, []const u8, buf).written;
                return written * 2;
            }
        },

        .hex => {
            return strings.decodeHexToBytesTruncate(to_ptr[0..to_len], u8, input[0..len]);
        },

        .base64, .base64url => {
            return bun.base64.decode(to_ptr[0..to_len], input[0..len]).count;
        },
    }
}
|
|
|
|
/// Returns the length reported for the `len` bytes at `input` when converted
/// with the comptime-known `encoding`. Zero-length input always yields 0.
pub fn byteLengthU8(input: [*]const u8, len: usize, comptime encoding: Encoding) usize {
    if (len == 0) return 0;

    const bytes = input[0..len];
    return switch (comptime encoding) {
        .utf8 => strings.elementLengthLatin1IntoUTF8(bytes),

        // One output byte per input byte.
        .latin1, .ascii, .buffer => len,

        // NOTE(review): this measures the input with a UTF-8 based helper,
        // while the `.utf8` branch above treats the same input as Latin-1 —
        // confirm which interpretation callers rely on.
        .ucs2, .utf16le => strings.elementLengthUTF8IntoUTF16([]const u8, bytes) * 2,

        // Two hex digits per output byte.
        .hex => len / 2,

        .base64, .base64url => bun.base64.decodeLen(bytes),
    };
}
|
|
|
|
/// Slice-based convenience wrapper around `writeU16`: encodes the UTF-16 code
/// units in `input` into `to` and returns whatever `writeU16` returns.
pub fn encodeIntoFrom16(input: []const u16, to: []u8, comptime encoding: Encoding, comptime allow_partial_write: bool) !usize {
    const written = try writeU16(
        input.ptr,
        input.len,
        to.ptr,
        to.len,
        encoding,
        allow_partial_write,
    );
    return written;
}
|
|
|
|
/// Slice-based convenience wrapper around `writeU8`: encodes the bytes in
/// `input` into `to` and returns whatever `writeU8` returns.
pub fn encodeIntoFrom8(input: []const u8, to: []u8, comptime encoding: Encoding) !usize {
    const written = try writeU8(
        input.ptr,
        input.len,
        to.ptr,
        to.len,
        encoding,
    );
    return written;
}
|
|
|
|
/// Writes `len` UTF-16 code units at `input` into the `to_len`-byte buffer at
/// `to`, converting according to the comptime-known `encoding`.
///
/// - `.utf8`: `strings.copyUTF16IntoUTF8Impl`, forwarding `allow_partial_write`.
/// - `.latin1`, `.ascii`, `.buffer`: `strings.copyU16IntoU8` for
///   `@min(len, to_len)` code units.
/// - `.ucs2`, `.utf16le`: raw byte copy. Unless `allow_partial_write` is set,
///   only whole code units (an even number of bytes) are copied.
/// - `.hex`: `strings.decodeHexToBytesTruncate`.
/// - `.base64`, `.base64url`: transcodes to UTF-8 first, then defers to
///   `writeU8`.
///
/// Returns the number of bytes written; 0 when `len` is 0.
pub fn writeU16(input: [*]const u16, len: usize, to: [*]u8, to_len: usize, comptime encoding: Encoding, comptime allow_partial_write: bool) !usize {
    if (len == 0)
        return 0;

    switch (comptime encoding) {
        .utf8 => {
            return strings.copyUTF16IntoUTF8Impl(
                to[0..to_len],
                []const u16,
                input[0..len],
                allow_partial_write,
            ).written;
        },
        .latin1, .ascii, .buffer => {
            const out = @min(len, to_len);
            strings.copyU16IntoU8(to[0..to_len], input[0..out]);
            return out;
        },
        // string is already encoded, just need to copy the data
        .ucs2, .utf16le => {
            const input_u8 = @as([*]const u8, @ptrCast(input));
            const available = @min(len * 2, to_len);

            // Without partial writes, round down to whole code units so that
            // half of a code unit is never copied.
            const copy_len = if (allow_partial_write) available else (available / 2) * 2;
            if (copy_len == 0) return 0;

            // Source and destination now have the same length; previously the
            // non-partial path paired a `written`-byte destination with a
            // `fixed_len`-byte source.
            bun.memmove(to[0..copy_len], input_u8[0..copy_len]);
            return copy_len;
        },

        .hex => {
            return strings.decodeHexToBytesTruncate(to[0..to_len], u16, input[0..len]);
        },

        .base64, .base64url => {
            if (to_len < 2 or len == 0)
                return 0;

            // very very slow case!
            // shouldn't really happen though
            const transcoded = strings.toUTF8Alloc(bun.default_allocator, input[0..len]) catch return 0;
            defer bun.default_allocator.free(transcoded);
            return writeU8(transcoded.ptr, transcoded.len, to, to_len, encoding);
        },
    }
}
|
|
|
|
/// Generic front end for `constructFromU16` / `constructFromU8`, selected at
/// comptime by the element type `T`. Any other `T` is a compile error.
pub fn constructFrom(comptime T: type, input: []const T, allocator: std.mem.Allocator, comptime encoding: Encoding) []u8 {
    return if (comptime T == u16)
        constructFromU16(input.ptr, input.len, allocator, encoding)
    else if (comptime T == u8)
        constructFromU8(input.ptr, input.len, allocator, encoding)
    else
        @compileError("Unsupported type for constructFrom: " ++ @typeName(T));
}
|
|
|
|
/// Allocates (from `allocator`) and returns the bytes obtained by converting
/// the `len` bytes at `input` according to the comptime-known `encoding`.
///
/// Returns an empty slice when `len` is 0 or when an allocation fails; for
/// `.hex` also when `len < 2`, and for base64 when nothing remains after
/// trimming surrounding whitespace.
pub fn constructFromU8(input: [*]const u8, len: usize, allocator: std.mem.Allocator, comptime encoding: Encoding) []u8 {
    if (len == 0) return &[_]u8{};

    const source = input[0..len];

    switch (comptime encoding) {
        // Plain byte-for-byte copy.
        .buffer, .latin1, .ascii => {
            const copy = allocator.alloc(u8, len) catch return &[_]u8{};
            @memcpy(copy, source);
            return copy;
        },
        .utf8 => {
            // need to encode
            return strings.allocateLatin1IntoUTF8(allocator, []const u8, source) catch return &[_]u8{};
        },
        // encode latin1 into UTF16
        // return as bytes
        .ucs2, .utf16le => {
            const code_units = allocator.alloc(u16, len) catch return &[_]u8{};
            _ = strings.copyLatin1IntoUTF16([]u16, code_units, []const u8, source);
            return std.mem.sliceAsBytes(code_units);
        },

        .hex => {
            // A single hex digit cannot form a byte.
            if (len < 2)
                return &[_]u8{};

            const decoded = allocator.alloc(u8, len / 2) catch return &[_]u8{};
            const decoded_len = strings.decodeHexToBytesTruncate(decoded, u8, source);
            return decoded[0..decoded_len];
        },

        .base64, .base64url => {
            // Strip surrounding CR, LF, tab, space and vertical tab first.
            const trimmed = strings.trim(source, "\r\n\t " ++ [_]u8{std.ascii.control_code.vt});
            if (trimmed.len == 0) return &[_]u8{};

            const capacity = bun.base64.decodeLen(trimmed);
            const decoded = allocator.alloc(u8, capacity) catch return &[_]u8{};

            const decoded_len = bun.base64.decode(decoded, trimmed).count;
            return decoded[0..decoded_len];
        },
    }
}
|
|
|
|
/// Allocates (from `allocator`) and returns the bytes obtained by converting
/// the `len` UTF-16 code units at `input` according to the comptime-known
/// `encoding`.
///
/// Returns an empty slice when `len` is 0 or when an allocation fails; for
/// `.hex` also when `len < 2`.
pub fn constructFromU16(input: [*]const u16, len: usize, allocator: std.mem.Allocator, comptime encoding: Encoding) []u8 {
    if (len == 0) return &[_]u8{};

    switch (comptime encoding) {
        .utf8 => {
            return strings.toUTF8AllocWithType(allocator, []const u16, input[0..len]) catch return &[_]u8{};
        },
        .latin1, .buffer, .ascii => {
            const to = allocator.alloc(u8, len) catch return &[_]u8{};
            strings.copyU16IntoU8(to[0..len], input[0..len]);
            return to;
        },
        // string is already encoded, just need to copy the data
        .ucs2, .utf16le => {
            const to = std.mem.sliceAsBytes(allocator.alloc(u16, len) catch return &[_]u8{});
            const bytes = std.mem.sliceAsBytes(input[0..len]);
            @memcpy(to[0..bytes.len], bytes);
            return to;
        },

        .hex => {
            // A single hex digit cannot form a byte (same guard as constructFromU8).
            if (len < 2)
                return &[_]u8{};

            // Two hex digits decode to one byte, so `len / 2` bytes are enough;
            // this previously reserved `len * 2`.
            const to = allocator.alloc(u8, len / 2) catch return &[_]u8{};
            return to[0..strings.decodeHexToBytesTruncate(to, u16, input[0..len])];
        },

        .base64, .base64url => {
            // very very slow case!
            // shouldn't really happen though
            const transcoded = strings.toUTF8Alloc(allocator, input[0..len]) catch return &[_]u8{};
            defer allocator.free(transcoded);
            return constructFromU8(transcoded.ptr, transcoded.len, allocator, encoding);
        },
    }
}
|
|
|
|
// Takes the address of every `export fn` declared in this file at comptime,
// presumably so the symbols are always analyzed and emitted for the C++ side.
comptime {
    // Writing into a caller-provided buffer.
    _ = &Bun__encoding__writeLatin1;
    _ = &Bun__encoding__writeUTF16;

    // Length queries.
    _ = &Bun__encoding__byteLengthLatin1AsUTF8;
    _ = &Bun__encoding__byteLengthUTF16AsUTF8;

    // Building JS buffers.
    _ = &Bun__encoding__constructFromLatin1;
    _ = &Bun__encoding__constructFromUTF16;

    // Building JS strings.
    _ = &Bun__encoding__toString;
    _ = &Bun__encoding__toStringUTF8;
}
|
|
|
|
const std = @import("std");
|
|
const JSC = bun.JSC;
|
|
const Encoding = JSC.Node.Encoding;
|
|
const bun = @import("bun");
|
|
const strings = bun.strings;
|
|
const string = bun.string;
|
|
const JSValue = JSC.JSValue;
|
|
const JSGlobalObject = JSC.JSGlobalObject;
|