// Files
// bun.sh/src/bun.js/webcore/encoding.zig
// 2025-07-12 18:19:16 -07:00
//
// 519 lines
// 20 KiB
// Zig

//! Contains helpers for C++ to do TextEncoder/Decoder like operations.
//! Also contains the code used by `bun.String.encode` and `bun.String.encodeInto`
/// C ABI entry point: writes the `len` Latin-1 bytes at `input` into the
/// `to_len`-byte buffer at `to`, using the encoding whose `Encoding` tag has
/// the integer value `encoding`.
///
/// Returns whatever `writeU8` returns, or 0 if `writeU8` returns an error.
/// Tags without a prong below are declared unreachable.
export fn Bun__encoding__writeLatin1(input: [*]const u8, len: usize, to: [*]u8, to_len: usize, encoding: u8) usize {
    const target: Encoding = @enumFromInt(encoding);
    return switch (target) {
        // `ucs2` is served by the `utf16le` implementation.
        .ucs2, .utf16le => writeU8(input, len, to, to_len, .utf16le),
        // Every other supported tag is forwarded unchanged as a comptime value.
        inline .utf8, .latin1, .ascii, .base64, .base64url, .hex => |enc| writeU8(input, len, to, to_len, enc),
        else => unreachable,
    } catch 0;
}
/// C ABI entry point: writes the `len` UTF-16 code units at `input` into the
/// `to_len`-byte buffer at `to`, using the encoding whose `Encoding` tag has
/// the integer value `encoding`. Partial writes are always disabled.
///
/// Returns whatever `writeU16` returns, or 0 if `writeU16` returns an error.
/// Tags without a prong below are declared unreachable.
export fn Bun__encoding__writeUTF16(input: [*]const u16, len: usize, to: [*]u8, to_len: usize, encoding: u8) usize {
    const target: Encoding = @enumFromInt(encoding);
    return switch (target) {
        // `latin1` is written through the `ascii` implementation.
        .latin1, .ascii => writeU16(input, len, to, to_len, .ascii, false),
        // `ucs2` is served by the `utf16le` implementation.
        .ucs2, .utf16le => writeU16(input, len, to, to_len, .utf16le, false),
        inline .utf8, .base64, .base64url, .hex => |enc| writeU16(input, len, to, to_len, enc, false),
        else => unreachable,
    } catch 0;
}
// TODO(@190n) handle unpaired surrogates
/// C ABI entry point: returns `byteLengthU8(input, len, .utf8)`, i.e. the
/// length computed for the `len` Latin-1 bytes at `input` when encoded as UTF-8.
export fn Bun__encoding__byteLengthLatin1AsUTF8(input: [*]const u8, len: usize) usize {
    const utf8_len = byteLengthU8(input, len, .utf8);
    return utf8_len;
}
// TODO(@190n) handle unpaired surrogates
/// C ABI entry point: returns the result of `strings.elementLengthUTF16IntoUTF8`
/// for the `len` UTF-16 code units at `input`.
export fn Bun__encoding__byteLengthUTF16AsUTF8(input: [*]const u16, len: usize) usize {
    const code_units: []const u16 = input[0..len];
    return strings.elementLengthUTF16IntoUTF8([]const u16, code_units);
}
/// C ABI entry point: builds a JS buffer from the `len` Latin-1 bytes at
/// `input`, converted with the encoding whose `Encoding` tag has the integer
/// value `encoding`.
///
/// The bytes are produced by `constructFromU8` using `bun.default_allocator`
/// and then passed to `JSValue.createBuffer` together with the VM's allocator.
/// Tags without a prong below (e.g. `.buffer`) are declared unreachable.
export fn Bun__encoding__constructFromLatin1(globalObject: *JSGlobalObject, input: [*]const u8, len: usize, encoding: u8) JSValue {
    const slice = switch (@as(Encoding, @enumFromInt(encoding))) {
        .hex => constructFromU8(input, len, bun.default_allocator, .hex),
        .ascii => constructFromU8(input, len, bun.default_allocator, .ascii),
        // `latin1` used to fall through to `unreachable` even though
        // `constructFromU8` supports it and the UTF-16 entry point handles it.
        .latin1 => constructFromU8(input, len, bun.default_allocator, .latin1),
        .base64url => constructFromU8(input, len, bun.default_allocator, .base64url),
        .utf16le => constructFromU8(input, len, bun.default_allocator, .utf16le),
        // `ucs2` is served by the `utf16le` implementation.
        .ucs2 => constructFromU8(input, len, bun.default_allocator, .utf16le),
        .utf8 => constructFromU8(input, len, bun.default_allocator, .utf8),
        .base64 => constructFromU8(input, len, bun.default_allocator, .base64),
        else => unreachable,
    };
    return JSC.JSValue.createBuffer(globalObject, slice, globalObject.bunVM().allocator);
}
/// C ABI entry point: builds a JS buffer from the `len` UTF-16 code units at
/// `input`, converted with the encoding whose `Encoding` tag has the integer
/// value `encoding`.
///
/// The bytes are produced by `constructFromU16` using `bun.default_allocator`
/// and then passed to `JSValue.createBuffer` together with the VM's allocator.
/// Tags without a prong below are declared unreachable.
export fn Bun__encoding__constructFromUTF16(globalObject: *JSGlobalObject, input: [*]const u16, len: usize, encoding: u8) JSValue {
    const target: Encoding = @enumFromInt(encoding);
    const bytes = switch (target) {
        // `ucs2` is served by the `utf16le` implementation.
        .ucs2, .utf16le => constructFromU16(input, len, bun.default_allocator, .utf16le),
        inline .utf8, .latin1, .ascii, .base64, .base64url, .hex => |enc| constructFromU16(input, len, bun.default_allocator, enc),
        else => unreachable,
    };
    return JSC.JSValue.createBuffer(globalObject, bytes, globalObject.bunVM().allocator);
}
// for SQL statement
/// C ABI entry point: converts the `len` bytes at `input` to a JS value via
/// `toStringComptime` with the `.utf8` encoding.
export fn Bun__encoding__toStringUTF8(input: [*]const u8, len: usize, globalObject: *JSC.JSGlobalObject) JSValue {
    const bytes: []const u8 = input[0..len];
    return toStringComptime(bytes, globalObject, .utf8);
}
/// C ABI entry point: converts the `len` bytes at `input` to a JS value via
/// `toString`, using the encoding whose `Encoding` tag has the integer value
/// `encoding`.
export fn Bun__encoding__toString(input: [*]const u8, len: usize, globalObject: *JSC.JSGlobalObject, encoding: u8) JSValue {
    const target: Encoding = @enumFromInt(encoding);
    const bytes: []const u8 = input[0..len];
    return toString(bytes, globalObject, target);
}
// pub fn writeUTF16AsUTF8(utf16: [*]const u16, len: usize, to: [*]u8, to_len: usize) callconv(.C) i32 {
// return @intCast(i32, strings.copyUTF16IntoUTF8(to[0..to_len], []const u16, utf16[0..len]).written);
// }
/// Converts `input` to a JS value according to a runtime-known `encoding` by
/// dispatching to the matching `toStringComptime` instantiation.
pub fn toString(input: []const u8, globalObject: *JSGlobalObject, encoding: Encoding) JSValue {
    switch (encoding) {
        // treat buffer as utf8
        // callers are expected to check that before constructing `Buffer` objects
        .utf8, .buffer => return toStringComptime(input, globalObject, .utf8),
        // Every remaining tag is forwarded unchanged as a comptime value.
        inline else => |comptime_encoding| return toStringComptime(input, globalObject, comptime_encoding),
    }
}
/// Converts `input`, interpreted according to `encoding`, into a `bun.String`,
/// taking ownership of `input`.
///
/// For non-empty input, every path either hands `input` to the returned string
/// (through `createExternalGloballyAllocated`) or frees it with
/// `bun.default_allocator`, so the caller must not use or free it afterwards.
///
/// Returns `bun.String.empty` for empty input and a dead string when a string
/// or buffer allocation fails.
pub fn toBunStringFromOwnedSlice(input: []u8, encoding: Encoding) bun.String {
    if (input.len == 0)
        return bun.String.empty;
    switch (encoding) {
        .ascii => {
            // Pure ASCII can be adopted as-is without copying.
            if (strings.isAllASCII(input)) {
                return bun.String.createExternalGloballyAllocated(.latin1, input);
            }
            const str, const chars = bun.String.createUninitialized(.latin1, input.len);
            defer bun.default_allocator.free(input);
            if (str.tag == .Dead) {
                return str;
            }
            strings.copyLatin1IntoASCII(chars, input);
            return str;
        },
        .latin1 => {
            return bun.String.createExternalGloballyAllocated(.latin1, input);
        },
        .buffer, .utf8 => {
            const converted = strings.toUTF16Alloc(bun.default_allocator, input, false, false) catch {
                bun.default_allocator.free(input);
                return bun.String.dead;
            };
            if (converted) |utf16| {
                defer bun.default_allocator.free(input);
                return bun.String.createExternalGloballyAllocated(.utf16, utf16);
            }
            // If we get here, it means we can safely assume the string is 100% ASCII characters
            return bun.String.createExternalGloballyAllocated(.latin1, input);
        },
        .ucs2, .utf16le => {
            // Avoid incomplete characters: a trailing odd byte cannot form a
            // UTF-16 code unit, so it is dropped. `std.mem.bytesAsSlice` requires
            // a length that is an exact multiple of the element size.
            const even_len = input.len - (input.len % 2);
            if (even_len == 0) {
                bun.default_allocator.free(input);
                return bun.String.empty;
            }
            // NOTE(review): for odd lengths the string receives a slice one byte
            // shorter than the allocation, the same pattern the `.base64` branch
            // uses with `to[0..wrote]` — confirm the deallocator frees by pointer.
            const as_u16 = std.mem.bytesAsSlice(u16, input[0..even_len]);
            return bun.String.createExternalGloballyAllocated(.utf16, @alignCast(as_u16));
        },
        .hex => {
            defer bun.default_allocator.free(input);
            const str, const chars = bun.String.createUninitialized(.latin1, input.len * 2);
            if (str.tag == .Dead) {
                return str;
            }
            const wrote = strings.encodeBytesToHex(chars, input);
            // Return an empty string in this case, just like node.
            if (wrote < chars.len) {
                str.deref();
                return bun.String.empty;
            }
            return str;
        },
        // TODO: this is not right. There is an issue here. But it needs to
        // be addressed separately because constructFromU8's base64url also
        // appears inconsistent with Node.js.
        .base64url => {
            defer bun.default_allocator.free(input);
            const out, const chars = bun.String.createUninitialized(.latin1, bun.base64.urlSafeEncodeLen(input));
            if (out.tag != .Dead) {
                _ = bun.base64.encodeURLSafe(chars, input);
            }
            return out;
        },
        .base64 => {
            defer bun.default_allocator.free(input);
            const to_len = bun.base64.encodeLen(input);
            const to = bun.default_allocator.alloc(u8, to_len) catch return bun.String.dead;
            const wrote = bun.base64.encode(to, input);
            return bun.String.createExternalGloballyAllocated(.latin1, to[0..wrote]);
        },
    }
}
/// Converts `input` to a `bun.String` with `toBunStringComptime` for the
/// comptime-known `encoding`, then transfers that string to a JS value in
/// `global`.
pub fn toStringComptime(input: []const u8, global: *JSGlobalObject, comptime encoding: Encoding) JSValue {
    var encoded = toBunStringComptime(input, encoding);
    const js_value = encoded.transferToJS(global);
    return js_value;
}
/// Runtime-dispatching wrapper around `toBunStringComptime`: selects the
/// instantiation that matches `encoding` and returns its result.
pub fn toBunString(input: []const u8, encoding: Encoding) bun.String {
    switch (encoding) {
        inline else => |comptime_encoding| return toBunStringComptime(input, comptime_encoding),
    }
}
/// Creates a `bun.String` from `input`, interpreted according to the
/// comptime-known `encoding`. `input` is only read; every branch copies or
/// re-encodes it into storage owned by the returned string.
///
/// Returns `bun.String.empty` for empty input (and for utf16 input shorter
/// than one code unit), and a dead string when an allocation fails.
pub fn toBunStringComptime(input: []const u8, comptime encoding: Encoding) bun.String {
    if (input.len == 0)
        return bun.String.empty;
    switch (comptime encoding) {
        .ascii => {
            const str, const chars = bun.String.createUninitialized(.latin1, input.len);
            // Do not touch `chars` when the string could not be created.
            if (str.tag == .Dead) return str;
            strings.copyLatin1IntoASCII(chars, input);
            return str;
        },
        .latin1 => {
            const str, const chars = bun.String.createUninitialized(.latin1, input.len);
            if (str.tag == .Dead) return str;
            @memcpy(chars, input);
            return str;
        },
        .buffer, .utf8 => {
            const converted = strings.toUTF16Alloc(bun.default_allocator, input, false, false) catch return bun.String.dead;
            if (converted) |utf16| {
                return bun.String.createExternalGloballyAllocated(.utf16, utf16);
            }
            // If we get here, it means we can safely assume the string is 100% ASCII characters
            // For this, we rely on WebKit to manage the memory.
            return bun.String.cloneLatin1(input);
        },
        .ucs2, .utf16le => {
            // Avoid incomplete characters
            if (input.len / 2 == 0) return bun.String.empty;
            const str, const chars = bun.String.createUninitialized(.utf16, input.len / 2);
            if (str.tag == .Dead) return str;
            // `output_bytes.len` is the even prefix of `input.len`, so a trailing
            // odd byte of `input` is ignored.
            const output_bytes = std.mem.sliceAsBytes(chars);
            @memcpy(output_bytes, input[0..output_bytes.len]);
            return str;
        },
        .hex => {
            const str, const chars = bun.String.createUninitialized(.latin1, input.len * 2);
            if (str.tag == .Dead) return str;
            const wrote = strings.encodeBytesToHex(chars, input);
            bun.assert(wrote == chars.len);
            return str;
        },
        .base64url => {
            const to_len = bun.base64.urlSafeEncodeLen(input);
            const to = bun.default_allocator.alloc(u8, to_len) catch return bun.String.dead;
            const wrote = bun.base64.encodeURLSafe(to, input);
            return bun.String.createExternalGloballyAllocated(.latin1, to[0..wrote]);
        },
        .base64 => {
            const to_len = bun.base64.encodeLen(input);
            const to = bun.default_allocator.alloc(u8, to_len) catch return bun.String.dead;
            const wrote = bun.base64.encode(to, input);
            return bun.String.createExternalGloballyAllocated(.latin1, to[0..wrote]);
        },
    }
}
/// Writes the `len` bytes at `input` (a Latin-1 string) into the `to_len`-byte
/// buffer at `to_ptr` according to the comptime-known `encoding`.
///
/// - `.buffer`, `.latin1`, `.ascii`: copies at most `to_len` bytes.
/// - `.utf8`, `.ucs2`/`.utf16le`: transcodes the Latin-1 input.
/// - `.hex`, `.base64`, `.base64url`: treats the input as encoded text and
///   decodes it into raw bytes.
///
/// Returns the number of bytes written to `to_ptr`; 0 when either length is 0.
pub fn writeU8(input: [*]const u8, len: usize, to_ptr: [*]u8, to_len: usize, comptime encoding: Encoding) !usize {
    if (len == 0 or to_len == 0)
        return 0;
    // TODO: increase temporary buffer size for larger amounts of data
    switch (comptime encoding) {
        .buffer, .latin1 => {
            const written = @min(len, to_len);
            @memcpy(to_ptr[0..written], input[0..written]);
            return written;
        },
        .ascii => {
            const written = @min(len, to_len);
            const to = to_ptr[0..written];
            const remain = input[0..written];
            // Already-valid ASCII can be copied verbatim.
            if (bun.simdutf.validate.ascii(remain)) {
                @memcpy(to, remain);
            } else {
                strings.copyLatin1IntoASCII(to, remain);
            }
            return written;
        },
        .utf8 => {
            // need to encode
            return strings.copyLatin1IntoUTF8(to_ptr[0..to_len], []const u8, input[0..len]).written;
        },
        // encode latin1 into UTF16
        .ucs2, .utf16le => {
            if (to_len < 2)
                return 0;
            const buf = input[0..len];
            // The destination only has to satisfy the alignment of `u16` itself
            // (not of a pointer to it) to be written through a `[*]u16`.
            if (std.mem.isAligned(@intFromPtr(to_ptr), @alignOf(u16))) {
                const output = @as([*]u16, @ptrCast(@alignCast(to_ptr)))[0 .. to_len / 2];
                const written = strings.copyLatin1IntoUTF16([]u16, output, []const u8, buf).written;
                // `written` counts code units; report bytes.
                return written * 2;
            } else {
                const output = @as([*]align(1) u16, @ptrCast(to_ptr))[0 .. to_len / 2];
                const written = strings.copyLatin1IntoUTF16([]align(1) u16, output, []const u8, buf).written;
                return written * 2;
            }
        },
        .hex => {
            return strings.decodeHexToBytesTruncate(to_ptr[0..to_len], u8, input[0..len]);
        },
        .base64, .base64url => {
            return bun.base64.decode(to_ptr[0..to_len], input[0..len]).count;
        },
    }
}
/// Returns how many bytes the `len` bytes at `input` (a Latin-1 string) occupy
/// once written with the comptime-known `encoding`. Returns 0 when `len` is 0.
///
/// For `.hex`, `.base64` and `.base64url` the input is treated as encoded text
/// and the result is the decoded size.
pub fn byteLengthU8(input: [*]const u8, len: usize, comptime encoding: Encoding) usize {
    if (len == 0)
        return 0;
    switch (comptime encoding) {
        .utf8 => {
            return strings.elementLengthLatin1IntoUTF8(input[0..len]);
        },
        .latin1, .ascii, .buffer => {
            return len;
        },
        .ucs2, .utf16le => {
            // Every Latin-1 byte becomes exactly one UTF-16 code unit (see
            // `writeU8` and `constructFromU8`), so the input must not be measured
            // as if it were UTF-8.
            return len * 2;
        },
        .hex => {
            return len / 2;
        },
        .base64, .base64url => {
            return bun.base64.decodeLen(input[0..len]);
        },
    }
}
/// Slice-based wrapper around `writeU16`: encodes the UTF-16 code units in
/// `input` into `to` and returns the count `writeU16` reports.
pub fn encodeIntoFrom16(input: []const u16, to: []u8, comptime encoding: Encoding, comptime allow_partial_write: bool) !usize {
    const written = try writeU16(input.ptr, input.len, to.ptr, to.len, encoding, allow_partial_write);
    return written;
}
/// Slice-based wrapper around `writeU8`: encodes the bytes in `input` into
/// `to` and returns the count `writeU8` reports.
pub fn encodeIntoFrom8(input: []const u8, to: []u8, comptime encoding: Encoding) !usize {
    const written = try writeU8(input.ptr, input.len, to.ptr, to.len, encoding);
    return written;
}
/// Writes the `len` UTF-16 code units at `input` into the `to_len`-byte buffer
/// at `to` according to the comptime-known `encoding`.
///
/// `allow_partial_write` is forwarded to the UTF-8 encoder and, for
/// `.ucs2`/`.utf16le`, decides whether a trailing half code unit may be
/// copied when the destination length is odd.
///
/// Returns the number of bytes written to `to`; 0 when `len` is 0.
pub fn writeU16(input: [*]const u16, len: usize, to: [*]u8, to_len: usize, comptime encoding: Encoding, comptime allow_partial_write: bool) !usize {
    if (len == 0)
        return 0;
    switch (comptime encoding) {
        .utf8 => {
            return strings.copyUTF16IntoUTF8Impl(
                to[0..to_len],
                []const u16,
                input[0..len],
                allow_partial_write,
            ).written;
        },
        .latin1, .ascii, .buffer => {
            const out = @min(len, to_len);
            // Source and destination are given the same length.
            strings.copyU16IntoU8(to[0..out], input[0..out]);
            return out;
        },
        // string is already encoded, just need to copy the data
        .ucs2, .utf16le => {
            const input_u8 = @as([*]const u8, @ptrCast(input));
            const available = @min(len * 2, to_len);
            // Without partial writes, round down so that half of a code unit is
            // never emitted.
            const written = if (allow_partial_write) available else (available / 2) * 2;
            if (written == 0) return 0;
            // Both slices have the same length, equal to the count that is reported.
            bun.memmove(to[0..written], input_u8[0..written]);
            return written;
        },
        .hex => {
            return strings.decodeHexToBytesTruncate(to[0..to_len], u16, input[0..len]);
        },
        .base64, .base64url => {
            if (to_len < 2)
                return 0;
            // very very slow case!
            // shouldn't really happen though
            const transcoded = strings.toUTF8Alloc(bun.default_allocator, input[0..len]) catch return 0;
            defer bun.default_allocator.free(transcoded);
            return writeU8(transcoded.ptr, transcoded.len, to, to_len, encoding);
        },
    }
}
/// Generic front end for `constructFromU8` / `constructFromU16`: picks the
/// implementation that matches the element type `T` of `input`. Any `T` other
/// than `u8` or `u16` is a compile error.
pub fn constructFrom(comptime T: type, input: []const T, allocator: std.mem.Allocator, comptime encoding: Encoding) []u8 {
    if (comptime T == u16) {
        return constructFromU16(input.ptr, input.len, allocator, encoding);
    } else if (comptime T == u8) {
        return constructFromU8(input.ptr, input.len, allocator, encoding);
    } else {
        @compileError("Unsupported type for constructFrom: " ++ @typeName(T));
    }
}
/// Allocates, with `allocator`, the byte representation of the `len` bytes at
/// `input` (a Latin-1 string) in the comptime-known `encoding`.
///
/// Returns an empty slice when `len` is 0, when an allocation fails, and in
/// the early-out cases of the hex and base64 branches.
pub fn constructFromU8(input: [*]const u8, len: usize, allocator: std.mem.Allocator, comptime encoding: Encoding) []u8 {
    const nothing: []u8 = &[_]u8{};
    if (len == 0) return nothing;
    const source = input[0..len];
    switch (comptime encoding) {
        // These encodings keep the bytes as they are: plain copy.
        .buffer, .latin1, .ascii => {
            return allocator.dupe(u8, source) catch return nothing;
        },
        .utf8 => {
            // need to encode
            return strings.allocateLatin1IntoUTF8(allocator, []const u8, source) catch return nothing;
        },
        // encode latin1 into UTF16
        // return as bytes
        .ucs2, .utf16le => {
            const wide = allocator.alloc(u16, len) catch return nothing;
            _ = strings.copyLatin1IntoUTF16([]u16, wide, []const u8, source);
            return std.mem.sliceAsBytes(wide);
        },
        .hex => {
            // Fewer than two characters cannot form a single byte.
            if (len < 2)
                return nothing;
            const decoded = allocator.alloc(u8, len / 2) catch return nothing;
            const count = strings.decodeHexToBytesTruncate(decoded, u8, source);
            return decoded[0..count];
        },
        .base64, .base64url => {
            // Surrounding whitespace is stripped before decoding.
            const whitespace = "\r\n\t " ++ [_]u8{std.ascii.control_code.vt};
            const trimmed = strings.trim(source, whitespace);
            if (trimmed.len == 0) return nothing;
            const capacity = bun.base64.decodeLen(trimmed);
            const decoded = allocator.alloc(u8, capacity) catch return nothing;
            const count = bun.base64.decode(decoded[0..capacity], trimmed).count;
            return decoded[0..count];
        },
    }
}
/// Allocates, with `allocator`, the byte representation of the `len` UTF-16
/// code units at `input` in the comptime-known `encoding`.
///
/// Returns an empty slice when `len` is 0 or when an allocation fails.
pub fn constructFromU16(input: [*]const u16, len: usize, allocator: std.mem.Allocator, comptime encoding: Encoding) []u8 {
    if (len == 0) return &[_]u8{};
    switch (comptime encoding) {
        .utf8 => {
            return strings.toUTF8AllocWithType(allocator, []const u16, input[0..len]) catch return &[_]u8{};
        },
        .latin1, .buffer, .ascii => {
            const to = allocator.alloc(u8, len) catch return &[_]u8{};
            strings.copyU16IntoU8(to[0..len], input[0..len]);
            return to;
        },
        // string is already encoded, just need to copy the data
        .ucs2, .utf16le => {
            const to = std.mem.sliceAsBytes(allocator.alloc(u16, len) catch return &[_]u8{});
            const bytes = std.mem.sliceAsBytes(input[0..len]);
            @memcpy(to[0..bytes.len], bytes);
            return to;
        },
        .hex => {
            // Two hex characters decode to one byte, so `len / 2` bytes is the
            // most that can be produced (the same sizing `constructFromU8` uses);
            // fewer than two characters cannot form a single byte.
            if (len < 2)
                return &[_]u8{};
            const to = allocator.alloc(u8, len / 2) catch return &[_]u8{};
            return to[0..strings.decodeHexToBytesTruncate(to, u16, input[0..len])];
        },
        .base64, .base64url => {
            // very very slow case!
            // shouldn't really happen though
            const transcoded = strings.toUTF8Alloc(allocator, input[0..len]) catch return &[_]u8{};
            defer allocator.free(transcoded);
            return constructFromU8(transcoded.ptr, transcoded.len, allocator, encoding);
        },
    }
}
// Takes the address of every exported entry point at comptime.
// NOTE(review): presumably this keeps the `export fn` declarations referenced
// so they are analyzed and emitted even though no Zig code calls them — confirm.
comptime {
    // Length queries.
    _ = &Bun__encoding__byteLengthLatin1AsUTF8;
    _ = &Bun__encoding__byteLengthUTF16AsUTF8;
    // Writers into caller-provided buffers.
    _ = &Bun__encoding__writeLatin1;
    _ = &Bun__encoding__writeUTF16;
    // Buffer constructors.
    _ = &Bun__encoding__constructFromLatin1;
    _ = &Bun__encoding__constructFromUTF16;
    // Bytes-to-string conversions.
    _ = &Bun__encoding__toStringUTF8;
    _ = &Bun__encoding__toString;
}
const std = @import("std");
const JSC = bun.JSC;
const Encoding = JSC.Node.Encoding;
const bun = @import("bun");
const strings = bun.strings;
const string = bun.string;
const JSValue = JSC.JSValue;
const JSGlobalObject = JSC.JSGlobalObject;