refactor: move string-like structs into string module (#17369)

This commit is contained in:
Don Isaac
2025-02-15 21:52:43 -08:00
committed by GitHub
parent 59f3d1ca31
commit cdf62b35ff
21 changed files with 1079 additions and 1073 deletions

View File

@@ -0,0 +1,43 @@
const HashedString = @This();
const bun = @import("root").bun;
/// Start of the borrowed bytes. `init` and `initNoHash` store `buf.ptr` as-is; no copy is made.
ptr: [*]const u8,
/// Length in bytes, kept as a `u32` (`init`/`initNoHash` truncate longer inputs).
len: u32,
/// `bun.hash` of the bytes truncated to 32 bits, or 0 when built with `initNoHash`.
hash: u32,
/// Zero-length value with a zero hash. `ptr` is the placeholder address 0xDEADBEEF.
pub const empty = HashedString{ .ptr = @as([*]const u8, @ptrFromInt(0xDEADBEEF)), .len = 0, .hash = 0 };
/// Wrap `buf` (borrowed, not copied) and record a 32-bit hash of its contents.
/// Both the length and the `bun.hash` result are truncated to `u32`.
pub fn init(buf: []const u8) HashedString {
    const length: u32 = @truncate(buf.len);
    const digest: u32 = @truncate(bun.hash(buf));
    return .{ .ptr = buf.ptr, .len = length, .hash = digest };
}
/// Wrap `buf` (borrowed, not copied) without hashing it; `hash` is left at 0.
/// The length is truncated to `u32`.
pub fn initNoHash(buf: []const u8) HashedString {
    const length: u32 = @truncate(buf.len);
    return .{ .ptr = buf.ptr, .len = length, .hash = 0 };
}
/// Compare against another `HashedString` (by value or pointer) or against
/// anything that has `.len` and can be sliced as bytes. Dispatches on the
/// compile-time type of `other`.
pub fn eql(this: HashedString, other: anytype) bool {
    const Other = @TypeOf(other);
    return Eql(this, Other, other);
}
/// Type-directed equality used by `eql`.
///
/// * `HashedString` / pointer to one: lengths must match, and either the two
///   hashes are equal with at least one of them non-zero, or both point at the
///   same address. The bytes themselves are not compared.
/// * Anything else: lengths must match and the truncated `bun.hash` of `other`
///   must equal `this.hash`. The comparison is against `this.hash` as stored,
///   including the 0 that `initNoHash` leaves there.
fn Eql(this: HashedString, comptime Other: type, other: Other) bool {
    return switch (comptime Other) {
        HashedString, *HashedString, *const HashedString => blk: {
            const hashes_match = @max(this.hash, other.hash) > 0 and this.hash == other.hash;
            break :blk (hashes_match or this.ptr == other.ptr) and this.len == other.len;
        },
        else => @as(usize, this.len) == other.len and
            @as(u32, @truncate(bun.hash(other[0..other.len]))) == this.hash,
    };
}
/// View the borrowed bytes as a slice of `len` bytes starting at `ptr`.
pub fn str(this: HashedString) []const u8 {
    const length: usize = this.len;
    return this.ptr[0..length];
}

View File

@@ -0,0 +1,452 @@
const std = @import("std");
const bun = @import("root").bun;
const Allocator = std.mem.Allocator;
const strings = bun.strings;
const js_lexer = bun.js_lexer;
const string = bun.string;
const stringZ = bun.stringZ;
const CodePoint = bun.CodePoint;
const MutableString = @This();
/// Allocator passed to every growing or freeing operation on `list`.
allocator: Allocator,
/// The bytes themselves; `list.items` is the current string content.
list: std.ArrayListUnmanaged(u8),
/// Create an empty string with 2048 bytes of capacity reserved up front.
pub fn init2048(allocator: Allocator) Allocator.Error!MutableString {
    const initial_capacity: usize = 2048;
    return MutableString.init(allocator, initial_capacity);
}
/// Make an independent copy of the current contents using the same allocator.
pub fn clone(self: *MutableString) Allocator.Error!MutableString {
    const source = self.list.items;
    return MutableString.initCopy(self.allocator, source);
}
/// `std.io.Writer` adapter whose write function is `MutableString.writeAll`;
/// the only error it can report is `Allocator.Error`.
pub const Writer = std.io.Writer(*@This(), Allocator.Error, MutableString.writeAll);
/// Obtain a `Writer` that appends to this string.
pub fn writer(self: *MutableString) Writer {
    return .{ .context = self };
}
/// True when the string currently holds zero bytes (capacity is ignored).
pub fn isEmpty(this: *const MutableString) bool {
    const bytes = this.list.items;
    return bytes.len == 0;
}
/// Free the backing buffer, if one was ever allocated. The list is first
/// expanded to its full capacity and then cleared and freed.
pub fn deinit(str: *MutableString) void {
    if (str.list.capacity == 0) return;
    str.list.expandToCapacity();
    str.list.clearAndFree(str.allocator);
}
/// Report whether `items` lies inside this string's allocated buffer
/// (the whole capacity, not just the bytes in use).
pub fn owns(this: *const MutableString, items: []const u8) bool {
    const backing = this.list.items.ptr[0..this.list.capacity];
    return bun.isSliceInBuffer(items, backing);
}
/// Make sure at least `amount` more bytes can be appended without reallocating.
pub fn growIfNeeded(self: *MutableString, amount: usize) Allocator.Error!void {
    return self.list.ensureUnusedCapacity(self.allocator, amount);
}
/// Append `bytes` and return how many were written (always `bytes.len`).
/// Debug builds assert that a non-empty `bytes` does not alias this string's
/// own buffer.
pub fn write(self: *MutableString, bytes: anytype) Allocator.Error!usize {
    const aliases_own_buffer = bytes.len != 0 and bun.isSliceInBuffer(bytes, self.list.allocatedSlice());
    bun.debugAssert(!aliases_own_buffer);
    try self.list.appendSlice(self.allocator, bytes);
    return bytes.len;
}
/// Create a `BufferedWriter` that stages writes before appending them here.
pub fn bufferedWriter(self: *MutableString) BufferedWriter {
    return .{ .context = self };
}
/// Create an empty string. A `capacity` of 0 performs no allocation; any other
/// value reserves that many bytes up front.
pub fn init(allocator: Allocator, capacity: usize) Allocator.Error!MutableString {
    if (capacity == 0) {
        return MutableString{ .allocator = allocator, .list = std.ArrayListUnmanaged(u8){} };
    }
    const reserved = try std.ArrayListUnmanaged(u8).initCapacity(allocator, capacity);
    return MutableString{ .allocator = allocator, .list = reserved };
}
/// Create an empty string without allocating; cannot fail.
pub fn initEmpty(allocator: Allocator) MutableString {
    return .{ .allocator = allocator, .list = .{} };
}
/// Alias of `growIfNeeded`.
pub const ensureUnusedCapacity = growIfNeeded;
/// Create a string whose capacity is `str.len` and whose contents are a copy of `str`.
pub fn initCopy(allocator: Allocator, str: anytype) Allocator.Error!MutableString {
    var result = try MutableString.init(allocator, str.len);
    try result.copy(str);
    return result;
}
/// Convert it to an ASCII identifier. Note: If you change this to a non-ASCII
/// identifier, you're going to potentially cause trouble with non-BMP code
/// points in target environments that don't support bracketed Unicode escapes.
///
/// The returned slice is one of: the literal "_", `str` itself, an entry from
/// `StrictModeReservedWordsRemap`, or a fresh allocation from `allocator`.
/// Only the last case allocates, so callers cannot free the result unconditionally.
pub fn ensureValidIdentifier(str: string, allocator: Allocator) Allocator.Error!string {
if (str.len == 0) {
return "_";
}
var iterator = strings.CodepointIterator.init(str);
var cursor = strings.CodepointIterator.Cursor{};
// NOTE(review): `has_needed_gap` is only ever assigned in this function, never read.
var has_needed_gap = false;
var needs_gap = false;
// Byte offset of the first code point that has to be replaced (0 if it is the very first one).
var start_i: usize = 0;
if (!iterator.next(&cursor)) return "_";
const JSLexerTables = @import("../js_lexer_tables.zig");
// Common case: no gap necessary. No allocation necessary.
needs_gap = !js_lexer.isIdentifierStart(cursor.c);
if (!needs_gap) {
// Are there any non-alphanumeric chars at all?
// A code point is rejected when it is not an identifier-continue character
// or when it is wider than one byte.
while (iterator.next(&cursor)) {
if (!js_lexer.isIdentifierContinue(cursor.c) or cursor.width > 1) {
needs_gap = true;
start_i = cursor.i;
break;
}
}
}
if (!needs_gap) {
// Nothing to replace: return the remap-table entry for `str` if there is one, else `str`.
return JSLexerTables.StrictModeReservedWordsRemap.get(str) orelse str;
}
if (needs_gap) {
// Slow path: seed the output with the already-accepted prefix, then rescan
// from `start_i`, collapsing every run of rejected code points into one '_'.
var mutable = try MutableString.initCopy(allocator, if (start_i == 0)
// the first letter can be a non-identifier start
// https://github.com/oven-sh/bun/issues/2946
"_"
else
str[0..start_i]);
needs_gap = false;
var items = str[start_i..];
iterator = strings.CodepointIterator.init(items);
cursor = strings.CodepointIterator.Cursor{};
while (iterator.next(&cursor)) {
if (js_lexer.isIdentifierContinue(cursor.c) and cursor.width == 1) {
if (needs_gap) {
// Emit the single '_' standing in for the rejected run that just ended.
try mutable.appendChar('_');
needs_gap = false;
has_needed_gap = true;
}
try mutable.append(items[cursor.i .. cursor.i + @as(u32, cursor.width)]);
} else if (!needs_gap) {
needs_gap = true;
// skip the code point, replace it with a single _
}
}
// If it ends with an emoji
if (needs_gap) {
try mutable.appendChar('_');
needs_gap = false;
has_needed_gap = true;
}
if (comptime bun.Environment.allow_assert) {
bun.assert(js_lexer.isIdentifier(mutable.list.items));
}
// Ownership of the buffer moves to the caller.
return try mutable.list.toOwnedSlice(allocator);
}
// Not reachable: `needs_gap` was either false (returned above) or true (returned inside the block).
return str;
}
/// Number of bytes currently stored.
pub fn len(self: *const MutableString) usize {
    const bytes = self.list.items;
    return bytes.len;
}
/// Overwrite the front of this string with `str`.
///
/// * Empty string: the contents become `str`.
/// * Existing contents at least as long as `str`: the first `str.len` bytes are
///   replaced and the tail is kept.
/// * Existing contents shorter than `str`: the contents become exactly `str`.
///   Previously this asked `replaceRange` for a range longer than the list and
///   indexed out of bounds; the range is now clamped to the current length.
pub fn copy(self: *MutableString, str: anytype) Allocator.Error!void {
    const incoming_len: usize = str[0..].len;
    try self.list.ensureTotalCapacity(self.allocator, incoming_len);
    if (self.list.items.len == 0) {
        try self.list.insertSlice(self.allocator, 0, str);
    } else {
        // Never ask to replace more bytes than the list currently holds.
        const overwritten: usize = @min(self.list.items.len, incoming_len);
        try self.list.replaceRange(self.allocator, 0, overwritten, str[0..]);
    }
}
/// Reserve room for `amount` additional bytes. Length is unchanged.
pub inline fn growBy(self: *MutableString, amount: usize) Allocator.Error!void {
    return self.list.ensureUnusedCapacity(self.allocator, amount);
}
/// Append a copy of `items`, growing the buffer if required.
pub inline fn appendSlice(self: *MutableString, items: []const u8) Allocator.Error!void {
    return self.list.appendSlice(self.allocator, items);
}
/// Append `items`, asking for capacity of exactly the resulting length
/// (`ensureTotalCapacityPrecise`) rather than an amortised growth step.
/// Appending an empty slice is a no-op.
pub inline fn appendSliceExact(self: *MutableString, items: []const u8) Allocator.Error!void {
    if (items.len == 0) return;
    const old_len = self.list.items.len;
    const new_len = old_len + items.len;
    try self.list.ensureTotalCapacityPrecise(self.allocator, new_len);
    self.list.items.len = new_len;
    @memcpy(self.list.items[old_len..new_len], items);
}
/// Drop the contents but keep the allocated buffer for reuse.
pub inline fn reset(self: *MutableString) void {
    self.list.clearRetainingCapacity();
}
/// Set the length to `index` directly. `index` must not exceed the current
/// capacity (asserted); no bytes are initialised or freed.
pub inline fn resetTo(self: *MutableString, index: usize) void {
    const capacity = self.list.capacity;
    bun.assert(index <= capacity);
    self.list.items.len = index;
}
/// Resize the string so its length is exactly `amount` (this is the new total
/// length, not an increment).
pub fn inflate(self: *MutableString, amount: usize) Allocator.Error!void {
    return self.list.resize(self.allocator, amount);
}
/// Append the byte `char` repeated `n` times.
pub inline fn appendCharNTimes(self: *MutableString, char: u8, n: usize) Allocator.Error!void {
    return self.list.appendNTimes(self.allocator, char, n);
}
/// Append a single byte, growing the buffer if required.
pub inline fn appendChar(self: *MutableString, char: u8) Allocator.Error!void {
    return self.list.append(self.allocator, char);
}
/// Append a single byte without growing; the caller must have reserved space.
pub inline fn appendCharAssumeCapacity(self: *MutableString, char: u8) void {
    const list = &self.list;
    list.appendAssumeCapacity(char);
}
/// Append a copy of the bytes in `char` (a slice, despite the parameter name).
pub inline fn append(self: *MutableString, char: []const u8) Allocator.Error!void {
    return self.list.appendSlice(self.allocator, char);
}
/// Append `int` formatted as base-10 ASCII digits.
///
/// The digit count is computed first so the exact space can be reserved, then
/// the digits are formatted straight into the list's buffer.
pub inline fn appendInt(self: *MutableString, int: u64) Allocator.Error!void {
    const count = bun.fmt.fastDigitCount(int);
    try self.list.ensureUnusedCapacity(self.allocator, count);
    const old = self.list.items.len;
    self.list.items.len += count;
    // Do the formatting as its own statement so the write never depends on how
    // the assertion helper is compiled; the assert only checks the predicted size.
    const written = std.fmt.formatIntBuf(self.list.items.ptr[old .. old + count], int, 10, .lower, .{});
    bun.assert(written == count);
}
/// Append the bytes in `char` without growing; the caller must have reserved space.
pub inline fn appendAssumeCapacity(self: *MutableString, char: []const u8) void {
    self.list.appendSliceAssumeCapacity(char);
}
/// Current length as an `i32`. The cast is checked (`@intCast`), so the length
/// must fit in an `i32`.
pub inline fn lenI(self: *MutableString) i32 {
    const length = self.list.items.len;
    return @intCast(length);
}
/// Hand the contents over to the caller as an owned slice allocated with
/// `self.allocator`. Allocation failure is routed to `bun.outOfMemory()`.
pub fn toOwnedSlice(self: *MutableString) string {
    const owned = self.list.toOwnedSlice(self.allocator) catch bun.outOfMemory(); // TODO
    return owned;
}
/// Mutable view of the current contents. The view aliases the internal
/// buffer, so it is only valid until the next operation that may reallocate.
pub fn slice(self: *MutableString) []u8 {
    const contents: []u8 = self.list.items;
    return contents;
}
/// Detach and return the current contents, leaving this string empty with no
/// buffer. Nothing is freed here: the returned slice still points at the old
/// allocation, and only `items` is returned, so the list's capacity is not
/// reported to the caller.
pub fn move(self: *MutableString) []u8 {
    defer self.list = .{};
    return self.list.items;
}
/// Return the contents as a NUL-terminated slice, appending a `0` byte if the
/// string is empty or does not already end in one. The terminator is stored in
/// the list but excluded from the returned slice's length.
///
/// An empty string used to skip the append and then compute `len - 1` on a
/// zero length; it now gets a terminator like any other input. Allocation
/// failure goes through `bun.outOfMemory()`, as in `toOwnedSlice`, rather than
/// `unreachable`.
pub fn sliceWithSentinel(self: *MutableString) [:0]u8 {
    const current = self.list.items;
    if (current.len == 0 or current[current.len - 1] != 0) {
        self.list.append(self.allocator, 0) catch bun.outOfMemory();
    }
    // The list is now guaranteed to be non-empty and to end in 0.
    return self.list.items[0 .. self.list.items.len - 1 :0];
}
/// Shrink the string to `length` bytes (releasing the excess), then hand the
/// result to the caller as an owned slice. Allocation failure is routed to
/// `bun.outOfMemory()`.
pub fn toOwnedSliceLength(self: *MutableString, length: usize) string {
    self.list.shrinkAndFree(self.allocator, length);
    const owned = self.list.toOwnedSlice(self.allocator) catch bun.outOfMemory(); // TODO
    return owned;
}
/// True when the byte `char` occurs anywhere in the string.
pub fn containsChar(self: *const MutableString, char: u8) bool {
    if (self.indexOfChar(char)) |_| return true;
    return false;
}
/// Index of the first occurrence of `char`, or null if absent.
pub fn indexOfChar(self: *const MutableString, char: u8) ?u32 {
    const haystack = self.list.items;
    return strings.indexOfChar(haystack, char);
}
/// Index of the last occurrence of `char`, or null if absent.
pub fn lastIndexOfChar(self: *const MutableString, char: u8) ?usize {
    const haystack = self.list.items;
    return strings.lastIndexOfChar(haystack, char);
}
/// Index of the last occurrence of the byte `str`, or null if absent.
/// Despite the parameter name this searches for a single byte, exactly like
/// `lastIndexOfChar`.
pub fn lastIndexOf(self: *const MutableString, str: u8) ?usize {
    const haystack = self.list.items;
    return strings.lastIndexOfChar(haystack, str);
}
/// Index of the first occurrence of the byte `str`, or null if absent.
///
/// The parameter is a single byte, so the scalar search is used.
/// `std.mem.indexOf` takes a slice needle and does not accept a `u8`, which
/// meant the previous body failed to compile as soon as anything referenced it.
pub fn indexOf(self: *const MutableString, str: u8) ?usize {
    return std.mem.indexOfScalar(u8, self.list.items, str);
}
/// Byte-wise equality between the current contents and `other`.
pub fn eql(self: *MutableString, other: anytype) bool {
    const mine = self.list.items;
    return std.mem.eql(u8, mine, other);
}
/// Build `count` `iovec_const` entries, one per element of `ranges`. Each range
/// is indexed as `r[0]` (start) and `r[1]` (end) into the current contents.
/// The entries alias the internal buffer.
pub fn toSocketBuffers(self: *MutableString, comptime count: usize, ranges: anytype) [count]std.posix.iovec_const {
    var out: [count]std.posix.iovec_const = undefined;
    inline for (&out, ranges) |*entry, range| {
        const segment = self.list.items[range[0]..range[1]];
        entry.* = .{
            .iov_base = segment.ptr,
            .iov_len = segment.len,
        };
    }
    return out;
}
/// Small staging buffer in front of a `MutableString`.
///
/// Writes shorter than `max` bytes are accumulated in `buffer` and appended to
/// `context` on `flush` (or when the next write would not fit). Larger writes
/// flush what is pending and go straight to `context`. Callers must call
/// `flush` after the last write.
pub const BufferedWriter = struct {
    /// Destination string.
    context: *MutableString,
    /// Staging area; only `buffer[0..pos]` holds pending bytes.
    buffer: [max]u8 = undefined,
    /// Number of pending bytes in `buffer`.
    pos: usize = 0,

    /// Size of the staging buffer in bytes.
    const max = 2048;

    pub const Writer = std.io.Writer(*BufferedWriter, Allocator.Error, BufferedWriter.writeAll);

    /// Unused tail of the staging buffer.
    inline fn remain(this: *BufferedWriter) []u8 {
        return this.buffer[this.pos..];
    }

    /// Append all pending bytes to `context` and empty the staging buffer.
    pub fn flush(this: *BufferedWriter) Allocator.Error!void {
        _ = try this.context.writeAll(this.buffer[0..this.pos]);
        this.pos = 0;
    }

    /// Write `bytes`; returns `bytes.len`.
    pub fn writeAll(this: *BufferedWriter, bytes: []const u8) Allocator.Error!usize {
        const pending = bytes;
        if (pending.len >= max) {
            // Too large to stage: flush first to preserve ordering, then append directly.
            try this.flush();
            try this.context.append(pending);
            return pending.len;
        }
        if (pending.len > 0) {
            if (pending.len + this.pos > max) {
                try this.flush();
            }
            @memcpy(this.remain()[0..pending.len], pending);
            this.pos += pending.len;
        }
        return pending.len;
    }

    const E = bun.JSAst.E;

    /// Write a E.String to the buffer.
    /// This automatically encodes UTF-16 into UTF-8 using
    /// the same code path as TextEncoder
    pub fn writeString(this: *BufferedWriter, bytes: *E.String) Allocator.Error!usize {
        if (bytes.isUTF8()) {
            return try this.writeAll(bytes.slice(this.context.allocator));
        }
        return try this.writeAll16(bytes.slice16());
    }

    /// Write a UTF-16 string to the (UTF-8) buffer
    /// This automatically encodes UTF-16 into UTF-8 using
    /// the same code path as TextEncoder
    ///
    /// Returns the number of UTF-16 code units consumed (`bytes.len`).
    ///
    /// Space is budgeted at 3 output bytes per input code unit, the UTF-8 worst
    /// case for a single UTF-16 unit. Inputs whose budget exceeds the staging
    /// buffer are encoded directly into `context`'s spare capacity; smaller
    /// inputs are encoded into the staging buffer. The previous version
    /// budgeted 2 bytes per unit and, on the large path, encoded into the
    /// 2048-byte staging buffer while advancing `context`'s length, which
    /// sliced out of bounds and exposed unwritten bytes.
    pub fn writeAll16(this: *BufferedWriter, bytes: []const u16) Allocator.Error!usize {
        const pending = bytes;
        if (pending.len == 0) return 0;

        const worst_case = pending.len * 3;
        if (worst_case > max) {
            try this.flush();
            try this.context.list.ensureUnusedCapacity(this.context.allocator, worst_case);
            const decoded = strings.copyUTF16IntoUTF8(
                this.context.list.unusedCapacitySlice()[0..worst_case],
                []const u16,
                bytes,
                true,
            );
            this.context.list.items.len += @as(usize, decoded.written);
            return pending.len;
        }

        // worst_case <= max here, so after a flush the staging buffer always fits it.
        if (worst_case + this.pos > max) {
            try this.flush();
        }
        const decoded = strings.copyUTF16IntoUTF8(
            this.remain()[0..worst_case],
            []const u16,
            bytes,
            true,
        );
        this.pos += @as(usize, decoded.written);
        return pending.len;
    }

    /// Write an `E.String` with `"`, `<` and `>` replaced by HTML entities,
    /// choosing the UTF-8 or UTF-16 path based on the string's encoding.
    pub fn writeHTMLAttributeValueString(this: *BufferedWriter, str: *E.String) Allocator.Error!void {
        if (str.isUTF8()) {
            try this.writeHTMLAttributeValue(str.slice(this.context.allocator));
            return;
        }
        try this.writeHTMLAttributeValue16(str.slice16());
    }

    /// Write `bytes` with `"`, `<` and `>` replaced by `&quot;`, `&lt;` and `&gt;`.
    pub fn writeHTMLAttributeValue(this: *BufferedWriter, bytes: []const u8) Allocator.Error!void {
        var items = bytes;
        while (items.len > 0) {
            // TODO: SIMD
            if (strings.indexOfAny(items, "\"<>")) |j| {
                _ = try this.writeAll(items[0..j]);
                _ = switch (items[j]) {
                    '"' => try this.writeAll("&quot;"),
                    '<' => try this.writeAll("&lt;"),
                    '>' => try this.writeAll("&gt;"),
                    else => unreachable,
                };
                items = items[j + 1 ..];
                continue;
            }
            _ = try this.writeAll(items);
            break;
        }
    }

    /// UTF-16 variant of `writeHTMLAttributeValue`; output is UTF-8.
    pub fn writeHTMLAttributeValue16(this: *BufferedWriter, bytes: []const u16) Allocator.Error!void {
        var items = bytes;
        while (items.len > 0) {
            if (strings.indexOfAny16(items, "\"<>")) |j| {
                // this won't handle strings larger than 4 GB
                // that's fine though, 4 GB of SSR'd HTML is quite a lot...
                _ = try this.writeAll16(items[0..j]);
                _ = switch (items[j]) {
                    '"' => try this.writeAll("&quot;"),
                    '<' => try this.writeAll("&lt;"),
                    '>' => try this.writeAll("&gt;"),
                    else => unreachable,
                };
                items = items[j + 1 ..];
                continue;
            }
            _ = try this.writeAll16(items);
            break;
        }
    }

    /// `std.io.Writer` adapter over `writeAll`.
    pub fn writer(this: *BufferedWriter) BufferedWriter.Writer {
        return BufferedWriter.Writer{ .context = this };
    }
};
/// Append `bytes` and return `bytes.len`. This is the write function behind `Writer`.
pub fn writeAll(self: *MutableString, bytes: string) Allocator.Error!usize {
    const written = bytes.len;
    try self.list.appendSlice(self.allocator, bytes);
    return written;
}

58
src/string/PathString.zig Normal file
View File

@@ -0,0 +1,58 @@
const std = @import("std");
const bun = @import("root").bun;
// macOS sets file path limit to 1024
// Since a pointer on x64 is 64 bits and only 46 bits are used
// We can safely store the entire path slice in a single u64.
//
// Concretely: when a `usize` has at least 53 bits to spare after reserving
// enough bits to count up to `bun.MAX_PATH_BYTES`, the address is kept in a
// `u53` beside a narrow length. Otherwise both fields are a full `usize`.
// The comptime block at the bottom enforces 64 resp. 128 bits (except on wasm).
pub const PathString = packed struct {
    /// Smallest unsigned integer able to hold any length up to `bun.MAX_PATH_BYTES`.
    const PathIntLen = std.math.IntFittingRange(0, bun.MAX_PATH_BYTES);
    /// Whether the compact (pointer + length in one word) layout is in use.
    pub const use_small_path_string = @bitSizeOf(usize) - @bitSizeOf(PathIntLen) >= 53;
    /// Integer type of `len`.
    pub const PathInt = if (use_small_path_string) PathIntLen else usize;
    /// Integer type of `ptr`.
    pub const PointerIntType = if (use_small_path_string) u53 else usize;

    /// Address of the first byte, stored as an integer (0 for the empty value).
    ptr: PointerIntType = 0,
    /// Length in bytes.
    len: PathInt = 0,

    const JSC = bun.JSC;

    /// Reports the length of the referenced path in bytes.
    pub fn estimatedSize(this: *const PathString) usize {
        const length: usize = this.len;
        return length;
    }

    /// Rebuild the borrowed slice from the stored address and length.
    pub inline fn slice(this: anytype) []const u8 {
        @setRuntimeSafety(false); // "cast causes pointer to be null" is fine here. if it is null, the len will be 0.
        const address: usize = @intCast(this.ptr);
        const base: [*]u8 = @ptrFromInt(address);
        return base[0..this.len];
    }

    /// Like `slice`, but typed as NUL-terminated. The caller vouches that a 0
    /// byte follows the path.
    pub inline fn sliceAssumeZ(this: anytype) [:0]const u8 {
        @setRuntimeSafety(false); // "cast causes pointer to be null" is fine here. if it is null, the len will be 0.
        const address: usize = @intCast(this.ptr);
        const base: [*:0]u8 = @ptrFromInt(address);
        return base[0..this.len :0];
    }

    /// Pack `str` (borrowed, not copied). Address and length are truncated to
    /// the field widths.
    pub inline fn init(str: []const u8) @This() {
        @setRuntimeSafety(false); // "cast causes pointer to be null" is fine here. if it is null, the len will be 0.
        const address: PointerIntType = @truncate(@intFromPtr(str.ptr));
        const length: PathInt = @truncate(str.len);
        return .{ .ptr = address, .len = length };
    }

    /// True when the stored length is zero.
    pub inline fn isEmpty(this: anytype) bool {
        return this.len == 0;
    }

    /// `std.fmt` hook: prints the path bytes verbatim.
    pub fn format(self: PathString, comptime _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
        return writer.writeAll(self.slice());
    }

    /// The empty path: address 0, length 0.
    pub const empty = @This(){ .ptr = 0, .len = 0 };

    comptime {
        if (!bun.Environment.isWasm) {
            const bits = @bitSizeOf(@This());
            if (use_small_path_string) {
                if (bits != 64) @compileError("PathString must be 64 bits");
            } else {
                if (bits != 128) @compileError("PathString must be 128 bits");
            }
        }
    }
};

208
src/string/SmolStr.zig Normal file
View File

@@ -0,0 +1,208 @@
const std = @import("std");
const BabyList = @import("../baby_list.zig").BabyList;
const Allocator = std.mem.Allocator;
const assert = std.debug.assert;
/// This is a string type that stores up to 15 bytes inline on the stack, and heap allocates if it is longer
///
/// The same 128 bits are read in one of two ways, selected by the top bit of
/// `__ptr` (`Tag`):
///   * tag clear: heap form, `__len` / `cap` / `__ptr` describe a `BabyList(u8)` buffer.
///   * tag set: inline form, see `Inlined`.
pub const SmolStr = packed struct {
/// Heap form: number of bytes in use. Inline form: part of the character data.
__len: u32,
/// Heap form: capacity of the allocation. Inline form: part of the character data.
cap: u32,
/// Heap form: pointer to the bytes. Its top bit is the inline tag; in the
/// inline form, bits 56..62 hold the length (see `len`).
__ptr: [*]u8,
/// Mask selecting the top bit of a 64-bit pointer word.
const Tag: usize = 0x8000000000000000;
/// Mask clearing the tag bit.
const NegatedTag: usize = ~Tag;
/// JSON hook: passes the string's bytes to `writer.write`.
pub fn jsonStringify(self: *const SmolStr, writer: anytype) !void {
try writer.write(self.slice());
}
/// Inline view of the 128 bits: 120 bits (15 bytes) of character data,
/// a 7-bit length, and the 1-bit tag.
pub const Inlined = packed struct {
data: u120,
__len: u7,
_tag: u1,
/// Inline length widened to `u8`.
pub fn len(this: Inlined) u8 {
return @intCast(this.__len);
}
/// Overwrite the inline length.
pub fn setLen(this: *Inlined, new_len: u7) void {
this.__len = new_len;
}
/// The first `__len` inline bytes.
pub fn slice(this: *Inlined) []const u8 {
return this.allChars()[0..this.__len];
}
/// The first 15 bytes of this value's storage, viewed as a byte array.
pub fn allChars(this: *Inlined) *[15]u8 {
return @as([*]u8, @ptrCast(@as(*u128, @ptrCast(this))))[0..15];
}
};
// Both views must be the same size so they can be bit-cast into each other.
comptime {
assert(@sizeOf(SmolStr) == @sizeOf(Inlined));
}
/// An inline string of length 0.
pub fn empty() SmolStr {
const inlined = Inlined{
.data = 0,
.__len = 0,
._tag = 1,
};
return SmolStr.fromInlined(inlined);
}
/// Length in bytes for either representation.
pub fn len(this: *const SmolStr) u32 {
if (this.isInlined()) {
// Inline length lives in bits 56..62 of the pointer word.
return @intCast((@intFromPtr(this.__ptr) >> 56) & 0b01111111);
}
return this.__len;
}
/// `__ptr` with the tag bit cleared.
pub fn ptr(this: *SmolStr) [*]u8 {
return @ptrFromInt(@as(usize, @intFromPtr(this.__ptr)) & NegatedTag);
}
/// Const variant of `ptr`.
pub fn ptrConst(this: *const SmolStr) [*]const u8 {
return @ptrFromInt(@as(usize, @intFromPtr(this.__ptr)) & NegatedTag);
}
/// Set the tag bit, marking the value as inline.
pub fn markInlined(this: *SmolStr) void {
this.__ptr = @ptrFromInt(@as(usize, @intFromPtr(this.__ptr)) | Tag);
}
/// Clear the tag bit, marking the value as heap-backed.
pub fn markHeap(this: *SmolStr) void {
this.__ptr = @ptrFromInt(@as(usize, @intFromPtr(this.__ptr)) & NegatedTag);
}
/// True when the tag bit is set.
pub fn isInlined(this: *const SmolStr) bool {
return @as(usize, @intFromPtr(this.__ptr)) & Tag != 0;
}
/// Reinterpret the bits as `Inlined`, forcing the tag bit to 1 in the copy.
pub fn toInlined(this: *const SmolStr) Inlined {
var inlined: Inlined = @bitCast(@as(u128, @bitCast(this.*)));
inlined._tag = 1;
return inlined;
}
/// Adopt the buffer described by `baby_list` as a heap-form string (no copy).
pub fn fromBabyList(baby_list: BabyList(u8)) SmolStr {
var smol_str: SmolStr = .{
.__len = baby_list.len,
.cap = baby_list.cap,
.__ptr = baby_list.ptr,
};
smol_str.markHeap();
return smol_str;
}
/// Bit-cast an `Inlined` into a `SmolStr` and make sure the tag bit is set.
pub fn fromInlined(inlined: Inlined) SmolStr {
var smol_str: SmolStr = @bitCast(inlined);
smol_str.markInlined();
return smol_str;
}
/// A one-byte inline string.
pub fn fromChar(char: u8) SmolStr {
var inlined = Inlined{
.data = 0,
.__len = 1,
._tag = 1,
};
inlined.allChars()[0] = char;
inlined.setLen(1);
return SmolStr.fromInlined(inlined);
}
/// Build from `values`: more than 15 bytes allocates from `allocator` and
/// copies; otherwise the bytes are stored inline and nothing is allocated.
pub fn fromSlice(allocator: Allocator, values: []const u8) Allocator.Error!SmolStr {
if (values.len > 15) {
var baby_list = try BabyList(u8).initCapacity(allocator, values.len);
baby_list.appendSliceAssumeCapacity(values);
return SmolStr.fromBabyList(baby_list);
}
var inlined = Inlined{
.data = 0,
.__len = 0,
._tag = 1,
};
if (values.len > 0) {
@memcpy(inlined.allChars()[0..values.len], values[0..values.len]);
inlined.setLen(@intCast(values.len));
}
return SmolStr.fromInlined(inlined);
}
/// The string's bytes. For the inline form the slice points into `this`
/// itself, so it is only valid while `this` stays at the same address.
pub fn slice(this: *const SmolStr) []const u8 {
if (this.isInlined()) {
const bytes: [*]const u8 = @ptrCast(this);
return bytes[0..this.len()];
}
return this.ptrConst()[0..this.__len];
}
/// Append one byte, moving from inline to heap storage when the result
/// would exceed 15 bytes.
pub fn appendChar(this: *SmolStr, allocator: Allocator, char: u8) Allocator.Error!void {
if (this.isInlined()) {
var inlined = this.toInlined();
if (inlined.len() + 1 > 15) {
// Spill: copy the inline bytes into a new heap buffer, then push the new byte.
var baby_list = try BabyList(u8).initCapacity(allocator, inlined.len() + 1);
baby_list.appendSliceAssumeCapacity(inlined.slice());
try baby_list.push(allocator, char);
// this.* = SmolStr.fromBabyList(baby_list);
this.__len = baby_list.len;
this.__ptr = baby_list.ptr;
this.cap = baby_list.cap;
this.markHeap();
return;
}
inlined.allChars()[inlined.len()] = char;
inlined.setLen(@intCast(inlined.len() + 1));
// this.* = SmolStr.fromInlined(inlined);
this.* = @bitCast(inlined);
this.markInlined();
return;
}
// Already on the heap: rebuild the BabyList view, push, and store the fields back.
var baby_list = BabyList(u8){
.ptr = this.ptr(),
.len = this.__len,
.cap = this.cap,
};
try baby_list.push(allocator, char);
// this.* = SmolStr.fromBabyList(baby_list);
this.__len = baby_list.len;
this.__ptr = baby_list.ptr;
this.cap = baby_list.cap;
return;
}
/// Append `values`, moving from inline to heap storage when the result
/// would exceed 15 bytes.
pub fn appendSlice(this: *SmolStr, allocator: Allocator, values: []const u8) Allocator.Error!void {
if (this.isInlined()) {
var inlined = this.toInlined();
if (inlined.len() + values.len > 15) {
// Spill: allocate exactly the combined length and copy both parts.
var baby_list = try BabyList(u8).initCapacity(allocator, inlined.len() + values.len);
baby_list.appendSliceAssumeCapacity(inlined.slice());
baby_list.appendSliceAssumeCapacity(values);
this.* = SmolStr.fromBabyList(baby_list);
return;
}
@memcpy(inlined.allChars()[inlined.len() .. inlined.len() + values.len], values);
inlined.setLen(@intCast(inlined.len() + values.len));
this.* = SmolStr.fromInlined(inlined);
return;
}
// Already on the heap: rebuild the BabyList view, append, and store it back.
var baby_list = BabyList(u8){
.ptr = this.ptr(),
.len = this.__len,
.cap = this.cap,
};
try baby_list.append(allocator, values);
this.* = SmolStr.fromBabyList(baby_list);
return;
}
};

View File

@@ -0,0 +1,244 @@
const StringBuilder = @This();
const std = @import("std");
const bun = @import("root").bun;
const Allocator = std.mem.Allocator;
const Environment = bun.Environment;
const assert = bun.assert;
const DebugHashTable = if (Environment.allow_assert) std.AutoHashMapUnmanaged(u64, void) else void;
/// Bytes written so far (offset of the next write).
len: usize = 0,
/// Total bytes required. The `count*` functions add to this before `allocate` is called.
cap: usize = 0,
/// Backing buffer of `cap` bytes, or null until `allocate` / `initCapacity` has run.
ptr: ?[*]u8 = null,
/// Create a builder with a freshly allocated buffer of exactly `cap` bytes.
pub fn initCapacity(
    allocator: std.mem.Allocator,
    cap: usize,
) Allocator.Error!StringBuilder {
    const backing = try allocator.alloc(u8, cap);
    return StringBuilder{
        .len = 0,
        .cap = cap,
        .ptr = backing.ptr,
    };
}
/// Reserve room for `slice` plus one terminating byte.
pub fn countZ(this: *StringBuilder, slice: []const u8) void {
    const needed = slice.len + 1;
    this.cap += needed;
}
/// Reserve room for `slice`.
pub fn count(this: *StringBuilder, slice: []const u8) void {
    const needed = slice.len;
    this.cap += needed;
}
/// Allocate the `cap` bytes accumulated by the `count*` calls and rewind `len`
/// to 0. Any buffer previously stored in `ptr` is overwritten, not freed.
pub fn allocate(this: *StringBuilder, allocator: Allocator) Allocator.Error!void {
    this.ptr = (try allocator.alloc(u8, this.cap)).ptr;
    this.len = 0;
}
/// Free the buffer with `allocator`. Does nothing when no buffer was allocated
/// or the capacity is zero. Fields are left untouched.
pub fn deinit(this: *StringBuilder, allocator: Allocator) void {
    const base = this.ptr orelse return;
    if (this.cap == 0) return;
    allocator.free(base[0..this.cap]);
}
/// Reserve room for the UTF-8 encoding of the UTF-16 `slice`, as measured by simdutf.
pub fn count16(this: *StringBuilder, slice: []const u16) void {
    this.cap += bun.simdutf.length.utf8.from.utf16.le(slice);
}
/// Reserve room for the UTF-8 encoding of the UTF-16 `slice` plus one terminating byte.
pub fn count16Z(this: *StringBuilder, slice: [:0]const u16) void {
    const utf8_len = bun.strings.elementLengthUTF16IntoUTF8([:0]const u16, slice);
    this.cap += utf8_len + 1;
}
/// Encode the UTF-16 `slice` as NUL-terminated UTF-8.
///
/// Fast path: simdutf converts directly into the builder's remaining space and
/// `len` advances by the encoded length plus the terminator.
///
/// Fallback, when simdutf reports an error: the string is converted into a
/// separate buffer from `fallback_allocator`. The builder is not advanced and
/// the caller owns that allocation. Returns null if the fallback allocation
/// fails.
///
/// Two fixes on the fallback path: the result is sliced from `out`, the list
/// the terminator was actually appended to, rather than from `list`; and the
/// partial buffer is released before returning null instead of being leaked.
pub fn append16(this: *StringBuilder, slice: []const u16, fallback_allocator: std.mem.Allocator) ?[:0]u8 {
    var buf = this.writable();
    if (slice.len == 0) {
        buf[0] = 0;
        this.len += 1;
        return buf[0..0 :0];
    }

    const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(slice, buf);
    if (result.status == .success) {
        this.len += result.count + 1;
        buf[result.count] = 0;
        return buf[0..result.count :0];
    }

    var list = std.ArrayList(u8).init(fallback_allocator);
    var out = bun.strings.toUTF8ListWithTypeBun(&list, []const u16, slice, false) catch {
        list.deinit();
        return null;
    };
    out.append(0) catch {
        out.deinit();
        return null;
    };
    return out.items[0 .. out.items.len - 1 :0];
}
/// Copy `slice` into the buffer followed by a 0 byte and return the
/// NUL-terminated copy. Space must have been reserved with `countZ`.
pub fn appendZ(this: *StringBuilder, slice: []const u8) [:0]const u8 {
    if (comptime Environment.allow_assert) {
        assert(this.len + 1 <= this.cap); // didn't count everything
        assert(this.ptr != null); // must call allocate first
    }

    const start = this.len;
    bun.copy(u8, this.ptr.?[start..this.cap], slice);
    this.ptr.?[start + slice.len] = 0;
    const copied = this.ptr.?[start..this.cap][0..slice.len :0];
    this.len = start + slice.len + 1;

    if (comptime Environment.allow_assert) assert(this.len <= this.cap);
    return copied;
}
/// Convert `str` to UTF-8 (temporary from `bun.default_allocator`, released
/// before returning) and append it. Returns the copy stored in the builder.
pub fn appendStr(this: *StringBuilder, str: bun.String) []const u8 {
    const utf8 = str.toUTF8(bun.default_allocator);
    defer utf8.deinit();
    const bytes = utf8.slice();
    return this.append(bytes);
}
/// Copy `slice` into the buffer and return the copy. Space must have been
/// reserved with `count`.
pub fn append(this: *StringBuilder, slice: []const u8) []const u8 {
    if (comptime Environment.allow_assert) {
        assert(this.len <= this.cap); // didn't count everything
        assert(this.ptr != null); // must call allocate first
    }

    const start = this.len;
    bun.copy(u8, this.ptr.?[start..this.cap], slice);
    const copied = this.ptr.?[start..this.cap][0..slice.len];
    this.len = start + slice.len;

    if (comptime Environment.allow_assert) assert(this.len <= this.cap);
    return copied;
}
/// Copy every element of `slices` back to back into the buffer and return one
/// `StringPointer` covering the concatenation.
pub fn addConcat(this: *StringBuilder, slices: []const []const u8) bun.StringPointer {
    var dest = this.allocatedSlice()[this.len..];
    var total: usize = 0;
    for (slices) |piece| {
        @memcpy(dest[0..piece.len], piece);
        dest = dest[piece.len..];
        total += piece.len;
    }
    return this.add(total);
}
/// Claim the next `len` bytes of the buffer without writing to them. Returns
/// their offset and length, both truncated to `u32`.
pub fn add(this: *StringBuilder, len: usize) bun.StringPointer {
    if (comptime Environment.allow_assert) {
        assert(this.len <= this.cap); // didn't count everything
        assert(this.ptr != null); // must call allocate first
    }

    const start = this.len;
    this.len = start + len;

    if (comptime Environment.allow_assert) assert(this.len <= this.cap);

    const offset: u32 = @truncate(start);
    const length: u32 = @truncate(len);
    return bun.StringPointer{ .offset = offset, .length = length };
}
/// Copy `slice` into the buffer and return its position as a `StringPointer`
/// (offset and length truncated to `u32`).
pub fn appendCount(this: *StringBuilder, slice: []const u8) bun.StringPointer {
    if (comptime Environment.allow_assert) {
        assert(this.len <= this.cap); // didn't count everything
        assert(this.ptr != null); // must call allocate first
    }

    const start = this.len;
    bun.copy(u8, this.ptr.?[start..this.cap], slice);
    // Kept from the original: slicing checks (in safe builds) that the copy fits.
    _ = this.ptr.?[start..this.cap][0..slice.len];
    this.len = start + slice.len;

    if (comptime Environment.allow_assert) assert(this.len <= this.cap);

    const offset: u32 = @truncate(start);
    const length: u32 = @truncate(slice.len);
    return bun.StringPointer{ .offset = offset, .length = length };
}
/// Copy `slice` into the buffer followed by a 0 byte. The returned
/// `StringPointer` covers only `slice` (the terminator is not counted in `length`).
pub fn appendCountZ(this: *StringBuilder, slice: []const u8) bun.StringPointer {
    if (comptime Environment.allow_assert) {
        assert(this.len <= this.cap); // didn't count everything
        assert(this.ptr != null); // must call allocate first
    }

    const start = this.len;
    bun.copy(u8, this.ptr.?[start..this.cap], slice);
    this.ptr.?[start + slice.len] = 0;
    // Kept from the original: slicing checks (in safe builds) that the copy fits.
    _ = this.ptr.?[start..this.cap][0..slice.len];
    this.len = start + slice.len + 1;

    if (comptime Environment.allow_assert) assert(this.len <= this.cap);

    const offset: u32 = @truncate(start);
    const length: u32 = @truncate(slice.len);
    return bun.StringPointer{ .offset = offset, .length = length };
}
/// Format directly into the remaining buffer space and return the formatted
/// text. Space must have been reserved with `fmtCount`; running out of space
/// is treated as unreachable.
pub fn fmt(this: *StringBuilder, comptime str: []const u8, args: anytype) []const u8 {
    if (comptime Environment.allow_assert) {
        assert(this.len <= this.cap); // didn't count everything
        assert(this.ptr != null); // must call allocate first
    }

    const free_space = this.ptr.?[this.len..this.cap];
    const printed = std.fmt.bufPrint(free_space, str, args) catch unreachable;
    this.len += printed.len;

    if (comptime Environment.allow_assert) assert(this.len <= this.cap);
    return printed;
}
/// Like `fmt`, but returns the position of the formatted text as a
/// `StringPointer` (offset and length truncated to `u32`).
pub fn fmtAppendCount(this: *StringBuilder, comptime str: []const u8, args: anytype) bun.StringPointer {
    if (comptime Environment.allow_assert) {
        assert(this.len <= this.cap); // didn't count everything
        assert(this.ptr != null); // must call allocate first
    }

    const start = this.len;
    const free_space = this.ptr.?[start..this.cap];
    const printed = std.fmt.bufPrint(free_space, str, args) catch unreachable;
    this.len = start + printed.len;

    if (comptime Environment.allow_assert) assert(this.len <= this.cap);

    const offset: u32 = @truncate(start);
    const length: u32 = @truncate(printed.len);
    return bun.StringPointer{ .offset = offset, .length = length };
}
/// Like `fmtAppendCount`, but the text is written with a trailing 0 byte
/// (`bufPrintZ`). The terminator consumes buffer space but is not counted in
/// the returned `length`.
pub fn fmtAppendCountZ(this: *StringBuilder, comptime str: []const u8, args: anytype) bun.StringPointer {
    if (comptime Environment.allow_assert) {
        assert(this.len <= this.cap); // didn't count everything
        assert(this.ptr != null); // must call allocate first
    }

    const start = this.len;
    const free_space = this.ptr.?[start..this.cap];
    const printed = std.fmt.bufPrintZ(free_space, str, args) catch unreachable;
    this.len = start + printed.len + 1;

    if (comptime Environment.allow_assert) assert(this.len <= this.cap);

    const offset: u32 = @truncate(start);
    const length: u32 = @truncate(printed.len);
    return bun.StringPointer{ .offset = offset, .length = length };
}
/// Reserve room for the text that formatting `str` with `args` would produce.
pub fn fmtCount(this: *StringBuilder, comptime str: []const u8, args: anytype) void {
    const needed = std.fmt.count(str, args);
    this.cap += needed;
}
/// The whole backing buffer (`cap` bytes), or an empty slice if nothing has
/// been allocated yet.
pub fn allocatedSlice(this: *StringBuilder) []u8 {
    if (this.ptr) |base| {
        if (comptime Environment.allow_assert) {
            assert(this.cap > 0);
        }
        return base[0..this.cap];
    }
    return &[_]u8{};
}
/// The part of the buffer not written yet (`len..cap`), or an empty slice if
/// nothing has been allocated yet.
pub fn writable(this: *StringBuilder) []u8 {
    if (this.ptr) |base| {
        if (comptime Environment.allow_assert) {
            assert(this.cap > 0);
        }
        return base[this.len..this.cap];
    }
    return &[_]u8{};
}

164
src/string/StringJoiner.zig Normal file
View File

@@ -0,0 +1,164 @@
//! Rope-like data structure for joining many small strings into one big string.
//! Implemented as a linked list of potentially-owned slices and a length.
const StringJoiner = @This();
const std = @import("std");
const default_allocator = bun.default_allocator;
const bun = @import("root").bun;
const Allocator = std.mem.Allocator;
const NullableAllocator = bun.NullableAllocator;
const assert = bun.assert;
/// Temporary allocator used for nodes and duplicated strings.
/// It is recommended to use a stack-fallback allocator for this.
allocator: Allocator,
/// Total length of all nodes
len: usize = 0,
/// First node of the list, or null while nothing has been pushed.
head: ?*Node = null,
/// Last node of the list; `push` links new nodes after it.
tail: ?*Node = null,
/// Avoid an extra pass over the list when joining
watcher: Watcher = .{},
/// One link of the list: a slice, the allocator that owns it (if any), and the
/// next link.
const Node = struct {
    /// Allocator used to free `slice` in `deinit`; empty for borrowed slices.
    allocator: NullableAllocator = .{},
    slice: []const u8 = "",
    next: ?*Node = null,

    /// Allocate a node from `joiner_alloc`. Pass `slice_alloc` when the node
    /// should free `slice` on `deinit`, or null when `slice` is only borrowed.
    /// Allocation failure is routed to `bun.outOfMemory()`.
    pub fn init(joiner_alloc: Allocator, slice: []const u8, slice_alloc: ?Allocator) *Node {
        const created = joiner_alloc.create(Node) catch bun.outOfMemory();
        created.* = .{
            .allocator = NullableAllocator.init(slice_alloc),
            .slice = slice,
            .next = null,
        };
        return created;
    }

    /// Free the slice through the node's allocator, then destroy the node itself.
    pub fn deinit(node: *Node, joiner_alloc: Allocator) void {
        const owner = node.allocator;
        owner.free(node.slice);
        joiner_alloc.destroy(node);
    }
};
/// Bookkeeping updated by `push` so callers do not have to rescan the pushed data.
pub const Watcher = struct {
/// Substring to look for in each pushed chunk; empty disables the search.
input: []const u8 = "",
/// Number of pushed chunks that contained `input` (at most one increment per push).
estimated_count: u32 = 0,
/// True when the most recently pushed chunk did not end in '\n'.
needs_newline: bool = false,
};
/// Append `data` by reference. The joiner neither copies nor frees it, so it
/// must stay valid until `.done` is called.
pub fn pushStatic(this: *StringJoiner, data: []const u8) void {
    const no_owner: ?Allocator = null;
    this.push(data, no_owner);
}
/// Append a private copy of `data`, duplicated with the joiner's allocator and
/// freed by the joiner. Empty input is ignored. Allocation failure is routed
/// to `bun.outOfMemory()`.
pub fn pushCloned(this: *StringJoiner, data: []const u8) void {
    if (data.len == 0) return;
    const duplicate = this.allocator.dupe(u8, data) catch bun.outOfMemory();
    this.push(duplicate, this.allocator);
}
/// Append `data` as a new tail node. When `allocator` is non-null the joiner
/// frees `data` with it once the node is destroyed; otherwise `data` is
/// borrowed. Empty input is ignored. Also updates the watcher: bumps
/// `estimated_count` if `data` contains `watcher.input`, and records whether
/// `data` ends in a newline.
pub fn push(this: *StringJoiner, data: []const u8, allocator: ?Allocator) void {
    if (data.len == 0) return;

    this.len += data.len;
    const new_tail = Node.init(this.allocator, data, allocator);

    const needle = this.watcher.input;
    if (needle.len > 0 and bun.strings.contains(data, needle)) {
        this.watcher.estimated_count += 1;
    }
    this.watcher.needs_newline = data[data.len - 1] != '\n';

    if (this.tail) |current_tail| {
        current_tail.next = new_tail;
    } else {
        assert(this.head == null);
        this.head = new_tail;
    }
    this.tail = new_tail;
}
/// This deinits the string joiner on success, the new string is owned by `allocator`
///
/// Returns an empty slice (no allocation) when nothing was pushed. If the
/// output allocation fails, the joiner is left untouched.
///
/// On success every node is destroyed and `head`, `tail` and `len` are reset.
/// Previously they kept pointing at the destroyed nodes, so a later `done`,
/// `lastByte`, `contains` or `push` would touch freed memory. `watcher` is
/// left as it was.
pub fn done(this: *StringJoiner, allocator: Allocator) ![]u8 {
    var current: ?*Node = this.head orelse {
        assert(this.tail == null);
        assert(this.len == 0);
        return &.{};
    };

    const slice = try allocator.alloc(u8, this.len);

    // From here on the nodes are consumed; detach them from the joiner first.
    this.head = null;
    this.tail = null;
    this.len = 0;

    var remaining = slice;
    while (current) |node| {
        @memcpy(remaining[0..node.slice.len], node.slice);
        remaining = remaining[node.slice.len..];
        // Read `next` before the node is destroyed.
        current = node.next;
        node.deinit(this.allocator);
    }
    bun.assert(remaining.len == 0);
    return slice;
}
/// Same as `.done`, but appends extra slice `end`
///
/// With nothing pushed, returns a copy of `end` (or an empty slice when `end`
/// is empty too). If the output allocation fails, the joiner is left untouched.
///
/// On success every node is destroyed and `head`, `tail` and `len` are reset.
/// Previously they kept pointing at the destroyed nodes, so any later use of
/// the joiner would touch freed memory. `watcher` is left as it was.
pub fn doneWithEnd(this: *StringJoiner, allocator: Allocator, end: []const u8) ![]u8 {
    var current: ?*Node = this.head orelse {
        assert(this.tail == null);
        assert(this.len == 0);
        if (end.len > 0) {
            return allocator.dupe(u8, end);
        }
        return &.{};
    };

    const slice = try allocator.alloc(u8, this.len + end.len);

    // From here on the nodes are consumed; detach them from the joiner first.
    this.head = null;
    this.tail = null;
    this.len = 0;

    var remaining = slice;
    while (current) |node| {
        @memcpy(remaining[0..node.slice.len], node.slice);
        remaining = remaining[node.slice.len..];
        // Read `next` before the node is destroyed.
        current = node.next;
        node.deinit(this.allocator);
    }
    bun.assert(remaining.len == end.len);
    @memcpy(remaining, end);
    return slice;
}
/// Final byte of the most recently pushed chunk, or 0 when nothing has been pushed.
pub fn lastByte(this: *const StringJoiner) u8 {
    const tail = this.tail orelse return 0;
    const bytes = tail.slice;
    assert(bytes.len > 0);
    return bytes[bytes.len - 1];
}
/// Push a "\n" if the watcher recorded that the last chunk did not end in one.
pub fn ensureNewlineAtEnd(this: *StringJoiner) void {
    if (!this.watcher.needs_newline) return;
    this.watcher.needs_newline = false;
    this.pushStatic("\n");
}
/// True when any single pushed chunk contains `slice`. Matches that straddle
/// two chunks are not found, because each node is searched on its own.
pub fn contains(this: *const StringJoiner, slice: []const u8) bool {
    var cursor = this.head;
    while (cursor) |node| : (cursor = node.next) {
        if (bun.strings.contains(node.slice, slice)) return true;
    }
    return false;
}

View File

@@ -0,0 +1,268 @@
const std = @import("std");
const bun = @import("root").bun;
const JSC = bun.JSC;
const OOM = bun.OOM;
pub const WTFStringImpl = *WTFStringImplStruct;
const ZigString = bun.JSC.ZigString;
/// Zig-side view of WebKit's string implementation object.
///
/// The field layout and the flag constants below must stay in sync with
/// WTFStringImpl.h in WebKit. The character data is either 8-bit (latin1) or
/// 16-bit (UTF-16); `is8Bit()` tells which member of `m_ptr` is active.
pub const WTFStringImplStruct = extern struct {
    /// Reference count; see `s_refCountFlagIsStaticString` and
    /// `s_refCountIncrement` for how the bits are laid out.
    m_refCount: u32 = 0,
    /// Number of code units (not bytes) in the string.
    m_length: u32 = 0,
    m_ptr: extern union { latin1: [*]const u8, utf16: [*]const u16 },
    m_hashAndFlags: u32 = 0,

    // ---------------------------------------------------------------------
    // These details must stay in sync with WTFStringImpl.h in WebKit!
    // ---------------------------------------------------------------------
    const s_flagCount: u32 = 8;
    const s_flagMask: u32 = (1 << s_flagCount) - 1;
    const s_flagStringKindCount: u32 = 4;
    const s_hashZeroValue: u32 = 0;
    const s_hashFlagStringKindIsAtom: u32 = @as(u32, 1) << (s_flagStringKindCount);
    const s_hashFlagStringKindIsSymbol: u32 = @as(u32, 1) << (s_flagStringKindCount + 1);
    const s_hashMaskStringKind: u32 = s_hashFlagStringKindIsAtom | s_hashFlagStringKindIsSymbol;
    const s_hashFlagDidReportCost: u32 = @as(u32, 1) << 3;
    const s_hashFlag8BitBuffer: u32 = 1 << 2;
    const s_hashMaskBufferOwnership: u32 = (1 << 0) | (1 << 1);

    /// The bottom bit in the ref count indicates a static (immortal) string.
    const s_refCountFlagIsStaticString = 0x1;
    /// This allows us to ref / deref without disturbing the static string flag.
    const s_refCountIncrement = 0x2;
    // ---------------------------------------------------------------------

    /// Number of references, with the static-string flag bit divided out.
    pub fn refCount(this: WTFStringImpl) u32 {
        return this.m_refCount / s_refCountIncrement;
    }

    /// Memory cost of the string, reported as the byte length of its character data.
    pub fn memoryCost(this: WTFStringImpl) usize {
        return this.byteLength();
    }

    /// Whether the static (immortal) flag, the bottom bit of the ref count, is set.
    pub fn isStatic(this: WTFStringImpl) bool {
        return (this.m_refCount & s_refCountFlagIsStaticString) != 0;
    }

    /// Size in bytes of the character data: one byte per code unit for 8-bit
    /// strings, two bytes per code unit otherwise.
    pub fn byteLength(this: WTFStringImpl) usize {
        return if (this.is8Bit()) this.m_length else this.m_length * 2;
    }

    extern fn WTFStringImpl__isThreadSafe(WTFStringImpl) bool;

    /// Forwards to the native `WTFStringImpl__isThreadSafe` binding.
    pub fn isThreadSafe(this: WTFStringImpl) bool {
        return WTFStringImpl__isThreadSafe(this);
    }

    /// Raw bytes of the character data, regardless of whether it is 8-bit or 16-bit.
    pub fn byteSlice(this: WTFStringImpl) []const u8 {
        return this.m_ptr.latin1[0..this.byteLength()];
    }

    /// Whether the 8-bit-buffer flag is set in `m_hashAndFlags`.
    pub inline fn is8Bit(self: WTFStringImpl) bool {
        return (self.m_hashAndFlags & s_hashFlag8BitBuffer) != 0;
    }

    /// Length in code units.
    pub inline fn length(self: WTFStringImpl) u32 {
        return self.m_length;
    }

    /// The 16-bit code units. Asserts that the string is not 8-bit.
    pub inline fn utf16Slice(self: WTFStringImpl) []const u16 {
        bun.assert(!is8Bit(self));
        return self.m_ptr.utf16[0..length(self)];
    }

    /// The 8-bit (latin1) bytes. Asserts that the string is 8-bit.
    pub inline fn latin1Slice(self: WTFStringImpl) []const u8 {
        bun.assert(is8Bit(self));
        return self.m_ptr.latin1[0..length(self)];
    }

    /// Caller must ensure that the string is 8-bit and ASCII.
    pub inline fn utf8Slice(self: WTFStringImpl) []const u8 {
        if (comptime bun.Environment.allow_assert)
            bun.assert(canUseAsUTF8(self));
        return self.m_ptr.latin1[0..length(self)];
    }

    /// Wraps the character data in a `ZigString` without copying, picking the
    /// latin1 or UTF-16 constructor based on `is8Bit()`.
    pub fn toZigString(this: WTFStringImpl) ZigString {
        if (this.is8Bit()) {
            return ZigString.init(this.latin1Slice());
        } else {
            return ZigString.initUTF16(this.utf16Slice());
        }
    }

    /// Drops one reference via the native binding.
    /// Asserts that the ref count was above zero beforehand.
    pub inline fn deref(self: WTFStringImpl) void {
        JSC.markBinding(@src());
        const current_count = self.refCount();
        bun.assert(current_count > 0);
        Bun__WTFStringImpl__deref(self);
        if (comptime bun.Environment.allow_assert) {
            // Only inspect `self` afterwards when other references were
            // outstanding before the deref.
            if (current_count > 1) {
                bun.assert(self.refCount() < current_count or self.isStatic());
            }
        }
    }

    /// Adds one reference via the native binding.
    /// Asserts that the ref count was above zero beforehand.
    pub inline fn ref(self: WTFStringImpl) void {
        JSC.markBinding(@src());
        const current_count = self.refCount();
        bun.assert(current_count > 0);
        Bun__WTFStringImpl__ref(self);
        bun.assert(self.refCount() > current_count or self.isStatic());
    }

    /// Takes a reference on the string and returns its latin1 bytes as a
    /// slice whose allocator is `refCountAllocator()`, so that freeing the
    /// slice releases the reference.
    pub fn toLatin1Slice(this: WTFStringImpl) ZigString.Slice {
        this.ref();
        return ZigString.Slice.init(this.refCountAllocator(), this.latin1Slice());
    }

    extern fn Bun__WTFStringImpl__ensureHash(this: WTFStringImpl) void;
    /// Compute the hash() if necessary
    pub fn ensureHash(this: WTFStringImpl) void {
        JSC.markBinding(@src());
        Bun__WTFStringImpl__ensureHash(this);
    }

    /// Returns the string as a UTF-8 slice.
    ///
    /// 8-bit strings: when `toUTF8FromLatin1` produces a converted buffer it is
    /// returned as owned by `allocator`; when it returns null the string is
    /// ref'd and its latin1 bytes are returned directly (see `toLatin1Slice`).
    /// 16-bit strings are always converted into a buffer owned by `allocator`.
    /// Allocation failure goes through `bun.outOfMemory()`.
    pub fn toUTF8(this: WTFStringImpl, allocator: std.mem.Allocator) ZigString.Slice {
        if (this.is8Bit()) {
            if (bun.strings.toUTF8FromLatin1(allocator, this.latin1Slice()) catch bun.outOfMemory()) |utf8| {
                return ZigString.Slice.init(allocator, utf8.items);
            }

            return this.toLatin1Slice();
        }

        return ZigString.Slice.init(
            allocator,
            bun.strings.toUTF8Alloc(allocator, this.utf16Slice()) catch bun.outOfMemory(),
        );
    }

    pub const max = std.math.maxInt(u32);

    /// Like `toUTF8`, but when no conversion buffer is produced for an 8-bit
    /// string the latin1 bytes are returned via `fromUTF8NeverFree` without
    /// taking a reference on the string.
    pub fn toUTF8WithoutRef(this: WTFStringImpl, allocator: std.mem.Allocator) ZigString.Slice {
        if (this.is8Bit()) {
            if (bun.strings.toUTF8FromLatin1(allocator, this.latin1Slice()) catch bun.outOfMemory()) |utf8| {
                return ZigString.Slice.init(allocator, utf8.items);
            }

            return ZigString.Slice.fromUTF8NeverFree(this.latin1Slice());
        }

        return ZigString.Slice.init(
            allocator,
            bun.strings.toUTF8Alloc(allocator, this.utf16Slice()) catch bun.outOfMemory(),
        );
    }

    /// Returns a zero-terminated UTF-8 copy of the string allocated from
    /// `allocator`. Allocation failure goes through `bun.outOfMemory()`.
    pub fn toOwnedSliceZ(this: WTFStringImpl, allocator: std.mem.Allocator) [:0]u8 {
        if (this.is8Bit()) {
            if (bun.strings.toUTF8FromLatin1Z(allocator, this.latin1Slice()) catch bun.outOfMemory()) |utf8| {
                // The final item is the terminator; exclude it from the slice length.
                return utf8.items[0 .. utf8.items.len - 1 :0];
            }

            return allocator.dupeZ(u8, this.latin1Slice()) catch bun.outOfMemory();
        }
        return bun.strings.toUTF8AllocZ(allocator, this.utf16Slice()) catch bun.outOfMemory();
    }

    /// Like `toUTF8`, but returns null instead of a slice when
    /// `toUTF8FromLatin1` produces no converted buffer for an 8-bit string.
    pub fn toUTF8IfNeeded(this: WTFStringImpl, allocator: std.mem.Allocator) ?ZigString.Slice {
        if (this.is8Bit()) {
            if (bun.strings.toUTF8FromLatin1(allocator, this.latin1Slice()) catch bun.outOfMemory()) |utf8| {
                return ZigString.Slice.init(allocator, utf8.items);
            }

            return null;
        }

        return ZigString.Slice.init(
            allocator,
            bun.strings.toUTF8Alloc(allocator, this.utf16Slice()) catch bun.outOfMemory(),
        );
    }

    /// Avoid using this in code paths that are about to get the string as a UTF-8
    /// In that case, use toUTF8IfNeeded instead.
    pub fn canUseAsUTF8(this: WTFStringImpl) bool {
        return this.is8Bit() and bun.strings.isAllASCII(this.latin1Slice());
    }

    /// Number of bytes the string occupies when encoded as UTF-16.
    /// Every code unit, whether stored as 8-bit or 16-bit, takes two bytes.
    pub fn utf16ByteLength(this: WTFStringImpl) usize {
        return @as(usize, this.length()) * 2;
    }

    /// Number of bytes the string occupies when encoded as UTF-8.
    pub fn utf8ByteLength(this: WTFStringImpl) usize {
        if (this.is8Bit()) {
            const input = this.latin1Slice();
            return if (input.len > 0) JSC.WebCore.Encoder.byteLengthU8(input.ptr, input.len, .utf8) else 0;
        } else {
            const input = this.utf16Slice();
            return if (input.len > 0) JSC.WebCore.Encoder.byteLengthU16(input.ptr, input.len, .utf8) else 0;
        }
    }

    /// Number of bytes the string occupies as latin1: one per code unit.
    pub fn latin1ByteLength(this: WTFStringImpl) usize {
        // Not all UTF-16 characters are representable in latin1.
        // Those get truncated?
        return this.length();
    }

    /// Builds a `std.mem.Allocator` whose context pointer is this string and
    /// whose vtable is `StringImplAllocator.VTablePtr`.
    pub fn refCountAllocator(self: WTFStringImpl) std.mem.Allocator {
        return std.mem.Allocator{ .ptr = self, .vtable = StringImplAllocator.VTablePtr };
    }

    /// Forwards to the native `Bun__WTFStringImpl__hasPrefix` binding with
    /// `text`'s pointer and length.
    pub fn hasPrefix(self: WTFStringImpl, text: []const u8) bool {
        return Bun__WTFStringImpl__hasPrefix(self, text.ptr, text.len);
    }

    extern fn Bun__WTFStringImpl__deref(self: WTFStringImpl) void;
    extern fn Bun__WTFStringImpl__ref(self: WTFStringImpl) void;
    extern fn Bun__WTFStringImpl__hasPrefix(self: *const WTFStringImplStruct, offset: [*]const u8, length: usize) bool;
};
/// Allocator vtable whose context pointer is a `WTFStringImpl`.
///
/// It never allocates memory of its own: `alloc` hands out the string's
/// existing buffer after taking a reference, and `free` drops a reference.
pub const StringImplAllocator = struct {
    /// Returns the string's own buffer (after `ref()`) when the requested
    /// length equals the string's byte length; otherwise returns null.
    fn alloc(ptr: *anyopaque, len: usize, _: u8, _: usize) ?[*]u8 {
        const impl = bun.cast(WTFStringImpl, ptr);

        // we don't actually allocate, we just reference count
        if (impl.byteLength() != len) return null;

        impl.ref();

        // we should never actually allocate
        return @constCast(impl.m_ptr.latin1);
    }

    /// Resizing is never supported.
    fn resize(_: *anyopaque, _: []u8, _: u8, _: usize, _: usize) bool {
        return false;
    }

    /// Drops one reference on the string. Asserts that `buf` is exactly the
    /// string's latin1 slice.
    pub fn free(
        ptr: *anyopaque,
        buf: []u8,
        _: u8,
        _: usize,
    ) void {
        const impl = bun.cast(WTFStringImpl, ptr);
        const backing = impl.latin1Slice();
        bun.assert(backing.ptr == buf.ptr);
        bun.assert(backing.len == buf.len);
        impl.deref();
    }

    pub const VTable = std.mem.Allocator.VTable{
        .alloc = &alloc,
        .resize = &resize,
        .free = &free,
    };

    pub const VTablePtr = &VTable;
};