diff --git a/src/bun.js/bindings/ZigString.zig b/src/bun.js/bindings/ZigString.zig
index abf9f61111..ee91221403 100644
--- a/src/bun.js/bindings/ZigString.zig
+++ b/src/bun.js/bindings/ZigString.zig
@@ -412,7 +412,9 @@ pub const ZigString = extern struct {
         }
 
+        /// Returns the bytes as a mutable slice. Asserts that the slice is not borrowed.
         pub fn mut(this: Slice) []u8 {
-            return @as([*]u8, @ptrFromInt(@intFromPtr(this.ptr)))[0..this.len];
+            bun.assertf(!this.allocator.isNull(), "cannot mutate a borrowed ZigString.Slice", .{});
+            return @constCast(this.ptr)[0..this.len];
         }
 
         /// Does nothing if the slice is not allocated
diff --git a/src/bun.js/webcore/Blob.zig b/src/bun.js/webcore/Blob.zig
index d5d5ab337e..68c85a3138 100644
--- a/src/bun.js/webcore/Blob.zig
+++ b/src/bun.js/webcore/Blob.zig
@@ -34,7 +34,7 @@ content_type_was_set: bool = false,
 
 /// JavaScriptCore strings are either latin1 or UTF-16
 /// When UTF-16, they're nearly always due to non-ascii characters
-charset: Charset = .unknown,
+charset: strings.AsciiStatus = .unknown,
 
 /// Was it created via file constructor?
 is_jsdom_file: bool = false,
@@ -3244,7 +3244,7 @@ pub fn initWithAllASCII(bytes: []u8, allocator: std.mem.Allocator, globalThis: *
         .store = store,
         .content_type = "",
         .globalThis = globalThis,
-        .charset = .fromIsAllASCII(is_all_ascii),
+        .charset = .fromBool(is_all_ascii),
     };
 }
 
@@ -3423,7 +3423,7 @@ pub fn sharedView(this: *const Blob) []const u8 {
 pub const Lifetime = jsc.WebCore.Lifetime;
 
 pub fn setIsASCIIFlag(this: *Blob, is_all_ascii: bool) void {
-    this.charset = .fromIsAllASCII(is_all_ascii);
+    this.charset = .fromBool(is_all_ascii);
     // if this Blob represents the entire binary data
     // which will be pretty common
     // we can update the store's is_all_ascii flag
@@ -4735,20 +4735,6 @@ pub fn FileCloser(comptime This: type) type {
     };
 }
 
-/// This takes up less space than a `?bool`.
-pub const Charset = enum {
-    unknown,
-    all_ascii,
-    non_ascii,
-
-    pub fn fromIsAllASCII(is_all_ascii: ?bool) Charset {
-        return if (is_all_ascii orelse return .unknown)
-            .all_ascii
-        else
-            .non_ascii;
-    }
-};
-
 pub fn isAllASCII(self: *const Blob) ?bool {
     return switch (self.charset) {
         .unknown => null,
diff --git a/src/string.zig b/src/string.zig
index cb1ed9d85d..83e10a5a85 100644
--- a/src/string.zig
+++ b/src/string.zig
@@ -74,27 +74,52 @@ pub const String = extern struct {
         return BunString__transferToJS(this, globalThis);
     }
 
-    pub fn toOwnedSlice(this: String, allocator: std.mem.Allocator) ![]u8 {
-        const bytes, _ = try this.toOwnedSliceReturningAllASCII(allocator);
+    pub fn toOwnedSlice(this: String, allocator: std.mem.Allocator) OOM![]u8 {
+        const bytes, _ = try this.toOwnedSliceImpl(allocator);
         return bytes;
     }
 
+    /// Returns `.{ utf8_bytes, is_all_ascii }`.
+    ///
+    /// `false` means the string contains at least one non-ASCII character.
     pub fn toOwnedSliceReturningAllASCII(this: String, allocator: std.mem.Allocator) OOM!struct { []u8, bool } {
-        switch (this.tag) {
-            .ZigString => return .{ try this.value.ZigString.toOwnedSlice(allocator), true },
-            .WTFStringImpl => {
-                var utf8_slice = this.value.WTFStringImpl.toUTF8WithoutRef(allocator);
-                if (utf8_slice.allocator.get()) |alloc| {
-                    if (!isWTFAllocator(alloc)) {
-                        return .{ @constCast(utf8_slice.slice()), false };
-                    }
-                }
+        const bytes, const ascii_status = try this.toOwnedSliceImpl(allocator);
+        const is_ascii = switch (ascii_status) {
+            .all_ascii => true,
+            .non_ascii => false,
+            .unknown => bun.strings.isAllASCII(bytes),
+        };
+        return .{ bytes, is_ascii };
+    }
 
-                return .{ @constCast((try utf8_slice.cloneIfNeeded(allocator)).slice()), true };
+    /// Returns `.{ utf8_bytes, ascii_status }`.
+    ///
+    /// `.unknown` means ASCII-ness was not determined here; callers that need a
+    /// definite answer must check the bytes (see `toOwnedSliceReturningAllASCII`).
+    fn toOwnedSliceImpl(this: String, allocator: std.mem.Allocator) OOM!struct { []u8, AsciiStatus } {
+        return switch (this.tag) {
+            .ZigString => .{ try this.value.ZigString.toOwnedSlice(allocator), .unknown },
+            .WTFStringImpl => blk: {
+                const utf8_slice = this.value.WTFStringImpl.toUTF8WithoutRef(allocator);
+                // `utf8_slice.allocator` is either null, or `allocator`.
+                errdefer utf8_slice.deinit();
+
+                const ascii_status: AsciiStatus = if (utf8_slice.allocator.isNull())
+                    .all_ascii // no allocation means the string was 8-bit and all ascii
+                else if (this.value.WTFStringImpl.is8Bit())
+                    .non_ascii // otherwise the allocator would be null for an 8-bit string
+                else
+                    .unknown; // string was 16-bit; may or may not be all ascii
+
+                const owned_slice = try utf8_slice.cloneIfNeeded(allocator);
+                // `owned_slice.allocator` is guaranteed to be `allocator`.
+                break :blk .{ owned_slice.mut(), ascii_status };
             },
-            .StaticZigString => return .{ try this.value.StaticZigString.toOwnedSlice(allocator), false },
-            else => return .{ &[_]u8{}, false },
-        }
+            .StaticZigString => .{
+                try this.value.StaticZigString.toOwnedSlice(allocator), .unknown,
+            },
+            else => .{ &.{}, .all_ascii }, // trivially all ascii
+        };
     }
 
     pub fn createIfDifferent(other: String, utf8_slice: []const u8) String {
@@ -1237,6 +1262,7 @@ const std = @import("std");
 const bun = @import("bun");
 const JSError = bun.JSError;
 const OOM = bun.OOM;
+const AsciiStatus = bun.strings.AsciiStatus;
 
 const jsc = bun.jsc;
 const JSValue = bun.jsc.JSValue;
diff --git a/src/string/immutable.zig b/src/string/immutable.zig
index 04bb476dce..ce36729315 100644
--- a/src/string/immutable.zig
+++ b/src/string/immutable.zig
@@ -10,6 +10,22 @@ pub const Encoding = enum {
     utf16,
 };
 
+/// Whether a string is known to be all ASCII.
+/// This takes up less space than a `?bool`.
+pub const AsciiStatus = enum {
+    unknown,
+    all_ascii,
+    non_ascii,
+
+    /// `null` maps to `.unknown`.
+    pub fn fromBool(is_all_ascii: ?bool) AsciiStatus {
+        return if (is_all_ascii orelse return .unknown)
+            .all_ascii
+        else
+            .non_ascii;
+    }
+};
+
 /// Returned by classification functions that do not discriminate between utf8 and ascii.
 pub const EncodingNonAscii = enum {
     utf8,