Fix bun.String.toOwnedSliceReturningAllASCII (#23925)

`bun.String.toOwnedSliceReturningAllASCII` is supposed to return a
boolean indicating whether or not the string is entirely composed of
ASCII characters. However, the current implementation frequently
produces incorrect results:

* If the string is a `ZigString`, it always returns true, even though
`ZigString`s can be UTF-16 or Latin-1.
* If the string is a `StaticZigString`, it always returns false, even
though `StaticZigStrings` can be all ASCII.
* If the string is a 16-bit `WTFStringImpl`, it always returns false,
even though 16-bit `WTFString`s can be all ASCII.
* If the string is empty, it always returns false, even though empty
strings are valid ASCII strings.

`toOwnedSliceReturningAllASCII` is currently used in two places, both of
which assume its answer is accurate:

* `bun.webcore.Blob.fromJSWithoutDeferGC`
* `bun.api.ServerConfig.fromJS`

(For internal tracking: fixes ENG-21249)
This commit is contained in:
taylor.fish
2025-10-21 18:42:39 -07:00
committed by GitHub
parent 06eea5213a
commit d846e9a1e7
4 changed files with 55 additions and 33 deletions

View File

@@ -74,27 +74,48 @@ pub const String = extern struct {
return BunString__transferToJS(this, globalThis);
}
pub fn toOwnedSlice(this: String, allocator: std.mem.Allocator) ![]u8 {
const bytes, _ = try this.toOwnedSliceReturningAllASCII(allocator);
pub fn toOwnedSlice(this: String, allocator: std.mem.Allocator) OOM![]u8 {
const bytes, _ = try this.toOwnedSliceImpl(allocator);
return bytes;
}
/// Returns `.{ utf8_bytes, is_all_ascii }`.
///
/// `false` means the string contains at least one non-ASCII character.
pub fn toOwnedSliceReturningAllASCII(this: String, allocator: std.mem.Allocator) OOM!struct { []u8, bool } {
switch (this.tag) {
.ZigString => return .{ try this.value.ZigString.toOwnedSlice(allocator), true },
.WTFStringImpl => {
var utf8_slice = this.value.WTFStringImpl.toUTF8WithoutRef(allocator);
if (utf8_slice.allocator.get()) |alloc| {
if (!isWTFAllocator(alloc)) {
return .{ @constCast(utf8_slice.slice()), false };
}
}
const bytes, const ascii_status = try this.toOwnedSliceImpl(allocator);
const is_ascii = switch (ascii_status) {
.all_ascii => true,
.non_ascii => false,
.unknown => bun.strings.isAllASCII(bytes),
};
return .{ bytes, is_ascii };
}
return .{ @constCast((try utf8_slice.cloneIfNeeded(allocator)).slice()), true };
fn toOwnedSliceImpl(this: String, allocator: std.mem.Allocator) !struct { []u8, AsciiStatus } {
return switch (this.tag) {
.ZigString => .{ try this.value.ZigString.toOwnedSlice(allocator), .unknown },
.WTFStringImpl => blk: {
const utf8_slice = this.value.WTFStringImpl.toUTF8WithoutRef(allocator);
// `utf8_slice.allocator` is either null, or `allocator`.
errdefer utf8_slice.deinit();
const ascii_status: AsciiStatus = if (utf8_slice.allocator.isNull())
.all_ascii // no allocation means the string was 8-bit and all ascii
else if (this.value.WTFStringImpl.is8Bit())
.non_ascii // otherwise the allocator would be null for an 8-bit string
else
.unknown; // string was 16-bit; may or may not be all ascii
const owned_slice = try utf8_slice.cloneIfNeeded(allocator);
// `owned_slice.allocator` is guaranteed to be `allocator`.
break :blk .{ owned_slice.mut(), ascii_status };
},
.StaticZigString => return .{ try this.value.StaticZigString.toOwnedSlice(allocator), false },
else => return .{ &[_]u8{}, false },
}
.StaticZigString => .{
try this.value.StaticZigString.toOwnedSlice(allocator), .unknown,
},
else => return .{ &.{}, .all_ascii }, // trivially all ascii
};
}
pub fn createIfDifferent(other: String, utf8_slice: []const u8) String {
@@ -1237,6 +1258,7 @@ const std = @import("std");
const bun = @import("bun");
const JSError = bun.JSError;
const OOM = bun.OOM;
const AsciiStatus = bun.strings.AsciiStatus;
const jsc = bun.jsc;
const JSValue = bun.jsc.JSValue;