diff --git a/cmake/sources/ZigSources.txt b/cmake/sources/ZigSources.txt
index f765dcbd29..96b823856e 100644
--- a/cmake/sources/ZigSources.txt
+++ b/cmake/sources/ZigSources.txt
@@ -673,12 +673,16 @@ src/StaticHashMap.zig
 src/string_immutable.zig
 src/string_types.zig
 src/string.zig
+src/string/escapeHTML.zig
 src/string/HashedString.zig
 src/string/MutableString.zig
+src/string/paths.zig
 src/string/PathString.zig
 src/string/SmolStr.zig
 src/string/StringBuilder.zig
 src/string/StringJoiner.zig
+src/string/unicode.zig
+src/string/visible.zig
 src/string/WTFStringImpl.zig
 src/sync.zig
 src/sys_uv.zig
diff --git a/src/string/escapeHTML.zig b/src/string/escapeHTML.zig
new file mode 100644
index 0000000000..d82fd793f2
--- /dev/null
+++ b/src/string/escapeHTML.zig
@@ -0,0 +1,659 @@
+/// Escapes `"`, `&`, `'`, `<` and `>` in a Latin-1 byte string as HTML entities.
+///
+/// Returns `.original` when no byte needs escaping (nothing is allocated),
+/// `.static` for empty/one-byte inputs that map to a string literal, and
+/// `.allocated` for an escaped copy allocated with `allocator`.
+/// Allocation failures are returned to the caller.
+pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) !Escaped(u8) {
+    const Scalar = struct {
+        // Output length of each byte once escaped (1 when the byte is copied as-is).
+        pub const lengths: [std.math.maxInt(u8) + 1]u4 = brk: {
+            var values: [std.math.maxInt(u8) + 1]u4 = undefined;
+            for (values, 0..) |_, i| {
+                switch (i) {
+                    '"' => {
+                        values[i] = "&quot;".len;
+                    },
+                    '&' => {
+                        values[i] = "&amp;".len;
+                    },
+                    '\'' => {
+                        values[i] = "&#x27;".len;
+                    },
+                    '<' => {
+                        values[i] = "&lt;".len;
+                    },
+                    '>' => {
+                        values[i] = "&gt;".len;
+                    },
+                    else => {
+                        values[i] = 1;
+                    },
+                }
+            }
+
+            break :brk values;
+        };
+
+        fn appendString(buf: [*]u8, comptime str: []const u8) callconv(bun.callconv_inline) usize {
+            buf[0..str.len].* = str[0..str.len].*;
+            return str.len;
+        }
+
+        pub fn append(buf: [*]u8, char: u8) callconv(bun.callconv_inline) usize {
+            if (lengths[char] == 1) {
+                buf[0] = char;
+                return 1;
+            }
+
+            return switch (char) {
+                '"' => appendString(buf, "&quot;"),
+                '&' => appendString(buf, "&amp;"),
+                '\'' => appendString(buf, "&#x27;"),
+                '<' => appendString(buf, "&lt;"),
+                '>' => appendString(buf, "&gt;"),
+                else => unreachable,
+            };
+        }
+
+        pub fn push(comptime len: anytype, chars_: *const [len]u8, allo: std.mem.Allocator) callconv(bun.callconv_inline) !Escaped(u8) {
+            const chars = chars_.*;
+            var total: usize = 0;
+
+            comptime var remain_to_comp = len;
+            comptime var comp_i = 0;
+
+            inline while (remain_to_comp > 0) : (remain_to_comp -= 1) {
+                total += lengths[chars[comp_i]];
+                comp_i += 1;
+            }
+
+            if (total == len) {
+                return .{ .original = {} };
+            }
+
+            const output = try allo.alloc(u8, total);
+            var head = output.ptr;
+            inline for (comptime bun.range(0, len)) |i| {
+                head += @This().append(head, chars[i]);
+            }
+
+            return Escaped(u8){ .allocated = output };
+        }
+    };
+    @setEvalBranchQuota(5000);
+    switch (latin1.len) {
+        0 => return Escaped(u8){ .static = "" },
+        1 => return switch (latin1[0]) {
+            '"' => Escaped(u8){ .static = "&quot;" },
+            '&' => Escaped(u8){ .static = "&amp;" },
+            '\'' => Escaped(u8){ .static = "&#x27;" },
+            '<' => Escaped(u8){ .static = "&lt;" },
+            '>' => Escaped(u8){ .static = "&gt;" },
+            else => Escaped(u8){ .original = {} },
+        },
+        2 => {
+            const first: []const u8 = switch (latin1[0]) {
+                '"' => "&quot;",
+                '&' => "&amp;",
+                '\'' => "&#x27;",
+                '<' => "&lt;",
+                '>' => "&gt;",
+                else => latin1[0..1],
+            };
+            const second: []const u8 = switch (latin1[1]) {
+                '"' => "&quot;",
+                '&' => "&amp;",
+                '\'' => "&#x27;",
+                '<' => "&lt;",
+                '>' => "&gt;",
+                else => latin1[1..2],
+            };
+            if (first.len == 1 and second.len == 1) {
+                return Escaped(u8){ .original = {} };
+            }
+
+            return Escaped(u8){ .allocated = try strings.append(allocator, first, second) };
+        },
+
+        // The simd implementation is slower for inputs less than 32 bytes.
+        3 => return Scalar.push(3, latin1[0..3], allocator),
+        4 => return Scalar.push(4, latin1[0..4], allocator),
+        5 => return Scalar.push(5, latin1[0..5], allocator),
+        6 => return Scalar.push(6, latin1[0..6], allocator),
+        7 => return Scalar.push(7, latin1[0..7], allocator),
+        8 => return Scalar.push(8, latin1[0..8], allocator),
+        9 => return Scalar.push(9, latin1[0..9], allocator),
+        10 => return Scalar.push(10, latin1[0..10], allocator),
+        11 => return Scalar.push(11, latin1[0..11], allocator),
+        12 => return Scalar.push(12, latin1[0..12], allocator),
+        13 => return Scalar.push(13, latin1[0..13], allocator),
+        14 => return Scalar.push(14, latin1[0..14], allocator),
+        15 => return Scalar.push(15, latin1[0..15], allocator),
+        16 => return Scalar.push(16, latin1[0..16], allocator),
+        17 => return Scalar.push(17, latin1[0..17], allocator),
+        18 => return Scalar.push(18, latin1[0..18], allocator),
+        19 => return Scalar.push(19, latin1[0..19], allocator),
+        20 => return Scalar.push(20, latin1[0..20], allocator),
+        21 => return Scalar.push(21, latin1[0..21], allocator),
+        22 => return Scalar.push(22, latin1[0..22], allocator),
+        23 => return Scalar.push(23, latin1[0..23], allocator),
+        24 => return Scalar.push(24, latin1[0..24], allocator),
+        25 => return Scalar.push(25, latin1[0..25], allocator),
+        26 => return Scalar.push(26, latin1[0..26], allocator),
+        27 => return Scalar.push(27, latin1[0..27], allocator),
+        28 => return Scalar.push(28, latin1[0..28], allocator),
+        29 => return Scalar.push(29, latin1[0..29], allocator),
+        30 => return Scalar.push(30, latin1[0..30], allocator),
+        31 => return Scalar.push(31, latin1[0..31], allocator),
+        32 => return Scalar.push(32, latin1[0..32], allocator),
+
+        else => {
+            var remaining = latin1;
+
+            const vec_chars = "\"&'<>";
+            const vecs: [vec_chars.len]AsciiVector = comptime brk: {
+                var _vecs: [vec_chars.len]AsciiVector = undefined;
+                for (vec_chars, 0..) |c, i| {
+                    _vecs[i] = @splat(c);
+                }
+                break :brk _vecs;
+            };
+
+            var any_needs_escape = false;
+            var buf: std.ArrayList(u8) = std.ArrayList(u8){
+                .items = &.{},
+                .capacity = 0,
+                .allocator = allocator,
+            };
+            // Release the partially built output if a later allocation fails.
+            errdefer buf.deinit();
+
+            if (comptime Environment.enableSIMD) {
+                // pass #1: scan for any characters that need escaping
+                // assume most strings won't need any escaping, so don't actually allocate the buffer
+                scan_and_allocate_lazily: while (remaining.len >= ascii_vector_size) {
+                    if (comptime Environment.allow_assert) assert(!any_needs_escape);
+                    const vec: AsciiVector = remaining[0..ascii_vector_size].*;
+                    if (@reduce(.Max, @as(AsciiVectorU1, @bitCast((vec == vecs[0]))) |
+                        @as(AsciiVectorU1, @bitCast((vec == vecs[1]))) |
+                        @as(AsciiVectorU1, @bitCast((vec == vecs[2]))) |
+                        @as(AsciiVectorU1, @bitCast((vec == vecs[3]))) |
+                        @as(AsciiVectorU1, @bitCast((vec == vecs[4])))) == 1)
+                    {
+                        if (comptime Environment.allow_assert) assert(buf.capacity == 0);
+
+                        buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
+                        const copy_len = @intFromPtr(remaining.ptr) - @intFromPtr(latin1.ptr);
+                        buf.appendSliceAssumeCapacity(latin1[0..copy_len]);
+                        any_needs_escape = true;
+                        inline for (0..ascii_vector_size) |i| {
+                            switch (vec[i]) {
+                                '"' => {
+                                    try buf.ensureUnusedCapacity((ascii_vector_size - i) + "&quot;".len);
+                                    buf.items.ptr[buf.items.len .. buf.items.len + "&quot;".len][0.."&quot;".len].* = "&quot;".*;
+                                    buf.items.len += "&quot;".len;
+                                },
+                                '&' => {
+                                    try buf.ensureUnusedCapacity((ascii_vector_size - i) + "&amp;".len);
+                                    buf.items.ptr[buf.items.len .. buf.items.len + "&amp;".len][0.."&amp;".len].* = "&amp;".*;
+                                    buf.items.len += "&amp;".len;
+                                },
+                                '\'' => {
+                                    try buf.ensureUnusedCapacity((ascii_vector_size - i) + "&#x27;".len);
+                                    buf.items.ptr[buf.items.len .. buf.items.len + "&#x27;".len][0.."&#x27;".len].* = "&#x27;".*;
+                                    buf.items.len += "&#x27;".len;
+                                },
+                                '<' => {
+                                    try buf.ensureUnusedCapacity((ascii_vector_size - i) + "&lt;".len);
+                                    buf.items.ptr[buf.items.len .. buf.items.len + "&lt;".len][0.."&lt;".len].* = "&lt;".*;
+                                    buf.items.len += "&lt;".len;
+                                },
+                                '>' => {
+                                    try buf.ensureUnusedCapacity((ascii_vector_size - i) + "&gt;".len);
+                                    buf.items.ptr[buf.items.len .. buf.items.len + "&gt;".len][0.."&gt;".len].* = "&gt;".*;
+                                    buf.items.len += "&gt;".len;
+                                },
+                                else => |c| {
+                                    buf.appendAssumeCapacity(c);
+                                },
+                            }
+                        }
+
+                        remaining = remaining[ascii_vector_size..];
+                        break :scan_and_allocate_lazily;
+                    }
+
+                    remaining = remaining[ascii_vector_size..];
+                }
+            }
+
+            if (any_needs_escape) {
+                // pass #2: we found something that needed an escape
+                // so we'll go ahead and copy the buffer into a new buffer
+                while (remaining.len >= ascii_vector_size) {
+                    const vec: AsciiVector = remaining[0..ascii_vector_size].*;
+                    if (@reduce(.Max, @as(AsciiVectorU1, @bitCast((vec == vecs[0]))) |
+                        @as(AsciiVectorU1, @bitCast((vec == vecs[1]))) |
+                        @as(AsciiVectorU1, @bitCast((vec == vecs[2]))) |
+                        @as(AsciiVectorU1, @bitCast((vec == vecs[3]))) |
+                        @as(AsciiVectorU1, @bitCast((vec == vecs[4])))) == 1)
+                    {
+                        try buf.ensureUnusedCapacity(ascii_vector_size + 6);
+                        inline for (0..ascii_vector_size) |i| {
+                            switch (vec[i]) {
+                                '"' => {
+                                    try buf.ensureUnusedCapacity((ascii_vector_size - i) + "&quot;".len);
+                                    buf.items.ptr[buf.items.len .. buf.items.len + "&quot;".len][0.."&quot;".len].* = "&quot;".*;
+                                    buf.items.len += "&quot;".len;
+                                },
+                                '&' => {
+                                    try buf.ensureUnusedCapacity((ascii_vector_size - i) + "&amp;".len);
+                                    buf.items.ptr[buf.items.len .. buf.items.len + "&amp;".len][0.."&amp;".len].* = "&amp;".*;
+                                    buf.items.len += "&amp;".len;
+                                },
+                                '\'' => {
+                                    try buf.ensureUnusedCapacity((ascii_vector_size - i) + "&#x27;".len);
+                                    buf.items.ptr[buf.items.len .. buf.items.len + "&#x27;".len][0.."&#x27;".len].* = "&#x27;".*;
+                                    buf.items.len += "&#x27;".len;
+                                },
+                                '<' => {
+                                    try buf.ensureUnusedCapacity((ascii_vector_size - i) + "&lt;".len);
+                                    buf.items.ptr[buf.items.len .. buf.items.len + "&lt;".len][0.."&lt;".len].* = "&lt;".*;
+                                    buf.items.len += "&lt;".len;
+                                },
+                                '>' => {
+                                    try buf.ensureUnusedCapacity((ascii_vector_size - i) + "&gt;".len);
+                                    buf.items.ptr[buf.items.len .. buf.items.len + "&gt;".len][0.."&gt;".len].* = "&gt;".*;
+                                    buf.items.len += "&gt;".len;
+                                },
+                                else => |c| {
+                                    buf.appendAssumeCapacity(c);
+                                },
+                            }
+                        }
+
+                        remaining = remaining[ascii_vector_size..];
+                        continue;
+                    }
+
+                    try buf.ensureUnusedCapacity(ascii_vector_size);
+                    buf.items.ptr[buf.items.len .. buf.items.len + ascii_vector_size][0..ascii_vector_size].* = remaining[0..ascii_vector_size].*;
+                    buf.items.len += ascii_vector_size;
+                    remaining = remaining[ascii_vector_size..];
+                }
+            }
+
+            var ptr = remaining.ptr;
+            const end = remaining.ptr + remaining.len;
+
+            if (!any_needs_escape) {
+                scan_and_allocate_lazily: while (ptr != end) : (ptr += 1) {
+                    switch (ptr[0]) {
+                        '"', '&', '\'', '<', '>' => |c| {
+                            if (comptime Environment.allow_assert) assert(buf.capacity == 0);
+
+                            buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + @as(usize, Scalar.lengths[c]));
+                            const copy_len = @intFromPtr(ptr) - @intFromPtr(latin1.ptr);
+                            if (comptime Environment.allow_assert) assert(copy_len <= buf.capacity);
+                            buf.items.len = copy_len;
+                            @memcpy(buf.items[0..copy_len], latin1[0..copy_len]);
+                            any_needs_escape = true;
+                            break :scan_and_allocate_lazily;
+                        },
+                        else => {},
+                    }
+                }
+            }
+
+            while (ptr != end) : (ptr += 1) {
+                switch (ptr[0]) {
+                    '"' => {
+                        try buf.appendSlice("&quot;");
+                    },
+                    '&' => {
+                        try buf.appendSlice("&amp;");
+                    },
+                    '\'' => {
+                        try buf.appendSlice("&#x27;"); // modified from escape-html; used to be '&#39;'
+                    },
+                    '<' => {
+                        try buf.appendSlice("&lt;");
+                    },
+                    '>' => {
+                        try buf.appendSlice("&gt;");
+                    },
+                    else => |c| {
+                        try buf.append(c);
+                    },
+                }
+            }
+
+            if (!any_needs_escape) {
+                if (comptime Environment.allow_assert) assert(buf.capacity == 0);
+                return Escaped(u8){ .original = {} };
+            }
+
+            return Escaped(u8){ .allocated = try buf.toOwnedSlice() };
+        },
+    }
+}
+
+/// Result of an escape call.
+/// `static` is a string literal (always bytes, even when `T` is `u16`), `original`
+/// means the input needs no changes, and `allocated` is a new buffer of `T`.
+fn Escaped(comptime T: type) type {
+    return union(enum) {
+        static: []const u8,
+        original: void,
+        allocated: []T,
+    };
+}
+
+/// UTF-16 counterpart of `escapeHTMLForLatin1Input`.
+///
+/// Code units above 127 are copied through unchanged; `utf16Codepoint` is used
+/// to step over them.
+/// Returns `.original`, `.static` or `.allocated` as described on `Escaped`.
+pub fn escapeHTMLForUTF16Input(allocator: std.mem.Allocator, utf16: []const u16) !Escaped(u16) {
+    const Scalar = struct {
+        pub const lengths: [std.math.maxInt(u8) + 1]u4 = brk: {
+            var values: [std.math.maxInt(u8) + 1]u4 = undefined;
+            for (values, 0..) |_, i| {
+                values[i] = switch (i) {
+                    '"' => "&quot;".len,
+                    '&' => "&amp;".len,
+                    '\'' => "&#x27;".len,
+                    '<' => "&lt;".len,
+                    '>' => "&gt;".len,
+                    else => 1,
+                };
+            }
+
+            break :brk values;
+        };
+    };
+    switch (utf16.len) {
+        0 => return Escaped(u16){ .static = &[_]u8{} },
+        1 => {
+            switch (utf16[0]) {
+                '"' => return Escaped(u16){ .static = "&quot;" },
+                '&' => return Escaped(u16){ .static = "&amp;" },
+                '\'' => return Escaped(u16){ .static = "&#x27;" },
+                '<' => return Escaped(u16){ .static = "&lt;" },
+                '>' => return Escaped(u16){ .static = "&gt;" },
+                else => return Escaped(u16){ .original = {} },
+            }
+        },
+        2 => {
+            const first_16 = switch (utf16[0]) {
+                '"' => toUTF16Literal("&quot;"),
+                '&' => toUTF16Literal("&amp;"),
+                '\'' => toUTF16Literal("&#x27;"),
+                '<' => toUTF16Literal("&lt;"),
+                '>' => toUTF16Literal("&gt;"),
+                else => @as([]const u16, utf16[0..1]),
+            };
+
+            const second_16 = switch (utf16[1]) {
+                '"' => toUTF16Literal("&quot;"),
+                '&' => toUTF16Literal("&amp;"),
+                '\'' => toUTF16Literal("&#x27;"),
+                '<' => toUTF16Literal("&lt;"),
+                '>' => toUTF16Literal("&gt;"),
+                else => @as([]const u16, utf16[1..2]),
+            };
+
+            if (first_16.ptr == utf16.ptr and second_16.ptr == utf16.ptr + 1) {
+                return Escaped(u16){ .original = {} };
+            }
+
+            const buf = try allocator.alloc(u16, first_16.len + second_16.len);
+            bun.copy(u16, buf, first_16);
+            bun.copy(u16, buf[first_16.len..], second_16);
+            return Escaped(u16){ .allocated = buf };
+        },
+
+        else => {
+            var remaining = utf16;
+
+            var any_needs_escape = false;
+            var buf: std.ArrayList(u16) = std.ArrayList(u16).init(allocator);
+            // Release the partially built output if a later allocation fails.
+            errdefer buf.deinit();
+
+            if (comptime Environment.enableSIMD) {
+                const vec_chars = "\"&'<>";
+                const vecs: [vec_chars.len]AsciiU16Vector = brk: {
+                    var _vecs: [vec_chars.len]AsciiU16Vector = undefined;
+                    for (vec_chars, 0..) |c, i| {
+                        _vecs[i] = @splat(@as(u16, c));
+                    }
+                    break :brk _vecs;
+                };
+                // pass #1: scan for any characters that need escaping
+                // assume most strings won't need any escaping, so don't actually allocate the buffer
+                scan_and_allocate_lazily: while (remaining.len >= ascii_u16_vector_size) {
+                    if (comptime Environment.allow_assert) assert(!any_needs_escape);
+                    const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*;
+                    if (@reduce(.Max, @as(AsciiVectorU16U1, @bitCast(vec > @as(AsciiU16Vector, @splat(@as(u16, 127))))) |
+                        @as(AsciiVectorU16U1, @bitCast((vec == vecs[0]))) |
+                        @as(AsciiVectorU16U1, @bitCast((vec == vecs[1]))) |
+                        @as(AsciiVectorU16U1, @bitCast((vec == vecs[2]))) |
+                        @as(AsciiVectorU16U1, @bitCast((vec == vecs[3]))) |
+                        @as(AsciiVectorU16U1, @bitCast((vec == vecs[4])))) == 1)
+                    {
+                        var i: u16 = 0;
+                        lazy: {
+                            while (i < ascii_u16_vector_size) {
+                                switch (remaining[i]) {
+                                    '"', '&', '\'', '<', '>' => {
+                                        any_needs_escape = true;
+                                        break :lazy;
+                                    },
+                                    128...std.math.maxInt(u16) => {
+                                        const cp = utf16Codepoint([]const u16, remaining[i..]);
+                                        i += @as(u16, cp.len);
+                                    },
+                                    else => {
+                                        i += 1;
+                                    },
+                                }
+                            }
+                        }
+
+                        if (!any_needs_escape) {
+                            remaining = remaining[i..];
+                            continue :scan_and_allocate_lazily;
+                        }
+
+                        if (comptime Environment.allow_assert) assert(@intFromPtr(remaining.ptr + i) >= @intFromPtr(utf16.ptr));
+                        const to_copy = std.mem.sliceAsBytes(utf16)[0 .. @intFromPtr(remaining.ptr + i) - @intFromPtr(utf16.ptr)];
+                        const to_copy_16 = std.mem.bytesAsSlice(u16, to_copy);
+                        buf = try std.ArrayList(u16).initCapacity(allocator, utf16.len + 6);
+                        try buf.appendSlice(to_copy_16);
+
+                        while (i < ascii_u16_vector_size) {
+                            switch (remaining[i]) {
+                                '"', '&', '\'', '<', '>' => |c| {
+                                    const result = switch (c) {
+                                        '"' => toUTF16Literal("&quot;"),
+                                        '&' => toUTF16Literal("&amp;"),
+                                        '\'' => toUTF16Literal("&#x27;"),
+                                        '<' => toUTF16Literal("&lt;"),
+                                        '>' => toUTF16Literal("&gt;"),
+                                        else => unreachable,
+                                    };
+
+                                    try buf.appendSlice(result);
+                                    i += 1;
+                                },
+                                128...std.math.maxInt(u16) => {
+                                    const cp = utf16Codepoint([]const u16, remaining[i..]);
+
+                                    try buf.appendSlice(remaining[i..][0..@as(usize, cp.len)]);
+                                    i += @as(u16, cp.len);
+                                },
+                                else => |c| {
+                                    i += 1;
+                                    try buf.append(c);
+                                },
+                            }
+                        }
+
+                        // edgecase: code point width could exceed ascii_u16_vector_size
+                        remaining = remaining[i..];
+                        break :scan_and_allocate_lazily;
+                    }
+
+                    remaining = remaining[ascii_u16_vector_size..];
+                }
+
+                if (any_needs_escape) {
+                    // pass #2: we found something that needed an escape
+                    // but there's still some more text to process,
+                    // so we'll go ahead and copy the buffer into a new buffer
+                    while (remaining.len >= ascii_u16_vector_size) {
+                        const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*;
+                        if (@reduce(.Max, @as(AsciiVectorU16U1, @bitCast(vec > @as(AsciiU16Vector, @splat(@as(u16, 127))))) |
+                            @as(AsciiVectorU16U1, @bitCast((vec == vecs[0]))) |
+                            @as(AsciiVectorU16U1, @bitCast((vec == vecs[1]))) |
+                            @as(AsciiVectorU16U1, @bitCast((vec == vecs[2]))) |
+                            @as(AsciiVectorU16U1, @bitCast((vec == vecs[3]))) |
+                            @as(AsciiVectorU16U1, @bitCast((vec == vecs[4])))) == 1)
+                        {
+                            try buf.ensureUnusedCapacity(ascii_u16_vector_size);
+                            var i: u16 = 0;
+                            while (i < ascii_u16_vector_size) {
+                                switch (remaining[i]) {
+                                    '"' => {
+                                        try buf.appendSlice(toUTF16Literal("&quot;"));
+                                        i += 1;
+                                    },
+                                    '&' => {
+                                        try buf.appendSlice(toUTF16Literal("&amp;"));
+                                        i += 1;
+                                    },
+                                    '\'' => {
+                                        try buf.appendSlice(toUTF16Literal("&#x27;")); // modified from escape-html; used to be '&#39;'
+                                        i += 1;
+                                    },
+                                    '<' => {
+                                        try buf.appendSlice(toUTF16Literal("&lt;"));
+                                        i += 1;
+                                    },
+                                    '>' => {
+                                        try buf.appendSlice(toUTF16Literal("&gt;"));
+                                        i += 1;
+                                    },
+                                    128...std.math.maxInt(u16) => {
+                                        const cp = utf16Codepoint([]const u16, remaining[i..]);
+
+                                        try buf.appendSlice(remaining[i..][0..@as(usize, cp.len)]);
+                                        i += @as(u16, cp.len);
+                                    },
+                                    else => |c| {
+                                        try buf.append(c);
+                                        i += 1;
+                                    },
+                                }
+                            }
+
+                            remaining = remaining[i..];
+                            continue;
+                        }
+
+                        try buf.ensureUnusedCapacity(ascii_u16_vector_size);
+                        buf.items.ptr[buf.items.len .. buf.items.len + ascii_u16_vector_size][0..ascii_u16_vector_size].* = remaining[0..ascii_u16_vector_size].*;
+                        buf.items.len += ascii_u16_vector_size;
+                        remaining = remaining[ascii_u16_vector_size..];
+                    }
+                }
+            }
+
+            var ptr = remaining.ptr;
+            const end = remaining.ptr + remaining.len;
+
+            if (!any_needs_escape) {
+                scan_and_allocate_lazily: while (ptr != end) {
+                    switch (ptr[0]) {
+                        '"', '&', '\'', '<', '>' => |c| {
+                            buf = try std.ArrayList(u16).initCapacity(allocator, utf16.len + @as(usize, Scalar.lengths[c]));
+                            if (comptime Environment.allow_assert) assert(@intFromPtr(ptr) >= @intFromPtr(utf16.ptr));
+
+                            const to_copy = std.mem.sliceAsBytes(utf16)[0 .. @intFromPtr(ptr) - @intFromPtr(utf16.ptr)];
+                            const to_copy_16 = std.mem.bytesAsSlice(u16, to_copy);
+                            try buf.appendSlice(to_copy_16);
+                            any_needs_escape = true;
+                            break :scan_and_allocate_lazily;
+                        },
+                        128...std.math.maxInt(u16) => {
+                            const cp = utf16Codepoint([]const u16, ptr[0..if (ptr + 1 == end) 1 else 2]);
+
+                            ptr += @as(u16, cp.len);
+                        },
+                        else => {
+                            ptr += 1;
+                        },
+                    }
+                }
+            }
+
+            while (ptr != end) {
+                switch (ptr[0]) {
+                    '"' => {
+                        try buf.appendSlice(toUTF16Literal("&quot;"));
+                        ptr += 1;
+                    },
+                    '&' => {
+                        try buf.appendSlice(toUTF16Literal("&amp;"));
+                        ptr += 1;
+                    },
+                    '\'' => {
+                        try buf.appendSlice(toUTF16Literal("&#x27;")); // modified from escape-html; used to be '&#39;'
+                        ptr += 1;
+                    },
+                    '<' => {
+                        try buf.appendSlice(toUTF16Literal("&lt;"));
+                        ptr += 1;
+                    },
+                    '>' => {
+                        try buf.appendSlice(toUTF16Literal("&gt;"));
+                        ptr += 1;
+                    },
+                    128...std.math.maxInt(u16) => {
+                        const cp = utf16Codepoint([]const u16, ptr[0..if (ptr + 1 == end) 1 else 2]);
+
+                        try buf.appendSlice(ptr[0..@as(usize, cp.len)]);
+                        ptr += @as(u16, cp.len);
+                    },
+
+                    else => |c| {
+                        try buf.append(c);
+                        ptr += 1;
+                    },
+                }
+            }
+
+            if (!any_needs_escape) {
+                return Escaped(u16){ .original = {} };
+            }
+
+            return Escaped(u16){ .allocated = try buf.toOwnedSlice() };
+        },
+    }
+}
+
+const std = @import("std");
+const bun = @import("bun");
+const Environment = bun.Environment;
+const assert = bun.assert;
+const ascii_u16_vector_size = strings.ascii_u16_vector_size;
+const AsciiU16Vector = strings.AsciiU16Vector;
+const utf16Codepoint = strings.utf16Codepoint;
+const toUTF16Literal = strings.toUTF16Literal;
+const strings = bun.strings;
+const AsciiVectorU16U1 = strings.AsciiVectorU16U1;
+const AsciiVector = strings.AsciiVector;
+const ascii_vector_size = strings.ascii_vector_size;
+const AsciiVectorU1 = strings.AsciiVectorU1;
diff --git a/src/string/paths.zig b/src/string/paths.zig
new file mode 100644
index 
0000000000..da6073f39b --- /dev/null +++ b/src/string/paths.zig @@ -0,0 +1,461 @@ +/// Checks if a path is missing a windows drive letter. For windows APIs, +/// this is used for an assertion, and PosixToWinNormalizer can help make +/// an absolute path contain a drive letter. +pub fn isWindowsAbsolutePathMissingDriveLetter(comptime T: type, chars: []const T) bool { + bun.unsafeAssert(bun.path.Platform.windows.isAbsoluteT(T, chars)); + bun.unsafeAssert(chars.len > 0); + + // 'C:\hello' -> false + // This is the most common situation, so we check it first + if (!(chars[0] == '/' or chars[0] == '\\')) { + bun.unsafeAssert(chars.len > 2); + bun.unsafeAssert(chars[1] == ':'); + return false; + } + + if (chars.len > 4) { + // '\??\hello' -> false (has the NT object prefix) + if (chars[1] == '?' and + chars[2] == '?' and + (chars[3] == '/' or chars[3] == '\\')) + return false; + // '\\?\hello' -> false (has the other NT object prefix) + // '\\.\hello' -> false (has the NT device prefix) + if ((chars[1] == '/' or chars[1] == '\\') and + (chars[2] == '?' or chars[2] == '.') and + (chars[3] == '/' or chars[3] == '\\')) + return false; + } + + // A path starting with `/` can be a UNC path with forward slashes, + // or actually just a posix path. + // + // '\\Server\Share' -> false (unc) + // '\\Server\\Share' -> true (not unc because extra slashes) + // '\Server\Share' -> true (posix path) + return bun.path.windowsFilesystemRootT(T, chars).len == 1; +} + +pub fn fromWPath(buf: []u8, utf16: []const u16) [:0]const u8 { + bun.unsafeAssert(buf.len > 0); + const to_copy = trimPrefixComptime(u16, utf16, bun.windows.long_path_prefix); + const encode_into_result = copyUTF16IntoUTF8(buf[0 .. 
buf.len - 1], []const u16, to_copy, false); + bun.unsafeAssert(encode_into_result.written < buf.len); + buf[encode_into_result.written] = 0; + return buf[0..encode_into_result.written :0]; +} + +pub fn withoutNTPrefix(comptime T: type, path: []const T) []const T { + if (comptime !Environment.isWindows) return path; + const cmp = if (T == u8) + hasPrefixComptime + else + hasPrefixComptimeUTF16; + if (cmp(path, &bun.windows.nt_object_prefix_u8)) { + return path[bun.windows.nt_object_prefix.len..]; + } + if (cmp(path, &bun.windows.long_path_prefix_u8)) { + return path[bun.windows.long_path_prefix.len..]; + } + if (cmp(path, &bun.windows.nt_unc_object_prefix_u8)) { + return path[bun.windows.nt_unc_object_prefix.len..]; + } + return path; +} + +pub fn toNTPath(wbuf: []u16, utf8: []const u8) [:0]u16 { + if (!std.fs.path.isAbsoluteWindows(utf8)) { + return toWPathNormalized(wbuf, utf8); + } + + if (strings.hasPrefixComptime(utf8, &bun.windows.nt_object_prefix_u8) or + strings.hasPrefixComptime(utf8, &bun.windows.nt_unc_object_prefix_u8)) + { + return wbuf[0..toWPathNormalized(wbuf, utf8).len :0]; + } + + // UNC absolute path, replace leading '\\' with '\??\UNC\' + if (strings.hasPrefixComptime(utf8, "\\\\")) { + if (strings.hasPrefixComptime(utf8[2..], bun.windows.long_path_prefix_u8[2..])) { + const prefix = bun.windows.nt_object_prefix; + wbuf[0..prefix.len].* = prefix; + return wbuf[0 .. toWPathNormalized(wbuf[prefix.len..], utf8[4..]).len + prefix.len :0]; + } + const prefix = bun.windows.nt_unc_object_prefix; + wbuf[0..prefix.len].* = prefix; + return wbuf[0 .. toWPathNormalized(wbuf[prefix.len..], utf8[2..]).len + prefix.len :0]; + } + + const prefix = bun.windows.nt_object_prefix; + wbuf[0..prefix.len].* = prefix; + return wbuf[0 .. 
toWPathNormalized(wbuf[prefix.len..], utf8).len + prefix.len :0]; +} + +pub fn toNTPath16(wbuf: []u16, path: []const u16) [:0]u16 { + if (!std.fs.path.isAbsoluteWindowsWTF16(path)) { + return toWPathNormalized16(wbuf, path); + } + + if (strings.hasPrefixComptimeUTF16(path, &bun.windows.nt_object_prefix_u8) or + strings.hasPrefixComptimeUTF16(path, &bun.windows.nt_unc_object_prefix_u8)) + { + return wbuf[0..toWPathNormalized16(wbuf, path).len :0]; + } + + if (strings.hasPrefixComptimeUTF16(path, "\\\\")) { + if (strings.hasPrefixComptimeUTF16(path[2..], bun.windows.long_path_prefix_u8[2..])) { + const prefix = bun.windows.nt_object_prefix; + wbuf[0..prefix.len].* = prefix; + return wbuf[0 .. toWPathNormalized16(wbuf[prefix.len..], path[4..]).len + prefix.len :0]; + } + const prefix = bun.windows.nt_unc_object_prefix; + wbuf[0..prefix.len].* = prefix; + return wbuf[0 .. toWPathNormalized16(wbuf[prefix.len..], path[2..]).len + prefix.len :0]; + } + + const prefix = bun.windows.nt_object_prefix; + wbuf[0..prefix.len].* = prefix; + return wbuf[0 .. toWPathNormalized16(wbuf[prefix.len..], path).len + prefix.len :0]; +} + +pub fn toNTMaxPath(buf: []u8, utf8: []const u8) [:0]const u8 { + if (!std.fs.path.isAbsoluteWindows(utf8) or utf8.len <= 260) { + @memcpy(buf[0..utf8.len], utf8); + buf[utf8.len] = 0; + return buf[0..utf8.len :0]; + } + + const prefix = bun.windows.nt_maxpath_prefix_u8; + buf[0..prefix.len].* = prefix; + return buf[0 .. toPathNormalized(buf[prefix.len..], utf8).len + prefix.len :0]; +} + +pub fn addNTPathPrefix(wbuf: []u16, utf16: []const u16) [:0]u16 { + wbuf[0..bun.windows.nt_object_prefix.len].* = bun.windows.nt_object_prefix; + @memcpy(wbuf[bun.windows.nt_object_prefix.len..][0..utf16.len], utf16); + wbuf[utf16.len + bun.windows.nt_object_prefix.len] = 0; + return wbuf[0 .. 
utf16.len + bun.windows.nt_object_prefix.len :0]; +} + +pub fn addNTPathPrefixIfNeeded(wbuf: []u16, utf16: []const u16) [:0]u16 { + if (hasPrefixComptimeType(u16, utf16, bun.windows.nt_object_prefix)) { + @memcpy(wbuf[0..utf16.len], utf16); + wbuf[utf16.len] = 0; + return wbuf[0..utf16.len :0]; + } + if (hasPrefixComptimeType(u16, utf16, bun.windows.long_path_prefix)) { + // Replace prefix + return addNTPathPrefix(wbuf, utf16[bun.windows.long_path_prefix.len..]); + } + return addNTPathPrefix(wbuf, utf16); +} + +// These are the same because they don't have rules like needing a trailing slash +pub const toNTDir = toNTPath; + +pub fn toExtendedPathNormalized(wbuf: []u16, utf8: []const u8) [:0]const u16 { + bun.unsafeAssert(wbuf.len > 4); + wbuf[0..4].* = bun.windows.long_path_prefix; + return wbuf[0 .. toWPathNormalized(wbuf[4..], utf8).len + 4 :0]; +} + +pub fn toWPathNormalizeAutoExtend(wbuf: []u16, utf8: []const u8) [:0]const u16 { + if (std.fs.path.isAbsoluteWindows(utf8)) { + return toExtendedPathNormalized(wbuf, utf8); + } + + return toWPathNormalized(wbuf, utf8); +} + +pub fn toWPathNormalized(wbuf: []u16, utf8: []const u8) [:0]u16 { + const renormalized = bun.PathBufferPool.get(); + defer bun.PathBufferPool.put(renormalized); + + var path_to_use = normalizeSlashesOnly(renormalized, utf8, '\\'); + + // is there a trailing slash? Let's remove it before converting to UTF-16 + if (path_to_use.len > 3 and bun.path.isSepAny(path_to_use[path_to_use.len - 1])) { + path_to_use = path_to_use[0 .. path_to_use.len - 1]; + } + + return toWPath(wbuf, path_to_use); +} + +pub fn toWPathNormalized16(wbuf: []u16, path: []const u16) [:0]u16 { + var path_to_use = normalizeSlashesOnlyT(u16, wbuf, path, '\\', true); + + // is there a trailing slash? Let's remove it before converting to UTF-16 + if (path_to_use.len > 3 and bun.path.isSepAnyT(u16, path_to_use[path_to_use.len - 1])) { + path_to_use = path_to_use[0 .. 
path_to_use.len - 1]; + } + + wbuf[path_to_use.len] = 0; + + return wbuf[0..path_to_use.len :0]; +} + +pub fn toPathNormalized(buf: []u8, utf8: []const u8) [:0]const u8 { + const renormalized = bun.PathBufferPool.get(); + defer bun.PathBufferPool.put(renormalized); + + var path_to_use = normalizeSlashesOnly(renormalized, utf8, '\\'); + + // is there a trailing slash? Let's remove it before converting to UTF-16 + if (path_to_use.len > 3 and bun.path.isSepAny(path_to_use[path_to_use.len - 1])) { + path_to_use = path_to_use[0 .. path_to_use.len - 1]; + } + + return toPath(buf, path_to_use); +} + +pub fn normalizeSlashesOnlyT(comptime T: type, buf: []T, path: []const T, comptime desired_slash: u8, comptime always_copy: bool) []const T { + comptime bun.unsafeAssert(desired_slash == '/' or desired_slash == '\\'); + const undesired_slash = if (desired_slash == '/') '\\' else '/'; + + if (bun.strings.containsCharT(T, path, undesired_slash)) { + @memcpy(buf[0..path.len], path); + for (buf[0..path.len]) |*c| { + if (c.* == undesired_slash) { + c.* = desired_slash; + } + } + return buf[0..path.len]; + } + + if (comptime always_copy) { + @memcpy(buf[0..path.len], path); + return buf[0..path.len]; + } + return path; +} + +pub fn normalizeSlashesOnly(buf: []u8, utf8: []const u8, comptime desired_slash: u8) []const u8 { + return normalizeSlashesOnlyT(u8, buf, utf8, desired_slash, false); +} + +pub fn toWDirNormalized(wbuf: []u16, utf8: []const u8) [:0]const u16 { + var renormalized: ?*bun.PathBuffer = null; + defer if (renormalized) |r| bun.PathBufferPool.put(r); + + var path_to_use = utf8; + + if (bun.strings.containsChar(utf8, '/')) { + renormalized = bun.PathBufferPool.get(); + @memcpy(renormalized.?[0..utf8.len], utf8); + for (renormalized.?[0..utf8.len]) |*c| { + if (c.* == '/') { + c.* = '\\'; + } + } + path_to_use = renormalized.?[0..utf8.len]; + } + + return toWDirPath(wbuf, path_to_use); +} + +pub fn toWPath(wbuf: []u16, utf8: []const u8) [:0]u16 { + return 
toWPathMaybeDir(wbuf, utf8, false); +} + +pub fn toPath(buf: []u8, utf8: []const u8) [:0]u8 { + return toPathMaybeDir(buf, utf8, false); +} + +pub fn toWDirPath(wbuf: []u16, utf8: []const u8) [:0]const u16 { + return toWPathMaybeDir(wbuf, utf8, true); +} + +pub fn toKernel32Path(wbuf: []u16, utf8: []const u8) [:0]u16 { + const path = if (hasPrefixComptime(utf8, bun.windows.nt_object_prefix_u8)) + utf8[bun.windows.nt_object_prefix_u8.len..] + else + utf8; + if (hasPrefixComptime(path, bun.windows.long_path_prefix_u8)) { + return toWPath(wbuf, path); + } + if (utf8.len > 2 and bun.path.isDriveLetter(utf8[0]) and utf8[1] == ':' and bun.path.isSepAny(utf8[2])) { + wbuf[0..4].* = bun.windows.long_path_prefix; + const wpath = toWPath(wbuf[4..], path); + return wbuf[0 .. wpath.len + 4 :0]; + } + return toWPath(wbuf, path); +} + +fn isUNCPath(comptime T: type, path: []const T) bool { + return path.len >= 3 and + bun.path.Platform.windows.isSeparatorT(T, path[0]) and + bun.path.Platform.windows.isSeparatorT(T, path[1]) and + !bun.path.Platform.windows.isSeparatorT(T, path[2]) and + path[2] != '.'; +} +pub fn assertIsValidWindowsPath(comptime T: type, path: []const T) void { + if (Environment.allow_assert and Environment.isWindows) { + if (bun.path.Platform.windows.isAbsoluteT(T, path) and + isWindowsAbsolutePathMissingDriveLetter(T, path) and + // is it a null device path? that's not an error. it's just a weird file path. + !eqlComptimeT(T, path, "\\\\.\\NUL") and !eqlComptimeT(T, path, "\\\\.\\nul") and !eqlComptimeT(T, path, "\\nul") and !eqlComptimeT(T, path, "\\NUL") and !isUNCPath(T, path)) + { + std.debug.panic("Internal Error: Do not pass posix paths to Windows APIs, was given '{s}'" ++ if (Environment.isDebug) " (missing a root like 'C:\\', see PosixToWinNormalizer for why this is an assertion)" else ". 
Please open an issue on GitHub with a reproduction.", .{ + if (T == u8) path else bun.fmt.utf16(path), + }); + } + if (hasPrefixComptimeType(T, path, ":/") and Environment.isDebug) { + std.debug.panic("Path passed to windows API '{s}' is almost certainly invalid. Where did the drive letter go?", .{ + if (T == u8) path else bun.fmt.utf16(path), + }); + } + } +} + +pub fn toWPathMaybeDir(wbuf: []u16, utf8: []const u8, comptime add_trailing_lash: bool) [:0]u16 { + bun.unsafeAssert(wbuf.len > 0); + + var result = bun.simdutf.convert.utf8.to.utf16.with_errors.le( + utf8, + wbuf[0..wbuf.len -| (1 + @as(usize, @intFromBool(add_trailing_lash)))], + ); + + // Many Windows APIs expect normalized path slashes, particularly when the + // long path prefix is added or the nt object prefix. To make this easier, + // but a little redundant, this function always normalizes the slashes here. + // + // An example of this is GetFileAttributesW(L"C:\\hello/world.txt") being OK + // but GetFileAttributesW(L"\\\\?\\C:\\hello/world.txt") is NOT + bun.path.dangerouslyConvertPathToWindowsInPlace(u16, wbuf[0..result.count]); + + if (add_trailing_lash and result.count > 0 and wbuf[result.count - 1] != '\\') { + wbuf[result.count] = '\\'; + result.count += 1; + } + + wbuf[result.count] = 0; + + return wbuf[0..result.count :0]; +} +pub fn toPathMaybeDir(buf: []u8, utf8: []const u8, comptime add_trailing_lash: bool) [:0]u8 { + bun.unsafeAssert(buf.len > 0); + + var len = utf8.len; + @memcpy(buf[0..len], utf8[0..len]); + + if (add_trailing_lash and len > 0 and buf[len - 1] != '\\') { + buf[len] = '\\'; + len += 1; + } + buf[len] = 0; + return buf[0..len :0]; +} + +pub fn cloneNormalizingSeparators( + allocator: std.mem.Allocator, + input: []const u8, +) ![]u8 { + // remove duplicate slashes in the file path + const base = withoutTrailingSlash(input); + var tokenized = std.mem.tokenizeScalar(u8, base, std.fs.path.sep); + var buf = try allocator.alloc(u8, base.len + 2); + if (comptime 
/// Returns `this` with every trailing `/` or `\` removed, but never shrinks
/// the result below one byte (so a lone separator is returned unchanged).
pub fn withoutTrailingSlash(this: string) []const u8 {
    var end = this.len;
    while (end > 1) : (end -= 1) {
        const last = this[end - 1];
        if (last != '/' and last != '\\') break;
    }
    return this[0..end];
}
/// Strips a single leading `./` from `slice`. On Windows builds a leading
/// `.\` is stripped as well. Any other input is returned unchanged.
pub fn removeLeadingDotSlash(slice: []const u8) callconv(bun.callconv_inline) []const u8 {
    // Compare the two bytes directly. The previous form bit-cast the bytes to
    // a native-endian u16 and compared against a little-endian constant, which
    // only matches on little-endian targets.
    if (slice.len >= 2 and slice[0] == '.') {
        if (slice[1] == '/' or (Environment.isWindows and slice[1] == '\\')) {
            return slice[2..];
        }
    }
    return slice;
}
/// Advance forward until the scalar function returns true.
/// The simd function is "best effort" and expected to sometimes return a result which `scalar` will return false for.
/// This is because we don't decode UTF-8 in the SIMD code path.
///
/// `cursor` is advanced with `next`; on `.found` it describes the matching
/// character. `simd` receives a byte slice and returns the offset of the
/// next potential candidate, or null when there is none.
pub fn skip(it: *const Iterator, cursor: *Cursor, simd: *const fn (input: []const u8) ?usize, scalar: *const fn (CodePointType) bool) SkipResult {
    while (true) {
        // 1. Nothing left to look at.
        if (cursor.i >= it.bytes.len) {
            return .not_found;
        }

        // 2. Decode the next character into `cursor`.
        if (!next(it, cursor)) {
            return .not_found;
        }

        // 3. Test the character that was just decoded. `next` writes its
        //    result into `cursor` only (`it` is a const pointer), so the
        //    value must be read from `cursor.c`, not `it.c`.
        if (scalar(cursor.c)) {
            return .found;
        }

        // 4. Ask SIMD for the next potential candidate. After `next`,
        //    `cursor.i` is the start of the character just decoded, so the
        //    scan begins there.
        const next_scan_start_index = cursor.i;
        if (next_scan_start_index >= it.bytes.len) {
            return .not_found;
        }
        const remaining_slice = it.bytes[next_scan_start_index..];

        if (simd(remaining_slice)) |pos| {
            if (pos > 0) {
                // Jump the byte index to the start of the potential candidate.
                cursor.i = next_scan_start_index + @as(u32, @intCast(pos));
                // Reset width so next() decodes from exactly the jumped position.
                cursor.width = 0;
                continue;
            }
            // pos == 0: no jump; the next iteration advances naturally.
        } else {
            // SIMD found no potential candidates in the rest of the string.
            // The SIMD search set is expected to be a superset of what
            // `scalar` accepts, so nothing further can match.
            return .not_found;
        }
    }

    unreachable;
}
/// Decodes the slice returned by `nextCodepointSlice`, stores the result in
/// `it.c`, and returns it. An empty slice yields `zeroValue`.
pub fn nextCodepoint(it: *Iterator) CodePointType {
    const encoded = it.nextCodepointSlice();

    const decoded: CodePointType = switch (encoded.len) {
        0 => zeroValue,
        1 => @as(CodePointType, @intCast(encoded[0])),
        2 => @as(CodePointType, @intCast(std.unicode.utf8Decode2(encoded) catch unreachable)),
        3 => @as(CodePointType, @intCast(std.unicode.utf8Decode3(encoded) catch unreachable)),
        4 => @as(CodePointType, @intCast(std.unicode.utf8Decode4(encoded) catch unreachable)),
        else => unreachable,
    };

    it.c = decoded;
    return decoded;
}
/// Returns the bytes covering up to `n` upcoming codepoints, starting at the
/// iterator's current byte index, and leaves the iterator state untouched.
/// When the input ends before `n` codepoints were seen, everything from the
/// current byte index to the end of the input is returned.
pub fn peek(it: *Iterator, n: usize) []const u8 {
    // `nextCodepointSlice` mutates `i`, `next_width` and `width`; snapshot the
    // whole iterator so all of them are restored, not just `i`.
    const saved = it.*;
    defer it.* = saved;

    const original_i = saved.i;
    var end_ix = original_i;
    for (0..n) |_| {
        const next_codepoint = it.nextCodepointSlice();
        // `nextCodepointSlice` returns a plain slice (not an optional); an
        // empty slice is its end-of-input signal.
        if (next_codepoint.len == 0) return it.bytes[original_i..];
        end_ix += next_codepoint.len;
    }

    return it.bytes[original_i..end_ix];
}
/// Converts `utf16` to UTF-8 with simdutf, writing from the start of the
/// list's buffer across its full capacity, and returns the list with its
/// length set to the number of bytes produced.
///
/// When simdutf reports a surrogate error, the conversion is redone by
/// `toUTF8ListWithTypeBun`, which handles invalid UTF-16 without simdutf.
pub fn convertUTF16ToUTF8(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) OOM!std.ArrayList(u8) {
    var out = list_;
    const destination = out.items.ptr[0..out.capacity];
    const conversion = bun.simdutf.convert.utf16.to.utf8.with_errors.le(utf16, destination);

    if (conversion.status != .surrogate) {
        out.items.len = conversion.count;
        return out;
    }

    // Slow path: there was invalid UTF-16, so convert it without simdutf.
    return toUTF8ListWithTypeBun(&out, Type, utf16, false);
}
/// Allocates a buffer with `allocator` and converts `utf16` to UTF-8 into it.
/// Returns the converted bytes (the `items` of the backing list).
pub fn toUTF8AllocWithType(allocator: std.mem.Allocator, comptime Type: type, utf16: Type) OOM![]u8 {
    if (bun.FeatureFlags.use_simdutf and comptime Type == []const u16) {
        const utf8_len = bun.simdutf.length.utf8.from.utf16.le(utf16);
        // 16 extra bytes of padding for SIMDUTF
        const scratch = try std.ArrayList(u8).initCapacity(allocator, utf8_len + 16);
        const converted = try convertUTF16ToUTF8(scratch, Type, utf16);
        return converted.items;
    }

    const scratch = try std.ArrayList(u8).initCapacity(allocator, utf16.len);
    const converted = try toUTF8ListWithType(scratch, Type, utf16);
    return converted.items;
}
/// Converts Latin-1 input to UTF-8 and appends a trailing 0 byte.
///
/// Returns null when `latin1` is all ASCII (no conversion needed). Otherwise
/// returns a list holding the converted bytes followed by a single 0; the
/// caller owns the list.
pub fn toUTF8FromLatin1Z(allocator: std.mem.Allocator, latin1: []const u8) !?std.ArrayList(u8) {
    if (isAllASCII(latin1))
        return null;

    const list = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 1);
    var list1 = try allocateLatin1IntoUTF8WithList(list, 0, []const u8, latin1);
    // Release the converted buffer if appending the terminator fails, so the
    // error path does not leak it.
    errdefer list1.deinit();
    try list1.append(0);
    return list1;
}
/// Returns a newly allocated UTF-8 copy of the Latin-1 input `latin1_`.
/// Caller owns the returned slice.
///
/// When `bun.FeatureFlags.latin1_is_now_ascii` is set the bytes are copied
/// verbatim; otherwise they go through `allocateLatin1IntoUTF8WithList`.
pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, comptime Type: type, latin1_: Type) ![]u8 {
    if (comptime bun.FeatureFlags.latin1_is_now_ascii) {
        const copy = try allocator.alloc(u8, latin1_.len);
        @memcpy(copy, latin1_);
        return copy;
    }

    const initial = try std.ArrayList(u8).initCapacity(allocator, latin1_.len);
    var converted = try allocateLatin1IntoUTF8WithList(initial, 0, Type, latin1_);
    return try converted.toOwnedSlice();
}
= u64; + const size = @sizeOf(Int); + + // zig or LLVM doesn't do @ctz nicely with SIMD + if (comptime ascii_vector_size >= 8) { + { + const bytes = @as(Int, @bitCast(latin1[0..size].*)); + // https://dotat.at/@/2022-06-27-tolower-swar.html + const mask = bytes & 0x8080808080808080; + + if (mask > 0) { + const first_set_byte = @ctz(mask) / 8; + if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127); + + buf[0..size].* = @as([size]u8, @bitCast(bytes)); + buf = buf[first_set_byte..]; + latin1 = latin1[first_set_byte..]; + break :inner; + } + + buf[0..size].* = @as([size]u8, @bitCast(bytes)); + latin1 = latin1[size..]; + buf = buf[size..]; + } + + if (comptime ascii_vector_size >= 16) { + const bytes = @as(Int, @bitCast(latin1[0..size].*)); + // https://dotat.at/@/2022-06-27-tolower-swar.html + const mask = bytes & 0x8080808080808080; + + if (mask > 0) { + const first_set_byte = @ctz(mask) / 8; + if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127); + + buf[0..size].* = @as([size]u8, @bitCast(bytes)); + buf = buf[first_set_byte..]; + latin1 = latin1[first_set_byte..]; + break :inner; + } + } + } + unreachable; + } + + buf[0..ascii_vector_size].* = @as([ascii_vector_size]u8, @bitCast(vec))[0..ascii_vector_size].*; + latin1 = latin1[ascii_vector_size..]; + buf = buf[ascii_vector_size..]; + } + + while (latin1.len >= 8) { + const Int = u64; + const size = @sizeOf(Int); + + const bytes = @as(Int, @bitCast(latin1[0..size].*)); + // https://dotat.at/@/2022-06-27-tolower-swar.html + const mask = bytes & 0x8080808080808080; + + if (mask > 0) { + const first_set_byte = @ctz(mask) / 8; + if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127); + + buf[0..size].* = @as([size]u8, @bitCast(bytes)); + latin1 = latin1[first_set_byte..]; + buf = buf[first_set_byte..]; + break :inner; + } + + buf[0..size].* = @as([size]u8, @bitCast(bytes)); + latin1 = latin1[size..]; + buf = buf[size..]; + } + + { + if (comptime 
/// Result of decoding one sequence: the code point, how many input units it
/// consumed, and status flags.
pub const UTF16Replacement = struct {
    code_point: u32 = unicode_replacement,
    len: u3_fast = 0,

    /// Explicit fail boolean to distinguish between a Unicode Replacement Codepoint
    /// that was already in there
    /// and a genuine error.
    fail: bool = false,

    can_buffer: bool = true,
    is_lead: bool = false,

    /// Number of UTF-8 bytes needed to encode `code_point`.
    pub inline fn utf8Width(replacement: UTF16Replacement) u3_fast {
        const cp = replacement.code_point;
        if (cp <= 0x7F) return 1;
        if (cp <= 0x7FF) return 2;
        if (cp <= 0xFFFF) return 3;
        return 4;
    }
};
0xA0 or sequence[1] > 0xBF) { + return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 }; + } + }, + 0xED => { + if (sequence[1] < 0x80 or sequence[1] > 0x9F) { + return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 }; + } + }, + else => { + if (sequence[1] < 0x80 or sequence[1] > 0xBF) { + return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 }; + } + }, + } + if (sequence[2] < 0x80 or sequence[2] > 0xBF) { + return .{ .len = 2, .fail = true, .can_buffer = remaining_len < 3 }; + } + return .{ + .len = len, + .code_point = ((@as(u32, sequence[0]) << 12) + (@as(u32, sequence[1]) << 6) + @as(u32, sequence[2])) - 0x000E2080, + }; + }, + 4 => { + switch (sequence[0]) { + 0xF0 => { + if (sequence[1] < 0x90 or sequence[1] > 0xBF) { + return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 }; + } + }, + 0xF4 => { + if (sequence[1] < 0x80 or sequence[1] > 0x8F) { + return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 }; + } + }, + + // invalid code point + // this used to be an assertion + 0...(0xF0 - 1), 0xF4 + 1...std.math.maxInt(@TypeOf(sequence[0])) => { + return .{ .len = 1, .fail = true, .can_buffer = false }; + }, + + else => { + if (sequence[1] < 0x80 or sequence[1] > 0xBF) { + return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 }; + } + }, + } + + if (sequence[2] < 0x80 or sequence[2] > 0xBF) { + return .{ .len = 2, .fail = true, .can_buffer = remaining_len < 3 }; + } + if (sequence[3] < 0x80 or sequence[3] > 0xBF) { + return .{ .len = 3, .fail = true, .can_buffer = remaining_len < 4 }; + } + return .{ + .len = len, + .code_point = ((@as(u32, sequence[0]) << 18) + + (@as(u32, sequence[1]) << 12) + + (@as(u32, sequence[2]) << 6) + @as(u32, sequence[3])) - 0x03C82080, + }; + }, + // invalid unicode sequence + // 1 or 0 are both invalid here + else => return UTF16Replacement{ .len = 1, .fail = true }, + } +} + +// This variation matches WebKit behavior. 
/// Decodes the non-ASCII UTF-8 sequence at the start of `bytes`.
///
/// Up to four leading bytes are copied into a zero-padded buffer, the
/// sequence length is derived from the first byte, and the work is handed to
/// `convertUTF8BytesIntoUTF16WithLength` together with the number of bytes
/// actually available. `bytes` must be non-empty.
pub fn convertUTF8BytesIntoUTF16(bytes: []const u8) UTF16Replacement {
    if (bytes.len == 0) unreachable;

    // Zero-pad so the callee can always index four bytes.
    var padded = [_]u8{ 0, 0, 0, 0 };
    const available = @min(bytes.len, padded.len);
    @memcpy(padded[0..available], bytes[0..available]);

    if (comptime Environment.allow_assert) assert(padded[0] > 127);
    return convertUTF8BytesIntoUTF16WithLength(&padded, nonASCIISequenceLength(padded[0]), bytes.len);
}
const first_set_byte = @ctz(mask) / 8; + if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127); + + buf = buf[first_set_byte..]; + latin1 = latin1[first_set_byte..]; + break :inner; + } + + latin1 = latin1[size..]; + buf = buf[size..]; + } + + if (comptime ascii_vector_size >= 16) { + const bytes = @as(Int, @bitCast(latin1[0..size].*)); + // https://dotat.at/@/2022-06-27-tolower-swar.html + const mask = bytes & 0x8080808080808080; + + buf[0..size].* = @as([size]u8, @bitCast(bytes)); + + if (comptime Environment.allow_assert) assert(mask > 0); + const first_set_byte = @ctz(mask) / 8; + if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127); + + buf = buf[first_set_byte..]; + latin1 = latin1[first_set_byte..]; + break :inner; + } + } + unreachable; + } + + buf[0..ascii_vector_size].* = @as([ascii_vector_size]u8, @bitCast(vec))[0..ascii_vector_size].*; + latin1 = latin1[ascii_vector_size..]; + buf = buf[ascii_vector_size..]; + } + + { + const Int = u64; + const size = @sizeOf(Int); + while (@min(buf.len, latin1.len) >= size) { + const bytes = @as(Int, @bitCast(latin1[0..size].*)); + buf[0..size].* = @as([size]u8, @bitCast(bytes)); + + // https://dotat.at/@/2022-06-27-tolower-swar.html + + const mask = bytes & 0x8080808080808080; + + if (mask > 0) { + const first_set_byte = @ctz(mask) / 8; + if (comptime stop) return .{ .written = std.math.maxInt(u32), .read = std.math.maxInt(u32) }; + if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127); + + buf = buf[first_set_byte..]; + latin1 = latin1[first_set_byte..]; + + break :inner; + } + + latin1 = latin1[size..]; + buf = buf[size..]; + } + } + + { + const end = latin1.ptr + @min(buf.len, latin1.len); + if (comptime Environment.allow_assert) assert(@intFromPtr(latin1.ptr + 8) > @intFromPtr(end)); + const start_ptr = @intFromPtr(buf.ptr); + const start_ptr_latin1 = @intFromPtr(latin1.ptr); + + while (latin1.ptr != end and latin1.ptr[0] <= 127) { + buf.ptr[0] 
/// Widens Latin-1 bytes from `latin1_` into the UTF-16 buffer `buf_`, stopping
/// when either side is exhausted.
///
/// ASCII runs are copied in bulk; each non-ASCII byte is converted with
/// `latin1ToCodepointBytesAssumeNotASCII16`. Every input byte produces exactly
/// one output unit, so `read` and `written` are always equal.
pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult {
    var buf = buf_;
    var latin1 = latin1_;
    while (buf.len > 0 and latin1.len > 0) {
        // Never copy more than fits: the ASCII run reported by
        // `firstNonASCII` may be longer than the remaining output buffer.
        const limit = @min(latin1.len, buf.len);
        const ascii_run: usize = strings.firstNonASCII(latin1) orelse limit;
        const to_write = @min(ascii_run, limit);

        if (comptime std.meta.alignment(Buffer) != @alignOf(u16)) {
            strings.copyU8IntoU16WithAlignment(std.meta.alignment(Buffer), buf, latin1[0..to_write]);
        } else {
            strings.copyU8IntoU16(buf, latin1[0..to_write]);
        }

        latin1 = latin1[to_write..];
        buf = buf[to_write..];
        if (latin1.len > 0 and buf.len >= 1) {
            buf[0] = latin1ToCodepointBytesAssumeNotASCII16(latin1[0]);
            latin1 = latin1[1..];
            buf = buf[1..];
        }
    }

    return .{
        // `read` counts consumed input bytes, `written` counts output units.
        .read = @as(u32, @truncate(latin1_.len - latin1.len)),
        .written = @as(u32, @truncate(buf_.len - buf.len)),
    };
}
/// Returns true when the comptime UTF-8 string `self` equals the UTF-16
/// string `other`.
///
/// `self` is converted to UTF-16 at compile time and compared unit by unit.
/// The previous implementation ran `memcmp` over `self.len * 2` bytes of the
/// 8-bit `self` buffer, which read past its end and compared bytes of
/// different widths against each other.
pub fn eqlUtf16(comptime self: string, other: []const u16) bool {
    const self_utf16 = comptime std.unicode.utf8ToUtf16LeStringLiteral(self);
    return std.mem.eql(u16, self_utf16, other);
}
@sizeOf(usize) / 2].*)), + )), + ); +} + +pub fn copyU8IntoU16(output_: []u16, input_: []const u8) callconv(bun.callconv_inline) void { + const output = output_; + const input = input_; + if (comptime Environment.allow_assert) assert(input.len <= output.len); + + // https://zig.godbolt.org/z/9rTn1orcY + + var input_ptr = input.ptr; + var output_ptr = output.ptr; + + const last_input_ptr = input_ptr + @min(input.len, output.len); + + while (last_input_ptr != input_ptr) { + output_ptr[0] = input_ptr[0]; + output_ptr += 1; + input_ptr += 1; + } +} + +pub fn copyU8IntoU16WithAlignment(comptime alignment: u21, output_: []align(alignment) u16, input_: []const u8) void { + var output = output_; + var input = input_; + const word = @sizeOf(usize) / 2; + if (comptime Environment.allow_assert) assert(input.len <= output.len); + + // un-aligned data access is slow + // so we attempt to align the data + while (!std.mem.isAligned(@intFromPtr(output.ptr), @alignOf(u16)) and input.len >= word) { + output[0] = input[0]; + output = output[1..]; + input = input[1..]; + } + + if (std.mem.isAligned(@intFromPtr(output.ptr), @alignOf(u16)) and input.len > 0) { + copyU8IntoU16(@as([*]u16, @alignCast(output.ptr))[0..output.len], input); + return; + } + + for (input, 0..) 
|c, i| { + output[i] = c; + } +} + +// pub fn copy(output_: []u8, input_: []const u8) callconv(bun.callconv_inline) void { +// var output = output_; +// var input = input_; +// if (comptime Environment.allow_assert) assert(input.len <= output.len); + +// if (input.len > @sizeOf(usize) * 4) { +// comptime var i: usize = 0; +// inline while (i < 4) : (i += 1) { +// appendUTF8MachineWord(output[i * @sizeOf(usize) ..][0..@sizeOf(usize)], input[i * @sizeOf(usize) ..][0..@sizeOf(usize)]); +// } +// output = output[4 * @sizeOf(usize) ..]; +// input = input[4 * @sizeOf(usize) ..]; +// } + +// while (input.len >= @sizeOf(usize)) { +// appendUTF8MachineWord(output[0..@sizeOf(usize)], input[0..@sizeOf(usize)]); +// output = output[@sizeOf(usize)..]; +// input = input[@sizeOf(usize)..]; +// } + +// for (input) |c, i| { +// output[i] = c; +// } +// } + +pub inline fn copyU16IntoU8(output: []u8, input: []align(1) const u16) void { + if (comptime Environment.allow_assert) assert(input.len <= output.len); + const count = @min(input.len, output.len); + + bun.highway.copyU16ToU8(input[0..count], output[0..count]); +} + +pub fn copyLatin1IntoASCII(dest: []u8, src: []const u8) void { + var remain = src; + var to = dest; + + const non_ascii_offset = strings.firstNonASCII(remain) orelse @as(u32, @truncate(remain.len)); + if (non_ascii_offset > 0) { + @memcpy(to[0..non_ascii_offset], remain[0..non_ascii_offset]); + remain = remain[non_ascii_offset..]; + to = to[non_ascii_offset..]; + + // ascii fast path + if (remain.len == 0) { + return; + } + } + + if (to.len >= 16 and bun.Environment.enableSIMD) { + const vector_size = 16; + // https://zig.godbolt.org/z/qezsY8T3W + const remain_in_u64 = remain[0 .. remain.len - (remain.len % vector_size)]; + const to_in_u64 = to[0 .. 
to.len - (to.len % vector_size)]; + var remain_as_u64 = std.mem.bytesAsSlice(u64, remain_in_u64); + var to_as_u64 = std.mem.bytesAsSlice(u64, to_in_u64); + const end_vector_len = @min(remain_as_u64.len, to_as_u64.len); + remain_as_u64 = remain_as_u64[0..end_vector_len]; + to_as_u64 = to_as_u64[0..end_vector_len]; + const end_ptr = remain_as_u64.ptr + remain_as_u64.len; + // using the pointer instead of the length is super important for the codegen + while (end_ptr != remain_as_u64.ptr) { + const buf = remain_as_u64[0]; + // this gets auto-vectorized + const mask = @as(u64, 0x7f7f7f7f7f7f7f7f); + to_as_u64[0] = buf & mask; + + remain_as_u64 = remain_as_u64[1..]; + to_as_u64 = to_as_u64[1..]; + } + remain = remain[remain_in_u64.len..]; + to = to[to_in_u64.len..]; + } + + for (to) |*to_byte| { + to_byte.* = @as(u8, @as(u7, @truncate(remain[0]))); + remain = remain[1..]; + } +} + +/// It is common on Windows to find files that are not encoded in UTF8. Most of these include +/// a 'byte-order mark' codepoint at the start of the file. The layout of this codepoint can +/// determine the encoding. 
///
/// https://en.wikipedia.org/wiki/Byte_order_mark
///
/// Only the UTF-8 and UTF-16 LE marks are currently reported by `detect`;
/// the remaining tags exist so their byte patterns can be looked up.
pub const BOM = enum {
    utf8,
    utf16_le,
    utf16_be,
    utf32_le,
    utf32_be,

    pub const utf8_bytes = [_]u8{ 0xef, 0xbb, 0xbf };
    pub const utf16_le_bytes = [_]u8{ 0xff, 0xfe };
    pub const utf16_be_bytes = [_]u8{ 0xfe, 0xff };
    pub const utf32_le_bytes = [_]u8{ 0xff, 0xfe, 0x00, 0x00 };
    pub const utf32_be_bytes = [_]u8{ 0x00, 0x00, 0xfe, 0xff };

    /// Returns the BOM found at the start of `bytes`, or null.
    /// Inputs shorter than three bytes are never matched.
    pub fn detect(bytes: []const u8) ?BOM {
        if (bytes.len < 3) return null;
        if (eqlComptimeIgnoreLen(bytes, utf8_bytes)) return .utf8;
        if (eqlComptimeIgnoreLen(bytes, utf16_le_bytes)) {
            // if (bytes.len > 4 and eqlComptimeIgnoreLen(bytes[2..], utf32_le_bytes[2..]))
            //   return .utf32_le;
            return .utf16_le;
        }
        // if (eqlComptimeIgnoreLen(bytes, utf16_be_bytes)) return .utf16_be;
        // if (bytes.len > 4 and eqlComptimeIgnoreLen(bytes, utf32_le_bytes)) return .utf32_le;
        return null;
    }

    /// Returns the detected BOM (if any) together with `bytes` minus that BOM.
    pub fn detectAndSplit(bytes: []const u8) struct { ?BOM, []const u8 } {
        const bom = detect(bytes) orelse return .{ null, bytes };
        return .{ bom, bytes[bom.length()..] };
    }

    /// Returns the raw byte pattern of this BOM.
    pub fn getHeader(bom: BOM) []const u8 {
        return switch (bom) {
            inline else => |t| comptime &@field(BOM, @tagName(t) ++ "_bytes"),
        };
    }

    /// Returns the number of bytes this BOM occupies.
    pub fn length(bom: BOM) usize {
        return switch (bom) {
            inline else => |t| comptime (&@field(BOM, @tagName(t) ++ "_bytes")).len,
        };
    }

    /// If an allocation is needed, free the input and the caller will
    /// replace it with the new return
    ///
    /// For `.utf8` (and, for now, every tag other than `.utf16_le`) the BOM is
    /// removed in place and a shortened view of `bytes` is returned. For
    /// `.utf16_le` a new UTF-8 buffer is allocated, `bytes` is freed with
    /// `allocator`, and the new buffer is returned.
    pub fn removeAndConvertToUTF8AndFree(bom: BOM, allocator: std.mem.Allocator, bytes: []u8) OOM![]u8 {
        switch (bom) {
            .utf8 => {
                _ = bun.c.memmove(bytes.ptr, bytes.ptr + utf8_bytes.len, bytes.len - utf8_bytes.len);
                return bytes[0 .. bytes.len - utf8_bytes.len];
            },
            .utf16_le => {
                const trimmed_bytes = bytes[utf16_le_bytes.len..];
                const trimmed_bytes_u16: []const u16 = @alignCast(std.mem.bytesAsSlice(u16, trimmed_bytes));
                const out = try toUTF8Alloc(allocator, trimmed_bytes_u16);
                allocator.free(bytes);
                return out;
            },
            else => {
                // TODO: this needs to re-encode, for now we just remove the BOM
                const bom_bytes = bom.getHeader();
                _ = bun.c.memmove(bytes.ptr, bytes.ptr + bom_bytes.len, bytes.len - bom_bytes.len);
                return bytes[0 .. bytes.len - bom_bytes.len];
            },
        }
    }

    /// This is required for fs.zig's `use_shared_buffer` flag. we cannot free that pointer.
    /// The returned slice will always point to the base of the input.
    ///
    /// Requires an arraylist in case it must be grown.
    pub fn removeAndConvertToUTF8WithoutDealloc(bom: BOM, allocator: std.mem.Allocator, list: *std.ArrayListUnmanaged(u8)) ![]u8 {
        const bytes = list.items;
        switch (bom) {
            .utf8 => {
                bun.C.memmove(bytes.ptr, bytes.ptr + utf8_bytes.len, bytes.len - utf8_bytes.len);
                return bytes[0 .. bytes.len - utf8_bytes.len];
            },
            .utf16_le => {
                const trimmed_bytes = bytes[utf16_le_bytes.len..];
                const trimmed_bytes_u16: []const u16 = @alignCast(std.mem.bytesAsSlice(u16, trimmed_bytes));

                // The conversion reads from the list's own storage, so it has to
                // go through a temporary buffer. That buffer is released on every
                // exit path; previously it was returned to the caller (who, per
                // the contract above, never frees it) and leaked.
                const out = try toUTF8Alloc(allocator, trimmed_bytes_u16);
                defer allocator.free(out);

                list.clearRetainingCapacity();
                try list.appendSlice(allocator, out);

                // Hand back the list's storage so the result lives in the
                // caller's buffer, like the other branches.
                return list.items;
            },
            else => {
                // TODO: this needs to re-encode, for now we just remove the BOM
                const bom_bytes = bom.getHeader();
                bun.C.memmove(bytes.ptr, bytes.ptr + bom_bytes.len, bytes.len - bom_bytes.len);
                return bytes[0 .. bytes.len - bom_bytes.len];
            },
        }
    }
};

/// @deprecated. If you are using this, you likely will need to remove other BOMs and handle encoding.
/// Use the BOM struct's `detect` and conversion functions instead.
pub fn withoutUTF8BOM(bytes: []const u8) []const u8 {
    if (strings.hasPrefixComptime(bytes, BOM.utf8_bytes)) {
        return bytes[BOM.utf8_bytes.len..];
    } else {
        return bytes;
    }
}

// https://github.com/WebKit/WebKit/blob/443e796d1538654c34f2690e39600c70c8052b63/Source/WebCore/PAL/pal/text/TextCodecUTF8.cpp#L69
/// Maps a lead byte to the length of the multi-byte UTF-8 sequence it starts.
/// Returns 0 for every byte that cannot start one (0...193 and 245...255).
pub fn nonASCIISequenceLength(first_byte: u8) u3_fast {
    return switch (first_byte) {
        0...193 => 0,
        194...223 => 2,
        224...239 => 3,
        240...244 => 4,
        245...255 => 0,
    };
}

/// Convert a UTF-8 string to a UTF-16 string IF there are any non-ascii characters
/// If there are no non-ascii characters, this returns null
/// This is intended to be used for strings that go to JavaScript
///
/// With `fail_if_invalid` an invalid sequence yields `error.InvalidByteSequence`;
/// otherwise whatever code point `convertUTF8BytesIntoUTF16` reports is emitted.
/// With `sentinel` the returned slice is 0-terminated (terminator not counted in `len`).
/// The caller owns the returned memory.
pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool, comptime sentinel: bool) !if (sentinel) ?[:0]u16 else ?[]u16 {
    const i = strings.firstNonASCII(bytes) orelse return null;

    const output_: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: {
        const out_length = bun.simdutf.length.utf16.from.utf8(bytes);
        if (out_length == 0)
            break :simd null;

        const out = try allocator.alloc(u16, out_length + if (sentinel) 1 else 0);
        log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, out_length });

        const res = bun.simdutf.convert.utf8.to.utf16.with_errors.le(bytes, if (comptime sentinel) out[0..out_length] else out);
        if (res.status == .success) {
            if (comptime sentinel) {
                out[out_length] = 0;
                // The terminator lives at index `out_length`; the slice itself
                // must end there (slicing to `out_length + 1 :0` would look for
                // the sentinel one element past the allocation).
                return out[0..out_length :0];
            }
            return out;
        }

        if (comptime fail_if_invalid) {
            allocator.free(out);
            return error.InvalidByteSequence;
        }

        // Keep the buffer and continue with the slow path; only the leading
        // ASCII run (`i` code units) is taken over as already converted.
        break :simd .{
            .items = out[0..i],
            .capacity = out.len,
            .allocator = allocator,
        };
    } else null;

    var output = output_ orelse fallback: {
        var list = try std.ArrayList(u16).initCapacity(allocator, i + 2);
        list.items.len = i;
        strings.copyU8IntoU16(list.items, bytes[0..i]);
        break :fallback list;
    };
    errdefer output.deinit();

    var remaining = bytes[i..];

    // `remaining` starts at a non-ASCII byte, so the first iteration sees j == 0
    // and goes straight to decoding that sequence.
    while (strings.firstNonASCII(remaining)) |j| {
        const end = output.items.len;
        try output.ensureUnusedCapacity(j);
        output.items.len += j;
        strings.copyU8IntoU16(output.items[end..][0..j], remaining[0..j]);
        remaining = remaining[j..];

        const replacement = strings.convertUTF8BytesIntoUTF16(remaining);
        if (comptime fail_if_invalid) {
            if (replacement.fail) {
                if (comptime Environment.allow_assert) assert(replacement.code_point == unicode_replacement);
                return error.InvalidByteSequence;
            }
        }
        // Always advance by at least one byte so invalid input cannot stall the loop.
        remaining = remaining[@max(replacement.len, 1)..];

        //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
        switch (replacement.code_point) {
            0...0xffff => |c| {
                try output.append(@as(u16, @intCast(c)));
            },
            else => |c| {
                try output.appendSlice(&[_]u16{ strings.u16Lead(c), strings.u16Trail(c) });
            },
        }
    }

    // Trailing ASCII run.
    if (remaining.len > 0) {
        try output.ensureTotalCapacityPrecise(output.items.len + remaining.len + comptime if (sentinel) 1 else 0);

        output.items.len += remaining.len;
        strings.copyU8IntoU16(output.items[output.items.len - remaining.len ..], remaining);
    }

    if (comptime sentinel) {
        // `append` reserves room for the terminator when there is none yet;
        // the returned slice excludes it from `len`.
        try output.append(0);
        return output.items[0 .. output.items.len - 1 :0];
    }

    return output.items;
}

// this one does the thing it's named after
/// Like `toUTF16Alloc`, but also allocates a widened copy when the input is pure ASCII,
/// so a slice is always returned.
pub fn toUTF16AllocForReal(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool, comptime sentinel: bool) !if (sentinel) [:0]u16 else []u16 {
    return (try toUTF16Alloc(allocator, bytes, fail_if_invalid, sentinel)) orelse {
        const output = try allocator.alloc(u16, bytes.len + if (sentinel) 1 else 0);
        bun.strings.copyU8IntoU16(if (sentinel) output[0..bytes.len] else output, bytes);

        if (comptime sentinel) {
            output[bytes.len] = 0;
            return output[0..bytes.len :0];
        }

        return output;
    };
}

/// Converts UTF-8 to UTF-16, returning null when `bytes` is entirely ASCII.
///
/// The result tuple is `{ utf16, leftover, leftover_len }`. When `flush` is false
/// and the input ends in a truncated sequence that the decoder reports as
/// bufferable, those trailing bytes (at most 3) are returned in `leftover`
/// instead of being converted.
pub fn toUTF16AllocMaybeBuffered(
    allocator: std.mem.Allocator,
    bytes: []const u8,
    comptime fail_if_invalid: bool,
    comptime flush: bool,
) error{ OutOfMemory, InvalidByteSequence }!?struct { []u16, [3]u8, u2 } {
    const first_non_ascii = strings.firstNonASCII(bytes) orelse return null;

    var output: std.ArrayListUnmanaged(u16) = if (comptime bun.FeatureFlags.use_simdutf) output: {
        const out_length = bun.simdutf.length.utf16.from.utf8(bytes);

        if (out_length == 0) {
            break :output .{};
        }

        const out = try allocator.alloc(u16, out_length);

        const res = bun.simdutf.convert.utf8.to.utf16.with_errors.le(bytes, out);
        if (res.status == .success) {
            log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, out_length });
            return .{ out, .{0} ** 3, 0 };
        }

        // Reuse the buffer for the slow path, keeping only the ASCII prefix.
        var list = std.ArrayListUnmanaged(u16).fromOwnedSlice(out[0..first_non_ascii]);
        list.capacity = out.len;

        break :output list;
    } else .{};
    errdefer output.deinit(allocator);

    // NOTE(review): when `output` is empty but `first_non_ascii > 0` (only
    // reachable without the simdutf path), the first iteration starts decoding
    // at an ASCII byte with i == 0 — confirm that is handled by
    // `convertUTF8BytesIntoUTF16WithLength`.
    const start = if (output.items.len > 0) first_non_ascii else 0;
    var remaining = bytes[start..];

    var non_ascii: ?u32 = 0;
    while (non_ascii) |i| : (non_ascii = strings.firstNonASCII(remaining)) {
        {
            const end = output.items.len;
            try output.ensureUnusedCapacity(allocator, i + 2); // +2 for UTF16 codepoint
            output.items.len += i;
            strings.copyU8IntoU16(output.items[end..][0..i], remaining[0..i]);
            remaining = remaining[i..];
        }

        // Zero-pad so the decoder can always look at four bytes.
        const sequence: [4]u8 = switch (remaining.len) {
            0 => unreachable,
            1 => .{ remaining[0], 0, 0, 0 },
            2 => .{ remaining[0], remaining[1], 0, 0 },
            3 => .{ remaining[0], remaining[1], remaining[2], 0 },
            else => remaining[0..4].*,
        };

        const converted_length = strings.nonASCIISequenceLength(sequence[0]);

        const converted = strings.convertUTF8BytesIntoUTF16WithLength(&sequence, converted_length, remaining.len);

        if (comptime !flush) {
            if (converted.fail and converted.can_buffer and converted_length > remaining.len) {
                const buffered: [3]u8 = switch (remaining.len) {
                    else => unreachable,
                    1 => .{ remaining[0], 0, 0 },
                    2 => .{ remaining[0], remaining[1], 0 },
                    3 => .{ remaining[0], remaining[1], remaining[2] },
                };
                return .{ output.items, buffered, @intCast(remaining.len) };
            }
        }

        if (comptime fail_if_invalid) {
            if (converted.fail) {
                if (comptime Environment.allow_assert) {
                    bun.assert(converted.code_point == unicode_replacement);
                }
                return error.InvalidByteSequence;
            }
        }

        remaining = remaining[@max(converted.len, 1)..];

        // #define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
        // Capacity for up to two code units was reserved at the top of the loop.
        switch (converted.code_point) {
            0...0xffff => |c| output.appendAssumeCapacity(@intCast(c)),
            else => |c| output.appendSliceAssumeCapacity(&.{ strings.u16Lead(c), strings.u16Trail(c) }),
        }
    }

    if (remaining.len > 0) {
        try output.ensureTotalCapacityPrecise(allocator, output.items.len + remaining.len);
        output.items.len += remaining.len;
        strings.copyU8IntoU16(output.items[output.items.len - remaining.len ..], remaining);
    }

    log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, output.items.len });
    return .{ output.items, .{0} ** 3, 0 };
}

/// Decodes the code point at the start of `input` (which must be non-empty),
/// reporting U+FFFD with `fail = true` for unpaired surrogates.
pub fn utf16CodepointWithFFFD(comptime Type: type, input: Type) UTF16Replacement {
    return utf16CodepointWithFFFDAndFirstInputChar(Type, input[0], input);
}

/// Same as `utf16CodepointWithFFFD`, but the first code unit is passed in as `char`.
fn utf16CodepointWithFFFDAndFirstInputChar(comptime Type: type, char: std.meta.Elem(Type), input: Type) UTF16Replacement {
    const c0 = @as(u21, char);

    if (c0 & ~@as(u21, 0x03ff) == 0xd800) {
        // surrogate pair
        if (input.len == 1)
            return .{
                .len = 1,
                .is_lead = true,
            };
        //error.DanglingSurrogateHalf;
        const c1 = @as(u21, input[1]);
        if (c1 & ~@as(u21, 0x03ff) != 0xdc00) {
            // return error.ExpectedSecondSurrogateHalf;
            // (input.len == 1 was already handled above, so this is the only case.)
            return .{
                .fail = true,
                .len = 1,
                .code_point = strings.unicode_replacement,
                .is_lead = true,
            };
        }

        return .{ .len = 2, .code_point = 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)) };
    } else if (c0 & ~@as(u21, 0x03ff) == 0xdc00) {
        // return error.UnexpectedSecondSurrogateHalf;
        return .{ .fail = true, .len = 1, .code_point = unicode_replacement };
    } else {
        return .{ .code_point = c0, .len = 1 };
    }
}

/// Decodes the code point at the start of `input` (which must be non-empty)
/// without flagging failures.
pub fn utf16Codepoint(comptime Type: type, input: Type) UTF16Replacement {
    const c0 = @as(u21, input[0]);

    if (c0 & ~@as(u21, 0x03ff) == 0xd800) {
        // surrogate pair
        if (input.len == 1)
            return .{
                .len = 1,
            };
        //error.DanglingSurrogateHalf;
        const c1 = @as(u21, input[1]);
        // NOTE(review): the inner check can never be true here (input.len == 1
        // returned above), so a lead surrogate followed by a non-trail unit
        // falls through and is combined as if it were a pair — confirm intent.
        if (c1 & ~@as(u21, 0x03ff) != 0xdc00)
            if (input.len == 1)
                return .{
                    .len = 1,
                };
        // return error.ExpectedSecondSurrogateHalf;

        return .{ .len = 2, .code_point = 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)) };
    } else if (c0 & ~@as(u21, 0x03ff) == 0xdc00) {
        // return error.UnexpectedSecondSurrogateHalf;
        return .{ .len = 1 };
    } else {
        return .{ .code_point = c0, .len = 1 };
    }
}

// TODO: remove this
pub const w = toUTF16Literal;

/// Comptime UTF-8 → 0-terminated UTF-16 literal.
pub fn toUTF16Literal(comptime str: []const u8) [:0]const u16 {
    return literal(u16, str);
}

/// Returns `str` as a 0-terminated comptime literal of element type `T` (`u8` or `u16`).
pub fn literal(comptime T: type, comptime str: []const u8) *const [literalLength(T, str):0]T {
    // The value is held in a container-level constant of a local struct.
    const Holder = struct {
        pub const value = switch (T) {
            u8 => (str[0..str.len].* ++ .{0})[0..str.len :0],
            u16 => std.unicode.utf8ToUtf16LeStringLiteral(str),
            else => @compileError("unsupported type " ++ @typeName(T) ++ " in strings.literal() call."),
        };
    };

    return Holder.value;
}

/// Element count of `literal(T, str)`, excluding the terminator.
fn literalLength(comptime T: type, comptime str: string) usize {
    return comptime switch (T) {
        u8 => str.len,
        u16 => std.unicode.calcUtf16LeLen(str) catch unreachable,
        else => 0, // let other errors report first
    };
}

// Copyright (c) 2008-2009 Bjoern Hoehrmann
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
/// Validates UTF-8 by running every byte through the `decodeCheck` state machine.
pub fn isValidUTF8WithoutSIMD(slice: []const u8) bool {
    var state: u8 = 0;

    for (slice) |byte| {
        state = decodeCheck(state, byte);
    }
    return state == UTF8_ACCEPT;
}

/// Validates UTF-8, using simdutf when that feature flag is enabled.
pub fn isValidUTF8(slice: []const u8) bool {
    if (bun.FeatureFlags.use_simdutf)
        return bun.simdutf.validate.utf8(slice);

    return isValidUTF8WithoutSIMD(slice);
}

/// Returns true when every byte is <= 127. Usable at comptime, where a plain
/// loop is used instead of simdutf.
pub fn isAllASCII(slice: []const u8) bool {
    if (@inComptime()) {
        for (slice) |char| {
            if (char > 127) {
                return false;
            }
        }
        return true;
    }

    return bun.simdutf.validate.ascii(slice);
}

const UTF8_ACCEPT: u8 = 0;
const UTF8_REJECT: u8 = 12;

const utf8d: [364]u8 = .{
    // The first part of the table maps bytes to character classes that
    // to reduce the size of the transition table and create bitmasks.
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
    7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
    8,  8,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
    10, 3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  4,  3,  3,  11, 6,  6,  6,  5,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,

    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
    0,  12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0,  12, 12, 12, 12, 12, 0,
    12, 0,  12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12,
    12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
};

/// Advances the UTF-8 validation automaton by one byte and returns the new state
/// (`UTF8_ACCEPT` after a complete code point, `UTF8_REJECT` once invalid).
pub fn decodeCheck(state: u8, byte: u8) u8 {
    const char_type: u32 = utf8d[byte];
    // we dont care about the codep
    // codep = if (*state != UTF8_ACCEPT) (byte & 0x3f) | (*codep << 6) else (0xff >> char_type) & (byte);

    // The transition table starts at offset 256 in `utf8d`.
    const value = @as(u32, 256) + state + char_type;
    if (value >= utf8d.len) return UTF8_REJECT;
    return utf8d[value];
}

// #define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
pub fn u16Lead(supplementary: anytype) callconv(bun.callconv_inline) u16 {
    return @intCast((supplementary >> 10) + 0xd7c0);
}

// #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
pub fn u16Trail(supplementary: anytype) callconv(bun.callconv_inline) u16 {
    return @intCast((supplementary & 0x3ff) | 0xdc00);
}

// #define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
pub fn u16IsTrail(supplementary: u16) callconv(bun.callconv_inline) bool {
    return (@as(u32, @intCast(supplementary)) & 0xfffffc00) == 0xdc00;
}

// #define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
pub fn u16IsLead(supplementary: u16) callconv(bun.callconv_inline) bool {
    return (@as(u32, @intCast(supplementary)) & 0xfffffc00) == 0xd800;
}

// #define U16_GET_SUPPLEMENTARY(lead, trail) \
//     (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
pub fn u16GetSupplementary(lead: u32, trail: u32) callconv(bun.callconv_inline) u32 {
    const shifted = lead << 10;
    return (shifted + trail) - u16_surrogate_offset;
}

// #define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
pub const u16_surrogate_offset = 56613888;

/// Length of the UTF-8 sequence introduced by `first_byte`, or 0 when the byte
/// cannot start a sequence.
pub inline fn utf8ByteSequenceLength(first_byte: u8) u3_fast {
    return switch (first_byte) {
        0b0000_0000...0b0111_1111 => 1,
        0b1100_0000...0b1101_1111 => 2,
        0b1110_0000...0b1110_1111 => 3,
        0b1111_0000...0b1111_0111 => 4,
        else => 0,
    };
}

/// Same as `utf8ByteSequenceLength`, but assumes the byte is valid UTF-8.
///
/// You should only use this function if you know the string you are getting the byte from is valid UTF-8.
pub inline fn utf8ByteSequenceLengthUnsafe(first_byte: u8) u3_fast {
    return switch (first_byte) {
        0b0000_0000...0b0111_1111 => 1,
        0b1100_0000...0b1101_1111 => 2,
        0b1110_0000...0b1110_1111 => 3,
        0b1111_0000...0b1111_0111 => 4,
        else => unreachable,
    };
}

/// This will simply ignore invalid UTF-8 and just do it
pub fn convertUTF8toUTF16InBuffer(
    buf: []u16,
    input: []const u8,
) []u16 {
    // TODO(@paperclover): implement error handling here.
    // for now this will cause invalid utf-8 to be ignored and become empty.
    // this is lame because of https://github.com/oven-sh/bun/issues/8197
    // it will cause process.env.whatever to be len=0 instead of the data
    // but it's better than failing the run entirely
    //
    // the reason i didn't implement the fallback is purely because our
    // code in this file is too chaotic. it is left as a TODO
    if (input.len == 0) return buf[0..0];
    const result = bun.simdutf.convert.utf8.to.utf16.le(input, buf);
    return buf[0..result];
}

/// 0-terminated variant of `convertUTF8toUTF16InBuffer`. `buf` needs room for
/// the converted code units plus the terminator.
pub fn convertUTF8toUTF16InBufferZ(
    buf: []u16,
    input: []const u8,
) [:0]u16 {
    // TODO: see convertUTF8toUTF16InBuffer
    if (input.len == 0) {
        buf[0] = 0;
        return buf[0..0 :0];
    }
    const result = bun.simdutf.convert.utf8.to.utf16.le(input, buf);
    buf[result] = 0;
    return buf[0..result :0];
}

/// Converts UTF-16 into `buf` as UTF-8 and returns the written prefix of `buf`.
pub fn convertUTF16toUTF8InBuffer(
    buf: []u8,
    input: []const u16,
) ![]const u8 {
    // See above
    if (input.len == 0) return &[_]u8{};
    const result = bun.simdutf.convert.utf16.to.utf8.le(input, buf);
    // switch (result.status) {
    //     .success => return buf[0..result.count],
    //     // TODO(@paperclover): handle surrogate
    //     .surrogate => @panic("TODO: handle surrogate in convertUTF8toUTF16"),
    //     else => @panic("TODO: handle error in convertUTF16toUTF8InBuffer"),
    // }
    return buf[0..result];
}

/// Looks `char` up in `latin1_to_utf16_conversion_table` and casts the result
/// to `CodePointType`.
pub fn latin1ToCodepointAssumeNotASCII(char: u8, comptime CodePointType: type) CodePointType {
    return @as(
        CodePointType,
        @intCast(latin1ToCodepointBytesAssumeNotASCII16(char)),
    );
}

// Identity for every byte except part of 0x80-0x9F, which maps to code points
// above U+00FF (e.g. 0x80 -> U+20AC).
const latin1_to_utf16_conversion_table = [256]u16{
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, // 00-07
    0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, // 10-17
    0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, // 18-1F
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 20-27
    0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // 28-2F
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 30-37
    0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // 38-3F
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 40-47
    0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // 48-4F
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 50-57
    0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // 58-5F
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 60-67
    0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // 68-6F
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 70-77
    0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // 78-7F
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
    0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, // A0-A7
    0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, // A8-AF
    0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, // B0-B7
    0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, // B8-BF
    0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, // C0-C7
    0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, // C8-CF
    0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, // D0-D7
    0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, // D8-DF
    0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, // E0-E7
    0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, // E8-EF
    0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, // F0-F7
    0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF, // F8-FF
};

/// Encodes `char` as WTF-8 and returns the first two bytes, which is the whole
/// sequence for any value in 0x80...0xFF.
pub fn latin1ToCodepointBytesAssumeNotASCII(char: u32) [2]u8 {
    var bytes = [4]u8{ 0, 0, 0, 0 };
    _ = encodeWTF8Rune(&bytes, @as(i32, @intCast(char)));
    return bytes[0..2].*;
}

/// Table lookup of the low 8 bits of `char` in `latin1_to_utf16_conversion_table`.
pub fn latin1ToCodepointBytesAssumeNotASCII16(char: u32) u16 {
    return latin1_to_utf16_conversion_table[@as(u8, @truncate(char))];
}

/// Encodes `utf16` into `buf` as UTF-8 and reports how many code units were
/// read and how many bytes were written. With `allow_partial_write`, a final
/// sequence that does not fit is written as far as `buf` allows.
pub fn copyUTF16IntoUTF8(buf: []u8, comptime Type: type, utf16: Type, comptime allow_partial_write: bool) EncodeIntoResult {
    if (comptime Type == []const u16) {
        if (bun.FeatureFlags.use_simdutf) {
            if (utf16.len == 0)
                return .{ .read = 0, .written = 0 };
            const trimmed = bun.simdutf.trim.utf16(utf16);
            if (trimmed.len == 0)
                return .{ .read = 0, .written = 0 };

            // Only compute the exact output length when the buffer might be too
            // small; a buffer larger than 3 bytes per unit (+2) always fits.
            const out_len = if (buf.len <= (trimmed.len * 3 + 2))
                bun.simdutf.length.utf8.from.utf16.le(trimmed)
            else
                buf.len;

            return copyUTF16IntoUTF8WithBuffer(buf, Type, utf16, trimmed, out_len, allow_partial_write);
        }
    }

    return copyUTF16IntoUTF8WithBuffer(buf, Type, utf16, utf16, utf16.len, allow_partial_write);
}

/// Worker for `copyUTF16IntoUTF8`. `trimmed` is the input handed to simdutf and
/// `out_len` the space that conversion needs; when `buf` is too small, or
/// simdutf reports a surrogate error, the scalar loop below is used instead.
pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type, trimmed: Type, out_len: usize, comptime allow_partial_write: bool) EncodeIntoResult {
    var remaining = buf;
    var utf16_remaining = utf16;
    var ended_on_non_ascii = false;

    brk: {
        if (comptime Type == []const u16) {
            if (bun.FeatureFlags.use_simdutf) {
                log("UTF16 {d} -> UTF8 {d}", .{ utf16.len, out_len });
                if (remaining.len >= out_len) {
                    const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(trimmed, remaining);
                    if (result.status == .surrogate) break :brk;

                    return EncodeIntoResult{
                        .read = @as(u32, @truncate(trimmed.len)),
                        .written = @as(u32, @truncate(result.count)),
                    };
                }
            }
        }
    }

    while (firstNonASCII16(Type, utf16_remaining)) |i| {
        // Copy the ASCII run in front of the next non-ASCII unit.
        const end = @min(i, remaining.len);
        if (end > 0) copyU16IntoU8(remaining, utf16_remaining[0..end]);
        remaining = remaining[end..];
        utf16_remaining = utf16_remaining[end..];

        if (@min(utf16_remaining.len, remaining.len) == 0)
            break;

        const replacement = utf16CodepointWithFFFD(Type, utf16_remaining);

        const width: usize = replacement.utf8Width();
        if (width > remaining.len) {
            ended_on_non_ascii = width > 1;
            // Continuation bytes are `0x80 | (bits & 0x3F)`. `|` and `&` share a
            // precedence level in Zig, so the mask has to be parenthesized or
            // the 0x80 marker is masked away.
            if (comptime allow_partial_write) switch (width) {
                2 => {
                    if (remaining.len > 0) {
                        //only first will be written
                        remaining[0] = @as(u8, @truncate(0xC0 | (replacement.code_point >> 6)));
                        remaining = remaining[remaining.len..];
                    }
                },
                3 => {
                    //only first to second written
                    switch (remaining.len) {
                        1 => {
                            remaining[0] = @as(u8, @truncate(0xE0 | (replacement.code_point >> 12)));
                            remaining = remaining[remaining.len..];
                        },
                        2 => {
                            remaining[0] = @as(u8, @truncate(0xE0 | (replacement.code_point >> 12)));
                            remaining[1] = @as(u8, @truncate(0x80 | ((replacement.code_point >> 6) & 0x3F)));
                            remaining = remaining[remaining.len..];
                        },
                        else => {},
                    }
                },
                4 => {
                    //only 1 to 3 written
                    switch (remaining.len) {
                        1 => {
                            remaining[0] = @as(u8, @truncate(0xF0 | (replacement.code_point >> 18)));
                            remaining = remaining[remaining.len..];
                        },
                        2 => {
                            remaining[0] = @as(u8, @truncate(0xF0 | (replacement.code_point >> 18)));
                            remaining[1] = @as(u8, @truncate(0x80 | ((replacement.code_point >> 12) & 0x3F)));
                            remaining = remaining[remaining.len..];
                        },
                        3 => {
                            remaining[0] = @as(u8, @truncate(0xF0 | (replacement.code_point >> 18)));
                            remaining[1] = @as(u8, @truncate(0x80 | ((replacement.code_point >> 12) & 0x3F)));
                            remaining[2] = @as(u8, @truncate(0x80 | ((replacement.code_point >> 6) & 0x3F)));
                            remaining = remaining[remaining.len..];
                        },
                        else => {},
                    }
                },

                else => {},
            };
            break;
        }

        utf16_remaining = utf16_remaining[replacement.len..];
        // `width <= remaining.len` here and the encoder writes only `width` bytes.
        _ = encodeWTF8RuneT(remaining.ptr[0..4], u32, @as(u32, replacement.code_point));
        remaining = remaining[width..];
    }

    // Trailing ASCII run (skipped when we stopped on a sequence that did not fit).
    if (remaining.len > 0 and !ended_on_non_ascii and utf16_remaining.len > 0) {
        const len = @min(remaining.len, utf16_remaining.len);
        copyU16IntoU8(remaining[0..len], utf16_remaining[0..len]);
        utf16_remaining = utf16_remaining[len..];
        remaining = remaining[len..];
    }

    return .{
        .read = @as(u32, @truncate(utf16.len - utf16_remaining.len)),
        .written = @as(u32, @truncate(buf.len - remaining.len)),
    };
}

/// Number of UTF-8 bytes needed to encode `utf16`.
pub fn elementLengthUTF16IntoUTF8(comptime Type: type, utf16: Type) usize {
    if (bun.FeatureFlags.use_simdutf) {
        return bun.simdutf.length.utf8.from.utf16.le(utf16);
    }

    var utf16_remaining = utf16;
    var count: usize = 0;

    while (firstNonASCII16(Type, utf16_remaining)) |i| {
        count += i;

        utf16_remaining = utf16_remaining[i..];

        const replacement = utf16Codepoint(Type, utf16_remaining);

        count += replacement.utf8Width();
        utf16_remaining = utf16_remaining[replacement.len..];
    }

    return count + utf16_remaining.len;
}

/// Number of UTF-16 code units needed to encode `utf8`.
pub fn elementLengthUTF8IntoUTF16(comptime Type: type, utf8: Type) usize {
    var utf8_remaining = utf8;
    var count: usize = 0;

    if (bun.FeatureFlags.use_simdutf) {
        return bun.simdutf.length.utf16.from.utf8(utf8);
    }

    // NOTE(review): this fallback feeds UTF-8 bytes to `utf16Codepoint` —
    // confirm that is intended.
    while (firstNonASCII(utf8_remaining)) |i| {
        count += i;

        utf8_remaining = utf8_remaining[i..];

        const replacement = utf16Codepoint(Type, utf8_remaining);

        count += replacement.len;
        utf8_remaining = utf8_remaining[@min(replacement.utf8Width(), utf8_remaining.len)..];
    }

    return count + utf8_remaining.len;
}

// Check utf16 string equals utf8 string without allocating extra memory
/// Each UTF-16 code point is re-encoded as WTF-8 and compared against the next
/// bytes of `str`; unpaired surrogates are encoded as-is.
pub fn utf16EqlString(text: []const u16, str: string) bool {
    if (text.len > str.len) {
        // Strings can't be equal if UTF-16 encoding is longer than UTF-8 encoding
        return false;
    }

    var temp = [4]u8{ 0, 0, 0, 0 };
    const n = text.len;
    var j: usize = 0;
    var i: usize = 0;
    // TODO: is it safe to just make this u32 or u21?
    var r1: i32 = undefined;
    while (i < n) : (i += 1) {
        r1 = text[i];
        if (r1 >= 0xD800 and r1 <= 0xDBFF and i + 1 < n) {
            const r2: i32 = text[i + 1];
            if (r2 >= 0xDC00 and r2 <= 0xDFFF) {
                // Explicit grouping: in Zig `+` binds tighter than `<<` and `|`,
                // so the unparenthesized form OR-ed 0x10000 in instead of adding
                // it, which breaks for code points >= U+20000.
                r1 = 0x10000 + (((r1 - 0xD800) << 10) | (r2 - 0xDC00));
                i += 1;
            }
        }

        const width = encodeWTF8Rune(&temp, r1);
        if (j + width > str.len) {
            return false;
        }
        for (0..width) |k| {
            if (temp[k] != str[j]) {
                return false;
            }
            j += 1;
        }
    }

    return j == str.len;
}

/// Encodes `cp` as UTF-8 at compile time. Values above 0x10FFFF are a compile error.
pub fn encodeUTF8Comptime(comptime cp: u32) []const u8 {
    const HEADER_CONT_BYTE: u8 = 0b10000000;
    const HEADER_2BYTE: u8 = 0b11000000;
    const HEADER_3BYTE: u8 = 0b11100000;
    // Lead byte of a 4-byte sequence is 11110xxx (was a copy of the 3-byte header).
    const HEADER_4BYTE: u8 = 0b11110000;

    return switch (cp) {
        0x0...0x7F => return &[_]u8{@intCast(cp)},
        0x80...0x7FF => {
            return &[_]u8{
                HEADER_2BYTE | @as(u8, cp >> 6),
                HEADER_CONT_BYTE | @as(u8, cp & 0b00111111),
            };
        },
        0x800...0xFFFF => {
            return &[_]u8{
                HEADER_3BYTE | @as(u8, cp >> 12),
                HEADER_CONT_BYTE | @as(u8, (cp >> 6) & 0b00111111),
                HEADER_CONT_BYTE | @as(u8, cp & 0b00111111),
            };
        },
        0x10000...0x10FFFF => {
            return &[_]u8{
                HEADER_4BYTE | @as(u8, cp >> 18),
                HEADER_CONT_BYTE | @as(u8, (cp >> 12) & 0b00111111),
                HEADER_CONT_BYTE | @as(u8, (cp >> 6) & 0b00111111),
                HEADER_CONT_BYTE | @as(u8, cp & 0b00111111),
            };
        },
        else => @compileError("Invalid UTF-8 codepoint!"),
    };
}

// This is a clone of golang's "utf8.EncodeRune" that has been modified to encode using
// WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for more info.
+/// Encode `r` into `p` by forwarding to `encodeWTF8RuneT` (force-inlined) and
+/// return the number of bytes written. `r` is converted with `@intCast`, so it
+/// must not be negative.
+pub fn encodeWTF8Rune(p: *[4]u8, r: i32) u3_fast {
+    return @call(
+        .always_inline,
+        encodeWTF8RuneT,
+        .{
+            p,
+            u32,
+            @as(u32, @intCast(r)),
+        },
+    );
+}
+
+/// Write the encoding of `r` into `p` and return how many bytes (1-4) were written.
+///
+/// No validation is performed: values in 0x800...0xFFFF (surrogates included)
+/// take the 3-byte form, and every value above 0xFFFF takes the 4-byte form.
+pub fn encodeWTF8RuneT(p: *[4]u8, comptime R: type, r: R) u3_fast {
+    switch (r) {
+        0...0x7F => {
+            p[0] = @as(u8, @intCast(r));
+            return 1;
+        },
+        (0x7F + 1)...0x7FF => {
+            p[0] = @as(u8, @truncate(0xC0 | ((r >> 6))));
+            p[1] = @as(u8, @truncate(0x80 | (r & 0x3F)));
+            return 2;
+        },
+        (0x7FF + 1)...0xFFFF => {
+            p[0] = @as(u8, @truncate(0xE0 | ((r >> 12))));
+            p[1] = @as(u8, @truncate(0x80 | ((r >> 6) & 0x3F)));
+            p[2] = @as(u8, @truncate(0x80 | (r & 0x3F)));
+            return 3;
+        },
+        else => {
+            p[0] = @as(u8, @truncate(0xF0 | ((r >> 18))));
+            p[1] = @as(u8, @truncate(0x80 | ((r >> 12) & 0x3F)));
+            p[2] = @as(u8, @truncate(0x80 | ((r >> 6) & 0x3F)));
+            p[3] = @as(u8, @truncate(0x80 | (r & 0x3F)));
+            return 4;
+        },
+    }
+}
+
+/// Same encoding as `encodeWTF8RuneT`, but returned by value as a 4-byte array
+/// whose unused trailing bytes are 0. The sequence length is not returned.
+pub fn wtf8Sequence(code_point: u32) [4]u8 {
+    return switch (code_point) {
+        0...0x7f => .{
+            @intCast(code_point),
+            0,
+            0,
+            0,
+        },
+        (0x7f + 1)...0x7ff => .{
+            @truncate(0xc0 | (code_point >> 6)),
+            @truncate(0x80 | (code_point & 0x3f)),
+            0,
+            0,
+        },
+        (0x7ff + 1)...0xffff => .{
+            @truncate(0xe0 | (code_point >> 12)),
+            @truncate(0x80 | ((code_point >> 6) & 0x3f)),
+            @truncate(0x80 | (code_point & 0x3f)),
+            0,
+        },
+        else => .{
+            @truncate(0xf0 | (code_point >> 18)),
+            @truncate(0x80 | ((code_point >> 12) & 0x3f)),
+            @truncate(0x80 | ((code_point >> 6) & 0x3f)),
+            @truncate(0x80 | (code_point & 0x3f)),
+        },
+    };
+}
+
+/// Return the sequence length (1-4) implied by the lead byte `first_byte`.
+/// Bytes that match none of the lead patterns (for example a continuation
+/// byte 0x80...0xBF) yield 1.
+pub inline fn wtf8ByteSequenceLength(first_byte: u8) u8 {
+    return switch (first_byte) {
+        0...0x80 - 1 => 1,
+        else => if ((first_byte & 0xE0) == 0xC0)
+            2
+        else if ((first_byte & 0xF0) == 0xE0)
+            3
+        else if ((first_byte & 0xF8) == 0xF0)
+            4
+        else
+            1,
+    };
+}
+
+/// Return the sequence length (1-4) implied by the lead byte `first_byte`.
+///
+/// NOTE(review): despite the name (and an earlier "0 == invalid" note), the
+/// fall-through arm returns 1, not 0, so this currently behaves exactly like
+/// `wtf8ByteSequenceLength`. Confirm what callers expect before changing it.
+pub inline fn wtf8ByteSequenceLengthWithInvalid(first_byte: u8) u8 {
+    return switch (first_byte) {
+        0...0x80 - 1 => 1,
+        else => if ((first_byte & 0xE0) == 0xC0)
+            2
+        else if ((first_byte & 0xF0) == 0xE0)
+            3
+        else if ((first_byte & 0xF8) == 0xF0)
+            4
+        else
+            1,
+    };
+}
+
+/// Convert potentially ill-formed UTF-8 or UTF-16 bytes to a Unicode Codepoint.
+/// Invalid codepoints are replaced with `zero` parameter
+/// This is a clone of esbuild's decodeWTF8Rune
+/// which was a clone of golang's "utf8.DecodeRune" that was modified to decode using WTF-8 instead.
+/// Asserts a multi-byte codepoint
+///
+/// `len` is the sequence length (2, 3 or 4); only the first `len` bytes of `p`
+/// are read. `zero` is returned when a continuation byte lacks the `10xxxxxx`
+/// pattern, when the value is an overlong encoding, or when a 4-byte value
+/// exceeds 0x10FFFF. The 3-byte path has no surrogate-range check.
+pub inline fn decodeWTF8RuneTMultibyte(p: *const [4]u8, len: u3_fast, comptime T: type, comptime zero: T) T {
+    if (comptime Environment.allow_assert) assert(len > 1);
+
+    const s1 = p[1];
+    // Every continuation byte must look like 10xxxxxx.
+    if ((s1 & 0xC0) != 0x80) return zero;
+
+    if (len == 2) {
+        const cp = @as(T, p[0] & 0x1F) << 6 | @as(T, s1 & 0x3F);
+        // Reject overlong 2-byte encodings.
+        if (cp < 0x80) return zero;
+        return cp;
+    }
+
+    const s2 = p[2];
+
+    if ((s2 & 0xC0) != 0x80) return zero;
+
+    if (len == 3) {
+        const cp = (@as(T, p[0] & 0x0F) << 12) | (@as(T, s1 & 0x3F) << 6) | (@as(T, s2 & 0x3F));
+        // Reject overlong 3-byte encodings.
+        if (cp < 0x800) return zero;
+        return cp;
+    }
+
+    const s3 = p[3];
+
+    if ((s3 & 0xC0) != 0x80) return zero;
+
+    {
+        const cp = (@as(T, p[0] & 0x07) << 18) | (@as(T, s1 & 0x3F) << 12) | (@as(T, s2 & 0x3F) << 6) | (@as(T, s3 & 0x3F));
+        // Reject overlong 4-byte encodings and values past the Unicode range.
+        if (cp < 0x10000 or cp > 0x10FFFF) return zero;
+        return cp;
+    }
+
+    unreachable;
+}
+
+const eqlComptimeIgnoreLen = strings.eqlComptimeIgnoreLen;
+const bun = @import("bun");
+const std = @import("std");
+const string = []const u8;
+const strings = bun.strings;
+const u3_fast = strings.u3_fast;
+const CodePoint = bun.CodePoint;
+const js_lexer = bun.js_lexer;
+const OOM = bun.OOM;
+const unicode_replacement = strings.unicode_replacement;
+const Environment = bun.Environment;
+const log = strings.log;
+const firstNonASCII16 = strings.firstNonASCII16;
+const firstNonASCII = strings.firstNonASCII;
+
+const assert = bun.assert;
+const ascii_vector_size = strings.ascii_vector_size;
+const AsciiVector = strings.AsciiVector;
diff --git a/src/string/visible.zig b/src/string/visible.zig
new file mode 100644
index 0000000000..295124c639
--- /dev/null
+++ b/src/string/visible.zig
@@ -0,0 +1,831 @@
+/// Return whether `cp` is one of the code points this file treats as zero width.
+///
+/// Always checked: values <= 0x1f and 0x7f...0x9f. When `T` is one byte wide the
+/// function stops there (the check is resolved at comptime). For wider `T` the
+/// combining-mark, invisible-character, variation-selector and BOM ranges below
+/// are checked as well.
+pub fn isZeroWidthCodepointType(comptime T: type, cp: T) bool {
+    if (cp <= 0x1f) {
+        return true;
+    }
+
+    if (cp >= 0x7f and cp <= 0x9f) {
+        // C1 control characters
+        return true;
+    }
+
+    if (comptime @sizeOf(T) == 1) {
+        return false;
+    }
+
+    if (cp >= 0x300 and cp <= 0x36f) {
+        // Combining Diacritical Marks
+        return true;
+    }
+
+    if (cp >= 0x200b and cp <= 0x200f) {
+        // Modifying Invisible Characters
+        return true;
+    }
+
+    if (cp >= 0x20d0 and cp <= 0x20ff)
+        // Combining Diacritical Marks for Symbols
+        return true;
+
+    if (cp >= 0xfe00 and cp <= 0xfe0f)
+        // Variation Selectors
+        return true;
+    if (cp >= 0xfe20 and cp <= 0xfe2f)
+        // Combining Half Marks
+        return true;
+
+    if (cp == 0xfeff)
+        // Zero Width No-Break Space (BOM, ZWNBSP)
+        return true;
+
+    if (cp >= 0xe0100 and cp <= 0xe01ef)
+        // Variation Selectors
+        return true;
+
+    return false;
+}
+
+/// Official unicode reference: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
+/// Tag legend:
+///  - `W` (wide) -> true
+///  - `F` (full-width) -> true
+///  - `H` (half-width) -> false
+///  - `N` (neutral) -> false
+///  - `Na` (narrow) -> false
+///  - `A` (ambiguous) -> false?
+///
+/// To regenerate the switch body list, run:
+/// ```js
+///    [...(await (await fetch("https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt")).text()).matchAll(/^([\dA-F]{4,})(?:\.\.([\dA-F]{4,}))?\s+;\s+(\w+)\s+#\s+(.*?)\s*$/gm)].flatMap(([,start, end, type, comment]) => (
+///      (['W', 'F'].includes(type)) ? [` ${(end ? `0x${start}...0x${end}` : `0x${start}`)}, // ${''.padStart(17 - start.length - (end ?
end.length + 5 : 0))}[${type}] ${comment}`] : [] +/// )).join('\n') +/// ``` +pub fn isFullWidthCodepointType(comptime T: type, cp: T) bool { + if (!(cp >= 0x1100)) { + return false; + } + + return switch (cp) { + 0x1100...0x115F, // [W] Lo [96] HANGUL CHOSEONG KIYEOK..HANGUL CHOSEONG FILLER + 0x231A...0x231B, // [W] So [2] WATCH..HOURGLASS + 0x2329, // [W] Ps LEFT-POINTING ANGLE BRACKET + 0x232A, // [W] Pe RIGHT-POINTING ANGLE BRACKET + 0x23E9...0x23EC, // [W] So [4] BLACK RIGHT-POINTING DOUBLE TRIANGLE..BLACK DOWN-POINTING DOUBLE TRIANGLE + 0x23F0, // [W] So ALARM CLOCK + 0x23F3, // [W] So HOURGLASS WITH FLOWING SAND + 0x25FD...0x25FE, // [W] Sm [2] WHITE MEDIUM SMALL SQUARE..BLACK MEDIUM SMALL SQUARE + 0x2614...0x2615, // [W] So [2] UMBRELLA WITH RAIN DROPS..HOT BEVERAGE + 0x2648...0x2653, // [W] So [12] ARIES..PISCES + 0x267F, // [W] So WHEELCHAIR SYMBOL + 0x2693, // [W] So ANCHOR + 0x26A1, // [W] So HIGH VOLTAGE SIGN + 0x26AA...0x26AB, // [W] So [2] MEDIUM WHITE CIRCLE..MEDIUM BLACK CIRCLE + 0x26BD...0x26BE, // [W] So [2] SOCCER BALL..BASEBALL + 0x26C4...0x26C5, // [W] So [2] SNOWMAN WITHOUT SNOW..SUN BEHIND CLOUD + 0x26CE, // [W] So OPHIUCHUS + 0x26D4, // [W] So NO ENTRY + 0x26EA, // [W] So CHURCH + 0x26F2...0x26F3, // [W] So [2] FOUNTAIN..FLAG IN HOLE + 0x26F5, // [W] So SAILBOAT + 0x26FA, // [W] So TENT + 0x26FD, // [W] So FUEL PUMP + 0x2705, // [W] So WHITE HEAVY CHECK MARK + 0x270A...0x270B, // [W] So [2] RAISED FIST..RAISED HAND + 0x2728, // [W] So SPARKLES + 0x274C, // [W] So CROSS MARK + 0x274E, // [W] So NEGATIVE SQUARED CROSS MARK + 0x2753...0x2755, // [W] So [3] BLACK QUESTION MARK ORNAMENT..WHITE EXCLAMATION MARK ORNAMENT + 0x2757, // [W] So HEAVY EXCLAMATION MARK SYMBOL + 0x2795...0x2797, // [W] So [3] HEAVY PLUS SIGN..HEAVY DIVISION SIGN + 0x27B0, // [W] So CURLY LOOP + 0x27BF, // [W] So DOUBLE CURLY LOOP + 0x2B1B...0x2B1C, // [W] So [2] BLACK LARGE SQUARE..WHITE LARGE SQUARE + 0x2B50, // [W] So WHITE MEDIUM STAR + 0x2B55, // [W] So HEAVY LARGE 
CIRCLE + 0x2E80...0x2E99, // [W] So [26] CJK RADICAL REPEAT..CJK RADICAL RAP + 0x2E9B...0x2EF3, // [W] So [89] CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE + 0x2F00...0x2FD5, // [W] So [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE + 0x2FF0...0x2FFF, // [W] So [16] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION + 0x3000, // [F] Zs IDEOGRAPHIC SPACE + 0x3001...0x3003, // [W] Po [3] IDEOGRAPHIC COMMA..DITTO MARK + 0x3004, // [W] So JAPANESE INDUSTRIAL STANDARD SYMBOL + 0x3005, // [W] Lm IDEOGRAPHIC ITERATION MARK + 0x3006, // [W] Lo IDEOGRAPHIC CLOSING MARK + 0x3007, // [W] Nl IDEOGRAPHIC NUMBER ZERO + 0x3008, // [W] Ps LEFT ANGLE BRACKET + 0x3009, // [W] Pe RIGHT ANGLE BRACKET + 0x300A, // [W] Ps LEFT DOUBLE ANGLE BRACKET + 0x300B, // [W] Pe RIGHT DOUBLE ANGLE BRACKET + 0x300C, // [W] Ps LEFT CORNER BRACKET + 0x300D, // [W] Pe RIGHT CORNER BRACKET + 0x300E, // [W] Ps LEFT WHITE CORNER BRACKET + 0x300F, // [W] Pe RIGHT WHITE CORNER BRACKET + 0x3010, // [W] Ps LEFT BLACK LENTICULAR BRACKET + 0x3011, // [W] Pe RIGHT BLACK LENTICULAR BRACKET + 0x3012...0x3013, // [W] So [2] POSTAL MARK..GETA MARK + 0x3014, // [W] Ps LEFT TORTOISE SHELL BRACKET + 0x3015, // [W] Pe RIGHT TORTOISE SHELL BRACKET + 0x3016, // [W] Ps LEFT WHITE LENTICULAR BRACKET + 0x3017, // [W] Pe RIGHT WHITE LENTICULAR BRACKET + 0x3018, // [W] Ps LEFT WHITE TORTOISE SHELL BRACKET + 0x3019, // [W] Pe RIGHT WHITE TORTOISE SHELL BRACKET + 0x301A, // [W] Ps LEFT WHITE SQUARE BRACKET + 0x301B, // [W] Pe RIGHT WHITE SQUARE BRACKET + 0x301C, // [W] Pd WAVE DASH + 0x301D, // [W] Ps REVERSED DOUBLE PRIME QUOTATION MARK + 0x301E...0x301F, // [W] Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK + 0x3020, // [W] So POSTAL MARK FACE + 0x3021...0x3029, // [W] Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE + 0x302A...0x302D, // [W] Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK + 0x302E...0x302F, // [W] Mc [2] HANGUL SINGLE DOT 
TONE MARK..HANGUL DOUBLE DOT TONE MARK + 0x3030, // [W] Pd WAVY DASH + 0x3031...0x3035, // [W] Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF + 0x3036...0x3037, // [W] So [2] CIRCLED POSTAL MARK..IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL + 0x3038...0x303A, // [W] Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY + 0x303B, // [W] Lm VERTICAL IDEOGRAPHIC ITERATION MARK + 0x303C, // [W] Lo MASU MARK + 0x303D, // [W] Po PART ALTERNATION MARK + 0x303E, // [W] So IDEOGRAPHIC VARIATION INDICATOR + 0x3041...0x3096, // [W] Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE + 0x3099...0x309A, // [W] Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK + 0x309B...0x309C, // [W] Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK + 0x309D...0x309E, // [W] Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK + 0x309F, // [W] Lo HIRAGANA DIGRAPH YORI + 0x30A0, // [W] Pd KATAKANA-HIRAGANA DOUBLE HYPHEN + 0x30A1...0x30FA, // [W] Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO + 0x30FB, // [W] Po KATAKANA MIDDLE DOT + 0x30FC...0x30FE, // [W] Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK + 0x30FF, // [W] Lo KATAKANA DIGRAPH KOTO + 0x3105...0x312F, // [W] Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN + 0x3131...0x318E, // [W] Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE + 0x3190...0x3191, // [W] So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK + 0x3192...0x3195, // [W] No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK + 0x3196...0x319F, // [W] So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK + 0x31A0...0x31BF, // [W] Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH + 0x31C0...0x31E3, // [W] So [36] CJK STROKE T..CJK STROKE Q + 0x31EF, // [W] So IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION + 0x31F0...0x31FF, // [W] Lo [16] KATAKANA 
LETTER SMALL KU..KATAKANA LETTER SMALL RO + 0x3200...0x321E, // [W] So [31] PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED KOREAN CHARACTER O HU + 0x3220...0x3229, // [W] No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN + 0x322A...0x3247, // [W] So [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO + 0x3250, // [W] So PARTNERSHIP SIGN + 0x3251...0x325F, // [W] No [15] CIRCLED NUMBER TWENTY ONE..CIRCLED NUMBER THIRTY FIVE + 0x3260...0x327F, // [W] So [32] CIRCLED HANGUL KIYEOK..KOREAN STANDARD SYMBOL + 0x3280...0x3289, // [W] No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN + 0x328A...0x32B0, // [W] So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT + 0x32B1...0x32BF, // [W] No [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY + 0x32C0...0x32FF, // [W] So [64] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..SQUARE ERA NAME REIWA + 0x3300...0x33FF, // [W] So [256] SQUARE APAATO..SQUARE GAL + 0x3400...0x4DBF, // [W] Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF + 0x4E00...0x9FFF, // [W] Lo [20992] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFF + 0xA000...0xA014, // [W] Lo [21] YI SYLLABLE IT..YI SYLLABLE E + 0xA015, // [W] Lm YI SYLLABLE WU + 0xA016...0xA48C, // [W] Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR + 0xA490...0xA4C6, // [W] So [55] YI RADICAL QOT..YI RADICAL KE + 0xA960...0xA97C, // [W] Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH + 0xAC00...0xD7A3, // [W] Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH + 0xF900...0xFA6D, // [W] Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D + 0xFA6E...0xFA6F, // [W] Cn [2] .. + 0xFA70...0xFAD9, // [W] Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 + 0xFADA...0xFAFF, // [W] Cn [38] .. 
+ 0xFE10...0xFE16, // [W] Po [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK + 0xFE17, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET + 0xFE18, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET + 0xFE19, // [W] Po PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS + 0xFE30, // [W] Po PRESENTATION FORM FOR VERTICAL TWO DOT LEADER + 0xFE31...0xFE32, // [W] Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH + 0xFE33...0xFE34, // [W] Pc [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE + 0xFE35, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS + 0xFE36, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS + 0xFE37, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET + 0xFE38, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET + 0xFE39, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET + 0xFE3A, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET + 0xFE3B, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET + 0xFE3C, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET + 0xFE3D, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET + 0xFE3E, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET + 0xFE3F, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET + 0xFE40, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET + 0xFE41, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET + 0xFE42, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET + 0xFE43, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET + 0xFE44, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET + 0xFE45...0xFE46, // [W] Po [2] SESAME DOT..WHITE SESAME DOT + 0xFE47, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET + 0xFE48, // [W] Pe 
PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET + 0xFE49...0xFE4C, // [W] Po [4] DASHED OVERLINE..DOUBLE WAVY OVERLINE + 0xFE4D...0xFE4F, // [W] Pc [3] DASHED LOW LINE..WAVY LOW LINE + 0xFE50...0xFE52, // [W] Po [3] SMALL COMMA..SMALL FULL STOP + 0xFE54...0xFE57, // [W] Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK + 0xFE58, // [W] Pd SMALL EM DASH + 0xFE59, // [W] Ps SMALL LEFT PARENTHESIS + 0xFE5A, // [W] Pe SMALL RIGHT PARENTHESIS + 0xFE5B, // [W] Ps SMALL LEFT CURLY BRACKET + 0xFE5C, // [W] Pe SMALL RIGHT CURLY BRACKET + 0xFE5D, // [W] Ps SMALL LEFT TORTOISE SHELL BRACKET + 0xFE5E, // [W] Pe SMALL RIGHT TORTOISE SHELL BRACKET + 0xFE5F...0xFE61, // [W] Po [3] SMALL NUMBER SIGN..SMALL ASTERISK + 0xFE62, // [W] Sm SMALL PLUS SIGN + 0xFE63, // [W] Pd SMALL HYPHEN-MINUS + 0xFE64...0xFE66, // [W] Sm [3] SMALL LESS-THAN SIGN..SMALL EQUALS SIGN + 0xFE68, // [W] Po SMALL REVERSE SOLIDUS + 0xFE69, // [W] Sc SMALL DOLLAR SIGN + 0xFE6A...0xFE6B, // [W] Po [2] SMALL PERCENT SIGN..SMALL COMMERCIAL AT + 0xFF01...0xFF03, // [F] Po [3] FULLWIDTH EXCLAMATION MARK..FULLWIDTH NUMBER SIGN + 0xFF04, // [F] Sc FULLWIDTH DOLLAR SIGN + 0xFF05...0xFF07, // [F] Po [3] FULLWIDTH PERCENT SIGN..FULLWIDTH APOSTROPHE + 0xFF08, // [F] Ps FULLWIDTH LEFT PARENTHESIS + 0xFF09, // [F] Pe FULLWIDTH RIGHT PARENTHESIS + 0xFF0A, // [F] Po FULLWIDTH ASTERISK + 0xFF0B, // [F] Sm FULLWIDTH PLUS SIGN + 0xFF0C, // [F] Po FULLWIDTH COMMA + 0xFF0D, // [F] Pd FULLWIDTH HYPHEN-MINUS + 0xFF0E...0xFF0F, // [F] Po [2] FULLWIDTH FULL STOP..FULLWIDTH SOLIDUS + 0xFF10...0xFF19, // [F] Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE + 0xFF1A...0xFF1B, // [F] Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON + 0xFF1C...0xFF1E, // [F] Sm [3] FULLWIDTH LESS-THAN SIGN..FULLWIDTH GREATER-THAN SIGN + 0xFF1F...0xFF20, // [F] Po [2] FULLWIDTH QUESTION MARK..FULLWIDTH COMMERCIAL AT + 0xFF21...0xFF3A, // [F] Lu [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z + 0xFF3B, // [F] Ps FULLWIDTH LEFT SQUARE 
BRACKET + 0xFF3C, // [F] Po FULLWIDTH REVERSE SOLIDUS + 0xFF3D, // [F] Pe FULLWIDTH RIGHT SQUARE BRACKET + 0xFF3E, // [F] Sk FULLWIDTH CIRCUMFLEX ACCENT + 0xFF3F, // [F] Pc FULLWIDTH LOW LINE + 0xFF40, // [F] Sk FULLWIDTH GRAVE ACCENT + 0xFF41...0xFF5A, // [F] Ll [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z + 0xFF5B, // [F] Ps FULLWIDTH LEFT CURLY BRACKET + 0xFF5C, // [F] Sm FULLWIDTH VERTICAL LINE + 0xFF5D, // [F] Pe FULLWIDTH RIGHT CURLY BRACKET + 0xFF5E, // [F] Sm FULLWIDTH TILDE + 0xFF5F, // [F] Ps FULLWIDTH LEFT WHITE PARENTHESIS + 0xFF60, // [F] Pe FULLWIDTH RIGHT WHITE PARENTHESIS + 0xFFE0...0xFFE1, // [F] Sc [2] FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN + 0xFFE2, // [F] Sm FULLWIDTH NOT SIGN + 0xFFE3, // [F] Sk FULLWIDTH MACRON + 0xFFE4, // [F] So FULLWIDTH BROKEN BAR + 0xFFE5...0xFFE6, // [F] Sc [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN + 0x16FE0...0x16FE1, // [W] Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK + 0x16FE2, // [W] Po OLD CHINESE HOOK MARK + 0x16FE3, // [W] Lm OLD CHINESE ITERATION MARK + 0x16FE4, // [W] Mn KHITAN SMALL SCRIPT FILLER + 0x16FF0...0x16FF1, // [W] Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY + 0x17000...0x187F7, // [W] Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 + 0x18800...0x18AFF, // [W] Lo [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768 + 0x18B00...0x18CD5, // [W] Lo [470] KHITAN SMALL SCRIPT CHARACTER-18B00..KHITAN SMALL SCRIPT CHARACTER-18CD5 + 0x18D00...0x18D08, // [W] Lo [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08 + 0x1AFF0...0x1AFF3, // [W] Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 + 0x1AFF5...0x1AFFB, // [W] Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 + 0x1AFFD...0x1AFFE, // [W] Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 + 0x1B000...0x1B0FF, // [W] Lo [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2 + 0x1B100...0x1B122, // 
[W] Lo [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU + 0x1B132, // [W] Lo HIRAGANA LETTER SMALL KO + 0x1B150...0x1B152, // [W] Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO + 0x1B155, // [W] Lo KATAKANA LETTER SMALL KO + 0x1B164...0x1B167, // [W] Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N + 0x1B170...0x1B2FB, // [W] Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB + 0x1F004, // [W] So MAHJONG TILE RED DRAGON + 0x1F0CF, // [W] So PLAYING CARD BLACK JOKER + 0x1F18E, // [W] So NEGATIVE SQUARED AB + 0x1F191...0x1F19A, // [W] So [10] SQUARED CL..SQUARED VS + 0x1F200...0x1F202, // [W] So [3] SQUARE HIRAGANA HOKA..SQUARED KATAKANA SA + 0x1F210...0x1F23B, // [W] So [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D + 0x1F240...0x1F248, // [W] So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557 + 0x1F250...0x1F251, // [W] So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT + 0x1F260...0x1F265, // [W] So [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI + 0x1F300...0x1F320, // [W] So [33] CYCLONE..SHOOTING STAR + 0x1F32D...0x1F335, // [W] So [9] HOT DOG..CACTUS + 0x1F337...0x1F37C, // [W] So [70] TULIP..BABY BOTTLE + 0x1F37E...0x1F393, // [W] So [22] BOTTLE WITH POPPING CORK..GRADUATION CAP + 0x1F3A0...0x1F3CA, // [W] So [43] CAROUSEL HORSE..SWIMMER + 0x1F3CF...0x1F3D3, // [W] So [5] CRICKET BAT AND BALL..TABLE TENNIS PADDLE AND BALL + 0x1F3E0...0x1F3F0, // [W] So [17] HOUSE BUILDING..EUROPEAN CASTLE + 0x1F3F4, // [W] So WAVING BLACK FLAG + 0x1F3F8...0x1F3FA, // [W] So [3] BADMINTON RACQUET AND SHUTTLECOCK..AMPHORA + 0x1F3FB...0x1F3FF, // [W] Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 + 0x1F400...0x1F43E, // [W] So [63] RAT..PAW PRINTS + 0x1F440, // [W] So EYES + 0x1F442...0x1F4FC, // [W] So [187] EAR..VIDEOCASSETTE + 0x1F4FF...0x1F53D, // [W] So [63] PRAYER BEADS..DOWN-POINTING SMALL RED TRIANGLE + 
0x1F54B...0x1F54E, // [W] So [4] KAABA..MENORAH WITH NINE BRANCHES + 0x1F550...0x1F567, // [W] So [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY + 0x1F57A, // [W] So MAN DANCING + 0x1F595...0x1F596, // [W] So [2] REVERSED HAND WITH MIDDLE FINGER EXTENDED..RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS + 0x1F5A4, // [W] So BLACK HEART + 0x1F5FB...0x1F5FF, // [W] So [5] MOUNT FUJI..MOYAI + 0x1F600...0x1F64F, // [W] So [80] GRINNING FACE..PERSON WITH FOLDED HANDS + 0x1F680...0x1F6C5, // [W] So [70] ROCKET..LEFT LUGGAGE + 0x1F6CC, // [W] So SLEEPING ACCOMMODATION + 0x1F6D0...0x1F6D2, // [W] So [3] PLACE OF WORSHIP..SHOPPING TROLLEY + 0x1F6D5...0x1F6D7, // [W] So [3] HINDU TEMPLE..ELEVATOR + 0x1F6DC...0x1F6DF, // [W] So [4] WIRELESS..RING BUOY + 0x1F6EB...0x1F6EC, // [W] So [2] AIRPLANE DEPARTURE..AIRPLANE ARRIVING + 0x1F6F4...0x1F6FC, // [W] So [9] SCOOTER..ROLLER SKATE + 0x1F7E0...0x1F7EB, // [W] So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE + 0x1F7F0, // [W] So HEAVY EQUALS SIGN + 0x1F90C...0x1F93A, // [W] So [47] PINCHED FINGERS..FENCER + 0x1F93C...0x1F945, // [W] So [10] WRESTLERS..GOAL NET + 0x1F947...0x1F9FF, // [W] So [185] FIRST PLACE MEDAL..NAZAR AMULET + 0x1FA70...0x1FA7C, // [W] So [13] BALLET SHOES..CRUTCH + 0x1FA80...0x1FA88, // [W] So [9] YO-YO..FLUTE + 0x1FA90...0x1FABD, // [W] So [46] RINGED PLANET..WING + 0x1FABF...0x1FAC5, // [W] So [7] GOOSE..PERSON WITH CROWN + 0x1FACE...0x1FADB, // [W] So [14] MOOSE..PEA POD + 0x1FAE0...0x1FAE8, // [W] So [9] MELTING FACE..SHAKING FACE + 0x1FAF0...0x1FAF8, // [W] So [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND + 0x20000...0x2A6DF, // [W] Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF + 0x2A6E0...0x2A6FF, // [W] Cn [32] .. + 0x2A700...0x2B739, // [W] Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739 + 0x2B73A...0x2B73F, // [W] Cn [6] .. 
+ 0x2B740...0x2B81D, // [W] Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D + 0x2B81E...0x2B81F, // [W] Cn [2] .. + 0x2B820...0x2CEA1, // [W] Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1 + 0x2CEA2...0x2CEAF, // [W] Cn [14] .. + 0x2CEB0...0x2EBE0, // [W] Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 + 0x2EBE1...0x2EBEF, // [W] Cn [15] .. + 0x2EBF0...0x2EE5D, // [W] Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D + 0x2EE5E...0x2F7FF, // [W] Cn [2466] .. + 0x2F800...0x2FA1D, // [W] Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D + 0x2FA1E...0x2FA1F, // [W] Cn [2] .. + 0x2FA20...0x2FFFD, // [W] Cn [1502] .. + 0x30000...0x3134A, // [W] Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A + 0x3134B...0x3134F, // [W] Cn [5] .. + 0x31350...0x323AF, // [W] Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF + 0x323B0...0x3FFFD, // [W] Cn [56398] .. + => true, + else => false, + }; +} + +pub fn isAmgiguousCodepointType(comptime T: type, cp: T) bool { + return switch (cp) { + 0xA1, + 0xA4, + 0xA7, + 0xA8, + 0xAA, + 0xAD, + 0xAE, + 0xB0...0xB4, + 0xB6...0xBA, + 0xBC...0xBF, + 0xC6, + 0xD0, + 0xD7, + 0xD8, + 0xDE...0xE1, + 0xE6, + 0xE8...0xEA, + 0xEC, + 0xED, + 0xF0, + 0xF2, + 0xF3, + 0xF7...0xFA, + 0xFC, + 0xFE, + 0x101, + 0x111, + 0x113, + 0x11B, + 0x126, + 0x127, + 0x12B, + 0x131...0x133, + 0x138, + 0x13F...0x142, + 0x144, + 0x148...0x14B, + 0x14D, + 0x152, + 0x153, + 0x166, + 0x167, + 0x16B, + 0x1CE, + 0x1D0, + 0x1D2, + 0x1D4, + 0x1D6, + 0x1D8, + 0x1DA, + 0x1DC, + 0x251, + 0x261, + 0x2C4, + 0x2C7, + 0x2C9...0x2CB, + 0x2CD, + 0x2D0, + 0x2D8...0x2DB, + 0x2DD, + 0x2DF, + 0x300...0x36F, + 0x391...0x3A1, + 0x3A3...0x3A9, + 0x3B1...0x3C1, + 0x3C3...0x3C9, + 0x401, + 0x410...0x44F, + 0x451, + 0x2010, + 0x2013...0x2016, + 0x2018, + 0x2019, + 0x201C, + 0x201D, + 0x2020...0x2022, + 0x2024...0x2027, + 0x2030, + 0x2032, + 0x2033, + 0x2035, + 0x203B, 
+ 0x203E, + 0x2074, + 0x207F, + 0x2081...0x2084, + 0x20AC, + 0x2103, + 0x2105, + 0x2109, + 0x2113, + 0x2116, + 0x2121, + 0x2122, + 0x2126, + 0x212B, + 0x2153, + 0x2154, + 0x215B...0x215E, + 0x2160...0x216B, + 0x2170...0x2179, + 0x2189, + 0x2190...0x2199, + 0x21B8, + 0x21B9, + 0x21D2, + 0x21D4, + 0x21E7, + 0x2200, + 0x2202, + 0x2203, + 0x2207, + 0x2208, + 0x220B, + 0x220F, + 0x2211, + 0x2215, + 0x221A, + 0x221D...0x2220, + 0x2223, + 0x2225, + 0x2227...0x222C, + 0x222E, + 0x2234...0x2237, + 0x223C, + 0x223D, + 0x2248, + 0x224C, + 0x2252, + 0x2260, + 0x2261, + 0x2264...0x2267, + 0x226A, + 0x226B, + 0x226E, + 0x226F, + 0x2282, + 0x2283, + 0x2286, + 0x2287, + 0x2295, + 0x2299, + 0x22A5, + 0x22BF, + 0x2312, + 0x2460...0x24E9, + 0x24EB...0x254B, + 0x2550...0x2573, + 0x2580...0x258F, + 0x2592...0x2595, + 0x25A0, + 0x25A1, + 0x25A3...0x25A9, + 0x25B2, + 0x25B3, + 0x25B6, + 0x25B7, + 0x25BC, + 0x25BD, + 0x25C0, + 0x25C1, + 0x25C6...0x25C8, + 0x25CB, + 0x25CE...0x25D1, + 0x25E2...0x25E5, + 0x25EF, + 0x2605, + 0x2606, + 0x2609, + 0x260E, + 0x260F, + 0x261C, + 0x261E, + 0x2640, + 0x2642, + 0x2660, + 0x2661, + 0x2663...0x2665, + 0x2667...0x266A, + 0x266C, + 0x266D, + 0x266F, + 0x269E, + 0x269F, + 0x26BF, + 0x26C6...0x26CD, + 0x26CF...0x26D3, + 0x26D5...0x26E1, + 0x26E3, + 0x26E8, + 0x26E9, + 0x26EB...0x26F1, + 0x26F4, + 0x26F6...0x26F9, + 0x26FB, + 0x26FC, + 0x26FE, + 0x26FF, + 0x273D, + 0x2776...0x277F, + 0x2B56...0x2B59, + 0x3248...0x324F, + 0xE000...0xF8FF, + 0xFE00...0xFE0F, + 0xFFFD, + 0x1F100...0x1F10A, + 0x1F110...0x1F12D, + 0x1F130...0x1F169, + 0x1F170...0x1F18D, + 0x1F18F, + 0x1F190, + 0x1F19B...0x1F1AC, + 0xE0100...0xE01EF, + 0xF0000...0xFFFFD, + 0x100000...0x10FFFD, + => true, + else => false, + }; +} + +pub fn visibleCodepointWidth(cp: u32, ambiguousAsWide: bool) u3_fast { + return visibleCodepointWidthType(u32, cp, ambiguousAsWide); +} + +pub fn visibleCodepointWidthMaybeEmoji(cp: u32, maybe_emoji: bool, ambiguousAsWide: bool) u3_fast { + // UCHAR_EMOJI=57, + if 
(maybe_emoji and icu_hasBinaryProperty(cp, 57)) { + return 2; + } + return visibleCodepointWidth(cp, ambiguousAsWide); +} + +pub fn visibleCodepointWidthType(comptime T: type, cp: T, ambiguousAsWide: bool) u3_fast { + if (isZeroWidthCodepointType(T, cp)) { + return 0; + } + + if (isFullWidthCodepointType(T, cp)) { + return 2; + } + if (ambiguousAsWide and isAmgiguousCodepointType(T, cp)) { + return 2; + } + + return 1; +} + +pub const visible = struct { + // Ref: https://cs.stanford.edu/people/miles/iso8859.html + fn visibleLatin1Width(input_: []const u8) usize { + var length: usize = 0; + var input = input_; + const input_end_ptr = input.ptr + input.len - (input.len % 16); + var input_ptr = input.ptr; + while (input_ptr != input_end_ptr) { + const input_chunk: [16]u8 = input_ptr[0..16].*; + const sums: @Vector(16, u8) = [16]u8{ + visibleLatin1WidthScalar(input_chunk[0]), + visibleLatin1WidthScalar(input_chunk[1]), + visibleLatin1WidthScalar(input_chunk[2]), + visibleLatin1WidthScalar(input_chunk[3]), + visibleLatin1WidthScalar(input_chunk[4]), + visibleLatin1WidthScalar(input_chunk[5]), + visibleLatin1WidthScalar(input_chunk[6]), + visibleLatin1WidthScalar(input_chunk[7]), + visibleLatin1WidthScalar(input_chunk[8]), + visibleLatin1WidthScalar(input_chunk[9]), + visibleLatin1WidthScalar(input_chunk[10]), + visibleLatin1WidthScalar(input_chunk[11]), + visibleLatin1WidthScalar(input_chunk[12]), + visibleLatin1WidthScalar(input_chunk[13]), + visibleLatin1WidthScalar(input_chunk[14]), + visibleLatin1WidthScalar(input_chunk[15]), + }; + length += @reduce(.Add, sums); + input_ptr += 16; + } + input.len %= 16; + input.ptr = input_ptr; + + for (input) |byte| length += visibleLatin1WidthScalar(byte); + return length; + } + + fn visibleLatin1WidthScalar(c: u8) u1 { + return if ((c >= 127 and c <= 159) or c < 32) 0 else 1; + } + + fn visibleLatin1WidthExcludeANSIColors(input_: anytype) usize { + var length: usize = 0; + var input = input_; + + const ElementType = 
std.meta.Child(@TypeOf(input_)); + const indexFn = if (comptime ElementType == u8) strings.indexOfCharUsize else strings.indexOfChar16Usize; + + while (indexFn(input, '\x1b')) |i| { + length += visibleLatin1Width(input[0..i]); + input = input[i..]; + + if (input.len < 3) return length; + + if (input[1] == '[') { + const end = indexFn(input[2..], 'm') orelse return length; + input = input[end + 3 ..]; + } else { + input = input[1..]; + } + } + + length += visibleLatin1Width(input); + + return length; + } + + fn visibleUTF8WidthFn(input: []const u8, comptime asciiFn: anytype) usize { + var bytes = input; + var len: usize = 0; + while (bun.strings.firstNonASCII(bytes)) |i| { + len += asciiFn(bytes[0..i]); + const this_chunk = bytes[i..]; + const byte = this_chunk[0]; + + const skip = bun.strings.wtf8ByteSequenceLengthWithInvalid(byte); + const cp_bytes: [4]u8 = switch (@min(@as(usize, skip), this_chunk.len)) { + inline 1, 2, 3, 4 => |cp_len| .{ + byte, + if (comptime cp_len > 1) this_chunk[1] else 0, + if (comptime cp_len > 2) this_chunk[2] else 0, + if (comptime cp_len > 3) this_chunk[3] else 0, + }, + else => unreachable, + }; + + const cp = decodeWTF8RuneTMultibyte(&cp_bytes, skip, u32, unicode_replacement); + len += visibleCodepointWidth(cp, false); + + bytes = bytes[@min(i + skip, bytes.len)..]; + } + + len += asciiFn(bytes); + + return len; + } + + fn visibleUTF16WidthFn(input_: []const u16, exclude_ansi_colors: bool, ambiguousAsWide: bool) usize { + var input = input_; + var len: usize = 0; + var prev: ?u21 = 0; + var break_state = grapheme.BreakState{}; + var break_start: u21 = 0; + var saw_1b = false; + var saw_bracket = false; + var stretch_len: usize = 0; + + while (true) { + { + const idx = firstNonASCII16([]const u16, input) orelse input.len; + for (0..idx) |j| { + const cp = input[j]; + defer prev = cp; + + if (saw_bracket) { + if (cp == 'm') { + saw_1b = false; + saw_bracket = false; + stretch_len = 0; + continue; + } + stretch_len += 
visibleCodepointWidth(cp, ambiguousAsWide); + continue; + } + if (saw_1b) { + if (cp == '[') { + saw_bracket = true; + stretch_len = 0; + continue; + } + len += visibleCodepointWidth(cp, ambiguousAsWide); + continue; + } + if (!exclude_ansi_colors or cp != 0x1b) { + if (prev) |prev_| { + const should_break = grapheme.graphemeBreak(prev_, cp, &break_state); + if (should_break) { + len += visibleCodepointWidthMaybeEmoji(break_start, cp == 0xFE0F, ambiguousAsWide); + break_start = cp; + } else { + // + } + } else { + len += visibleCodepointWidth(cp, ambiguousAsWide); + break_start = cp; + } + continue; + } + saw_1b = true; + continue; + } + len += stretch_len; + input = input[idx..]; + } + if (input.len == 0) break; + const replacement = utf16CodepointWithFFFD([]const u16, input); + defer input = input[replacement.len..]; + if (replacement.fail) continue; + const cp: u21 = @intCast(replacement.code_point); + defer prev = cp; + + if (prev) |prev_| { + const should_break = grapheme.graphemeBreak(prev_, cp, &break_state); + if (should_break) { + len += visibleCodepointWidthMaybeEmoji(break_start, cp == 0xFE0F, ambiguousAsWide); + break_start = cp; + } + } else { + len += visibleCodepointWidth(cp, ambiguousAsWide); + break_start = cp; + } + } + if (break_start > 0) { + len += visibleCodepointWidthMaybeEmoji(break_start, (prev orelse 0) == 0xFE0F, ambiguousAsWide); + } + return len; + } + + fn visibleLatin1WidthFn(input: []const u8) usize { + return visibleLatin1Width(input); + } + + pub const width = struct { + pub fn latin1(input: []const u8) usize { + return visibleLatin1Width(input); + } + + pub fn utf8(input: []const u8) usize { + return visibleUTF8WidthFn(input, visibleLatin1Width); + } + + pub fn utf16(input: []const u16, ambiguousAsWide: bool) usize { + return visibleUTF16WidthFn(input, false, ambiguousAsWide); + } + + pub const exclude_ansi_colors = struct { + pub fn latin1(input: []const u8) usize { + return visibleLatin1WidthExcludeANSIColors(input); + } + + pub 
fn utf8(input: []const u8) usize { + return visibleUTF8WidthFn(input, visibleLatin1WidthExcludeANSIColors); + } + + pub fn utf16(input: []const u16, ambiguousAsWide: bool) usize { + return visibleUTF16WidthFn(input, true, ambiguousAsWide); + } + }; + }; +}; + +// extern "C" bool icu_hasBinaryProperty(UChar32 cp, unsigned int prop) +extern fn icu_hasBinaryProperty(c: u32, which: c_uint) bool; + +const bun = @import("bun"); +const std = @import("std"); +const u3_fast = strings.u3_fast; +const decodeWTF8RuneTMultibyte = strings.decodeWTF8RuneTMultibyte; +const grapheme = strings.grapheme; +const strings = bun.strings; +const unicode_replacement = strings.unicode_replacement; +const firstNonASCII16 = strings.firstNonASCII16; +const firstNonASCII = strings.firstNonASCII; +const utf16CodepointWithFFFD = strings.utf16CodepointWithFFFD; diff --git a/src/string_immutable.zig b/src/string_immutable.zig index d0443bde93..63a1d8991a 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -1,14 +1,3 @@ -const std = @import("std"); -const expect = std.testing.expect; -const Environment = @import("./env.zig"); -const string = bun.string; -const CodePoint = bun.CodePoint; -const bun = @import("bun"); -const log = bun.Output.scoped(.STR, true); -const js_lexer = @import("./js_lexer.zig"); -const grapheme = @import("./grapheme.zig"); -const OOM = bun.OOM; - /// memmem is provided by libc on posix, but implemented in zig for windows. 
pub const memmem = bun.sys.workaround_symbols.memmem; @@ -57,44 +46,6 @@ pub fn containsCaseInsensitiveASCII(self: string, str: string) callconv(bun.call return false; } -pub fn removeLeadingDotSlash(slice: []const u8) callconv(bun.callconv_inline) []const u8 { - if (slice.len >= 2) { - if ((@as(u16, @bitCast(slice[0..2].*)) == comptime std.mem.readInt(u16, "./", .little)) or - (Environment.isWindows and @as(u16, @bitCast(slice[0..2].*)) == comptime std.mem.readInt(u16, ".\\", .little))) - { - return slice[2..]; - } - } - return slice; -} - -// TODO: remove this -pub const w = toUTF16Literal; - -pub fn toUTF16Literal(comptime str: []const u8) [:0]const u16 { - return literal(u16, str); -} - -pub fn literal(comptime T: type, comptime str: []const u8) *const [literalLength(T, str):0]T { - const Holder = struct { - pub const value = switch (T) { - u8 => (str[0..str.len].* ++ .{0})[0..str.len :0], - u16 => std.unicode.utf8ToUtf16LeStringLiteral(str), - else => @compileError("unsupported type " ++ @typeName(T) ++ " in strings.literal() call."), - }; - }; - - return Holder.value; -} - -fn literalLength(comptime T: type, comptime str: string) usize { - return comptime switch (T) { - u8 => str.len, - u16 => std.unicode.calcUtf16LeLen(str) catch unreachable, - else => 0, // let other errors report first - }; -} - pub const OptionalUsize = std.meta.Int(.unsigned, @bitSizeOf(usize) - 1); pub fn indexOfAny(slice: string, comptime str: []const u8) ?OptionalUsize { return switch (comptime str.len) { @@ -551,43 +502,6 @@ pub const SplitIterator = struct { } }; -// -- -// This is faster when the string is found, by about 2x for a 8 MB file. 
-// It is slower when the string is NOT found -// fn indexOfPosN(comptime T: type, buf: []const u8, start_index: usize, delimiter: []const u8, comptime n: comptime_int) ?usize { -// const k = delimiter.len; -// const V8x32 = @Vector(n, T); -// const V1x32 = @Vector(n, u1); -// const Vbx32 = @Vector(n, bool); -// const first = @splat(n, delimiter[0]); -// const last = @splat(n, delimiter[k - 1]); - -// var end: usize = start_index + n; -// var start: usize = end - n; -// while (end < buf.len) { -// start = end - n; -// const last_end = @min(end + k - 1, buf.len); -// const last_start = last_end - n; - -// // Look for the first character in the delimter -// const first_chunk: V8x32 = buf[start..end][0..n].*; -// const last_chunk: V8x32 = buf[last_start..last_end][0..n].*; -// const mask = @bitCast(V1x32, first == first_chunk) & @bitCast(V1x32, last == last_chunk); - -// if (@reduce(.Or, mask) != 0) { -// // TODO: Use __builtin_clz??? -// for (@as([n]bool, @bitCast(Vbx32, mask))) |match, i| { -// if (match and eqlLong(buf[start + i .. 
start + i + k], delimiter, false)) { -// return start + i; -// } -// } -// } -// end = @min(end + n, buf.len); -// } -// if (start < buf.len) return std.mem.indexOfPos(T, buf, start_index, delimiter); -// return null; // Not found -// } - pub fn cat(allocator: std.mem.Allocator, first: string, second: string) !string { var out = try allocator.alloc(u8, first.len + second.len); bun.copy(u8, out, first); @@ -839,48 +753,6 @@ pub fn endsWithCharOrIsZeroLength(self: string, char: u8) callconv(bun.callconv_ return self.len == 0 or self[self.len - 1] == char; } -pub fn withoutTrailingSlash(this: string) []const u8 { - var href = this; - while (href.len > 1 and (switch (href[href.len - 1]) { - '/', '\\' => true, - else => false, - })) { - href.len -= 1; - } - - return href; -} - -/// Does not strip the device root (C:\ or \\Server\Share\ portion off of the path) -pub fn withoutTrailingSlashWindowsPath(input: string) []const u8 { - if (Environment.isPosix or input.len < 3 or input[1] != ':') - return withoutTrailingSlash(input); - - const root_len = bun.path.windowsFilesystemRoot(input).len + 1; - - var path = input; - while (path.len > root_len and (switch (path[path.len - 1]) { - '/', '\\' => true, - else => false, - })) { - path.len -= 1; - } - - if (Environment.isDebug) - bun.debugAssert(!std.fs.path.isAbsolute(path) or - !isWindowsAbsolutePathMissingDriveLetter(u8, path)); - - return path; -} - -pub fn withoutLeadingSlash(this: string) []const u8 { - return std.mem.trimLeft(u8, this, "/"); -} - -pub fn withoutLeadingPathSeparator(this: string) []const u8 { - return std.mem.trimLeft(u8, this, &.{std.fs.path.sep}); -} - pub fn endsWithAny(self: string, str: string) bool { const end = self[self.len - 1]; for (str) |char| { @@ -1245,2591 +1117,8 @@ pub fn index(self: string, str: string) i32 { } } -pub fn eqlUtf16(comptime self: string, other: []const u16) bool { - if (self.len != other.len) return false; - - if (self.len == 0) return true; - - return 
bun.C.memcmp(bun.cast([*]const u8, self.ptr), bun.cast([*]const u8, other.ptr), self.len * @sizeOf(u16)) == 0; -} - -pub fn toUTF8Alloc(allocator: std.mem.Allocator, js: []const u16) OOM![]u8 { - return try toUTF8AllocWithType(allocator, []const u16, js); -} - -pub fn toUTF8AllocZ(allocator: std.mem.Allocator, js: []const u16) OOM![:0]u8 { - var list = std.ArrayList(u8).init(allocator); - try toUTF8AppendToList(&list, js); - try list.append(0); - return list.items[0 .. list.items.len - 1 :0]; -} - -pub fn appendUTF8MachineWordToUTF16MachineWord(output: *[@sizeOf(usize) / 2]u16, input: *const [@sizeOf(usize) / 2]u8) callconv(bun.callconv_inline) void { - output[0 .. @sizeOf(usize) / 2].* = @as( - [4]u16, - @bitCast(@as( - @Vector(4, u16), - @as(@Vector(4, u8), @bitCast(input[0 .. @sizeOf(usize) / 2].*)), - )), - ); -} - -pub fn copyU8IntoU16(output_: []u16, input_: []const u8) callconv(bun.callconv_inline) void { - const output = output_; - const input = input_; - if (comptime Environment.allow_assert) assert(input.len <= output.len); - - // https://zig.godbolt.org/z/9rTn1orcY - - var input_ptr = input.ptr; - var output_ptr = output.ptr; - - const last_input_ptr = input_ptr + @min(input.len, output.len); - - while (last_input_ptr != input_ptr) { - output_ptr[0] = input_ptr[0]; - output_ptr += 1; - input_ptr += 1; - } -} - -pub fn copyU8IntoU16WithAlignment(comptime alignment: u21, output_: []align(alignment) u16, input_: []const u8) void { - var output = output_; - var input = input_; - const word = @sizeOf(usize) / 2; - if (comptime Environment.allow_assert) assert(input.len <= output.len); - - // un-aligned data access is slow - // so we attempt to align the data - while (!std.mem.isAligned(@intFromPtr(output.ptr), @alignOf(u16)) and input.len >= word) { - output[0] = input[0]; - output = output[1..]; - input = input[1..]; - } - - if (std.mem.isAligned(@intFromPtr(output.ptr), @alignOf(u16)) and input.len > 0) { - copyU8IntoU16(@as([*]u16, 
@alignCast(output.ptr))[0..output.len], input); - return; - } - - for (input, 0..) |c, i| { - output[i] = c; - } -} - -// pub fn copy(output_: []u8, input_: []const u8) callconv(bun.callconv_inline) void { -// var output = output_; -// var input = input_; -// if (comptime Environment.allow_assert) assert(input.len <= output.len); - -// if (input.len > @sizeOf(usize) * 4) { -// comptime var i: usize = 0; -// inline while (i < 4) : (i += 1) { -// appendUTF8MachineWord(output[i * @sizeOf(usize) ..][0..@sizeOf(usize)], input[i * @sizeOf(usize) ..][0..@sizeOf(usize)]); -// } -// output = output[4 * @sizeOf(usize) ..]; -// input = input[4 * @sizeOf(usize) ..]; -// } - -// while (input.len >= @sizeOf(usize)) { -// appendUTF8MachineWord(output[0..@sizeOf(usize)], input[0..@sizeOf(usize)]); -// output = output[@sizeOf(usize)..]; -// input = input[@sizeOf(usize)..]; -// } - -// for (input) |c, i| { -// output[i] = c; -// } -// } - -pub inline fn copyU16IntoU8(output: []u8, input: []align(1) const u16) void { - if (comptime Environment.allow_assert) assert(input.len <= output.len); - const count = @min(input.len, output.len); - - bun.highway.copyU16ToU8(input[0..count], output[0..count]); -} - const strings = @This(); -pub fn copyLatin1IntoASCII(dest: []u8, src: []const u8) void { - var remain = src; - var to = dest; - - const non_ascii_offset = strings.firstNonASCII(remain) orelse @as(u32, @truncate(remain.len)); - if (non_ascii_offset > 0) { - @memcpy(to[0..non_ascii_offset], remain[0..non_ascii_offset]); - remain = remain[non_ascii_offset..]; - to = to[non_ascii_offset..]; - - // ascii fast path - if (remain.len == 0) { - return; - } - } - - if (to.len >= 16 and bun.Environment.enableSIMD) { - const vector_size = 16; - // https://zig.godbolt.org/z/qezsY8T3W - const remain_in_u64 = remain[0 .. remain.len - (remain.len % vector_size)]; - const to_in_u64 = to[0 .. 
to.len - (to.len % vector_size)]; - var remain_as_u64 = std.mem.bytesAsSlice(u64, remain_in_u64); - var to_as_u64 = std.mem.bytesAsSlice(u64, to_in_u64); - const end_vector_len = @min(remain_as_u64.len, to_as_u64.len); - remain_as_u64 = remain_as_u64[0..end_vector_len]; - to_as_u64 = to_as_u64[0..end_vector_len]; - const end_ptr = remain_as_u64.ptr + remain_as_u64.len; - // using the pointer instead of the length is super important for the codegen - while (end_ptr != remain_as_u64.ptr) { - const buf = remain_as_u64[0]; - // this gets auto-vectorized - const mask = @as(u64, 0x7f7f7f7f7f7f7f7f); - to_as_u64[0] = buf & mask; - - remain_as_u64 = remain_as_u64[1..]; - to_as_u64 = to_as_u64[1..]; - } - remain = remain[remain_in_u64.len..]; - to = to[to_in_u64.len..]; - } - - for (to) |*to_byte| { - to_byte.* = @as(u8, @as(u7, @truncate(remain[0]))); - remain = remain[1..]; - } -} - -/// It is common on Windows to find files that are not encoded in UTF8. Most of these include -/// a 'byte-order mark' codepoint at the start of the file. The layout of this codepoint can -/// determine the encoding. 
-/// -/// https://en.wikipedia.org/wiki/Byte_order_mark -pub const BOM = enum { - utf8, - utf16_le, - utf16_be, - utf32_le, - utf32_be, - - pub const utf8_bytes = [_]u8{ 0xef, 0xbb, 0xbf }; - pub const utf16_le_bytes = [_]u8{ 0xff, 0xfe }; - pub const utf16_be_bytes = [_]u8{ 0xfe, 0xff }; - pub const utf32_le_bytes = [_]u8{ 0xff, 0xfe, 0x00, 0x00 }; - pub const utf32_be_bytes = [_]u8{ 0x00, 0x00, 0xfe, 0xff }; - - pub fn detect(bytes: []const u8) ?BOM { - if (bytes.len < 3) return null; - if (eqlComptimeIgnoreLen(bytes, utf8_bytes)) return .utf8; - if (eqlComptimeIgnoreLen(bytes, utf16_le_bytes)) { - // if (bytes.len > 4 and eqlComptimeIgnoreLen(bytes[2..], utf32_le_bytes[2..])) - // return .utf32_le; - return .utf16_le; - } - // if (eqlComptimeIgnoreLen(bytes, utf16_be_bytes)) return .utf16_be; - // if (bytes.len > 4 and eqlComptimeIgnoreLen(bytes, utf32_le_bytes)) return .utf32_le; - return null; - } - - pub fn detectAndSplit(bytes: []const u8) struct { ?BOM, []const u8 } { - const bom = detect(bytes); - if (bom == null) return .{ null, bytes }; - return .{ bom, bytes[bom.?.length()..] }; - } - - pub fn getHeader(bom: BOM) []const u8 { - return switch (bom) { - inline else => |t| comptime &@field(BOM, @tagName(t) ++ "_bytes"), - }; - } - - pub fn length(bom: BOM) usize { - return switch (bom) { - inline else => |t| comptime (&@field(BOM, @tagName(t) ++ "_bytes")).len, - }; - } - - /// If an allocation is needed, free the input and the caller will - /// replace it with the new return - pub fn removeAndConvertToUTF8AndFree(bom: BOM, allocator: std.mem.Allocator, bytes: []u8) OOM![]u8 { - switch (bom) { - .utf8 => { - _ = bun.c.memmove(bytes.ptr, bytes.ptr + utf8_bytes.len, bytes.len - utf8_bytes.len); - return bytes[0 .. 
bytes.len - utf8_bytes.len]; - }, - .utf16_le => { - const trimmed_bytes = bytes[utf16_le_bytes.len..]; - const trimmed_bytes_u16: []const u16 = @alignCast(std.mem.bytesAsSlice(u16, trimmed_bytes)); - const out = try toUTF8Alloc(allocator, trimmed_bytes_u16); - allocator.free(bytes); - return out; - }, - else => { - // TODO: this needs to re-encode, for now we just remove the BOM - const bom_bytes = bom.getHeader(); - _ = bun.c.memmove(bytes.ptr, bytes.ptr + bom_bytes.len, bytes.len - bom_bytes.len); - return bytes[0 .. bytes.len - bom_bytes.len]; - }, - } - } - - /// This is required for fs.zig's `use_shared_buffer` flag. we cannot free that pointer. - /// The returned slice will always point to the base of the input. - /// - /// Requires an arraylist in case it must be grown. - pub fn removeAndConvertToUTF8WithoutDealloc(bom: BOM, allocator: std.mem.Allocator, list: *std.ArrayListUnmanaged(u8)) ![]u8 { - const bytes = list.items; - switch (bom) { - .utf8 => { - bun.C.memmove(bytes.ptr, bytes.ptr + utf8_bytes.len, bytes.len - utf8_bytes.len); - return bytes[0 .. bytes.len - utf8_bytes.len]; - }, - .utf16_le => { - const trimmed_bytes = bytes[utf16_le_bytes.len..]; - const trimmed_bytes_u16: []const u16 = @alignCast(std.mem.bytesAsSlice(u16, trimmed_bytes)); - const out = try toUTF8Alloc(allocator, trimmed_bytes_u16); - if (list.capacity < out.len) { - try list.ensureTotalCapacity(allocator, out.len); - } - list.items.len = out.len; - @memcpy(list.items, out); - return out; - }, - else => { - // TODO: this needs to re-encode, for now we just remove the BOM - const bom_bytes = bom.getHeader(); - bun.C.memmove(bytes.ptr, bytes.ptr + bom_bytes.len, bytes.len - bom_bytes.len); - return bytes[0 .. bytes.len - bom_bytes.len]; - }, - } - } -}; - -/// @deprecated. If you are using this, you likely will need to remove other BOMs and handle encoding. -/// Use the BOM struct's `detect` and conversion functions instead. 
-pub fn withoutUTF8BOM(bytes: []const u8) []const u8 { - if (strings.hasPrefixComptime(bytes, BOM.utf8_bytes)) { - return bytes[BOM.utf8_bytes.len..]; - } else { - return bytes; - } -} - -// https://github.com/WebKit/WebKit/blob/443e796d1538654c34f2690e39600c70c8052b63/Source/WebCore/PAL/pal/text/TextCodecUTF8.cpp#L69 -pub fn nonASCIISequenceLength(first_byte: u8) u3_fast { - return switch (first_byte) { - 0...193 => 0, - 194...223 => 2, - 224...239 => 3, - 240...244 => 4, - 245...255 => 0, - }; -} - -/// Convert a UTF-8 string to a UTF-16 string IF there are any non-ascii characters -/// If there are no non-ascii characters, this returns null -/// This is intended to be used for strings that go to JavaScript -pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool, comptime sentinel: bool) !if (sentinel) ?[:0]u16 else ?[]u16 { - if (strings.firstNonASCII(bytes)) |i| { - const output_: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: { - const out_length = bun.simdutf.length.utf16.from.utf8(bytes); - if (out_length == 0) - break :simd null; - - var out = try allocator.alloc(u16, out_length + if (sentinel) 1 else 0); - log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, out_length }); - - const res = bun.simdutf.convert.utf8.to.utf16.with_errors.le(bytes, if (comptime sentinel) out[0..out_length] else out); - if (res.status == .success) { - if (comptime sentinel) { - out[out_length] = 0; - return out[0 .. 
out_length + 1 :0]; - } - return out; - } - - if (comptime fail_if_invalid) { - allocator.free(out); - return error.InvalidByteSequence; - } - - break :simd .{ - .items = out[0..i], - .capacity = out.len, - .allocator = allocator, - }; - } else null; - var output = output_ orelse fallback: { - var list = try std.ArrayList(u16).initCapacity(allocator, i + 2); - list.items.len = i; - strings.copyU8IntoU16(list.items, bytes[0..i]); - break :fallback list; - }; - errdefer output.deinit(); - - var remaining = bytes[i..]; - - { - const replacement = strings.convertUTF8BytesIntoUTF16(remaining); - if (comptime fail_if_invalid) { - if (replacement.fail) { - if (comptime Environment.allow_assert) assert(replacement.code_point == unicode_replacement); - return error.InvalidByteSequence; - } - } - remaining = remaining[@max(replacement.len, 1)..]; - - //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) - switch (replacement.code_point) { - 0...0xffff => |c| { - try output.append(@as(u16, @intCast(c))); - }, - else => |c| { - try output.appendSlice(&[_]u16{ strings.u16Lead(c), strings.u16Trail(c) }); - }, - } - } - - while (strings.firstNonASCII(remaining)) |j| { - const end = output.items.len; - try output.ensureUnusedCapacity(j); - output.items.len += j; - strings.copyU8IntoU16(output.items[end..][0..j], remaining[0..j]); - remaining = remaining[j..]; - - const replacement = strings.convertUTF8BytesIntoUTF16(remaining); - if (comptime fail_if_invalid) { - if (replacement.fail) { - if (comptime Environment.allow_assert) assert(replacement.code_point == unicode_replacement); - return error.InvalidByteSequence; - } - } - remaining = remaining[@max(replacement.len, 1)..]; - - //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 
1 : 2) - switch (replacement.code_point) { - 0...0xffff => |c| { - try output.append(@as(u16, @intCast(c))); - }, - else => |c| { - try output.appendSlice(&[_]u16{ strings.u16Lead(c), strings.u16Trail(c) }); - }, - } - } - - if (remaining.len > 0) { - try output.ensureTotalCapacityPrecise(output.items.len + remaining.len + comptime if (sentinel) 1 else 0); - - output.items.len += remaining.len; - strings.copyU8IntoU16(output.items[output.items.len - remaining.len ..], remaining); - } - - if (comptime sentinel) { - output.items[output.items.len] = 0; - return output.items[0 .. output.items.len + 1 :0]; - } - - return output.items; - } - - return null; -} - -// this one does the thing it's named after -pub fn toUTF16AllocForReal(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool, comptime sentinel: bool) !if (sentinel) [:0]u16 else []u16 { - return (try toUTF16Alloc(allocator, bytes, fail_if_invalid, sentinel)) orelse { - const output = try allocator.alloc(u16, bytes.len + if (sentinel) 1 else 0); - bun.strings.copyU8IntoU16(if (sentinel) output[0..bytes.len] else output, bytes); - - if (comptime sentinel) { - output[bytes.len] = 0; - return output[0..bytes.len :0]; - } - - return output; - }; -} - -pub fn toUTF16AllocMaybeBuffered( - allocator: std.mem.Allocator, - bytes: []const u8, - comptime fail_if_invalid: bool, - comptime flush: bool, -) error{ OutOfMemory, InvalidByteSequence }!?struct { []u16, [3]u8, u2 } { - const first_non_ascii = strings.firstNonASCII(bytes) orelse return null; - - var output: std.ArrayListUnmanaged(u16) = if (comptime bun.FeatureFlags.use_simdutf) output: { - const out_length = bun.simdutf.length.utf16.from.utf8(bytes); - - if (out_length == 0) { - break :output .{}; - } - - var out = try allocator.alloc(u16, out_length); - - const res = bun.simdutf.convert.utf8.to.utf16.with_errors.le(bytes, out); - if (res.status == .success) { - log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, out_length }); - return .{ 
out, .{0} ** 3, 0 }; - } - - var list = std.ArrayListUnmanaged(u16).fromOwnedSlice(out[0..first_non_ascii]); - list.capacity = out.len; - - break :output list; - } else .{}; - errdefer output.deinit(allocator); - - const start = if (output.items.len > 0) first_non_ascii else 0; - var remaining = bytes[start..]; - - var non_ascii: ?u32 = 0; - while (non_ascii) |i| : (non_ascii = strings.firstNonASCII(remaining)) { - { - const end = output.items.len; - try output.ensureUnusedCapacity(allocator, i + 2); // +2 for UTF16 codepoint - output.items.len += i; - strings.copyU8IntoU16(output.items[end..][0..i], remaining[0..i]); - remaining = remaining[i..]; - } - - const sequence: [4]u8 = switch (remaining.len) { - 0 => unreachable, - 1 => .{ remaining[0], 0, 0, 0 }, - 2 => .{ remaining[0], remaining[1], 0, 0 }, - 3 => .{ remaining[0], remaining[1], remaining[2], 0 }, - else => remaining[0..4].*, - }; - - const converted_length = strings.nonASCIISequenceLength(sequence[0]); - - const converted = strings.convertUTF8BytesIntoUTF16WithLength(&sequence, converted_length, remaining.len); - - if (comptime !flush) { - if (converted.fail and converted.can_buffer and converted_length > remaining.len) { - const buffered: [3]u8 = switch (remaining.len) { - else => unreachable, - 1 => .{ remaining[0], 0, 0 }, - 2 => .{ remaining[0], remaining[1], 0 }, - 3 => .{ remaining[0], remaining[1], remaining[2] }, - }; - return .{ output.items, buffered, @intCast(remaining.len) }; - } - } - - if (comptime fail_if_invalid) { - if (converted.fail) { - if (comptime Environment.allow_assert) { - bun.assert(converted.code_point == unicode_replacement); - } - return error.InvalidByteSequence; - } - } - - remaining = remaining[@max(converted.len, 1)..]; - - // #define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 
1 : 2) - switch (converted.code_point) { - 0...0xffff => |c| output.appendAssumeCapacity(@intCast(c)), - else => |c| output.appendSliceAssumeCapacity(&.{ strings.u16Lead(c), strings.u16Trail(c) }), - } - } - - if (remaining.len > 0) { - try output.ensureTotalCapacityPrecise(allocator, output.items.len + remaining.len); - output.items.len += remaining.len; - strings.copyU8IntoU16(output.items[output.items.len - remaining.len ..], remaining); - } - - log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, output.items.len }); - return .{ output.items, .{0} ** 3, 0 }; -} - -pub fn utf16CodepointWithFFFD(comptime Type: type, input: Type) UTF16Replacement { - return utf16CodepointWithFFFDAndFirstInputChar(Type, input[0], input); -} - -fn utf16CodepointWithFFFDAndFirstInputChar(comptime Type: type, char: std.meta.Elem(Type), input: Type) UTF16Replacement { - const c0 = @as(u21, char); - - if (c0 & ~@as(u21, 0x03ff) == 0xd800) { - // surrogate pair - if (input.len == 1) - return .{ - .len = 1, - .is_lead = true, - }; - //error.DanglingSurrogateHalf; - const c1 = @as(u21, input[1]); - if (c1 & ~@as(u21, 0x03ff) != 0xdc00) - if (input.len == 1) { - return .{ - .len = 1, - }; - } else { - return .{ - .fail = true, - .len = 1, - .code_point = unicode_replacement, - .is_lead = true, - }; - }; - // return error.ExpectedSecondSurrogateHalf; - - return .{ .len = 2, .code_point = 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)) }; - } else if (c0 & ~@as(u21, 0x03ff) == 0xdc00) { - // return error.UnexpectedSecondSurrogateHalf; - return .{ .fail = true, .len = 1, .code_point = unicode_replacement }; - } else { - return .{ .code_point = c0, .len = 1 }; - } -} - -pub fn utf16Codepoint(comptime Type: type, input: Type) UTF16Replacement { - const c0 = @as(u21, input[0]); - - if (c0 & ~@as(u21, 0x03ff) == 0xd800) { - // surrogate pair - if (input.len == 1) - return .{ - .len = 1, - }; - //error.DanglingSurrogateHalf; - const c1 = @as(u21, input[1]); - if (c1 & ~@as(u21, 0x03ff) != 0xdc00) - 
if (input.len == 1) - return .{ - .len = 1, - }; - // return error.ExpectedSecondSurrogateHalf; - - return .{ .len = 2, .code_point = 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)) }; - } else if (c0 & ~@as(u21, 0x03ff) == 0xdc00) { - // return error.UnexpectedSecondSurrogateHalf; - return .{ .len = 1 }; - } else { - return .{ .code_point = c0, .len = 1 }; - } -} - -/// Checks if a path is missing a windows drive letter. For windows APIs, -/// this is used for an assertion, and PosixToWinNormalizer can help make -/// an absolute path contain a drive letter. -pub fn isWindowsAbsolutePathMissingDriveLetter(comptime T: type, chars: []const T) bool { - bun.unsafeAssert(bun.path.Platform.windows.isAbsoluteT(T, chars)); - bun.unsafeAssert(chars.len > 0); - - // 'C:\hello' -> false - // This is the most common situation, so we check it first - if (!(chars[0] == '/' or chars[0] == '\\')) { - bun.unsafeAssert(chars.len > 2); - bun.unsafeAssert(chars[1] == ':'); - return false; - } - - if (chars.len > 4) { - // '\??\hello' -> false (has the NT object prefix) - if (chars[1] == '?' and - chars[2] == '?' and - (chars[3] == '/' or chars[3] == '\\')) - return false; - // '\\?\hello' -> false (has the other NT object prefix) - // '\\.\hello' -> false (has the NT device prefix) - if ((chars[1] == '/' or chars[1] == '\\') and - (chars[2] == '?' or chars[2] == '.') and - (chars[3] == '/' or chars[3] == '\\')) - return false; - } - - // A path starting with `/` can be a UNC path with forward slashes, - // or actually just a posix path. - // - // '\\Server\Share' -> false (unc) - // '\\Server\\Share' -> true (not unc because extra slashes) - // '\Server\Share' -> true (posix path) - return bun.path.windowsFilesystemRootT(T, chars).len == 1; -} - -pub fn fromWPath(buf: []u8, utf16: []const u16) [:0]const u8 { - bun.unsafeAssert(buf.len > 0); - const to_copy = trimPrefixComptime(u16, utf16, bun.windows.long_path_prefix); - const encode_into_result = copyUTF16IntoUTF8(buf[0 .. 
buf.len - 1], []const u16, to_copy, false); - bun.unsafeAssert(encode_into_result.written < buf.len); - buf[encode_into_result.written] = 0; - return buf[0..encode_into_result.written :0]; -} - -pub fn withoutNTPrefix(comptime T: type, path: []const T) []const T { - if (comptime !Environment.isWindows) return path; - const cmp = if (T == u8) - hasPrefixComptime - else - hasPrefixComptimeUTF16; - if (cmp(path, &bun.windows.nt_object_prefix_u8)) { - return path[bun.windows.nt_object_prefix.len..]; - } - if (cmp(path, &bun.windows.long_path_prefix_u8)) { - return path[bun.windows.long_path_prefix.len..]; - } - if (cmp(path, &bun.windows.nt_unc_object_prefix_u8)) { - return path[bun.windows.nt_unc_object_prefix.len..]; - } - return path; -} - -pub fn toNTPath(wbuf: []u16, utf8: []const u8) [:0]u16 { - if (!std.fs.path.isAbsoluteWindows(utf8)) { - return toWPathNormalized(wbuf, utf8); - } - - if (strings.hasPrefixComptime(utf8, &bun.windows.nt_object_prefix_u8) or - strings.hasPrefixComptime(utf8, &bun.windows.nt_unc_object_prefix_u8)) - { - return wbuf[0..toWPathNormalized(wbuf, utf8).len :0]; - } - - // UNC absolute path, replace leading '\\' with '\??\UNC\' - if (strings.hasPrefixComptime(utf8, "\\\\")) { - if (strings.hasPrefixComptime(utf8[2..], bun.windows.long_path_prefix_u8[2..])) { - const prefix = bun.windows.nt_object_prefix; - wbuf[0..prefix.len].* = prefix; - return wbuf[0 .. toWPathNormalized(wbuf[prefix.len..], utf8[4..]).len + prefix.len :0]; - } - const prefix = bun.windows.nt_unc_object_prefix; - wbuf[0..prefix.len].* = prefix; - return wbuf[0 .. toWPathNormalized(wbuf[prefix.len..], utf8[2..]).len + prefix.len :0]; - } - - const prefix = bun.windows.nt_object_prefix; - wbuf[0..prefix.len].* = prefix; - return wbuf[0 .. 
toWPathNormalized(wbuf[prefix.len..], utf8).len + prefix.len :0]; -} - -pub fn toNTPath16(wbuf: []u16, path: []const u16) [:0]u16 { - if (!std.fs.path.isAbsoluteWindowsWTF16(path)) { - return toWPathNormalized16(wbuf, path); - } - - if (strings.hasPrefixComptimeUTF16(path, &bun.windows.nt_object_prefix_u8) or - strings.hasPrefixComptimeUTF16(path, &bun.windows.nt_unc_object_prefix_u8)) - { - return wbuf[0..toWPathNormalized16(wbuf, path).len :0]; - } - - if (strings.hasPrefixComptimeUTF16(path, "\\\\")) { - if (strings.hasPrefixComptimeUTF16(path[2..], bun.windows.long_path_prefix_u8[2..])) { - const prefix = bun.windows.nt_object_prefix; - wbuf[0..prefix.len].* = prefix; - return wbuf[0 .. toWPathNormalized16(wbuf[prefix.len..], path[4..]).len + prefix.len :0]; - } - const prefix = bun.windows.nt_unc_object_prefix; - wbuf[0..prefix.len].* = prefix; - return wbuf[0 .. toWPathNormalized16(wbuf[prefix.len..], path[2..]).len + prefix.len :0]; - } - - const prefix = bun.windows.nt_object_prefix; - wbuf[0..prefix.len].* = prefix; - return wbuf[0 .. toWPathNormalized16(wbuf[prefix.len..], path).len + prefix.len :0]; -} - -pub fn toNTMaxPath(buf: []u8, utf8: []const u8) [:0]const u8 { - if (!std.fs.path.isAbsoluteWindows(utf8) or utf8.len <= 260) { - @memcpy(buf[0..utf8.len], utf8); - buf[utf8.len] = 0; - return buf[0..utf8.len :0]; - } - - const prefix = bun.windows.nt_maxpath_prefix_u8; - buf[0..prefix.len].* = prefix; - return buf[0 .. toPathNormalized(buf[prefix.len..], utf8).len + prefix.len :0]; -} - -pub fn addNTPathPrefix(wbuf: []u16, utf16: []const u16) [:0]u16 { - wbuf[0..bun.windows.nt_object_prefix.len].* = bun.windows.nt_object_prefix; - @memcpy(wbuf[bun.windows.nt_object_prefix.len..][0..utf16.len], utf16); - wbuf[utf16.len + bun.windows.nt_object_prefix.len] = 0; - return wbuf[0 .. 
utf16.len + bun.windows.nt_object_prefix.len :0]; -} - -pub fn addNTPathPrefixIfNeeded(wbuf: []u16, utf16: []const u16) [:0]u16 { - if (hasPrefixComptimeType(u16, utf16, bun.windows.nt_object_prefix)) { - @memcpy(wbuf[0..utf16.len], utf16); - wbuf[utf16.len] = 0; - return wbuf[0..utf16.len :0]; - } - if (hasPrefixComptimeType(u16, utf16, bun.windows.long_path_prefix)) { - // Replace prefix - return addNTPathPrefix(wbuf, utf16[bun.windows.long_path_prefix.len..]); - } - return addNTPathPrefix(wbuf, utf16); -} - -// These are the same because they don't have rules like needing a trailing slash -pub const toNTDir = toNTPath; - -pub fn toExtendedPathNormalized(wbuf: []u16, utf8: []const u8) [:0]const u16 { - bun.unsafeAssert(wbuf.len > 4); - wbuf[0..4].* = bun.windows.long_path_prefix; - return wbuf[0 .. toWPathNormalized(wbuf[4..], utf8).len + 4 :0]; -} - -pub fn toWPathNormalizeAutoExtend(wbuf: []u16, utf8: []const u8) [:0]const u16 { - if (std.fs.path.isAbsoluteWindows(utf8)) { - return toExtendedPathNormalized(wbuf, utf8); - } - - return toWPathNormalized(wbuf, utf8); -} - -pub fn toWPathNormalized(wbuf: []u16, utf8: []const u8) [:0]u16 { - const renormalized = bun.PathBufferPool.get(); - defer bun.PathBufferPool.put(renormalized); - - var path_to_use = normalizeSlashesOnly(renormalized, utf8, '\\'); - - // is there a trailing slash? Let's remove it before converting to UTF-16 - if (path_to_use.len > 3 and bun.path.isSepAny(path_to_use[path_to_use.len - 1])) { - path_to_use = path_to_use[0 .. path_to_use.len - 1]; - } - - return toWPath(wbuf, path_to_use); -} - -pub fn toWPathNormalized16(wbuf: []u16, path: []const u16) [:0]u16 { - var path_to_use = normalizeSlashesOnlyT(u16, wbuf, path, '\\', true); - - // is there a trailing slash? Let's remove it before converting to UTF-16 - if (path_to_use.len > 3 and bun.path.isSepAnyT(u16, path_to_use[path_to_use.len - 1])) { - path_to_use = path_to_use[0 .. 
path_to_use.len - 1]; - } - - wbuf[path_to_use.len] = 0; - - return wbuf[0..path_to_use.len :0]; -} - -pub fn toPathNormalized(buf: []u8, utf8: []const u8) [:0]const u8 { - const renormalized = bun.PathBufferPool.get(); - defer bun.PathBufferPool.put(renormalized); - - var path_to_use = normalizeSlashesOnly(renormalized, utf8, '\\'); - - // is there a trailing slash? Let's remove it before converting to UTF-16 - if (path_to_use.len > 3 and bun.path.isSepAny(path_to_use[path_to_use.len - 1])) { - path_to_use = path_to_use[0 .. path_to_use.len - 1]; - } - - return toPath(buf, path_to_use); -} - -pub fn normalizeSlashesOnlyT(comptime T: type, buf: []T, path: []const T, comptime desired_slash: u8, comptime always_copy: bool) []const T { - comptime bun.unsafeAssert(desired_slash == '/' or desired_slash == '\\'); - const undesired_slash = if (desired_slash == '/') '\\' else '/'; - - if (bun.strings.containsCharT(T, path, undesired_slash)) { - @memcpy(buf[0..path.len], path); - for (buf[0..path.len]) |*c| { - if (c.* == undesired_slash) { - c.* = desired_slash; - } - } - return buf[0..path.len]; - } - - if (comptime always_copy) { - @memcpy(buf[0..path.len], path); - return buf[0..path.len]; - } - return path; -} - -pub fn normalizeSlashesOnly(buf: []u8, utf8: []const u8, comptime desired_slash: u8) []const u8 { - return normalizeSlashesOnlyT(u8, buf, utf8, desired_slash, false); -} - -pub fn toWDirNormalized(wbuf: []u16, utf8: []const u8) [:0]const u16 { - var renormalized: ?*bun.PathBuffer = null; - defer if (renormalized) |r| bun.PathBufferPool.put(r); - - var path_to_use = utf8; - - if (bun.strings.containsChar(utf8, '/')) { - renormalized = bun.PathBufferPool.get(); - @memcpy(renormalized.?[0..utf8.len], utf8); - for (renormalized.?[0..utf8.len]) |*c| { - if (c.* == '/') { - c.* = '\\'; - } - } - path_to_use = renormalized.?[0..utf8.len]; - } - - return toWDirPath(wbuf, path_to_use); -} - -pub fn toWPath(wbuf: []u16, utf8: []const u8) [:0]u16 { - return 
toWPathMaybeDir(wbuf, utf8, false); -} - -pub fn toPath(buf: []u8, utf8: []const u8) [:0]u8 { - return toPathMaybeDir(buf, utf8, false); -} - -pub fn toWDirPath(wbuf: []u16, utf8: []const u8) [:0]const u16 { - return toWPathMaybeDir(wbuf, utf8, true); -} - -pub fn toKernel32Path(wbuf: []u16, utf8: []const u8) [:0]u16 { - const path = if (hasPrefixComptime(utf8, bun.windows.nt_object_prefix_u8)) - utf8[bun.windows.nt_object_prefix_u8.len..] - else - utf8; - if (hasPrefixComptime(path, bun.windows.long_path_prefix_u8)) { - return toWPath(wbuf, path); - } - if (utf8.len > 2 and bun.path.isDriveLetter(utf8[0]) and utf8[1] == ':' and bun.path.isSepAny(utf8[2])) { - wbuf[0..4].* = bun.windows.long_path_prefix; - const wpath = toWPath(wbuf[4..], path); - return wbuf[0 .. wpath.len + 4 :0]; - } - return toWPath(wbuf, path); -} - -fn isUNCPath(comptime T: type, path: []const T) bool { - return path.len >= 3 and - bun.path.Platform.windows.isSeparatorT(T, path[0]) and - bun.path.Platform.windows.isSeparatorT(T, path[1]) and - !bun.path.Platform.windows.isSeparatorT(T, path[2]) and - path[2] != '.'; -} -pub fn assertIsValidWindowsPath(comptime T: type, path: []const T) void { - if (Environment.allow_assert and Environment.isWindows) { - if (bun.path.Platform.windows.isAbsoluteT(T, path) and - isWindowsAbsolutePathMissingDriveLetter(T, path) and - // is it a null device path? that's not an error. it's just a weird file path. - !eqlComptimeT(T, path, "\\\\.\\NUL") and !eqlComptimeT(T, path, "\\\\.\\nul") and !eqlComptimeT(T, path, "\\nul") and !eqlComptimeT(T, path, "\\NUL") and !isUNCPath(T, path)) - { - std.debug.panic("Internal Error: Do not pass posix paths to Windows APIs, was given '{s}'" ++ if (Environment.isDebug) " (missing a root like 'C:\\', see PosixToWinNormalizer for why this is an assertion)" else ". 
Please open an issue on GitHub with a reproduction.", .{ - if (T == u8) path else bun.fmt.utf16(path), - }); - } - if (hasPrefixComptimeType(T, path, ":/") and Environment.isDebug) { - std.debug.panic("Path passed to windows API '{s}' is almost certainly invalid. Where did the drive letter go?", .{ - if (T == u8) path else bun.fmt.utf16(path), - }); - } - } -} - -pub fn toWPathMaybeDir(wbuf: []u16, utf8: []const u8, comptime add_trailing_lash: bool) [:0]u16 { - bun.unsafeAssert(wbuf.len > 0); - - var result = bun.simdutf.convert.utf8.to.utf16.with_errors.le( - utf8, - wbuf[0..wbuf.len -| (1 + @as(usize, @intFromBool(add_trailing_lash)))], - ); - - // Many Windows APIs expect normalized path slashes, particularly when the - // long path prefix is added or the nt object prefix. To make this easier, - // but a little redundant, this function always normalizes the slashes here. - // - // An example of this is GetFileAttributesW(L"C:\\hello/world.txt") being OK - // but GetFileAttributesW(L"\\\\?\\C:\\hello/world.txt") is NOT - bun.path.dangerouslyConvertPathToWindowsInPlace(u16, wbuf[0..result.count]); - - if (add_trailing_lash and result.count > 0 and wbuf[result.count - 1] != '\\') { - wbuf[result.count] = '\\'; - result.count += 1; - } - - wbuf[result.count] = 0; - - return wbuf[0..result.count :0]; -} -pub fn toPathMaybeDir(buf: []u8, utf8: []const u8, comptime add_trailing_lash: bool) [:0]u8 { - bun.unsafeAssert(buf.len > 0); - - var len = utf8.len; - @memcpy(buf[0..len], utf8[0..len]); - - if (add_trailing_lash and len > 0 and buf[len - 1] != '\\') { - buf[len] = '\\'; - len += 1; - } - buf[len] = 0; - return buf[0..len :0]; -} - -pub fn convertUTF16ToUTF8(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) OOM!std.ArrayList(u8) { - var list = list_; - const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le( - utf16, - list.items.ptr[0..list.capacity], - ); - if (result.status == .surrogate) { - // Slow path: there was invalid UTF-16, so we need 
to convert it without simdutf. - return toUTF8ListWithTypeBun(&list, Type, utf16, false); - } - - list.items.len = result.count; - return list; -} - -pub fn convertUTF16ToUTF8WithoutInvalidSurrogatePairs(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) !std.ArrayList(u8) { - var list = list_; - const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le( - utf16, - list.items.ptr[0..list.capacity], - ); - if (result.status == .surrogate) { - return error.SurrogatePair; - } - - list.items.len = result.count; - return list; -} - -pub fn convertUTF16ToUTF8Append(list: *std.ArrayList(u8), utf16: []const u16) !void { - const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le( - utf16, - list.items.ptr[list.items.len..list.capacity], - ); - - if (result.status == .surrogate) { - // Slow path: there was invalid UTF-16, so we need to convert it without simdutf. - _ = try toUTF8ListWithTypeBun(list, []const u16, utf16, false); - return; - } - - list.items.len += result.count; -} - -pub fn toUTF8AllocWithTypeWithoutInvalidSurrogatePairs(allocator: std.mem.Allocator, comptime Type: type, utf16: Type) ![]u8 { - if (bun.FeatureFlags.use_simdutf and comptime Type == []const u16) { - const length = bun.simdutf.length.utf8.from.utf16.le(utf16); - // add 16 bytes of padding for SIMDUTF - var list = try std.ArrayList(u8).initCapacity(allocator, length + 16); - list = try convertUTF16ToUTF8(list, Type, utf16); - return list.items; - } - - var list = try std.ArrayList(u8).initCapacity(allocator, utf16.len); - list = try toUTF8ListWithType(list, Type, utf16); - return list.items; -} - -pub fn toUTF8AllocWithType(allocator: std.mem.Allocator, comptime Type: type, utf16: Type) OOM![]u8 { - if (bun.FeatureFlags.use_simdutf and comptime Type == []const u16) { - const length = bun.simdutf.length.utf8.from.utf16.le(utf16); - // add 16 bytes of padding for SIMDUTF - var list = try std.ArrayList(u8).initCapacity(allocator, length + 16); - list = try 
convertUTF16ToUTF8(list, Type, utf16); - return list.items; - } - - var list = try std.ArrayList(u8).initCapacity(allocator, utf16.len); - list = try toUTF8ListWithType(list, Type, utf16); - return list.items; -} - -pub fn toUTF8ListWithType(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) OOM!std.ArrayList(u8) { - if (bun.FeatureFlags.use_simdutf and comptime Type == []const u16) { - var list = list_; - const length = bun.simdutf.length.utf8.from.utf16.le(utf16); - try list.ensureTotalCapacityPrecise(length + 16); - const buf = try convertUTF16ToUTF8(list, Type, utf16); - - // Commenting out because `convertUTF16ToUTF8` may convert to WTF-8 - // which uses 3 bytes for invalid surrogates, causing the length to not - // match from simdutf. - // if (Environment.allow_assert) { - // bun.unsafeAssert(buf.items.len == length); - // } - - return buf; - } - - @compileError("not implemented"); -} - -pub fn toUTF8AppendToList(list: *std.ArrayList(u8), utf16: []const u16) !void { - if (!bun.FeatureFlags.use_simdutf) { - @compileError("not implemented"); - } - const length = bun.simdutf.length.utf8.from.utf16.le(utf16); - try list.ensureUnusedCapacity(length + 16); - try convertUTF16ToUTF8Append(list, utf16); -} - -pub fn toUTF8FromLatin1(allocator: std.mem.Allocator, latin1: []const u8) !?std.ArrayList(u8) { - if (isAllASCII(latin1)) - return null; - - const list = try std.ArrayList(u8).initCapacity(allocator, latin1.len); - return try allocateLatin1IntoUTF8WithList(list, 0, []const u8, latin1); -} - -pub fn toUTF8FromLatin1Z(allocator: std.mem.Allocator, latin1: []const u8) !?std.ArrayList(u8) { - if (isAllASCII(latin1)) - return null; - - const list = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 1); - var list1 = try allocateLatin1IntoUTF8WithList(list, 0, []const u8, latin1); - try list1.append(0); - return list1; -} - -pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf16: Type, comptime skip_trailing_replacement: 
bool) OOM!(if (skip_trailing_replacement) ?u16 else std.ArrayList(u8)) { - var utf16_remaining = utf16; - - while (firstNonASCII16(Type, utf16_remaining)) |i| { - const to_copy = utf16_remaining[0..i]; - utf16_remaining = utf16_remaining[i..]; - const token = utf16_remaining[0]; - - const replacement = utf16CodepointWithFFFDAndFirstInputChar(Type, token, utf16_remaining); - utf16_remaining = utf16_remaining[replacement.len..]; - - const count: usize = replacement.utf8Width(); - if (comptime Environment.isNative) { - try list.ensureTotalCapacityPrecise(i + count + list.items.len + @as(usize, @intFromFloat((@as(f64, @floatFromInt(@as(u52, @truncate(utf16_remaining.len)))) * 1.2)))); - } else { - try list.ensureTotalCapacityPrecise(i + count + list.items.len + utf16_remaining.len + 4); - } - list.items.len += i; - - copyU16IntoU8(list.items[list.items.len - i ..], to_copy); - - if (comptime skip_trailing_replacement) { - if (replacement.is_lead and utf16_remaining.len == 0) { - return token; - } - } - - list.items.len += count; - _ = encodeWTF8RuneT( - list.items.ptr[list.items.len - count .. 
list.items.len - count + 4][0..4], - u32, - @as(u32, replacement.code_point), - ); - } - - if (utf16_remaining.len > 0) { - try list.ensureTotalCapacityPrecise(utf16_remaining.len + list.items.len); - const old_len = list.items.len; - list.items.len += utf16_remaining.len; - copyU16IntoU8(list.items[old_len..], utf16_remaining); - } - - log("UTF16 {d} -> {d} UTF8", .{ utf16.len, list.items.len }); - - if (comptime skip_trailing_replacement) { - return null; - } - return list.*; -} - -pub const EncodeIntoResult = struct { - read: u32 = 0, - written: u32 = 0, -}; -pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, comptime Type: type, latin1_: Type) ![]u8 { - if (comptime bun.FeatureFlags.latin1_is_now_ascii) { - var out = try allocator.alloc(u8, latin1_.len); - @memcpy(out[0..latin1_.len], latin1_); - return out; - } - - const list = try std.ArrayList(u8).initCapacity(allocator, latin1_.len); - var foo = try allocateLatin1IntoUTF8WithList(list, 0, Type, latin1_); - return try foo.toOwnedSlice(); -} - -pub fn allocateLatin1IntoUTF8WithList(list_: std.ArrayList(u8), offset_into_list: usize, comptime Type: type, latin1_: Type) OOM!std.ArrayList(u8) { - var latin1 = latin1_; - var i: usize = offset_into_list; - var list = list_; - try list.ensureUnusedCapacity(latin1.len); - - while (latin1.len > 0) { - if (comptime Environment.allow_assert) assert(i < list.capacity); - var buf = list.items.ptr[i..list.capacity]; - - inner: { - var count = latin1.len / ascii_vector_size; - while (count > 0) : (count -= 1) { - const vec: AsciiVector = latin1[0..ascii_vector_size].*; - - if (@reduce(.Max, vec) > 127) { - const Int = u64; - const size = @sizeOf(Int); - - // zig or LLVM doesn't do @ctz nicely with SIMD - if (comptime ascii_vector_size >= 8) { - { - const bytes = @as(Int, @bitCast(latin1[0..size].*)); - // https://dotat.at/@/2022-06-27-tolower-swar.html - const mask = bytes & 0x8080808080808080; - - if (mask > 0) { - const first_set_byte = @ctz(mask) / 8; - if 
(comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127); - - buf[0..size].* = @as([size]u8, @bitCast(bytes)); - buf = buf[first_set_byte..]; - latin1 = latin1[first_set_byte..]; - break :inner; - } - - buf[0..size].* = @as([size]u8, @bitCast(bytes)); - latin1 = latin1[size..]; - buf = buf[size..]; - } - - if (comptime ascii_vector_size >= 16) { - const bytes = @as(Int, @bitCast(latin1[0..size].*)); - // https://dotat.at/@/2022-06-27-tolower-swar.html - const mask = bytes & 0x8080808080808080; - - if (mask > 0) { - const first_set_byte = @ctz(mask) / 8; - if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127); - - buf[0..size].* = @as([size]u8, @bitCast(bytes)); - buf = buf[first_set_byte..]; - latin1 = latin1[first_set_byte..]; - break :inner; - } - } - } - unreachable; - } - - buf[0..ascii_vector_size].* = @as([ascii_vector_size]u8, @bitCast(vec))[0..ascii_vector_size].*; - latin1 = latin1[ascii_vector_size..]; - buf = buf[ascii_vector_size..]; - } - - while (latin1.len >= 8) { - const Int = u64; - const size = @sizeOf(Int); - - const bytes = @as(Int, @bitCast(latin1[0..size].*)); - // https://dotat.at/@/2022-06-27-tolower-swar.html - const mask = bytes & 0x8080808080808080; - - if (mask > 0) { - const first_set_byte = @ctz(mask) / 8; - if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127); - - buf[0..size].* = @as([size]u8, @bitCast(bytes)); - latin1 = latin1[first_set_byte..]; - buf = buf[first_set_byte..]; - break :inner; - } - - buf[0..size].* = @as([size]u8, @bitCast(bytes)); - latin1 = latin1[size..]; - buf = buf[size..]; - } - - { - if (comptime Environment.allow_assert) assert(latin1.len < 8); - const end = latin1.ptr + latin1.len; - while (latin1.ptr != end and latin1[0] < 128) { - buf[0] = latin1[0]; - buf = buf[1..]; - latin1 = latin1[1..]; - } - } - } - - while (latin1.len > 0 and latin1[0] > 127) { - i = @intFromPtr(buf.ptr) - @intFromPtr(list.items.ptr); - list.items.len = i; - try 
list.ensureUnusedCapacity(2 + latin1.len); - buf = list.items.ptr[i..list.capacity]; - buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]); - latin1 = latin1[1..]; - buf = buf[2..]; - } - - i = @intFromPtr(buf.ptr) - @intFromPtr(list.items.ptr); - list.items.len = i; - } - - log("Latin1 {d} -> UTF8 {d}", .{ latin1_.len, i }); - - return list; -} - -pub const UTF16Replacement = struct { - code_point: u32 = unicode_replacement, - len: u3_fast = 0, - - /// Explicit fail boolean to distinguish between a Unicode Replacement Codepoint - /// that was already in there - /// and a genuine error. - fail: bool = false, - - can_buffer: bool = true, - is_lead: bool = false, - - pub inline fn utf8Width(replacement: UTF16Replacement) u3_fast { - return switch (replacement.code_point) { - 0...0x7F => 1, - (0x7F + 1)...0x7FF => 2, - (0x7FF + 1)...0xFFFF => 3, - else => 4, - }; - } -}; - -fn convertUTF8BytesIntoUTF16WithLength(sequence: *const [4]u8, len: u3_fast, remaining_len: usize) UTF16Replacement { - if (comptime Environment.allow_assert) assert(sequence[0] > 127); - switch (len) { - 2 => { - if (comptime Environment.allow_assert) { - bun.assert(sequence[0] >= 0xC0); - bun.assert(sequence[0] <= 0xDF); - } - if (sequence[1] < 0x80 or sequence[1] > 0xBF) { - return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 }; - } - return .{ .len = len, .code_point = ((@as(u32, sequence[0]) << 6) + @as(u32, sequence[1])) - 0x00003080 }; - }, - 3 => { - if (comptime Environment.allow_assert) { - bun.assert(sequence[0] >= 0xE0); - bun.assert(sequence[0] <= 0xEF); - } - switch (sequence[0]) { - 0xE0 => { - if (sequence[1] < 0xA0 or sequence[1] > 0xBF) { - return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 }; - } - }, - 0xED => { - if (sequence[1] < 0x80 or sequence[1] > 0x9F) { - return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 }; - } - }, - else => { - if (sequence[1] < 0x80 or sequence[1] > 0xBF) { - return .{ .len = 1, .fail = true, 
.can_buffer = remaining_len < 2 }; - } - }, - } - if (sequence[2] < 0x80 or sequence[2] > 0xBF) { - return .{ .len = 2, .fail = true, .can_buffer = remaining_len < 3 }; - } - return .{ - .len = len, - .code_point = ((@as(u32, sequence[0]) << 12) + (@as(u32, sequence[1]) << 6) + @as(u32, sequence[2])) - 0x000E2080, - }; - }, - 4 => { - switch (sequence[0]) { - 0xF0 => { - if (sequence[1] < 0x90 or sequence[1] > 0xBF) { - return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 }; - } - }, - 0xF4 => { - if (sequence[1] < 0x80 or sequence[1] > 0x8F) { - return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 }; - } - }, - - // invalid code point - // this used to be an assertion - 0...(0xF0 - 1), 0xF4 + 1...std.math.maxInt(@TypeOf(sequence[0])) => { - return .{ .len = 1, .fail = true, .can_buffer = false }; - }, - - else => { - if (sequence[1] < 0x80 or sequence[1] > 0xBF) { - return .{ .len = 1, .fail = true, .can_buffer = remaining_len < 2 }; - } - }, - } - - if (sequence[2] < 0x80 or sequence[2] > 0xBF) { - return .{ .len = 2, .fail = true, .can_buffer = remaining_len < 3 }; - } - if (sequence[3] < 0x80 or sequence[3] > 0xBF) { - return .{ .len = 3, .fail = true, .can_buffer = remaining_len < 4 }; - } - return .{ - .len = len, - .code_point = ((@as(u32, sequence[0]) << 18) + - (@as(u32, sequence[1]) << 12) + - (@as(u32, sequence[2]) << 6) + @as(u32, sequence[3])) - 0x03C82080, - }; - }, - // invalid unicode sequence - // 1 or 0 are both invalid here - else => return UTF16Replacement{ .len = 1, .fail = true }, - } -} - -// This variation matches WebKit behavior. 
-// fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8, remaining_len: usize) UTF16Replacement { -fn convertUTF8BytesIntoUTF16(bytes: []const u8) UTF16Replacement { - const sequence: [4]u8 = switch (bytes.len) { - 0 => unreachable, - 1 => [_]u8{ bytes[0], 0, 0, 0 }, - 2 => [_]u8{ bytes[0], bytes[1], 0, 0 }, - 3 => [_]u8{ bytes[0], bytes[1], bytes[2], 0 }, - else => bytes[0..4].*, - }; - if (comptime Environment.allow_assert) assert(sequence[0] > 127); - const sequence_length = nonASCIISequenceLength(sequence[0]); - return convertUTF8BytesIntoUTF16WithLength(&sequence, sequence_length, bytes.len); -} - -pub fn copyLatin1IntoUTF8(buf_: []u8, comptime Type: type, latin1_: Type) EncodeIntoResult { - return copyLatin1IntoUTF8StopOnNonASCII(buf_, Type, latin1_, false); -} - -pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_: Type, comptime stop: bool) EncodeIntoResult { - if (comptime bun.FeatureFlags.latin1_is_now_ascii) { - const to_copy = @as(u32, @truncate(@min(buf_.len, latin1_.len))); - @memcpy(buf_[0..to_copy], latin1_[0..to_copy]); - - return .{ .written = to_copy, .read = to_copy }; - } - - var buf = buf_; - var latin1 = latin1_; - - log("latin1 encode {d} -> {d}", .{ buf.len, latin1.len }); - - while (buf.len > 0 and latin1.len > 0) { - inner: { - var remaining_runs = @min(buf.len, latin1.len) / ascii_vector_size; - while (remaining_runs > 0) : (remaining_runs -= 1) { - const vec: AsciiVector = latin1[0..ascii_vector_size].*; - - if (@reduce(.Max, vec) > 127) { - if (comptime stop) return .{ .written = std.math.maxInt(u32), .read = std.math.maxInt(u32) }; - - // zig or LLVM doesn't do @ctz nicely with SIMD - if (comptime ascii_vector_size >= 8) { - const Int = u64; - const size = @sizeOf(Int); - - { - const bytes = @as(Int, @bitCast(latin1[0..size].*)); - // https://dotat.at/@/2022-06-27-tolower-swar.html - const mask = bytes & 0x8080808080808080; - - buf[0..size].* = @as([size]u8, @bitCast(bytes)); - - if (mask > 0) { - const 
first_set_byte = @ctz(mask) / 8; - if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127); - - buf = buf[first_set_byte..]; - latin1 = latin1[first_set_byte..]; - break :inner; - } - - latin1 = latin1[size..]; - buf = buf[size..]; - } - - if (comptime ascii_vector_size >= 16) { - const bytes = @as(Int, @bitCast(latin1[0..size].*)); - // https://dotat.at/@/2022-06-27-tolower-swar.html - const mask = bytes & 0x8080808080808080; - - buf[0..size].* = @as([size]u8, @bitCast(bytes)); - - if (comptime Environment.allow_assert) assert(mask > 0); - const first_set_byte = @ctz(mask) / 8; - if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127); - - buf = buf[first_set_byte..]; - latin1 = latin1[first_set_byte..]; - break :inner; - } - } - unreachable; - } - - buf[0..ascii_vector_size].* = @as([ascii_vector_size]u8, @bitCast(vec))[0..ascii_vector_size].*; - latin1 = latin1[ascii_vector_size..]; - buf = buf[ascii_vector_size..]; - } - - { - const Int = u64; - const size = @sizeOf(Int); - while (@min(buf.len, latin1.len) >= size) { - const bytes = @as(Int, @bitCast(latin1[0..size].*)); - buf[0..size].* = @as([size]u8, @bitCast(bytes)); - - // https://dotat.at/@/2022-06-27-tolower-swar.html - - const mask = bytes & 0x8080808080808080; - - if (mask > 0) { - const first_set_byte = @ctz(mask) / 8; - if (comptime stop) return .{ .written = std.math.maxInt(u32), .read = std.math.maxInt(u32) }; - if (comptime Environment.allow_assert) assert(latin1[first_set_byte] >= 127); - - buf = buf[first_set_byte..]; - latin1 = latin1[first_set_byte..]; - - break :inner; - } - - latin1 = latin1[size..]; - buf = buf[size..]; - } - } - - { - const end = latin1.ptr + @min(buf.len, latin1.len); - if (comptime Environment.allow_assert) assert(@intFromPtr(latin1.ptr + 8) > @intFromPtr(end)); - const start_ptr = @intFromPtr(buf.ptr); - const start_ptr_latin1 = @intFromPtr(latin1.ptr); - - while (latin1.ptr != end and latin1.ptr[0] <= 127) { - buf.ptr[0] = 
latin1.ptr[0]; - buf.ptr += 1; - latin1.ptr += 1; - } - - buf.len -= @intFromPtr(buf.ptr) - start_ptr; - latin1.len -= @intFromPtr(latin1.ptr) - start_ptr_latin1; - } - } - - if (latin1.len > 0) { - if (buf.len >= 2) { - if (comptime stop) return .{ .written = std.math.maxInt(u32), .read = std.math.maxInt(u32) }; - - buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]); - latin1 = latin1[1..]; - buf = buf[2..]; - } else { - break; - } - } - } - - return .{ - .written = @as(u32, @truncate(buf_.len - buf.len)), - .read = @as(u32, @truncate(latin1_.len - latin1.len)), - }; -} - -pub fn replaceLatin1WithUTF8(buf_: []u8) void { - var latin1 = buf_; - while (strings.firstNonASCII(latin1)) |i| { - latin1[i..][0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[i]); - - latin1 = latin1[i + 2 ..]; - } -} - -pub fn elementLengthLatin1IntoUTF8(slice: []const u8) usize { - return bun.simdutf.length.utf8.from.latin1(slice); -} - -pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult { - var buf = buf_; - var latin1 = latin1_; - while (buf.len > 0 and latin1.len > 0) { - const to_write = strings.firstNonASCII(latin1) orelse @as(u32, @truncate(@min(latin1.len, buf.len))); - if (comptime std.meta.alignment(Buffer) != @alignOf(u16)) { - strings.copyU8IntoU16WithAlignment(std.meta.alignment(Buffer), buf, latin1[0..to_write]); - } else { - strings.copyU8IntoU16(buf, latin1[0..to_write]); - } - - latin1 = latin1[to_write..]; - buf = buf[to_write..]; - if (latin1.len > 0 and buf.len >= 1) { - buf[0] = latin1ToCodepointBytesAssumeNotASCII16(latin1[0]); - latin1 = latin1[1..]; - buf = buf[1..]; - } - } - - return .{ - .read = @as(u32, @truncate(buf_.len - buf.len)), - .written = @as(u32, @truncate(latin1_.len - latin1.len)), - }; -} - -pub fn elementLengthLatin1IntoUTF16(comptime Type: type, latin1_: Type) usize { - // latin1 is always at most 1 UTF-16 code unit long - if (comptime std.meta.Child([]const u16) == 
Type) { - return latin1_.len; - } - - return bun.simdutf.length.utf16.from.latin1(latin1_); -} - -pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) !Escaped(u8) { - const Scalar = struct { - pub const lengths: [std.math.maxInt(u8) + 1]u4 = brk: { - var values: [std.math.maxInt(u8) + 1]u4 = undefined; - for (values, 0..) |_, i| { - switch (i) { - '"' => { - values[i] = """.len; - }, - '&' => { - values[i] = "&".len; - }, - '\'' => { - values[i] = "'".len; - }, - '<' => { - values[i] = "<".len; - }, - '>' => { - values[i] = ">".len; - }, - else => { - values[i] = 1; - }, - } - } - - break :brk values; - }; - - fn appendString(buf: [*]u8, comptime str: []const u8) callconv(bun.callconv_inline) usize { - buf[0..str.len].* = str[0..str.len].*; - return str.len; - } - - pub fn append(buf: [*]u8, char: u8) callconv(bun.callconv_inline) usize { - if (lengths[char] == 1) { - buf[0] = char; - return 1; - } - - return switch (char) { - '"' => appendString(buf, """), - '&' => appendString(buf, "&"), - '\'' => appendString(buf, "'"), - '<' => appendString(buf, "<"), - '>' => appendString(buf, ">"), - else => unreachable, - }; - } - - pub fn push(comptime len: anytype, chars_: *const [len]u8, allo: std.mem.Allocator) callconv(bun.callconv_inline) Escaped(u8) { - const chars = chars_.*; - var total: usize = 0; - - comptime var remain_to_comp = len; - comptime var comp_i = 0; - - inline while (remain_to_comp > 0) : (remain_to_comp -= 1) { - total += lengths[chars[comp_i]]; - comp_i += 1; - } - - if (total == len) { - return .{ .original = {} }; - } - - const output = allo.alloc(u8, total) catch unreachable; - var head = output.ptr; - inline for (comptime bun.range(0, len)) |i| { - head += @This().append(head, chars[i]); - } - - return Escaped(u8){ .allocated = output }; - } - }; - @setEvalBranchQuota(5000); - switch (latin1.len) { - 0 => return Escaped(u8){ .static = "" }, - 1 => return switch (latin1[0]) { - '"' => Escaped(u8){ .static = """ }, - 
'&' => Escaped(u8){ .static = "&" }, - '\'' => Escaped(u8){ .static = "'" }, - '<' => Escaped(u8){ .static = "<" }, - '>' => Escaped(u8){ .static = ">" }, - else => Escaped(u8){ .original = {} }, - }, - 2 => { - const first: []const u8 = switch (latin1[0]) { - '"' => """, - '&' => "&", - '\'' => "'", - '<' => "<", - '>' => ">", - else => latin1[0..1], - }; - const second: []const u8 = switch (latin1[1]) { - '"' => """, - '&' => "&", - '\'' => "'", - '<' => "<", - '>' => ">", - else => latin1[1..2], - }; - if (first.len == 1 and second.len == 1) { - return Escaped(u8){ .original = {} }; - } - - return Escaped(u8){ .allocated = strings.append(allocator, first, second) catch unreachable }; - }, - - // The simd implementation is slower for inputs less than 32 bytes. - 3 => return Scalar.push(3, latin1[0..3], allocator), - 4 => return Scalar.push(4, latin1[0..4], allocator), - 5 => return Scalar.push(5, latin1[0..5], allocator), - 6 => return Scalar.push(6, latin1[0..6], allocator), - 7 => return Scalar.push(7, latin1[0..7], allocator), - 8 => return Scalar.push(8, latin1[0..8], allocator), - 9 => return Scalar.push(9, latin1[0..9], allocator), - 10 => return Scalar.push(10, latin1[0..10], allocator), - 11 => return Scalar.push(11, latin1[0..11], allocator), - 12 => return Scalar.push(12, latin1[0..12], allocator), - 13 => return Scalar.push(13, latin1[0..13], allocator), - 14 => return Scalar.push(14, latin1[0..14], allocator), - 15 => return Scalar.push(15, latin1[0..15], allocator), - 16 => return Scalar.push(16, latin1[0..16], allocator), - 17 => return Scalar.push(17, latin1[0..17], allocator), - 18 => return Scalar.push(18, latin1[0..18], allocator), - 19 => return Scalar.push(19, latin1[0..19], allocator), - 20 => return Scalar.push(20, latin1[0..20], allocator), - 21 => return Scalar.push(21, latin1[0..21], allocator), - 22 => return Scalar.push(22, latin1[0..22], allocator), - 23 => return Scalar.push(23, latin1[0..23], allocator), - 24 => return 
Scalar.push(24, latin1[0..24], allocator), - 25 => return Scalar.push(25, latin1[0..25], allocator), - 26 => return Scalar.push(26, latin1[0..26], allocator), - 27 => return Scalar.push(27, latin1[0..27], allocator), - 28 => return Scalar.push(28, latin1[0..28], allocator), - 29 => return Scalar.push(29, latin1[0..29], allocator), - 30 => return Scalar.push(30, latin1[0..30], allocator), - 31 => return Scalar.push(31, latin1[0..31], allocator), - 32 => return Scalar.push(32, latin1[0..32], allocator), - - else => { - var remaining = latin1; - - const vec_chars = "\"&'<>"; - const vecs: [vec_chars.len]AsciiVector = comptime brk: { - var _vecs: [vec_chars.len]AsciiVector = undefined; - for (vec_chars, 0..) |c, i| { - _vecs[i] = @splat(c); - } - break :brk _vecs; - }; - - var any_needs_escape = false; - var buf: std.ArrayList(u8) = std.ArrayList(u8){ - .items = &.{}, - .capacity = 0, - .allocator = allocator, - }; - - if (comptime Environment.enableSIMD) { - // pass #1: scan for any characters that need escaping - // assume most strings won't need any escaping, so don't actually allocate the buffer - scan_and_allocate_lazily: while (remaining.len >= ascii_vector_size) { - if (comptime Environment.allow_assert) assert(!any_needs_escape); - const vec: AsciiVector = remaining[0..ascii_vector_size].*; - if (@reduce(.Max, @as(AsciiVectorU1, @bitCast((vec == vecs[0]))) | - @as(AsciiVectorU1, @bitCast((vec == vecs[1]))) | - @as(AsciiVectorU1, @bitCast((vec == vecs[2]))) | - @as(AsciiVectorU1, @bitCast((vec == vecs[3]))) | - @as(AsciiVectorU1, @bitCast((vec == vecs[4])))) == 1) - { - if (comptime Environment.allow_assert) assert(buf.capacity == 0); - - buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6); - const copy_len = @intFromPtr(remaining.ptr) - @intFromPtr(latin1.ptr); - buf.appendSliceAssumeCapacity(latin1[0..copy_len]); - any_needs_escape = true; - inline for (0..ascii_vector_size) |i| { - switch (vec[i]) { - '"' => { - 
buf.ensureUnusedCapacity((ascii_vector_size - i) + """.len) catch unreachable; - buf.items.ptr[buf.items.len .. buf.items.len + """.len][0..""".len].* = """.*; - buf.items.len += """.len; - }, - '&' => { - buf.ensureUnusedCapacity((ascii_vector_size - i) + "&".len) catch unreachable; - buf.items.ptr[buf.items.len .. buf.items.len + "&".len][0.."&".len].* = "&".*; - buf.items.len += "&".len; - }, - '\'' => { - buf.ensureUnusedCapacity((ascii_vector_size - i) + "'".len) catch unreachable; - buf.items.ptr[buf.items.len .. buf.items.len + "'".len][0.."'".len].* = "'".*; - buf.items.len += "'".len; - }, - '<' => { - buf.ensureUnusedCapacity((ascii_vector_size - i) + "<".len) catch unreachable; - buf.items.ptr[buf.items.len .. buf.items.len + "<".len][0.."<".len].* = "<".*; - buf.items.len += "<".len; - }, - '>' => { - buf.ensureUnusedCapacity((ascii_vector_size - i) + ">".len) catch unreachable; - buf.items.ptr[buf.items.len .. buf.items.len + ">".len][0..">".len].* = ">".*; - buf.items.len += ">".len; - }, - else => |c| { - buf.appendAssumeCapacity(c); - }, - } - } - - remaining = remaining[ascii_vector_size..]; - break :scan_and_allocate_lazily; - } - - remaining = remaining[ascii_vector_size..]; - } - } - - if (any_needs_escape) { - // pass #2: we found something that needed an escape - // so we'll go ahead and copy the buffer into a new buffer - while (remaining.len >= ascii_vector_size) { - const vec: AsciiVector = remaining[0..ascii_vector_size].*; - if (@reduce(.Max, @as(AsciiVectorU1, @bitCast((vec == vecs[0]))) | - @as(AsciiVectorU1, @bitCast((vec == vecs[1]))) | - @as(AsciiVectorU1, @bitCast((vec == vecs[2]))) | - @as(AsciiVectorU1, @bitCast((vec == vecs[3]))) | - @as(AsciiVectorU1, @bitCast((vec == vecs[4])))) == 1) - { - buf.ensureUnusedCapacity(ascii_vector_size + 6) catch unreachable; - inline for (0..ascii_vector_size) |i| { - switch (vec[i]) { - '"' => { - buf.ensureUnusedCapacity((ascii_vector_size - i) + """.len) catch unreachable; - 
buf.items.ptr[buf.items.len .. buf.items.len + """.len][0..""".len].* = """.*; - buf.items.len += """.len; - }, - '&' => { - buf.ensureUnusedCapacity((ascii_vector_size - i) + "&".len) catch unreachable; - buf.items.ptr[buf.items.len .. buf.items.len + "&".len][0.."&".len].* = "&".*; - buf.items.len += "&".len; - }, - '\'' => { - buf.ensureUnusedCapacity((ascii_vector_size - i) + "'".len) catch unreachable; - buf.items.ptr[buf.items.len .. buf.items.len + "'".len][0.."'".len].* = "'".*; - buf.items.len += "'".len; - }, - '<' => { - buf.ensureUnusedCapacity((ascii_vector_size - i) + "<".len) catch unreachable; - buf.items.ptr[buf.items.len .. buf.items.len + "<".len][0.."<".len].* = "<".*; - buf.items.len += "<".len; - }, - '>' => { - buf.ensureUnusedCapacity((ascii_vector_size - i) + ">".len) catch unreachable; - buf.items.ptr[buf.items.len .. buf.items.len + ">".len][0..">".len].* = ">".*; - buf.items.len += ">".len; - }, - else => |c| { - buf.appendAssumeCapacity(c); - }, - } - } - - remaining = remaining[ascii_vector_size..]; - continue; - } - - try buf.ensureUnusedCapacity(ascii_vector_size); - buf.items.ptr[buf.items.len .. 
buf.items.len + ascii_vector_size][0..ascii_vector_size].* = remaining[0..ascii_vector_size].*; - buf.items.len += ascii_vector_size; - remaining = remaining[ascii_vector_size..]; - } - } - - var ptr = remaining.ptr; - const end = remaining.ptr + remaining.len; - - if (!any_needs_escape) { - scan_and_allocate_lazily: while (ptr != end) : (ptr += 1) { - switch (ptr[0]) { - '"', '&', '\'', '<', '>' => |c| { - if (comptime Environment.allow_assert) assert(buf.capacity == 0); - - buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + @as(usize, Scalar.lengths[c])); - const copy_len = @intFromPtr(ptr) - @intFromPtr(latin1.ptr); - if (comptime Environment.allow_assert) assert(copy_len <= buf.capacity); - buf.items.len = copy_len; - @memcpy(buf.items[0..copy_len], latin1[0..copy_len]); - any_needs_escape = true; - break :scan_and_allocate_lazily; - }, - else => {}, - } - } - } - - while (ptr != end) : (ptr += 1) { - switch (ptr[0]) { - '"' => { - buf.appendSlice(""") catch unreachable; - }, - '&' => { - buf.appendSlice("&") catch unreachable; - }, - '\'' => { - buf.appendSlice("'") catch unreachable; // modified from escape-html; used to be ''' - }, - '<' => { - buf.appendSlice("<") catch unreachable; - }, - '>' => { - buf.appendSlice(">") catch unreachable; - }, - else => |c| { - buf.append(c) catch unreachable; - }, - } - } - - if (!any_needs_escape) { - if (comptime Environment.allow_assert) assert(buf.capacity == 0); - return Escaped(u8){ .original = {} }; - } - - return Escaped(u8){ .allocated = try buf.toOwnedSlice() }; - }, - } -} - -fn Escaped(comptime T: type) type { - return union(enum) { - static: []const u8, - original: void, - allocated: []T, - }; -} - -pub fn escapeHTMLForUTF16Input(allocator: std.mem.Allocator, utf16: []const u16) !Escaped(u16) { - const Scalar = struct { - pub const lengths: [std.math.maxInt(u8) + 1]u4 = brk: { - var values: [std.math.maxInt(u8) + 1]u4 = undefined; - for (values, 0..) 
|_, i| { - values[i] = switch (i) { - '"' => """.len, - '&' => "&".len, - '\'' => "'".len, - '<' => "<".len, - '>' => ">".len, - else => 1, - }; - } - - break :brk values; - }; - }; - switch (utf16.len) { - 0 => return Escaped(u16){ .static = &[_]u8{} }, - 1 => { - switch (utf16[0]) { - '"' => return Escaped(u16){ .static = """ }, - '&' => return Escaped(u16){ .static = "&" }, - '\'' => return Escaped(u16){ .static = "'" }, - '<' => return Escaped(u16){ .static = "<" }, - '>' => return Escaped(u16){ .static = ">" }, - else => return Escaped(u16){ .original = {} }, - } - }, - 2 => { - const first_16 = switch (utf16[0]) { - '"' => toUTF16Literal("""), - '&' => toUTF16Literal("&"), - '\'' => toUTF16Literal("'"), - '<' => toUTF16Literal("<"), - '>' => toUTF16Literal(">"), - else => @as([]const u16, utf16[0..1]), - }; - - const second_16 = switch (utf16[1]) { - '"' => toUTF16Literal("""), - '&' => toUTF16Literal("&"), - '\'' => toUTF16Literal("'"), - '<' => toUTF16Literal("<"), - '>' => toUTF16Literal(">"), - else => @as([]const u16, utf16[1..2]), - }; - - if (first_16.ptr == utf16.ptr and second_16.ptr == utf16.ptr + 1) { - return Escaped(u16){ .original = {} }; - } - - var buf = allocator.alloc(u16, first_16.len + second_16.len) catch unreachable; - bun.copy(u16, buf, first_16); - bun.copy(u16, buf[first_16.len..], second_16); - return Escaped(u16){ .allocated = buf }; - }, - - else => { - var remaining = utf16; - - var any_needs_escape = false; - var buf: std.ArrayList(u16) = undefined; - - if (comptime Environment.enableSIMD) { - const vec_chars = "\"&'<>"; - const vecs: [vec_chars.len]AsciiU16Vector = brk: { - var _vecs: [vec_chars.len]AsciiU16Vector = undefined; - for (vec_chars, 0..) 
|c, i| { - _vecs[i] = @splat(@as(u16, c)); - } - break :brk _vecs; - }; - // pass #1: scan for any characters that need escaping - // assume most strings won't need any escaping, so don't actually allocate the buffer - scan_and_allocate_lazily: while (remaining.len >= ascii_u16_vector_size) { - if (comptime Environment.allow_assert) assert(!any_needs_escape); - const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*; - if (@reduce(.Max, @as(AsciiVectorU16U1, @bitCast(vec > @as(AsciiU16Vector, @splat(@as(u16, 127))))) | - @as(AsciiVectorU16U1, @bitCast((vec == vecs[0]))) | - @as(AsciiVectorU16U1, @bitCast((vec == vecs[1]))) | - @as(AsciiVectorU16U1, @bitCast((vec == vecs[2]))) | - @as(AsciiVectorU16U1, @bitCast((vec == vecs[3]))) | - @as(AsciiVectorU16U1, @bitCast((vec == vecs[4])))) == 1) - { - var i: u16 = 0; - lazy: { - while (i < ascii_u16_vector_size) { - switch (remaining[i]) { - '"', '&', '\'', '<', '>' => { - any_needs_escape = true; - break :lazy; - }, - 128...std.math.maxInt(u16) => { - const cp = utf16Codepoint([]const u16, remaining[i..]); - i += @as(u16, cp.len); - }, - else => { - i += 1; - }, - } - } - } - - if (!any_needs_escape) { - remaining = remaining[i..]; - continue :scan_and_allocate_lazily; - } - - if (comptime Environment.allow_assert) assert(@intFromPtr(remaining.ptr + i) >= @intFromPtr(utf16.ptr)); - const to_copy = std.mem.sliceAsBytes(utf16)[0 .. 
@intFromPtr(remaining.ptr + i) - @intFromPtr(utf16.ptr)]; - const to_copy_16 = std.mem.bytesAsSlice(u16, to_copy); - buf = try std.ArrayList(u16).initCapacity(allocator, utf16.len + 6); - try buf.appendSlice(to_copy_16); - - while (i < ascii_u16_vector_size) { - switch (remaining[i]) { - '"', '&', '\'', '<', '>' => |c| { - const result = switch (c) { - '"' => toUTF16Literal("""), - '&' => toUTF16Literal("&"), - '\'' => toUTF16Literal("'"), - '<' => toUTF16Literal("<"), - '>' => toUTF16Literal(">"), - else => unreachable, - }; - - buf.appendSlice(result) catch unreachable; - i += 1; - }, - 128...std.math.maxInt(u16) => { - const cp = utf16Codepoint([]const u16, remaining[i..]); - - buf.appendSlice(remaining[i..][0..@as(usize, cp.len)]) catch unreachable; - i += @as(u16, cp.len); - }, - else => |c| { - i += 1; - buf.append(c) catch unreachable; - }, - } - } - - // edgecase: code point width could exceed asdcii_u16_vector_size - remaining = remaining[i..]; - break :scan_and_allocate_lazily; - } - - remaining = remaining[ascii_u16_vector_size..]; - } - - if (any_needs_escape) { - // pass #2: we found something that needed an escape - // but there's still some more text to - // so we'll go ahead and copy the buffer into a new buffer - while (remaining.len >= ascii_u16_vector_size) { - const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*; - if (@reduce(.Max, @as(AsciiVectorU16U1, @bitCast(vec > @as(AsciiU16Vector, @splat(@as(u16, 127))))) | - @as(AsciiVectorU16U1, @bitCast((vec == vecs[0]))) | - @as(AsciiVectorU16U1, @bitCast((vec == vecs[1]))) | - @as(AsciiVectorU16U1, @bitCast((vec == vecs[2]))) | - @as(AsciiVectorU16U1, @bitCast((vec == vecs[3]))) | - @as(AsciiVectorU16U1, @bitCast((vec == vecs[4])))) == 1) - { - buf.ensureUnusedCapacity(ascii_u16_vector_size) catch unreachable; - var i: u16 = 0; - while (i < ascii_u16_vector_size) { - switch (remaining[i]) { - '"' => { - buf.appendSlice(toUTF16Literal(""")) catch unreachable; - i += 1; - }, - '&' => { - 
buf.appendSlice(toUTF16Literal("&")) catch unreachable; - i += 1; - }, - '\'' => { - buf.appendSlice(toUTF16Literal("'")) catch unreachable; // modified from escape-html; used to be ''' - i += 1; - }, - '<' => { - buf.appendSlice(toUTF16Literal("<")) catch unreachable; - i += 1; - }, - '>' => { - buf.appendSlice(toUTF16Literal(">")) catch unreachable; - i += 1; - }, - 128...std.math.maxInt(u16) => { - const cp = utf16Codepoint([]const u16, remaining[i..]); - - buf.appendSlice(remaining[i..][0..@as(usize, cp.len)]) catch unreachable; - i += @as(u16, cp.len); - }, - else => |c| { - buf.append(c) catch unreachable; - i += 1; - }, - } - } - - remaining = remaining[i..]; - continue; - } - - try buf.ensureUnusedCapacity(ascii_u16_vector_size); - buf.items.ptr[buf.items.len .. buf.items.len + ascii_u16_vector_size][0..ascii_u16_vector_size].* = remaining[0..ascii_u16_vector_size].*; - buf.items.len += ascii_u16_vector_size; - remaining = remaining[ascii_u16_vector_size..]; - } - } - } - - var ptr = remaining.ptr; - const end = remaining.ptr + remaining.len; - - if (!any_needs_escape) { - scan_and_allocate_lazily: while (ptr != end) { - switch (ptr[0]) { - '"', '&', '\'', '<', '>' => |c| { - buf = try std.ArrayList(u16).initCapacity(allocator, utf16.len + @as(usize, Scalar.lengths[c])); - if (comptime Environment.allow_assert) assert(@intFromPtr(ptr) >= @intFromPtr(utf16.ptr)); - - const to_copy = std.mem.sliceAsBytes(utf16)[0 .. 
@intFromPtr(ptr) - @intFromPtr(utf16.ptr)]; - const to_copy_16 = std.mem.bytesAsSlice(u16, to_copy); - try buf.appendSlice(to_copy_16); - any_needs_escape = true; - break :scan_and_allocate_lazily; - }, - 128...std.math.maxInt(u16) => { - const cp = utf16Codepoint([]const u16, ptr[0..if (ptr + 1 == end) 1 else 2]); - - ptr += @as(u16, cp.len); - }, - else => { - ptr += 1; - }, - } - } - } - - while (ptr != end) { - switch (ptr[0]) { - '"' => { - buf.appendSlice(toUTF16Literal(""")) catch unreachable; - ptr += 1; - }, - '&' => { - buf.appendSlice(toUTF16Literal("&")) catch unreachable; - ptr += 1; - }, - '\'' => { - buf.appendSlice(toUTF16Literal("'")) catch unreachable; // modified from escape-html; used to be ''' - ptr += 1; - }, - '<' => { - buf.appendSlice(toUTF16Literal("<")) catch unreachable; - ptr += 1; - }, - '>' => { - buf.appendSlice(toUTF16Literal(">")) catch unreachable; - ptr += 1; - }, - 128...std.math.maxInt(u16) => { - const cp = utf16Codepoint([]const u16, ptr[0..if (ptr + 1 == end) 1 else 2]); - - buf.appendSlice(ptr[0..@as(usize, cp.len)]) catch unreachable; - ptr += @as(u16, cp.len); - }, - - else => |c| { - buf.append(c) catch unreachable; - ptr += 1; - }, - } - } - - if (!any_needs_escape) { - return Escaped(u16){ .original = {} }; - } - - return Escaped(u16){ .allocated = try buf.toOwnedSlice() }; - }, - } -} - -pub fn latin1ToCodepointAssumeNotASCII(char: u8, comptime CodePointType: type) CodePointType { - return @as( - CodePointType, - @intCast(latin1ToCodepointBytesAssumeNotASCII16(char)), - ); -} - -const latin1_to_utf16_conversion_table = [256]u16{ - 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, // 00-07 - 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F - 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, // 10-17 - 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, // 18-1F - 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 20-27 - 0x0028, 0x0029, 
0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // 28-2F - 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 30-37 - 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // 38-3F - 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 40-47 - 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // 48-4F - 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 50-57 - 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // 58-5F - 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 60-67 - 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // 68-6F - 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 70-77 - 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // 78-7F - 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 - 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F - 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 - 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F - 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, // A0-A7 - 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, // A8-AF - 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, // B0-B7 - 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, // B8-BF - 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, // C0-C7 - 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, // C8-CF - 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, // D0-D7 - 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, // D8-DF - 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, // E0-E7 - 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, // E8-EF - 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, // F0-F7 - 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF, 
// F8-FF -}; - -pub fn latin1ToCodepointBytesAssumeNotASCII(char: u32) [2]u8 { - var bytes = [4]u8{ 0, 0, 0, 0 }; - _ = encodeWTF8Rune(&bytes, @as(i32, @intCast(char))); - return bytes[0..2].*; -} - -pub fn latin1ToCodepointBytesAssumeNotASCII16(char: u32) u16 { - return latin1_to_utf16_conversion_table[@as(u8, @truncate(char))]; -} - -pub fn copyUTF16IntoUTF8(buf: []u8, comptime Type: type, utf16: Type, comptime allow_partial_write: bool) EncodeIntoResult { - if (comptime Type == []const u16) { - if (bun.FeatureFlags.use_simdutf) { - if (utf16.len == 0) - return .{ .read = 0, .written = 0 }; - const trimmed = bun.simdutf.trim.utf16(utf16); - if (trimmed.len == 0) - return .{ .read = 0, .written = 0 }; - - const out_len = if (buf.len <= (trimmed.len * 3 + 2)) - bun.simdutf.length.utf8.from.utf16.le(trimmed) - else - buf.len; - - return copyUTF16IntoUTF8WithBuffer(buf, Type, utf16, trimmed, out_len, allow_partial_write); - } - } - - return copyUTF16IntoUTF8WithBuffer(buf, Type, utf16, utf16, utf16.len, allow_partial_write); -} - -pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type, trimmed: Type, out_len: usize, comptime allow_partial_write: bool) EncodeIntoResult { - var remaining = buf; - var utf16_remaining = utf16; - var ended_on_non_ascii = false; - - brk: { - if (comptime Type == []const u16) { - if (bun.FeatureFlags.use_simdutf) { - log("UTF16 {d} -> UTF8 {d}", .{ utf16.len, out_len }); - if (remaining.len >= out_len) { - const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(trimmed, remaining); - if (result.status == .surrogate) break :brk; - - return EncodeIntoResult{ - .read = @as(u32, @truncate(trimmed.len)), - .written = @as(u32, @truncate(result.count)), - }; - } - } - } - } - - while (firstNonASCII16(Type, utf16_remaining)) |i| { - const end = @min(i, remaining.len); - if (end > 0) copyU16IntoU8(remaining, utf16_remaining[0..end]); - remaining = remaining[end..]; - utf16_remaining = utf16_remaining[end..]; - - if 
(@min(utf16_remaining.len, remaining.len) == 0) - break; - - const replacement = utf16CodepointWithFFFD(Type, utf16_remaining); - - const width: usize = replacement.utf8Width(); - if (width > remaining.len) { - ended_on_non_ascii = width > 1; - if (comptime allow_partial_write) switch (width) { - 2 => { - if (remaining.len > 0) { - //only first will be written - remaining[0] = @as(u8, @truncate(0xC0 | (replacement.code_point >> 6))); - remaining = remaining[remaining.len..]; - } - }, - 3 => { - //only first to second written - switch (remaining.len) { - 1 => { - remaining[0] = @as(u8, @truncate(0xE0 | (replacement.code_point >> 12))); - remaining = remaining[remaining.len..]; - }, - 2 => { - remaining[0] = @as(u8, @truncate(0xE0 | (replacement.code_point >> 12))); - remaining[1] = @as(u8, @truncate(0x80 | (replacement.code_point >> 6) & 0x3F)); - remaining = remaining[remaining.len..]; - }, - else => {}, - } - }, - 4 => { - //only 1 to 3 written - switch (remaining.len) { - 1 => { - remaining[0] = @as(u8, @truncate(0xF0 | (replacement.code_point >> 18))); - remaining = remaining[remaining.len..]; - }, - 2 => { - remaining[0] = @as(u8, @truncate(0xF0 | (replacement.code_point >> 18))); - remaining[1] = @as(u8, @truncate(0x80 | (replacement.code_point >> 12) & 0x3F)); - remaining = remaining[remaining.len..]; - }, - 3 => { - remaining[0] = @as(u8, @truncate(0xF0 | (replacement.code_point >> 18))); - remaining[1] = @as(u8, @truncate(0x80 | (replacement.code_point >> 12) & 0x3F)); - remaining[2] = @as(u8, @truncate(0x80 | (replacement.code_point >> 6) & 0x3F)); - remaining = remaining[remaining.len..]; - }, - else => {}, - } - }, - - else => {}, - }; - break; - } - - utf16_remaining = utf16_remaining[replacement.len..]; - _ = encodeWTF8RuneT(remaining.ptr[0..4], u32, @as(u32, replacement.code_point)); - remaining = remaining[width..]; - } - - if (remaining.len > 0 and !ended_on_non_ascii and utf16_remaining.len > 0) { - const len = @min(remaining.len, 
utf16_remaining.len); - copyU16IntoU8(remaining[0..len], utf16_remaining[0..len]); - utf16_remaining = utf16_remaining[len..]; - remaining = remaining[len..]; - } - - return .{ - .read = @as(u32, @truncate(utf16.len - utf16_remaining.len)), - .written = @as(u32, @truncate(buf.len - remaining.len)), - }; -} - -pub fn elementLengthUTF16IntoUTF8(comptime Type: type, utf16: Type) usize { - if (bun.FeatureFlags.use_simdutf) { - return bun.simdutf.length.utf8.from.utf16.le(utf16); - } - - var utf16_remaining = utf16; - var count: usize = 0; - - while (firstNonASCII16(Type, utf16_remaining)) |i| { - count += i; - - utf16_remaining = utf16_remaining[i..]; - - const replacement = utf16Codepoint(Type, utf16_remaining); - - count += replacement.utf8Width(); - utf16_remaining = utf16_remaining[replacement.len..]; - } - - return count + utf16_remaining.len; -} - -pub fn elementLengthUTF8IntoUTF16(comptime Type: type, utf8: Type) usize { - var utf8_remaining = utf8; - var count: usize = 0; - - if (bun.FeatureFlags.use_simdutf) { - return bun.simdutf.length.utf16.from.utf8(utf8); - } - - while (firstNonASCII(utf8_remaining)) |i| { - count += i; - - utf8_remaining = utf8_remaining[i..]; - - const replacement = utf16Codepoint(Type, utf8_remaining); - - count += replacement.len; - utf8_remaining = utf8_remaining[@min(replacement.utf8Width(), utf8_remaining.len)..]; - } - - return count + utf8_remaining.len; -} - -// Check utf16 string equals utf8 string without allocating extra memory -pub fn utf16EqlString(text: []const u16, str: string) bool { - if (text.len > str.len) { - // Strings can't be equal if UTF-16 encoding is longer than UTF-8 encoding - return false; - } - - var temp = [4]u8{ 0, 0, 0, 0 }; - const n = text.len; - var j: usize = 0; - var i: usize = 0; - // TODO: is it safe to just make this u32 or u21? 
- var r1: i32 = undefined; - while (i < n) : (i += 1) { - r1 = text[i]; - if (r1 >= 0xD800 and r1 <= 0xDBFF and i + 1 < n) { - const r2: i32 = text[i + 1]; - if (r2 >= 0xDC00 and r2 <= 0xDFFF) { - r1 = (r1 - 0xD800) << 10 | (r2 - 0xDC00) + 0x10000; - i += 1; - } - } - - const width = encodeWTF8Rune(&temp, r1); - if (j + width > str.len) { - return false; - } - for (0..width) |k| { - if (temp[k] != str[j]) { - return false; - } - j += 1; - } - } - - return j == str.len; -} - -pub fn encodeUTF8Comptime(comptime cp: u32) []const u8 { - const HEADER_CONT_BYTE: u8 = 0b10000000; - const HEADER_2BYTE: u8 = 0b11000000; - const HEADER_3BYTE: u8 = 0b11100000; - const HEADER_4BYTE: u8 = 0b11100000; - - return switch (cp) { - 0x0...0x7F => return &[_]u8{@intCast(cp)}, - 0x80...0x7FF => { - return &[_]u8{ - HEADER_2BYTE | @as(u8, cp >> 6), - HEADER_CONT_BYTE | @as(u8, cp & 0b00111111), - }; - }, - 0x800...0xFFFF => { - return &[_]u8{ - HEADER_3BYTE | @as(u8, cp >> 12), - HEADER_CONT_BYTE | @as(u8, (cp >> 6) & 0b00111111), - HEADER_CONT_BYTE | @as(u8, cp & 0b00111111), - }; - }, - 0x10000...0x10FFFF => { - return &[_]u8{ - HEADER_4BYTE | @as(u8, cp >> 18), - HEADER_CONT_BYTE | @as(u8, (cp >> 12) & 0b00111111), - HEADER_CONT_BYTE | @as(u8, (cp >> 6) & 0b00111111), - HEADER_CONT_BYTE | @as(u8, cp & 0b00111111), - }; - }, - else => @compileError("Invalid UTF-8 codepoint!"), - }; -} - -// This is a clone of golang's "utf8.EncodeRune" that has been modified to encode using -// WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for more info. 
-pub fn encodeWTF8Rune(p: *[4]u8, r: i32) u3_fast { - return @call( - .always_inline, - encodeWTF8RuneT, - .{ - p, - u32, - @as(u32, @intCast(r)), - }, - ); -} - -pub fn encodeWTF8RuneT(p: *[4]u8, comptime R: type, r: R) u3_fast { - switch (r) { - 0...0x7F => { - p[0] = @as(u8, @intCast(r)); - return 1; - }, - (0x7F + 1)...0x7FF => { - p[0] = @as(u8, @truncate(0xC0 | ((r >> 6)))); - p[1] = @as(u8, @truncate(0x80 | (r & 0x3F))); - return 2; - }, - (0x7FF + 1)...0xFFFF => { - p[0] = @as(u8, @truncate(0xE0 | ((r >> 12)))); - p[1] = @as(u8, @truncate(0x80 | ((r >> 6) & 0x3F))); - p[2] = @as(u8, @truncate(0x80 | (r & 0x3F))); - return 3; - }, - else => { - p[0] = @as(u8, @truncate(0xF0 | ((r >> 18)))); - p[1] = @as(u8, @truncate(0x80 | ((r >> 12) & 0x3F))); - p[2] = @as(u8, @truncate(0x80 | ((r >> 6) & 0x3F))); - p[3] = @as(u8, @truncate(0x80 | (r & 0x3F))); - return 4; - }, - } -} - -pub fn wtf8Sequence(code_point: u32) [4]u8 { - return switch (code_point) { - 0...0x7f => .{ - @intCast(code_point), - 0, - 0, - 0, - }, - (0x7f + 1)...0x7ff => .{ - @truncate(0xc0 | (code_point >> 6)), - @truncate(0x80 | (code_point & 0x3f)), - 0, - 0, - }, - (0x7ff + 1)...0xffff => .{ - @truncate(0xe0 | (code_point >> 12)), - @truncate(0x80 | ((code_point >> 6) & 0x3f)), - @truncate(0x80 | (code_point & 0x3f)), - 0, - }, - else => .{ - @truncate(0xf0 | (code_point >> 18)), - @truncate(0x80 | ((code_point >> 12) & 0x3f)), - @truncate(0x80 | ((code_point >> 6) & 0x3f)), - @truncate(0x80 | (code_point & 0x3f)), - }, - }; -} - -pub inline fn wtf8ByteSequenceLength(first_byte: u8) u8 { - return switch (first_byte) { - 0...0x80 - 1 => 1, - else => if ((first_byte & 0xE0) == 0xC0) - 2 - else if ((first_byte & 0xF0) == 0xE0) - 3 - else if ((first_byte & 0xF8) == 0xF0) - 4 - else - 1, - }; -} - -/// 0 == invalid -pub inline fn wtf8ByteSequenceLengthWithInvalid(first_byte: u8) u8 { - return switch (first_byte) { - 0...0x80 - 1 => 1, - else => if ((first_byte & 0xE0) == 0xC0) - 2 - else if 
((first_byte & 0xF0) == 0xE0) - 3 - else if ((first_byte & 0xF8) == 0xF0) - 4 - else - 1, - }; -} - -/// Convert potentially ill-formed UTF-8 or UTF-16 bytes to a Unicode Codepoint. -/// Invalid codepoints are replaced with `zero` parameter -/// This is a clone of esbuild's decodeWTF8Rune -/// which was a clone of golang's "utf8.DecodeRune" that was modified to decode using WTF-8 instead. -/// Asserts a multi-byte codepoint -pub inline fn decodeWTF8RuneTMultibyte(p: *const [4]u8, len: u3_fast, comptime T: type, comptime zero: T) T { - if (comptime Environment.allow_assert) assert(len > 1); - - const s1 = p[1]; - if ((s1 & 0xC0) != 0x80) return zero; - - if (len == 2) { - const cp = @as(T, p[0] & 0x1F) << 6 | @as(T, s1 & 0x3F); - if (cp < 0x80) return zero; - return cp; - } - - const s2 = p[2]; - - if ((s2 & 0xC0) != 0x80) return zero; - - if (len == 3) { - const cp = (@as(T, p[0] & 0x0F) << 12) | (@as(T, s1 & 0x3F) << 6) | (@as(T, s2 & 0x3F)); - if (cp < 0x800) return zero; - return cp; - } - - const s3 = p[3]; - - if ((s3 & 0xC0) != 0x80) return zero; - - { - const cp = (@as(T, p[0] & 0x07) << 18) | (@as(T, s1 & 0x3F) << 12) | (@as(T, s2 & 0x3F) << 6) | (@as(T, s3 & 0x3F)); - if (cp < 0x10000 or cp > 0x10FFFF) return zero; - return cp; - } - - unreachable; -} - pub const ascii_vector_size = if (Environment.isWasm) 8 else 16; pub const ascii_u16_vector_size = if (Environment.isWasm) 4 else 8; pub const AsciiVectorInt = std.meta.Int(.unsigned, ascii_vector_size); @@ -3846,100 +1135,6 @@ pub const AsciiVectorU16U1 = @Vector(ascii_u16_vector_size, u1); pub const AsciiU16Vector = @Vector(ascii_u16_vector_size, u16); pub const max_4_ascii: @Vector(4, u8) = @splat(@as(u8, 127)); -const UTF8_ACCEPT: u8 = 0; -const UTF8_REJECT: u8 = 12; - -const utf8d: [364]u8 = .{ - // The first part of the table maps bytes to character classes that - // to reduce the size of the transition table and create bitmasks. 
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - - // The second part is a transition table that maps a combination - // of a state of the automaton and a character class to a state. - 0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 0, - 12, 0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, - 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, - 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -}; - -pub fn decodeCheck(state: u8, byte: u8) u8 { - const char_type: u32 = utf8d[byte]; - // we dont care about the codep - // codep = if (*state != UTF8_ACCEPT) (byte & 0x3f) | (*codep << 6) else (0xff >> char_type) & (byte); - - const value = @as(u32, 256) + state + char_type; - if (value >= utf8d.len) return UTF8_REJECT; - return utf8d[value]; -} - -// Copyright (c) 2008-2009 Bjoern Hoehrmann -// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. 
-pub fn isValidUTF8WithoutSIMD(slice: []const u8) bool { - var state: u8 = 0; - - for (slice) |byte| { - state = decodeCheck(state, byte); - } - return state == UTF8_ACCEPT; -} - -pub fn isValidUTF8(slice: []const u8) bool { - if (bun.FeatureFlags.use_simdutf) - return bun.simdutf.validate.utf8(slice); - - return isValidUTF8WithoutSIMD(slice); -} - -pub fn isAllASCII(slice: []const u8) bool { - if (@inComptime()) { - for (slice) |char| { - if (char > 127) { - return false; - } - } - return true; - } - - return bun.simdutf.validate.ascii(slice); -} - -// #define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) -pub fn u16Lead(supplementary: anytype) callconv(bun.callconv_inline) u16 { - return @intCast((supplementary >> 10) + 0xd7c0); -} - -// #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) -pub fn u16Trail(supplementary: anytype) callconv(bun.callconv_inline) u16 { - return @intCast((supplementary & 0x3ff) | 0xdc00); -} - -// #define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00) -pub fn u16IsTrail(supplementary: u16) callconv(bun.callconv_inline) bool { - return (@as(u32, @intCast(supplementary)) & 0xfffffc00) == 0xdc00; -} - -// #define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800) -pub fn u16IsLead(supplementary: u16) callconv(bun.callconv_inline) bool { - return (@as(u32, @intCast(supplementary)) & 0xfffffc00) == 0xd800; -} - -// #define U16_GET_SUPPLEMENTARY(lead, trail) \ -// (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET) -pub fn u16GetSupplementary(lead: u32, trail: u32) callconv(bun.callconv_inline) u32 { - const shifted = lead << 10; - return (shifted + trail) - u16_surrogate_offset; -} - -// #define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) -pub const u16_surrogate_offset = 56613888; - pub fn firstNonASCII(slice: []const u8) ?u32 { const result = bun.simdutf.validate.with_errors.ascii(slice); if (result.status == .success) { @@ -4541,101 +1736,6 @@ pub fn firstNonASCII16(comptime Slice: type, 
slice: Slice) ?u32 { return null; } -/// Convert potentially ill-formed UTF-8 or UTF-16 bytes to a Unicode Codepoint. -/// - Invalid codepoints are replaced with `zero` parameter -/// - Null bytes return 0 -pub fn decodeWTF8RuneT(p: *const [4]u8, len: u3_fast, comptime T: type, comptime zero: T) T { - if (len == 0) return zero; - if (len == 1) return p[0]; - - return decodeWTF8RuneTMultibyte(p, len, T, zero); -} - -pub fn codepointSize(comptime R: type, r: R) u3_fast { - return switch (r) { - 0b0000_0000...0b0111_1111 => 1, - 0b1100_0000...0b1101_1111 => 2, - 0b1110_0000...0b1110_1111 => 3, - 0b1111_0000...0b1111_0111 => 4, - else => 0, - }; -} - -// /// Encode Type into UTF-8 bytes. -// /// - Invalid unicode data becomes U+FFFD REPLACEMENT CHARACTER. -// /// - -// pub fn encodeUTF8RuneT(out: *[4]u8, comptime R: type, c: R) u3_fast { -// switch (c) { -// 0b0000_0000...0b0111_1111 => { -// out[0] = @intCast(u8, c); -// return 1; -// }, -// 0b1100_0000...0b1101_1111 => { -// out[0] = @truncate(u8, 0b11000000 | (c >> 6)); -// out[1] = @truncate(u8, 0b10000000 | c & 0b111111); -// return 2; -// }, - -// 0b1110_0000...0b1110_1111 => { -// if (0xd800 <= c and c <= 0xdfff) { -// // Replacement character -// out[0..3].* = [_]u8{ 0xEF, 0xBF, 0xBD }; - -// return 3; -// } - -// out[0] = @truncate(u8, 0b11100000 | (c >> 12)); -// out[1] = @truncate(u8, 0b10000000 | (c >> 6) & 0b111111); -// out[2] = @truncate(u8, 0b10000000 | c & 0b111111); -// return 3; -// }, -// 0b1111_0000...0b1111_0111 => { -// out[0] = @truncate(u8, 0b11110000 | (c >> 18)); -// out[1] = @truncate(u8, 0b10000000 | (c >> 12) & 0b111111); -// out[2] = @truncate(u8, 0b10000000 | (c >> 6) & 0b111111); -// out[3] = @truncate(u8, 0b10000000 | c & 0b111111); -// return 4; -// }, -// else => { -// // Replacement character -// out[0..3].* = [_]u8{ 0xEF, 0xBF, 0xBD }; - -// return 3; -// }, -// } -// } - -pub fn containsNonBmpCodePoint(text: string) bool { - var iter = CodepointIterator.init(text); - var curs = 
CodepointIterator.Cursor{}; - - while (iter.next(&curs)) { - if (curs.c > 0xFFFF) { - return true; - } - } - - return false; -} - -pub fn containsNonBmpCodePointOrIsInvalidIdentifier(text: string) bool { - var iter = CodepointIterator.init(text); - var curs = CodepointIterator.Cursor{}; - - if (!iter.next(&curs)) return true; - - if (curs.c > 0xFFFF or !js_lexer.isIdentifierStart(curs.c)) - return true; - - while (iter.next(&curs)) { - if (curs.c > 0xFFFF or !js_lexer.isIdentifierContinue(curs.c)) { - return true; - } - } - - return false; -} - // this is std.mem.trim except it doesn't forcibly change the slice to be const pub fn trim(slice: anytype, comptime values_to_strip: []const u8) @TypeOf(slice) { var begin: usize = 0; @@ -4689,7 +1789,7 @@ pub fn cmpStringsDesc(_: void, a: string, b: string) bool { /// Every time you read a non^2 sized integer, Zig masks off the extra bits. /// This is a meaningful performance difference, including in release builds. -const u3_fast = u8; +pub const u3_fast = u8; pub fn sortAsc(in: []string) void { // TODO: experiment with simd to see if it's faster @@ -4726,238 +1826,6 @@ pub fn toASCIIHexValue(character: u8) u8 { }; } -pub inline fn utf8ByteSequenceLength(first_byte: u8) u3_fast { - return switch (first_byte) { - 0b0000_0000...0b0111_1111 => 1, - 0b1100_0000...0b1101_1111 => 2, - 0b1110_0000...0b1110_1111 => 3, - 0b1111_0000...0b1111_0111 => 4, - else => 0, - }; -} - -/// Same as `utf8ByteSequenceLength`, but assumes the byte is valid UTF-8. -/// -/// You should only use this function if you know the string you are getting the byte from is valid UTF-8. 
-pub inline fn utf8ByteSequenceLengthUnsafe(first_byte: u8) u3_fast { - return switch (first_byte) { - 0b0000_0000...0b0111_1111 => 1, - 0b1100_0000...0b1101_1111 => 2, - 0b1110_0000...0b1110_1111 => 3, - 0b1111_0000...0b1111_0111 => 4, - else => unreachable, - }; -} - -pub fn NewCodePointIterator(comptime CodePointType_: type, comptime zeroValue: comptime_int) type { - return struct { - const Iterator = @This(); - bytes: []const u8, - i: usize, - next_width: usize = 0, - width: u3_fast = 0, - c: CodePointType = zeroValue, - - pub const CodePointType = CodePointType_; - - pub const ZeroValue = zeroValue; - - pub const Cursor = struct { - i: u32 = 0, - c: CodePointType = zeroValue, - width: u3_fast = 0, - }; - - pub fn init(str: string) Iterator { - return Iterator{ .bytes = str, .i = 0, .c = zeroValue }; - } - - pub fn initOffset(str: string, i: usize) Iterator { - return Iterator{ .bytes = str, .i = i, .c = zeroValue }; - } - - const SkipResult = enum { - eof, - found, - not_found, - }; - - /// Advance forward until the scalar function returns true. - /// THe simd function is "best effort" and expected to sometimes return a result which `scalar` will return false for. - /// This is because we don't decode UTF-8 in the SIMD code path. - pub fn skip(it: *const Iterator, cursor: *Cursor, simd: *const fn (input: []const u8) ?usize, scalar: *const fn (CodePointType) bool) SkipResult { - while (true) { - // 1. Get current position. Check for EOF. - const current_byte_index = cursor.i; - if (current_byte_index >= it.bytes.len) { - return .not_found; // Reached end without finding - } - - // 2. Decode the *next* character using the standard iterator method. - if (!next(it, cursor)) { - return .not_found; // Reached end or error during decode - } - - // 3. Check if the character just decoded matches the scalar condition. - if (scalar(it.c)) { - return .found; // Found it! - } - - // 4. Optimization: Can we skip ahead using SIMD? 
- // Scan starting from the byte *after* the character we just decoded. - const next_scan_start_index = cursor.i; - if (next_scan_start_index >= it.bytes.len) { - // Just decoded the last character and it didn't match. - return .not_found; - } - const remaining_slice = it.bytes[next_scan_start_index..]; - if (remaining_slice.len == 0) { - return .not_found; - } - - // Ask SIMD for the next potential candidate. - if (simd(remaining_slice)) |pos| { - // SIMD found a potential candidate `pos` bytes ahead. - if (pos > 0) { - // Jump the byte index to the start of the potential candidate. - cursor.i = next_scan_start_index + @as(u32, @intCast(pos)); - // Reset width so next() decodes correctly from the jumped position. - cursor.width = 0; - // Loop will continue, starting the decode from the new cursor.i. - continue; - } - // If pos == 0, SIMD suggests the *immediate next* character. - // No jump needed, just let the loop iterate naturally. - // Fallthrough to the end of the loop. - } else { - // SIMD found no potential candidates in the rest of the string. - // Since the SIMD search set is a superset of the scalar check set, - // we can guarantee that no character satisfying `scalar` exists further. - // Since the current character (decoded in step 2) also didn't match, - // we can conclude the target character is not found. - return .not_found; - } - - // If we reach here, it means SIMD returned pos=0. - // Loop continues to the next iteration, processing the immediate next char. 
- } // End while true - - unreachable; - } - - pub inline fn next(noalias it: *const Iterator, noalias cursor: *Cursor) bool { - const pos: u32 = @as(u32, cursor.width) + cursor.i; - if (pos >= it.bytes.len) { - return false; - } - - const cp_len = wtf8ByteSequenceLength(it.bytes[pos]); - const error_char = comptime std.math.minInt(CodePointType); - - const codepoint = @as( - CodePointType, - switch (cp_len) { - 0 => return false, - 1 => it.bytes[pos], - else => decodeWTF8RuneTMultibyte(it.bytes[pos..].ptr[0..4], cp_len, CodePointType, error_char), - }, - ); - - cursor.* = Cursor{ - .i = pos, - .c = if (error_char != codepoint) - codepoint - else - unicode_replacement, - .width = if (codepoint != error_char) cp_len else 1, - }; - - return true; - } - - fn nextCodepointSlice(it: *Iterator) callconv(bun.callconv_inline) []const u8 { - const bytes = it.bytes; - const prev = it.i; - const next_ = prev + it.next_width; - if (bytes.len <= next_) return ""; - - const cp_len = utf8ByteSequenceLength(bytes[next_]); - it.next_width = cp_len; - it.i = @min(next_, bytes.len); - - const slice = bytes[prev..][0..cp_len]; - it.width = @as(u3_fast, @intCast(slice.len)); - return slice; - } - - pub fn needsUTF8Decoding(slice: string) bool { - var it = Iterator{ .bytes = slice, .i = 0 }; - - while (true) { - const part = it.nextCodepointSlice(); - @setRuntimeSafety(false); - switch (part.len) { - 0 => return false, - 1 => continue, - else => return true, - } - } - } - - pub fn scanUntilQuotedValueOrEOF(iter: *Iterator, comptime quote: CodePointType) usize { - while (iter.c > -1) { - if (!switch (iter.nextCodepoint()) { - quote => false, - '\\' => brk: { - if (iter.nextCodepoint() == quote) { - continue; - } - break :brk true; - }, - else => true, - }) { - return iter.i + 1; - } - } - - return iter.i; - } - - pub fn nextCodepoint(it: *Iterator) CodePointType { - const slice = it.nextCodepointSlice(); - - it.c = switch (slice.len) { - 0 => zeroValue, - 1 => @as(CodePointType, 
@intCast(slice[0])), - 2 => @as(CodePointType, @intCast(std.unicode.utf8Decode2(slice) catch unreachable)), - 3 => @as(CodePointType, @intCast(std.unicode.utf8Decode3(slice) catch unreachable)), - 4 => @as(CodePointType, @intCast(std.unicode.utf8Decode4(slice) catch unreachable)), - else => unreachable, - }; - - return it.c; - } - - /// Look ahead at the next n codepoints without advancing the iterator. - /// If fewer than n codepoints are available, then return the remainder of the string. - pub fn peek(it: *Iterator, n: usize) []const u8 { - const original_i = it.i; - defer it.i = original_i; - - var end_ix = original_i; - for (0..n) |_| { - const next_codepoint = it.nextCodepointSlice() orelse return it.bytes[original_i..]; - end_ix += next_codepoint.len; - } - - return it.bytes[original_i..end_ix]; - } - }; -} - -pub const CodepointIterator = NewCodePointIterator(CodePoint, -1); -pub const UnsignedCodepointIterator = NewCodePointIterator(u32, 0); - pub fn NewLengthSorter(comptime Type: type, comptime field: string) type { return struct { const LengthSorter = @This(); @@ -5096,35 +1964,6 @@ pub fn isIPV6Address(input: []const u8) bool { return bun.c_ares.ares_inet_pton(std.posix.AF.INET6, ip_addr_str.ptr, &sockaddr) > 0; } -pub fn cloneNormalizingSeparators( - allocator: std.mem.Allocator, - input: []const u8, -) ![]u8 { - // remove duplicate slashes in the file path - const base = withoutTrailingSlash(input); - var tokenized = std.mem.tokenizeScalar(u8, base, std.fs.path.sep); - var buf = try allocator.alloc(u8, base.len + 2); - if (comptime Environment.allow_assert) assert(base.len > 0); - if (base[0] == std.fs.path.sep) { - buf[0] = std.fs.path.sep; - } - var remain = buf[@as(usize, @intFromBool(base[0] == std.fs.path.sep))..]; - - while (tokenized.next()) |token| { - if (token.len == 0) continue; - bun.copy(u8, remain, token); - remain[token.len..][0] = std.fs.path.sep; - remain = remain[token.len + 1 ..]; - } - if ((remain.ptr - 1) != buf.ptr and 
(remain.ptr - 1)[0] != std.fs.path.sep) { - remain[0] = std.fs.path.sep; - remain = remain[1..]; - } - remain[0] = 0; - - return buf[0 .. @intFromPtr(remain.ptr) - @intFromPtr(buf.ptr)]; -} - pub fn leftHasAnyInRight(to_check: []const string, against: []const string) bool { for (to_check) |check| { for (against) |item| { @@ -5252,69 +2091,6 @@ pub fn concatIfNeeded( bun.unsafeAssert(remain.len == 0); } -/// This will simply ignore invalid UTF-8 and just do it -pub fn convertUTF8toUTF16InBuffer( - buf: []u16, - input: []const u8, -) []u16 { - // TODO(@paperclover): implement error handling here. - // for now this will cause invalid utf-8 to be ignored and become empty. - // this is lame because of https://github.com/oven-sh/bun/issues/8197 - // it will cause process.env.whatever to be len=0 instead of the data - // but it's better than failing the run entirely - // - // the reason i didn't implement the fallback is purely because our - // code in this file is too chaotic. it is left as a TODO - if (input.len == 0) return buf[0..0]; - const result = bun.simdutf.convert.utf8.to.utf16.le(input, buf); - return buf[0..result]; -} - -pub fn convertUTF8toUTF16InBufferZ( - buf: []u16, - input: []const u8, -) [:0]u16 { - // TODO: see convertUTF8toUTF16InBuffer - if (input.len == 0) { - buf[0] = 0; - return buf[0..0 :0]; - } - const result = bun.simdutf.convert.utf8.to.utf16.le(input, buf); - buf[result] = 0; - return buf[0..result :0]; -} - -pub fn convertUTF16toUTF8InBuffer( - buf: []u8, - input: []const u16, -) ![]const u8 { - // See above - if (input.len == 0) return &[_]u8{}; - const result = bun.simdutf.convert.utf16.to.utf8.le(input, buf); - // switch (result.status) { - // .success => return buf[0..result.count], - // // TODO(@paperclover): handle surrogate - // .surrogate => @panic("TODO: handle surrogate in convertUTF8toUTF16"), - // else => @panic("TODO: handle error in convertUTF16toUTF8InBuffer"), - // } - return buf[0..result]; -} - -pub fn charIsAnySlash(char: 
u8) callconv(bun.callconv_inline) bool { - return char == '/' or char == '\\'; -} - -pub fn startsWithWindowsDriveLetter(s: []const u8) callconv(bun.callconv_inline) bool { - return startsWithWindowsDriveLetterT(u8, s); -} - -pub fn startsWithWindowsDriveLetterT(comptime T: type, s: []const T) callconv(bun.callconv_inline) bool { - return s.len > 2 and s[1] == ':' and switch (s[0]) { - 'a'...'z', 'A'...'Z' => true, - else => false, - }; -} - pub fn mustEscapeYAMLString(contents: []const u8) bool { if (contents.len == 0) return true; @@ -5325,827 +2101,6 @@ pub fn mustEscapeYAMLString(contents: []const u8) bool { else => true, }; } -pub fn pathContainsNodeModulesFolder(path: []const u8) bool { - return strings.contains(path, comptime std.fs.path.sep_str ++ "node_modules" ++ std.fs.path.sep_str); -} - -pub fn isZeroWidthCodepointType(comptime T: type, cp: T) bool { - if (cp <= 0x1f) { - return true; - } - - if (cp >= 0x7f and cp <= 0x9f) { - // C1 control characters - return true; - } - - if (comptime @sizeOf(T) == 1) { - return false; - } - - if (cp >= 0x300 and cp <= 0x36f) { - // Combining Diacritical Marks - return true; - } - - if (cp >= 0x200b and cp <= 0x200f) { - // Modifying Invisible Characters - return true; - } - - if (cp >= 0x20d0 and cp <= 0x20ff) - // Combining Diacritical Marks for Symbols - return true; - - if (cp >= 0xfe00 and cp <= 0xfe0f) - // Variation Selectors - return true; - if (cp >= 0xfe20 and cp <= 0xfe2f) - // Combining Half Marks - return true; - - if (cp == 0xfeff) - // Zero Width No-Break Space (BOM, ZWNBSP) - return true; - - if (cp >= 0xe0100 and cp <= 0xe01ef) - // Variation Selectors - return true; - - return false; -} - -/// Official unicode reference: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt -/// Tag legend: -/// - `W` (wide) -> true -/// - `F` (full-width) -> true -/// - `H` (half-width) -> false -/// - `N` (neutral) -> false -/// - `Na` (narrow) -> false -/// - `A` (ambiguous) -> false? 
-/// -/// To regenerate the switch body list, run: -/// ```js -/// [...(await (await fetch("https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt")).text()).matchAll(/^([\dA-F]{4,})(?:\.\.([\dA-F]{4,}))?\s+;\s+(\w+)\s+#\s+(.*?)\s*$/gm)].flatMap(([,start, end, type, comment]) => ( -/// (['W', 'F'].includes(type)) ? [` ${(end ? `0x${start}...0x${end}` : `0x${start}`)}, // ${''.padStart(17 - start.length - (end ? end.length + 5 : 0))}[${type}] ${comment}`] : [] -/// )).join('\n') -/// ``` -pub fn isFullWidthCodepointType(comptime T: type, cp: T) bool { - if (!(cp >= 0x1100)) { - return false; - } - - return switch (cp) { - 0x1100...0x115F, // [W] Lo [96] HANGUL CHOSEONG KIYEOK..HANGUL CHOSEONG FILLER - 0x231A...0x231B, // [W] So [2] WATCH..HOURGLASS - 0x2329, // [W] Ps LEFT-POINTING ANGLE BRACKET - 0x232A, // [W] Pe RIGHT-POINTING ANGLE BRACKET - 0x23E9...0x23EC, // [W] So [4] BLACK RIGHT-POINTING DOUBLE TRIANGLE..BLACK DOWN-POINTING DOUBLE TRIANGLE - 0x23F0, // [W] So ALARM CLOCK - 0x23F3, // [W] So HOURGLASS WITH FLOWING SAND - 0x25FD...0x25FE, // [W] Sm [2] WHITE MEDIUM SMALL SQUARE..BLACK MEDIUM SMALL SQUARE - 0x2614...0x2615, // [W] So [2] UMBRELLA WITH RAIN DROPS..HOT BEVERAGE - 0x2648...0x2653, // [W] So [12] ARIES..PISCES - 0x267F, // [W] So WHEELCHAIR SYMBOL - 0x2693, // [W] So ANCHOR - 0x26A1, // [W] So HIGH VOLTAGE SIGN - 0x26AA...0x26AB, // [W] So [2] MEDIUM WHITE CIRCLE..MEDIUM BLACK CIRCLE - 0x26BD...0x26BE, // [W] So [2] SOCCER BALL..BASEBALL - 0x26C4...0x26C5, // [W] So [2] SNOWMAN WITHOUT SNOW..SUN BEHIND CLOUD - 0x26CE, // [W] So OPHIUCHUS - 0x26D4, // [W] So NO ENTRY - 0x26EA, // [W] So CHURCH - 0x26F2...0x26F3, // [W] So [2] FOUNTAIN..FLAG IN HOLE - 0x26F5, // [W] So SAILBOAT - 0x26FA, // [W] So TENT - 0x26FD, // [W] So FUEL PUMP - 0x2705, // [W] So WHITE HEAVY CHECK MARK - 0x270A...0x270B, // [W] So [2] RAISED FIST..RAISED HAND - 0x2728, // [W] So SPARKLES - 0x274C, // [W] So CROSS MARK - 0x274E, // [W] So NEGATIVE SQUARED CROSS MARK - 
0x2753...0x2755, // [W] So [3] BLACK QUESTION MARK ORNAMENT..WHITE EXCLAMATION MARK ORNAMENT - 0x2757, // [W] So HEAVY EXCLAMATION MARK SYMBOL - 0x2795...0x2797, // [W] So [3] HEAVY PLUS SIGN..HEAVY DIVISION SIGN - 0x27B0, // [W] So CURLY LOOP - 0x27BF, // [W] So DOUBLE CURLY LOOP - 0x2B1B...0x2B1C, // [W] So [2] BLACK LARGE SQUARE..WHITE LARGE SQUARE - 0x2B50, // [W] So WHITE MEDIUM STAR - 0x2B55, // [W] So HEAVY LARGE CIRCLE - 0x2E80...0x2E99, // [W] So [26] CJK RADICAL REPEAT..CJK RADICAL RAP - 0x2E9B...0x2EF3, // [W] So [89] CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE - 0x2F00...0x2FD5, // [W] So [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE - 0x2FF0...0x2FFF, // [W] So [16] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION - 0x3000, // [F] Zs IDEOGRAPHIC SPACE - 0x3001...0x3003, // [W] Po [3] IDEOGRAPHIC COMMA..DITTO MARK - 0x3004, // [W] So JAPANESE INDUSTRIAL STANDARD SYMBOL - 0x3005, // [W] Lm IDEOGRAPHIC ITERATION MARK - 0x3006, // [W] Lo IDEOGRAPHIC CLOSING MARK - 0x3007, // [W] Nl IDEOGRAPHIC NUMBER ZERO - 0x3008, // [W] Ps LEFT ANGLE BRACKET - 0x3009, // [W] Pe RIGHT ANGLE BRACKET - 0x300A, // [W] Ps LEFT DOUBLE ANGLE BRACKET - 0x300B, // [W] Pe RIGHT DOUBLE ANGLE BRACKET - 0x300C, // [W] Ps LEFT CORNER BRACKET - 0x300D, // [W] Pe RIGHT CORNER BRACKET - 0x300E, // [W] Ps LEFT WHITE CORNER BRACKET - 0x300F, // [W] Pe RIGHT WHITE CORNER BRACKET - 0x3010, // [W] Ps LEFT BLACK LENTICULAR BRACKET - 0x3011, // [W] Pe RIGHT BLACK LENTICULAR BRACKET - 0x3012...0x3013, // [W] So [2] POSTAL MARK..GETA MARK - 0x3014, // [W] Ps LEFT TORTOISE SHELL BRACKET - 0x3015, // [W] Pe RIGHT TORTOISE SHELL BRACKET - 0x3016, // [W] Ps LEFT WHITE LENTICULAR BRACKET - 0x3017, // [W] Pe RIGHT WHITE LENTICULAR BRACKET - 0x3018, // [W] Ps LEFT WHITE TORTOISE SHELL BRACKET - 0x3019, // [W] Pe RIGHT WHITE TORTOISE SHELL BRACKET - 0x301A, // [W] Ps LEFT WHITE SQUARE BRACKET - 0x301B, // [W] Pe RIGHT WHITE SQUARE BRACKET - 0x301C, // 
[W] Pd WAVE DASH - 0x301D, // [W] Ps REVERSED DOUBLE PRIME QUOTATION MARK - 0x301E...0x301F, // [W] Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK - 0x3020, // [W] So POSTAL MARK FACE - 0x3021...0x3029, // [W] Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE - 0x302A...0x302D, // [W] Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK - 0x302E...0x302F, // [W] Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK - 0x3030, // [W] Pd WAVY DASH - 0x3031...0x3035, // [W] Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF - 0x3036...0x3037, // [W] So [2] CIRCLED POSTAL MARK..IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL - 0x3038...0x303A, // [W] Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY - 0x303B, // [W] Lm VERTICAL IDEOGRAPHIC ITERATION MARK - 0x303C, // [W] Lo MASU MARK - 0x303D, // [W] Po PART ALTERNATION MARK - 0x303E, // [W] So IDEOGRAPHIC VARIATION INDICATOR - 0x3041...0x3096, // [W] Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE - 0x3099...0x309A, // [W] Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK - 0x309B...0x309C, // [W] Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK - 0x309D...0x309E, // [W] Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK - 0x309F, // [W] Lo HIRAGANA DIGRAPH YORI - 0x30A0, // [W] Pd KATAKANA-HIRAGANA DOUBLE HYPHEN - 0x30A1...0x30FA, // [W] Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO - 0x30FB, // [W] Po KATAKANA MIDDLE DOT - 0x30FC...0x30FE, // [W] Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK - 0x30FF, // [W] Lo KATAKANA DIGRAPH KOTO - 0x3105...0x312F, // [W] Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN - 0x3131...0x318E, // [W] Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE - 0x3190...0x3191, // [W] So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK - 
0x3192...0x3195, // [W] No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK - 0x3196...0x319F, // [W] So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK - 0x31A0...0x31BF, // [W] Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH - 0x31C0...0x31E3, // [W] So [36] CJK STROKE T..CJK STROKE Q - 0x31EF, // [W] So IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION - 0x31F0...0x31FF, // [W] Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO - 0x3200...0x321E, // [W] So [31] PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED KOREAN CHARACTER O HU - 0x3220...0x3229, // [W] No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN - 0x322A...0x3247, // [W] So [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO - 0x3250, // [W] So PARTNERSHIP SIGN - 0x3251...0x325F, // [W] No [15] CIRCLED NUMBER TWENTY ONE..CIRCLED NUMBER THIRTY FIVE - 0x3260...0x327F, // [W] So [32] CIRCLED HANGUL KIYEOK..KOREAN STANDARD SYMBOL - 0x3280...0x3289, // [W] No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN - 0x328A...0x32B0, // [W] So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT - 0x32B1...0x32BF, // [W] No [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY - 0x32C0...0x32FF, // [W] So [64] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..SQUARE ERA NAME REIWA - 0x3300...0x33FF, // [W] So [256] SQUARE APAATO..SQUARE GAL - 0x3400...0x4DBF, // [W] Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF - 0x4E00...0x9FFF, // [W] Lo [20992] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFF - 0xA000...0xA014, // [W] Lo [21] YI SYLLABLE IT..YI SYLLABLE E - 0xA015, // [W] Lm YI SYLLABLE WU - 0xA016...0xA48C, // [W] Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR - 0xA490...0xA4C6, // [W] So [55] YI RADICAL QOT..YI RADICAL KE - 0xA960...0xA97C, // [W] Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH - 0xAC00...0xD7A3, // [W] Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH - 0xF900...0xFA6D, // [W] Lo [366] CJK 
COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D - 0xFA6E...0xFA6F, // [W] Cn [2] .. - 0xFA70...0xFAD9, // [W] Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 - 0xFADA...0xFAFF, // [W] Cn [38] .. - 0xFE10...0xFE16, // [W] Po [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK - 0xFE17, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET - 0xFE18, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET - 0xFE19, // [W] Po PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS - 0xFE30, // [W] Po PRESENTATION FORM FOR VERTICAL TWO DOT LEADER - 0xFE31...0xFE32, // [W] Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH - 0xFE33...0xFE34, // [W] Pc [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE - 0xFE35, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS - 0xFE36, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS - 0xFE37, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET - 0xFE38, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET - 0xFE39, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET - 0xFE3A, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET - 0xFE3B, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET - 0xFE3C, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET - 0xFE3D, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET - 0xFE3E, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET - 0xFE3F, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET - 0xFE40, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET - 0xFE41, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET - 0xFE42, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET - 0xFE43, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER 
BRACKET - 0xFE44, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET - 0xFE45...0xFE46, // [W] Po [2] SESAME DOT..WHITE SESAME DOT - 0xFE47, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET - 0xFE48, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET - 0xFE49...0xFE4C, // [W] Po [4] DASHED OVERLINE..DOUBLE WAVY OVERLINE - 0xFE4D...0xFE4F, // [W] Pc [3] DASHED LOW LINE..WAVY LOW LINE - 0xFE50...0xFE52, // [W] Po [3] SMALL COMMA..SMALL FULL STOP - 0xFE54...0xFE57, // [W] Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK - 0xFE58, // [W] Pd SMALL EM DASH - 0xFE59, // [W] Ps SMALL LEFT PARENTHESIS - 0xFE5A, // [W] Pe SMALL RIGHT PARENTHESIS - 0xFE5B, // [W] Ps SMALL LEFT CURLY BRACKET - 0xFE5C, // [W] Pe SMALL RIGHT CURLY BRACKET - 0xFE5D, // [W] Ps SMALL LEFT TORTOISE SHELL BRACKET - 0xFE5E, // [W] Pe SMALL RIGHT TORTOISE SHELL BRACKET - 0xFE5F...0xFE61, // [W] Po [3] SMALL NUMBER SIGN..SMALL ASTERISK - 0xFE62, // [W] Sm SMALL PLUS SIGN - 0xFE63, // [W] Pd SMALL HYPHEN-MINUS - 0xFE64...0xFE66, // [W] Sm [3] SMALL LESS-THAN SIGN..SMALL EQUALS SIGN - 0xFE68, // [W] Po SMALL REVERSE SOLIDUS - 0xFE69, // [W] Sc SMALL DOLLAR SIGN - 0xFE6A...0xFE6B, // [W] Po [2] SMALL PERCENT SIGN..SMALL COMMERCIAL AT - 0xFF01...0xFF03, // [F] Po [3] FULLWIDTH EXCLAMATION MARK..FULLWIDTH NUMBER SIGN - 0xFF04, // [F] Sc FULLWIDTH DOLLAR SIGN - 0xFF05...0xFF07, // [F] Po [3] FULLWIDTH PERCENT SIGN..FULLWIDTH APOSTROPHE - 0xFF08, // [F] Ps FULLWIDTH LEFT PARENTHESIS - 0xFF09, // [F] Pe FULLWIDTH RIGHT PARENTHESIS - 0xFF0A, // [F] Po FULLWIDTH ASTERISK - 0xFF0B, // [F] Sm FULLWIDTH PLUS SIGN - 0xFF0C, // [F] Po FULLWIDTH COMMA - 0xFF0D, // [F] Pd FULLWIDTH HYPHEN-MINUS - 0xFF0E...0xFF0F, // [F] Po [2] FULLWIDTH FULL STOP..FULLWIDTH SOLIDUS - 0xFF10...0xFF19, // [F] Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE - 0xFF1A...0xFF1B, // [F] Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON - 0xFF1C...0xFF1E, // [F] Sm [3] FULLWIDTH LESS-THAN SIGN..FULLWIDTH 
GREATER-THAN SIGN - 0xFF1F...0xFF20, // [F] Po [2] FULLWIDTH QUESTION MARK..FULLWIDTH COMMERCIAL AT - 0xFF21...0xFF3A, // [F] Lu [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z - 0xFF3B, // [F] Ps FULLWIDTH LEFT SQUARE BRACKET - 0xFF3C, // [F] Po FULLWIDTH REVERSE SOLIDUS - 0xFF3D, // [F] Pe FULLWIDTH RIGHT SQUARE BRACKET - 0xFF3E, // [F] Sk FULLWIDTH CIRCUMFLEX ACCENT - 0xFF3F, // [F] Pc FULLWIDTH LOW LINE - 0xFF40, // [F] Sk FULLWIDTH GRAVE ACCENT - 0xFF41...0xFF5A, // [F] Ll [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z - 0xFF5B, // [F] Ps FULLWIDTH LEFT CURLY BRACKET - 0xFF5C, // [F] Sm FULLWIDTH VERTICAL LINE - 0xFF5D, // [F] Pe FULLWIDTH RIGHT CURLY BRACKET - 0xFF5E, // [F] Sm FULLWIDTH TILDE - 0xFF5F, // [F] Ps FULLWIDTH LEFT WHITE PARENTHESIS - 0xFF60, // [F] Pe FULLWIDTH RIGHT WHITE PARENTHESIS - 0xFFE0...0xFFE1, // [F] Sc [2] FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN - 0xFFE2, // [F] Sm FULLWIDTH NOT SIGN - 0xFFE3, // [F] Sk FULLWIDTH MACRON - 0xFFE4, // [F] So FULLWIDTH BROKEN BAR - 0xFFE5...0xFFE6, // [F] Sc [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN - 0x16FE0...0x16FE1, // [W] Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK - 0x16FE2, // [W] Po OLD CHINESE HOOK MARK - 0x16FE3, // [W] Lm OLD CHINESE ITERATION MARK - 0x16FE4, // [W] Mn KHITAN SMALL SCRIPT FILLER - 0x16FF0...0x16FF1, // [W] Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY - 0x17000...0x187F7, // [W] Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 - 0x18800...0x18AFF, // [W] Lo [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768 - 0x18B00...0x18CD5, // [W] Lo [470] KHITAN SMALL SCRIPT CHARACTER-18B00..KHITAN SMALL SCRIPT CHARACTER-18CD5 - 0x18D00...0x18D08, // [W] Lo [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08 - 0x1AFF0...0x1AFF3, // [W] Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 - 0x1AFF5...0x1AFFB, // [W] Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN 
NASALIZED TONE-5 - 0x1AFFD...0x1AFFE, // [W] Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 - 0x1B000...0x1B0FF, // [W] Lo [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2 - 0x1B100...0x1B122, // [W] Lo [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU - 0x1B132, // [W] Lo HIRAGANA LETTER SMALL KO - 0x1B150...0x1B152, // [W] Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO - 0x1B155, // [W] Lo KATAKANA LETTER SMALL KO - 0x1B164...0x1B167, // [W] Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N - 0x1B170...0x1B2FB, // [W] Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB - 0x1F004, // [W] So MAHJONG TILE RED DRAGON - 0x1F0CF, // [W] So PLAYING CARD BLACK JOKER - 0x1F18E, // [W] So NEGATIVE SQUARED AB - 0x1F191...0x1F19A, // [W] So [10] SQUARED CL..SQUARED VS - 0x1F200...0x1F202, // [W] So [3] SQUARE HIRAGANA HOKA..SQUARED KATAKANA SA - 0x1F210...0x1F23B, // [W] So [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D - 0x1F240...0x1F248, // [W] So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557 - 0x1F250...0x1F251, // [W] So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT - 0x1F260...0x1F265, // [W] So [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI - 0x1F300...0x1F320, // [W] So [33] CYCLONE..SHOOTING STAR - 0x1F32D...0x1F335, // [W] So [9] HOT DOG..CACTUS - 0x1F337...0x1F37C, // [W] So [70] TULIP..BABY BOTTLE - 0x1F37E...0x1F393, // [W] So [22] BOTTLE WITH POPPING CORK..GRADUATION CAP - 0x1F3A0...0x1F3CA, // [W] So [43] CAROUSEL HORSE..SWIMMER - 0x1F3CF...0x1F3D3, // [W] So [5] CRICKET BAT AND BALL..TABLE TENNIS PADDLE AND BALL - 0x1F3E0...0x1F3F0, // [W] So [17] HOUSE BUILDING..EUROPEAN CASTLE - 0x1F3F4, // [W] So WAVING BLACK FLAG - 0x1F3F8...0x1F3FA, // [W] So [3] BADMINTON RACQUET AND SHUTTLECOCK..AMPHORA - 0x1F3FB...0x1F3FF, // [W] Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER 
FITZPATRICK TYPE-6 - 0x1F400...0x1F43E, // [W] So [63] RAT..PAW PRINTS - 0x1F440, // [W] So EYES - 0x1F442...0x1F4FC, // [W] So [187] EAR..VIDEOCASSETTE - 0x1F4FF...0x1F53D, // [W] So [63] PRAYER BEADS..DOWN-POINTING SMALL RED TRIANGLE - 0x1F54B...0x1F54E, // [W] So [4] KAABA..MENORAH WITH NINE BRANCHES - 0x1F550...0x1F567, // [W] So [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY - 0x1F57A, // [W] So MAN DANCING - 0x1F595...0x1F596, // [W] So [2] REVERSED HAND WITH MIDDLE FINGER EXTENDED..RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS - 0x1F5A4, // [W] So BLACK HEART - 0x1F5FB...0x1F5FF, // [W] So [5] MOUNT FUJI..MOYAI - 0x1F600...0x1F64F, // [W] So [80] GRINNING FACE..PERSON WITH FOLDED HANDS - 0x1F680...0x1F6C5, // [W] So [70] ROCKET..LEFT LUGGAGE - 0x1F6CC, // [W] So SLEEPING ACCOMMODATION - 0x1F6D0...0x1F6D2, // [W] So [3] PLACE OF WORSHIP..SHOPPING TROLLEY - 0x1F6D5...0x1F6D7, // [W] So [3] HINDU TEMPLE..ELEVATOR - 0x1F6DC...0x1F6DF, // [W] So [4] WIRELESS..RING BUOY - 0x1F6EB...0x1F6EC, // [W] So [2] AIRPLANE DEPARTURE..AIRPLANE ARRIVING - 0x1F6F4...0x1F6FC, // [W] So [9] SCOOTER..ROLLER SKATE - 0x1F7E0...0x1F7EB, // [W] So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE - 0x1F7F0, // [W] So HEAVY EQUALS SIGN - 0x1F90C...0x1F93A, // [W] So [47] PINCHED FINGERS..FENCER - 0x1F93C...0x1F945, // [W] So [10] WRESTLERS..GOAL NET - 0x1F947...0x1F9FF, // [W] So [185] FIRST PLACE MEDAL..NAZAR AMULET - 0x1FA70...0x1FA7C, // [W] So [13] BALLET SHOES..CRUTCH - 0x1FA80...0x1FA88, // [W] So [9] YO-YO..FLUTE - 0x1FA90...0x1FABD, // [W] So [46] RINGED PLANET..WING - 0x1FABF...0x1FAC5, // [W] So [7] GOOSE..PERSON WITH CROWN - 0x1FACE...0x1FADB, // [W] So [14] MOOSE..PEA POD - 0x1FAE0...0x1FAE8, // [W] So [9] MELTING FACE..SHAKING FACE - 0x1FAF0...0x1FAF8, // [W] So [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND - 0x20000...0x2A6DF, // [W] Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF - 0x2A6E0...0x2A6FF, // [W] Cn [32] .. 
- 0x2A700...0x2B739, // [W] Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739 - 0x2B73A...0x2B73F, // [W] Cn [6] .. - 0x2B740...0x2B81D, // [W] Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D - 0x2B81E...0x2B81F, // [W] Cn [2] .. - 0x2B820...0x2CEA1, // [W] Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1 - 0x2CEA2...0x2CEAF, // [W] Cn [14] .. - 0x2CEB0...0x2EBE0, // [W] Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 - 0x2EBE1...0x2EBEF, // [W] Cn [15] .. - 0x2EBF0...0x2EE5D, // [W] Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D - 0x2EE5E...0x2F7FF, // [W] Cn [2466] .. - 0x2F800...0x2FA1D, // [W] Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D - 0x2FA1E...0x2FA1F, // [W] Cn [2] .. - 0x2FA20...0x2FFFD, // [W] Cn [1502] .. - 0x30000...0x3134A, // [W] Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A - 0x3134B...0x3134F, // [W] Cn [5] .. - 0x31350...0x323AF, // [W] Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF - 0x323B0...0x3FFFD, // [W] Cn [56398] .. 
- => true, - else => false, - }; -} - -pub fn isAmgiguousCodepointType(comptime T: type, cp: T) bool { - return switch (cp) { - 0xA1, - 0xA4, - 0xA7, - 0xA8, - 0xAA, - 0xAD, - 0xAE, - 0xB0...0xB4, - 0xB6...0xBA, - 0xBC...0xBF, - 0xC6, - 0xD0, - 0xD7, - 0xD8, - 0xDE...0xE1, - 0xE6, - 0xE8...0xEA, - 0xEC, - 0xED, - 0xF0, - 0xF2, - 0xF3, - 0xF7...0xFA, - 0xFC, - 0xFE, - 0x101, - 0x111, - 0x113, - 0x11B, - 0x126, - 0x127, - 0x12B, - 0x131...0x133, - 0x138, - 0x13F...0x142, - 0x144, - 0x148...0x14B, - 0x14D, - 0x152, - 0x153, - 0x166, - 0x167, - 0x16B, - 0x1CE, - 0x1D0, - 0x1D2, - 0x1D4, - 0x1D6, - 0x1D8, - 0x1DA, - 0x1DC, - 0x251, - 0x261, - 0x2C4, - 0x2C7, - 0x2C9...0x2CB, - 0x2CD, - 0x2D0, - 0x2D8...0x2DB, - 0x2DD, - 0x2DF, - 0x300...0x36F, - 0x391...0x3A1, - 0x3A3...0x3A9, - 0x3B1...0x3C1, - 0x3C3...0x3C9, - 0x401, - 0x410...0x44F, - 0x451, - 0x2010, - 0x2013...0x2016, - 0x2018, - 0x2019, - 0x201C, - 0x201D, - 0x2020...0x2022, - 0x2024...0x2027, - 0x2030, - 0x2032, - 0x2033, - 0x2035, - 0x203B, - 0x203E, - 0x2074, - 0x207F, - 0x2081...0x2084, - 0x20AC, - 0x2103, - 0x2105, - 0x2109, - 0x2113, - 0x2116, - 0x2121, - 0x2122, - 0x2126, - 0x212B, - 0x2153, - 0x2154, - 0x215B...0x215E, - 0x2160...0x216B, - 0x2170...0x2179, - 0x2189, - 0x2190...0x2199, - 0x21B8, - 0x21B9, - 0x21D2, - 0x21D4, - 0x21E7, - 0x2200, - 0x2202, - 0x2203, - 0x2207, - 0x2208, - 0x220B, - 0x220F, - 0x2211, - 0x2215, - 0x221A, - 0x221D...0x2220, - 0x2223, - 0x2225, - 0x2227...0x222C, - 0x222E, - 0x2234...0x2237, - 0x223C, - 0x223D, - 0x2248, - 0x224C, - 0x2252, - 0x2260, - 0x2261, - 0x2264...0x2267, - 0x226A, - 0x226B, - 0x226E, - 0x226F, - 0x2282, - 0x2283, - 0x2286, - 0x2287, - 0x2295, - 0x2299, - 0x22A5, - 0x22BF, - 0x2312, - 0x2460...0x24E9, - 0x24EB...0x254B, - 0x2550...0x2573, - 0x2580...0x258F, - 0x2592...0x2595, - 0x25A0, - 0x25A1, - 0x25A3...0x25A9, - 0x25B2, - 0x25B3, - 0x25B6, - 0x25B7, - 0x25BC, - 0x25BD, - 0x25C0, - 0x25C1, - 0x25C6...0x25C8, - 0x25CB, - 0x25CE...0x25D1, - 
0x25E2...0x25E5, - 0x25EF, - 0x2605, - 0x2606, - 0x2609, - 0x260E, - 0x260F, - 0x261C, - 0x261E, - 0x2640, - 0x2642, - 0x2660, - 0x2661, - 0x2663...0x2665, - 0x2667...0x266A, - 0x266C, - 0x266D, - 0x266F, - 0x269E, - 0x269F, - 0x26BF, - 0x26C6...0x26CD, - 0x26CF...0x26D3, - 0x26D5...0x26E1, - 0x26E3, - 0x26E8, - 0x26E9, - 0x26EB...0x26F1, - 0x26F4, - 0x26F6...0x26F9, - 0x26FB, - 0x26FC, - 0x26FE, - 0x26FF, - 0x273D, - 0x2776...0x277F, - 0x2B56...0x2B59, - 0x3248...0x324F, - 0xE000...0xF8FF, - 0xFE00...0xFE0F, - 0xFFFD, - 0x1F100...0x1F10A, - 0x1F110...0x1F12D, - 0x1F130...0x1F169, - 0x1F170...0x1F18D, - 0x1F18F, - 0x1F190, - 0x1F19B...0x1F1AC, - 0xE0100...0xE01EF, - 0xF0000...0xFFFFD, - 0x100000...0x10FFFD, - => true, - else => false, - }; -} - -pub fn visibleCodepointWidth(cp: u32, ambiguousAsWide: bool) u3_fast { - return visibleCodepointWidthType(u32, cp, ambiguousAsWide); -} - -pub fn visibleCodepointWidthMaybeEmoji(cp: u32, maybe_emoji: bool, ambiguousAsWide: bool) u3_fast { - // UCHAR_EMOJI=57, - if (maybe_emoji and icu_hasBinaryProperty(cp, 57)) { - return 2; - } - return visibleCodepointWidth(cp, ambiguousAsWide); -} - -pub fn visibleCodepointWidthType(comptime T: type, cp: T, ambiguousAsWide: bool) u3_fast { - if (isZeroWidthCodepointType(T, cp)) { - return 0; - } - - if (isFullWidthCodepointType(T, cp)) { - return 2; - } - if (ambiguousAsWide and isAmgiguousCodepointType(T, cp)) { - return 2; - } - - return 1; -} - -pub const visible = struct { - // Ref: https://cs.stanford.edu/people/miles/iso8859.html - fn visibleLatin1Width(input_: []const u8) usize { - var length: usize = 0; - var input = input_; - const input_end_ptr = input.ptr + input.len - (input.len % 16); - var input_ptr = input.ptr; - while (input_ptr != input_end_ptr) { - const input_chunk: [16]u8 = input_ptr[0..16].*; - const sums: @Vector(16, u8) = [16]u8{ - visibleLatin1WidthScalar(input_chunk[0]), - visibleLatin1WidthScalar(input_chunk[1]), - visibleLatin1WidthScalar(input_chunk[2]), - 
visibleLatin1WidthScalar(input_chunk[3]), - visibleLatin1WidthScalar(input_chunk[4]), - visibleLatin1WidthScalar(input_chunk[5]), - visibleLatin1WidthScalar(input_chunk[6]), - visibleLatin1WidthScalar(input_chunk[7]), - visibleLatin1WidthScalar(input_chunk[8]), - visibleLatin1WidthScalar(input_chunk[9]), - visibleLatin1WidthScalar(input_chunk[10]), - visibleLatin1WidthScalar(input_chunk[11]), - visibleLatin1WidthScalar(input_chunk[12]), - visibleLatin1WidthScalar(input_chunk[13]), - visibleLatin1WidthScalar(input_chunk[14]), - visibleLatin1WidthScalar(input_chunk[15]), - }; - length += @reduce(.Add, sums); - input_ptr += 16; - } - input.len %= 16; - input.ptr = input_ptr; - - for (input) |byte| length += visibleLatin1WidthScalar(byte); - return length; - } - - fn visibleLatin1WidthScalar(c: u8) u1 { - return if ((c >= 127 and c <= 159) or c < 32) 0 else 1; - } - - fn visibleLatin1WidthExcludeANSIColors(input_: anytype) usize { - var length: usize = 0; - var input = input_; - - const ElementType = std.meta.Child(@TypeOf(input_)); - const indexFn = if (comptime ElementType == u8) strings.indexOfCharUsize else strings.indexOfChar16Usize; - - while (indexFn(input, '\x1b')) |i| { - length += visibleLatin1Width(input[0..i]); - input = input[i..]; - - if (input.len < 3) return length; - - if (input[1] == '[') { - const end = indexFn(input[2..], 'm') orelse return length; - input = input[end + 3 ..]; - } else { - input = input[1..]; - } - } - - length += visibleLatin1Width(input); - - return length; - } - - fn visibleUTF8WidthFn(input: []const u8, comptime asciiFn: anytype) usize { - var bytes = input; - var len: usize = 0; - while (bun.strings.firstNonASCII(bytes)) |i| { - len += asciiFn(bytes[0..i]); - const this_chunk = bytes[i..]; - const byte = this_chunk[0]; - - const skip = bun.strings.wtf8ByteSequenceLengthWithInvalid(byte); - const cp_bytes: [4]u8 = switch (@min(@as(usize, skip), this_chunk.len)) { - inline 1, 2, 3, 4 => |cp_len| .{ - byte, - if (comptime cp_len > 
1) this_chunk[1] else 0, - if (comptime cp_len > 2) this_chunk[2] else 0, - if (comptime cp_len > 3) this_chunk[3] else 0, - }, - else => unreachable, - }; - - const cp = decodeWTF8RuneTMultibyte(&cp_bytes, skip, u32, unicode_replacement); - len += visibleCodepointWidth(cp, false); - - bytes = bytes[@min(i + skip, bytes.len)..]; - } - - len += asciiFn(bytes); - - return len; - } - - fn visibleUTF16WidthFn(input_: []const u16, exclude_ansi_colors: bool, ambiguousAsWide: bool) usize { - var input = input_; - var len: usize = 0; - var prev: ?u21 = 0; - var break_state = grapheme.BreakState{}; - var break_start: u21 = 0; - var saw_1b = false; - var saw_bracket = false; - var stretch_len: usize = 0; - - while (true) { - { - const idx = firstNonASCII16([]const u16, input) orelse input.len; - for (0..idx) |j| { - const cp = input[j]; - defer prev = cp; - - if (saw_bracket) { - if (cp == 'm') { - saw_1b = false; - saw_bracket = false; - stretch_len = 0; - continue; - } - stretch_len += visibleCodepointWidth(cp, ambiguousAsWide); - continue; - } - if (saw_1b) { - if (cp == '[') { - saw_bracket = true; - stretch_len = 0; - continue; - } - len += visibleCodepointWidth(cp, ambiguousAsWide); - continue; - } - if (!exclude_ansi_colors or cp != 0x1b) { - if (prev) |prev_| { - const should_break = grapheme.graphemeBreak(prev_, cp, &break_state); - if (should_break) { - len += visibleCodepointWidthMaybeEmoji(break_start, cp == 0xFE0F, ambiguousAsWide); - break_start = cp; - } else { - // - } - } else { - len += visibleCodepointWidth(cp, ambiguousAsWide); - break_start = cp; - } - continue; - } - saw_1b = true; - continue; - } - len += stretch_len; - input = input[idx..]; - } - if (input.len == 0) break; - const replacement = utf16CodepointWithFFFD([]const u16, input); - defer input = input[replacement.len..]; - if (replacement.fail) continue; - const cp: u21 = @intCast(replacement.code_point); - defer prev = cp; - - if (prev) |prev_| { - const should_break = 
grapheme.graphemeBreak(prev_, cp, &break_state); - if (should_break) { - len += visibleCodepointWidthMaybeEmoji(break_start, cp == 0xFE0F, ambiguousAsWide); - break_start = cp; - } - } else { - len += visibleCodepointWidth(cp, ambiguousAsWide); - break_start = cp; - } - } - if (break_start > 0) { - len += visibleCodepointWidthMaybeEmoji(break_start, (prev orelse 0) == 0xFE0F, ambiguousAsWide); - } - return len; - } - - fn visibleLatin1WidthFn(input: []const u8) usize { - return visibleLatin1Width(input); - } - - pub const width = struct { - pub fn latin1(input: []const u8) usize { - return visibleLatin1Width(input); - } - - pub fn utf8(input: []const u8) usize { - return visibleUTF8WidthFn(input, visibleLatin1Width); - } - - pub fn utf16(input: []const u16, ambiguousAsWide: bool) usize { - return visibleUTF16WidthFn(input, false, ambiguousAsWide); - } - - pub const exclude_ansi_colors = struct { - pub fn latin1(input: []const u8) usize { - return visibleLatin1WidthExcludeANSIColors(input); - } - - pub fn utf8(input: []const u8) usize { - return visibleUTF8WidthFn(input, visibleLatin1WidthExcludeANSIColors); - } - - pub fn utf16(input: []const u16, ambiguousAsWide: bool) usize { - return visibleUTF16WidthFn(input, true, ambiguousAsWide); - } - }; - }; -}; pub const QuoteEscapeFormatFlags = struct { quote_char: u8, @@ -6209,9 +2164,6 @@ pub fn withoutPrefixIfPossibleComptime(input: string, comptime prefix: string) ? 
return null; } -// extern "C" bool icu_hasBinaryProperty(UChar32 cp, unsigned int prop) -extern fn icu_hasBinaryProperty(c: u32, which: c_uint) bool; - const assert = bun.assert; /// Returns the first byte of the string and the rest of the string excluding the first byte @@ -6267,3 +2219,139 @@ pub fn percentEncodeWrite( // Write the rest of the string try writer.appendSlice(remaining); } + +pub const CodepointIterator = unicode.CodepointIterator; +pub const NewCodePointIterator = unicode.NewCodePointIterator; +pub const UnsignedCodepointIterator = unicode.UnsignedCodepointIterator; +pub const EncodeIntoResult = unicode.EncodeIntoResult; +pub const BOM = unicode.BOM; +pub const allocateLatin1IntoUTF8 = unicode.allocateLatin1IntoUTF8; +pub const allocateLatin1IntoUTF8WithList = unicode.allocateLatin1IntoUTF8WithList; +pub const appendUTF8MachineWordToUTF16MachineWord = unicode.appendUTF8MachineWordToUTF16MachineWord; +pub const codepointSize = unicode.codepointSize; +pub const containsNonBmpCodePoint = unicode.containsNonBmpCodePoint; +pub const containsNonBmpCodePointOrIsInvalidIdentifier = unicode.containsNonBmpCodePointOrIsInvalidIdentifier; +pub const convertUTF16ToUTF8 = unicode.convertUTF16ToUTF8; +pub const convertUTF16ToUTF8Append = unicode.convertUTF16ToUTF8Append; +pub const convertUTF16ToUTF8WithoutInvalidSurrogatePairs = unicode.convertUTF16ToUTF8WithoutInvalidSurrogatePairs; +pub const convertUTF16toUTF8InBuffer = unicode.convertUTF16toUTF8InBuffer; +pub const convertUTF8BytesIntoUTF16 = unicode.convertUTF8BytesIntoUTF16; +pub const convertUTF8BytesIntoUTF16WithLength = unicode.convertUTF8BytesIntoUTF16WithLength; +pub const convertUTF8toUTF16InBuffer = unicode.convertUTF8toUTF16InBuffer; +pub const convertUTF8toUTF16InBufferZ = unicode.convertUTF8toUTF16InBufferZ; +pub const copyLatin1IntoASCII = unicode.copyLatin1IntoASCII; +pub const copyLatin1IntoUTF16 = unicode.copyLatin1IntoUTF16; +pub const copyLatin1IntoUTF8 = unicode.copyLatin1IntoUTF8; +pub 
const copyLatin1IntoUTF8StopOnNonASCII = unicode.copyLatin1IntoUTF8StopOnNonASCII; +pub const copyU16IntoU8 = unicode.copyU16IntoU8; +pub const copyU8IntoU16 = unicode.copyU8IntoU16; +pub const copyU8IntoU16WithAlignment = unicode.copyU8IntoU16WithAlignment; +pub const copyUTF16IntoUTF8 = unicode.copyUTF16IntoUTF8; +pub const copyUTF16IntoUTF8WithBuffer = unicode.copyUTF16IntoUTF8WithBuffer; +pub const decodeCheck = unicode.decodeCheck; +pub const decodeWTF8RuneT = unicode.decodeWTF8RuneT; +pub const decodeWTF8RuneTMultibyte = unicode.decodeWTF8RuneTMultibyte; +pub const elementLengthLatin1IntoUTF16 = unicode.elementLengthLatin1IntoUTF16; +pub const elementLengthLatin1IntoUTF8 = unicode.elementLengthLatin1IntoUTF8; +pub const elementLengthUTF16IntoUTF8 = unicode.elementLengthUTF16IntoUTF8; +pub const elementLengthUTF8IntoUTF16 = unicode.elementLengthUTF8IntoUTF16; +pub const encodeUTF8Comptime = unicode.encodeUTF8Comptime; +pub const encodeWTF8Rune = unicode.encodeWTF8Rune; +pub const encodeWTF8RuneT = unicode.encodeWTF8RuneT; +pub const eqlUtf16 = unicode.eqlUtf16; +pub const isAllASCII = unicode.isAllASCII; +pub const isValidUTF8 = unicode.isValidUTF8; +pub const isValidUTF8WithoutSIMD = unicode.isValidUTF8WithoutSIMD; +pub const latin1ToCodepointAssumeNotASCII = unicode.latin1ToCodepointAssumeNotASCII; +pub const latin1ToCodepointBytesAssumeNotASCII = unicode.latin1ToCodepointBytesAssumeNotASCII; +pub const latin1ToCodepointBytesAssumeNotASCII16 = unicode.latin1ToCodepointBytesAssumeNotASCII16; +pub const literal = unicode.literal; +pub const nonASCIISequenceLength = unicode.nonASCIISequenceLength; +pub const replaceLatin1WithUTF8 = unicode.replaceLatin1WithUTF8; +pub const toUTF16Alloc = unicode.toUTF16Alloc; +pub const toUTF16AllocForReal = unicode.toUTF16AllocForReal; +pub const toUTF16AllocMaybeBuffered = unicode.toUTF16AllocMaybeBuffered; +pub const toUTF16Literal = unicode.toUTF16Literal; +pub const toUTF8Alloc = unicode.toUTF8Alloc; +pub const 
toUTF8AllocWithType = unicode.toUTF8AllocWithType; +pub const toUTF8AllocWithTypeWithoutInvalidSurrogatePairs = unicode.toUTF8AllocWithTypeWithoutInvalidSurrogatePairs; +pub const toUTF8AllocZ = unicode.toUTF8AllocZ; +pub const toUTF8AppendToList = unicode.toUTF8AppendToList; +pub const toUTF8FromLatin1 = unicode.toUTF8FromLatin1; +pub const toUTF8FromLatin1Z = unicode.toUTF8FromLatin1Z; +pub const toUTF8ListWithType = unicode.toUTF8ListWithType; +pub const toUTF8ListWithTypeBun = unicode.toUTF8ListWithTypeBun; +pub const u16GetSupplementary = unicode.u16GetSupplementary; +pub const u16IsLead = unicode.u16IsLead; +pub const u16IsTrail = unicode.u16IsTrail; +pub const u16Lead = unicode.u16Lead; +pub const u16Trail = unicode.u16Trail; +pub const utf16Codepoint = unicode.utf16Codepoint; +pub const utf16CodepointWithFFFD = unicode.utf16CodepointWithFFFD; +pub const utf16EqlString = unicode.utf16EqlString; +pub const utf8ByteSequenceLength = unicode.utf8ByteSequenceLength; +pub const utf8ByteSequenceLengthUnsafe = unicode.utf8ByteSequenceLengthUnsafe; +pub const w = unicode.w; +pub const withoutUTF8BOM = unicode.withoutUTF8BOM; +pub const wtf8ByteSequenceLength = unicode.wtf8ByteSequenceLength; +pub const wtf8ByteSequenceLengthWithInvalid = unicode.wtf8ByteSequenceLengthWithInvalid; +pub const wtf8Sequence = unicode.wtf8Sequence; +const unicode = @import("./string/unicode.zig"); + +const _visible = @import("./string/visible.zig"); +pub const isAmgiguousCodepointType = _visible.isAmgiguousCodepointType; +pub const isFullWidthCodepointType = _visible.isFullWidthCodepointType; +pub const isZeroWidthCodepointType = _visible.isZeroWidthCodepointType; +pub const visible = _visible.visible; +pub const visibleCodepointWidth = _visible.visibleCodepointWidth; +pub const visibleCodepointWidthMaybeEmoji = _visible.visibleCodepointWidthMaybeEmoji; +pub const visibleCodepointWidthType = _visible.visibleCodepointWidthType; + +const _escapeHTML = @import("./string/escapeHTML.zig"); 
+pub const escapeHTMLForLatin1Input = _escapeHTML.escapeHTMLForLatin1Input; +pub const escapeHTMLForUTF16Input = _escapeHTML.escapeHTMLForUTF16Input; + +const _paths = @import("./string/paths.zig"); +pub const addNTPathPrefix = _paths.addNTPathPrefix; +pub const addNTPathPrefixIfNeeded = _paths.addNTPathPrefixIfNeeded; +pub const assertIsValidWindowsPath = _paths.assertIsValidWindowsPath; +pub const charIsAnySlash = _paths.charIsAnySlash; +pub const cloneNormalizingSeparators = _paths.cloneNormalizingSeparators; +pub const fromWPath = _paths.fromWPath; +pub const isWindowsAbsolutePathMissingDriveLetter = _paths.isWindowsAbsolutePathMissingDriveLetter; +pub const normalizeSlashesOnly = _paths.normalizeSlashesOnly; +pub const normalizeSlashesOnlyT = _paths.normalizeSlashesOnlyT; +pub const pathContainsNodeModulesFolder = _paths.pathContainsNodeModulesFolder; +pub const removeLeadingDotSlash = _paths.removeLeadingDotSlash; +pub const startsWithWindowsDriveLetter = _paths.startsWithWindowsDriveLetter; +pub const startsWithWindowsDriveLetterT = _paths.startsWithWindowsDriveLetterT; +pub const toExtendedPathNormalized = _paths.toExtendedPathNormalized; +pub const toKernel32Path = _paths.toKernel32Path; +pub const toNTMaxPath = _paths.toNTMaxPath; +pub const toNTPath = _paths.toNTPath; +pub const toNTPath16 = _paths.toNTPath16; +pub const toPath = _paths.toPath; +pub const toPathMaybeDir = _paths.toPathMaybeDir; +pub const toPathNormalized = _paths.toPathNormalized; +pub const toWDirNormalized = _paths.toWDirNormalized; +pub const toWDirPath = _paths.toWDirPath; +pub const toWPath = _paths.toWPath; +pub const toWPathMaybeDir = _paths.toWPathMaybeDir; +pub const toWPathNormalizeAutoExtend = _paths.toWPathNormalizeAutoExtend; +pub const toWPathNormalized = _paths.toWPathNormalized; +pub const toWPathNormalized16 = _paths.toWPathNormalized16; +pub const withoutLeadingPathSeparator = _paths.withoutLeadingPathSeparator; +pub const withoutLeadingSlash = 
_paths.withoutLeadingSlash; +pub const withoutNTPrefix = _paths.withoutNTPrefix; +pub const withoutTrailingSlash = _paths.withoutTrailingSlash; +pub const withoutTrailingSlashWindowsPath = _paths.withoutTrailingSlashWindowsPath; + +pub const log = bun.Output.scoped(.STR, true); +pub const grapheme = @import("./grapheme.zig"); + +const std = @import("std"); +const Environment = @import("./env.zig"); +const string = bun.string; +const bun = @import("bun"); +const js_lexer = @import("./js_lexer.zig"); +const OOM = bun.OOM;