Split up string_immutable into more files (#20446)

Co-authored-by: Jarred-Sumner <709451+Jarred-Sumner@users.noreply.github.com>
Author: Jarred Sumner
Date: 2025-06-17 10:59:07 -07:00
Committed by: GitHub
Parent: 8a1d8047f1
Commit: b99a1256ff

6 changed files with 4151 additions and 4049 deletions


@@ -673,12 +673,16 @@ src/StaticHashMap.zig
src/string_immutable.zig
src/string_types.zig
src/string.zig
src/string/escapeHTML.zig
src/string/HashedString.zig
src/string/MutableString.zig
src/string/paths.zig
src/string/PathString.zig
src/string/SmolStr.zig
src/string/StringBuilder.zig
src/string/StringJoiner.zig
src/string/unicode.zig
src/string/visible.zig
src/string/WTFStringImpl.zig
src/sync.zig
src/sys_uv.zig

src/string/escapeHTML.zig (new file, 640 lines)

@@ -0,0 +1,640 @@
pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) !Escaped(u8) {
const Scalar = struct {
pub const lengths: [std.math.maxInt(u8) + 1]u4 = brk: {
var values: [std.math.maxInt(u8) + 1]u4 = undefined;
for (values, 0..) |_, i| {
switch (i) {
'"' => {
values[i] = "&quot;".len;
},
'&' => {
values[i] = "&amp;".len;
},
'\'' => {
values[i] = "&#x27;".len;
},
'<' => {
values[i] = "&lt;".len;
},
'>' => {
values[i] = "&gt;".len;
},
else => {
values[i] = 1;
},
}
}
break :brk values;
};
fn appendString(buf: [*]u8, comptime str: []const u8) callconv(bun.callconv_inline) usize {
buf[0..str.len].* = str[0..str.len].*;
return str.len;
}
pub fn append(buf: [*]u8, char: u8) callconv(bun.callconv_inline) usize {
if (lengths[char] == 1) {
buf[0] = char;
return 1;
}
return switch (char) {
'"' => appendString(buf, "&quot;"),
'&' => appendString(buf, "&amp;"),
'\'' => appendString(buf, "&#x27;"),
'<' => appendString(buf, "&lt;"),
'>' => appendString(buf, "&gt;"),
else => unreachable,
};
}
pub fn push(comptime len: anytype, chars_: *const [len]u8, allo: std.mem.Allocator) callconv(bun.callconv_inline) Escaped(u8) {
const chars = chars_.*;
var total: usize = 0;
comptime var remain_to_comp = len;
comptime var comp_i = 0;
inline while (remain_to_comp > 0) : (remain_to_comp -= 1) {
total += lengths[chars[comp_i]];
comp_i += 1;
}
if (total == len) {
return .{ .original = {} };
}
const output = allo.alloc(u8, total) catch unreachable;
var head = output.ptr;
inline for (comptime bun.range(0, len)) |i| {
head += @This().append(head, chars[i]);
}
return Escaped(u8){ .allocated = output };
}
};
@setEvalBranchQuota(5000);
switch (latin1.len) {
0 => return Escaped(u8){ .static = "" },
1 => return switch (latin1[0]) {
'"' => Escaped(u8){ .static = "&quot;" },
'&' => Escaped(u8){ .static = "&amp;" },
'\'' => Escaped(u8){ .static = "&#x27;" },
'<' => Escaped(u8){ .static = "&lt;" },
'>' => Escaped(u8){ .static = "&gt;" },
else => Escaped(u8){ .original = {} },
},
2 => {
const first: []const u8 = switch (latin1[0]) {
'"' => "&quot;",
'&' => "&amp;",
'\'' => "&#x27;",
'<' => "&lt;",
'>' => "&gt;",
else => latin1[0..1],
};
const second: []const u8 = switch (latin1[1]) {
'"' => "&quot;",
'&' => "&amp;",
'\'' => "&#x27;",
'<' => "&lt;",
'>' => "&gt;",
else => latin1[1..2],
};
if (first.len == 1 and second.len == 1) {
return Escaped(u8){ .original = {} };
}
return Escaped(u8){ .allocated = strings.append(allocator, first, second) catch unreachable };
},
// The simd implementation is slower for inputs less than 32 bytes.
3 => return Scalar.push(3, latin1[0..3], allocator),
4 => return Scalar.push(4, latin1[0..4], allocator),
5 => return Scalar.push(5, latin1[0..5], allocator),
6 => return Scalar.push(6, latin1[0..6], allocator),
7 => return Scalar.push(7, latin1[0..7], allocator),
8 => return Scalar.push(8, latin1[0..8], allocator),
9 => return Scalar.push(9, latin1[0..9], allocator),
10 => return Scalar.push(10, latin1[0..10], allocator),
11 => return Scalar.push(11, latin1[0..11], allocator),
12 => return Scalar.push(12, latin1[0..12], allocator),
13 => return Scalar.push(13, latin1[0..13], allocator),
14 => return Scalar.push(14, latin1[0..14], allocator),
15 => return Scalar.push(15, latin1[0..15], allocator),
16 => return Scalar.push(16, latin1[0..16], allocator),
17 => return Scalar.push(17, latin1[0..17], allocator),
18 => return Scalar.push(18, latin1[0..18], allocator),
19 => return Scalar.push(19, latin1[0..19], allocator),
20 => return Scalar.push(20, latin1[0..20], allocator),
21 => return Scalar.push(21, latin1[0..21], allocator),
22 => return Scalar.push(22, latin1[0..22], allocator),
23 => return Scalar.push(23, latin1[0..23], allocator),
24 => return Scalar.push(24, latin1[0..24], allocator),
25 => return Scalar.push(25, latin1[0..25], allocator),
26 => return Scalar.push(26, latin1[0..26], allocator),
27 => return Scalar.push(27, latin1[0..27], allocator),
28 => return Scalar.push(28, latin1[0..28], allocator),
29 => return Scalar.push(29, latin1[0..29], allocator),
30 => return Scalar.push(30, latin1[0..30], allocator),
31 => return Scalar.push(31, latin1[0..31], allocator),
32 => return Scalar.push(32, latin1[0..32], allocator),
else => {
var remaining = latin1;
const vec_chars = "\"&'<>";
const vecs: [vec_chars.len]AsciiVector = comptime brk: {
var _vecs: [vec_chars.len]AsciiVector = undefined;
for (vec_chars, 0..) |c, i| {
_vecs[i] = @splat(c);
}
break :brk _vecs;
};
var any_needs_escape = false;
var buf: std.ArrayList(u8) = std.ArrayList(u8){
.items = &.{},
.capacity = 0,
.allocator = allocator,
};
if (comptime Environment.enableSIMD) {
// pass #1: scan for any characters that need escaping
// assume most strings won't need any escaping, so don't actually allocate the buffer
scan_and_allocate_lazily: while (remaining.len >= ascii_vector_size) {
if (comptime Environment.allow_assert) assert(!any_needs_escape);
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
if (@reduce(.Max, @as(AsciiVectorU1, @bitCast((vec == vecs[0]))) |
@as(AsciiVectorU1, @bitCast((vec == vecs[1]))) |
@as(AsciiVectorU1, @bitCast((vec == vecs[2]))) |
@as(AsciiVectorU1, @bitCast((vec == vecs[3]))) |
@as(AsciiVectorU1, @bitCast((vec == vecs[4])))) == 1)
{
if (comptime Environment.allow_assert) assert(buf.capacity == 0);
buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
const copy_len = @intFromPtr(remaining.ptr) - @intFromPtr(latin1.ptr);
buf.appendSliceAssumeCapacity(latin1[0..copy_len]);
any_needs_escape = true;
inline for (0..ascii_vector_size) |i| {
switch (vec[i]) {
'"' => {
buf.ensureUnusedCapacity((ascii_vector_size - i) + "&quot;".len) catch unreachable;
buf.items.ptr[buf.items.len .. buf.items.len + "&quot;".len][0.."&quot;".len].* = "&quot;".*;
buf.items.len += "&quot;".len;
},
'&' => {
buf.ensureUnusedCapacity((ascii_vector_size - i) + "&amp;".len) catch unreachable;
buf.items.ptr[buf.items.len .. buf.items.len + "&amp;".len][0.."&amp;".len].* = "&amp;".*;
buf.items.len += "&amp;".len;
},
'\'' => {
buf.ensureUnusedCapacity((ascii_vector_size - i) + "&#x27;".len) catch unreachable;
buf.items.ptr[buf.items.len .. buf.items.len + "&#x27;".len][0.."&#x27;".len].* = "&#x27;".*;
buf.items.len += "&#x27;".len;
},
'<' => {
buf.ensureUnusedCapacity((ascii_vector_size - i) + "&lt;".len) catch unreachable;
buf.items.ptr[buf.items.len .. buf.items.len + "&lt;".len][0.."&lt;".len].* = "&lt;".*;
buf.items.len += "&lt;".len;
},
'>' => {
buf.ensureUnusedCapacity((ascii_vector_size - i) + "&gt;".len) catch unreachable;
buf.items.ptr[buf.items.len .. buf.items.len + "&gt;".len][0.."&gt;".len].* = "&gt;".*;
buf.items.len += "&gt;".len;
},
else => |c| {
buf.appendAssumeCapacity(c);
},
}
}
remaining = remaining[ascii_vector_size..];
break :scan_and_allocate_lazily;
}
remaining = remaining[ascii_vector_size..];
}
}
if (any_needs_escape) {
// pass #2: we found something that needed an escape
// so we'll go ahead and copy the buffer into a new buffer
while (remaining.len >= ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
if (@reduce(.Max, @as(AsciiVectorU1, @bitCast((vec == vecs[0]))) |
@as(AsciiVectorU1, @bitCast((vec == vecs[1]))) |
@as(AsciiVectorU1, @bitCast((vec == vecs[2]))) |
@as(AsciiVectorU1, @bitCast((vec == vecs[3]))) |
@as(AsciiVectorU1, @bitCast((vec == vecs[4])))) == 1)
{
buf.ensureUnusedCapacity(ascii_vector_size + 6) catch unreachable;
inline for (0..ascii_vector_size) |i| {
switch (vec[i]) {
'"' => {
buf.ensureUnusedCapacity((ascii_vector_size - i) + "&quot;".len) catch unreachable;
buf.items.ptr[buf.items.len .. buf.items.len + "&quot;".len][0.."&quot;".len].* = "&quot;".*;
buf.items.len += "&quot;".len;
},
'&' => {
buf.ensureUnusedCapacity((ascii_vector_size - i) + "&amp;".len) catch unreachable;
buf.items.ptr[buf.items.len .. buf.items.len + "&amp;".len][0.."&amp;".len].* = "&amp;".*;
buf.items.len += "&amp;".len;
},
'\'' => {
buf.ensureUnusedCapacity((ascii_vector_size - i) + "&#x27;".len) catch unreachable;
buf.items.ptr[buf.items.len .. buf.items.len + "&#x27;".len][0.."&#x27;".len].* = "&#x27;".*;
buf.items.len += "&#x27;".len;
},
'<' => {
buf.ensureUnusedCapacity((ascii_vector_size - i) + "&lt;".len) catch unreachable;
buf.items.ptr[buf.items.len .. buf.items.len + "&lt;".len][0.."&lt;".len].* = "&lt;".*;
buf.items.len += "&lt;".len;
},
'>' => {
buf.ensureUnusedCapacity((ascii_vector_size - i) + "&gt;".len) catch unreachable;
buf.items.ptr[buf.items.len .. buf.items.len + "&gt;".len][0.."&gt;".len].* = "&gt;".*;
buf.items.len += "&gt;".len;
},
else => |c| {
buf.appendAssumeCapacity(c);
},
}
}
remaining = remaining[ascii_vector_size..];
continue;
}
try buf.ensureUnusedCapacity(ascii_vector_size);
buf.items.ptr[buf.items.len .. buf.items.len + ascii_vector_size][0..ascii_vector_size].* = remaining[0..ascii_vector_size].*;
buf.items.len += ascii_vector_size;
remaining = remaining[ascii_vector_size..];
}
}
var ptr = remaining.ptr;
const end = remaining.ptr + remaining.len;
if (!any_needs_escape) {
scan_and_allocate_lazily: while (ptr != end) : (ptr += 1) {
switch (ptr[0]) {
'"', '&', '\'', '<', '>' => |c| {
if (comptime Environment.allow_assert) assert(buf.capacity == 0);
buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + @as(usize, Scalar.lengths[c]));
const copy_len = @intFromPtr(ptr) - @intFromPtr(latin1.ptr);
if (comptime Environment.allow_assert) assert(copy_len <= buf.capacity);
buf.items.len = copy_len;
@memcpy(buf.items[0..copy_len], latin1[0..copy_len]);
any_needs_escape = true;
break :scan_and_allocate_lazily;
},
else => {},
}
}
}
while (ptr != end) : (ptr += 1) {
switch (ptr[0]) {
'"' => {
buf.appendSlice("&quot;") catch unreachable;
},
'&' => {
buf.appendSlice("&amp;") catch unreachable;
},
'\'' => {
buf.appendSlice("&#x27;") catch unreachable; // modified from escape-html; used to be '&#39'
},
'<' => {
buf.appendSlice("&lt;") catch unreachable;
},
'>' => {
buf.appendSlice("&gt;") catch unreachable;
},
else => |c| {
buf.append(c) catch unreachable;
},
}
}
if (!any_needs_escape) {
if (comptime Environment.allow_assert) assert(buf.capacity == 0);
return Escaped(u8){ .original = {} };
}
return Escaped(u8){ .allocated = try buf.toOwnedSlice() };
},
}
}
fn Escaped(comptime T: type) type {
return union(enum) {
static: []const u8,
original: void,
allocated: []T,
};
}
pub fn escapeHTMLForUTF16Input(allocator: std.mem.Allocator, utf16: []const u16) !Escaped(u16) {
const Scalar = struct {
pub const lengths: [std.math.maxInt(u8) + 1]u4 = brk: {
var values: [std.math.maxInt(u8) + 1]u4 = undefined;
for (values, 0..) |_, i| {
values[i] = switch (i) {
'"' => "&quot;".len,
'&' => "&amp;".len,
'\'' => "&#x27;".len,
'<' => "&lt;".len,
'>' => "&gt;".len,
else => 1,
};
}
break :brk values;
};
};
switch (utf16.len) {
0 => return Escaped(u16){ .static = &[_]u8{} },
1 => {
switch (utf16[0]) {
'"' => return Escaped(u16){ .static = "&quot;" },
'&' => return Escaped(u16){ .static = "&amp;" },
'\'' => return Escaped(u16){ .static = "&#x27;" },
'<' => return Escaped(u16){ .static = "&lt;" },
'>' => return Escaped(u16){ .static = "&gt;" },
else => return Escaped(u16){ .original = {} },
}
},
2 => {
const first_16 = switch (utf16[0]) {
'"' => toUTF16Literal("&quot;"),
'&' => toUTF16Literal("&amp;"),
'\'' => toUTF16Literal("&#x27;"),
'<' => toUTF16Literal("&lt;"),
'>' => toUTF16Literal("&gt;"),
else => @as([]const u16, utf16[0..1]),
};
const second_16 = switch (utf16[1]) {
'"' => toUTF16Literal("&quot;"),
'&' => toUTF16Literal("&amp;"),
'\'' => toUTF16Literal("&#x27;"),
'<' => toUTF16Literal("&lt;"),
'>' => toUTF16Literal("&gt;"),
else => @as([]const u16, utf16[1..2]),
};
if (first_16.ptr == utf16.ptr and second_16.ptr == utf16.ptr + 1) {
return Escaped(u16){ .original = {} };
}
var buf = allocator.alloc(u16, first_16.len + second_16.len) catch unreachable;
bun.copy(u16, buf, first_16);
bun.copy(u16, buf[first_16.len..], second_16);
return Escaped(u16){ .allocated = buf };
},
else => {
var remaining = utf16;
var any_needs_escape = false;
var buf: std.ArrayList(u16) = undefined;
if (comptime Environment.enableSIMD) {
const vec_chars = "\"&'<>";
const vecs: [vec_chars.len]AsciiU16Vector = brk: {
var _vecs: [vec_chars.len]AsciiU16Vector = undefined;
for (vec_chars, 0..) |c, i| {
_vecs[i] = @splat(@as(u16, c));
}
break :brk _vecs;
};
// pass #1: scan for any characters that need escaping
// assume most strings won't need any escaping, so don't actually allocate the buffer
scan_and_allocate_lazily: while (remaining.len >= ascii_u16_vector_size) {
if (comptime Environment.allow_assert) assert(!any_needs_escape);
const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*;
if (@reduce(.Max, @as(AsciiVectorU16U1, @bitCast(vec > @as(AsciiU16Vector, @splat(@as(u16, 127))))) |
@as(AsciiVectorU16U1, @bitCast((vec == vecs[0]))) |
@as(AsciiVectorU16U1, @bitCast((vec == vecs[1]))) |
@as(AsciiVectorU16U1, @bitCast((vec == vecs[2]))) |
@as(AsciiVectorU16U1, @bitCast((vec == vecs[3]))) |
@as(AsciiVectorU16U1, @bitCast((vec == vecs[4])))) == 1)
{
var i: u16 = 0;
lazy: {
while (i < ascii_u16_vector_size) {
switch (remaining[i]) {
'"', '&', '\'', '<', '>' => {
any_needs_escape = true;
break :lazy;
},
128...std.math.maxInt(u16) => {
const cp = utf16Codepoint([]const u16, remaining[i..]);
i += @as(u16, cp.len);
},
else => {
i += 1;
},
}
}
}
if (!any_needs_escape) {
remaining = remaining[i..];
continue :scan_and_allocate_lazily;
}
if (comptime Environment.allow_assert) assert(@intFromPtr(remaining.ptr + i) >= @intFromPtr(utf16.ptr));
const to_copy = std.mem.sliceAsBytes(utf16)[0 .. @intFromPtr(remaining.ptr + i) - @intFromPtr(utf16.ptr)];
const to_copy_16 = std.mem.bytesAsSlice(u16, to_copy);
buf = try std.ArrayList(u16).initCapacity(allocator, utf16.len + 6);
try buf.appendSlice(to_copy_16);
while (i < ascii_u16_vector_size) {
switch (remaining[i]) {
'"', '&', '\'', '<', '>' => |c| {
const result = switch (c) {
'"' => toUTF16Literal("&quot;"),
'&' => toUTF16Literal("&amp;"),
'\'' => toUTF16Literal("&#x27;"),
'<' => toUTF16Literal("&lt;"),
'>' => toUTF16Literal("&gt;"),
else => unreachable,
};
buf.appendSlice(result) catch unreachable;
i += 1;
},
128...std.math.maxInt(u16) => {
const cp = utf16Codepoint([]const u16, remaining[i..]);
buf.appendSlice(remaining[i..][0..@as(usize, cp.len)]) catch unreachable;
i += @as(u16, cp.len);
},
else => |c| {
i += 1;
buf.append(c) catch unreachable;
},
}
}
// edge case: the code point width could exceed ascii_u16_vector_size
remaining = remaining[i..];
break :scan_and_allocate_lazily;
}
remaining = remaining[ascii_u16_vector_size..];
}
if (any_needs_escape) {
// pass #2: we found something that needed an escape
// but there's still some more text to process
// so we'll go ahead and copy the buffer into a new buffer
while (remaining.len >= ascii_u16_vector_size) {
const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*;
if (@reduce(.Max, @as(AsciiVectorU16U1, @bitCast(vec > @as(AsciiU16Vector, @splat(@as(u16, 127))))) |
@as(AsciiVectorU16U1, @bitCast((vec == vecs[0]))) |
@as(AsciiVectorU16U1, @bitCast((vec == vecs[1]))) |
@as(AsciiVectorU16U1, @bitCast((vec == vecs[2]))) |
@as(AsciiVectorU16U1, @bitCast((vec == vecs[3]))) |
@as(AsciiVectorU16U1, @bitCast((vec == vecs[4])))) == 1)
{
buf.ensureUnusedCapacity(ascii_u16_vector_size) catch unreachable;
var i: u16 = 0;
while (i < ascii_u16_vector_size) {
switch (remaining[i]) {
'"' => {
buf.appendSlice(toUTF16Literal("&quot;")) catch unreachable;
i += 1;
},
'&' => {
buf.appendSlice(toUTF16Literal("&amp;")) catch unreachable;
i += 1;
},
'\'' => {
buf.appendSlice(toUTF16Literal("&#x27;")) catch unreachable; // modified from escape-html; used to be '&#39'
i += 1;
},
'<' => {
buf.appendSlice(toUTF16Literal("&lt;")) catch unreachable;
i += 1;
},
'>' => {
buf.appendSlice(toUTF16Literal("&gt;")) catch unreachable;
i += 1;
},
128...std.math.maxInt(u16) => {
const cp = utf16Codepoint([]const u16, remaining[i..]);
buf.appendSlice(remaining[i..][0..@as(usize, cp.len)]) catch unreachable;
i += @as(u16, cp.len);
},
else => |c| {
buf.append(c) catch unreachable;
i += 1;
},
}
}
remaining = remaining[i..];
continue;
}
try buf.ensureUnusedCapacity(ascii_u16_vector_size);
buf.items.ptr[buf.items.len .. buf.items.len + ascii_u16_vector_size][0..ascii_u16_vector_size].* = remaining[0..ascii_u16_vector_size].*;
buf.items.len += ascii_u16_vector_size;
remaining = remaining[ascii_u16_vector_size..];
}
}
}
var ptr = remaining.ptr;
const end = remaining.ptr + remaining.len;
if (!any_needs_escape) {
scan_and_allocate_lazily: while (ptr != end) {
switch (ptr[0]) {
'"', '&', '\'', '<', '>' => |c| {
buf = try std.ArrayList(u16).initCapacity(allocator, utf16.len + @as(usize, Scalar.lengths[c]));
if (comptime Environment.allow_assert) assert(@intFromPtr(ptr) >= @intFromPtr(utf16.ptr));
const to_copy = std.mem.sliceAsBytes(utf16)[0 .. @intFromPtr(ptr) - @intFromPtr(utf16.ptr)];
const to_copy_16 = std.mem.bytesAsSlice(u16, to_copy);
try buf.appendSlice(to_copy_16);
any_needs_escape = true;
break :scan_and_allocate_lazily;
},
128...std.math.maxInt(u16) => {
const cp = utf16Codepoint([]const u16, ptr[0..if (ptr + 1 == end) 1 else 2]);
ptr += @as(u16, cp.len);
},
else => {
ptr += 1;
},
}
}
}
while (ptr != end) {
switch (ptr[0]) {
'"' => {
buf.appendSlice(toUTF16Literal("&quot;")) catch unreachable;
ptr += 1;
},
'&' => {
buf.appendSlice(toUTF16Literal("&amp;")) catch unreachable;
ptr += 1;
},
'\'' => {
buf.appendSlice(toUTF16Literal("&#x27;")) catch unreachable; // modified from escape-html; used to be '&#39'
ptr += 1;
},
'<' => {
buf.appendSlice(toUTF16Literal("&lt;")) catch unreachable;
ptr += 1;
},
'>' => {
buf.appendSlice(toUTF16Literal("&gt;")) catch unreachable;
ptr += 1;
},
128...std.math.maxInt(u16) => {
const cp = utf16Codepoint([]const u16, ptr[0..if (ptr + 1 == end) 1 else 2]);
buf.appendSlice(ptr[0..@as(usize, cp.len)]) catch unreachable;
ptr += @as(u16, cp.len);
},
else => |c| {
buf.append(c) catch unreachable;
ptr += 1;
},
}
}
if (!any_needs_escape) {
return Escaped(u16){ .original = {} };
}
return Escaped(u16){ .allocated = try buf.toOwnedSlice() };
},
}
}
const std = @import("std");
const bun = @import("bun");
const Environment = bun.Environment;
const assert = bun.assert;
const ascii_u16_vector_size = strings.ascii_u16_vector_size;
const AsciiU16Vector = strings.AsciiU16Vector;
const utf16Codepoint = strings.utf16Codepoint;
const toUTF16Literal = strings.toUTF16Literal;
const strings = bun.strings;
const AsciiVectorU16U1 = strings.AsciiVectorU16U1;
const AsciiVector = strings.AsciiVector;
const ascii_vector_size = strings.ascii_vector_size;
const AsciiVectorU1 = strings.AsciiVectorU1;
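
For orientation, here is a minimal, hypothetical sketch of how a caller might consume the `Escaped(u8)` union returned by `escapeHTMLForLatin1Input`. It is not part of this commit; the `strings` re-export path and the `writeEscapedAttribute` helper name are assumptions.

```zig
const std = @import("std");
const bun = @import("bun");
const strings = bun.strings;

// Hypothetical helper: write an HTML-escaped attribute value, reusing the
// input buffer when no character needed escaping.
fn writeEscapedAttribute(allocator: std.mem.Allocator, writer: anytype, value: []const u8) !void {
    switch (try strings.escapeHTMLForLatin1Input(allocator, value)) {
        // `.static` points at a comptime string literal; nothing to free.
        .static => |s| try writer.writeAll(s),
        // `.original` means no byte needed escaping; the input can be written as-is.
        .original => try writer.writeAll(value),
        // `.allocated` owns a freshly allocated buffer holding the escaped output.
        .allocated => |escaped| {
            defer allocator.free(escaped);
            try writer.writeAll(escaped);
        },
    }
}
```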

src/string/paths.zig (new file, 461 lines)

@@ -0,0 +1,461 @@
/// Checks if a path is missing a windows drive letter. For windows APIs,
/// this is used for an assertion, and PosixToWinNormalizer can help make
/// an absolute path contain a drive letter.
pub fn isWindowsAbsolutePathMissingDriveLetter(comptime T: type, chars: []const T) bool {
bun.unsafeAssert(bun.path.Platform.windows.isAbsoluteT(T, chars));
bun.unsafeAssert(chars.len > 0);
// 'C:\hello' -> false
// This is the most common situation, so we check it first
if (!(chars[0] == '/' or chars[0] == '\\')) {
bun.unsafeAssert(chars.len > 2);
bun.unsafeAssert(chars[1] == ':');
return false;
}
if (chars.len > 4) {
// '\??\hello' -> false (has the NT object prefix)
if (chars[1] == '?' and
chars[2] == '?' and
(chars[3] == '/' or chars[3] == '\\'))
return false;
// '\\?\hello' -> false (has the other NT object prefix)
// '\\.\hello' -> false (has the NT device prefix)
if ((chars[1] == '/' or chars[1] == '\\') and
(chars[2] == '?' or chars[2] == '.') and
(chars[3] == '/' or chars[3] == '\\'))
return false;
}
// A path starting with `/` can be a UNC path with forward slashes,
// or actually just a posix path.
//
// '\\Server\Share' -> false (unc)
// '\\Server\\Share' -> true (not unc because extra slashes)
// '\Server\Share' -> true (posix path)
return bun.path.windowsFilesystemRootT(T, chars).len == 1;
}
pub fn fromWPath(buf: []u8, utf16: []const u16) [:0]const u8 {
bun.unsafeAssert(buf.len > 0);
const to_copy = trimPrefixComptime(u16, utf16, bun.windows.long_path_prefix);
const encode_into_result = copyUTF16IntoUTF8(buf[0 .. buf.len - 1], []const u16, to_copy, false);
bun.unsafeAssert(encode_into_result.written < buf.len);
buf[encode_into_result.written] = 0;
return buf[0..encode_into_result.written :0];
}
pub fn withoutNTPrefix(comptime T: type, path: []const T) []const T {
if (comptime !Environment.isWindows) return path;
const cmp = if (T == u8)
hasPrefixComptime
else
hasPrefixComptimeUTF16;
if (cmp(path, &bun.windows.nt_object_prefix_u8)) {
return path[bun.windows.nt_object_prefix.len..];
}
if (cmp(path, &bun.windows.long_path_prefix_u8)) {
return path[bun.windows.long_path_prefix.len..];
}
if (cmp(path, &bun.windows.nt_unc_object_prefix_u8)) {
return path[bun.windows.nt_unc_object_prefix.len..];
}
return path;
}
pub fn toNTPath(wbuf: []u16, utf8: []const u8) [:0]u16 {
if (!std.fs.path.isAbsoluteWindows(utf8)) {
return toWPathNormalized(wbuf, utf8);
}
if (strings.hasPrefixComptime(utf8, &bun.windows.nt_object_prefix_u8) or
strings.hasPrefixComptime(utf8, &bun.windows.nt_unc_object_prefix_u8))
{
return wbuf[0..toWPathNormalized(wbuf, utf8).len :0];
}
// UNC absolute path, replace leading '\\' with '\??\UNC\'
if (strings.hasPrefixComptime(utf8, "\\\\")) {
if (strings.hasPrefixComptime(utf8[2..], bun.windows.long_path_prefix_u8[2..])) {
const prefix = bun.windows.nt_object_prefix;
wbuf[0..prefix.len].* = prefix;
return wbuf[0 .. toWPathNormalized(wbuf[prefix.len..], utf8[4..]).len + prefix.len :0];
}
const prefix = bun.windows.nt_unc_object_prefix;
wbuf[0..prefix.len].* = prefix;
return wbuf[0 .. toWPathNormalized(wbuf[prefix.len..], utf8[2..]).len + prefix.len :0];
}
const prefix = bun.windows.nt_object_prefix;
wbuf[0..prefix.len].* = prefix;
return wbuf[0 .. toWPathNormalized(wbuf[prefix.len..], utf8).len + prefix.len :0];
}
pub fn toNTPath16(wbuf: []u16, path: []const u16) [:0]u16 {
if (!std.fs.path.isAbsoluteWindowsWTF16(path)) {
return toWPathNormalized16(wbuf, path);
}
if (strings.hasPrefixComptimeUTF16(path, &bun.windows.nt_object_prefix_u8) or
strings.hasPrefixComptimeUTF16(path, &bun.windows.nt_unc_object_prefix_u8))
{
return wbuf[0..toWPathNormalized16(wbuf, path).len :0];
}
if (strings.hasPrefixComptimeUTF16(path, "\\\\")) {
if (strings.hasPrefixComptimeUTF16(path[2..], bun.windows.long_path_prefix_u8[2..])) {
const prefix = bun.windows.nt_object_prefix;
wbuf[0..prefix.len].* = prefix;
return wbuf[0 .. toWPathNormalized16(wbuf[prefix.len..], path[4..]).len + prefix.len :0];
}
const prefix = bun.windows.nt_unc_object_prefix;
wbuf[0..prefix.len].* = prefix;
return wbuf[0 .. toWPathNormalized16(wbuf[prefix.len..], path[2..]).len + prefix.len :0];
}
const prefix = bun.windows.nt_object_prefix;
wbuf[0..prefix.len].* = prefix;
return wbuf[0 .. toWPathNormalized16(wbuf[prefix.len..], path).len + prefix.len :0];
}
pub fn toNTMaxPath(buf: []u8, utf8: []const u8) [:0]const u8 {
if (!std.fs.path.isAbsoluteWindows(utf8) or utf8.len <= 260) {
@memcpy(buf[0..utf8.len], utf8);
buf[utf8.len] = 0;
return buf[0..utf8.len :0];
}
const prefix = bun.windows.nt_maxpath_prefix_u8;
buf[0..prefix.len].* = prefix;
return buf[0 .. toPathNormalized(buf[prefix.len..], utf8).len + prefix.len :0];
}
pub fn addNTPathPrefix(wbuf: []u16, utf16: []const u16) [:0]u16 {
wbuf[0..bun.windows.nt_object_prefix.len].* = bun.windows.nt_object_prefix;
@memcpy(wbuf[bun.windows.nt_object_prefix.len..][0..utf16.len], utf16);
wbuf[utf16.len + bun.windows.nt_object_prefix.len] = 0;
return wbuf[0 .. utf16.len + bun.windows.nt_object_prefix.len :0];
}
pub fn addNTPathPrefixIfNeeded(wbuf: []u16, utf16: []const u16) [:0]u16 {
if (hasPrefixComptimeType(u16, utf16, bun.windows.nt_object_prefix)) {
@memcpy(wbuf[0..utf16.len], utf16);
wbuf[utf16.len] = 0;
return wbuf[0..utf16.len :0];
}
if (hasPrefixComptimeType(u16, utf16, bun.windows.long_path_prefix)) {
// Replace prefix
return addNTPathPrefix(wbuf, utf16[bun.windows.long_path_prefix.len..]);
}
return addNTPathPrefix(wbuf, utf16);
}
// These are the same because they don't have rules like needing a trailing slash
pub const toNTDir = toNTPath;
pub fn toExtendedPathNormalized(wbuf: []u16, utf8: []const u8) [:0]const u16 {
bun.unsafeAssert(wbuf.len > 4);
wbuf[0..4].* = bun.windows.long_path_prefix;
return wbuf[0 .. toWPathNormalized(wbuf[4..], utf8).len + 4 :0];
}
pub fn toWPathNormalizeAutoExtend(wbuf: []u16, utf8: []const u8) [:0]const u16 {
if (std.fs.path.isAbsoluteWindows(utf8)) {
return toExtendedPathNormalized(wbuf, utf8);
}
return toWPathNormalized(wbuf, utf8);
}
pub fn toWPathNormalized(wbuf: []u16, utf8: []const u8) [:0]u16 {
const renormalized = bun.PathBufferPool.get();
defer bun.PathBufferPool.put(renormalized);
var path_to_use = normalizeSlashesOnly(renormalized, utf8, '\\');
// is there a trailing slash? Let's remove it before converting to UTF-16
if (path_to_use.len > 3 and bun.path.isSepAny(path_to_use[path_to_use.len - 1])) {
path_to_use = path_to_use[0 .. path_to_use.len - 1];
}
return toWPath(wbuf, path_to_use);
}
pub fn toWPathNormalized16(wbuf: []u16, path: []const u16) [:0]u16 {
var path_to_use = normalizeSlashesOnlyT(u16, wbuf, path, '\\', true);
// is there a trailing slash? Let's remove it before null-terminating
if (path_to_use.len > 3 and bun.path.isSepAnyT(u16, path_to_use[path_to_use.len - 1])) {
path_to_use = path_to_use[0 .. path_to_use.len - 1];
}
wbuf[path_to_use.len] = 0;
return wbuf[0..path_to_use.len :0];
}
pub fn toPathNormalized(buf: []u8, utf8: []const u8) [:0]const u8 {
const renormalized = bun.PathBufferPool.get();
defer bun.PathBufferPool.put(renormalized);
var path_to_use = normalizeSlashesOnly(renormalized, utf8, '\\');
// is there a trailing slash? Let's remove it before copying
if (path_to_use.len > 3 and bun.path.isSepAny(path_to_use[path_to_use.len - 1])) {
path_to_use = path_to_use[0 .. path_to_use.len - 1];
}
return toPath(buf, path_to_use);
}
pub fn normalizeSlashesOnlyT(comptime T: type, buf: []T, path: []const T, comptime desired_slash: u8, comptime always_copy: bool) []const T {
comptime bun.unsafeAssert(desired_slash == '/' or desired_slash == '\\');
const undesired_slash = if (desired_slash == '/') '\\' else '/';
if (bun.strings.containsCharT(T, path, undesired_slash)) {
@memcpy(buf[0..path.len], path);
for (buf[0..path.len]) |*c| {
if (c.* == undesired_slash) {
c.* = desired_slash;
}
}
return buf[0..path.len];
}
if (comptime always_copy) {
@memcpy(buf[0..path.len], path);
return buf[0..path.len];
}
return path;
}
pub fn normalizeSlashesOnly(buf: []u8, utf8: []const u8, comptime desired_slash: u8) []const u8 {
return normalizeSlashesOnlyT(u8, buf, utf8, desired_slash, false);
}
pub fn toWDirNormalized(wbuf: []u16, utf8: []const u8) [:0]const u16 {
var renormalized: ?*bun.PathBuffer = null;
defer if (renormalized) |r| bun.PathBufferPool.put(r);
var path_to_use = utf8;
if (bun.strings.containsChar(utf8, '/')) {
renormalized = bun.PathBufferPool.get();
@memcpy(renormalized.?[0..utf8.len], utf8);
for (renormalized.?[0..utf8.len]) |*c| {
if (c.* == '/') {
c.* = '\\';
}
}
path_to_use = renormalized.?[0..utf8.len];
}
return toWDirPath(wbuf, path_to_use);
}
pub fn toWPath(wbuf: []u16, utf8: []const u8) [:0]u16 {
return toWPathMaybeDir(wbuf, utf8, false);
}
pub fn toPath(buf: []u8, utf8: []const u8) [:0]u8 {
return toPathMaybeDir(buf, utf8, false);
}
pub fn toWDirPath(wbuf: []u16, utf8: []const u8) [:0]const u16 {
return toWPathMaybeDir(wbuf, utf8, true);
}
pub fn toKernel32Path(wbuf: []u16, utf8: []const u8) [:0]u16 {
const path = if (hasPrefixComptime(utf8, bun.windows.nt_object_prefix_u8))
utf8[bun.windows.nt_object_prefix_u8.len..]
else
utf8;
if (hasPrefixComptime(path, bun.windows.long_path_prefix_u8)) {
return toWPath(wbuf, path);
}
if (utf8.len > 2 and bun.path.isDriveLetter(utf8[0]) and utf8[1] == ':' and bun.path.isSepAny(utf8[2])) {
wbuf[0..4].* = bun.windows.long_path_prefix;
const wpath = toWPath(wbuf[4..], path);
return wbuf[0 .. wpath.len + 4 :0];
}
return toWPath(wbuf, path);
}
fn isUNCPath(comptime T: type, path: []const T) bool {
return path.len >= 3 and
bun.path.Platform.windows.isSeparatorT(T, path[0]) and
bun.path.Platform.windows.isSeparatorT(T, path[1]) and
!bun.path.Platform.windows.isSeparatorT(T, path[2]) and
path[2] != '.';
}
pub fn assertIsValidWindowsPath(comptime T: type, path: []const T) void {
if (Environment.allow_assert and Environment.isWindows) {
if (bun.path.Platform.windows.isAbsoluteT(T, path) and
isWindowsAbsolutePathMissingDriveLetter(T, path) and
// is it a null device path? that's not an error. it's just a weird file path.
!eqlComptimeT(T, path, "\\\\.\\NUL") and !eqlComptimeT(T, path, "\\\\.\\nul") and !eqlComptimeT(T, path, "\\nul") and !eqlComptimeT(T, path, "\\NUL") and !isUNCPath(T, path))
{
std.debug.panic("Internal Error: Do not pass posix paths to Windows APIs, was given '{s}'" ++ if (Environment.isDebug) " (missing a root like 'C:\\', see PosixToWinNormalizer for why this is an assertion)" else ". Please open an issue on GitHub with a reproduction.", .{
if (T == u8) path else bun.fmt.utf16(path),
});
}
if (hasPrefixComptimeType(T, path, ":/") and Environment.isDebug) {
std.debug.panic("Path passed to windows API '{s}' is almost certainly invalid. Where did the drive letter go?", .{
if (T == u8) path else bun.fmt.utf16(path),
});
}
}
}
pub fn toWPathMaybeDir(wbuf: []u16, utf8: []const u8, comptime add_trailing_lash: bool) [:0]u16 {
bun.unsafeAssert(wbuf.len > 0);
var result = bun.simdutf.convert.utf8.to.utf16.with_errors.le(
utf8,
wbuf[0..wbuf.len -| (1 + @as(usize, @intFromBool(add_trailing_lash)))],
);
// Many Windows APIs expect normalized path slashes, particularly when the
// long path prefix or the NT object prefix is added. To make this easier,
// if a little redundant, this function always normalizes the slashes here.
//
// An example of this is GetFileAttributesW(L"C:\\hello/world.txt") being OK
// but GetFileAttributesW(L"\\\\?\\C:\\hello/world.txt") is NOT
bun.path.dangerouslyConvertPathToWindowsInPlace(u16, wbuf[0..result.count]);
if (add_trailing_lash and result.count > 0 and wbuf[result.count - 1] != '\\') {
wbuf[result.count] = '\\';
result.count += 1;
}
wbuf[result.count] = 0;
return wbuf[0..result.count :0];
}
pub fn toPathMaybeDir(buf: []u8, utf8: []const u8, comptime add_trailing_lash: bool) [:0]u8 {
bun.unsafeAssert(buf.len > 0);
var len = utf8.len;
@memcpy(buf[0..len], utf8[0..len]);
if (add_trailing_lash and len > 0 and buf[len - 1] != '\\') {
buf[len] = '\\';
len += 1;
}
buf[len] = 0;
return buf[0..len :0];
}
pub fn cloneNormalizingSeparators(
allocator: std.mem.Allocator,
input: []const u8,
) ![]u8 {
// remove duplicate slashes in the file path
const base = withoutTrailingSlash(input);
var tokenized = std.mem.tokenizeScalar(u8, base, std.fs.path.sep);
var buf = try allocator.alloc(u8, base.len + 2);
if (comptime Environment.allow_assert) assert(base.len > 0);
if (base[0] == std.fs.path.sep) {
buf[0] = std.fs.path.sep;
}
var remain = buf[@as(usize, @intFromBool(base[0] == std.fs.path.sep))..];
while (tokenized.next()) |token| {
if (token.len == 0) continue;
bun.copy(u8, remain, token);
remain[token.len..][0] = std.fs.path.sep;
remain = remain[token.len + 1 ..];
}
if ((remain.ptr - 1) != buf.ptr and (remain.ptr - 1)[0] != std.fs.path.sep) {
remain[0] = std.fs.path.sep;
remain = remain[1..];
}
remain[0] = 0;
return buf[0 .. @intFromPtr(remain.ptr) - @intFromPtr(buf.ptr)];
}
pub fn pathContainsNodeModulesFolder(path: []const u8) bool {
return strings.contains(path, comptime std.fs.path.sep_str ++ "node_modules" ++ std.fs.path.sep_str);
}
pub fn charIsAnySlash(char: u8) callconv(bun.callconv_inline) bool {
return char == '/' or char == '\\';
}
pub fn startsWithWindowsDriveLetter(s: []const u8) callconv(bun.callconv_inline) bool {
return startsWithWindowsDriveLetterT(u8, s);
}
pub fn startsWithWindowsDriveLetterT(comptime T: type, s: []const T) callconv(bun.callconv_inline) bool {
return s.len > 2 and s[1] == ':' and switch (s[0]) {
'a'...'z', 'A'...'Z' => true,
else => false,
};
}
pub fn withoutTrailingSlash(this: string) []const u8 {
var href = this;
while (href.len > 1 and (switch (href[href.len - 1]) {
'/', '\\' => true,
else => false,
})) {
href.len -= 1;
}
return href;
}
/// Does not strip the device root (C:\ or \\Server\Share\ portion off of the path)
pub fn withoutTrailingSlashWindowsPath(input: string) []const u8 {
if (Environment.isPosix or input.len < 3 or input[1] != ':')
return withoutTrailingSlash(input);
const root_len = bun.path.windowsFilesystemRoot(input).len + 1;
var path = input;
while (path.len > root_len and (switch (path[path.len - 1]) {
'/', '\\' => true,
else => false,
})) {
path.len -= 1;
}
if (Environment.isDebug)
bun.debugAssert(!std.fs.path.isAbsolute(path) or
!isWindowsAbsolutePathMissingDriveLetter(u8, path));
return path;
}
pub fn withoutLeadingSlash(this: string) []const u8 {
return std.mem.trimLeft(u8, this, "/");
}
pub fn withoutLeadingPathSeparator(this: string) []const u8 {
return std.mem.trimLeft(u8, this, &.{std.fs.path.sep});
}
pub fn removeLeadingDotSlash(slice: []const u8) callconv(bun.callconv_inline) []const u8 {
if (slice.len >= 2) {
if ((@as(u16, @bitCast(slice[0..2].*)) == comptime std.mem.readInt(u16, "./", .little)) or
(Environment.isWindows and @as(u16, @bitCast(slice[0..2].*)) == comptime std.mem.readInt(u16, ".\\", .little)))
{
return slice[2..];
}
}
return slice;
}
const bun = @import("bun");
const std = @import("std");
const Environment = bun.Environment;
const strings = bun.strings;
const hasPrefixComptime = strings.hasPrefixComptime;
const hasPrefixComptimeType = strings.hasPrefixComptimeType;
const trimPrefixComptime = strings.trimPrefixComptime;
const copyUTF16IntoUTF8 = strings.copyUTF16IntoUTF8;
const eqlComptimeT = strings.eqlComptimeT;
const string = []const u8;
const assert = bun.assert;
const hasPrefixComptimeUTF16 = strings.hasPrefixComptimeUTF16;
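
As a quick illustration of the smaller helpers above, a hypothetical test sketch (not part of this commit; it assumes these functions remain reachable through `bun.strings`, as in the import block above):

```zig
const std = @import("std");
const bun = @import("bun");
const strings = bun.strings;

test "trailing and leading separator helpers" {
    // removeLeadingDotSlash strips a single leading './' (or '.\' on Windows).
    try std.testing.expectEqualStrings("src/main.zig", strings.removeLeadingDotSlash("./src/main.zig"));

    // withoutTrailingSlash trims any run of trailing separators but keeps a lone root.
    try std.testing.expectEqualStrings("/tmp/dir", strings.withoutTrailingSlash("/tmp/dir///"));
    try std.testing.expectEqualStrings("/", strings.withoutTrailingSlash("/"));

    // withoutLeadingSlash only trims forward slashes, not backslashes.
    try std.testing.expectEqualStrings("a/b", strings.withoutLeadingSlash("///a/b"));
}
```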

src/string/unicode.zig (new file, 2078 lines)

File diff suppressed because it is too large.

src/string/visible.zig (new file, 831 lines)

@@ -0,0 +1,831 @@
pub fn isZeroWidthCodepointType(comptime T: type, cp: T) bool {
if (cp <= 0x1f) {
return true;
}
if (cp >= 0x7f and cp <= 0x9f) {
// C1 control characters
return true;
}
if (comptime @sizeOf(T) == 1) {
return false;
}
if (cp >= 0x300 and cp <= 0x36f) {
// Combining Diacritical Marks
return true;
}
if (cp >= 0x200b and cp <= 0x200f) {
// Modifying Invisible Characters
return true;
}
if (cp >= 0x20d0 and cp <= 0x20ff)
// Combining Diacritical Marks for Symbols
return true;
if (cp >= 0xfe00 and cp <= 0xfe0f)
// Variation Selectors
return true;
if (cp >= 0xfe20 and cp <= 0xfe2f)
// Combining Half Marks
return true;
if (cp == 0xfeff)
// Zero Width No-Break Space (BOM, ZWNBSP)
return true;
if (cp >= 0xe0100 and cp <= 0xe01ef)
// Variation Selectors
return true;
return false;
}
/// Official unicode reference: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
/// Tag legend:
/// - `W` (wide) -> true
/// - `F` (full-width) -> true
/// - `H` (half-width) -> false
/// - `N` (neutral) -> false
/// - `Na` (narrow) -> false
/// - `A` (ambiguous) -> false?
///
/// To regenerate the switch body list, run:
/// ```js
/// [...(await (await fetch("https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt")).text()).matchAll(/^([\dA-F]{4,})(?:\.\.([\dA-F]{4,}))?\s+;\s+(\w+)\s+#\s+(.*?)\s*$/gm)].flatMap(([,start, end, type, comment]) => (
/// (['W', 'F'].includes(type)) ? [` ${(end ? `0x${start}...0x${end}` : `0x${start}`)}, // ${''.padStart(17 - start.length - (end ? end.length + 5 : 0))}[${type}] ${comment}`] : []
/// )).join('\n')
/// ```
pub fn isFullWidthCodepointType(comptime T: type, cp: T) bool {
if (!(cp >= 0x1100)) {
return false;
}
return switch (cp) {
0x1100...0x115F, // [W] Lo [96] HANGUL CHOSEONG KIYEOK..HANGUL CHOSEONG FILLER
0x231A...0x231B, // [W] So [2] WATCH..HOURGLASS
0x2329, // [W] Ps LEFT-POINTING ANGLE BRACKET
0x232A, // [W] Pe RIGHT-POINTING ANGLE BRACKET
0x23E9...0x23EC, // [W] So [4] BLACK RIGHT-POINTING DOUBLE TRIANGLE..BLACK DOWN-POINTING DOUBLE TRIANGLE
0x23F0, // [W] So ALARM CLOCK
0x23F3, // [W] So HOURGLASS WITH FLOWING SAND
0x25FD...0x25FE, // [W] Sm [2] WHITE MEDIUM SMALL SQUARE..BLACK MEDIUM SMALL SQUARE
0x2614...0x2615, // [W] So [2] UMBRELLA WITH RAIN DROPS..HOT BEVERAGE
0x2648...0x2653, // [W] So [12] ARIES..PISCES
0x267F, // [W] So WHEELCHAIR SYMBOL
0x2693, // [W] So ANCHOR
0x26A1, // [W] So HIGH VOLTAGE SIGN
0x26AA...0x26AB, // [W] So [2] MEDIUM WHITE CIRCLE..MEDIUM BLACK CIRCLE
0x26BD...0x26BE, // [W] So [2] SOCCER BALL..BASEBALL
0x26C4...0x26C5, // [W] So [2] SNOWMAN WITHOUT SNOW..SUN BEHIND CLOUD
0x26CE, // [W] So OPHIUCHUS
0x26D4, // [W] So NO ENTRY
0x26EA, // [W] So CHURCH
0x26F2...0x26F3, // [W] So [2] FOUNTAIN..FLAG IN HOLE
0x26F5, // [W] So SAILBOAT
0x26FA, // [W] So TENT
0x26FD, // [W] So FUEL PUMP
0x2705, // [W] So WHITE HEAVY CHECK MARK
0x270A...0x270B, // [W] So [2] RAISED FIST..RAISED HAND
0x2728, // [W] So SPARKLES
0x274C, // [W] So CROSS MARK
0x274E, // [W] So NEGATIVE SQUARED CROSS MARK
0x2753...0x2755, // [W] So [3] BLACK QUESTION MARK ORNAMENT..WHITE EXCLAMATION MARK ORNAMENT
0x2757, // [W] So HEAVY EXCLAMATION MARK SYMBOL
0x2795...0x2797, // [W] So [3] HEAVY PLUS SIGN..HEAVY DIVISION SIGN
0x27B0, // [W] So CURLY LOOP
0x27BF, // [W] So DOUBLE CURLY LOOP
0x2B1B...0x2B1C, // [W] So [2] BLACK LARGE SQUARE..WHITE LARGE SQUARE
0x2B50, // [W] So WHITE MEDIUM STAR
0x2B55, // [W] So HEAVY LARGE CIRCLE
0x2E80...0x2E99, // [W] So [26] CJK RADICAL REPEAT..CJK RADICAL RAP
0x2E9B...0x2EF3, // [W] So [89] CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE
0x2F00...0x2FD5, // [W] So [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE
0x2FF0...0x2FFF, // [W] So [16] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION
0x3000, // [F] Zs IDEOGRAPHIC SPACE
0x3001...0x3003, // [W] Po [3] IDEOGRAPHIC COMMA..DITTO MARK
0x3004, // [W] So JAPANESE INDUSTRIAL STANDARD SYMBOL
0x3005, // [W] Lm IDEOGRAPHIC ITERATION MARK
0x3006, // [W] Lo IDEOGRAPHIC CLOSING MARK
0x3007, // [W] Nl IDEOGRAPHIC NUMBER ZERO
0x3008, // [W] Ps LEFT ANGLE BRACKET
0x3009, // [W] Pe RIGHT ANGLE BRACKET
0x300A, // [W] Ps LEFT DOUBLE ANGLE BRACKET
0x300B, // [W] Pe RIGHT DOUBLE ANGLE BRACKET
0x300C, // [W] Ps LEFT CORNER BRACKET
0x300D, // [W] Pe RIGHT CORNER BRACKET
0x300E, // [W] Ps LEFT WHITE CORNER BRACKET
0x300F, // [W] Pe RIGHT WHITE CORNER BRACKET
0x3010, // [W] Ps LEFT BLACK LENTICULAR BRACKET
0x3011, // [W] Pe RIGHT BLACK LENTICULAR BRACKET
0x3012...0x3013, // [W] So [2] POSTAL MARK..GETA MARK
0x3014, // [W] Ps LEFT TORTOISE SHELL BRACKET
0x3015, // [W] Pe RIGHT TORTOISE SHELL BRACKET
0x3016, // [W] Ps LEFT WHITE LENTICULAR BRACKET
0x3017, // [W] Pe RIGHT WHITE LENTICULAR BRACKET
0x3018, // [W] Ps LEFT WHITE TORTOISE SHELL BRACKET
0x3019, // [W] Pe RIGHT WHITE TORTOISE SHELL BRACKET
0x301A, // [W] Ps LEFT WHITE SQUARE BRACKET
0x301B, // [W] Pe RIGHT WHITE SQUARE BRACKET
0x301C, // [W] Pd WAVE DASH
0x301D, // [W] Ps REVERSED DOUBLE PRIME QUOTATION MARK
0x301E...0x301F, // [W] Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
0x3020, // [W] So POSTAL MARK FACE
0x3021...0x3029, // [W] Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE
0x302A...0x302D, // [W] Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
0x302E...0x302F, // [W] Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK
0x3030, // [W] Pd WAVY DASH
0x3031...0x3035, // [W] Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
0x3036...0x3037, // [W] So [2] CIRCLED POSTAL MARK..IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL
0x3038...0x303A, // [W] Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
0x303B, // [W] Lm VERTICAL IDEOGRAPHIC ITERATION MARK
0x303C, // [W] Lo MASU MARK
0x303D, // [W] Po PART ALTERNATION MARK
0x303E, // [W] So IDEOGRAPHIC VARIATION INDICATOR
0x3041...0x3096, // [W] Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE
0x3099...0x309A, // [W] Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
0x309B...0x309C, // [W] Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
0x309D...0x309E, // [W] Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK
0x309F, // [W] Lo HIRAGANA DIGRAPH YORI
0x30A0, // [W] Pd KATAKANA-HIRAGANA DOUBLE HYPHEN
0x30A1...0x30FA, // [W] Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO
0x30FB, // [W] Po KATAKANA MIDDLE DOT
0x30FC...0x30FE, // [W] Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK
0x30FF, // [W] Lo KATAKANA DIGRAPH KOTO
0x3105...0x312F, // [W] Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN
0x3131...0x318E, // [W] Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE
0x3190...0x3191, // [W] So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
0x3192...0x3195, // [W] No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
0x3196...0x319F, // [W] So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
0x31A0...0x31BF, // [W] Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH
0x31C0...0x31E3, // [W] So [36] CJK STROKE T..CJK STROKE Q
0x31EF, // [W] So IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION
0x31F0...0x31FF, // [W] Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
0x3200...0x321E, // [W] So [31] PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED KOREAN CHARACTER O HU
0x3220...0x3229, // [W] No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
0x322A...0x3247, // [W] So [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO
0x3250, // [W] So PARTNERSHIP SIGN
0x3251...0x325F, // [W] No [15] CIRCLED NUMBER TWENTY ONE..CIRCLED NUMBER THIRTY FIVE
0x3260...0x327F, // [W] So [32] CIRCLED HANGUL KIYEOK..KOREAN STANDARD SYMBOL
0x3280...0x3289, // [W] No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
0x328A...0x32B0, // [W] So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
0x32B1...0x32BF, // [W] No [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY
0x32C0...0x32FF, // [W] So [64] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..SQUARE ERA NAME REIWA
0x3300...0x33FF, // [W] So [256] SQUARE APAATO..SQUARE GAL
0x3400...0x4DBF, // [W] Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF
0x4E00...0x9FFF, // [W] Lo [20992] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFF
0xA000...0xA014, // [W] Lo [21] YI SYLLABLE IT..YI SYLLABLE E
0xA015, // [W] Lm YI SYLLABLE WU
0xA016...0xA48C, // [W] Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR
0xA490...0xA4C6, // [W] So [55] YI RADICAL QOT..YI RADICAL KE
0xA960...0xA97C, // [W] Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH
0xAC00...0xD7A3, // [W] Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
0xF900...0xFA6D, // [W] Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
0xFA6E...0xFA6F, // [W] Cn [2] <reserved-FA6E>..<reserved-FA6F>
0xFA70...0xFAD9, // [W] Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
0xFADA...0xFAFF, // [W] Cn [38] <reserved-FADA>..<reserved-FAFF>
0xFE10...0xFE16, // [W] Po [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK
0xFE17, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET
0xFE18, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET
0xFE19, // [W] Po PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
0xFE30, // [W] Po PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
0xFE31...0xFE32, // [W] Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH
0xFE33...0xFE34, // [W] Pc [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE
0xFE35, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
0xFE36, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS
0xFE37, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET
0xFE38, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET
0xFE39, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET
0xFE3A, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET
0xFE3B, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET
0xFE3C, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET
0xFE3D, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET
0xFE3E, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET
0xFE3F, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET
0xFE40, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET
0xFE41, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
0xFE42, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
0xFE43, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
0xFE44, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
0xFE45...0xFE46, // [W] Po [2] SESAME DOT..WHITE SESAME DOT
0xFE47, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET
0xFE48, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET
0xFE49...0xFE4C, // [W] Po [4] DASHED OVERLINE..DOUBLE WAVY OVERLINE
0xFE4D...0xFE4F, // [W] Pc [3] DASHED LOW LINE..WAVY LOW LINE
0xFE50...0xFE52, // [W] Po [3] SMALL COMMA..SMALL FULL STOP
0xFE54...0xFE57, // [W] Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK
0xFE58, // [W] Pd SMALL EM DASH
0xFE59, // [W] Ps SMALL LEFT PARENTHESIS
0xFE5A, // [W] Pe SMALL RIGHT PARENTHESIS
0xFE5B, // [W] Ps SMALL LEFT CURLY BRACKET
0xFE5C, // [W] Pe SMALL RIGHT CURLY BRACKET
0xFE5D, // [W] Ps SMALL LEFT TORTOISE SHELL BRACKET
0xFE5E, // [W] Pe SMALL RIGHT TORTOISE SHELL BRACKET
0xFE5F...0xFE61, // [W] Po [3] SMALL NUMBER SIGN..SMALL ASTERISK
0xFE62, // [W] Sm SMALL PLUS SIGN
0xFE63, // [W] Pd SMALL HYPHEN-MINUS
0xFE64...0xFE66, // [W] Sm [3] SMALL LESS-THAN SIGN..SMALL EQUALS SIGN
0xFE68, // [W] Po SMALL REVERSE SOLIDUS
0xFE69, // [W] Sc SMALL DOLLAR SIGN
0xFE6A...0xFE6B, // [W] Po [2] SMALL PERCENT SIGN..SMALL COMMERCIAL AT
0xFF01...0xFF03, // [F] Po [3] FULLWIDTH EXCLAMATION MARK..FULLWIDTH NUMBER SIGN
0xFF04, // [F] Sc FULLWIDTH DOLLAR SIGN
0xFF05...0xFF07, // [F] Po [3] FULLWIDTH PERCENT SIGN..FULLWIDTH APOSTROPHE
0xFF08, // [F] Ps FULLWIDTH LEFT PARENTHESIS
0xFF09, // [F] Pe FULLWIDTH RIGHT PARENTHESIS
0xFF0A, // [F] Po FULLWIDTH ASTERISK
0xFF0B, // [F] Sm FULLWIDTH PLUS SIGN
0xFF0C, // [F] Po FULLWIDTH COMMA
0xFF0D, // [F] Pd FULLWIDTH HYPHEN-MINUS
0xFF0E...0xFF0F, // [F] Po [2] FULLWIDTH FULL STOP..FULLWIDTH SOLIDUS
0xFF10...0xFF19, // [F] Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
0xFF1A...0xFF1B, // [F] Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON
0xFF1C...0xFF1E, // [F] Sm [3] FULLWIDTH LESS-THAN SIGN..FULLWIDTH GREATER-THAN SIGN
0xFF1F...0xFF20, // [F] Po [2] FULLWIDTH QUESTION MARK..FULLWIDTH COMMERCIAL AT
0xFF21...0xFF3A, // [F] Lu [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
0xFF3B, // [F] Ps FULLWIDTH LEFT SQUARE BRACKET
0xFF3C, // [F] Po FULLWIDTH REVERSE SOLIDUS
0xFF3D, // [F] Pe FULLWIDTH RIGHT SQUARE BRACKET
0xFF3E, // [F] Sk FULLWIDTH CIRCUMFLEX ACCENT
0xFF3F, // [F] Pc FULLWIDTH LOW LINE
0xFF40, // [F] Sk FULLWIDTH GRAVE ACCENT
0xFF41...0xFF5A, // [F] Ll [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
0xFF5B, // [F] Ps FULLWIDTH LEFT CURLY BRACKET
0xFF5C, // [F] Sm FULLWIDTH VERTICAL LINE
0xFF5D, // [F] Pe FULLWIDTH RIGHT CURLY BRACKET
0xFF5E, // [F] Sm FULLWIDTH TILDE
0xFF5F, // [F] Ps FULLWIDTH LEFT WHITE PARENTHESIS
0xFF60, // [F] Pe FULLWIDTH RIGHT WHITE PARENTHESIS
0xFFE0...0xFFE1, // [F] Sc [2] FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN
0xFFE2, // [F] Sm FULLWIDTH NOT SIGN
0xFFE3, // [F] Sk FULLWIDTH MACRON
0xFFE4, // [F] So FULLWIDTH BROKEN BAR
0xFFE5...0xFFE6, // [F] Sc [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN
0x16FE0...0x16FE1, // [W] Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK
0x16FE2, // [W] Po OLD CHINESE HOOK MARK
0x16FE3, // [W] Lm OLD CHINESE ITERATION MARK
0x16FE4, // [W] Mn KHITAN SMALL SCRIPT FILLER
0x16FF0...0x16FF1, // [W] Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY
0x17000...0x187F7, // [W] Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7
0x18800...0x18AFF, // [W] Lo [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768
0x18B00...0x18CD5, // [W] Lo [470] KHITAN SMALL SCRIPT CHARACTER-18B00..KHITAN SMALL SCRIPT CHARACTER-18CD5
0x18D00...0x18D08, // [W] Lo [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08
0x1AFF0...0x1AFF3, // [W] Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5
0x1AFF5...0x1AFFB, // [W] Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
0x1AFFD...0x1AFFE, // [W] Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
0x1B000...0x1B0FF, // [W] Lo [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2
0x1B100...0x1B122, // [W] Lo [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU
0x1B132, // [W] Lo HIRAGANA LETTER SMALL KO
0x1B150...0x1B152, // [W] Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
0x1B155, // [W] Lo KATAKANA LETTER SMALL KO
0x1B164...0x1B167, // [W] Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
0x1B170...0x1B2FB, // [W] Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
0x1F004, // [W] So MAHJONG TILE RED DRAGON
0x1F0CF, // [W] So PLAYING CARD BLACK JOKER
0x1F18E, // [W] So NEGATIVE SQUARED AB
0x1F191...0x1F19A, // [W] So [10] SQUARED CL..SQUARED VS
0x1F200...0x1F202, // [W] So [3] SQUARE HIRAGANA HOKA..SQUARED KATAKANA SA
0x1F210...0x1F23B, // [W] So [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D
0x1F240...0x1F248, // [W] So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557
0x1F250...0x1F251, // [W] So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
0x1F260...0x1F265, // [W] So [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI
0x1F300...0x1F320, // [W] So [33] CYCLONE..SHOOTING STAR
0x1F32D...0x1F335, // [W] So [9] HOT DOG..CACTUS
0x1F337...0x1F37C, // [W] So [70] TULIP..BABY BOTTLE
0x1F37E...0x1F393, // [W] So [22] BOTTLE WITH POPPING CORK..GRADUATION CAP
0x1F3A0...0x1F3CA, // [W] So [43] CAROUSEL HORSE..SWIMMER
0x1F3CF...0x1F3D3, // [W] So [5] CRICKET BAT AND BALL..TABLE TENNIS PADDLE AND BALL
0x1F3E0...0x1F3F0, // [W] So [17] HOUSE BUILDING..EUROPEAN CASTLE
0x1F3F4, // [W] So WAVING BLACK FLAG
0x1F3F8...0x1F3FA, // [W] So [3] BADMINTON RACQUET AND SHUTTLECOCK..AMPHORA
0x1F3FB...0x1F3FF, // [W] Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
0x1F400...0x1F43E, // [W] So [63] RAT..PAW PRINTS
0x1F440, // [W] So EYES
0x1F442...0x1F4FC, // [W] So [187] EAR..VIDEOCASSETTE
0x1F4FF...0x1F53D, // [W] So [63] PRAYER BEADS..DOWN-POINTING SMALL RED TRIANGLE
0x1F54B...0x1F54E, // [W] So [4] KAABA..MENORAH WITH NINE BRANCHES
0x1F550...0x1F567, // [W] So [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY
0x1F57A, // [W] So MAN DANCING
0x1F595...0x1F596, // [W] So [2] REVERSED HAND WITH MIDDLE FINGER EXTENDED..RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS
0x1F5A4, // [W] So BLACK HEART
0x1F5FB...0x1F5FF, // [W] So [5] MOUNT FUJI..MOYAI
0x1F600...0x1F64F, // [W] So [80] GRINNING FACE..PERSON WITH FOLDED HANDS
0x1F680...0x1F6C5, // [W] So [70] ROCKET..LEFT LUGGAGE
0x1F6CC, // [W] So SLEEPING ACCOMMODATION
0x1F6D0...0x1F6D2, // [W] So [3] PLACE OF WORSHIP..SHOPPING TROLLEY
0x1F6D5...0x1F6D7, // [W] So [3] HINDU TEMPLE..ELEVATOR
0x1F6DC...0x1F6DF, // [W] So [4] WIRELESS..RING BUOY
0x1F6EB...0x1F6EC, // [W] So [2] AIRPLANE DEPARTURE..AIRPLANE ARRIVING
0x1F6F4...0x1F6FC, // [W] So [9] SCOOTER..ROLLER SKATE
0x1F7E0...0x1F7EB, // [W] So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
0x1F7F0, // [W] So HEAVY EQUALS SIGN
0x1F90C...0x1F93A, // [W] So [47] PINCHED FINGERS..FENCER
0x1F93C...0x1F945, // [W] So [10] WRESTLERS..GOAL NET
0x1F947...0x1F9FF, // [W] So [185] FIRST PLACE MEDAL..NAZAR AMULET
0x1FA70...0x1FA7C, // [W] So [13] BALLET SHOES..CRUTCH
0x1FA80...0x1FA88, // [W] So [9] YO-YO..FLUTE
0x1FA90...0x1FABD, // [W] So [46] RINGED PLANET..WING
0x1FABF...0x1FAC5, // [W] So [7] GOOSE..PERSON WITH CROWN
0x1FACE...0x1FADB, // [W] So [14] MOOSE..PEA POD
0x1FAE0...0x1FAE8, // [W] So [9] MELTING FACE..SHAKING FACE
0x1FAF0...0x1FAF8, // [W] So [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND
0x20000...0x2A6DF, // [W] Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
0x2A6E0...0x2A6FF, // [W] Cn [32] <reserved-2A6E0>..<reserved-2A6FF>
0x2A700...0x2B739, // [W] Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
0x2B73A...0x2B73F, // [W] Cn [6] <reserved-2B73A>..<reserved-2B73F>
0x2B740...0x2B81D, // [W] Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
0x2B81E...0x2B81F, // [W] Cn [2] <reserved-2B81E>..<reserved-2B81F>
0x2B820...0x2CEA1, // [W] Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
0x2CEA2...0x2CEAF, // [W] Cn [14] <reserved-2CEA2>..<reserved-2CEAF>
0x2CEB0...0x2EBE0, // [W] Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
0x2EBE1...0x2EBEF, // [W] Cn [15] <reserved-2EBE1>..<reserved-2EBEF>
0x2EBF0...0x2EE5D, // [W] Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D
0x2EE5E...0x2F7FF, // [W] Cn [2466] <reserved-2EE5E>..<reserved-2F7FF>
0x2F800...0x2FA1D, // [W] Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
0x2FA1E...0x2FA1F, // [W] Cn [2] <reserved-2FA1E>..<reserved-2FA1F>
0x2FA20...0x2FFFD, // [W] Cn [1502] <reserved-2FA20>..<reserved-2FFFD>
0x30000...0x3134A, // [W] Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
0x3134B...0x3134F, // [W] Cn [5] <reserved-3134B>..<reserved-3134F>
0x31350...0x323AF, // [W] Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
0x323B0...0x3FFFD, // [W] Cn [56398] <reserved-323B0>..<reserved-3FFFD>
=> true,
else => false,
};
}
pub fn isAmgiguousCodepointType(comptime T: type, cp: T) bool {
return switch (cp) {
0xA1,
0xA4,
0xA7,
0xA8,
0xAA,
0xAD,
0xAE,
0xB0...0xB4,
0xB6...0xBA,
0xBC...0xBF,
0xC6,
0xD0,
0xD7,
0xD8,
0xDE...0xE1,
0xE6,
0xE8...0xEA,
0xEC,
0xED,
0xF0,
0xF2,
0xF3,
0xF7...0xFA,
0xFC,
0xFE,
0x101,
0x111,
0x113,
0x11B,
0x126,
0x127,
0x12B,
0x131...0x133,
0x138,
0x13F...0x142,
0x144,
0x148...0x14B,
0x14D,
0x152,
0x153,
0x166,
0x167,
0x16B,
0x1CE,
0x1D0,
0x1D2,
0x1D4,
0x1D6,
0x1D8,
0x1DA,
0x1DC,
0x251,
0x261,
0x2C4,
0x2C7,
0x2C9...0x2CB,
0x2CD,
0x2D0,
0x2D8...0x2DB,
0x2DD,
0x2DF,
0x300...0x36F,
0x391...0x3A1,
0x3A3...0x3A9,
0x3B1...0x3C1,
0x3C3...0x3C9,
0x401,
0x410...0x44F,
0x451,
0x2010,
0x2013...0x2016,
0x2018,
0x2019,
0x201C,
0x201D,
0x2020...0x2022,
0x2024...0x2027,
0x2030,
0x2032,
0x2033,
0x2035,
0x203B,
0x203E,
0x2074,
0x207F,
0x2081...0x2084,
0x20AC,
0x2103,
0x2105,
0x2109,
0x2113,
0x2116,
0x2121,
0x2122,
0x2126,
0x212B,
0x2153,
0x2154,
0x215B...0x215E,
0x2160...0x216B,
0x2170...0x2179,
0x2189,
0x2190...0x2199,
0x21B8,
0x21B9,
0x21D2,
0x21D4,
0x21E7,
0x2200,
0x2202,
0x2203,
0x2207,
0x2208,
0x220B,
0x220F,
0x2211,
0x2215,
0x221A,
0x221D...0x2220,
0x2223,
0x2225,
0x2227...0x222C,
0x222E,
0x2234...0x2237,
0x223C,
0x223D,
0x2248,
0x224C,
0x2252,
0x2260,
0x2261,
0x2264...0x2267,
0x226A,
0x226B,
0x226E,
0x226F,
0x2282,
0x2283,
0x2286,
0x2287,
0x2295,
0x2299,
0x22A5,
0x22BF,
0x2312,
0x2460...0x24E9,
0x24EB...0x254B,
0x2550...0x2573,
0x2580...0x258F,
0x2592...0x2595,
0x25A0,
0x25A1,
0x25A3...0x25A9,
0x25B2,
0x25B3,
0x25B6,
0x25B7,
0x25BC,
0x25BD,
0x25C0,
0x25C1,
0x25C6...0x25C8,
0x25CB,
0x25CE...0x25D1,
0x25E2...0x25E5,
0x25EF,
0x2605,
0x2606,
0x2609,
0x260E,
0x260F,
0x261C,
0x261E,
0x2640,
0x2642,
0x2660,
0x2661,
0x2663...0x2665,
0x2667...0x266A,
0x266C,
0x266D,
0x266F,
0x269E,
0x269F,
0x26BF,
0x26C6...0x26CD,
0x26CF...0x26D3,
0x26D5...0x26E1,
0x26E3,
0x26E8,
0x26E9,
0x26EB...0x26F1,
0x26F4,
0x26F6...0x26F9,
0x26FB,
0x26FC,
0x26FE,
0x26FF,
0x273D,
0x2776...0x277F,
0x2B56...0x2B59,
0x3248...0x324F,
0xE000...0xF8FF,
0xFE00...0xFE0F,
0xFFFD,
0x1F100...0x1F10A,
0x1F110...0x1F12D,
0x1F130...0x1F169,
0x1F170...0x1F18D,
0x1F18F,
0x1F190,
0x1F19B...0x1F1AC,
0xE0100...0xE01EF,
0xF0000...0xFFFFD,
0x100000...0x10FFFD,
=> true,
else => false,
};
}
pub fn visibleCodepointWidth(cp: u32, ambiguousAsWide: bool) u3_fast {
return visibleCodepointWidthType(u32, cp, ambiguousAsWide);
}
pub fn visibleCodepointWidthMaybeEmoji(cp: u32, maybe_emoji: bool, ambiguousAsWide: bool) u3_fast {
// UCHAR_EMOJI=57,
if (maybe_emoji and icu_hasBinaryProperty(cp, 57)) {
return 2;
}
return visibleCodepointWidth(cp, ambiguousAsWide);
}
pub fn visibleCodepointWidthType(comptime T: type, cp: T, ambiguousAsWide: bool) u3_fast {
if (isZeroWidthCodepointType(T, cp)) {
return 0;
}
if (isFullWidthCodepointType(T, cp)) {
return 2;
}
if (ambiguousAsWide and isAmgiguousCodepointType(T, cp)) {
return 2;
}
return 1;
}
pub const visible = struct {
// Ref: https://cs.stanford.edu/people/miles/iso8859.html
fn visibleLatin1Width(input_: []const u8) usize {
var length: usize = 0;
var input = input_;
const input_end_ptr = input.ptr + input.len - (input.len % 16);
var input_ptr = input.ptr;
while (input_ptr != input_end_ptr) {
const input_chunk: [16]u8 = input_ptr[0..16].*;
const sums: @Vector(16, u8) = [16]u8{
visibleLatin1WidthScalar(input_chunk[0]),
visibleLatin1WidthScalar(input_chunk[1]),
visibleLatin1WidthScalar(input_chunk[2]),
visibleLatin1WidthScalar(input_chunk[3]),
visibleLatin1WidthScalar(input_chunk[4]),
visibleLatin1WidthScalar(input_chunk[5]),
visibleLatin1WidthScalar(input_chunk[6]),
visibleLatin1WidthScalar(input_chunk[7]),
visibleLatin1WidthScalar(input_chunk[8]),
visibleLatin1WidthScalar(input_chunk[9]),
visibleLatin1WidthScalar(input_chunk[10]),
visibleLatin1WidthScalar(input_chunk[11]),
visibleLatin1WidthScalar(input_chunk[12]),
visibleLatin1WidthScalar(input_chunk[13]),
visibleLatin1WidthScalar(input_chunk[14]),
visibleLatin1WidthScalar(input_chunk[15]),
};
length += @reduce(.Add, sums);
input_ptr += 16;
}
input.len %= 16;
input.ptr = input_ptr;
for (input) |byte| length += visibleLatin1WidthScalar(byte);
return length;
}
fn visibleLatin1WidthScalar(c: u8) u1 {
return if ((c >= 127 and c <= 159) or c < 32) 0 else 1;
}
fn visibleLatin1WidthExcludeANSIColors(input_: anytype) usize {
var length: usize = 0;
var input = input_;
const ElementType = std.meta.Child(@TypeOf(input_));
const indexFn = if (comptime ElementType == u8) strings.indexOfCharUsize else strings.indexOfChar16Usize;
while (indexFn(input, '\x1b')) |i| {
length += visibleLatin1Width(input[0..i]);
input = input[i..];
if (input.len < 3) return length;
if (input[1] == '[') {
const end = indexFn(input[2..], 'm') orelse return length;
input = input[end + 3 ..];
} else {
input = input[1..];
}
}
length += visibleLatin1Width(input);
return length;
}
fn visibleUTF8WidthFn(input: []const u8, comptime asciiFn: anytype) usize {
var bytes = input;
var len: usize = 0;
while (bun.strings.firstNonASCII(bytes)) |i| {
len += asciiFn(bytes[0..i]);
const this_chunk = bytes[i..];
const byte = this_chunk[0];
const skip = bun.strings.wtf8ByteSequenceLengthWithInvalid(byte);
const cp_bytes: [4]u8 = switch (@min(@as(usize, skip), this_chunk.len)) {
inline 1, 2, 3, 4 => |cp_len| .{
byte,
if (comptime cp_len > 1) this_chunk[1] else 0,
if (comptime cp_len > 2) this_chunk[2] else 0,
if (comptime cp_len > 3) this_chunk[3] else 0,
},
else => unreachable,
};
const cp = decodeWTF8RuneTMultibyte(&cp_bytes, skip, u32, unicode_replacement);
len += visibleCodepointWidth(cp, false);
bytes = bytes[@min(i + skip, bytes.len)..];
}
len += asciiFn(bytes);
return len;
}
fn visibleUTF16WidthFn(input_: []const u16, exclude_ansi_colors: bool, ambiguousAsWide: bool) usize {
var input = input_;
var len: usize = 0;
var prev: ?u21 = 0;
var break_state = grapheme.BreakState{};
var break_start: u21 = 0;
var saw_1b = false;
var saw_bracket = false;
var stretch_len: usize = 0;
while (true) {
{
const idx = firstNonASCII16([]const u16, input) orelse input.len;
for (0..idx) |j| {
const cp = input[j];
defer prev = cp;
if (saw_bracket) {
if (cp == 'm') {
saw_1b = false;
saw_bracket = false;
stretch_len = 0;
continue;
}
stretch_len += visibleCodepointWidth(cp, ambiguousAsWide);
continue;
}
if (saw_1b) {
if (cp == '[') {
saw_bracket = true;
stretch_len = 0;
continue;
}
len += visibleCodepointWidth(cp, ambiguousAsWide);
continue;
}
if (!exclude_ansi_colors or cp != 0x1b) {
if (prev) |prev_| {
const should_break = grapheme.graphemeBreak(prev_, cp, &break_state);
if (should_break) {
len += visibleCodepointWidthMaybeEmoji(break_start, cp == 0xFE0F, ambiguousAsWide);
break_start = cp;
} else {
//
}
} else {
len += visibleCodepointWidth(cp, ambiguousAsWide);
break_start = cp;
}
continue;
}
saw_1b = true;
continue;
}
len += stretch_len;
input = input[idx..];
}
if (input.len == 0) break;
const replacement = utf16CodepointWithFFFD([]const u16, input);
defer input = input[replacement.len..];
if (replacement.fail) continue;
const cp: u21 = @intCast(replacement.code_point);
defer prev = cp;
if (prev) |prev_| {
const should_break = grapheme.graphemeBreak(prev_, cp, &break_state);
if (should_break) {
len += visibleCodepointWidthMaybeEmoji(break_start, cp == 0xFE0F, ambiguousAsWide);
break_start = cp;
}
} else {
len += visibleCodepointWidth(cp, ambiguousAsWide);
break_start = cp;
}
}
if (break_start > 0) {
len += visibleCodepointWidthMaybeEmoji(break_start, (prev orelse 0) == 0xFE0F, ambiguousAsWide);
}
return len;
}
fn visibleLatin1WidthFn(input: []const u8) usize {
return visibleLatin1Width(input);
}
pub const width = struct {
pub fn latin1(input: []const u8) usize {
return visibleLatin1Width(input);
}
pub fn utf8(input: []const u8) usize {
return visibleUTF8WidthFn(input, visibleLatin1Width);
}
pub fn utf16(input: []const u16, ambiguousAsWide: bool) usize {
return visibleUTF16WidthFn(input, false, ambiguousAsWide);
}
pub const exclude_ansi_colors = struct {
pub fn latin1(input: []const u8) usize {
return visibleLatin1WidthExcludeANSIColors(input);
}
pub fn utf8(input: []const u8) usize {
return visibleUTF8WidthFn(input, visibleLatin1WidthExcludeANSIColors);
}
pub fn utf16(input: []const u16, ambiguousAsWide: bool) usize {
return visibleUTF16WidthFn(input, true, ambiguousAsWide);
}
};
};
};
// extern "C" bool icu_hasBinaryProperty(UChar32 cp, unsigned int prop)
extern fn icu_hasBinaryProperty(c: u32, which: c_uint) bool;
const bun = @import("bun");
const std = @import("std");
const u3_fast = strings.u3_fast;
const decodeWTF8RuneTMultibyte = strings.decodeWTF8RuneTMultibyte;
const grapheme = strings.grapheme;
const strings = bun.strings;
const unicode_replacement = strings.unicode_replacement;
const firstNonASCII16 = strings.firstNonASCII16;
const firstNonASCII = strings.firstNonASCII;
const utf16CodepointWithFFFD = strings.utf16CodepointWithFFFD;
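
Finally, a hypothetical sketch of the new `visible.width` namespace in use (not part of this commit; the exact re-export path through `bun.strings` is an assumption):

```zig
const bun = @import("bun");
const strings = bun.strings;

// Hypothetical helper: how many terminal columns a label occupies, ignoring
// ANSI color escape sequences so that colored output doesn't skew alignment.
fn labelColumns(label: []const u8) usize {
    return strings.visible.width.exclude_ansi_colors.utf8(label);
}

// Plain widths (escape sequences counted) for UTF-8 and UTF-16 inputs,
// treating East Asian "ambiguous" codepoints as narrow.
fn rawColumns(utf8_label: []const u8, utf16_label: []const u16) usize {
    return strings.visible.width.utf8(utf8_label) +
        strings.visible.width.utf16(utf16_label, false);
}
```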

File diff suppressed because it is too large.