mirror of
https://github.com/oven-sh/bun
synced 2026-02-09 10:28:47 +00:00
Split up string_immutable into more files (#20446)
Co-authored-by: Jarred-Sumner <709451+Jarred-Sumner@users.noreply.github.com>
This commit is contained in:
@@ -673,12 +673,16 @@ src/StaticHashMap.zig
|
||||
src/string_immutable.zig
|
||||
src/string_types.zig
|
||||
src/string.zig
|
||||
src/string/escapeHTML.zig
|
||||
src/string/HashedString.zig
|
||||
src/string/MutableString.zig
|
||||
src/string/paths.zig
|
||||
src/string/PathString.zig
|
||||
src/string/SmolStr.zig
|
||||
src/string/StringBuilder.zig
|
||||
src/string/StringJoiner.zig
|
||||
src/string/unicode.zig
|
||||
src/string/visible.zig
|
||||
src/string/WTFStringImpl.zig
|
||||
src/sync.zig
|
||||
src/sys_uv.zig
|
||||
|
||||
640
src/string/escapeHTML.zig
Normal file
640
src/string/escapeHTML.zig
Normal file
@@ -0,0 +1,640 @@
|
||||
pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) !Escaped(u8) {
|
||||
const Scalar = struct {
|
||||
pub const lengths: [std.math.maxInt(u8) + 1]u4 = brk: {
|
||||
var values: [std.math.maxInt(u8) + 1]u4 = undefined;
|
||||
for (values, 0..) |_, i| {
|
||||
switch (i) {
|
||||
'"' => {
|
||||
values[i] = """.len;
|
||||
},
|
||||
'&' => {
|
||||
values[i] = "&".len;
|
||||
},
|
||||
'\'' => {
|
||||
values[i] = "'".len;
|
||||
},
|
||||
'<' => {
|
||||
values[i] = "<".len;
|
||||
},
|
||||
'>' => {
|
||||
values[i] = ">".len;
|
||||
},
|
||||
else => {
|
||||
values[i] = 1;
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
break :brk values;
|
||||
};
|
||||
|
||||
fn appendString(buf: [*]u8, comptime str: []const u8) callconv(bun.callconv_inline) usize {
|
||||
buf[0..str.len].* = str[0..str.len].*;
|
||||
return str.len;
|
||||
}
|
||||
|
||||
pub fn append(buf: [*]u8, char: u8) callconv(bun.callconv_inline) usize {
|
||||
if (lengths[char] == 1) {
|
||||
buf[0] = char;
|
||||
return 1;
|
||||
}
|
||||
|
||||
return switch (char) {
|
||||
'"' => appendString(buf, """),
|
||||
'&' => appendString(buf, "&"),
|
||||
'\'' => appendString(buf, "'"),
|
||||
'<' => appendString(buf, "<"),
|
||||
'>' => appendString(buf, ">"),
|
||||
else => unreachable,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn push(comptime len: anytype, chars_: *const [len]u8, allo: std.mem.Allocator) callconv(bun.callconv_inline) Escaped(u8) {
|
||||
const chars = chars_.*;
|
||||
var total: usize = 0;
|
||||
|
||||
comptime var remain_to_comp = len;
|
||||
comptime var comp_i = 0;
|
||||
|
||||
inline while (remain_to_comp > 0) : (remain_to_comp -= 1) {
|
||||
total += lengths[chars[comp_i]];
|
||||
comp_i += 1;
|
||||
}
|
||||
|
||||
if (total == len) {
|
||||
return .{ .original = {} };
|
||||
}
|
||||
|
||||
const output = allo.alloc(u8, total) catch unreachable;
|
||||
var head = output.ptr;
|
||||
inline for (comptime bun.range(0, len)) |i| {
|
||||
head += @This().append(head, chars[i]);
|
||||
}
|
||||
|
||||
return Escaped(u8){ .allocated = output };
|
||||
}
|
||||
};
|
||||
@setEvalBranchQuota(5000);
|
||||
switch (latin1.len) {
|
||||
0 => return Escaped(u8){ .static = "" },
|
||||
1 => return switch (latin1[0]) {
|
||||
'"' => Escaped(u8){ .static = """ },
|
||||
'&' => Escaped(u8){ .static = "&" },
|
||||
'\'' => Escaped(u8){ .static = "'" },
|
||||
'<' => Escaped(u8){ .static = "<" },
|
||||
'>' => Escaped(u8){ .static = ">" },
|
||||
else => Escaped(u8){ .original = {} },
|
||||
},
|
||||
2 => {
|
||||
const first: []const u8 = switch (latin1[0]) {
|
||||
'"' => """,
|
||||
'&' => "&",
|
||||
'\'' => "'",
|
||||
'<' => "<",
|
||||
'>' => ">",
|
||||
else => latin1[0..1],
|
||||
};
|
||||
const second: []const u8 = switch (latin1[1]) {
|
||||
'"' => """,
|
||||
'&' => "&",
|
||||
'\'' => "'",
|
||||
'<' => "<",
|
||||
'>' => ">",
|
||||
else => latin1[1..2],
|
||||
};
|
||||
if (first.len == 1 and second.len == 1) {
|
||||
return Escaped(u8){ .original = {} };
|
||||
}
|
||||
|
||||
return Escaped(u8){ .allocated = strings.append(allocator, first, second) catch unreachable };
|
||||
},
|
||||
|
||||
// The simd implementation is slower for inputs less than 32 bytes.
|
||||
3 => return Scalar.push(3, latin1[0..3], allocator),
|
||||
4 => return Scalar.push(4, latin1[0..4], allocator),
|
||||
5 => return Scalar.push(5, latin1[0..5], allocator),
|
||||
6 => return Scalar.push(6, latin1[0..6], allocator),
|
||||
7 => return Scalar.push(7, latin1[0..7], allocator),
|
||||
8 => return Scalar.push(8, latin1[0..8], allocator),
|
||||
9 => return Scalar.push(9, latin1[0..9], allocator),
|
||||
10 => return Scalar.push(10, latin1[0..10], allocator),
|
||||
11 => return Scalar.push(11, latin1[0..11], allocator),
|
||||
12 => return Scalar.push(12, latin1[0..12], allocator),
|
||||
13 => return Scalar.push(13, latin1[0..13], allocator),
|
||||
14 => return Scalar.push(14, latin1[0..14], allocator),
|
||||
15 => return Scalar.push(15, latin1[0..15], allocator),
|
||||
16 => return Scalar.push(16, latin1[0..16], allocator),
|
||||
17 => return Scalar.push(17, latin1[0..17], allocator),
|
||||
18 => return Scalar.push(18, latin1[0..18], allocator),
|
||||
19 => return Scalar.push(19, latin1[0..19], allocator),
|
||||
20 => return Scalar.push(20, latin1[0..20], allocator),
|
||||
21 => return Scalar.push(21, latin1[0..21], allocator),
|
||||
22 => return Scalar.push(22, latin1[0..22], allocator),
|
||||
23 => return Scalar.push(23, latin1[0..23], allocator),
|
||||
24 => return Scalar.push(24, latin1[0..24], allocator),
|
||||
25 => return Scalar.push(25, latin1[0..25], allocator),
|
||||
26 => return Scalar.push(26, latin1[0..26], allocator),
|
||||
27 => return Scalar.push(27, latin1[0..27], allocator),
|
||||
28 => return Scalar.push(28, latin1[0..28], allocator),
|
||||
29 => return Scalar.push(29, latin1[0..29], allocator),
|
||||
30 => return Scalar.push(30, latin1[0..30], allocator),
|
||||
31 => return Scalar.push(31, latin1[0..31], allocator),
|
||||
32 => return Scalar.push(32, latin1[0..32], allocator),
|
||||
|
||||
else => {
|
||||
var remaining = latin1;
|
||||
|
||||
const vec_chars = "\"&'<>";
|
||||
const vecs: [vec_chars.len]AsciiVector = comptime brk: {
|
||||
var _vecs: [vec_chars.len]AsciiVector = undefined;
|
||||
for (vec_chars, 0..) |c, i| {
|
||||
_vecs[i] = @splat(c);
|
||||
}
|
||||
break :brk _vecs;
|
||||
};
|
||||
|
||||
var any_needs_escape = false;
|
||||
var buf: std.ArrayList(u8) = std.ArrayList(u8){
|
||||
.items = &.{},
|
||||
.capacity = 0,
|
||||
.allocator = allocator,
|
||||
};
|
||||
|
||||
if (comptime Environment.enableSIMD) {
|
||||
// pass #1: scan for any characters that need escaping
|
||||
// assume most strings won't need any escaping, so don't actually allocate the buffer
|
||||
scan_and_allocate_lazily: while (remaining.len >= ascii_vector_size) {
|
||||
if (comptime Environment.allow_assert) assert(!any_needs_escape);
|
||||
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
|
||||
if (@reduce(.Max, @as(AsciiVectorU1, @bitCast((vec == vecs[0]))) |
|
||||
@as(AsciiVectorU1, @bitCast((vec == vecs[1]))) |
|
||||
@as(AsciiVectorU1, @bitCast((vec == vecs[2]))) |
|
||||
@as(AsciiVectorU1, @bitCast((vec == vecs[3]))) |
|
||||
@as(AsciiVectorU1, @bitCast((vec == vecs[4])))) == 1)
|
||||
{
|
||||
if (comptime Environment.allow_assert) assert(buf.capacity == 0);
|
||||
|
||||
buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
|
||||
const copy_len = @intFromPtr(remaining.ptr) - @intFromPtr(latin1.ptr);
|
||||
buf.appendSliceAssumeCapacity(latin1[0..copy_len]);
|
||||
any_needs_escape = true;
|
||||
inline for (0..ascii_vector_size) |i| {
|
||||
switch (vec[i]) {
|
||||
'"' => {
|
||||
buf.ensureUnusedCapacity((ascii_vector_size - i) + """.len) catch unreachable;
|
||||
buf.items.ptr[buf.items.len .. buf.items.len + """.len][0..""".len].* = """.*;
|
||||
buf.items.len += """.len;
|
||||
},
|
||||
'&' => {
|
||||
buf.ensureUnusedCapacity((ascii_vector_size - i) + "&".len) catch unreachable;
|
||||
buf.items.ptr[buf.items.len .. buf.items.len + "&".len][0.."&".len].* = "&".*;
|
||||
buf.items.len += "&".len;
|
||||
},
|
||||
'\'' => {
|
||||
buf.ensureUnusedCapacity((ascii_vector_size - i) + "'".len) catch unreachable;
|
||||
buf.items.ptr[buf.items.len .. buf.items.len + "'".len][0.."'".len].* = "'".*;
|
||||
buf.items.len += "'".len;
|
||||
},
|
||||
'<' => {
|
||||
buf.ensureUnusedCapacity((ascii_vector_size - i) + "<".len) catch unreachable;
|
||||
buf.items.ptr[buf.items.len .. buf.items.len + "<".len][0.."<".len].* = "<".*;
|
||||
buf.items.len += "<".len;
|
||||
},
|
||||
'>' => {
|
||||
buf.ensureUnusedCapacity((ascii_vector_size - i) + ">".len) catch unreachable;
|
||||
buf.items.ptr[buf.items.len .. buf.items.len + ">".len][0..">".len].* = ">".*;
|
||||
buf.items.len += ">".len;
|
||||
},
|
||||
else => |c| {
|
||||
buf.appendAssumeCapacity(c);
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
remaining = remaining[ascii_vector_size..];
|
||||
break :scan_and_allocate_lazily;
|
||||
}
|
||||
|
||||
remaining = remaining[ascii_vector_size..];
|
||||
}
|
||||
}
|
||||
|
||||
if (any_needs_escape) {
|
||||
// pass #2: we found something that needed an escape
|
||||
// so we'll go ahead and copy the buffer into a new buffer
|
||||
while (remaining.len >= ascii_vector_size) {
|
||||
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
|
||||
if (@reduce(.Max, @as(AsciiVectorU1, @bitCast((vec == vecs[0]))) |
|
||||
@as(AsciiVectorU1, @bitCast((vec == vecs[1]))) |
|
||||
@as(AsciiVectorU1, @bitCast((vec == vecs[2]))) |
|
||||
@as(AsciiVectorU1, @bitCast((vec == vecs[3]))) |
|
||||
@as(AsciiVectorU1, @bitCast((vec == vecs[4])))) == 1)
|
||||
{
|
||||
buf.ensureUnusedCapacity(ascii_vector_size + 6) catch unreachable;
|
||||
inline for (0..ascii_vector_size) |i| {
|
||||
switch (vec[i]) {
|
||||
'"' => {
|
||||
buf.ensureUnusedCapacity((ascii_vector_size - i) + """.len) catch unreachable;
|
||||
buf.items.ptr[buf.items.len .. buf.items.len + """.len][0..""".len].* = """.*;
|
||||
buf.items.len += """.len;
|
||||
},
|
||||
'&' => {
|
||||
buf.ensureUnusedCapacity((ascii_vector_size - i) + "&".len) catch unreachable;
|
||||
buf.items.ptr[buf.items.len .. buf.items.len + "&".len][0.."&".len].* = "&".*;
|
||||
buf.items.len += "&".len;
|
||||
},
|
||||
'\'' => {
|
||||
buf.ensureUnusedCapacity((ascii_vector_size - i) + "'".len) catch unreachable;
|
||||
buf.items.ptr[buf.items.len .. buf.items.len + "'".len][0.."'".len].* = "'".*;
|
||||
buf.items.len += "'".len;
|
||||
},
|
||||
'<' => {
|
||||
buf.ensureUnusedCapacity((ascii_vector_size - i) + "<".len) catch unreachable;
|
||||
buf.items.ptr[buf.items.len .. buf.items.len + "<".len][0.."<".len].* = "<".*;
|
||||
buf.items.len += "<".len;
|
||||
},
|
||||
'>' => {
|
||||
buf.ensureUnusedCapacity((ascii_vector_size - i) + ">".len) catch unreachable;
|
||||
buf.items.ptr[buf.items.len .. buf.items.len + ">".len][0..">".len].* = ">".*;
|
||||
buf.items.len += ">".len;
|
||||
},
|
||||
else => |c| {
|
||||
buf.appendAssumeCapacity(c);
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
remaining = remaining[ascii_vector_size..];
|
||||
continue;
|
||||
}
|
||||
|
||||
try buf.ensureUnusedCapacity(ascii_vector_size);
|
||||
buf.items.ptr[buf.items.len .. buf.items.len + ascii_vector_size][0..ascii_vector_size].* = remaining[0..ascii_vector_size].*;
|
||||
buf.items.len += ascii_vector_size;
|
||||
remaining = remaining[ascii_vector_size..];
|
||||
}
|
||||
}
|
||||
|
||||
var ptr = remaining.ptr;
|
||||
const end = remaining.ptr + remaining.len;
|
||||
|
||||
if (!any_needs_escape) {
|
||||
scan_and_allocate_lazily: while (ptr != end) : (ptr += 1) {
|
||||
switch (ptr[0]) {
|
||||
'"', '&', '\'', '<', '>' => |c| {
|
||||
if (comptime Environment.allow_assert) assert(buf.capacity == 0);
|
||||
|
||||
buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + @as(usize, Scalar.lengths[c]));
|
||||
const copy_len = @intFromPtr(ptr) - @intFromPtr(latin1.ptr);
|
||||
if (comptime Environment.allow_assert) assert(copy_len <= buf.capacity);
|
||||
buf.items.len = copy_len;
|
||||
@memcpy(buf.items[0..copy_len], latin1[0..copy_len]);
|
||||
any_needs_escape = true;
|
||||
break :scan_and_allocate_lazily;
|
||||
},
|
||||
else => {},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while (ptr != end) : (ptr += 1) {
|
||||
switch (ptr[0]) {
|
||||
'"' => {
|
||||
buf.appendSlice(""") catch unreachable;
|
||||
},
|
||||
'&' => {
|
||||
buf.appendSlice("&") catch unreachable;
|
||||
},
|
||||
'\'' => {
|
||||
buf.appendSlice("'") catch unreachable; // modified from escape-html; used to be '''
|
||||
},
|
||||
'<' => {
|
||||
buf.appendSlice("<") catch unreachable;
|
||||
},
|
||||
'>' => {
|
||||
buf.appendSlice(">") catch unreachable;
|
||||
},
|
||||
else => |c| {
|
||||
buf.append(c) catch unreachable;
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
if (!any_needs_escape) {
|
||||
if (comptime Environment.allow_assert) assert(buf.capacity == 0);
|
||||
return Escaped(u8){ .original = {} };
|
||||
}
|
||||
|
||||
return Escaped(u8){ .allocated = try buf.toOwnedSlice() };
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn Escaped(comptime T: type) type {
|
||||
return union(enum) {
|
||||
static: []const u8,
|
||||
original: void,
|
||||
allocated: []T,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn escapeHTMLForUTF16Input(allocator: std.mem.Allocator, utf16: []const u16) !Escaped(u16) {
|
||||
const Scalar = struct {
|
||||
pub const lengths: [std.math.maxInt(u8) + 1]u4 = brk: {
|
||||
var values: [std.math.maxInt(u8) + 1]u4 = undefined;
|
||||
for (values, 0..) |_, i| {
|
||||
values[i] = switch (i) {
|
||||
'"' => """.len,
|
||||
'&' => "&".len,
|
||||
'\'' => "'".len,
|
||||
'<' => "<".len,
|
||||
'>' => ">".len,
|
||||
else => 1,
|
||||
};
|
||||
}
|
||||
|
||||
break :brk values;
|
||||
};
|
||||
};
|
||||
switch (utf16.len) {
|
||||
0 => return Escaped(u16){ .static = &[_]u8{} },
|
||||
1 => {
|
||||
switch (utf16[0]) {
|
||||
'"' => return Escaped(u16){ .static = """ },
|
||||
'&' => return Escaped(u16){ .static = "&" },
|
||||
'\'' => return Escaped(u16){ .static = "'" },
|
||||
'<' => return Escaped(u16){ .static = "<" },
|
||||
'>' => return Escaped(u16){ .static = ">" },
|
||||
else => return Escaped(u16){ .original = {} },
|
||||
}
|
||||
},
|
||||
2 => {
|
||||
const first_16 = switch (utf16[0]) {
|
||||
'"' => toUTF16Literal("""),
|
||||
'&' => toUTF16Literal("&"),
|
||||
'\'' => toUTF16Literal("'"),
|
||||
'<' => toUTF16Literal("<"),
|
||||
'>' => toUTF16Literal(">"),
|
||||
else => @as([]const u16, utf16[0..1]),
|
||||
};
|
||||
|
||||
const second_16 = switch (utf16[1]) {
|
||||
'"' => toUTF16Literal("""),
|
||||
'&' => toUTF16Literal("&"),
|
||||
'\'' => toUTF16Literal("'"),
|
||||
'<' => toUTF16Literal("<"),
|
||||
'>' => toUTF16Literal(">"),
|
||||
else => @as([]const u16, utf16[1..2]),
|
||||
};
|
||||
|
||||
if (first_16.ptr == utf16.ptr and second_16.ptr == utf16.ptr + 1) {
|
||||
return Escaped(u16){ .original = {} };
|
||||
}
|
||||
|
||||
var buf = allocator.alloc(u16, first_16.len + second_16.len) catch unreachable;
|
||||
bun.copy(u16, buf, first_16);
|
||||
bun.copy(u16, buf[first_16.len..], second_16);
|
||||
return Escaped(u16){ .allocated = buf };
|
||||
},
|
||||
|
||||
else => {
|
||||
var remaining = utf16;
|
||||
|
||||
var any_needs_escape = false;
|
||||
var buf: std.ArrayList(u16) = undefined;
|
||||
|
||||
if (comptime Environment.enableSIMD) {
|
||||
const vec_chars = "\"&'<>";
|
||||
const vecs: [vec_chars.len]AsciiU16Vector = brk: {
|
||||
var _vecs: [vec_chars.len]AsciiU16Vector = undefined;
|
||||
for (vec_chars, 0..) |c, i| {
|
||||
_vecs[i] = @splat(@as(u16, c));
|
||||
}
|
||||
break :brk _vecs;
|
||||
};
|
||||
// pass #1: scan for any characters that need escaping
|
||||
// assume most strings won't need any escaping, so don't actually allocate the buffer
|
||||
scan_and_allocate_lazily: while (remaining.len >= ascii_u16_vector_size) {
|
||||
if (comptime Environment.allow_assert) assert(!any_needs_escape);
|
||||
const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*;
|
||||
if (@reduce(.Max, @as(AsciiVectorU16U1, @bitCast(vec > @as(AsciiU16Vector, @splat(@as(u16, 127))))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec == vecs[0]))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec == vecs[1]))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec == vecs[2]))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec == vecs[3]))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec == vecs[4])))) == 1)
|
||||
{
|
||||
var i: u16 = 0;
|
||||
lazy: {
|
||||
while (i < ascii_u16_vector_size) {
|
||||
switch (remaining[i]) {
|
||||
'"', '&', '\'', '<', '>' => {
|
||||
any_needs_escape = true;
|
||||
break :lazy;
|
||||
},
|
||||
128...std.math.maxInt(u16) => {
|
||||
const cp = utf16Codepoint([]const u16, remaining[i..]);
|
||||
i += @as(u16, cp.len);
|
||||
},
|
||||
else => {
|
||||
i += 1;
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!any_needs_escape) {
|
||||
remaining = remaining[i..];
|
||||
continue :scan_and_allocate_lazily;
|
||||
}
|
||||
|
||||
if (comptime Environment.allow_assert) assert(@intFromPtr(remaining.ptr + i) >= @intFromPtr(utf16.ptr));
|
||||
const to_copy = std.mem.sliceAsBytes(utf16)[0 .. @intFromPtr(remaining.ptr + i) - @intFromPtr(utf16.ptr)];
|
||||
const to_copy_16 = std.mem.bytesAsSlice(u16, to_copy);
|
||||
buf = try std.ArrayList(u16).initCapacity(allocator, utf16.len + 6);
|
||||
try buf.appendSlice(to_copy_16);
|
||||
|
||||
while (i < ascii_u16_vector_size) {
|
||||
switch (remaining[i]) {
|
||||
'"', '&', '\'', '<', '>' => |c| {
|
||||
const result = switch (c) {
|
||||
'"' => toUTF16Literal("""),
|
||||
'&' => toUTF16Literal("&"),
|
||||
'\'' => toUTF16Literal("'"),
|
||||
'<' => toUTF16Literal("<"),
|
||||
'>' => toUTF16Literal(">"),
|
||||
else => unreachable,
|
||||
};
|
||||
|
||||
buf.appendSlice(result) catch unreachable;
|
||||
i += 1;
|
||||
},
|
||||
128...std.math.maxInt(u16) => {
|
||||
const cp = utf16Codepoint([]const u16, remaining[i..]);
|
||||
|
||||
buf.appendSlice(remaining[i..][0..@as(usize, cp.len)]) catch unreachable;
|
||||
i += @as(u16, cp.len);
|
||||
},
|
||||
else => |c| {
|
||||
i += 1;
|
||||
buf.append(c) catch unreachable;
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// edgecase: code point width could exceed asdcii_u16_vector_size
|
||||
remaining = remaining[i..];
|
||||
break :scan_and_allocate_lazily;
|
||||
}
|
||||
|
||||
remaining = remaining[ascii_u16_vector_size..];
|
||||
}
|
||||
|
||||
if (any_needs_escape) {
|
||||
// pass #2: we found something that needed an escape
|
||||
// but there's still some more text to
|
||||
// so we'll go ahead and copy the buffer into a new buffer
|
||||
while (remaining.len >= ascii_u16_vector_size) {
|
||||
const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*;
|
||||
if (@reduce(.Max, @as(AsciiVectorU16U1, @bitCast(vec > @as(AsciiU16Vector, @splat(@as(u16, 127))))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec == vecs[0]))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec == vecs[1]))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec == vecs[2]))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec == vecs[3]))) |
|
||||
@as(AsciiVectorU16U1, @bitCast((vec == vecs[4])))) == 1)
|
||||
{
|
||||
buf.ensureUnusedCapacity(ascii_u16_vector_size) catch unreachable;
|
||||
var i: u16 = 0;
|
||||
while (i < ascii_u16_vector_size) {
|
||||
switch (remaining[i]) {
|
||||
'"' => {
|
||||
buf.appendSlice(toUTF16Literal(""")) catch unreachable;
|
||||
i += 1;
|
||||
},
|
||||
'&' => {
|
||||
buf.appendSlice(toUTF16Literal("&")) catch unreachable;
|
||||
i += 1;
|
||||
},
|
||||
'\'' => {
|
||||
buf.appendSlice(toUTF16Literal("'")) catch unreachable; // modified from escape-html; used to be '''
|
||||
i += 1;
|
||||
},
|
||||
'<' => {
|
||||
buf.appendSlice(toUTF16Literal("<")) catch unreachable;
|
||||
i += 1;
|
||||
},
|
||||
'>' => {
|
||||
buf.appendSlice(toUTF16Literal(">")) catch unreachable;
|
||||
i += 1;
|
||||
},
|
||||
128...std.math.maxInt(u16) => {
|
||||
const cp = utf16Codepoint([]const u16, remaining[i..]);
|
||||
|
||||
buf.appendSlice(remaining[i..][0..@as(usize, cp.len)]) catch unreachable;
|
||||
i += @as(u16, cp.len);
|
||||
},
|
||||
else => |c| {
|
||||
buf.append(c) catch unreachable;
|
||||
i += 1;
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
remaining = remaining[i..];
|
||||
continue;
|
||||
}
|
||||
|
||||
try buf.ensureUnusedCapacity(ascii_u16_vector_size);
|
||||
buf.items.ptr[buf.items.len .. buf.items.len + ascii_u16_vector_size][0..ascii_u16_vector_size].* = remaining[0..ascii_u16_vector_size].*;
|
||||
buf.items.len += ascii_u16_vector_size;
|
||||
remaining = remaining[ascii_u16_vector_size..];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var ptr = remaining.ptr;
|
||||
const end = remaining.ptr + remaining.len;
|
||||
|
||||
if (!any_needs_escape) {
|
||||
scan_and_allocate_lazily: while (ptr != end) {
|
||||
switch (ptr[0]) {
|
||||
'"', '&', '\'', '<', '>' => |c| {
|
||||
buf = try std.ArrayList(u16).initCapacity(allocator, utf16.len + @as(usize, Scalar.lengths[c]));
|
||||
if (comptime Environment.allow_assert) assert(@intFromPtr(ptr) >= @intFromPtr(utf16.ptr));
|
||||
|
||||
const to_copy = std.mem.sliceAsBytes(utf16)[0 .. @intFromPtr(ptr) - @intFromPtr(utf16.ptr)];
|
||||
const to_copy_16 = std.mem.bytesAsSlice(u16, to_copy);
|
||||
try buf.appendSlice(to_copy_16);
|
||||
any_needs_escape = true;
|
||||
break :scan_and_allocate_lazily;
|
||||
},
|
||||
128...std.math.maxInt(u16) => {
|
||||
const cp = utf16Codepoint([]const u16, ptr[0..if (ptr + 1 == end) 1 else 2]);
|
||||
|
||||
ptr += @as(u16, cp.len);
|
||||
},
|
||||
else => {
|
||||
ptr += 1;
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while (ptr != end) {
|
||||
switch (ptr[0]) {
|
||||
'"' => {
|
||||
buf.appendSlice(toUTF16Literal(""")) catch unreachable;
|
||||
ptr += 1;
|
||||
},
|
||||
'&' => {
|
||||
buf.appendSlice(toUTF16Literal("&")) catch unreachable;
|
||||
ptr += 1;
|
||||
},
|
||||
'\'' => {
|
||||
buf.appendSlice(toUTF16Literal("'")) catch unreachable; // modified from escape-html; used to be '''
|
||||
ptr += 1;
|
||||
},
|
||||
'<' => {
|
||||
buf.appendSlice(toUTF16Literal("<")) catch unreachable;
|
||||
ptr += 1;
|
||||
},
|
||||
'>' => {
|
||||
buf.appendSlice(toUTF16Literal(">")) catch unreachable;
|
||||
ptr += 1;
|
||||
},
|
||||
128...std.math.maxInt(u16) => {
|
||||
const cp = utf16Codepoint([]const u16, ptr[0..if (ptr + 1 == end) 1 else 2]);
|
||||
|
||||
buf.appendSlice(ptr[0..@as(usize, cp.len)]) catch unreachable;
|
||||
ptr += @as(u16, cp.len);
|
||||
},
|
||||
|
||||
else => |c| {
|
||||
buf.append(c) catch unreachable;
|
||||
ptr += 1;
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
if (!any_needs_escape) {
|
||||
return Escaped(u16){ .original = {} };
|
||||
}
|
||||
|
||||
return Escaped(u16){ .allocated = try buf.toOwnedSlice() };
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
const std = @import("std");
|
||||
const bun = @import("bun");
|
||||
const Environment = bun.Environment;
|
||||
const assert = bun.assert;
|
||||
const ascii_u16_vector_size = strings.ascii_u16_vector_size;
|
||||
const AsciiU16Vector = strings.AsciiU16Vector;
|
||||
const utf16Codepoint = strings.utf16Codepoint;
|
||||
const toUTF16Literal = strings.toUTF16Literal;
|
||||
const strings = bun.strings;
|
||||
const AsciiVectorU16U1 = strings.AsciiVectorU16U1;
|
||||
const AsciiVector = strings.AsciiVector;
|
||||
const ascii_vector_size = strings.ascii_vector_size;
|
||||
const AsciiVectorU1 = strings.AsciiVectorU1;
|
||||
461
src/string/paths.zig
Normal file
461
src/string/paths.zig
Normal file
@@ -0,0 +1,461 @@
|
||||
/// Checks if a path is missing a windows drive letter. For windows APIs,
|
||||
/// this is used for an assertion, and PosixToWinNormalizer can help make
|
||||
/// an absolute path contain a drive letter.
|
||||
pub fn isWindowsAbsolutePathMissingDriveLetter(comptime T: type, chars: []const T) bool {
|
||||
bun.unsafeAssert(bun.path.Platform.windows.isAbsoluteT(T, chars));
|
||||
bun.unsafeAssert(chars.len > 0);
|
||||
|
||||
// 'C:\hello' -> false
|
||||
// This is the most common situation, so we check it first
|
||||
if (!(chars[0] == '/' or chars[0] == '\\')) {
|
||||
bun.unsafeAssert(chars.len > 2);
|
||||
bun.unsafeAssert(chars[1] == ':');
|
||||
return false;
|
||||
}
|
||||
|
||||
if (chars.len > 4) {
|
||||
// '\??\hello' -> false (has the NT object prefix)
|
||||
if (chars[1] == '?' and
|
||||
chars[2] == '?' and
|
||||
(chars[3] == '/' or chars[3] == '\\'))
|
||||
return false;
|
||||
// '\\?\hello' -> false (has the other NT object prefix)
|
||||
// '\\.\hello' -> false (has the NT device prefix)
|
||||
if ((chars[1] == '/' or chars[1] == '\\') and
|
||||
(chars[2] == '?' or chars[2] == '.') and
|
||||
(chars[3] == '/' or chars[3] == '\\'))
|
||||
return false;
|
||||
}
|
||||
|
||||
// A path starting with `/` can be a UNC path with forward slashes,
|
||||
// or actually just a posix path.
|
||||
//
|
||||
// '\\Server\Share' -> false (unc)
|
||||
// '\\Server\\Share' -> true (not unc because extra slashes)
|
||||
// '\Server\Share' -> true (posix path)
|
||||
return bun.path.windowsFilesystemRootT(T, chars).len == 1;
|
||||
}
|
||||
|
||||
pub fn fromWPath(buf: []u8, utf16: []const u16) [:0]const u8 {
|
||||
bun.unsafeAssert(buf.len > 0);
|
||||
const to_copy = trimPrefixComptime(u16, utf16, bun.windows.long_path_prefix);
|
||||
const encode_into_result = copyUTF16IntoUTF8(buf[0 .. buf.len - 1], []const u16, to_copy, false);
|
||||
bun.unsafeAssert(encode_into_result.written < buf.len);
|
||||
buf[encode_into_result.written] = 0;
|
||||
return buf[0..encode_into_result.written :0];
|
||||
}
|
||||
|
||||
pub fn withoutNTPrefix(comptime T: type, path: []const T) []const T {
|
||||
if (comptime !Environment.isWindows) return path;
|
||||
const cmp = if (T == u8)
|
||||
hasPrefixComptime
|
||||
else
|
||||
hasPrefixComptimeUTF16;
|
||||
if (cmp(path, &bun.windows.nt_object_prefix_u8)) {
|
||||
return path[bun.windows.nt_object_prefix.len..];
|
||||
}
|
||||
if (cmp(path, &bun.windows.long_path_prefix_u8)) {
|
||||
return path[bun.windows.long_path_prefix.len..];
|
||||
}
|
||||
if (cmp(path, &bun.windows.nt_unc_object_prefix_u8)) {
|
||||
return path[bun.windows.nt_unc_object_prefix.len..];
|
||||
}
|
||||
return path;
|
||||
}
|
||||
|
||||
pub fn toNTPath(wbuf: []u16, utf8: []const u8) [:0]u16 {
|
||||
if (!std.fs.path.isAbsoluteWindows(utf8)) {
|
||||
return toWPathNormalized(wbuf, utf8);
|
||||
}
|
||||
|
||||
if (strings.hasPrefixComptime(utf8, &bun.windows.nt_object_prefix_u8) or
|
||||
strings.hasPrefixComptime(utf8, &bun.windows.nt_unc_object_prefix_u8))
|
||||
{
|
||||
return wbuf[0..toWPathNormalized(wbuf, utf8).len :0];
|
||||
}
|
||||
|
||||
// UNC absolute path, replace leading '\\' with '\??\UNC\'
|
||||
if (strings.hasPrefixComptime(utf8, "\\\\")) {
|
||||
if (strings.hasPrefixComptime(utf8[2..], bun.windows.long_path_prefix_u8[2..])) {
|
||||
const prefix = bun.windows.nt_object_prefix;
|
||||
wbuf[0..prefix.len].* = prefix;
|
||||
return wbuf[0 .. toWPathNormalized(wbuf[prefix.len..], utf8[4..]).len + prefix.len :0];
|
||||
}
|
||||
const prefix = bun.windows.nt_unc_object_prefix;
|
||||
wbuf[0..prefix.len].* = prefix;
|
||||
return wbuf[0 .. toWPathNormalized(wbuf[prefix.len..], utf8[2..]).len + prefix.len :0];
|
||||
}
|
||||
|
||||
const prefix = bun.windows.nt_object_prefix;
|
||||
wbuf[0..prefix.len].* = prefix;
|
||||
return wbuf[0 .. toWPathNormalized(wbuf[prefix.len..], utf8).len + prefix.len :0];
|
||||
}
|
||||
|
||||
pub fn toNTPath16(wbuf: []u16, path: []const u16) [:0]u16 {
|
||||
if (!std.fs.path.isAbsoluteWindowsWTF16(path)) {
|
||||
return toWPathNormalized16(wbuf, path);
|
||||
}
|
||||
|
||||
if (strings.hasPrefixComptimeUTF16(path, &bun.windows.nt_object_prefix_u8) or
|
||||
strings.hasPrefixComptimeUTF16(path, &bun.windows.nt_unc_object_prefix_u8))
|
||||
{
|
||||
return wbuf[0..toWPathNormalized16(wbuf, path).len :0];
|
||||
}
|
||||
|
||||
if (strings.hasPrefixComptimeUTF16(path, "\\\\")) {
|
||||
if (strings.hasPrefixComptimeUTF16(path[2..], bun.windows.long_path_prefix_u8[2..])) {
|
||||
const prefix = bun.windows.nt_object_prefix;
|
||||
wbuf[0..prefix.len].* = prefix;
|
||||
return wbuf[0 .. toWPathNormalized16(wbuf[prefix.len..], path[4..]).len + prefix.len :0];
|
||||
}
|
||||
const prefix = bun.windows.nt_unc_object_prefix;
|
||||
wbuf[0..prefix.len].* = prefix;
|
||||
return wbuf[0 .. toWPathNormalized16(wbuf[prefix.len..], path[2..]).len + prefix.len :0];
|
||||
}
|
||||
|
||||
const prefix = bun.windows.nt_object_prefix;
|
||||
wbuf[0..prefix.len].* = prefix;
|
||||
return wbuf[0 .. toWPathNormalized16(wbuf[prefix.len..], path).len + prefix.len :0];
|
||||
}
|
||||
|
||||
pub fn toNTMaxPath(buf: []u8, utf8: []const u8) [:0]const u8 {
|
||||
if (!std.fs.path.isAbsoluteWindows(utf8) or utf8.len <= 260) {
|
||||
@memcpy(buf[0..utf8.len], utf8);
|
||||
buf[utf8.len] = 0;
|
||||
return buf[0..utf8.len :0];
|
||||
}
|
||||
|
||||
const prefix = bun.windows.nt_maxpath_prefix_u8;
|
||||
buf[0..prefix.len].* = prefix;
|
||||
return buf[0 .. toPathNormalized(buf[prefix.len..], utf8).len + prefix.len :0];
|
||||
}
|
||||
|
||||
pub fn addNTPathPrefix(wbuf: []u16, utf16: []const u16) [:0]u16 {
|
||||
wbuf[0..bun.windows.nt_object_prefix.len].* = bun.windows.nt_object_prefix;
|
||||
@memcpy(wbuf[bun.windows.nt_object_prefix.len..][0..utf16.len], utf16);
|
||||
wbuf[utf16.len + bun.windows.nt_object_prefix.len] = 0;
|
||||
return wbuf[0 .. utf16.len + bun.windows.nt_object_prefix.len :0];
|
||||
}
|
||||
|
||||
pub fn addNTPathPrefixIfNeeded(wbuf: []u16, utf16: []const u16) [:0]u16 {
|
||||
if (hasPrefixComptimeType(u16, utf16, bun.windows.nt_object_prefix)) {
|
||||
@memcpy(wbuf[0..utf16.len], utf16);
|
||||
wbuf[utf16.len] = 0;
|
||||
return wbuf[0..utf16.len :0];
|
||||
}
|
||||
if (hasPrefixComptimeType(u16, utf16, bun.windows.long_path_prefix)) {
|
||||
// Replace prefix
|
||||
return addNTPathPrefix(wbuf, utf16[bun.windows.long_path_prefix.len..]);
|
||||
}
|
||||
return addNTPathPrefix(wbuf, utf16);
|
||||
}
|
||||
|
||||
// These are the same because they don't have rules like needing a trailing slash
|
||||
pub const toNTDir = toNTPath;
|
||||
|
||||
pub fn toExtendedPathNormalized(wbuf: []u16, utf8: []const u8) [:0]const u16 {
|
||||
bun.unsafeAssert(wbuf.len > 4);
|
||||
wbuf[0..4].* = bun.windows.long_path_prefix;
|
||||
return wbuf[0 .. toWPathNormalized(wbuf[4..], utf8).len + 4 :0];
|
||||
}
|
||||
|
||||
pub fn toWPathNormalizeAutoExtend(wbuf: []u16, utf8: []const u8) [:0]const u16 {
|
||||
if (std.fs.path.isAbsoluteWindows(utf8)) {
|
||||
return toExtendedPathNormalized(wbuf, utf8);
|
||||
}
|
||||
|
||||
return toWPathNormalized(wbuf, utf8);
|
||||
}
|
||||
|
||||
pub fn toWPathNormalized(wbuf: []u16, utf8: []const u8) [:0]u16 {
|
||||
const renormalized = bun.PathBufferPool.get();
|
||||
defer bun.PathBufferPool.put(renormalized);
|
||||
|
||||
var path_to_use = normalizeSlashesOnly(renormalized, utf8, '\\');
|
||||
|
||||
// is there a trailing slash? Let's remove it before converting to UTF-16
|
||||
if (path_to_use.len > 3 and bun.path.isSepAny(path_to_use[path_to_use.len - 1])) {
|
||||
path_to_use = path_to_use[0 .. path_to_use.len - 1];
|
||||
}
|
||||
|
||||
return toWPath(wbuf, path_to_use);
|
||||
}
|
||||
|
||||
pub fn toWPathNormalized16(wbuf: []u16, path: []const u16) [:0]u16 {
|
||||
var path_to_use = normalizeSlashesOnlyT(u16, wbuf, path, '\\', true);
|
||||
|
||||
// is there a trailing slash? Let's remove it before converting to UTF-16
|
||||
if (path_to_use.len > 3 and bun.path.isSepAnyT(u16, path_to_use[path_to_use.len - 1])) {
|
||||
path_to_use = path_to_use[0 .. path_to_use.len - 1];
|
||||
}
|
||||
|
||||
wbuf[path_to_use.len] = 0;
|
||||
|
||||
return wbuf[0..path_to_use.len :0];
|
||||
}
|
||||
|
||||
pub fn toPathNormalized(buf: []u8, utf8: []const u8) [:0]const u8 {
|
||||
const renormalized = bun.PathBufferPool.get();
|
||||
defer bun.PathBufferPool.put(renormalized);
|
||||
|
||||
var path_to_use = normalizeSlashesOnly(renormalized, utf8, '\\');
|
||||
|
||||
// is there a trailing slash? Let's remove it before converting to UTF-16
|
||||
if (path_to_use.len > 3 and bun.path.isSepAny(path_to_use[path_to_use.len - 1])) {
|
||||
path_to_use = path_to_use[0 .. path_to_use.len - 1];
|
||||
}
|
||||
|
||||
return toPath(buf, path_to_use);
|
||||
}
|
||||
|
||||
pub fn normalizeSlashesOnlyT(comptime T: type, buf: []T, path: []const T, comptime desired_slash: u8, comptime always_copy: bool) []const T {
|
||||
comptime bun.unsafeAssert(desired_slash == '/' or desired_slash == '\\');
|
||||
const undesired_slash = if (desired_slash == '/') '\\' else '/';
|
||||
|
||||
if (bun.strings.containsCharT(T, path, undesired_slash)) {
|
||||
@memcpy(buf[0..path.len], path);
|
||||
for (buf[0..path.len]) |*c| {
|
||||
if (c.* == undesired_slash) {
|
||||
c.* = desired_slash;
|
||||
}
|
||||
}
|
||||
return buf[0..path.len];
|
||||
}
|
||||
|
||||
if (comptime always_copy) {
|
||||
@memcpy(buf[0..path.len], path);
|
||||
return buf[0..path.len];
|
||||
}
|
||||
return path;
|
||||
}
|
||||
|
||||
pub fn normalizeSlashesOnly(buf: []u8, utf8: []const u8, comptime desired_slash: u8) []const u8 {
|
||||
return normalizeSlashesOnlyT(u8, buf, utf8, desired_slash, false);
|
||||
}
|
||||
|
||||
pub fn toWDirNormalized(wbuf: []u16, utf8: []const u8) [:0]const u16 {
|
||||
var renormalized: ?*bun.PathBuffer = null;
|
||||
defer if (renormalized) |r| bun.PathBufferPool.put(r);
|
||||
|
||||
var path_to_use = utf8;
|
||||
|
||||
if (bun.strings.containsChar(utf8, '/')) {
|
||||
renormalized = bun.PathBufferPool.get();
|
||||
@memcpy(renormalized.?[0..utf8.len], utf8);
|
||||
for (renormalized.?[0..utf8.len]) |*c| {
|
||||
if (c.* == '/') {
|
||||
c.* = '\\';
|
||||
}
|
||||
}
|
||||
path_to_use = renormalized.?[0..utf8.len];
|
||||
}
|
||||
|
||||
return toWDirPath(wbuf, path_to_use);
|
||||
}
|
||||
|
||||
pub fn toWPath(wbuf: []u16, utf8: []const u8) [:0]u16 {
|
||||
return toWPathMaybeDir(wbuf, utf8, false);
|
||||
}
|
||||
|
||||
pub fn toPath(buf: []u8, utf8: []const u8) [:0]u8 {
|
||||
return toPathMaybeDir(buf, utf8, false);
|
||||
}
|
||||
|
||||
pub fn toWDirPath(wbuf: []u16, utf8: []const u8) [:0]const u16 {
|
||||
return toWPathMaybeDir(wbuf, utf8, true);
|
||||
}
|
||||
|
||||
pub fn toKernel32Path(wbuf: []u16, utf8: []const u8) [:0]u16 {
|
||||
const path = if (hasPrefixComptime(utf8, bun.windows.nt_object_prefix_u8))
|
||||
utf8[bun.windows.nt_object_prefix_u8.len..]
|
||||
else
|
||||
utf8;
|
||||
if (hasPrefixComptime(path, bun.windows.long_path_prefix_u8)) {
|
||||
return toWPath(wbuf, path);
|
||||
}
|
||||
if (utf8.len > 2 and bun.path.isDriveLetter(utf8[0]) and utf8[1] == ':' and bun.path.isSepAny(utf8[2])) {
|
||||
wbuf[0..4].* = bun.windows.long_path_prefix;
|
||||
const wpath = toWPath(wbuf[4..], path);
|
||||
return wbuf[0 .. wpath.len + 4 :0];
|
||||
}
|
||||
return toWPath(wbuf, path);
|
||||
}
|
||||
|
||||
fn isUNCPath(comptime T: type, path: []const T) bool {
|
||||
return path.len >= 3 and
|
||||
bun.path.Platform.windows.isSeparatorT(T, path[0]) and
|
||||
bun.path.Platform.windows.isSeparatorT(T, path[1]) and
|
||||
!bun.path.Platform.windows.isSeparatorT(T, path[2]) and
|
||||
path[2] != '.';
|
||||
}
|
||||
pub fn assertIsValidWindowsPath(comptime T: type, path: []const T) void {
|
||||
if (Environment.allow_assert and Environment.isWindows) {
|
||||
if (bun.path.Platform.windows.isAbsoluteT(T, path) and
|
||||
isWindowsAbsolutePathMissingDriveLetter(T, path) and
|
||||
// is it a null device path? that's not an error. it's just a weird file path.
|
||||
!eqlComptimeT(T, path, "\\\\.\\NUL") and !eqlComptimeT(T, path, "\\\\.\\nul") and !eqlComptimeT(T, path, "\\nul") and !eqlComptimeT(T, path, "\\NUL") and !isUNCPath(T, path))
|
||||
{
|
||||
std.debug.panic("Internal Error: Do not pass posix paths to Windows APIs, was given '{s}'" ++ if (Environment.isDebug) " (missing a root like 'C:\\', see PosixToWinNormalizer for why this is an assertion)" else ". Please open an issue on GitHub with a reproduction.", .{
|
||||
if (T == u8) path else bun.fmt.utf16(path),
|
||||
});
|
||||
}
|
||||
if (hasPrefixComptimeType(T, path, ":/") and Environment.isDebug) {
|
||||
std.debug.panic("Path passed to windows API '{s}' is almost certainly invalid. Where did the drive letter go?", .{
|
||||
if (T == u8) path else bun.fmt.utf16(path),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn toWPathMaybeDir(wbuf: []u16, utf8: []const u8, comptime add_trailing_lash: bool) [:0]u16 {
|
||||
bun.unsafeAssert(wbuf.len > 0);
|
||||
|
||||
var result = bun.simdutf.convert.utf8.to.utf16.with_errors.le(
|
||||
utf8,
|
||||
wbuf[0..wbuf.len -| (1 + @as(usize, @intFromBool(add_trailing_lash)))],
|
||||
);
|
||||
|
||||
// Many Windows APIs expect normalized path slashes, particularly when the
|
||||
// long path prefix is added or the nt object prefix. To make this easier,
|
||||
// but a little redundant, this function always normalizes the slashes here.
|
||||
//
|
||||
// An example of this is GetFileAttributesW(L"C:\\hello/world.txt") being OK
|
||||
// but GetFileAttributesW(L"\\\\?\\C:\\hello/world.txt") is NOT
|
||||
bun.path.dangerouslyConvertPathToWindowsInPlace(u16, wbuf[0..result.count]);
|
||||
|
||||
if (add_trailing_lash and result.count > 0 and wbuf[result.count - 1] != '\\') {
|
||||
wbuf[result.count] = '\\';
|
||||
result.count += 1;
|
||||
}
|
||||
|
||||
wbuf[result.count] = 0;
|
||||
|
||||
return wbuf[0..result.count :0];
|
||||
}
|
||||
pub fn toPathMaybeDir(buf: []u8, utf8: []const u8, comptime add_trailing_lash: bool) [:0]u8 {
|
||||
bun.unsafeAssert(buf.len > 0);
|
||||
|
||||
var len = utf8.len;
|
||||
@memcpy(buf[0..len], utf8[0..len]);
|
||||
|
||||
if (add_trailing_lash and len > 0 and buf[len - 1] != '\\') {
|
||||
buf[len] = '\\';
|
||||
len += 1;
|
||||
}
|
||||
buf[len] = 0;
|
||||
return buf[0..len :0];
|
||||
}
|
||||
|
||||
pub fn cloneNormalizingSeparators(
|
||||
allocator: std.mem.Allocator,
|
||||
input: []const u8,
|
||||
) ![]u8 {
|
||||
// remove duplicate slashes in the file path
|
||||
const base = withoutTrailingSlash(input);
|
||||
var tokenized = std.mem.tokenizeScalar(u8, base, std.fs.path.sep);
|
||||
var buf = try allocator.alloc(u8, base.len + 2);
|
||||
if (comptime Environment.allow_assert) assert(base.len > 0);
|
||||
if (base[0] == std.fs.path.sep) {
|
||||
buf[0] = std.fs.path.sep;
|
||||
}
|
||||
var remain = buf[@as(usize, @intFromBool(base[0] == std.fs.path.sep))..];
|
||||
|
||||
while (tokenized.next()) |token| {
|
||||
if (token.len == 0) continue;
|
||||
bun.copy(u8, remain, token);
|
||||
remain[token.len..][0] = std.fs.path.sep;
|
||||
remain = remain[token.len + 1 ..];
|
||||
}
|
||||
if ((remain.ptr - 1) != buf.ptr and (remain.ptr - 1)[0] != std.fs.path.sep) {
|
||||
remain[0] = std.fs.path.sep;
|
||||
remain = remain[1..];
|
||||
}
|
||||
remain[0] = 0;
|
||||
|
||||
return buf[0 .. @intFromPtr(remain.ptr) - @intFromPtr(buf.ptr)];
|
||||
}
|
||||
|
||||
pub fn pathContainsNodeModulesFolder(path: []const u8) bool {
|
||||
return strings.contains(path, comptime std.fs.path.sep_str ++ "node_modules" ++ std.fs.path.sep_str);
|
||||
}
|
||||
|
||||
pub fn charIsAnySlash(char: u8) callconv(bun.callconv_inline) bool {
|
||||
return char == '/' or char == '\\';
|
||||
}
|
||||
|
||||
pub fn startsWithWindowsDriveLetter(s: []const u8) callconv(bun.callconv_inline) bool {
|
||||
return startsWithWindowsDriveLetterT(u8, s);
|
||||
}
|
||||
|
||||
pub fn startsWithWindowsDriveLetterT(comptime T: type, s: []const T) callconv(bun.callconv_inline) bool {
|
||||
return s.len > 2 and s[1] == ':' and switch (s[0]) {
|
||||
'a'...'z', 'A'...'Z' => true,
|
||||
else => false,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn withoutTrailingSlash(this: string) []const u8 {
|
||||
var href = this;
|
||||
while (href.len > 1 and (switch (href[href.len - 1]) {
|
||||
'/', '\\' => true,
|
||||
else => false,
|
||||
})) {
|
||||
href.len -= 1;
|
||||
}
|
||||
|
||||
return href;
|
||||
}
|
||||
|
||||
/// Does not strip the device root (C:\ or \\Server\Share\ portion off of the path)
|
||||
pub fn withoutTrailingSlashWindowsPath(input: string) []const u8 {
|
||||
if (Environment.isPosix or input.len < 3 or input[1] != ':')
|
||||
return withoutTrailingSlash(input);
|
||||
|
||||
const root_len = bun.path.windowsFilesystemRoot(input).len + 1;
|
||||
|
||||
var path = input;
|
||||
while (path.len > root_len and (switch (path[path.len - 1]) {
|
||||
'/', '\\' => true,
|
||||
else => false,
|
||||
})) {
|
||||
path.len -= 1;
|
||||
}
|
||||
|
||||
if (Environment.isDebug)
|
||||
bun.debugAssert(!std.fs.path.isAbsolute(path) or
|
||||
!isWindowsAbsolutePathMissingDriveLetter(u8, path));
|
||||
|
||||
return path;
|
||||
}
|
||||
|
||||
pub fn withoutLeadingSlash(this: string) []const u8 {
|
||||
return std.mem.trimLeft(u8, this, "/");
|
||||
}
|
||||
|
||||
pub fn withoutLeadingPathSeparator(this: string) []const u8 {
|
||||
return std.mem.trimLeft(u8, this, &.{std.fs.path.sep});
|
||||
}
|
||||
|
||||
pub fn removeLeadingDotSlash(slice: []const u8) callconv(bun.callconv_inline) []const u8 {
|
||||
if (slice.len >= 2) {
|
||||
if ((@as(u16, @bitCast(slice[0..2].*)) == comptime std.mem.readInt(u16, "./", .little)) or
|
||||
(Environment.isWindows and @as(u16, @bitCast(slice[0..2].*)) == comptime std.mem.readInt(u16, ".\\", .little)))
|
||||
{
|
||||
return slice[2..];
|
||||
}
|
||||
}
|
||||
return slice;
|
||||
}
|
||||
|
||||
const bun = @import("bun");
|
||||
const std = @import("std");
|
||||
const Environment = bun.Environment;
|
||||
const strings = bun.strings;
|
||||
const hasPrefixComptime = strings.hasPrefixComptime;
|
||||
const hasPrefixComptimeType = strings.hasPrefixComptimeType;
|
||||
const trimPrefixComptime = strings.trimPrefixComptime;
|
||||
const copyUTF16IntoUTF8 = strings.copyUTF16IntoUTF8;
|
||||
const eqlComptimeT = strings.eqlComptimeT;
|
||||
const string = []const u8;
|
||||
const assert = bun.assert;
|
||||
const hasPrefixComptimeUTF16 = strings.hasPrefixComptimeUTF16;
|
||||
2078
src/string/unicode.zig
Normal file
2078
src/string/unicode.zig
Normal file
File diff suppressed because it is too large
Load Diff
831
src/string/visible.zig
Normal file
831
src/string/visible.zig
Normal file
@@ -0,0 +1,831 @@
|
||||
pub fn isZeroWidthCodepointType(comptime T: type, cp: T) bool {
|
||||
if (cp <= 0x1f) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (cp >= 0x7f and cp <= 0x9f) {
|
||||
// C1 control characters
|
||||
return true;
|
||||
}
|
||||
|
||||
if (comptime @sizeOf(T) == 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (cp >= 0x300 and cp <= 0x36f) {
|
||||
// Combining Diacritical Marks
|
||||
return true;
|
||||
}
|
||||
|
||||
if (cp >= 0x200b and cp <= 0x200f) {
|
||||
// Modifying Invisible Characters
|
||||
return true;
|
||||
}
|
||||
|
||||
if (cp >= 0x20d0 and cp <= 0x20ff)
|
||||
// Combining Diacritical Marks for Symbols
|
||||
return true;
|
||||
|
||||
if (cp >= 0xfe00 and cp <= 0xfe0f)
|
||||
// Variation Selectors
|
||||
return true;
|
||||
if (cp >= 0xfe20 and cp <= 0xfe2f)
|
||||
// Combining Half Marks
|
||||
return true;
|
||||
|
||||
if (cp == 0xfeff)
|
||||
// Zero Width No-Break Space (BOM, ZWNBSP)
|
||||
return true;
|
||||
|
||||
if (cp >= 0xe0100 and cp <= 0xe01ef)
|
||||
// Variation Selectors
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Official unicode reference: https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
|
||||
/// Tag legend:
|
||||
/// - `W` (wide) -> true
|
||||
/// - `F` (full-width) -> true
|
||||
/// - `H` (half-width) -> false
|
||||
/// - `N` (neutral) -> false
|
||||
/// - `Na` (narrow) -> false
|
||||
/// - `A` (ambiguous) -> false?
|
||||
///
|
||||
/// To regenerate the switch body list, run:
|
||||
/// ```js
|
||||
/// [...(await (await fetch("https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt")).text()).matchAll(/^([\dA-F]{4,})(?:\.\.([\dA-F]{4,}))?\s+;\s+(\w+)\s+#\s+(.*?)\s*$/gm)].flatMap(([,start, end, type, comment]) => (
|
||||
/// (['W', 'F'].includes(type)) ? [` ${(end ? `0x${start}...0x${end}` : `0x${start}`)}, // ${''.padStart(17 - start.length - (end ? end.length + 5 : 0))}[${type}] ${comment}`] : []
|
||||
/// )).join('\n')
|
||||
/// ```
|
||||
pub fn isFullWidthCodepointType(comptime T: type, cp: T) bool {
|
||||
if (!(cp >= 0x1100)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return switch (cp) {
|
||||
0x1100...0x115F, // [W] Lo [96] HANGUL CHOSEONG KIYEOK..HANGUL CHOSEONG FILLER
|
||||
0x231A...0x231B, // [W] So [2] WATCH..HOURGLASS
|
||||
0x2329, // [W] Ps LEFT-POINTING ANGLE BRACKET
|
||||
0x232A, // [W] Pe RIGHT-POINTING ANGLE BRACKET
|
||||
0x23E9...0x23EC, // [W] So [4] BLACK RIGHT-POINTING DOUBLE TRIANGLE..BLACK DOWN-POINTING DOUBLE TRIANGLE
|
||||
0x23F0, // [W] So ALARM CLOCK
|
||||
0x23F3, // [W] So HOURGLASS WITH FLOWING SAND
|
||||
0x25FD...0x25FE, // [W] Sm [2] WHITE MEDIUM SMALL SQUARE..BLACK MEDIUM SMALL SQUARE
|
||||
0x2614...0x2615, // [W] So [2] UMBRELLA WITH RAIN DROPS..HOT BEVERAGE
|
||||
0x2648...0x2653, // [W] So [12] ARIES..PISCES
|
||||
0x267F, // [W] So WHEELCHAIR SYMBOL
|
||||
0x2693, // [W] So ANCHOR
|
||||
0x26A1, // [W] So HIGH VOLTAGE SIGN
|
||||
0x26AA...0x26AB, // [W] So [2] MEDIUM WHITE CIRCLE..MEDIUM BLACK CIRCLE
|
||||
0x26BD...0x26BE, // [W] So [2] SOCCER BALL..BASEBALL
|
||||
0x26C4...0x26C5, // [W] So [2] SNOWMAN WITHOUT SNOW..SUN BEHIND CLOUD
|
||||
0x26CE, // [W] So OPHIUCHUS
|
||||
0x26D4, // [W] So NO ENTRY
|
||||
0x26EA, // [W] So CHURCH
|
||||
0x26F2...0x26F3, // [W] So [2] FOUNTAIN..FLAG IN HOLE
|
||||
0x26F5, // [W] So SAILBOAT
|
||||
0x26FA, // [W] So TENT
|
||||
0x26FD, // [W] So FUEL PUMP
|
||||
0x2705, // [W] So WHITE HEAVY CHECK MARK
|
||||
0x270A...0x270B, // [W] So [2] RAISED FIST..RAISED HAND
|
||||
0x2728, // [W] So SPARKLES
|
||||
0x274C, // [W] So CROSS MARK
|
||||
0x274E, // [W] So NEGATIVE SQUARED CROSS MARK
|
||||
0x2753...0x2755, // [W] So [3] BLACK QUESTION MARK ORNAMENT..WHITE EXCLAMATION MARK ORNAMENT
|
||||
0x2757, // [W] So HEAVY EXCLAMATION MARK SYMBOL
|
||||
0x2795...0x2797, // [W] So [3] HEAVY PLUS SIGN..HEAVY DIVISION SIGN
|
||||
0x27B0, // [W] So CURLY LOOP
|
||||
0x27BF, // [W] So DOUBLE CURLY LOOP
|
||||
0x2B1B...0x2B1C, // [W] So [2] BLACK LARGE SQUARE..WHITE LARGE SQUARE
|
||||
0x2B50, // [W] So WHITE MEDIUM STAR
|
||||
0x2B55, // [W] So HEAVY LARGE CIRCLE
|
||||
0x2E80...0x2E99, // [W] So [26] CJK RADICAL REPEAT..CJK RADICAL RAP
|
||||
0x2E9B...0x2EF3, // [W] So [89] CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE
|
||||
0x2F00...0x2FD5, // [W] So [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE
|
||||
0x2FF0...0x2FFF, // [W] So [16] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION
|
||||
0x3000, // [F] Zs IDEOGRAPHIC SPACE
|
||||
0x3001...0x3003, // [W] Po [3] IDEOGRAPHIC COMMA..DITTO MARK
|
||||
0x3004, // [W] So JAPANESE INDUSTRIAL STANDARD SYMBOL
|
||||
0x3005, // [W] Lm IDEOGRAPHIC ITERATION MARK
|
||||
0x3006, // [W] Lo IDEOGRAPHIC CLOSING MARK
|
||||
0x3007, // [W] Nl IDEOGRAPHIC NUMBER ZERO
|
||||
0x3008, // [W] Ps LEFT ANGLE BRACKET
|
||||
0x3009, // [W] Pe RIGHT ANGLE BRACKET
|
||||
0x300A, // [W] Ps LEFT DOUBLE ANGLE BRACKET
|
||||
0x300B, // [W] Pe RIGHT DOUBLE ANGLE BRACKET
|
||||
0x300C, // [W] Ps LEFT CORNER BRACKET
|
||||
0x300D, // [W] Pe RIGHT CORNER BRACKET
|
||||
0x300E, // [W] Ps LEFT WHITE CORNER BRACKET
|
||||
0x300F, // [W] Pe RIGHT WHITE CORNER BRACKET
|
||||
0x3010, // [W] Ps LEFT BLACK LENTICULAR BRACKET
|
||||
0x3011, // [W] Pe RIGHT BLACK LENTICULAR BRACKET
|
||||
0x3012...0x3013, // [W] So [2] POSTAL MARK..GETA MARK
|
||||
0x3014, // [W] Ps LEFT TORTOISE SHELL BRACKET
|
||||
0x3015, // [W] Pe RIGHT TORTOISE SHELL BRACKET
|
||||
0x3016, // [W] Ps LEFT WHITE LENTICULAR BRACKET
|
||||
0x3017, // [W] Pe RIGHT WHITE LENTICULAR BRACKET
|
||||
0x3018, // [W] Ps LEFT WHITE TORTOISE SHELL BRACKET
|
||||
0x3019, // [W] Pe RIGHT WHITE TORTOISE SHELL BRACKET
|
||||
0x301A, // [W] Ps LEFT WHITE SQUARE BRACKET
|
||||
0x301B, // [W] Pe RIGHT WHITE SQUARE BRACKET
|
||||
0x301C, // [W] Pd WAVE DASH
|
||||
0x301D, // [W] Ps REVERSED DOUBLE PRIME QUOTATION MARK
|
||||
0x301E...0x301F, // [W] Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
|
||||
0x3020, // [W] So POSTAL MARK FACE
|
||||
0x3021...0x3029, // [W] Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE
|
||||
0x302A...0x302D, // [W] Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
|
||||
0x302E...0x302F, // [W] Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK
|
||||
0x3030, // [W] Pd WAVY DASH
|
||||
0x3031...0x3035, // [W] Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
|
||||
0x3036...0x3037, // [W] So [2] CIRCLED POSTAL MARK..IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL
|
||||
0x3038...0x303A, // [W] Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY
|
||||
0x303B, // [W] Lm VERTICAL IDEOGRAPHIC ITERATION MARK
|
||||
0x303C, // [W] Lo MASU MARK
|
||||
0x303D, // [W] Po PART ALTERNATION MARK
|
||||
0x303E, // [W] So IDEOGRAPHIC VARIATION INDICATOR
|
||||
0x3041...0x3096, // [W] Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE
|
||||
0x3099...0x309A, // [W] Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
|
||||
0x309B...0x309C, // [W] Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
|
||||
0x309D...0x309E, // [W] Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK
|
||||
0x309F, // [W] Lo HIRAGANA DIGRAPH YORI
|
||||
0x30A0, // [W] Pd KATAKANA-HIRAGANA DOUBLE HYPHEN
|
||||
0x30A1...0x30FA, // [W] Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO
|
||||
0x30FB, // [W] Po KATAKANA MIDDLE DOT
|
||||
0x30FC...0x30FE, // [W] Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK
|
||||
0x30FF, // [W] Lo KATAKANA DIGRAPH KOTO
|
||||
0x3105...0x312F, // [W] Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN
|
||||
0x3131...0x318E, // [W] Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE
|
||||
0x3190...0x3191, // [W] So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
|
||||
0x3192...0x3195, // [W] No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
|
||||
0x3196...0x319F, // [W] So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
|
||||
0x31A0...0x31BF, // [W] Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH
|
||||
0x31C0...0x31E3, // [W] So [36] CJK STROKE T..CJK STROKE Q
|
||||
0x31EF, // [W] So IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION
|
||||
0x31F0...0x31FF, // [W] Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
|
||||
0x3200...0x321E, // [W] So [31] PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED KOREAN CHARACTER O HU
|
||||
0x3220...0x3229, // [W] No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
|
||||
0x322A...0x3247, // [W] So [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO
|
||||
0x3250, // [W] So PARTNERSHIP SIGN
|
||||
0x3251...0x325F, // [W] No [15] CIRCLED NUMBER TWENTY ONE..CIRCLED NUMBER THIRTY FIVE
|
||||
0x3260...0x327F, // [W] So [32] CIRCLED HANGUL KIYEOK..KOREAN STANDARD SYMBOL
|
||||
0x3280...0x3289, // [W] No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
|
||||
0x328A...0x32B0, // [W] So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
|
||||
0x32B1...0x32BF, // [W] No [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY
|
||||
0x32C0...0x32FF, // [W] So [64] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..SQUARE ERA NAME REIWA
|
||||
0x3300...0x33FF, // [W] So [256] SQUARE APAATO..SQUARE GAL
|
||||
0x3400...0x4DBF, // [W] Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF
|
||||
0x4E00...0x9FFF, // [W] Lo [20992] CJK UNIFIED IDEOGRAPH-4E00..CJK UNIFIED IDEOGRAPH-9FFF
|
||||
0xA000...0xA014, // [W] Lo [21] YI SYLLABLE IT..YI SYLLABLE E
|
||||
0xA015, // [W] Lm YI SYLLABLE WU
|
||||
0xA016...0xA48C, // [W] Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR
|
||||
0xA490...0xA4C6, // [W] So [55] YI RADICAL QOT..YI RADICAL KE
|
||||
0xA960...0xA97C, // [W] Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH
|
||||
0xAC00...0xD7A3, // [W] Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
|
||||
0xF900...0xFA6D, // [W] Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D
|
||||
0xFA6E...0xFA6F, // [W] Cn [2] <reserved-FA6E>..<reserved-FA6F>
|
||||
0xFA70...0xFAD9, // [W] Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
|
||||
0xFADA...0xFAFF, // [W] Cn [38] <reserved-FADA>..<reserved-FAFF>
|
||||
0xFE10...0xFE16, // [W] Po [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK
|
||||
0xFE17, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET
|
||||
0xFE18, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET
|
||||
0xFE19, // [W] Po PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
|
||||
0xFE30, // [W] Po PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
|
||||
0xFE31...0xFE32, // [W] Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH
|
||||
        0xFE33...0xFE34, // [W] Pc [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE
        0xFE35, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
        0xFE36, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS
        0xFE37, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET
        0xFE38, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET
        0xFE39, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET
        0xFE3A, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET
        0xFE3B, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET
        0xFE3C, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET
        0xFE3D, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET
        0xFE3E, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET
        0xFE3F, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET
        0xFE40, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET
        0xFE41, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
        0xFE42, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
        0xFE43, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
        0xFE44, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
        0xFE45...0xFE46, // [W] Po [2] SESAME DOT..WHITE SESAME DOT
        0xFE47, // [W] Ps PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET
        0xFE48, // [W] Pe PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET
        0xFE49...0xFE4C, // [W] Po [4] DASHED OVERLINE..DOUBLE WAVY OVERLINE
        0xFE4D...0xFE4F, // [W] Pc [3] DASHED LOW LINE..WAVY LOW LINE
        0xFE50...0xFE52, // [W] Po [3] SMALL COMMA..SMALL FULL STOP
        0xFE54...0xFE57, // [W] Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK
        0xFE58, // [W] Pd SMALL EM DASH
        0xFE59, // [W] Ps SMALL LEFT PARENTHESIS
        0xFE5A, // [W] Pe SMALL RIGHT PARENTHESIS
        0xFE5B, // [W] Ps SMALL LEFT CURLY BRACKET
        0xFE5C, // [W] Pe SMALL RIGHT CURLY BRACKET
        0xFE5D, // [W] Ps SMALL LEFT TORTOISE SHELL BRACKET
        0xFE5E, // [W] Pe SMALL RIGHT TORTOISE SHELL BRACKET
        0xFE5F...0xFE61, // [W] Po [3] SMALL NUMBER SIGN..SMALL ASTERISK
        0xFE62, // [W] Sm SMALL PLUS SIGN
        0xFE63, // [W] Pd SMALL HYPHEN-MINUS
        0xFE64...0xFE66, // [W] Sm [3] SMALL LESS-THAN SIGN..SMALL EQUALS SIGN
        0xFE68, // [W] Po SMALL REVERSE SOLIDUS
        0xFE69, // [W] Sc SMALL DOLLAR SIGN
        0xFE6A...0xFE6B, // [W] Po [2] SMALL PERCENT SIGN..SMALL COMMERCIAL AT
        0xFF01...0xFF03, // [F] Po [3] FULLWIDTH EXCLAMATION MARK..FULLWIDTH NUMBER SIGN
        0xFF04, // [F] Sc FULLWIDTH DOLLAR SIGN
        0xFF05...0xFF07, // [F] Po [3] FULLWIDTH PERCENT SIGN..FULLWIDTH APOSTROPHE
        0xFF08, // [F] Ps FULLWIDTH LEFT PARENTHESIS
        0xFF09, // [F] Pe FULLWIDTH RIGHT PARENTHESIS
        0xFF0A, // [F] Po FULLWIDTH ASTERISK
        0xFF0B, // [F] Sm FULLWIDTH PLUS SIGN
        0xFF0C, // [F] Po FULLWIDTH COMMA
        0xFF0D, // [F] Pd FULLWIDTH HYPHEN-MINUS
        0xFF0E...0xFF0F, // [F] Po [2] FULLWIDTH FULL STOP..FULLWIDTH SOLIDUS
        0xFF10...0xFF19, // [F] Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
        0xFF1A...0xFF1B, // [F] Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON
        0xFF1C...0xFF1E, // [F] Sm [3] FULLWIDTH LESS-THAN SIGN..FULLWIDTH GREATER-THAN SIGN
        0xFF1F...0xFF20, // [F] Po [2] FULLWIDTH QUESTION MARK..FULLWIDTH COMMERCIAL AT
        0xFF21...0xFF3A, // [F] Lu [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
        0xFF3B, // [F] Ps FULLWIDTH LEFT SQUARE BRACKET
        0xFF3C, // [F] Po FULLWIDTH REVERSE SOLIDUS
        0xFF3D, // [F] Pe FULLWIDTH RIGHT SQUARE BRACKET
        0xFF3E, // [F] Sk FULLWIDTH CIRCUMFLEX ACCENT
        0xFF3F, // [F] Pc FULLWIDTH LOW LINE
        0xFF40, // [F] Sk FULLWIDTH GRAVE ACCENT
        0xFF41...0xFF5A, // [F] Ll [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
        0xFF5B, // [F] Ps FULLWIDTH LEFT CURLY BRACKET
        0xFF5C, // [F] Sm FULLWIDTH VERTICAL LINE
        0xFF5D, // [F] Pe FULLWIDTH RIGHT CURLY BRACKET
        0xFF5E, // [F] Sm FULLWIDTH TILDE
        0xFF5F, // [F] Ps FULLWIDTH LEFT WHITE PARENTHESIS
        0xFF60, // [F] Pe FULLWIDTH RIGHT WHITE PARENTHESIS
        0xFFE0...0xFFE1, // [F] Sc [2] FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN
        0xFFE2, // [F] Sm FULLWIDTH NOT SIGN
        0xFFE3, // [F] Sk FULLWIDTH MACRON
        0xFFE4, // [F] So FULLWIDTH BROKEN BAR
        0xFFE5...0xFFE6, // [F] Sc [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN
        0x16FE0...0x16FE1, // [W] Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK
        0x16FE2, // [W] Po OLD CHINESE HOOK MARK
        0x16FE3, // [W] Lm OLD CHINESE ITERATION MARK
        0x16FE4, // [W] Mn KHITAN SMALL SCRIPT FILLER
        0x16FF0...0x16FF1, // [W] Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY
        0x17000...0x187F7, // [W] Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7
        0x18800...0x18AFF, // [W] Lo [768] TANGUT COMPONENT-001..TANGUT COMPONENT-768
        0x18B00...0x18CD5, // [W] Lo [470] KHITAN SMALL SCRIPT CHARACTER-18B00..KHITAN SMALL SCRIPT CHARACTER-18CD5
        0x18D00...0x18D08, // [W] Lo [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08
        0x1AFF0...0x1AFF3, // [W] Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5
        0x1AFF5...0x1AFFB, // [W] Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
        0x1AFFD...0x1AFFE, // [W] Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
        0x1B000...0x1B0FF, // [W] Lo [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2
        0x1B100...0x1B122, // [W] Lo [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU
        0x1B132, // [W] Lo HIRAGANA LETTER SMALL KO
        0x1B150...0x1B152, // [W] Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
        0x1B155, // [W] Lo KATAKANA LETTER SMALL KO
        0x1B164...0x1B167, // [W] Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
        0x1B170...0x1B2FB, // [W] Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
        0x1F004, // [W] So MAHJONG TILE RED DRAGON
        0x1F0CF, // [W] So PLAYING CARD BLACK JOKER
        0x1F18E, // [W] So NEGATIVE SQUARED AB
        0x1F191...0x1F19A, // [W] So [10] SQUARED CL..SQUARED VS
        0x1F200...0x1F202, // [W] So [3] SQUARE HIRAGANA HOKA..SQUARED KATAKANA SA
        0x1F210...0x1F23B, // [W] So [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D
        0x1F240...0x1F248, // [W] So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557
        0x1F250...0x1F251, // [W] So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
        0x1F260...0x1F265, // [W] So [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI
        0x1F300...0x1F320, // [W] So [33] CYCLONE..SHOOTING STAR
        0x1F32D...0x1F335, // [W] So [9] HOT DOG..CACTUS
        0x1F337...0x1F37C, // [W] So [70] TULIP..BABY BOTTLE
        0x1F37E...0x1F393, // [W] So [22] BOTTLE WITH POPPING CORK..GRADUATION CAP
        0x1F3A0...0x1F3CA, // [W] So [43] CAROUSEL HORSE..SWIMMER
        0x1F3CF...0x1F3D3, // [W] So [5] CRICKET BAT AND BALL..TABLE TENNIS PADDLE AND BALL
        0x1F3E0...0x1F3F0, // [W] So [17] HOUSE BUILDING..EUROPEAN CASTLE
        0x1F3F4, // [W] So WAVING BLACK FLAG
        0x1F3F8...0x1F3FA, // [W] So [3] BADMINTON RACQUET AND SHUTTLECOCK..AMPHORA
        0x1F3FB...0x1F3FF, // [W] Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
        0x1F400...0x1F43E, // [W] So [63] RAT..PAW PRINTS
        0x1F440, // [W] So EYES
        0x1F442...0x1F4FC, // [W] So [187] EAR..VIDEOCASSETTE
        0x1F4FF...0x1F53D, // [W] So [63] PRAYER BEADS..DOWN-POINTING SMALL RED TRIANGLE
        0x1F54B...0x1F54E, // [W] So [4] KAABA..MENORAH WITH NINE BRANCHES
        0x1F550...0x1F567, // [W] So [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY
        0x1F57A, // [W] So MAN DANCING
        0x1F595...0x1F596, // [W] So [2] REVERSED HAND WITH MIDDLE FINGER EXTENDED..RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS
        0x1F5A4, // [W] So BLACK HEART
        0x1F5FB...0x1F5FF, // [W] So [5] MOUNT FUJI..MOYAI
        0x1F600...0x1F64F, // [W] So [80] GRINNING FACE..PERSON WITH FOLDED HANDS
        0x1F680...0x1F6C5, // [W] So [70] ROCKET..LEFT LUGGAGE
        0x1F6CC, // [W] So SLEEPING ACCOMMODATION
        0x1F6D0...0x1F6D2, // [W] So [3] PLACE OF WORSHIP..SHOPPING TROLLEY
        0x1F6D5...0x1F6D7, // [W] So [3] HINDU TEMPLE..ELEVATOR
        0x1F6DC...0x1F6DF, // [W] So [4] WIRELESS..RING BUOY
        0x1F6EB...0x1F6EC, // [W] So [2] AIRPLANE DEPARTURE..AIRPLANE ARRIVING
        0x1F6F4...0x1F6FC, // [W] So [9] SCOOTER..ROLLER SKATE
        0x1F7E0...0x1F7EB, // [W] So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
        0x1F7F0, // [W] So HEAVY EQUALS SIGN
        0x1F90C...0x1F93A, // [W] So [47] PINCHED FINGERS..FENCER
        0x1F93C...0x1F945, // [W] So [10] WRESTLERS..GOAL NET
        0x1F947...0x1F9FF, // [W] So [185] FIRST PLACE MEDAL..NAZAR AMULET
        0x1FA70...0x1FA7C, // [W] So [13] BALLET SHOES..CRUTCH
        0x1FA80...0x1FA88, // [W] So [9] YO-YO..FLUTE
        0x1FA90...0x1FABD, // [W] So [46] RINGED PLANET..WING
        0x1FABF...0x1FAC5, // [W] So [7] GOOSE..PERSON WITH CROWN
        0x1FACE...0x1FADB, // [W] So [14] MOOSE..PEA POD
        0x1FAE0...0x1FAE8, // [W] So [9] MELTING FACE..SHAKING FACE
        0x1FAF0...0x1FAF8, // [W] So [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND
        0x20000...0x2A6DF, // [W] Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
        0x2A6E0...0x2A6FF, // [W] Cn [32] <reserved-2A6E0>..<reserved-2A6FF>
        0x2A700...0x2B739, // [W] Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
        0x2B73A...0x2B73F, // [W] Cn [6] <reserved-2B73A>..<reserved-2B73F>
        0x2B740...0x2B81D, // [W] Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
        0x2B81E...0x2B81F, // [W] Cn [2] <reserved-2B81E>..<reserved-2B81F>
        0x2B820...0x2CEA1, // [W] Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
        0x2CEA2...0x2CEAF, // [W] Cn [14] <reserved-2CEA2>..<reserved-2CEAF>
        0x2CEB0...0x2EBE0, // [W] Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
        0x2EBE1...0x2EBEF, // [W] Cn [15] <reserved-2EBE1>..<reserved-2EBEF>
        0x2EBF0...0x2EE5D, // [W] Lo [622] CJK UNIFIED IDEOGRAPH-2EBF0..CJK UNIFIED IDEOGRAPH-2EE5D
        0x2EE5E...0x2F7FF, // [W] Cn [2466] <reserved-2EE5E>..<reserved-2F7FF>
        0x2F800...0x2FA1D, // [W] Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
        0x2FA1E...0x2FA1F, // [W] Cn [2] <reserved-2FA1E>..<reserved-2FA1F>
        0x2FA20...0x2FFFD, // [W] Cn [1502] <reserved-2FA20>..<reserved-2FFFD>
        0x30000...0x3134A, // [W] Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
        0x3134B...0x3134F, // [W] Cn [5] <reserved-3134B>..<reserved-3134F>
        0x31350...0x323AF, // [W] Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
        0x323B0...0x3FFFD, // [W] Cn [56398] <reserved-323B0>..<reserved-3FFFD>
        => true,
        else => false,
    };
}

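/// True if `cp` is an East Asian "Ambiguous"-width character (category [A] in
/// Unicode's EastAsianWidth data): narrow in most contexts, but rendered two
/// columns wide in legacy CJK environments. Callers opt into the wide
/// interpretation through the `ambiguousAsWide` flags below.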
pub fn isAmgiguousCodepointType(comptime T: type, cp: T) bool {
    return switch (cp) {
        0xA1,
        0xA4,
        0xA7,
        0xA8,
        0xAA,
        0xAD,
        0xAE,
        0xB0...0xB4,
        0xB6...0xBA,
        0xBC...0xBF,
        0xC6,
        0xD0,
        0xD7,
        0xD8,
        0xDE...0xE1,
        0xE6,
        0xE8...0xEA,
        0xEC,
        0xED,
        0xF0,
        0xF2,
        0xF3,
        0xF7...0xFA,
        0xFC,
        0xFE,
        0x101,
        0x111,
        0x113,
        0x11B,
        0x126,
        0x127,
        0x12B,
        0x131...0x133,
        0x138,
        0x13F...0x142,
        0x144,
        0x148...0x14B,
        0x14D,
        0x152,
        0x153,
        0x166,
        0x167,
        0x16B,
        0x1CE,
        0x1D0,
        0x1D2,
        0x1D4,
        0x1D6,
        0x1D8,
        0x1DA,
        0x1DC,
        0x251,
        0x261,
        0x2C4,
        0x2C7,
        0x2C9...0x2CB,
        0x2CD,
        0x2D0,
        0x2D8...0x2DB,
        0x2DD,
        0x2DF,
        0x300...0x36F,
        0x391...0x3A1,
        0x3A3...0x3A9,
        0x3B1...0x3C1,
        0x3C3...0x3C9,
        0x401,
        0x410...0x44F,
        0x451,
        0x2010,
        0x2013...0x2016,
        0x2018,
        0x2019,
        0x201C,
        0x201D,
        0x2020...0x2022,
        0x2024...0x2027,
        0x2030,
        0x2032,
        0x2033,
        0x2035,
        0x203B,
        0x203E,
        0x2074,
        0x207F,
        0x2081...0x2084,
        0x20AC,
        0x2103,
        0x2105,
        0x2109,
        0x2113,
        0x2116,
        0x2121,
        0x2122,
        0x2126,
        0x212B,
        0x2153,
        0x2154,
        0x215B...0x215E,
        0x2160...0x216B,
        0x2170...0x2179,
        0x2189,
        0x2190...0x2199,
        0x21B8,
        0x21B9,
        0x21D2,
        0x21D4,
        0x21E7,
        0x2200,
        0x2202,
        0x2203,
        0x2207,
        0x2208,
        0x220B,
        0x220F,
        0x2211,
        0x2215,
        0x221A,
        0x221D...0x2220,
        0x2223,
        0x2225,
        0x2227...0x222C,
        0x222E,
        0x2234...0x2237,
        0x223C,
        0x223D,
        0x2248,
        0x224C,
        0x2252,
        0x2260,
        0x2261,
        0x2264...0x2267,
        0x226A,
        0x226B,
        0x226E,
        0x226F,
        0x2282,
        0x2283,
        0x2286,
        0x2287,
        0x2295,
        0x2299,
        0x22A5,
        0x22BF,
        0x2312,
        0x2460...0x24E9,
        0x24EB...0x254B,
        0x2550...0x2573,
        0x2580...0x258F,
        0x2592...0x2595,
        0x25A0,
        0x25A1,
        0x25A3...0x25A9,
        0x25B2,
        0x25B3,
        0x25B6,
        0x25B7,
        0x25BC,
        0x25BD,
        0x25C0,
        0x25C1,
        0x25C6...0x25C8,
        0x25CB,
        0x25CE...0x25D1,
        0x25E2...0x25E5,
        0x25EF,
        0x2605,
        0x2606,
        0x2609,
        0x260E,
        0x260F,
        0x261C,
        0x261E,
        0x2640,
        0x2642,
        0x2660,
        0x2661,
        0x2663...0x2665,
        0x2667...0x266A,
        0x266C,
        0x266D,
        0x266F,
        0x269E,
        0x269F,
        0x26BF,
        0x26C6...0x26CD,
        0x26CF...0x26D3,
        0x26D5...0x26E1,
        0x26E3,
        0x26E8,
        0x26E9,
        0x26EB...0x26F1,
        0x26F4,
        0x26F6...0x26F9,
        0x26FB,
        0x26FC,
        0x26FE,
        0x26FF,
        0x273D,
        0x2776...0x277F,
        0x2B56...0x2B59,
        0x3248...0x324F,
        0xE000...0xF8FF,
        0xFE00...0xFE0F,
        0xFFFD,
        0x1F100...0x1F10A,
        0x1F110...0x1F12D,
        0x1F130...0x1F169,
        0x1F170...0x1F18D,
        0x1F18F,
        0x1F190,
        0x1F19B...0x1F1AC,
        0xE0100...0xE01EF,
        0xF0000...0xFFFFD,
        0x100000...0x10FFFD,
        => true,
        else => false,
    };
}

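/// Number of terminal columns a single codepoint occupies: 0 for zero-width,
/// 2 for full-width (and, when `ambiguousAsWide` is set, ambiguous-width)
/// codepoints, and 1 otherwise.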
pub fn visibleCodepointWidth(cp: u32, ambiguousAsWide: bool) u3_fast {
    return visibleCodepointWidthType(u32, cp, ambiguousAsWide);
}

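/// Same as `visibleCodepointWidth`, except that when `maybe_emoji` is set the
/// codepoint is first checked against ICU's Emoji binary property
/// (UCHAR_EMOJI == 57) and counted as two columns wide if it matches.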
pub fn visibleCodepointWidthMaybeEmoji(cp: u32, maybe_emoji: bool, ambiguousAsWide: bool) u3_fast {
    // UCHAR_EMOJI=57,
    if (maybe_emoji and icu_hasBinaryProperty(cp, 57)) {
        return 2;
    }
    return visibleCodepointWidth(cp, ambiguousAsWide);
}

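/// Core width classification, in precedence order: zero-width -> 0,
/// full-width -> 2, ambiguous -> 2 (only when `ambiguousAsWide`), else 1.
/// For example: 'a' (U+0061) -> 1, FULLWIDTH LATIN CAPITAL LETTER A (U+FF21)
/// -> 2, and SECTION SIGN (U+00A7) -> 1, or 2 when `ambiguousAsWide` is true.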
pub fn visibleCodepointWidthType(comptime T: type, cp: T, ambiguousAsWide: bool) u3_fast {
    if (isZeroWidthCodepointType(T, cp)) {
        return 0;
    }

    if (isFullWidthCodepointType(T, cp)) {
        return 2;
    }
    if (ambiguousAsWide and isAmgiguousCodepointType(T, cp)) {
        return 2;
    }

    return 1;
}

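/// Helpers for measuring how many terminal columns a Latin-1, UTF-8, or UTF-16
/// string occupies, optionally ignoring ANSI color escape sequences.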
pub const visible = struct {
    // Ref: https://cs.stanford.edu/people/miles/iso8859.html
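    /// Latin-1 column count: control bytes (0x00..0x1F and 0x7F..0x9F)
    /// contribute 0 columns, everything else contributes 1. Processes 16 bytes
    /// at a time and sums the per-byte widths with a vector reduce before
    /// handling the remaining tail bytes.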
    fn visibleLatin1Width(input_: []const u8) usize {
        var length: usize = 0;
        var input = input_;
        const input_end_ptr = input.ptr + input.len - (input.len % 16);
        var input_ptr = input.ptr;
        while (input_ptr != input_end_ptr) {
            const input_chunk: [16]u8 = input_ptr[0..16].*;
            const sums: @Vector(16, u8) = [16]u8{
                visibleLatin1WidthScalar(input_chunk[0]),
                visibleLatin1WidthScalar(input_chunk[1]),
                visibleLatin1WidthScalar(input_chunk[2]),
                visibleLatin1WidthScalar(input_chunk[3]),
                visibleLatin1WidthScalar(input_chunk[4]),
                visibleLatin1WidthScalar(input_chunk[5]),
                visibleLatin1WidthScalar(input_chunk[6]),
                visibleLatin1WidthScalar(input_chunk[7]),
                visibleLatin1WidthScalar(input_chunk[8]),
                visibleLatin1WidthScalar(input_chunk[9]),
                visibleLatin1WidthScalar(input_chunk[10]),
                visibleLatin1WidthScalar(input_chunk[11]),
                visibleLatin1WidthScalar(input_chunk[12]),
                visibleLatin1WidthScalar(input_chunk[13]),
                visibleLatin1WidthScalar(input_chunk[14]),
                visibleLatin1WidthScalar(input_chunk[15]),
            };
            length += @reduce(.Add, sums);
            input_ptr += 16;
        }
        input.len %= 16;
        input.ptr = input_ptr;

        for (input) |byte| length += visibleLatin1WidthScalar(byte);
        return length;
    }

    fn visibleLatin1WidthScalar(c: u8) u1 {
        return if ((c >= 127 and c <= 159) or c < 32) 0 else 1;
    }

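    /// Like `visibleLatin1Width`, but skips ANSI escape sequences of the form
    /// ESC '[' ... 'm' (SGR color codes) so they do not count toward the width.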
    fn visibleLatin1WidthExcludeANSIColors(input_: anytype) usize {
        var length: usize = 0;
        var input = input_;

        const ElementType = std.meta.Child(@TypeOf(input_));
        const indexFn = if (comptime ElementType == u8) strings.indexOfCharUsize else strings.indexOfChar16Usize;

        while (indexFn(input, '\x1b')) |i| {
            length += visibleLatin1Width(input[0..i]);
            input = input[i..];

            if (input.len < 3) return length;

            if (input[1] == '[') {
                const end = indexFn(input[2..], 'm') orelse return length;
                input = input[end + 3 ..];
            } else {
                input = input[1..];
            }
        }

        length += visibleLatin1Width(input);

        return length;
    }

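    /// UTF-8 column count: ASCII runs are measured with `asciiFn`, and each
    /// non-ASCII codepoint is decoded (WTF-8 tolerant, invalid sequences become
    /// U+FFFD) and measured with `visibleCodepointWidth`.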
    fn visibleUTF8WidthFn(input: []const u8, comptime asciiFn: anytype) usize {
        var bytes = input;
        var len: usize = 0;
        while (bun.strings.firstNonASCII(bytes)) |i| {
            len += asciiFn(bytes[0..i]);
            const this_chunk = bytes[i..];
            const byte = this_chunk[0];

            const skip = bun.strings.wtf8ByteSequenceLengthWithInvalid(byte);
            const cp_bytes: [4]u8 = switch (@min(@as(usize, skip), this_chunk.len)) {
                inline 1, 2, 3, 4 => |cp_len| .{
                    byte,
                    if (comptime cp_len > 1) this_chunk[1] else 0,
                    if (comptime cp_len > 2) this_chunk[2] else 0,
                    if (comptime cp_len > 3) this_chunk[3] else 0,
                },
                else => unreachable,
            };

            const cp = decodeWTF8RuneTMultibyte(&cp_bytes, skip, u32, unicode_replacement);
            len += visibleCodepointWidth(cp, false);

            bytes = bytes[@min(i + skip, bytes.len)..];
        }

        len += asciiFn(bytes);

        return len;
    }

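    /// UTF-16 column count. ASCII runs are measured inline (optionally skipping
    /// ESC '[' ... 'm' color sequences when `exclude_ansi_colors` is set), and
    /// non-ASCII codepoints are grouped into grapheme clusters with
    /// `grapheme.graphemeBreak` so that each cluster is counted only once.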
    fn visibleUTF16WidthFn(input_: []const u16, exclude_ansi_colors: bool, ambiguousAsWide: bool) usize {
        var input = input_;
        var len: usize = 0;
        var prev: ?u21 = 0;
        var break_state = grapheme.BreakState{};
        var break_start: u21 = 0;
        var saw_1b = false;
        var saw_bracket = false;
        var stretch_len: usize = 0;

        while (true) {
            {
                const idx = firstNonASCII16([]const u16, input) orelse input.len;
                for (0..idx) |j| {
                    const cp = input[j];
                    defer prev = cp;

                    if (saw_bracket) {
                        if (cp == 'm') {
                            saw_1b = false;
                            saw_bracket = false;
                            stretch_len = 0;
                            continue;
                        }
                        stretch_len += visibleCodepointWidth(cp, ambiguousAsWide);
                        continue;
                    }
                    if (saw_1b) {
                        if (cp == '[') {
                            saw_bracket = true;
                            stretch_len = 0;
                            continue;
                        }
                        len += visibleCodepointWidth(cp, ambiguousAsWide);
                        continue;
                    }
                    if (!exclude_ansi_colors or cp != 0x1b) {
                        if (prev) |prev_| {
                            const should_break = grapheme.graphemeBreak(prev_, cp, &break_state);
                            if (should_break) {
                                len += visibleCodepointWidthMaybeEmoji(break_start, cp == 0xFE0F, ambiguousAsWide);
                                break_start = cp;
                            } else {
                                //
                            }
                        } else {
                            len += visibleCodepointWidth(cp, ambiguousAsWide);
                            break_start = cp;
                        }
                        continue;
                    }
                    saw_1b = true;
                    continue;
                }
                len += stretch_len;
                input = input[idx..];
            }
            if (input.len == 0) break;
            const replacement = utf16CodepointWithFFFD([]const u16, input);
            defer input = input[replacement.len..];
            if (replacement.fail) continue;
            const cp: u21 = @intCast(replacement.code_point);
            defer prev = cp;

            if (prev) |prev_| {
                const should_break = grapheme.graphemeBreak(prev_, cp, &break_state);
                if (should_break) {
                    len += visibleCodepointWidthMaybeEmoji(break_start, cp == 0xFE0F, ambiguousAsWide);
                    break_start = cp;
                }
            } else {
                len += visibleCodepointWidth(cp, ambiguousAsWide);
                break_start = cp;
            }
        }
        if (break_start > 0) {
            len += visibleCodepointWidthMaybeEmoji(break_start, (prev orelse 0) == 0xFE0F, ambiguousAsWide);
        }
        return len;
    }

    fn visibleLatin1WidthFn(input: []const u8) usize {
        return visibleLatin1Width(input);
    }

    pub const width = struct {
        pub fn latin1(input: []const u8) usize {
            return visibleLatin1Width(input);
        }

        pub fn utf8(input: []const u8) usize {
            return visibleUTF8WidthFn(input, visibleLatin1Width);
        }

        pub fn utf16(input: []const u16, ambiguousAsWide: bool) usize {
            return visibleUTF16WidthFn(input, false, ambiguousAsWide);
        }

        pub const exclude_ansi_colors = struct {
            pub fn latin1(input: []const u8) usize {
                return visibleLatin1WidthExcludeANSIColors(input);
            }

            pub fn utf8(input: []const u8) usize {
                return visibleUTF8WidthFn(input, visibleLatin1WidthExcludeANSIColors);
            }

            pub fn utf16(input: []const u16, ambiguousAsWide: bool) usize {
                return visibleUTF16WidthFn(input, true, ambiguousAsWide);
            }
        };
    };
};

// extern "C" bool icu_hasBinaryProperty(UChar32 cp, unsigned int prop)
extern fn icu_hasBinaryProperty(c: u32, which: c_uint) bool;

const bun = @import("bun");
const std = @import("std");
const u3_fast = strings.u3_fast;
const decodeWTF8RuneTMultibyte = strings.decodeWTF8RuneTMultibyte;
const grapheme = strings.grapheme;
const strings = bun.strings;
const unicode_replacement = strings.unicode_replacement;
const firstNonASCII16 = strings.firstNonASCII16;
const firstNonASCII = strings.firstNonASCII;
const utf16CodepointWithFFFD = strings.utf16CodepointWithFFFD;
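
// Illustrative usage sketch (not part of this diff); with the `visible`
// namespace defined above in scope, measuring terminal width looks like:
//
//     const cols = visible.width.utf8("ABC"); // 3 columns
//     const wide = visible.width.utf8("\xEF\xBC\xA1"); // U+FF21 FULLWIDTH 'A', 2 columns
//     const no_ansi = visible.width.exclude_ansi_colors.utf8("\x1b[32mok\x1b[0m"); // 2 columns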