mirror of
https://github.com/oven-sh/bun
synced 2026-02-09 18:38:55 +00:00
Add Bun.Glob (#6861)
* initial glob impl * Add `Bun.globMatch` * Glob boilerplate setup * Experiment with Rust glob implementation * Rust impl is slow revert * Setup glob walking * Basic glob walker working * Fix segfault * Recursive directory traversal * Fix glob match non-ascii * Make faster lil bit * use arena * ASCII fast path * Experiment with packed codepoint cursor Results in ~4% perf boost if the glob pattern needs to create/manipulate cursors (for example when the pattern uses braces) * Try converting to u32 array Made it pretty slow * Lazily create codepoint buffer * Different walk algorithm * Fast path optimizations * Add `dot` option to `Glob` * . * Fix some bugs * Fix bug, clean up lil bit * Windows fix * Non absolute paths * use specific version of fast-glob for benchmarks and tests * . * Fix some stuff * Fix more stuff * Add `hasPendingActivity()` to glob * accident * Symlinks * fast-glob e2e tests * remove * woops * Fix relative paths * Fix absolute * add test for `onlyFiles` * Fix invalid surrogate pairs problem * Rename: `match/matchSync` -> `scan/scanSync` and `matchString` -> `match` * forgot to close cwd fd * Update types * Add stress test * Port `micromatch` / `glob-match` / `globlin` tests * fix stale reference arena thing * stupid bug * Add builtins to classes code generator and add `Glob.scanIter()` * all iterables * generate fixtures, remove from git * fix test * Fix * woops on test * Fix stuff licenses license `has_pending_activity` to usize cwd threadSafe fix atomic compile errors `GlobWalker` own `cwd` Fix windows path and absolute test stuff * Fixes * Fix stuff * Use Syscall.close * Use private symbols for underlying scan functions to preevent misuse * Update types * Fix build for zig * Fix tests * Fix more tests * Prevent these tests from GC'ing too much * Make this benchmark work in Node and Bun * Fix memory leak * Add leak test * Fix windows * comment about arena allocator use for glob walker * Make leak test run in separate process * Iterator api for 
glob * GlobWalker.Iterator * fix leak test * Remove old impl * filter functions wip start * stuff * wip lockfile use glob * glob working with lockfile * revert lockfile changes * Update bun.lockb * Manually set to cwd to prevent test failing on linux CI --------- Co-authored-by: Jarred Sumner <jarred@jarredsumner.com> Co-authored-by: Jarred Sumner <709451+Jarred-Sumner@users.noreply.github.com>
This commit is contained in:
@@ -1743,9 +1743,12 @@ pub fn toUTF8ListWithType(list_: std.ArrayList(u8), comptime Type: type, utf16:
|
||||
const length = bun.simdutf.length.utf8.from.utf16.le(utf16);
|
||||
try list.ensureTotalCapacityPrecise(length + 16);
|
||||
const buf = try convertUTF16ToUTF8(list, Type, utf16);
|
||||
if (Environment.allow_assert) {
|
||||
std.debug.assert(buf.items.len == length);
|
||||
}
|
||||
// Commenting out because `convertUTF16ToUTF8` may convert to WTF-8
|
||||
// which uses 3 bytes for invalid surrogates, causing the length to not
|
||||
// match from simdutf.
|
||||
// if (Environment.allow_assert) {
|
||||
// std.debug.assert(buf.items.len == length);
|
||||
// }
|
||||
return buf;
|
||||
}
|
||||
|
||||
@@ -4488,6 +4491,147 @@ pub inline fn utf8ByteSequenceLength(first_byte: u8) u3 {
|
||||
};
|
||||
}
|
||||
|
||||
/// Iterator over the codepoints of a WTF-8 / UTF-8 encoded byte string.
///
/// Two independent iteration styles are offered:
///   * `next` leaves the iterator untouched and keeps its position in a
///     caller-owned `Cursor` (32 + 29 + 3 = 64 bits, so it is cheap to copy).
///   * `nextCodepointSlice` / `nextCodepoint` advance the iterator itself.
pub const PackedCodepointIterator = struct {
    const Iterator = @This();
    const CodePointType = u32;
    const zeroValue = 0;

    /// The string being iterated. Borrowed: never modified or freed here.
    bytes: []const u8,
    /// Byte offset of the codepoint most recently returned by the stateful API.
    i: usize,
    /// Byte width of the codepoint at `i`; the next codepoint starts at `i + next_width`.
    next_width: usize = 0,
    /// Byte width of the slice most recently returned by `nextCodepointSlice`.
    width: u3 = 0,
    /// Codepoint most recently decoded by `nextCodepoint`.
    c: CodePointType = zeroValue,

    pub const ZeroValue = zeroValue;

    /// Caller-owned position for `next`. A default-initialized cursor starts
    /// at the beginning of the string.
    pub const Cursor = packed struct {
        /// Byte offset of the current codepoint.
        i: u32 = 0,
        /// The current codepoint.
        c: u29 = zeroValue,
        /// Byte width of the current codepoint.
        width: u3 = 0,
        pub const CodePointType = u29;
    };

    /// Creates an iterator positioned at the start of `str`.
    pub fn init(str: string) Iterator {
        return Iterator{ .bytes = str, .i = 0, .c = zeroValue };
    }

    /// Creates an iterator whose stateful API starts at byte offset `i` of `str`.
    pub fn initOffset(str: string, i: usize) Iterator {
        return Iterator{ .bytes = str, .i = i, .c = zeroValue };
    }

    /// Advances `cursor` to the codepoint that follows it.
    ///
    /// Returns `false`, leaving `cursor` unchanged, at the end of the string or
    /// when `wtf8ByteSequenceLength` reports 0 for the lead byte. A multi-byte
    /// sequence that is cut off by the end of the string, or that the decoder
    /// rejects, is reported as `unicode_replacement` with a width of 1.
    pub inline fn next(it: *const Iterator, cursor: *Cursor) bool {
        const pos: u32 = @as(u32, cursor.width) + cursor.i;
        if (pos >= it.bytes.len) {
            return false;
        }

        const cp_len = wtf8ByteSequenceLength(it.bytes[pos]);
        switch (cp_len) {
            0 => return false,
            1 => {
                // Handled before the sentinel comparison below: the sentinel is
                // minInt(u32) == 0, which would otherwise turn a NUL byte into
                // the replacement character.
                cursor.* = Cursor{ .i = pos, .c = it.bytes[pos], .width = 1 };
                return true;
            },
            else => {},
        }

        const error_char = comptime std.math.minInt(CodePointType);

        // The decoder is handed a 4-byte window; only call it when the whole
        // sequence actually lies inside `bytes`.
        const codepoint: CodePointType = if (it.bytes.len - pos < cp_len)
            error_char
        else
            decodeWTF8RuneTMultibyte(it.bytes[pos..].ptr[0..4], cp_len, CodePointType, error_char);

        const is_valid = codepoint != error_char;
        cursor.* = Cursor{
            .i = pos,
            .c = if (is_valid) @truncate(codepoint) else unicode_replacement,
            .width = if (is_valid) cp_len else 1,
        };

        return true;
    }

    /// Advances the iterator by one codepoint and returns that codepoint's bytes.
    ///
    /// Returns an empty slice at the end of the string, when
    /// `utf8ByteSequenceLength` reports 0 for the lead byte, or when the
    /// sequence is cut off by the end of the string.
    inline fn nextCodepointSlice(it: *Iterator) []const u8 {
        const bytes = it.bytes;
        const start = it.i + it.next_width;
        if (bytes.len <= start) return "";

        const cp_len = utf8ByteSequenceLength(bytes[start]);
        // Truncated trailing sequence: stop instead of slicing out of bounds.
        if (bytes.len - start < cp_len) return "";

        it.next_width = cp_len;
        it.i = start;
        it.width = cp_len;

        // Slice from the new position, not from the previous codepoint.
        return bytes[start..][0..cp_len];
    }

    /// Returns true if iterating `slice` encounters a codepoint wider than one
    /// byte before iteration stops.
    pub fn needsUTF8Decoding(slice: string) bool {
        var it = Iterator{ .bytes = slice, .i = 0 };

        while (true) {
            switch (it.nextCodepointSlice().len) {
                0 => return false,
                1 => continue,
                else => return true,
            }
        }
    }

    /// Decodes a slice produced by `nextCodepointSlice`. An empty slice yields
    /// `zeroValue`; a sequence the std decoder rejects yields
    /// `unicode_replacement` rather than invoking undefined behavior.
    fn decodeSlice(slice: []const u8) CodePointType {
        return switch (slice.len) {
            0 => zeroValue,
            1 => slice[0],
            2 => std.unicode.utf8Decode2(slice) catch @as(u21, unicode_replacement),
            3 => std.unicode.utf8Decode3(slice) catch @as(u21, unicode_replacement),
            4 => std.unicode.utf8Decode4(slice) catch @as(u21, unicode_replacement),
            else => unreachable,
        };
    }

    /// Scans forward until an unescaped `quote` or the end of the input.
    ///
    /// A backslash consumes the codepoint after it, so an escaped quote does
    /// not terminate the scan. Returns `iter.i + 1` when the quote is found and
    /// `iter.i` when the input runs out first.
    pub fn scanUntilQuotedValueOrEOF(iter: *Iterator, comptime quote: CodePointType) usize {
        while (true) {
            // End of input is detected from the empty slice: `zeroValue` is 0
            // and the codepoint type is unsigned, so `c` cannot signal it.
            const slice = iter.nextCodepointSlice();
            if (slice.len == 0) break;
            iter.c = decodeSlice(slice);

            switch (iter.c) {
                quote => return iter.i + 1,
                '\\' => {
                    const escaped = iter.nextCodepointSlice();
                    if (escaped.len == 0) break;
                    iter.c = decodeSlice(escaped);
                },
                else => {},
            }
        }

        return iter.i;
    }

    /// Advances the iterator by one codepoint, stores it in `c` and returns it.
    /// Returns `zeroValue` once iteration has stopped.
    pub fn nextCodepoint(it: *Iterator) CodePointType {
        it.c = decodeSlice(it.nextCodepointSlice());
        return it.c;
    }

    /// Look ahead at the next n codepoints without advancing the iterator.
    /// If fewer than n codepoints are available, then return the remainder of the string.
    pub fn peek(it: *Iterator, n: usize) []const u8 {
        // Snapshot the whole iterator: `nextCodepointSlice` mutates `i`,
        // `next_width` and `width`, and all of them must be restored.
        const saved = it.*;
        defer it.* = saved;

        const start = @min(saved.i + saved.next_width, it.bytes.len);
        var end = start;
        var found: usize = 0;
        while (found < n) : (found += 1) {
            const cp = it.nextCodepointSlice();
            if (cp.len == 0) return it.bytes[start..];
            end += cp.len;
        }

        return it.bytes[start..end];
    }
};
|
||||
|
||||
pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: comptime_int) type {
|
||||
return struct {
|
||||
const Iterator = @This();
|
||||
@@ -4497,6 +4641,8 @@ pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: co
|
||||
width: u3 = 0,
|
||||
c: CodePointType = zeroValue,
|
||||
|
||||
pub const ZeroValue = zeroValue;
|
||||
|
||||
pub const Cursor = struct {
|
||||
i: u32 = 0,
|
||||
c: CodePointType = zeroValue,
|
||||
|
||||
Reference in New Issue
Block a user