Add Bun.Glob (#6861)

* initial glob impl

* Add `Bun.globMatch`

* Glob boilerplate setup

* Experiment with Rust glob implementation

* Rust impl is slow revert

* Setup glob walking

* Basic glob walker working

* Fix segfault

* Recursive directory traversal

* Fix glob match non-ascii

* Make faster lil bit

* use arena

* ASCII fast path

* Experiment with packed codepoint cursor

Results in ~4% perf boost if the glob pattern needs to create/manipulate cursors (for example when the pattern uses braces)

* Try converting to u32 array

Made it pretty slow

* Lazily create codepoint buffer

* Different walk algorithm

* Fast path optimizations

* Add `dot` option to `Glob`

* .

* Fix some bugs

* Fix bug, clean up lil bit

* Windows fix

* Non absolute paths

* use specific version of fast-glob for benchmarks and tests

* .

* Fix some stuff

* Fix more stuff

* Add `hasPendingActivity()` to glob

* accident

* Symlinks

* fast-glob e2e tests

* remove

* woops

* Fix relative paths

* Fix absolute

* add test for `onlyFiles`

* Fix invalid surrogate pairs problem

* Rename: `match/matchSync` -> `scan/scanSync` and `matchString` -> `match`

* forgot to close cwd fd

* Update types

* Add stress test

* Port `micromatch` / `glob-match` / `globlin` tests

* fix stale reference arena thing

* stupid bug

* Add builtins to classes code generator and add `Glob.scanIter()`

* all iterables

* generate fixtures, remove from git

* fix test

* Fix

* woops on test

* Fix stuff

licenses

license

`has_pending_activity` to usize

cwd threadSafe fix atomic compile errors

`GlobWalker` own `cwd`

Fix windows path and absolute test

stuff

* Fixes

* Fix stuff

* Use Syscall.close

* Use private symbols for underlying scan functions to prevent misuse

* Update types

* Fix build for zig

* Fix tests

* Fix more tests

* Prevent these tests from GC'ing too much

* Make this benchmark work in Node and Bun

* Fix memory leak

* Add leak test

* Fix windows

* comment about arena allocator use for glob walker

* Make leak test run in separate process

* Iterator api for glob

* GlobWalker.Iterator

* fix leak test

* Remove old impl

* filter functions wip start

* stuff

* wip lockfile use glob

* glob working with lockfile

* revert lockfile changes

* Update bun.lockb

* Manually set to cwd to prevent test failing on linux CI

---------

Co-authored-by: Jarred Sumner <jarred@jarredsumner.com>
Co-authored-by: Jarred Sumner <709451+Jarred-Sumner@users.noreply.github.com>
This commit is contained in:
Zack Radisic
2023-11-22 14:26:09 -08:00
committed by GitHub
parent 81067477dc
commit 6ba4e950cc
29 changed files with 7347 additions and 71 deletions

View File

@@ -1743,9 +1743,12 @@ pub fn toUTF8ListWithType(list_: std.ArrayList(u8), comptime Type: type, utf16:
const length = bun.simdutf.length.utf8.from.utf16.le(utf16);
try list.ensureTotalCapacityPrecise(length + 16);
const buf = try convertUTF16ToUTF8(list, Type, utf16);
if (Environment.allow_assert) {
std.debug.assert(buf.items.len == length);
}
// Commenting out because `convertUTF16ToUTF8` may convert to WTF-8
// which uses 3 bytes for invalid surrogates, causing the length to not
// match from simdutf.
// if (Environment.allow_assert) {
// std.debug.assert(buf.items.len == length);
// }
return buf;
}
@@ -4488,6 +4491,147 @@ pub inline fn utf8ByteSequenceLength(first_byte: u8) u3 {
};
}
/// Codepoint iterator over a byte string, with a `Cursor` that packs the byte
/// offset, the decoded codepoint and the encoded width into 64 bits.
///
/// Two independent iteration styles are offered:
///   * `next(cursor)` keeps all position state in the caller-owned `Cursor`
///     and never mutates the iterator itself.
///   * `nextCodepoint` / `peek` / `scanUntilQuotedValueOrEOF` /
///     `needsUTF8Decoding` use the iterator's own `i` / `next_width` fields.
///     For these, `i` is the offset of the most recently returned codepoint
///     and `i + next_width` is the offset of the next one to read.
pub const PackedCodepointIterator = struct {
    const Iterator = @This();
    const CodePointType = u32;
    const zeroValue = 0;

    bytes: []const u8,
    i: usize,
    next_width: usize = 0,
    width: u3 = 0,
    c: CodePointType = zeroValue,

    pub const ZeroValue = zeroValue;

    /// Position + decoded codepoint, packed into 32 + 29 + 3 = 64 bits.
    pub const Cursor = packed struct {
        /// Byte offset of the codepoint inside `bytes`.
        i: u32 = 0,
        /// Decoded codepoint.
        c: u29 = zeroValue,
        /// Number of bytes the codepoint occupies in `bytes`.
        width: u3 = 0,
        pub const CodePointType = u29;
    };

    /// Creates an iterator positioned at the start of `str`.
    pub fn init(str: string) Iterator {
        return Iterator{ .bytes = str, .i = 0, .c = zeroValue };
    }

    /// Creates an iterator positioned at byte offset `i` of `str`.
    pub fn initOffset(str: string, i: usize) Iterator {
        return Iterator{ .bytes = str, .i = i, .c = zeroValue };
    }

    /// Advances `cursor` to the codepoint that follows it.
    ///
    /// Returns `false` (leaving `cursor` untouched) at the end of `bytes` or
    /// when `wtf8ByteSequenceLength` reports a length of 0. A multibyte
    /// sequence that fails to decode is reported as `unicode_replacement`
    /// with a width of 1, so iteration resumes at the following byte.
    pub inline fn next(it: *const Iterator, cursor: *Cursor) bool {
        const pos: u32 = @as(u32, cursor.width) + cursor.i;
        if (pos >= it.bytes.len) {
            return false;
        }

        const cp_len = wtf8ByteSequenceLength(it.bytes[pos]);
        switch (cp_len) {
            0 => return false,
            1 => {
                // Single-byte codepoints are stored as-is. They are handled
                // before the error sentinel is involved because the sentinel
                // below is 0, and a literal NUL byte must not be mistaken for
                // a decode failure.
                cursor.* = Cursor{ .i = pos, .c = it.bytes[pos], .width = 1 };
                return true;
            },
            else => {},
        }

        // `minInt(u32)` is 0. It is only compared against the result of a
        // multibyte decode.
        // NOTE(review): presumably the decoder never yields 0 for a valid
        // multibyte sequence — confirm against decodeWTF8RuneTMultibyte.
        const error_char = comptime std.math.minInt(CodePointType);
        // NOTE(review): this hands the decoder a fixed 4-byte window starting
        // at `pos`, which may extend past `bytes.len` — confirm the decoder
        // only touches the first `cp_len` bytes or that callers pad the buffer.
        const codepoint: CodePointType = decodeWTF8RuneTMultibyte(it.bytes[pos..].ptr[0..4], cp_len, CodePointType, error_char);

        {
            @setRuntimeSafety(false);
            cursor.* = Cursor{
                .i = pos,
                .c = if (error_char != codepoint)
                    @truncate(codepoint)
                else
                    unicode_replacement,
                .width = if (codepoint != error_char) cp_len else 1,
            };
        }

        return true;
    }

    /// Moves the iterator onto the next codepoint and returns its bytes.
    ///
    /// Returns an empty slice at the end of `bytes`, or when
    /// `utf8ByteSequenceLength` reports 0 for the lead byte. If the lead byte
    /// announces more bytes than remain, the slice is clamped to the end of
    /// `bytes`; `next_width` still holds the announced length, so callers can
    /// detect the truncation via `slice.len < next_width`.
    inline fn nextCodepointSlice(it: *Iterator) []const u8 {
        const bytes = it.bytes;
        const start = it.i + it.next_width;
        if (bytes.len <= start) return "";

        const cp_len = utf8ByteSequenceLength(bytes[start]);
        it.next_width = cp_len;
        it.i = start;

        // Slice from the codepoint just stepped onto (not from the previous
        // one), and never past the end of the buffer.
        const end = @min(start + cp_len, bytes.len);
        const slice = bytes[start..end];
        it.width = @as(u3, @intCast(slice.len));
        return slice;
    }

    /// Returns `true` as soon as a lead byte announcing a multi-byte sequence
    /// is found. Returns `false` when the end of `slice` is reached first, or
    /// when a byte whose sequence length is reported as 0 is hit.
    pub fn needsUTF8Decoding(slice: string) bool {
        var it = Iterator{ .bytes = slice, .i = 0 };
        while (true) {
            const part = it.nextCodepointSlice();
            if (part.len == 0) return false;
            // Check the announced length rather than `part.len` so that a
            // truncated trailing sequence still counts as multi-byte.
            if (it.next_width > 1) return true;
        }
    }

    /// Scans forward for the first `quote` that is not preceded by a
    /// backslash.
    ///
    /// Returns `iter.i + 1` when such a quote is found, otherwise `iter.i`
    /// once the input is exhausted.
    pub fn scanUntilQuotedValueOrEOF(iter: *Iterator, comptime quote: CodePointType) usize {
        // `c` is unsigned here, so end-of-input cannot be signalled through a
        // negative codepoint; test the remaining byte count instead.
        while (iter.i + iter.next_width < iter.bytes.len) {
            const c = iter.nextCodepoint();
            // A sequence length of 0 leaves the position unchanged; stop
            // rather than spin on the same byte forever.
            if (iter.next_width == 0) break;

            switch (c) {
                quote => return iter.i + 1,
                // Consume whatever follows the backslash (an escaped quote
                // included) so it is not inspected again.
                '\\' => {
                    _ = iter.nextCodepoint();
                },
                else => {},
            }
        }
        return iter.i;
    }

    /// Advances to the next codepoint, stores it in `c` and returns it.
    ///
    /// Returns `zeroValue` when no codepoint could be read (see
    /// `nextCodepointSlice`). Truncated or undecodable sequences yield
    /// `unicode_replacement` instead of invoking undefined behaviour.
    pub fn nextCodepoint(it: *Iterator) CodePointType {
        const slice = it.nextCodepointSlice();
        it.c = if (slice.len != 0 and slice.len < it.next_width)
            // The lead byte announced more bytes than the input contains.
            unicode_replacement
        else switch (slice.len) {
            0 => zeroValue,
            1 => @as(CodePointType, @intCast(slice[0])),
            2 => @as(CodePointType, @intCast(std.unicode.utf8Decode2(slice) catch unicode_replacement)),
            3 => @as(CodePointType, @intCast(std.unicode.utf8Decode3(slice) catch unicode_replacement)),
            4 => @as(CodePointType, @intCast(std.unicode.utf8Decode4(slice) catch unicode_replacement)),
            else => unreachable,
        };

        return it.c;
    }

    /// Look ahead at the next n codepoints without advancing the iterator.
    /// If fewer than n codepoints are available, then return the remainder of the string.
    pub fn peek(it: *Iterator, n: usize) []const u8 {
        // Snapshot every field: `nextCodepointSlice` updates `i`,
        // `next_width` and `width`, and all of them must be rolled back.
        const saved = it.*;
        defer it.* = saved;

        // The next unread codepoint starts right after the current one.
        const start = @min(saved.i + saved.next_width, saved.bytes.len);
        var end_ix = start;
        var found: usize = 0;
        while (found < n) : (found += 1) {
            const next_codepoint = it.nextCodepointSlice();
            if (next_codepoint.len == 0) return saved.bytes[start..];
            end_ix += next_codepoint.len;
        }

        return saved.bytes[start..end_ix];
    }
};
pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: comptime_int) type {
return struct {
const Iterator = @This();
@@ -4497,6 +4641,8 @@ pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: co
width: u3 = 0,
c: CodePointType = zeroValue,
pub const ZeroValue = zeroValue;
pub const Cursor = struct {
i: u32 = 0,
c: CodePointType = zeroValue,