Add Bun.Glob (#6861)

* initial glob impl * Add `Bun.globMatch` * Glob boilerplate setup * Experiment with Rust glob implementation * Rust impl is slow revert * Setup glob walking * Basic glob walker working * Fix segfault * Recursive directory traversal * Fix glob match non-ascii * Make faster lil bit * use arena * ASCII fast path * Experiment with packed codepoint cursor Results in ~4% perf boost if the glob pattern needs to create/manipulate cursors (for example when the pattern uses braces) * Try converting to u32 array Made it pretty slow * Lazily create codepoint buffer * Different walk algorithm * Fast path optimizations * Add `dot` option to `Glob` * . * Fix some bugs * Fix bug, clean up lil bit * Windows fix * Non absolute paths * use specific version of fast-glob for benchmarks and tests * . * Fix some stuff * Fix more stuff * Add `hasPendingActivity()` to glob * accident * Symlinks * fast-glob e2e tests * remove * woops * Fix relative paths * Fix absolute * add test for `onlyFiles` * Fix invalid surrogate pairs problem * Rename: `match/matchSync` -> `scan/scanSync` and `matchString` -> `match` * forgot to close cwd fd * Update types * Add stress test * Port `micromatch` / `glob-match` / `globlin` tests * fix stale reference arena thing * stupid bug * Add builtins to classes code generator and add `Glob.scanIter()` * all iterables * generate fixtures, remove from git * fix test * Fix * woops on test * Fix stuff licenses license `has_pending_activity` to usize cwd threadSafe fix atomic compile errors `GlobWalker` own `cwd` Fix windows path and absolute test stuff * Fixes * Fix stuff * Use Syscall.close * Use private symbols for underlying scan functions to prevent misuse * Update types * Fix build for zig * Fix tests * Fix more tests * Prevent these tests from GC'ing too much * Make this benchmark work in Node and Bun * Fix memory leak * Add leak test * Fix windows * comment about arena allocator use for glob walker * Make leak test run in separate process * Iterator api for 
glob * GlobWalker.Iterator * fix leak test * Remove old impl * filter functions wip start * stuff * wip lockfile use glob * glob working with lockfile * revert lockfile changes * Update bun.lockb * Manually set to cwd to prevent test failing on linux CI --------- Co-authored-by: Jarred Sumner <jarred@jarredsumner.com> Co-authored-by: Jarred Sumner <709451+Jarred-Sumner@users.noreply.github.com>
2026-02-09 18:38:55 +00:00 · 2023-11-22 14:26:09 -08:00
parent 81067477dc
commit 6ba4e950cc
29 changed files with 7347 additions and 71 deletions
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -1743,9 +1743,12 @@ pub fn toUTF8ListWithType(list_: std.ArrayList(u8), comptime Type: type, utf16:
        const length = bun.simdutf.length.utf8.from.utf16.le(utf16);
        try list.ensureTotalCapacityPrecise(length + 16);
        const buf = try convertUTF16ToUTF8(list, Type, utf16);
-        if (Environment.allow_assert) {
-            std.debug.assert(buf.items.len == length);
-        }
+        // Assertion disabled: `convertUTF16ToUTF8` may emit WTF-8, which encodes
+        // an unpaired surrogate in 3 bytes, so the output length can differ from
+        // the UTF-8 length precomputed by simdutf above.
+        // if (Environment.allow_assert) {
+        //     std.debug.assert(buf.items.len == length);
+        // }
        return buf;
    }

@@ -4488,6 +4491,147 @@ pub inline fn utf8ByteSequenceLength(first_byte: u8) u3 {
    };
 }

+/// Code point iterator over a byte string, paired with a `Cursor` that packs
+/// into 64 bits (32-bit offset + 29-bit code point + 3-bit width).
+///
+/// Two independent iteration styles are offered:
+///   * `next` keeps all position state in a caller-owned `Cursor` and never
+///     mutates the iterator.
+///   * `nextCodepoint`, `peek` and `scanUntilQuotedValueOrEOF` advance the
+///     iterator's own `i` / `next_width` / `width` / `c` fields.
+pub const PackedCodepointIterator = struct {
+    const Iterator = @This();
+    const CodePointType = u32;
+    const zeroValue = 0;
+
+    /// The string being iterated; stored as-is (borrowed, never copied).
+    bytes: []const u8,
+    /// Byte offset of the code point most recently produced by the stateful API.
+    i: usize,
+    /// Byte width of the code point at `i`; `i + next_width` is the next unread byte.
+    next_width: usize = 0,
+    /// Byte width of the code point most recently produced by the stateful API.
+    width: u3 = 0,
+    /// Code point most recently decoded by `nextCodepoint`.
+    c: CodePointType = zeroValue,
+
+    pub const ZeroValue = zeroValue;
+
+    /// Position state for `next`. A default-initialized cursor starts at offset 0.
+    pub const Cursor = packed struct {
+        /// Byte offset of the current code point within `bytes`.
+        i: u32 = 0,
+        /// The current code point, truncated to 29 bits.
+        c: u29 = zeroValue,
+        /// Byte width of the current code point.
+        width: u3 = 0,
+        pub const CodePointType = u29;
+    };
+
+    /// Create an iterator positioned at the start of `str`.
+    pub fn init(str: string) Iterator {
+        return Iterator{ .bytes = str, .i = 0, .c = zeroValue };
+    }
+
+    /// Create an iterator positioned at byte offset `i` of `str`.
+    pub fn initOffset(str: string, i: usize) Iterator {
+        return Iterator{ .bytes = str, .i = i, .c = zeroValue };
+    }
+
+    /// Advance `cursor` to the code point that follows it.
+    ///
+    /// Returns `false` (leaving `cursor` untouched) at end of input or when
+    /// `wtf8ByteSequenceLength` reports a length of 0 for the lead byte.
+    /// A multi-byte sequence that fails to decode, or that does not fit in the
+    /// remaining bytes, is reported as `unicode_replacement` with a width of 1.
+    pub inline fn next(it: *const Iterator, cursor: *Cursor) bool {
+        const pos: u32 = @as(u32, cursor.width) + cursor.i;
+        if (pos >= it.bytes.len) {
+            return false;
+        }
+
+        const cp_len = wtf8ByteSequenceLength(it.bytes[pos]);
+        // `minInt(u32)` is 0, which collides with a literal NUL byte and would
+        // turn it into U+FFFD. Use a sentinel no decoded code point can equal.
+        // NOTE(review): assumes `decodeWTF8RuneTMultibyte` returns its last
+        // argument on malformed input; confirm against its definition.
+        const error_char = comptime std.math.maxInt(CodePointType);
+        const remaining = it.bytes.len - pos;
+
+        const codepoint = @as(
+            CodePointType,
+            switch (cp_len) {
+                0 => return false,
+                1 => it.bytes[pos],
+                // The decoder is handed a 4-byte window through a raw pointer,
+                // so make sure the declared sequence really fits in the slice
+                // before letting it read the continuation bytes.
+                else => if (cp_len > remaining)
+                    error_char
+                else
+                    decodeWTF8RuneTMultibyte(it.bytes[pos..].ptr[0..4], cp_len, CodePointType, error_char),
+            },
+        );
+
+        {
+            @setRuntimeSafety(false);
+            cursor.* = Cursor{
+                .i = pos,
+                .c = if (error_char != codepoint)
+                    @truncate(codepoint)
+                else
+                    unicode_replacement,
+                .width = if (codepoint != error_char) cp_len else 1,
+            };
+        }
+
+        return true;
+    }
+
+    /// Step to the next code point and return its bytes.
+    ///
+    /// Returns `null` without touching any state at end of input. Returns
+    /// `null` with `next_width` and `width` set to 0 when the lead byte is
+    /// invalid (`utf8ByteSequenceLength` yields 0) or the sequence is cut off
+    /// by the end of `bytes`; the iterator then stays parked on that byte.
+    inline fn nextCodepointSliceOrNull(it: *Iterator) ?[]const u8 {
+        const bytes = it.bytes;
+        const start = it.i + it.next_width;
+        if (bytes.len <= start) return null;
+
+        const declared_len = utf8ByteSequenceLength(bytes[start]);
+        // Treat a truncated trailing sequence like an invalid lead byte rather
+        // than slicing past the end of `bytes`.
+        const cp_len: u3 = if (declared_len > bytes.len - start) 0 else declared_len;
+
+        it.next_width = cp_len;
+        it.i = start;
+        it.width = cp_len;
+
+        if (cp_len == 0) return null;
+        // Slice from the new position, not the previous one, so every call
+        // returns the bytes of the code point it just stepped onto.
+        return bytes[start..][0..cp_len];
+    }
+
+    /// Same as `nextCodepointSliceOrNull`, but signals "nothing to return"
+    /// with an empty slice.
+    inline fn nextCodepointSlice(it: *Iterator) []const u8 {
+        return it.nextCodepointSliceOrNull() orelse "";
+    }
+
+    /// Decode one code point from `slice`, whose length was chosen from its
+    /// lead byte. An empty slice decodes to `zeroValue`; a sequence the
+    /// standard library rejects decodes to `unicode_replacement` instead of
+    /// invoking undefined behavior through `catch unreachable`.
+    fn decodeCodepointSlice(slice: []const u8) CodePointType {
+        return switch (slice.len) {
+            0 => zeroValue,
+            1 => @as(CodePointType, slice[0]),
+            2 => @as(CodePointType, std.unicode.utf8Decode2(slice) catch return unicode_replacement),
+            3 => @as(CodePointType, std.unicode.utf8Decode3(slice) catch return unicode_replacement),
+            4 => @as(CodePointType, std.unicode.utf8Decode4(slice) catch return unicode_replacement),
+            else => unreachable,
+        };
+    }
+
+    /// Report whether `slice` contains a multi-byte sequence.
+    ///
+    /// Scanning stops at end of input, at an invalid lead byte, or at a
+    /// truncated trailing sequence; all three yield `false` if no multi-byte
+    /// sequence was seen before that point.
+    pub fn needsUTF8Decoding(slice: string) bool {
+        var it = Iterator{ .bytes = slice, .i = 0 };
+
+        while (it.nextCodepointSliceOrNull()) |part| {
+            if (part.len > 1) return true;
+        }
+
+        return false;
+    }
+
+    /// Advance until an unescaped `quote` code point is found or input ends.
+    ///
+    /// A backslash consumes the code point that follows it, so an escaped
+    /// quote does not terminate the scan. Returns the byte offset of the quote
+    /// plus one when found; otherwise returns the iterator's final `i` and
+    /// leaves `c` set to `ZeroValue`.
+    pub fn scanUntilQuotedValueOrEOF(iter: *Iterator, comptime quote: CodePointType) usize {
+        // End of input is detected from the slice itself: `c` is unsigned, so
+        // it cannot carry a negative "no more input" marker.
+        while (iter.nextCodepointSliceOrNull()) |slice| {
+            iter.c = decodeCodepointSlice(slice);
+
+            if (iter.c == quote) {
+                return iter.i + 1;
+            }
+
+            if (iter.c == '\\') {
+                // Skip whatever is escaped, including an escaped quote.
+                const escaped = iter.nextCodepointSliceOrNull() orelse break;
+                iter.c = decodeCodepointSlice(escaped);
+            }
+        }
+
+        iter.c = zeroValue;
+        return iter.i;
+    }
+
+    /// Advance by one code point, store it in `c`, and return it.
+    /// Returns `ZeroValue` when there is nothing left to decode.
+    pub fn nextCodepoint(it: *Iterator) CodePointType {
+        it.c = decodeCodepointSlice(it.nextCodepointSlice());
+        return it.c;
+    }
+
+    /// Look ahead at the next n codepoints without advancing the iterator.
+    /// If fewer than n codepoints are available, then return the remainder of the string.
+    pub fn peek(it: *Iterator, n: usize) []const u8 {
+        // Snapshot every field: stepping mutates `i`, `next_width` and `width`,
+        // and all of them must be put back for the lookahead to be invisible.
+        const saved = it.*;
+        defer it.* = saved;
+
+        // `i` marks the start of the code point already returned, so the
+        // lookahead window begins at the first unread byte.
+        const start = @min(saved.i + saved.next_width, saved.bytes.len);
+        var end = start;
+        var found: usize = 0;
+        while (found < n) : (found += 1) {
+            const next_slice = it.nextCodepointSliceOrNull() orelse return it.bytes[start..];
+            end += next_slice.len;
+        }
+
+        return it.bytes[start..end];
+    }
+};
+
 pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: comptime_int) type {
    return struct {
        const Iterator = @This();
@@ -4497,6 +4641,8 @@ pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: co
        width: u3 = 0,
        c: CodePointType = zeroValue,

+        pub const ZeroValue = zeroValue;
+
        pub const Cursor = struct {
            i: u32 = 0,
            c: CodePointType = zeroValue,