Files
bun.sh/src/glob.zig
2024-05-31 00:03:57 -07:00

2189 lines
93 KiB
Zig

// Portions of this file are derived from works under the MIT License:
//
// Copyright (c) 2023 Devon Govett
// Copyright (c) 2023 Stephen Gregoratto
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
const std = @import("std");
const bun = @import("root").bun;
const eqlComptime = @import("./string_immutable.zig").eqlComptime;
const expect = std.testing.expect;
const isAllAscii = @import("./string_immutable.zig").isAllASCII;
const math = std.math;
const mem = std.mem;
const isWindows = @import("builtin").os.tag == .windows;
const Allocator = std.mem.Allocator;
const Arena = std.heap.ArenaAllocator;
const ArrayList = std.ArrayListUnmanaged;
const ArrayListManaged = std.ArrayList;
const BunString = bun.String;
const C = @import("./c.zig");
const CodepointIterator = @import("./string_immutable.zig").PackedCodepointIterator;
const Codepoint = CodepointIterator.Cursor.CodePointType;
const Dirent = @import("./bun.js/node/types.zig").Dirent;
const DirIterator = @import("./bun.js/node/dir_iterator.zig");
const EntryKind = @import("./bun.js/node/types.zig").Dirent.Kind;
const GlobAscii = @import("./glob_ascii.zig");
const JSC = bun.JSC;
const Maybe = JSC.Maybe;
const PathLike = @import("./bun.js/node/types.zig").PathLike;
const PathString = @import("./string_types.zig").PathString;
const ResolvePath = @import("./resolver/resolve_path.zig");
const Syscall = bun.sys;
const ZigString = @import("./bun.js/bindings/bindings.zig").ZigString;
// const Codepoint = u32;
const Cursor = CodepointIterator.Cursor;
const log = bun.Output.scoped(.Glob, false);
/// A position inside a pattern that is walked one codepoint at a time with a
/// `CodepointIterator`.
const CursorState = struct {
    cursor: CodepointIterator.Cursor = .{},

    /// Builds a state from a default cursor that has been advanced once with
    /// `iterator.next`. The boolean returned by that call is ignored.
    fn init(iterator: *const CodepointIterator) CursorState {
        var state = CursorState{};
        _ = iterator.next(&state.cursor);
        return state;
    }

    /// Returns the state one codepoint ahead without modifying `this`.
    ///
    /// NOTE: If there is no next codepoint (`iterator.next` reports failure),
    /// the returned cursor has `c` set to `CodepointIterator.ZeroValue`,
    /// `width` set to 1 and `i` moved forward by the previous width, which
    /// is intended to put `i` at or past the end of the source bytes.
    fn peek(this: *const CursorState, iterator: *const CodepointIterator) CursorState {
        var ahead = this.*;
        ahead.bump(iterator);
        return ahead;
    }

    /// Advances this state by one codepoint in place. When the iterator has
    /// nothing more to yield the cursor is parked past the current codepoint
    /// with a zero-value codepoint of width 1.
    fn bump(this: *CursorState, iterator: *const CodepointIterator) void {
        if (iterator.next(&this.cursor)) return;

        this.cursor.i += this.cursor.width;
        this.cursor.width = 1;
        this.cursor.c = CodepointIterator.ZeroValue;
    }

    /// Moves the cursor forward by `i` and installs `nextCp` as the current
    /// codepoint with width 1, without consulting the iterator.
    inline fn manualBumpAscii(this: *CursorState, i: u32, nextCp: Codepoint) void {
        this.cursor.width = 1;
        this.cursor.c = nextCp;
        this.cursor.i += i;
    }

    /// Like `manualBumpAscii`, but returns the resulting state instead of
    /// mutating `this`.
    inline fn manualPeekAscii(this: *CursorState, i: u32, nextCp: Codepoint) CursorState {
        const ahead = CodepointIterator.Cursor{
            .i = this.cursor.i + i,
            .c = @truncate(nextCp),
            .width = 1,
        };
        return .{ .cursor = ahead };
    }
};
/// The walker instantiated with no ignore filter (`null`), `SyscallAccessor`
/// as the accessor, and `sentinel == false`.
pub const BunGlobWalker = GlobWalker_(null, SyscallAccessor, false);
/// Filter that accepts every input: always returns `true`, ignoring the bytes
/// it is given.
fn dummyFilterTrue(_: []const u8) bool {
    return true;
}
/// Filter that rejects every input: always returns `false`, ignoring the
/// bytes it is given.
fn dummyFilterFalse(_: []const u8) bool {
    return false;
}
/// Windows-only stand-in for a stat relative to a directory handle: resolves
/// the path of `fd`, joins `path` onto it and stats the joined path.
///
/// Referencing this function on a non-Windows target is a compile error.
/// Returns the error from `Syscall.getFdPath` if the handle's path cannot be
/// resolved, otherwise whatever `Syscall.stat` returns.
pub fn statatWindows(fd: bun.FileDescriptor, path: [:0]const u8) Maybe(bun.Stat) {
    if (comptime !bun.Environment.isWindows) @compileError("oi don't use this");

    var buf: bun.PathBuffer = undefined;
    const dir_path = switch (Syscall.getFdPath(fd, &buf)) {
        .result => |resolved| resolved,
        .err => |e| return .{ .err = e },
    };

    // The join writes its output into the same buffer that backs `dir_path`.
    const parts: []const []const u8 = &.{ dir_path, path };
    const joined = ResolvePath.joinZBuf(&buf, parts, .auto);
    return Syscall.stat(joined);
}
/// Accessor that talks to the file system through `bun.sys` calls on file
/// descriptors.
pub const SyscallAccessor = struct {
    /// Handles are real descriptors, so the iterator tracks how many are open.
    const count_fds = true;

    /// Flags used for every directory open performed by this accessor.
    const dir_open_flags = std.os.O.DIRECTORY | std.os.O.RDONLY;

    /// Thin wrapper around a `bun.FileDescriptor`.
    const Handle = struct {
        value: bun.FileDescriptor,

        const zero = Handle{ .value = bun.FileDescriptor.zero };

        /// True when this handle wraps `bun.FileDescriptor.zero`.
        pub fn isZero(this: Handle) bool {
            return this.eql(zero);
        }

        /// Two handles are equal when their descriptors compare equal.
        pub fn eql(this: Handle, other: Handle) bool {
            return this.value == other.value;
        }
    };

    /// Directory iterator backed by `DirIterator.WrappedIterator`.
    const DirIter = struct {
        value: DirIterator.WrappedIterator,

        /// Starts iterating the directory referred to by `dir`.
        pub inline fn iterate(dir: Handle) DirIter {
            const wrapped = DirIterator.WrappedIterator.init(dir.value.asDir());
            return .{ .value = wrapped };
        }

        /// Forwards to the wrapped iterator's `next`.
        pub inline fn next(self: *DirIter) Maybe(?DirIterator.IteratorResult) {
            return self.value.next();
        }
    };

    /// Opens `path` as a read-only directory.
    pub fn open(path: [:0]const u8) !Maybe(Handle) {
        const opened = Syscall.open(path, dir_open_flags, 0);
        return switch (opened) {
            .result => |fd| .{ .result = Handle{ .value = fd } },
            .err => |e| .{ .err = e },
        };
    }

    /// Opens `path` relative to `handle` as a read-only directory.
    pub fn openat(handle: Handle, path: [:0]const u8) !Maybe(Handle) {
        const opened = Syscall.openat(handle.value, path, dir_open_flags, 0);
        return switch (opened) {
            .result => |fd| .{ .result = Handle{ .value = fd } },
            .err => |e| .{ .err = e },
        };
    }

    /// Stats `path` relative to `handle`. On Windows this is delegated to
    /// `statatWindows`; elsewhere it uses `Syscall.fstatat`.
    pub fn statat(handle: Handle, path: [:0]const u8) Maybe(bun.Stat) {
        if (comptime bun.Environment.isWindows) return statatWindows(handle.value, path);

        const outcome = Syscall.fstatat(handle.value, path);
        return switch (outcome) {
            .result => |st| .{ .result = st },
            .err => |e| .{ .err = e },
        };
    }

    /// Closes the descriptor; returns the error from `Syscall.close`, if any.
    pub fn close(handle: Handle) ?Syscall.Error {
        return Syscall.close(handle.value);
    }

    /// Forwards to `Syscall.getcwd` using `path_buf` as storage.
    pub fn getcwd(path_buf: *bun.PathBuffer) Maybe([]const u8) {
        return Syscall.getcwd(path_buf);
    }
};
/// Accessor that answers directory queries from `bun.fs.FileSystem`'s
/// directory entries (`FS.DirEntry`) instead of raw file descriptors.
pub const DirEntryAccessor = struct {
    const FS = bun.fs.FileSystem;

    /// Handles are pointers to directory entries, not descriptors, so there
    /// is nothing to count.
    const count_fds = false;

    /// Optional pointer to a directory entry; `null` plays the role of the
    /// zero handle.
    const Handle = struct {
        value: ?*FS.DirEntry,

        const zero = Handle{ .value = null };

        /// True when no directory entry is attached.
        pub fn isZero(this: Handle) bool {
            return this.value == null;
        }

        /// Compares the entry pointers of the two handles.
        pub fn eql(this: Handle, other: Handle) bool {
            // TODO this might not be quite right, we're comparing pointers, not the underlying directory
            // On the other hand, DirEntries are only ever created once (per generation), so this should be fine?
            // Realistically, as closing the handle is a no-op, this should be fine either way.
            return this.value == other.value;
        }
    };

    /// Iterates the entry map of a directory entry. A `null` iterator (made
    /// from the zero handle) yields nothing.
    const DirIter = struct {
        value: ?FS.DirEntry.EntryMap.Iterator,

        /// One iteration result: the entry's name and its kind.
        const IterResult = struct {
            name: NameWrapper,
            kind: std.fs.File.Kind,

            /// Wraps the name so callers can use `.slice()` on it.
            const NameWrapper = struct {
                value: []const u8,

                pub fn slice(this: NameWrapper) []const u8 {
                    return this.value;
                }
            };
        };

        /// Returns the next entry, or a `null` result once the map is
        /// exhausted or when there is no underlying iterator. This
        /// implementation never produces an `.err`.
        pub inline fn next(self: *DirIter) Maybe(?IterResult) {
            if (self.value) |*value| {
                const nextval = value.next() orelse return .{ .result = null };
                const name = nextval.key_ptr.*;
                const kind = nextval.value_ptr.*.kind(&FS.instance.fs, true);
                // Entries are either files or directories here; map them onto
                // the std kind the walker switches on.
                const fskind = switch (kind) {
                    .file => std.fs.File.Kind.file,
                    .dir => std.fs.File.Kind.directory,
                };
                return .{
                    .result = .{
                        .name = IterResult.NameWrapper{ .value = name },
                        .kind = fskind,
                    },
                };
            } else {
                return .{ .result = null };
            }
        }

        /// Starts iterating the entries of `dir`; the zero handle produces an
        /// iterator that is immediately exhausted.
        pub inline fn iterate(dir: Handle) DirIter {
            const entry = dir.value orelse return DirIter{ .value = null };
            return .{ .value = entry.data.iterator() };
        }
    };

    /// Stats `path_`. A relative path is first joined onto the handle's
    /// directory (`entry.dir`) when the handle is non-zero; the stat itself
    /// is a real `Syscall.stat`.
    pub fn statat(handle: Handle, path_: [:0]const u8) Maybe(bun.Stat) {
        var path: [:0]const u8 = path_;
        var buf: bun.PathBuffer = undefined;
        if (!bun.path.Platform.auto.isAbsolute(path)) {
            if (handle.value) |entry| {
                const slice = bun.path.joinStringBuf(&buf, [_][]const u8{ entry.dir, path }, .auto);
                // NUL-terminate the joined path inside `buf` so it can be
                // handed to the syscall.
                buf[slice.len] = 0;
                path = buf[0..slice.len :0];
            }
        }
        return Syscall.stat(path);
    }

    /// Opens `path` with no base directory; equivalent to
    /// `openat(Handle.zero, path)`.
    pub fn open(path: [:0]const u8) !Maybe(Handle) {
        return openat(Handle.zero, path);
    }

    /// Resolves `path_` (relative paths are joined onto the handle's
    /// directory when the handle is non-zero) and reads that directory
    /// through `FS.instance.fs.readDirectory`.
    ///
    /// Failures are returned as Zig errors rather than through the `.err`
    /// arm of the `Maybe`.
    pub fn openat(handle: Handle, path_: [:0]const u8) !Maybe(Handle) {
        var path: []const u8 = path_;
        var buf: bun.PathBuffer = undefined;
        if (!bun.path.Platform.auto.isAbsolute(path)) {
            if (handle.value) |entry| {
                path = bun.path.joinStringBuf(&buf, [_][]const u8{ entry.dir, path }, .auto);
            }
        }
        // TODO do we want to propagate ENOTDIR through the 'Maybe' to match the SyscallAccessor?
        // The glob implementation specifically checks for this error when dealing with symlinks
        // return .{ .err = Syscall.Error.fromCode(bun.C.E.NOTDIR, Syscall.Tag.open) };
        const res = FS.instance.fs.readDirectory(path, null, 0, false) catch |err| {
            return err;
        };
        switch (res.*) {
            .entries => |entry| {
                return .{ .result = Handle{ .value = entry } };
            },
            .err => |err| {
                return err.original_err;
            },
        }
    }

    /// Closing a directory-entry handle does nothing and never fails.
    pub inline fn close(handle: Handle) ?Syscall.Error {
        // TODO is this a noop?
        _ = handle;
        return null;
    }

    /// Copies the file system's recorded cwd into `path_buf` and returns the
    /// copied bytes as a slice of `path_buf`.
    ///
    /// Only the first `cwd.len` bytes of the buffer are written; the rest of
    /// the buffer is left as it was.
    pub fn getcwd(path_buf: *bun.PathBuffer) Maybe([]const u8) {
        const cwd = FS.instance.fs.cwd;
        // `@memcpy` requires both operands to have the same length, so copy
        // into a prefix of the buffer rather than the whole buffer.
        @memcpy(path_buf[0..cwd.len], cwd);
        return .{ .result = path_buf[0..cwd.len] };
    }
};
pub fn GlobWalker_(
comptime ignore_filter_fn: ?*const fn ([]const u8) bool,
comptime Accessor: type,
comptime sentinel: bool,
) type {
const is_ignored: *const fn ([]const u8) bool = if (comptime ignore_filter_fn) |func| func else dummyFilterFalse;
const count_fds = Accessor.count_fds and bun.Environment.isDebug;
const stdJoin = comptime if (!sentinel) std.fs.path.join else std.fs.path.joinZ;
const bunJoin = comptime if (!sentinel) ResolvePath.join else ResolvePath.joinZ;
const MatchedPath = comptime if (!sentinel) []const u8 else [:0]const u8;
return struct {
const GlobWalker = @This();
pub const Result = Maybe(void);
arena: Arena = undefined,
/// not owned by this struct
pattern: []const u8 = "",
pattern_codepoints: []u32 = &[_]u32{},
cp_len: u32 = 0,
/// If the pattern contains "./" or "../"
has_relative_components: bool = false,
end_byte_of_basename_excluding_special_syntax: u32 = 0,
basename_excluding_special_syntax_component_idx: u32 = 0,
patternComponents: ArrayList(Component) = .{},
matchedPaths: MatchedMap = .{},
i: u32 = 0,
dot: bool = false,
absolute: bool = false,
cwd: []const u8 = "",
follow_symlinks: bool = false,
error_on_broken_symlinks: bool = false,
only_files: bool = true,
pathBuf: bun.PathBuffer = undefined,
// iteration state
workbuf: ArrayList(WorkItem) = ArrayList(WorkItem){},
/// Array hashmap used as a set (values are the keys)
/// to store matched paths and prevent duplicates
///
/// BunString is used so that we can call BunString.toJSArray()
/// on the result of `.keys()` to give the result back to JS
///
/// The only type of string impl we use is ZigString since
/// all matched paths are UTF-8 (DirIterator converts them on
/// windows) and allocated on the arena
///
/// Multiple patterns are not supported so right now this is
/// only possible when running a pattern like:
///
/// `foo/**/*`
///
/// Use `.keys()` to get the matched paths
const MatchedMap = std.ArrayHashMapUnmanaged(BunString, void, struct {
    /// Hashes the bytes of a matched path. When `sentinel` is set the last
    /// byte of the stored slice is treated as the terminating NUL and is left
    /// out of the hash.
    pub fn hash(_: @This(), this: BunString) u32 {
        bun.assert(this.tag == .ZigString);
        const bytes = this.byteSlice();
        const hashed: []const u8 = if (comptime sentinel)
            bytes[0 .. bytes.len - 1 :0]
        else
            bytes;
        return std.array_hash_map.hashString(hashed);
    }

    /// Keys are equal when `BunString.eql` says so; the index argument is
    /// unused.
    pub fn eql(_: @This(), lhs: BunString, rhs: BunString, _: usize) bool {
        return lhs.eql(rhs);
    }
}, true);
/// State machine driving `Iterator.next`.
///
/// The glob walker references the .directory.path so it's not safe to
/// copy/move this
const IterState = union(enum) {
/// Pops the next item off the work stack
get_next,
/// Currently iterating over a directory
directory: Directory,
/// Two particular cases where this is used:
///
/// 1. A pattern with no special glob syntax was supplied, for example: `/Users/zackradisic/foo/bar`
///
/// In that case, the mere existence of the file/dir counts as a match, so we can eschew directory
/// iterating and walking for a simple stat call to the path.
///
/// 2. Pattern ending in literal optimization
///
/// With a pattern like: `packages/**/package.json`, once the iteration component index reaches
/// the final component, which is a literal string ("package.json"), we can similarly make a
/// single stat call to complete the pattern.
matched: MatchedPath,
/// Everything needed to keep iterating one directory.
const Directory = struct {
/// Handle of the directory being iterated.
fd: Accessor.Handle,
/// Iterator created from `fd`.
iter: Accessor.DirIter,
/// Backing storage for `dir_path`.
path: bun.PathBuffer,
/// NUL-terminated path of the directory; a slice into `path`.
dir_path: [:0]const u8,
/// Index into the walker's `patternComponents` that entries are matched against.
component_idx: u32,
/// The component at `component_idx`.
pattern: *Component,
/// The component after `pattern`, or null when `pattern` is the last one.
next_pattern: ?*Component,
/// True when `component_idx` refers to the last pattern component.
is_last: bool,
/// Set once iteration has ended (exhausted or failed) and the handle was released.
iter_closed: bool = false,
/// NOTE(review): `transitionToDirIterState` sets this when it reuses the cwd handle but
/// unconditionally resets it to false before iterating — confirm whether that is intended.
at_cwd: bool = false,
};
};
pub const Iterator = struct {
walker: *GlobWalker,
iter_state: IterState = .get_next,
cwd_fd: Accessor.Handle = Accessor.Handle.zero,
empty_dir_path: [0:0]u8 = [0:0]u8{},
/// This is to make sure in debug/tests that we are closing file descriptors
/// We should only have max 2 open at a time. One for the cwd, and one for the
/// directory being iterated on.
fds_open: if (count_fds) usize else u0 = 0,
/// Prepares the iterator for the first call to `next`.
///
/// * For a non-absolute pattern the walk starts at the walker's `cwd`.
/// * For an absolute pattern the walk starts at the literal prefix of the
///   pattern (or the file system root when there is no such prefix).
/// * When an absolute pattern has no special syntax at all, the path is just
///   opened to check that it exists; the iterator then either holds that path
///   as its single match or is left with nothing to yield.
///
/// Returns `.err` (with the offending path attached) when the starting
/// directory cannot be opened, or when the first directory transition fails.
/// Allocation failures and accessor errors are returned as Zig errors.
pub fn init(this: *Iterator) !Maybe(void) {
    log("Iterator init pattern={s}", .{this.walker.pattern});
    var was_absolute = false;
    const root_work_item = brk: {
        // On non-POSIX targets a POSIX-style absolute pattern also counts as
        // absolute.
        const is_absolute = std.fs.path.isAbsolute(this.walker.pattern) or
            (!bun.Environment.isPosix and std.fs.path.isAbsolutePosix(this.walker.pattern));
        if (!is_absolute) break :brk WorkItem.new(this.walker.cwd, 0, .directory);
        was_absolute = true;
        var path_without_special_syntax = this.walker.pattern[0..this.walker.end_byte_of_basename_excluding_special_syntax];
        var starting_component_idx = this.walker.basename_excluding_special_syntax_component_idx;
        if (path_without_special_syntax.len == 0) {
            path_without_special_syntax = if (!bun.Environment.isWindows) "/" else ResolvePath.windowsFilesystemRoot(this.walker.cwd);
        } else {
            // Skip the components associated with the literal path
            starting_component_idx += 1;
            // This means we got a pattern without any special glob syntax, for example:
            // `/Users/zackradisic/foo/bar`
            //
            // In that case we don't need to do any walking and can just open up the FS entry
            if (starting_component_idx >= this.walker.patternComponents.items.len) {
                const path = try this.walker.arena.allocator().dupeZ(u8, path_without_special_syntax);
                const fd = switch (try Accessor.open(path)) {
                    .err => |e| {
                        // Exists, but is not a directory: still a match.
                        if (e.getErrno() == bun.C.E.NOTDIR) {
                            this.iter_state = .{ .matched = path };
                            return Maybe(void).success;
                        }
                        // Doesn't exist
                        if (e.getErrno() == bun.C.E.NOENT) {
                            this.iter_state = .get_next;
                            return Maybe(void).success;
                        }
                        // `path` is already owned by the arena, so it can be
                        // attached to the error directly.
                        return .{ .err = e.withPath(path) };
                    },
                    .result => |opened| opened,
                };
                _ = Accessor.close(fd);
                this.iter_state = .{ .matched = path };
                return Maybe(void).success;
            }
            // In the above branch, if `starting_component_idx >= pattern_components.len` then
            // it should also mean that `end_byte_of_basename_excluding_special_syntax >= pattern.len`
            //
            // So if we see that `end_byte_of_basename_excluding_special_syntax < this.walker.pattern.len` we
            // miscalculated the values
            bun.assert(this.walker.end_byte_of_basename_excluding_special_syntax < this.walker.pattern.len);
        }
        break :brk WorkItem.new(
            path_without_special_syntax,
            starting_component_idx,
            .directory,
        );
    };

    // Build a NUL-terminated copy of the root path in the walker's scratch
    // buffer so it can be handed to the accessor.
    const path_buf: *bun.PathBuffer = &this.walker.pathBuf;
    const root_path = root_work_item.path;
    @memcpy(path_buf[0..root_path.len], root_path[0..root_path.len]);
    path_buf[root_path.len] = 0;
    const root_path_z: [:0]const u8 = path_buf[0..root_path.len :0];

    const cwd_fd = switch (try Accessor.open(root_path_z)) {
        // Pass the properly terminated slice: its length must not include the
        // NUL, because `handleSysErrWithPath` already accounts for it.
        .err => |err| return .{ .err = this.walker.handleSysErrWithPath(err, root_path_z) },
        .result => |fd| fd,
    };
    if (comptime count_fds) {
        this.fds_open += 1;
    }
    this.cwd_fd = cwd_fd;

    // `root` is a comptime parameter, hence the two separate calls.
    const transitioned = if (was_absolute)
        try this.transitionToDirIterState(root_work_item, false)
    else
        try this.transitionToDirIterState(root_work_item, true);
    switch (transitioned) {
        .err => |err| return .{ .err = err },
        else => {},
    }
    return Maybe(void).success;
}
/// Releases every handle the iterator may still hold: the cwd handle, the
/// handle of a directory whose iteration has not been closed yet, and any
/// handles attached to work items still queued on the walker.
///
/// Asserts (debug only) that no counted handle remains open afterwards.
pub fn deinit(this: *Iterator) void {
    defer bun.debugAssert(this.fds_open == 0);

    this.closeCwdFd();

    if (this.iter_state == .directory) {
        const dir = &this.iter_state.directory;
        if (!dir.iter_closed) this.closeDisallowingCwd(dir.fd);
    }

    // Drain the work stack, closing handles carried by pending items.
    while (this.walker.workbuf.popOrNull()) |pending| {
        const fd = pending.fd orelse continue;
        this.closeDisallowingCwd(fd);
    }

    if (comptime count_fds) {
        bun.debugAssert(this.fds_open == 0);
    }
}
/// Closes the handle stored in `cwd_fd` unless it is the zero handle, and
/// updates the open-handle count when counting is enabled. The result of the
/// close is ignored and `cwd_fd` itself is left unchanged.
pub fn closeCwdFd(this: *Iterator) void {
    if (!this.cwd_fd.isZero()) {
        _ = Accessor.close(this.cwd_fd);
        if (comptime count_fds) {
            this.fds_open -= 1;
        }
    }
}
/// Closes `fd` unless it is the zero handle or equal to `cwd_fd`; in those
/// two cases nothing happens. The result of the close is ignored.
pub fn closeDisallowingCwd(this: *Iterator, fd: Accessor.Handle) void {
    const closable = !fd.isZero() and !fd.eql(this.cwd_fd);
    if (closable) {
        _ = Accessor.close(fd);
        if (comptime count_fds) {
            this.fds_open -= 1;
        }
    }
}
/// Records that one more handle has been opened. Does nothing unless handle
/// counting is enabled.
pub fn bumpOpenFds(this: *Iterator) void {
    if (comptime count_fds) {
        const open_now = this.fds_open + 1;
        this.fds_open = open_now;
        // More than two open handles at once indicates a bug in the iterator code.
        bun.debugAssert(open_now <= 2);
    }
}
/// Switches the iterator to work on the directory described by `work_item`.
///
/// `root` is true only for the very first transition of a walk that started
/// from a non-absolute pattern (see `init`).
///
/// On success the iterator ends up in one of three states:
/// * `.directory` — ready to iterate the directory's entries,
/// * `.matched`   — the last pattern component is a literal and stat-ing it
///                  inside the directory produced a match,
/// * `.get_next`  — same literal shortcut, but nothing matched.
///
/// Returns `.err` when the directory cannot be opened or when the stat in
/// the literal shortcut fails with anything other than "no such entry".
fn transitionToDirIterState(
this: *Iterator,
work_item: WorkItem,
comptime root: bool,
) !Maybe(void) {
log("transition => {s}", .{work_item.path});
// Reset the state first: `dir_path` below is a slice into the path buffer
// that lives inside this freshly written `.directory` payload.
this.iter_state = .{ .directory = .{
.fd = Accessor.Handle.zero,
.iter = undefined,
.path = undefined,
.dir_path = undefined,
.component_idx = 0,
.pattern = undefined,
.next_pattern = null,
.is_last = false,
.iter_closed = false,
.at_cwd = false,
} };
var dir_path: [:0]u8 = dir_path: {
if (comptime root) {
// For the root of a non-absolute walk the directory path is the
// empty string.
if (!this.walker.absolute) {
this.iter_state.directory.path[0] = 0;
break :dir_path this.iter_state.directory.path[0..0 :0];
}
}
// TODO Optimization: On posix systems filepaths are already null byte terminated so we can skip this if thats the case
@memcpy(this.iter_state.directory.path[0..work_item.path.len], work_item.path);
this.iter_state.directory.path[work_item.path.len] = 0;
break :dir_path this.iter_state.directory.path[0..work_item.path.len :0];
};
var had_dot_dot = false;
// NOTE(review): `skipSpecialComponents` is defined elsewhere; it receives
// pointers to `dir_path` and the path buffer, so presumably it may extend
// the path while skipping `.`/`..` components — confirm.
const component_idx = this.walker.skipSpecialComponents(work_item.idx, &dir_path, &this.iter_state.directory.path, &had_dot_dot);
// Pick the handle to iterate: one carried by the work item, the cwd handle
// (root without `..`), or a freshly opened one.
const fd: Accessor.Handle = fd: {
if (work_item.fd) |fd| break :fd fd;
if (comptime root) {
if (had_dot_dot) break :fd switch (try Accessor.openat(this.cwd_fd, dir_path)) {
.err => |err| return .{
.err = this.walker.handleSysErrWithPath(err, dir_path),
},
.result => |fd_| brk: {
this.bumpOpenFds();
break :brk fd_;
},
};
this.iter_state.directory.at_cwd = true;
break :fd this.cwd_fd;
}
break :fd switch (try Accessor.openat(this.cwd_fd, dir_path)) {
.err => |err| return .{
.err = this.walker.handleSysErrWithPath(err, dir_path),
},
.result => |fd_| brk: {
this.bumpOpenFds();
break :brk fd_;
},
};
};
// Optimization:
// If we have a pattern like:
// `packages/*/package.json`
// ^ and we are at this component, with let's say
// a directory named: `packages/frontend/`
//
// Then we can just open `packages/frontend/package.json` without
// doing any iteration on the current directory.
//
// More generally, we can apply this optimization if we are on the
// last component and it is a literal with no special syntax.
if (component_idx == this.walker.patternComponents.items.len -| 1 and
this.walker.patternComponents.items[component_idx].syntax_hint == .Literal)
{
// The directory handle is only needed for the stat below.
defer {
this.closeDisallowingCwd(fd);
}
// Short component names are NUL-terminated in a stack buffer; longer
// ones fall back to the walker's arena.
const stackbuf_size = 256;
var stfb = std.heap.stackFallback(stackbuf_size, this.walker.arena.allocator());
const pathz = try stfb.get().dupeZ(u8, this.walker.patternComponents.items[component_idx].patternSlice(this.walker.pattern));
const stat_result: bun.Stat = switch (Accessor.statat(fd, pathz)) {
.err => |e_| {
var e: bun.sys.Error = e_;
// A missing entry simply means there is no match here.
if (e.getErrno() == bun.C.E.NOENT) {
this.iter_state = .get_next;
return Maybe(void).success;
}
return .{ .err = e.withPath(this.walker.patternComponents.items[component_idx].patternSlice(this.walker.pattern)) };
},
.result => |stat| stat,
};
// As written this is true for regular files, and for anything at all
// when `only_files` is false.
const matches = (bun.S.ISDIR(@intCast(stat_result.mode)) and !this.walker.only_files) or bun.S.ISREG(@intCast(stat_result.mode)) or !this.walker.only_files;
if (matches) {
if (try this.walker.prepareMatchedPath(pathz, dir_path)) |path| {
this.iter_state = .{ .matched = path };
} else {
this.iter_state = .get_next;
}
} else {
this.iter_state = .get_next;
}
return Maybe(void).success;
}
// Regular case: fill in the directory state and start iterating.
this.iter_state.directory.dir_path = dir_path;
this.iter_state.directory.component_idx = component_idx;
this.iter_state.directory.pattern = &this.walker.patternComponents.items[component_idx];
this.iter_state.directory.next_pattern = if (component_idx + 1 < this.walker.patternComponents.items.len) &this.walker.patternComponents.items[component_idx + 1] else null;
this.iter_state.directory.is_last = component_idx == this.walker.patternComponents.items.len - 1;
// NOTE(review): this clears `at_cwd` even when it was set to true above for
// the cwd handle — confirm whether that is intended.
this.iter_state.directory.at_cwd = false;
this.iter_state.directory.fd = Accessor.Handle.zero;
log("Transition(dirpath={s}, fd={}, component_idx={d})", .{ dir_path, fd, component_idx });
this.iter_state.directory.fd = fd;
const iterator = Accessor.DirIter.iterate(fd);
this.iter_state.directory.iter = iterator;
this.iter_state.directory.iter_closed = false;
return Maybe(void).success;
}
/// Advances the walk and returns the next matched path.
///
/// Returns a `null` result once the work stack is empty and nothing is left
/// to iterate, and `.err` when opening or iterating a directory fails (a
/// broken symlink is only an error when `error_on_broken_symlinks` is set).
/// Allocation failures and accessor errors are returned as Zig errors.
///
/// The loop is a state machine over `iter_state`:
/// * `.matched`   — hand out the stored path and go back to `.get_next`.
/// * `.get_next`  — pop a work item: directories are entered via
///                  `transitionToDirIterState`, symlinks are resolved here.
/// * `.directory` — read one entry of the current directory and match it.
pub fn next(this: *Iterator) !Maybe(?MatchedPath) {
    while (true) {
        switch (this.iter_state) {
            .matched => |path| {
                this.iter_state = .get_next;
                return .{ .result = path };
            },
            .get_next => {
                // Done
                if (this.walker.workbuf.items.len == 0) return .{ .result = null };
                const work_item = this.walker.workbuf.pop();
                switch (work_item.kind) {
                    .directory => {
                        switch (try this.transitionToDirIterState(work_item, false)) {
                            .err => |err| return .{ .err = err },
                            else => {},
                        }
                        continue;
                    },
                    .symlink => {
                        // NUL-terminate the symlink's path in the walker's scratch buffer.
                        const scratch_path_buf: *bun.PathBuffer = &this.walker.pathBuf;
                        @memcpy(scratch_path_buf[0..work_item.path.len], work_item.path);
                        scratch_path_buf[work_item.path.len] = 0;
                        var symlink_full_path_z: [:0]u8 = scratch_path_buf[0..work_item.path.len :0];
                        const entry_name = symlink_full_path_z[work_item.entry_start..symlink_full_path_z.len];
                        var has_dot_dot = false;
                        const component_idx = this.walker.skipSpecialComponents(work_item.idx, &symlink_full_path_z, scratch_path_buf, &has_dot_dot);
                        var pattern = this.walker.patternComponents.items[component_idx];
                        const next_pattern = if (component_idx + 1 < this.walker.patternComponents.items.len) &this.walker.patternComponents.items[component_idx + 1] else null;
                        const is_last = component_idx == this.walker.patternComponents.items.len - 1;
                        this.iter_state = .get_next;
                        // Try to open the link target as a directory. `null` means the
                        // target exists but is not a directory.
                        const maybe_dir_fd: ?Accessor.Handle = switch (try Accessor.openat(this.cwd_fd, symlink_full_path_z)) {
                            .err => |err| brk: {
                                if (@as(usize, @intCast(err.errno)) == @as(usize, @intFromEnum(bun.C.E.NOTDIR))) {
                                    break :brk null;
                                }
                                if (this.walker.error_on_broken_symlinks) return .{ .err = this.walker.handleSysErrWithPath(err, symlink_full_path_z) };
                                // Broken symlink, but if `only_files` is false we still want to append
                                // it to the matched paths
                                if (!this.walker.only_files) {
                                    // (See case A and B in the comment for `matchPatternFile()`)
                                    // When we encounter a symlink we call the catch all
                                    // matching function: `matchPatternImpl()` to see if we can avoid following the symlink.
                                    // So for case A, we just need to check if the pattern is the last pattern.
                                    if (is_last or
                                        (pattern.syntax_hint == .Double and
                                        component_idx + 1 == this.walker.patternComponents.items.len -| 1 and
                                        next_pattern.?.syntax_hint != .Double and
                                        this.walker.matchPatternImpl(next_pattern.?, entry_name)))
                                    {
                                        return .{ .result = try this.walker.prepareMatchedPathSymlink(symlink_full_path_z) orelse continue };
                                    }
                                }
                                continue;
                            },
                            .result => |fd| brk: {
                                this.bumpOpenFds();
                                break :brk fd;
                            },
                        };
                        const dir_fd = maybe_dir_fd orelse {
                            // No directory file descriptor, it's a file
                            if (is_last)
                                return .{ .result = try this.walker.prepareMatchedPathSymlink(symlink_full_path_z) orelse continue };
                            if (pattern.syntax_hint == .Double and
                                component_idx + 1 == this.walker.patternComponents.items.len -| 1 and
                                next_pattern.?.syntax_hint != .Double and
                                this.walker.matchPatternImpl(next_pattern.?, entry_name))
                            {
                                return .{ .result = try this.walker.prepareMatchedPathSymlink(symlink_full_path_z) orelse continue };
                            }
                            continue;
                        };
                        var add_dir: bool = false;
                        // TODO this function calls `matchPatternImpl(pattern,
                        // entry_name)` which is redundant because we already called
                        // that when we first encountered the symlink
                        const recursion_idx_bump_ = this.walker.matchPatternDir(&pattern, next_pattern, entry_name, component_idx, is_last, &add_dir);
                        if (recursion_idx_bump_) |recursion_idx_bump| {
                            // Ownership of `dir_fd` moves to the queued work item(s).
                            if (recursion_idx_bump == 2) {
                                // NOTE(review): both items carry the same `dir_fd`; confirm the
                                // second one is still usable after the first has been iterated
                                // and its handle closed.
                                try this.walker.workbuf.append(
                                    this.walker.arena.allocator(),
                                    WorkItem.newWithFd(work_item.path, component_idx + recursion_idx_bump, .directory, dir_fd),
                                );
                                try this.walker.workbuf.append(
                                    this.walker.arena.allocator(),
                                    WorkItem.newWithFd(work_item.path, component_idx, .directory, dir_fd),
                                );
                            } else {
                                try this.walker.workbuf.append(
                                    this.walker.arena.allocator(),
                                    WorkItem.newWithFd(work_item.path, component_idx + recursion_idx_bump, .directory, dir_fd),
                                );
                            }
                        } else {
                            // The directory will not be descended into, so nothing else
                            // will ever close the handle opened above.
                            this.closeDisallowingCwd(dir_fd);
                        }
                        if (add_dir and !this.walker.only_files) {
                            return .{ .result = try this.walker.prepareMatchedPathSymlink(symlink_full_path_z) orelse continue };
                        }
                        continue;
                    },
                }
            },
            .directory => |*dir| {
                const entry = switch (dir.iter.next()) {
                    .err => |err| {
                        if (!dir.at_cwd) this.closeDisallowingCwd(dir.fd);
                        dir.iter_closed = true;
                        return .{ .err = this.walker.handleSysErrWithPath(err, dir.dir_path) };
                    },
                    .result => |ent| ent,
                } orelse {
                    // Directory exhausted: release it and move on to the next work item.
                    if (!dir.at_cwd) this.closeDisallowingCwd(dir.fd);
                    dir.iter_closed = true;
                    this.iter_state = .get_next;
                    continue;
                };
                log("dir: {s} entry: {s}", .{ dir.dir_path, entry.name.slice() });
                const dir_iter_state: *const IterState.Directory = &this.iter_state.directory;
                const entry_name = entry.name.slice();
                switch (entry.kind) {
                    .file => {
                        const matches = this.walker.matchPatternFile(entry_name, dir_iter_state.component_idx, dir.is_last, dir_iter_state.pattern, dir_iter_state.next_pattern);
                        if (matches) {
                            const prepared = try this.walker.prepareMatchedPath(entry_name, dir.dir_path) orelse continue;
                            return .{ .result = prepared };
                        }
                        continue;
                    },
                    .directory => {
                        var add_dir: bool = false;
                        const recursion_idx_bump_ = this.walker.matchPatternDir(dir_iter_state.pattern, dir_iter_state.next_pattern, entry_name, dir_iter_state.component_idx, dir_iter_state.is_last, &add_dir);
                        if (recursion_idx_bump_) |recursion_idx_bump| {
                            const subdir_parts: []const []const u8 = &[_][]const u8{
                                dir.dir_path[0..dir.dir_path.len],
                                entry_name,
                            };
                            const subdir_entry_name = try this.walker.join(subdir_parts);
                            if (recursion_idx_bump == 2) {
                                try this.walker.workbuf.append(
                                    this.walker.arena.allocator(),
                                    WorkItem.new(subdir_entry_name, dir_iter_state.component_idx + recursion_idx_bump, .directory),
                                );
                                try this.walker.workbuf.append(
                                    this.walker.arena.allocator(),
                                    WorkItem.new(subdir_entry_name, dir_iter_state.component_idx, .directory),
                                );
                            } else {
                                try this.walker.workbuf.append(
                                    this.walker.arena.allocator(),
                                    WorkItem.new(subdir_entry_name, dir_iter_state.component_idx + recursion_idx_bump, .directory),
                                );
                            }
                        }
                        if (add_dir and !this.walker.only_files) {
                            const prepared_path = try this.walker.prepareMatchedPath(entry_name, dir.dir_path) orelse continue;
                            return .{ .result = prepared_path };
                        }
                        continue;
                    },
                    .sym_link => {
                        if (this.walker.follow_symlinks) {
                            // Following a symlink requires additional syscalls, so
                            // we first try it against our "catch-all" pattern match
                            // function
                            const matches = this.walker.matchPatternImpl(dir_iter_state.pattern, entry_name);
                            if (!matches) continue;
                            const subdir_parts: []const []const u8 = &[_][]const u8{
                                dir.dir_path[0..dir.dir_path.len],
                                entry_name,
                            };
                            // Offset of the entry name inside the joined path: right after
                            // the directory path and one separator, or 0 when the directory
                            // path is empty.
                            const entry_start: u32 = @intCast(if (dir.dir_path.len == 0) 0 else dir.dir_path.len + 1);
                            const subdir_entry_name = try this.walker.join(subdir_parts);
                            try this.walker.workbuf.append(
                                this.walker.arena.allocator(),
                                WorkItem.newSymlink(subdir_entry_name, dir_iter_state.component_idx, entry_start),
                            );
                            continue;
                        }
                        // Not following symlinks: the link itself can only match when
                        // non-file results are allowed.
                        if (this.walker.only_files) continue;
                        const matches = this.walker.matchPatternFile(entry_name, dir_iter_state.component_idx, dir_iter_state.is_last, dir_iter_state.pattern, dir_iter_state.next_pattern);
                        if (matches) {
                            const prepared_path = try this.walker.prepareMatchedPath(entry_name, dir.dir_path) orelse continue;
                            return .{ .result = prepared_path };
                        }
                        continue;
                    },
                    else => continue,
                }
            },
        }
    }
}
};
/// One pending unit of work on the walker's stack: a path to visit together
/// with the pattern component index to resume matching at.
const WorkItem = struct {
    path: []const u8,
    /// Index of the pattern component to continue with.
    idx: u32,
    kind: Kind,
    /// For symlinks: byte offset in `path` where the entry name starts.
    entry_start: u32 = 0,
    /// Handle that is already open for `path`, if any.
    fd: ?Accessor.Handle = null,

    const Kind = enum { directory, symlink };

    /// Work item without an attached handle and with `entry_start == 0`.
    fn new(path: []const u8, idx: u32, kind: Kind) WorkItem {
        return WorkItem{ .path = path, .idx = idx, .kind = kind };
    }

    /// Work item that carries an already opened handle.
    fn newWithFd(path: []const u8, idx: u32, kind: Kind, fd: Accessor.Handle) WorkItem {
        var item = WorkItem.new(path, idx, kind);
        item.fd = fd;
        return item;
    }

    /// Symlink work item that remembers where the entry name begins in `path`.
    fn newSymlink(path: []const u8, idx: u32, entry_start: u32) WorkItem {
        var item = WorkItem.new(path, idx, .symlink);
        item.entry_start = entry_start;
        return item;
    }
};
/// One piece of a glob pattern; pieces are separated by the directory
/// separator:
/// `src/**/*.ts` -> `src`, `**`, `*.ts`
const Component = struct {
    /// Byte offset of the component inside the pattern.
    start: u32,
    /// Byte length of the component, counting a trailing separator if present.
    len: u32,
    syntax_hint: SyntaxHint = .None,
    trailing_sep: bool = false,
    is_ascii: bool = false,
    /// Only used when component is not ascii
    unicode_set: bool = false,
    start_cp: u32 = 0,
    end_cp: u32 = 0,

    /// The component's bytes within `pattern`, without the trailing separator.
    pub fn patternSlice(this: *const Component, pattern: []const u8) []const u8 {
        const sep_len = @intFromBool(this.trailing_sep);
        const end = this.start + this.len - sep_len;
        return pattern[this.start..end];
    }

    /// The component's codepoints within `pattern`, without the trailing
    /// separator.
    pub fn patternSliceCp(this: *const Component, pattern: []u32) []u32 {
        const sep_len = @intFromBool(this.trailing_sep);
        const end = this.end_cp - sep_len;
        return pattern[this.start_cp..end];
    }

    const SyntaxHint = enum {
        None,
        Single,
        Double,
        /// Uses special fast-path matching for components like: `*.ts`
        WildcardFilepath,
        /// Uses special fast-path matching for literal components e.g.
        /// "node_modules", becomes memcmp
        Literal,
        /// ./fixtures/*.ts
        /// ^
        Dot,
        /// ../
        DotBack,

        /// Every hint except `.Literal` counts as special syntax.
        fn isSpecialSyntax(this: SyntaxHint) bool {
            return this != .Literal;
        }
    };
};
/// Initializes the walker using the file system's `top_level_dir` as the cwd;
/// every other argument is forwarded unchanged to `initWithCwd`.
///
/// The arena parameter is dereferenced and copied if all allocations go well and nothing goes wrong
pub fn init(
    this: *GlobWalker,
    arena: *Arena,
    pattern: []const u8,
    dot: bool,
    absolute: bool,
    follow_symlinks: bool,
    error_on_broken_symlinks: bool,
    only_files: bool,
) !Maybe(void) {
    const top_level_dir = bun.fs.FileSystem.instance.top_level_dir;
    return this.initWithCwd(
        arena,
        pattern,
        top_level_dir,
        dot,
        absolute,
        follow_symlinks,
        error_on_broken_symlinks,
        only_files,
    );
}
/// Runs simdutf's UTF-8 to UTF-32 (little endian) conversion from `pattern`
/// into `codepoints`. The value returned by the conversion is ignored.
pub fn convertUtf8ToCodepoints(codepoints: []u32, pattern: []const u8) void {
    const utf8_to_utf32_le = bun.simdutf.convert.utf8.to.utf32.le;
    _ = utf8_to_utf32_le(pattern, codepoints);
}
/// Logs the walker's address followed by one line per pattern component:
/// a fixed marker for `*`, `**`, `.` and `../`, or the hint name and the
/// component text for everything else.
pub fn debugPatternComopnents(this: *GlobWalker) void {
    log("GlobWalker(0x{x}) components:", .{@intFromPtr(this)});
    for (this.patternComponents.items) |*component| {
        const hint = component.syntax_hint;
        switch (hint) {
            .Literal, .WildcardFilepath, .None => log(" hint={s} component_str={s}", .{ @tagName(hint), component.patternSlice(this.pattern) }),
            .Single => log(" *", .{}),
            .Double => log(" **", .{}),
            .Dot => log(" .", .{}),
            .DotBack => log(" ../", .{}),
        }
    }
}
/// Resets the walker to its defaults, stores the given options, and splits
/// `pattern` into components with `buildPatternComponents`.
///
/// `cwd` should be allocated with the arena
/// The arena parameter is dereferenced and copied if all allocations go well and nothing goes wrong
pub fn initWithCwd(
    this: *GlobWalker,
    arena: *Arena,
    pattern: []const u8,
    cwd: []const u8,
    dot: bool,
    absolute: bool,
    follow_symlinks: bool,
    error_on_broken_symlinks: bool,
    only_files: bool,
) !Maybe(void) {
    log("initWithCwd(cwd={s})", .{cwd});

    // Fields not listed here (including the two basename bookkeeping
    // counters) take their declared defaults.
    this.* = GlobWalker{
        .pattern = pattern,
        .cwd = cwd,
        .only_files = only_files,
        .error_on_broken_symlinks = error_on_broken_symlinks,
        .follow_symlinks = follow_symlinks,
        .absolute = absolute,
        .dot = dot,
    };

    try GlobWalker.buildPatternComponents(
        arena,
        &this.patternComponents,
        pattern,
        &this.cp_len,
        &this.pattern_codepoints,
        &this.has_relative_components,
        &this.end_byte_of_basename_excluding_special_syntax,
        &this.basename_excluding_special_syntax_component_idx,
    );

    // Only take a copy of the arena once building the components succeeded.
    this.arena = arena.*;

    if (bun.Environment.allow_assert) this.debugPatternComopnents();

    return Maybe(void).success;
}
/// Tears down the walker.
///
/// NOTE: when `clear_arena` is true this also calls `deinit` on the arena;
/// pass `false` if you don't want that to happen.
pub fn deinit(this: *GlobWalker, comptime clear_arena: bool) void {
    log("GlobWalker.deinit", .{});
    if (comptime !clear_arena) return;
    this.arena.deinit();
}
/// Copies `path_buf` (including its NUL terminator) into the walker's own
/// `pathBuf` and returns `err` with that copied slice attached as its path.
pub fn handleSysErrWithPath(
    this: *GlobWalker,
    err: Syscall.Error,
    path_buf: [:0]const u8,
) Syscall.Error {
    // +1 so the sentinel byte is carried along with the path bytes.
    const len_with_nul = path_buf.len + 1;
    const dest = this.pathBuf[0..len_with_nul];
    std.mem.copyForwards(u8, dest, path_buf.ptr[0..len_with_nul]);
    return err.withPath(dest);
}
/// Runs the iterator to completion. Returns early with success when the
/// pattern has no components, and with the error value as soon as the
/// iterator reports one.
pub fn walk(this: *GlobWalker) !Maybe(void) {
    if (this.patternComponents.items.len == 0) return Maybe(void).success;

    var iter = GlobWalker.Iterator{ .walker = this };
    defer iter.deinit();

    switch (try iter.init()) {
        .err => |err| return .{ .err = err },
        else => {},
    }

    while (true) {
        const maybe_path = switch (try iter.next()) {
            .err => |err| return .{ .err = err },
            .result => |matched_path| matched_path,
        };
        const path = maybe_path orelse break;
        // The paths are already put into this.matchedPaths, which we use for the output,
        // so there is nothing to do here besides logging.
        log("walker: matched path: {s}", .{path});
    }

    return Maybe(void).success;
}
// NOTE you must check that the pattern at `idx` has `syntax_hint == .Dot` or
// `syntax_hint == .DotBack` first
//
// Consumes the run of consecutive `.` / `..` components starting at `idx`,
// appending each of them (preceded by '/' unless the path is still empty) to
// `path_buf` at offset `dir_path.len` and keeping the buffer NUL-terminated.
// `dir_path.len` is updated to the new length, `encountered_dot_dot` is set
// when a `..` component is consumed, and the index of the first component
// that is not `.` / `..` is returned (this may equal the number of components).
fn collapseDots(
    this: *GlobWalker,
    idx: u32,
    dir_path: *[:0]u8,
    path_buf: *bun.PathBuffer,
    encountered_dot_dot: *bool,
) u32 {
    const components = this.patternComponents.items;
    var component_idx = idx;
    var len = dir_path.len;

    while (component_idx < components.len) : (component_idx += 1) {
        const dots: []const u8 = switch (components[component_idx].syntax_hint) {
            .Dot => ".",
            .DotBack => "..",
            else => break,
        };
        if (dots.len == 2) encountered_dot_dot.* = true;

        const needs_sep = len != 0;
        const appended = dots.len + @intFromBool(needs_sep);
        // `len + appended` is the index of the NUL terminator written below.
        // The check uses the running `len`, not the stale `dir_path.len`, so
        // it stays correct when several components are collapsed in a row.
        if (len + appended >= bun.MAX_PATH_BYTES) @panic("Invalid path");

        if (needs_sep) {
            path_buf[len] = '/';
            len += 1;
        }
        @memcpy(path_buf[len..][0..dots.len], dots);
        len += dots.len;
        path_buf[len] = 0;
    }

    dir_path.len = len;
    return component_idx;
}
// NOTE you must check that the pattern at `idx` has `syntax_hint == .Double` first
//
// Returns the index of the last `**` in the run of consecutive `**`
// components that starts at `idx`.
fn collapseSuccessiveDoubleWildcards(this: *GlobWalker, idx: u32) u32 {
    const components = this.patternComponents.items;
    var component_idx = idx;
    // Advance while the *next* component is also a double wildcard.
    while (component_idx + 1 < components.len and
        components[component_idx + 1].syntax_hint == .Double)
    {
        component_idx += 1;
    }
    return component_idx;
}
/// Starting at `work_item_idx`, skips over a run of `.` / `..` components
/// (appending them to `dir_path` through `scratch_path_buf`) and then jumps to
/// the last `**` of a chain of double wildcards. Returns the resulting
/// component index, which equals the number of components when the pattern
/// ends with `.` / `..` components.
pub fn skipSpecialComponents(
    this: *GlobWalker,
    work_item_idx: u32,
    dir_path: *[:0]u8,
    scratch_path_buf: *bun.PathBuffer,
    encountered_dot_dot: *bool,
) u32 {
    const components = this.patternComponents.items;
    var component_idx = work_item_idx;

    // Skip `.` and `..` while also appending them to `dir_path`
    switch (components[component_idx].syntax_hint) {
        .Dot, .DotBack => component_idx = this.collapseDots(
            component_idx,
            dir_path,
            scratch_path_buf,
            encountered_dot_dot,
        ),
        else => {},
    }

    // `collapseDots` may have consumed every remaining component, so the
    // index has to be re-validated before it is used for another lookup.
    if (component_idx < components.len and
        components[component_idx].syntax_hint == .Double)
    {
        // Skip to the last `**` if there is a chain of them
        component_idx = this.collapseSuccessiveDoubleWildcards(component_idx);
    }

    return component_idx;
}
/// Decides how a directory entry relates to the pattern component at
/// `component_idx`.
///
/// Returns `null` when the directory should not be descended into, otherwise
/// the number of pattern components to advance for the directory's children
/// (0 keeps the current `**` active, 1 moves to the next component, 2 also
/// skips the component following a `**`). `add` is set to true when the
/// directory itself counts as a match.
fn matchPatternDir(
    this: *GlobWalker,
    pattern: *Component,
    next_pattern: ?*Component,
    entry_name: []const u8,
    component_idx: u32,
    is_last: bool,
    add: *bool,
) ?u32 {
    if (!this.dot and GlobWalker.startsWithDot(entry_name)) return null;
    if (is_ignored(entry_name)) return null;

    // Ordinary component: the entry has to match it directly.
    if (pattern.syntax_hint != .Double) {
        if (!this.matchPatternImpl(pattern, entry_name)) return null;
        if (!is_last) return 1;
        add.* = true;
        return null;
    }

    // Double wildcard `**`: this may propagate the `**` to the directory's children.
    if (is_last) {
        add.* = true;
        return 0;
    }

    // Stop the double wildcard if the entry matches the pattern after it.
    // Example: src/**/*.js
    //   - Matches: src/bun.js/
    //              src/bun.js/foo/bar/baz.js
    if (!this.matchPatternImpl(next_pattern.?, entry_name)) return 0;

    // In the normal case skip over the next pattern since we matched it, example:
    //   BEFORE: src/**/node_modules/**/*.js
    //                ^
    //   AFTER:  src/**/node_modules/**/*.js
    //                                ^
    if (component_idx + 1 != this.patternComponents.items.len - 1) return 2;

    // But if the next pattern is the last component, the directory matches
    // and the double wildcard recursion continues into its children.
    add.* = true;
    return 0;
}
/// A file can only match if:
/// a) it matches against the last pattern, or
/// b) it matches the next pattern, provided the current
///    pattern is a double wildcard, the next pattern is the
///    last one, and the next pattern is not a double wildcard
///
/// A component with a trailing separator never matches a file.
///
/// Examples:
/// a -> `src/foo/index.ts` matches
/// b -> `src/**/*.ts` (on 2nd pattern) matches
fn matchPatternFile(
    this: *GlobWalker,
    entry_name: []const u8,
    component_idx: u32,
    is_last: bool,
    pattern: *Component,
    next_pattern: ?*Component,
) bool {
    if (pattern.trailing_sep) return false;

    // Case a)
    if (is_last) return this.matchPatternImpl(pattern, entry_name);

    // Case b)
    if (pattern.syntax_hint != .Double) return false;
    if (component_idx + 1 != this.patternComponents.items.len -| 1) return false;
    const next = next_pattern.?;
    if (next.syntax_hint == .Double) return false;
    return this.matchPatternImpl(next, entry_name);
}
/// Matches a single path segment against one pattern component.
///
/// Dotfiles (unless `this.dot` is set) and ignored names never match. `*`
/// and `**` components match everything else; on non-Windows targets the
/// `.WildcardFilepath` and `.Literal` hints use byte-comparison fast paths,
/// and every other case goes through `matchPatternSlow`.
fn matchPatternImpl(
    this: *GlobWalker,
    pattern_component: *Component,
    filepath: []const u8,
) bool {
    log("matchPatternImpl: {s}", .{filepath});
    if (!this.dot and GlobWalker.startsWithDot(filepath)) return false;
    if (is_ignored(filepath)) return false;

    const hint = pattern_component.syntax_hint;
    if (hint == .Double or hint == .Single) return true;

    if (comptime !isWindows) {
        switch (hint) {
            .WildcardFilepath => return matchWildcardFilepath(
                pattern_component.patternSlice(this.pattern),
                filepath,
            ),
            .Literal => return matchWildcardLiteral(
                pattern_component.patternSlice(this.pattern),
                filepath,
            ),
            else => {},
        }
    }

    return this.matchPatternSlow(pattern_component, filepath);
}
/// General-purpose matcher for one component: uses `GlobAscii.match` when
/// both the component and `filepath` are ASCII (non-Windows only), otherwise
/// matches the component's codepoints with `matchImpl`.
fn matchPatternSlow(this: *GlobWalker, pattern_component: *Component, filepath: []const u8) bool {
    // windows filepaths are utf-16 so GlobAscii.match will never work
    if (comptime !isWindows) {
        const ascii_only = pattern_component.is_ascii and isAllAscii(filepath);
        if (ascii_only) {
            const glob_bytes = pattern_component.patternSlice(this.pattern);
            return GlobAscii.match(glob_bytes, filepath);
        }
    }

    return matchImpl(this.componentStringUnicode(pattern_component), filepath);
}
/// Returns the component's pattern as codepoints, dispatching at compile
/// time to the Windows or POSIX implementation.
fn componentStringUnicode(this: *GlobWalker, pattern_component: *Component) []const u32 {
    return if (comptime isWindows)
        this.componentStringUnicodeWindows(pattern_component)
    else
        this.componentStringUnicodePosix(pattern_component);
}
/// Returns the component's slice of the walker's codepoint buffer. No
/// conversion happens here; `buildPatternComponents` fills the buffer up
/// front on Windows.
fn componentStringUnicodeWindows(this: *GlobWalker, pattern_component: *Component) []const u32 {
    const all_codepoints = this.pattern_codepoints;
    return pattern_component.patternSliceCp(all_codepoints);
}
/// Returns the component's slice of the walker's codepoint buffer, filling
/// it from the UTF-8 pattern the first time the component is requested
/// (tracked through `unicode_set`).
fn componentStringUnicodePosix(this: *GlobWalker, pattern_component: *Component) []const u32 {
    const codepoints = pattern_component.patternSliceCp(this.pattern_codepoints);
    if (!pattern_component.unicode_set) {
        GlobWalker.convertUtf8ToCodepoints(
            codepoints,
            pattern_component.patternSlice(this.pattern),
        );
        pattern_component.unicode_set = true;
    }
    return codepoints;
}
/// Wraps a matched path in a `BunString`. In sentinel mode the wrapped bytes
/// include the terminating NUL (one byte past `matched_path.len`).
inline fn matchedPathToBunString(matched_path: MatchedPath) BunString {
    const bytes = if (comptime sentinel)
        matched_path[0 .. matched_path.len + 1]
    else
        matched_path;
    return BunString.fromBytes(bytes);
}
/// Records `symlink_full_path` in `matchedPaths`. Returns `null` when the
/// path was already present; otherwise duplicates the path with the arena
/// allocator, stores the duplicate as the map key and returns it.
fn prepareMatchedPathSymlink(this: *GlobWalker, symlink_full_path: []const u8) !?MatchedPath {
    const allocator = this.arena.allocator();

    // Look up using the caller's bytes; the key is replaced by an owned copy below.
    const entry = try this.matchedPaths.getOrPut(allocator, BunString.fromBytes(symlink_full_path));
    if (entry.found_existing) {
        log("(dupe) prepared match: {s}", .{symlink_full_path});
        return null;
    }

    const owned: MatchedPath = if (comptime sentinel)
        try allocator.dupeZ(u8, symlink_full_path)
    else
        try allocator.dupe(u8, symlink_full_path);

    entry.key_ptr.* = matchedPathToBunString(owned);
    return owned;
}
/// Joins `dir_name` and `entry_name`, records the result in `matchedPaths`
/// and returns it. Returns `null` (after freeing the joined path) when the
/// same path was already recorded.
fn prepareMatchedPath(this: *GlobWalker, entry_name: []const u8, dir_name: []const u8) !?MatchedPath {
    const parts: []const []const u8 = &[_][]const u8{ dir_name, entry_name };
    const joined = try this.join(parts);
    const key = matchedPathToBunString(joined);

    const entry = try this.matchedPaths.getOrPutValue(this.arena.allocator(), key, {});
    if (!entry.found_existing) {
        entry.key_ptr.* = key;
        log("prepared match: {s}", .{joined});
        return joined;
    }

    log("(dupe) prepared match: {s}", .{joined});
    this.arena.allocator().free(joined);
    return null;
}
/// Joins `dir_name` and `entry_name` and records the result in
/// `matchedPaths`. When the same path is already recorded the freshly joined
/// copy is released again and the set is left unchanged.
fn appendMatchedPath(
    this: *GlobWalker,
    entry_name: []const u8,
    dir_name: [:0]const u8,
) !void {
    const subdir_parts: []const []const u8 = &[_][]const u8{
        dir_name[0..dir_name.len],
        entry_name,
    };
    const name_matched_path = try this.join(subdir_parts);
    const name = matchedPathToBunString(name_matched_path);
    const result = try this.matchedPaths.getOrPut(this.arena.allocator(), name);
    if (result.found_existing) {
        // Log before freeing: the message reads the bytes being released.
        log("(dupe) prepared match: {s}", .{name_matched_path});
        this.arena.allocator().free(name_matched_path);
        return;
    }
    result.key_ptr.* = name;
}
/// Duplicates `symlink_full_path` with the arena allocator and records the
/// copy in `matchedPaths`.
fn appendMatchedPathSymlink(this: *GlobWalker, symlink_full_path: []const u8) !void {
    const owned = try this.arena.allocator().dupe(u8, symlink_full_path);
    const key = BunString.fromBytes(owned);
    try this.matchedPaths.put(this.arena.allocator(), key, {});
}
/// Joins `subdir_parts` into a single path allocated from the arena.
///
/// Relative mode uses `stdJoin`; absolute mode uses `bunJoin` and copies its
/// result. In sentinel mode the copy is made with `dupeZ` so that the
/// returned slice really is NUL-terminated.
inline fn join(this: *GlobWalker, subdir_parts: []const []const u8) !MatchedPath {
    const allocator = this.arena.allocator();

    if (!this.absolute) {
        // If relative paths enabled, stdlib join is preferred over
        // ResolvePath.joinBuf because it doesn't try to normalize the path
        return try stdJoin(allocator, subdir_parts);
    }

    const joined = bunJoin(subdir_parts, .auto);
    // A plain `dupe` has no terminator, so re-slicing it with a `:0` sentinel
    // would either trip the sentinel check or cut off the last path byte.
    if (comptime sentinel) return try allocator.dupeZ(u8, joined);
    return try allocator.dupe(u8, joined);
}
/// Returns true when `filepath` is non-empty and its first byte is '.'.
inline fn startsWithDot(filepath: []const u8) bool {
    return std.mem.startsWith(u8, filepath, ".");
}
/// Returns true when `pattern` contains any of the bytes `*`, `[`, `{`, `?`
/// or `!`. Full 16-byte chunks are compared with vector operations; the
/// remaining bytes (or the whole input when it is shorter than 16 bytes) are
/// checked one at a time.
fn checkSpecialSyntax(pattern: []const u8) bool {
    const lanes = 16;
    const Chunk = @Vector(lanes, u8);
    const syntax_tokens = [_]u8{ '*', '[', '{', '?', '!' };

    var offset: usize = 0;
    while (pattern.len - offset >= lanes) : (offset += lanes) {
        const chunk: Chunk = pattern[offset..][0..lanes].*;
        inline for (syntax_tokens) |tok| {
            const needle: Chunk = @splat(tok);
            if (@reduce(.Or, chunk == needle)) return true;
        }
    }

    // Scalar tail: fewer than `lanes` bytes are left at this point.
    for (pattern[offset..]) |c| {
        switch (c) {
            '*', '[', '{', '?', '!' => return true,
            else => {},
        }
    }
    return false;
}
/// Builds the `Component` describing `pattern[start_byte..end_byte]`
/// (codepoints `start_cp..end_cp`) and classifies it with a syntax hint.
///
/// Returns `null` for an empty byte range. Sets `has_relative_patterns` when
/// the component is exactly `.` or `..`. Besides the hint, the component
/// records whether it is pure ASCII and whether it ends with a path
/// separator.
fn makeComponent(
    pattern: []const u8,
    start_cp: u32,
    end_cp: u32,
    start_byte: u32,
    end_byte: u32,
    has_relative_patterns: *bool,
) ?Component {
    var component: Component = .{
        .start = start_byte,
        .len = end_byte - start_byte,
        .start_cp = start_cp,
        .end_cp = end_cp,
    };
    if (component.len == 0) return null;

    out: {
        if (component.len == 1 and pattern[component.start] == '.') {
            component.syntax_hint = .Dot;
            has_relative_patterns.* = true;
            break :out;
        }
        // Both bytes have to be dots; checking only the first one would
        // classify any two-byte name starting with '.' (e.g. `.a`) as `..`.
        if (component.len == 2 and
            pattern[component.start] == '.' and
            pattern[component.start + 1] == '.')
        {
            component.syntax_hint = .DotBack;
            has_relative_patterns.* = true;
            break :out;
        }

        if (!GlobWalker.checkSpecialSyntax(pattern[component.start .. component.start + component.len])) {
            component.syntax_hint = .Literal;
            break :out;
        }

        switch (component.len) {
            1 => {
                if (pattern[component.start] == '*') {
                    component.syntax_hint = .Single;
                }
                break :out;
            },
            2 => {
                if (pattern[component.start] == '*' and pattern[component.start + 1] == '*') {
                    component.syntax_hint = .Double;
                    break :out;
                }
            },
            else => {},
        }

        out_of_check_wildcard_filepath: {
            if (component.len > 1 and
                pattern[component.start] == '*' and
                pattern[component.start + 1] == '.' and
                component.start + 2 < pattern.len)
            {
                for (pattern[component.start + 2 ..]) |c| {
                    switch (c) {
                        // The fast path checks that path[1..] == pattern[1..],
                        // this will obviously not work if additional
                        // glob syntax is present in the pattern, so we
                        // must not apply this optimization if we see
                        // special glob syntax.
                        //
                        // This is not a complete check, there can be
                        // false negatives, but that's okay, it just
                        // means we don't apply the optimization.
                        //
                        // We also don't need to look for the `!` token,
                        // because that only applies negation if at the
                        // beginning of the string.
                        '[', '{', '?', '*' => break :out_of_check_wildcard_filepath,
                        else => {},
                    }
                }
                component.syntax_hint = .WildcardFilepath;
                break :out;
            }
        }
    }

    // `*` and `**` are ASCII by definition; everything else is scanned.
    if (component.syntax_hint != .Single and component.syntax_hint != .Double) {
        if (isAllAscii(pattern[component.start .. component.start + component.len])) {
            component.is_ascii = true;
        }
    } else {
        component.is_ascii = true;
    }

    if (pattern[component.start + component.len -| 1] == '/') {
        component.trailing_sep = true;
    } else if (comptime bun.Environment.isWindows) {
        component.trailing_sep = pattern[component.start + component.len -| 1] == '\\';
    }

    return component;
}
/// Splits `pattern` into path components and appends them to
/// `patternComponents`, allocating from `arena`.
///
/// Components are delimited by '/' (and, on Windows, by '\\'). The function
/// also reports, through its out-parameters:
/// - `out_cp_len`: the number of codepoints iterated in `pattern`,
/// - `out_pattern_cp`: a freshly allocated buffer of `cp_len` codepoints
///   (only filled here on Windows),
/// - `has_relative_patterns`: set by `makeComponent` for `.` / `..`,
/// - `end_byte_of_basename_excluding_special_syntax` and
///   `basename_excluding_special_syntax_component_idx`: updated for every
///   component seen before the first one whose hint reports special syntax.
fn buildPatternComponents(
arena: *Arena,
patternComponents: *ArrayList(Component),
pattern: []const u8,
out_cp_len: *u32,
out_pattern_cp: *[]u32,
has_relative_patterns: *bool,
end_byte_of_basename_excluding_special_syntax: *u32,
basename_excluding_special_syntax_component_idx: *u32,
) !void {
// Codepoint / byte offset at which the component being scanned starts.
var start_cp: u32 = 0;
var start_byte: u32 = 0;
const iter = CodepointIterator.init(pattern);
var cursor = CodepointIterator.Cursor{};
var cp_len: u32 = 0;
// NOTE(review): this flag is only read and written inside the '\\' branch below.
var prevIsBackslash = false;
// Becomes (and stays) true once a component with special syntax was produced.
var saw_special = false;
while (iter.next(&cursor)) : (cp_len += 1) {
const c = cursor.c;
switch (c) {
'\\' => {
// On Windows a backslash is a path separator and ends the current component.
if (comptime isWindows) {
var end_cp = cp_len;
var end_byte = cursor.i;
// is last char
// (a trailing separator is kept as part of the final component)
if (cursor.i + cursor.width == pattern.len) {
end_cp += 1;
end_byte += cursor.width;
}
if (makeComponent(
pattern,
start_cp,
end_cp,
start_byte,
end_byte,
has_relative_patterns,
)) |component| {
saw_special = saw_special or component.syntax_hint.isSpecialSyntax();
if (!saw_special) {
basename_excluding_special_syntax_component_idx.* = @intCast(patternComponents.items.len);
end_byte_of_basename_excluding_special_syntax.* = cursor.i + cursor.width;
}
try patternComponents.append(arena.allocator(), component);
}
// The next component starts right after this separator.
start_cp = cp_len + 1;
start_byte = cursor.i + cursor.width;
continue;
}
if (prevIsBackslash) {
prevIsBackslash = false;
continue;
}
prevIsBackslash = true;
},
'/' => {
var end_cp = cp_len;
var end_byte = cursor.i;
// is last char
// (a trailing separator is kept as part of the final component)
if (cursor.i + cursor.width == pattern.len) {
end_cp += 1;
end_byte += cursor.width;
}
if (makeComponent(
pattern,
start_cp,
end_cp,
start_byte,
end_byte,
has_relative_patterns,
)) |component| {
saw_special = saw_special or component.syntax_hint.isSpecialSyntax();
if (!saw_special) {
basename_excluding_special_syntax_component_idx.* = @intCast(patternComponents.items.len);
end_byte_of_basename_excluding_special_syntax.* = cursor.i + cursor.width;
}
try patternComponents.append(arena.allocator(), component);
}
// The next component starts right after this separator.
start_cp = cp_len + 1;
start_byte = cursor.i + cursor.width;
},
// TODO: Support other escaping glob syntax
else => {},
}
}
out_cp_len.* = cp_len;
const codepoints = try arena.allocator().alloc(u32, cp_len);
// On Windows filepaths are UTF-16 so its better to fill the codepoints buffer upfront
if (comptime isWindows) {
GlobWalker.convertUtf8ToCodepoints(codepoints, pattern);
}
out_pattern_cp.* = codepoints;
// Whatever follows the last separator forms the final component;
// `makeComponent` returns null when that range is empty.
const end_cp = cp_len;
if (makeComponent(
pattern,
start_cp,
end_cp,
start_byte,
@intCast(pattern.len),
has_relative_patterns,
)) |component| {
saw_special = saw_special or component.syntax_hint.isSpecialSyntax();
if (!saw_special) {
basename_excluding_special_syntax_component_idx.* = @intCast(patternComponents.items.len);
end_byte_of_basename_excluding_special_syntax.* = cursor.i + cursor.width;
}
try patternComponents.append(arena.allocator(), component);
} else if (!saw_special) {
basename_excluding_special_syntax_component_idx.* = @intCast(patternComponents.items.len);
end_byte_of_basename_excluding_special_syntax.* = cursor.i + cursor.width;
}
}
};
}
// From: https://github.com/The-King-of-Toasters/globlin
/// State for matching a glob against a string
pub const GlobState = struct {
    // Positions in the path (a codepoint cursor) and in the glob (an index
    // into the codepoint slice).
    path_index: CursorState = .{},
    glob_index: u32 = 0,
    // When we hit a * or **, we store the state for backtracking.
    wildcard: Wildcard = .{},
    globstar: Wildcard = .{},

    /// Creates a state whose path cursor is initialized from `path_iter`;
    /// every other field keeps its default.
    fn init(path_iter: *const CodepointIterator) GlobState {
        return .{ .path_index = CursorState.init(path_iter) };
    }

    /// Advances `glob_index` through a brace group that has already been
    /// entered. Returns `.Comma` (positioned just after the comma) when
    /// `stop_on_comma` is set and a comma of the current group is reached,
    /// `.EndBrace` once the matching '}' has been passed, and `.Invalid`
    /// when the glob ends before the group is closed.
    fn skipBraces(self: *GlobState, glob: []const u32, stop_on_comma: bool) BraceState {
        var depth: u32 = 1;
        var inside_class = false;
        while (self.glob_index < glob.len and depth > 0) : (self.glob_index += 1) {
            const c = glob[self.glob_index];
            // These two are handled regardless of being inside a `[...]` class.
            if (c == ']') {
                inside_class = false;
                continue;
            }
            if (c == '\\') {
                // Step over the escaped codepoint as well.
                self.glob_index += 1;
                continue;
            }
            if (inside_class) continue;

            switch (c) {
                // Nested braces
                '{' => depth += 1,
                '}' => depth -= 1,
                ',' => if (stop_on_comma and depth == 1) {
                    self.glob_index += 1;
                    return .Comma;
                },
                '[' => inside_class = true,
                else => {},
            }
        }
        return if (depth == 0) .EndBrace else .Invalid;
    }

    /// Restores both indices to the position saved for the last wildcard.
    inline fn backtrack(self: *GlobState) void {
        const saved = self.wildcard;
        self.glob_index = saved.glob_index;
        self.path_index = saved.path_index;
    }
};
/// Glob and path positions saved when a `*` or `**` is encountered, so the
/// matcher can return to them when backtracking.
const Wildcard = struct {
// Using u32 rather than usize for these results in 10% faster performance.
// glob_index: CursorState = .{},
glob_index: u32 = 0,
path_index: CursorState = .{},
};
/// Result of `GlobState.skipBraces`: the glob ended inside the group
/// (`Invalid`), a comma of the group was reached (`Comma`), or the group's
/// closing brace was passed (`EndBrace`).
const BraceState = enum { Invalid, Comma, EndBrace };
/// Fixed-capacity stack of matcher states saved at each opening '{', plus
/// the furthest path position reached by any alternative matched so far.
/// Holds at most 10 entries; callers check the capacity before pushing.
const BraceStack = struct {
    stack: [10]GlobState = undefined,
    len: u32 = 0,
    longest_brace_match: CursorState = .{},

    /// Saves `state` on the stack and returns a fresh state positioned on
    /// the glob element after the '{', with wildcard bookkeeping reset.
    inline fn push(self: *BraceStack, state: *const GlobState) GlobState {
        const slot = self.len;
        self.stack[slot] = state.*;
        self.len = slot + 1;
        const inner: GlobState = .{
            .glob_index = state.glob_index + 1,
            .path_index = state.path_index,
        };
        return inner;
    }

    /// Drops the top entry and returns a state that continues at
    /// `state.glob_index` from the longest matched path position. The
    /// longest match is cleared once the stack becomes empty.
    inline fn pop(self: *BraceStack, state: *const GlobState) GlobState {
        self.len -= 1;
        const saved = self.stack[self.len];
        const restored: GlobState = .{
            .path_index = self.longest_brace_match,
            .glob_index = state.glob_index,
            // Restore star state if needed later.
            .wildcard = saved.wildcard,
            .globstar = saved.globstar,
        };
        if (self.len == 0) self.longest_brace_match = .{};
        return restored;
    }

    /// Returns a pointer to the most recently pushed state.
    inline fn last(self: *const BraceStack) *const GlobState {
        const top = self.len - 1;
        return &self.stack[top];
    }
};
/// Returns whether the pathname `path` matches the pattern `glob`.
///
/// `glob` is given as a slice of codepoints; `path` is given as bytes and is
/// walked with a `CodepointIterator`.
///
/// The supported pattern syntax for `glob` is:
///
/// "?"
/// Matches any single character.
/// "*"
/// Matches zero or more characters, except for path separators ('/' or '\').
/// "**"
/// Matches zero or more characters, including path separators.
/// Must match a complete path segment, i.e. followed by a path separator or
/// at the end of the pattern.
/// "[ab]"
/// Matches one of the characters contained in the brackets.
/// Character ranges (e.g. "[a-z]") are also supported.
/// Use "[!ab]" or "[^ab]" to match any character *except* those contained
/// in the brackets.
/// "{a,b}"
/// Match one of the patterns contained in the braces.
/// Any of the wildcards listed above can be used in the sub patterns.
/// Braces may be nested up to 10 levels deep.
/// "!"
/// Negates the result when at the start of the pattern.
/// Multiple "!" characters negate the pattern multiple times.
/// "\"
/// Used to escape any of the special characters above.
pub fn matchImpl(glob: []const u32, path: []const u8) bool {
const path_iter = CodepointIterator.init(path);
// This algorithm is based on https://research.swtch.com/glob
var state = GlobState.init(&path_iter);
// Store the state when we see an opening '{' brace in a stack.
// Up to 10 nested braces are supported.
var brace_stack = BraceStack{};
// First, check if the pattern is negated with a leading '!' character.
// Multiple negations can occur.
var negated = false;
while (state.glob_index < glob.len and glob[state.glob_index] == '!') {
negated = !negated;
state.glob_index += 1;
}
// Main loop: runs until both the glob and the path are exhausted. Every
// successful step ends in `continue`; falling out of the switch means the
// current glob element did not match and a fallback has to be tried.
while (state.glob_index < glob.len or state.path_index.cursor.i < path.len) {
if (state.glob_index < glob.len) {
switch (glob[state.glob_index]) {
'*' => {
const is_globstar = state.glob_index + 1 < glob.len and glob[state.glob_index + 1] == '*';
// const is_globstar = state.glob_index.cursor.i + state.glob_index.cursor.width < glob.len and
// state.glob_index.peek(&glob_iter).cursor.c == '*';
if (is_globstar) {
// Coalesce multiple ** segments into one.
var index = state.glob_index + 2;
state.glob_index = skipGlobstars(glob, &index) - 2;
}
// Remember where this wildcard is so a later mismatch can retry
// from here with one more path character consumed.
state.wildcard.glob_index = state.glob_index;
state.wildcard.path_index = state.path_index.peek(&path_iter);
// ** allows path separators, whereas * does not.
// However, ** must be a full path component, i.e. a/**/b not a**b.
if (is_globstar) {
// Skip wildcards
state.glob_index += 2;
if (glob.len == state.glob_index) {
// A trailing ** segment without a following separator.
state.globstar = state.wildcard;
} else if (glob[state.glob_index] == '/' and
(state.glob_index < 3 or glob[state.glob_index - 3] == '/'))
{
// Matched a full /**/ segment. If the last character in the path was a separator,
// skip the separator in the glob so we search for the next character.
// In effect, this makes the whole segment optional so that a/**/b matches a/b.
if (state.path_index.cursor.i == 0 or
(state.path_index.cursor.i < path.len and
isSeparator(path[state.path_index.cursor.i - 1])))
{
state.glob_index += 1;
}
// Record the wildcard position as the active globstar so that a
// later `*` that hits a separator can jump back to it.
state.globstar = state.wildcard;
}
} else {
state.glob_index += 1;
}
// If we are in a * segment and hit a separator,
// either jump back to a previous ** or end the wildcard.
if (state.globstar.path_index.cursor.i != state.wildcard.path_index.cursor.i and
state.path_index.cursor.i < path.len and
isSeparator(state.path_index.cursor.c))
{
// Special case: don't jump back for a / at the end of the glob.
if (state.globstar.path_index.cursor.i > 0 and state.path_index.cursor.i + state.path_index.cursor.width < path.len) {
state.glob_index = state.globstar.glob_index;
state.wildcard.glob_index = state.globstar.glob_index;
} else {
state.wildcard.path_index.cursor.i = 0;
}
}
// If the next char is a special brace separator,
// skip to the end of the braces so we don't try to match it.
if (brace_stack.len > 0 and
state.glob_index < glob.len and
(glob[state.glob_index] == ',' or glob[state.glob_index] == '}'))
{
if (state.skipBraces(glob, false) == .Invalid)
return false; // invalid pattern!
}
continue;
},
// `?` consumes exactly one path character that is not a separator.
'?' => if (state.path_index.cursor.i < path.len) {
if (!isSeparator(state.path_index.cursor.c)) {
state.glob_index += 1;
state.path_index.bump(&path_iter);
continue;
}
},
'[' => if (state.path_index.cursor.i < path.len) {
state.glob_index += 1;
const c = state.path_index.cursor.c;
// Check if the character class is negated.
var class_negated = false;
if (state.glob_index < glob.len and
(glob[state.glob_index] == '^' or glob[state.glob_index] == '!'))
{
class_negated = true;
state.glob_index += 1;
}
// Try each range.
// `first` lets a ']' in the first position be read as a class member.
var first = true;
var is_match = false;
while (state.glob_index < glob.len and (first or glob[state.glob_index] != ']')) {
var low = glob[state.glob_index];
if (!unescape(&low, glob, &state.glob_index))
return false; // Invalid pattern
state.glob_index += 1;
// If there is a - and the following character is not ],
// read the range end character.
const high = if (state.glob_index + 1 < glob.len and
glob[state.glob_index] == '-' and glob[state.glob_index + 1] != ']')
blk: {
state.glob_index += 1;
var h = glob[state.glob_index];
if (!unescape(&h, glob, &state.glob_index))
return false; // Invalid pattern!
state.glob_index += 1;
break :blk h;
} else low;
if (low <= c and c <= high)
is_match = true;
first = false;
}
// Running off the end of the glob means the class was never closed.
if (state.glob_index >= glob.len)
return false; // Invalid pattern!
state.glob_index += 1;
if (is_match != class_negated) {
state.path_index.bump(&path_iter);
continue;
}
},
'{' => if (state.path_index.cursor.i < path.len) {
if (brace_stack.len >= brace_stack.stack.len)
return false; // Invalid pattern! Too many nested braces.
// Push old state to the stack, and reset current state.
state = brace_stack.push(&state);
continue;
},
'}' => if (brace_stack.len > 0) {
// If we hit the end of the braces, we matched the last option.
brace_stack.longest_brace_match = if (state.path_index.cursor.i >= brace_stack.longest_brace_match.cursor.i)
state.path_index
else
brace_stack.longest_brace_match;
state.glob_index += 1;
state = brace_stack.pop(&state);
continue;
},
',' => if (brace_stack.len > 0) {
// If we hit a comma, we matched one of the options!
// But we still need to check the others in case there is a longer match.
brace_stack.longest_brace_match = if (state.path_index.cursor.i >= brace_stack.longest_brace_match.cursor.i)
state.path_index
else
brace_stack.longest_brace_match;
state.path_index = brace_stack.last().path_index;
state.glob_index += 1;
state.wildcard = Wildcard{};
state.globstar = Wildcard{};
continue;
},
else => |c| if (state.path_index.cursor.i < path.len) {
var cc = c;
// Match escaped characters as literals.
if (!unescape(&cc, glob, &state.glob_index))
return false; // Invalid pattern;
// A '/' in the glob matches whatever `isSeparator` accepts.
const is_match = if (cc == '/')
isSeparator(state.path_index.cursor.c)
else
state.path_index.cursor.c == cc;
if (is_match) {
if (brace_stack.len > 0 and
state.glob_index > 0 and
glob[state.glob_index - 1] == '}')
{
brace_stack.longest_brace_match = state.path_index;
state = brace_stack.pop(&state);
}
state.glob_index += 1;
state.path_index.bump(&path_iter);
// If this is not a separator, lock in the previous globstar.
if (cc != '/')
state.globstar.path_index.cursor.i = 0;
continue;
}
},
}
}
// If we didn't match, restore state to the previous star pattern.
if (state.wildcard.path_index.cursor.i > 0 and state.wildcard.path_index.cursor.i <= path.len) {
state.backtrack();
continue;
}
if (brace_stack.len > 0) {
// If in braces, find next option and reset path to index where we saw the '{'
switch (state.skipBraces(glob, true)) {
.Invalid => return false,
.Comma => {
state.path_index = brace_stack.last().path_index;
continue;
},
.EndBrace => {},
}
// Hit the end. Pop the stack.
// If we matched a previous option, use that.
if (brace_stack.longest_brace_match.cursor.i > 0) {
state = brace_stack.pop(&state);
continue;
} else {
// Didn't match. Restore state, and check if we need to jump back to a star pattern.
state = brace_stack.last().*;
brace_stack.len -= 1;
if (state.wildcard.path_index.cursor.i > 0 and state.wildcard.path_index.cursor.i <= path.len) {
state.backtrack();
continue;
}
}
}
// No fallback left: the pattern does not match, so the result is
// true only for a negated pattern.
return negated;
}
// Both the glob and the path were consumed completely.
return !negated;
}
/// Returns true for '/', and additionally for '\\' when compiling for Windows.
pub inline fn isSeparator(c: Codepoint) bool {
    const on_windows = comptime @import("builtin").os.tag == .windows;
    return switch (c) {
        '/' => true,
        '\\' => on_windows,
        else => false,
    };
}
/// Resolves a backslash escape in `glob`.
///
/// When `c.*` is a backslash, `glob_index` is advanced to the escaped
/// codepoint and `c.*` is replaced by the codepoint it stands for (`\b`,
/// `\n`, `\r`, `\t` map to control characters, anything else maps to itself).
/// Returns false when the backslash is the last codepoint of the glob;
/// `glob_index` has already been advanced in that case. Any other `c.*`
/// is left untouched and true is returned.
inline fn unescape(c: *u32, glob: []const u32, glob_index: *u32) bool {
    if (c.* != '\\') return true;

    const escaped_at = glob_index.* + 1;
    glob_index.* = escaped_at;
    if (escaped_at >= glob.len) return false; // Invalid pattern!

    const escaped = glob[escaped_at];
    c.* = switch (escaped) {
        // NOTE(review): 0x61 is 'a' itself, not BEL (0x07) — confirm this is intended.
        'a' => '\x61',
        'b' => '\x08',
        'n' => '\n',
        'r' => '\r',
        't' => '\t',
        else => escaped,
    };
    return true;
}
/// The codepoint sequence `/**`, used to detect repeated globstar segments.
const GLOB_STAR_MATCH_STR: []const u32 = &[_]u32{ '/', '*', '*' };

/// Advances `glob_index` past every consecutive `/**` found at that position
/// and returns the resulting index, so that e.g. `src/**/**/foo.ts` is
/// treated like `src/**/foo.ts`.
inline fn skipGlobstars(glob: []const u32, glob_index: *u32) u32 {
    var cursor = glob_index.*;
    // Coalesce multiple ** segments into one.
    while (cursor + 3 <= glob.len and
        std.mem.startsWith(u32, glob[cursor..], GLOB_STAR_MATCH_STR))
    {
        cursor += 3;
    }
    glob_index.* = cursor;
    return cursor;
}
// NOTE(review): empty struct with no visible use in this part of the file — confirm whether it is still needed.
const MatchAscii = struct {};
/// Fast path for globs of the form `*<suffix>` (for example `*.ts`): returns
/// true when `path` ends with everything after the first byte of `glob`.
/// `glob` must not be empty.
pub fn matchWildcardFilepath(glob: []const u8, path: []const u8) bool {
    const suffix = glob[1..];
    return std.mem.endsWith(u8, path, suffix);
}
/// Returns true when `path` is byte-for-byte identical to `literal`.
pub fn matchWildcardLiteral(literal: []const u8, path: []const u8) bool {
    if (literal.len != path.len) return false;
    return std.mem.startsWith(u8, path, literal);
}
/// Returns true if the given string contains glob syntax,
/// excluding those escaped with backslashes.
///
/// A leading `!` always counts as glob syntax. The tokens `*`, `{`, `[` and
/// `?` count when they are preceded by an even number (including zero) of
/// backslashes.
/// TODO: this doesn't play nicely with Windows directory separator and
/// backslashing, should we just require the user to supply posix filepaths?
pub fn detectGlobSyntax(potential_pattern: []const u8) bool {
    // Negation only allowed in the beginning of the pattern
    if (potential_pattern.len > 0 and potential_pattern[0] == '!') return true;

    // In descending order of how popular the token is
    const SPECIAL_SYNTAX = [_]u8{ '*', '{', '[', '?' };

    inline for (SPECIAL_SYNTAX) |token| {
        // Search with an offset into the full pattern so that `idx` is always
        // an index into `potential_pattern`, which the backslash scan relies on.
        var search_from: usize = 0;
        while (std.mem.indexOfScalarPos(u8, potential_pattern, search_from, token)) |idx| {
            // Check for even number of backslashes preceding the
            // token to know that it's not escaped
            var i = idx;
            var backslash_count: usize = 0;
            while (i > 0 and potential_pattern[i - 1] == '\\') : (i -= 1) {
                backslash_count += 1;
            }
            if (backslash_count % 2 == 0) return true;
            search_from = idx + 1;
        }
    }

    return false;
}