const bun = @import("root").bun;
const std = @import("std");
const builtin = @import("builtin");
const Arena = std.heap.ArenaAllocator;
const Allocator = std.mem.Allocator;
const ArrayList = std.ArrayList;
const JSC = bun.JSC;
const JSValue = bun.JSC.JSValue;
const JSPromise = bun.JSC.JSPromise;
const JSGlobalObject = bun.JSC.JSGlobalObject;
const Which = @import("../which.zig");
const Braces = @import("./braces.zig");
const Syscall = @import("../sys.zig");
const Glob = @import("../glob.zig");
const ResolvePath = @import("../resolver/resolve_path.zig");
const DirIterator = @import("../bun.js/node/dir_iterator.zig");
const CodepointIterator = @import("../string_immutable.zig").PackedCodepointIterator;
const isAllAscii = @import("../string_immutable.zig").isAllASCII;
const TaggedPointerUnion = @import("../tagged_pointer.zig").TaggedPointerUnion;

pub const eval = @import("./interpreter.zig");
pub const interpret = @import("./interpreter.zig");
pub const subproc = @import("./subproc.zig");

pub const EnvMap = interpret.EnvMap;
pub const EnvStr = interpret.EnvStr;
pub const Interpreter = eval.Interpreter;
pub const InterpreterMini = eval.InterpreterMini;
pub const Subprocess = subproc.ShellSubprocess;
pub const SubprocessMini = subproc.ShellSubprocessMini;

const GlobWalker = Glob.GlobWalker_(null, true);
// const GlobWalker = Glob.BunGlobWalker;

pub const SUBSHELL_TODO_ERROR = "Subshells are not implemented, please open GitHub issue.";

/// The strings in this type are allocated with event loop ctx allocator
///
/// A shell error value. It is produced by the `throw*` helpers on `GlobalJS` /
/// `GlobalMini` and surfaced later via `throwJS` or `throwMini`.
pub const ShellErr = union(enum) {
    /// An error converted from a syscall failure (see `newSys`).
    sys: JSC.SystemError,
    /// A free-form message.
    custom: []const u8,
    /// A message describing invalid arguments; defaults to the empty string.
    invalid_arguments: struct { val: []const u8 = "" },
    /// A message describing unimplemented functionality.
    todo: []const u8,

    /// Wraps a syscall error by converting it with `toSystemError`.
    pub fn newSys(e: Syscall.Error) @This() {
        return .{
            .sys = e.toSystemError(),
        };
    }

    /// Raises this error on `globalThis`, using the throw helper that matches
    /// the active variant.
    pub fn throwJS(this: @This(), globalThis: *JSC.JSGlobalObject) void {
        switch (this) {
            .sys => {
                const err = this.sys.toErrorInstance(globalThis);
                globalThis.throwValue(err);
            },
            .custom => {
                var str = JSC.ZigString.init(this.custom);
                str.markUTF8();
                const err_value = str.toErrorInstance(globalThis);
                globalThis.vm().throwError(globalThis, err_value);
                // this.bunVM().allocator.free(JSC.ZigString.untagged(str._unsafe_ptr_do_not_use)[0..str.len]);
            },
            .invalid_arguments => {
                globalThis.throwInvalidArguments("{s}", .{this.invalid_arguments.val});
            },
            .todo => {
                globalThis.throwTODO(this.todo);
            },
        }
    }

    /// Prints this error with `bun.Output.prettyErrorln` and then calls
    /// `bun.Global.exit(1)`. Every variant ends in that exit call.
    pub fn throwMini(this: @This()) void {
        switch (this) {
            .sys => {
                const err = this.sys;
                const str = std.fmt.allocPrint(bun.default_allocator, "bunsh: {s}: {}", .{ err.message, err.path }) catch bun.outOfMemory();
                bun.Output.prettyErrorln("error: Failed to due to error {s}", .{str});
                bun.Global.exit(1);
            },
            .custom => {
                bun.Output.prettyErrorln("error: Failed to due to error {s}", .{this.custom});
                bun.Global.exit(1);
            },
            .invalid_arguments => {
                const str = std.fmt.allocPrint(bun.default_allocator, "bunsh: invalid arguments: {s}", .{this.invalid_arguments.val}) catch bun.outOfMemory();
                bun.Output.prettyErrorln("error: Failed to due to error {s}", .{str});
                bun.Global.exit(1);
            },
            .todo => {
                bun.Output.prettyErrorln("error: Failed to due to error TODO: {s}", .{this.todo});
                bun.Global.exit(1);
            },
        }
    }

    /// Frees the message owned by the `.custom` and `.todo` variants with
    /// `allocator`. The `.sys` and `.invalid_arguments` payloads are left
    /// untouched here.
    pub fn deinit(this: @This(), allocator: Allocator) void {
        switch (this) {
            .sys => {
                // this.sys.
            },
            .custom => allocator.free(this.custom),
            .invalid_arguments => {},
            .todo => allocator.free(this.todo),
        }
    }
};

/// Builds a two-variant union carrying either a `T` result or a `ShellErr`.
/// `success` is a ready-made `.result` whose payload is `std.mem.zeroes(T)`.
pub fn Result(comptime T: anytype) type {
    return union(enum) {
        result: T,
        err: ShellErr,

        pub const success: @This() = @This(){
            .result = std.mem.zeroes(T),
        };
    };
}

pub const ShellError = error{ Init, Process, GlobalThisThrown, Spawn };
pub const ParseError = error{
    Expected,
    Unknown,
    Lex,
};

extern "C" fn setenv(name: [*:0]const u8, value: [*:0]const u8, overwrite: i32) i32;

/// Sets an environment variable through libc `setenv` with `overwrite = 1`.
/// The return code is discarded.
fn setEnv(name: [*:0]const u8, value: [*:0]const u8) void {
    // TODO: windows
    _ = setenv(name, value, 1);
}

/// [0] => read end
/// [1] => write end
pub const Pipe = [2]bun.FileDescriptor;

const log = bun.Output.scoped(.SHELL, false);
const logsys = bun.Output.scoped(.SYS, false);

/// Adapter around a JS global object. `GlobalMini` exposes the same method
/// names (except `throwError`) on top of a `MiniEventLoop`.
///
/// Despite their names, `throw`, `throwTODO`, `throwInvalidArguments` and
/// `handleError` only build and return a `ShellErr`; `actuallyThrow` is what
/// raises one.
pub const GlobalJS = struct {
    globalThis: *JSC.JSGlobalObject,

    pub inline fn init(g: *JSC.JSGlobalObject) GlobalJS {
        return .{
            .globalThis = g,
        };
    }

    /// Returns the VM's allocator.
    pub inline fn allocator(this: @This()) Allocator {
        return this.globalThis.bunVM().allocator;
    }

    pub inline fn eventLoopCtx(this: @This()) *JSC.VirtualMachine {
        return this.globalThis.bunVM();
    }

    /// Formats `fmt`/`args` with the VM allocator into an `.invalid_arguments` error.
    pub inline fn throwInvalidArguments(this: @This(), comptime fmt: []const u8, args: anytype) bun.shell.ShellErr {
        return .{
            .invalid_arguments = .{ .val = std.fmt.allocPrint(this.globalThis.bunVM().allocator, fmt, args) catch bun.outOfMemory() },
        };
    }

    /// Copies `msg` with the VM allocator into a `.todo` error.
    pub inline fn throwTODO(this: @This(), msg: []const u8) bun.shell.ShellErr {
        return .{
            .todo = std.fmt.allocPrint(this.globalThis.bunVM().allocator, "{s}", .{msg}) catch bun.outOfMemory(),
        };
    }

    /// Converts `err` to a JS value and throws it on the global object.
    pub inline fn throwError(this: @This(), err: bun.sys.Error) void {
        this.globalThis.throwValue(err.toJSC(this.globalThis));
    }

    /// Builds a `.custom` error whose message is `@errorName(err)`, a space,
    /// then `fmt`. `fmt` is appended at comptime and receives no arguments of
    /// its own.
    pub inline fn handleError(this: @This(), err: anytype, comptime fmt: []const u8) bun.shell.ShellErr {
        const str = std.fmt.allocPrint(this.globalThis.bunVM().allocator, "{s} " ++ fmt, .{@errorName(err)}) catch bun.outOfMemory();
        return .{
            .custom = str,
        };
    }

    /// Formats `fmt`/`args` with the VM allocator into a `.custom` error.
    pub inline fn throw(this: @This(), comptime fmt: []const u8, args: anytype) bun.shell.ShellErr {
        const str = std.fmt.allocPrint(this.globalThis.bunVM().allocator, fmt, args) catch bun.outOfMemory();
        return .{
            .custom = str,
        };
    }

    pub inline fn createNullDelimitedEnvMap(this: @This(), alloc: Allocator) ![:null]?[*:0]u8 {
        return this.globalThis.bunVM().bundler.env.map.createNullDelimitedEnvMap(alloc);
    }

    pub inline fn getAllocator(this: @This()) Allocator {
        return this.globalThis.bunVM().allocator;
    }

    /// Wraps `task` in a `JSC.ConcurrentTask` and enqueues it on the VM
    /// obtained from `bunVMConcurrently()`.
    pub inline fn enqueueTaskConcurrentWaitPid(this: @This(), task: anytype) void {
        this.globalThis.bunVMConcurrently().enqueueTaskConcurrent(JSC.ConcurrentTask.create(JSC.Task.init(task)));
    }

    pub inline fn topLevelDir(this: @This()) []const u8 {
        return this.globalThis.bunVM().bundler.fs.top_level_dir;
    }

    pub inline fn env(this: @This()) *bun.DotEnv.Loader {
        return this.globalThis.bunVM().bundler.env;
    }

    pub inline fn platformEventLoop(this: @This()) *JSC.PlatformEventLoop {
        const loop = JSC.AbstractVM(this.eventLoopCtx());
        return loop.platformEventLoop();
    }

    /// Raises `shellerr` on the global object via `ShellErr.throwJS`.
    pub inline fn actuallyThrow(this: @This(), shellerr: bun.shell.ShellErr) void {
        shellerr.throwJS(this.globalThis);
    }
};

/// Adapter around a `MiniEventLoop`; the non-JS counterpart of `GlobalJS`.
///
/// As in `GlobalJS`, the `throw*` / `handleError` helpers only build a
/// `ShellErr`. `actuallyThrow` forwards to `ShellErr.throwMini`, which prints
/// the error and exits.
pub const GlobalMini = struct {
    mini: *JSC.MiniEventLoop,

    pub inline fn init(g: *JSC.MiniEventLoop) @This() {
        return .{
            .mini = g,
        };
    }

    /// Returns the loop's env loader; unwraps the optional with `.?`.
    pub inline fn env(this: @This()) *bun.DotEnv.Loader {
        return this.mini.env.?;
    }

    pub inline fn allocator(this: @This()) Allocator {
        return this.mini.allocator;
    }

    pub inline fn eventLoopCtx(this: @This()) *JSC.MiniEventLoop {
        return this.mini;
    }

    // pub inline fn throwShellErr(this: @This(), shell_err: bun.shell.ShellErr

    /// Copies `msg` with the loop allocator into a `.todo` error.
    pub inline fn throwTODO(this: @This(), msg: []const u8) bun.shell.ShellErr {
        return .{
            .todo = std.fmt.allocPrint(this.mini.allocator, "{s}", .{msg}) catch bun.outOfMemory(),
        };
    }

    /// Formats `fmt`/`args` with the loop allocator into an `.invalid_arguments` error.
    pub inline fn throwInvalidArguments(this: @This(), comptime fmt: []const u8, args: anytype) bun.shell.ShellErr {
        return .{
            .invalid_arguments = .{ .val = std.fmt.allocPrint(this.allocator(), fmt, args) catch bun.outOfMemory() },
        };
    }

    /// Builds a `.custom` error whose message is `@errorName(err)`, a space,
    /// then `fmt`.
    pub inline fn handleError(this: @This(), err: anytype, comptime fmt: []const u8) bun.shell.ShellErr {
        const str = std.fmt.allocPrint(this.mini.allocator, "{s} " ++ fmt, .{@errorName(err)}) catch bun.outOfMemory();
        return .{
            .custom = str,
        };
    }

    pub inline fn createNullDelimitedEnvMap(this: @This(), alloc: Allocator) ![:null]?[*:0]u8 {
        return this.mini.env.?.map.createNullDelimitedEnvMap(alloc);
    }

    pub inline fn getAllocator(this: @This()) Allocator {
        return this.mini.allocator;
    }

    /// Allocates a `JSC.AnyTaskWithExtraContext` with `bun.default_allocator`,
    /// binds it to `task` under the name "runFromMainThreadMini", and enqueues
    /// it on the mini loop.
    pub inline fn enqueueTaskConcurrentWaitPid(this: @This(), task: anytype) void {
        var anytask = bun.default_allocator.create(JSC.AnyTaskWithExtraContext) catch bun.outOfMemory();
        _ = anytask.from(task, "runFromMainThreadMini");
        this.mini.enqueueTaskConcurrent(anytask);
    }

    pub inline fn topLevelDir(this: @This()) []const u8 {
        return this.mini.top_level_dir;
    }

    /// Formats `fmt`/`args` with the loop allocator into a `.custom` error.
    pub inline fn throw(this: @This(), comptime fmt: []const u8, args: anytype) bun.shell.ShellErr {
        const str = std.fmt.allocPrint(this.allocator(), fmt, args) catch bun.outOfMemory();
        return .{
            .custom = str,
        };
    }

    /// Reports `shellerr` via `ShellErr.throwMini`.
    pub inline fn actuallyThrow(this: @This(), shellerr: bun.shell.ShellErr) void {
        _ = this; // autofix
        shellerr.throwMini();
    }

    pub inline fn platformEventLoop(this: @This()) *JSC.PlatformEventLoop {
        const loop = JSC.AbstractVM(this.eventLoopCtx());
        return loop.platformEventLoop();
    }
};

// const GlobalHandle = if (JSC.EventLoopKind == .js) GlobalJS else GlobalMini;

/// Namespace for the syntax tree types produced by `Parser`.
pub const AST = struct {
    /// A list of statements.
    pub const Script = struct {
        stmts: []Stmt,
    };

    /// A list of expressions parsed up to a `;`, newline or EOF (see
    /// `Parser.parse_stmt`).
    pub const Stmt = struct {
        exprs: []Expr,
    };

    pub const Expr = union(Expr.Tag) {
        assign: []Assign,
        cond: *Conditional,
        pipeline: *Pipeline,
        cmd: *Cmd,
        subshell: Script,

        /// Converts this expression into a `PipelineItem`. Returns null for
        /// `.cond` and `.pipeline`, which cannot be placed inside a pipeline.
        pub fn asPipelineItem(this: *Expr) ?PipelineItem {
            return switch (this.*) {
                .assign => .{ .assigns = this.assign },
                .cmd => .{ .cmd = this.cmd },
                .subshell => .{ .subshell = this.subshell },
                else => null,
            };
        }

        pub const Tag = enum { assign, cond, pipeline, cmd, subshell };
    };

    /// `left && right` or `left || right`.
    pub const Conditional = struct {
        op: Op,
        left: Expr,
        right: Expr,

        const Op = enum { And, Or };
    };

    pub const Pipeline = struct {
        items: []PipelineItem,
    };

    pub const PipelineItem = union(enum) {
        cmd: *Cmd,
        assigns: []Assign,
        subshell: Script,
    };

    /// Result of `Parser.parse_cmd_or_assigns`: either a full command or a
    /// bare list of assignments.
    pub const CmdOrAssigns = union(CmdOrAssigns.Tag) {
        cmd: Cmd,
        assigns: []Assign,

        pub const Tag = enum { cmd, assigns };

        /// Converts to a `PipelineItem`. A `.cmd` payload is copied into
        /// memory created with `alloc`, so the call can fail with an
        /// allocation error.
        pub fn to_pipeline_item(this: CmdOrAssigns, alloc: Allocator) !PipelineItem {
            switch (this) {
                .cmd => |cmd| {
                    const cmd_ptr = try alloc.create(Cmd);
                    cmd_ptr.* = cmd;
                    return .{ .cmd = cmd_ptr };
                },
                .assigns => |assigns| {
                    return .{ .assigns = assigns };
                },
            }
        }

        /// Converts to an `Expr`. A `.cmd` payload is copied into memory
        /// created with `alloc`.
        pub fn to_expr(this: CmdOrAssigns, alloc: Allocator) !Expr {
            switch (this) {
                .cmd => |cmd| {
                    const cmd_ptr = try alloc.create(Cmd);
                    cmd_ptr.* = cmd;
                    return .{ .cmd = cmd_ptr };
                },
                .assigns => |assigns| {
                    return .{ .assign = assigns };
                },
            }
        }
    };

    /// A "buffer" from a JS object can be piped from and to, and also have
    /// output from commands redirected into it. Only BunFile, ArrayBufferView
    /// are supported.
    pub const JSBuf = struct {
        idx: u32,

        pub fn new(idx: u32) JSBuf {
            return .{ .idx = idx };
        }
    };

    /// A Subprocess from JS
    pub const JSProc = struct { idx: JSValue };

    /// A `label=value` assignment.
    pub const Assign = struct {
        label: []const u8,
        value: Atom,

        pub fn new(label: []const u8, value: Atom) Assign {
            return .{
                .label = label,
                .value = value,
            };
        }
    };

    /// A command: leading assignments, the command name plus arguments, and
    /// an optional redirect.
    pub const Cmd = struct {
        assigns: []Assign,
        name_and_args: []Atom,
        redirect: RedirectFlags = .{},
        redirect_file: ?Redirect = null,

        /// Bit flags for redirects:
        /// - `>`  = Redirect.Stdout
        /// - `1>` = Redirect.Stdout
        /// - `2>` = Redirect.Stderr
        /// - `&>` = Redirect.Stdout | Redirect.Stderr
        /// - `>>` = Redirect.Append | Redirect.Stdout
        /// - `1>>` = Redirect.Append | Redirect.Stdout
        /// - `2>>` = Redirect.Append | Redirect.Stderr
        /// - `&>>` = Redirect.Append | Redirect.Stdout | Redirect.Stderr
        ///
        /// Multiple redirects and redirecting stdin is not supported yet.
        pub const RedirectFlags = packed struct(u8) {
            stdin: bool = false,
            stdout: bool = false,
            stderr: bool = false,
            append: bool = false,
            __unused: u4 = 0,

            pub fn @"<"() RedirectFlags {
                return .{ .stdin = true };
            }

            pub fn @"<<"() RedirectFlags {
                return .{ .stdin = true, .append = true };
            }

            pub fn @">"() RedirectFlags {
                return .{ .stdout = true };
            }

            pub fn @">>"() RedirectFlags {
                return .{ .append = true, .stdout = true };
            }

            pub fn @"&>"() RedirectFlags {
                return .{ .stdout = true, .stderr = true };
            }

            pub fn @"&>>"() RedirectFlags {
                return .{ .append = true, .stdout = true, .stderr = true };
            }

            /// Returns the bitwise OR of both flag sets.
            pub fn merge(a: RedirectFlags, b: RedirectFlags) RedirectFlags {
                const anum: u8 = @bitCast(a);
                const bnum: u8 = @bitCast(b);
                return @bitCast(anum | bnum);
            }
        };

        /// Target of a redirect: a parsed atom or a JS buffer reference.
        pub const Redirect = union(enum) {
            atom: Atom,
            jsbuf: JSBuf,
        };
    };

    /// One shell word: a single `SimpleAtom` or a run of adjacent ones.
    pub const Atom = union(Atom.Tag) {
        simple: SimpleAtom,
        compound: CompoundAtom,

        pub const Tag = enum(u8) { simple, compound };

        /// Number of simple atoms: 1 for `.simple`, otherwise the compound's length.
        pub fn atomsLen(this: *const Atom) u32 {
            return switch (this.*) {
                .simple => 1,
                .compound => @intCast(this.compound.atoms.len),
            };
        }

        pub fn new_simple(atom: SimpleAtom) Atom {
            return .{ .simple = atom };
        }

        pub fn is_compound(self: *const Atom) bool {
            switch (self.*) {
                .compound => return true,
                else => return false,
            }
        }

        pub fn has_expansions(self: *const Atom) bool {
            return self.has_glob_expansion() or self.has_brace_expansion();
        }

        pub fn has_glob_expansion(self: *const Atom) bool {
            return switch (self.*) {
                .simple => self.simple.glob_hint(),
                .compound => self.compound.glob_hint,
            };
        }

        /// A simple atom never reports brace expansion; a compound atom
        /// reports its `brace_expansion_hint`.
        pub fn has_brace_expansion(self: *const Atom) bool {
            return switch (self.*) {
                .simple => false,
                .compound => self.compound.brace_expansion_hint,
            };
        }
    };

    pub const SimpleAtom = union(enum) {
        Var: []const u8,
        Text: []const u8,
        asterisk,
        double_asterisk,
        brace_begin,
        brace_end,
        comma,
        cmd_subst: struct {
            script: Script,
            quoted: bool = false,
        },

        /// True for `*` and `**`.
        pub fn glob_hint(this: SimpleAtom) bool {
            return switch (this) {
                .asterisk, .double_asterisk => true,
                else => false,
            };
        }

        /// True for `*`, `**` and command substitutions.
        pub fn mightNeedIO(this: SimpleAtom) bool {
            return switch (this) {
                .asterisk, .double_asterisk, .cmd_subst => true,
                else => false,
            };
        }
    };

    /// A word made of several simple atoms, with hints set by `Parser.parse_atom`.
    pub const CompoundAtom = struct {
        atoms: []SimpleAtom,
        brace_expansion_hint: bool = false,
        glob_hint: bool = false,
    };
};

/// Recursive-descent parser over the token list of a `LexResult`.
///
/// Nodes and error messages are allocated with `alloc`. Human-readable
/// messages accumulate in `errors`; parse functions additionally return a
/// `ParseError`.
pub const Parser = struct {
    strpool: []const u8,
    tokens: []const Token,
    alloc: Allocator,
    jsobjs: []JSValue,
    /// Index into `tokens` of the next token to examine.
    current: u32 = 0,
    errors: std.ArrayList(Error),
    /// Non-null while parsing the body of a subshell / command substitution.
    inside_subshell: ?SubshellKind = null,

    const SubshellKind = enum {
        cmd_subst,
        normal,

        /// The token tag that terminates this kind of subshell.
        pub fn closing_tok(this: SubshellKind) TokenTag {
            return switch (this) {
                .cmd_subst => TokenTag.CmdSubstEnd,
                .normal => TokenTag.CloseParen,
            };
        }
    };

    // FIXME error location
    const Error = struct { msg: []const u8 };

    /// Creates a parser over `lex_result`'s tokens and string pool.
    pub fn new(
        allocator: Allocator,
        lex_result: LexResult,
        jsobjs: []JSValue,
    ) !Parser {
        return .{
            .strpool = lex_result.strpool,
            .tokens = lex_result.tokens,
            .alloc = allocator,
            .jsobjs = jsobjs,
            .errors = std.ArrayList(Error).init(allocator),
        };
    }

    /// Creates a parser that shares this parser's input and starts at the
    /// same position, with `inside_subshell` set to `kind`. Pair it with
    /// `continue_from_subparser`.
    pub fn make_subparser(this: *Parser, kind: SubshellKind) Parser {
        return .{
            .strpool = this.strpool,
            .tokens = this.tokens,
            .alloc = this.alloc,
            .jsobjs = this.jsobjs,
            .current = this.current,
            // We replace the old Parser's struct with the updated error list
            // when this subparser is done
            .errors = this.errors,
            .inside_subshell = kind,
        };
    }

    /// Resumes after a subparser. Takes over its error list and moves
    /// `current` to one past the subparser's position, or to that position
    /// itself when it is already out of bounds.
    pub fn continue_from_subparser(this: *Parser, subparser: *Parser) void {
        // this.current = if (this.tokens[subparser.current] == .Eof) subparser.current else subparser;
        this.current = if (subparser.current >= this.tokens.len) subparser.current else subparser.current + 1;
        this.errors = subparser.errors;
    }

    /// Parses the whole token list into a script. Any `(` or `)` token is
    /// rejected up front with `ParseError.Expected`.
    pub fn parse(self: *Parser) !AST.Script {
        // Check for subshell syntax which is not supported rn
        for (self.tokens) |tok| {
            switch (tok) {
                .OpenParen => {
                    try self.add_error("Unexpected `(`, subshells are currently not supported right now. Escape the `(` or open a GitHub issue.", .{});
                    return ParseError.Expected;
                },
                .CloseParen => {
                    try self.add_error("Unexpected `)`, subshells are currently not supported right now. Escape the `)` or open a GitHub issue.", .{});
                    return ParseError.Expected;
                },
                else => {},
            }
        }
        return try self.parse_impl();
    }

    /// Parses statements until EOF or, inside a subshell, its closing token.
    /// Returns an empty script when there are no tokens or only an `Eof`.
    pub fn parse_impl(self: *Parser) !AST.Script {
        var stmts = ArrayList(AST.Stmt).init(self.alloc);
        if (self.tokens.len == 0 or (self.tokens.len == 1 and self.tokens[0] == .Eof))
            return .{ .stmts = stmts.items[0..stmts.items.len] };

        while (if (self.inside_subshell == null)
            !self.match(.Eof)
        else
            !self.match_any(&.{ .Eof, self.inside_subshell.?.closing_tok() }))
        {
            try stmts.append(try self.parse_stmt());
        }
        if (self.inside_subshell) |kind| {
            _ = self.expect_any(&.{ .Eof, kind.closing_tok() });
        } else {
            _ = self.expect(.Eof);
        }
        return .{ .stmts = stmts.items[0..stmts.items.len] };
    }

    /// Parses expressions until a `;`, newline, EOF, or (inside a subshell)
    /// the closing token is matched.
    pub fn parse_stmt(self: *Parser) !AST.Stmt {
        var exprs = std.ArrayList(AST.Expr).init(self.alloc);

        while (if (self.inside_subshell == null)
            !self.match_any_comptime(&.{ .Semicolon, .Newline, .Eof })
        else
            !self.match_any(&.{ .Semicolon, .Newline, .Eof, self.inside_subshell.?.closing_tok() }))
        {
            const expr = try self.parse_expr();
            try exprs.append(expr);
        }

        return .{
            .exprs = exprs.items[0..],
        };
    }

    fn parse_expr(self: *Parser) !AST.Expr {
        return self.parse_cond();
    }

    /// Parses a chain of pipelines joined by `&&` / `||`. Each new operator
    /// wraps everything parsed so far as its left operand.
    fn parse_cond(self: *Parser) !AST.Expr {
        var left = try self.parse_pipeline();
        while (self.match_any_comptime(&.{ .DoubleAmpersand, .DoublePipe })) {
            const op: AST.Conditional.Op = op: {
                const previous = @as(TokenTag, self.prev());
                switch (previous) {
                    .DoubleAmpersand => break :op .And,
                    .DoublePipe => break :op .Or,
                    else => unreachable,
                }
            };

            const right = try self.parse_pipeline();

            const conditional = try self.allocate(AST.Conditional, .{ .op = op, .left = left, .right = right });
            left = .{ .cond = conditional };
        }

        return left;
    }

    /// Parses one expression, or a `|`-separated pipeline when a `.Pipe`
    /// follows. Fails with `ParseError.Expected` when an element cannot be a
    /// pipeline item.
    fn parse_pipeline(self: *Parser) !AST.Expr {
        var expr = try self.parse_subshell();

        if (self.peek() == .Pipe) {
            var pipeline_items = std.ArrayList(AST.PipelineItem).init(self.alloc);
            try pipeline_items.append(expr.asPipelineItem() orelse {
                try self.add_error_expected_pipeline_item(@as(AST.Expr.Tag, expr));
                return ParseError.Expected;
            });

            while (self.match(.Pipe)) {
                expr = try self.parse_subshell();
                try pipeline_items.append(expr.asPipelineItem() orelse {
                    try self.add_error_expected_pipeline_item(@as(AST.Expr.Tag, expr));
                    return ParseError.Expected;
                });
            }
            const pipeline = try self.allocate(AST.Pipeline, .{ .items = pipeline_items.items[0..] });
            return .{ .pipeline = pipeline };
        }

        return expr;
    }

    /// Placeholder for when we fully support subshells
    fn parse_subshell(self: *Parser) anyerror!AST.Expr {
        // if (self.peek() == .OpenParen) {
        //     _ = self.expect(.OpenParen);
        //     const script = try self.parse_impl(true);
        //     _ = self.expect(.CloseParen);
        //     return .{ .subshell = script };
        // }
        // return (try self.parse_cmd_or_assigns()).to_expr(self.alloc);
        return (try self.parse_cmd_or_assigns()).to_expr(self.alloc);
    }

    /// Parses leading assignments followed by an optional command (name,
    /// arguments, and at most one redirect). Returns `.assigns` when no
    /// command name follows the assignments.
    fn parse_cmd_or_assigns(self: *Parser) !AST.CmdOrAssigns {
        var assigns = std.ArrayList(AST.Assign).init(self.alloc);
        while (if (self.inside_subshell == null)
            !self.check_any_comptime(&.{ .Semicolon, .Newline, .Eof })
        else
            !self.check_any(&.{ .Semicolon, .Newline, .Eof, self.inside_subshell.?.closing_tok() }))
        {
            if (try self.parse_assign()) |assign| {
                try assigns.append(assign);
            } else {
                break;
            }
        }

        if (if (self.inside_subshell == null)
            self.match_any_comptime(&.{ .Semicolon, .Newline, .Eof })
        else
            self.match_any(&.{ .Semicolon, .Newline, .Eof, self.inside_subshell.?.closing_tok() }))
        {
            if (assigns.items.len == 0) {
                try self.add_error("expected a command or assignment", .{});
                return ParseError.Expected;
            }
            return .{ .assigns = assigns.items[0..] };
        }

        const name = try self.parse_atom() orelse {
            if (assigns.items.len == 0) {
                try self.add_error("expected a command or assignment but got: \"{s}\"", .{@tagName(self.peek())});
                return ParseError.Expected;
            }
            return .{ .assigns = assigns.items[0..] };
        };

        var name_and_args = std.ArrayList(AST.Atom).init(self.alloc);
        try name_and_args.append(name);
        while (try self.parse_atom()) |arg| {
            try name_and_args.append(arg);
        }

        // TODO Parse redirects (need to update lexer to have tokens for different parts e.g. &>>)
        const has_redirect = self.match(.Redirect);
        const redirect = if (has_redirect) self.prev().Redirect else AST.Cmd.RedirectFlags{};
        const redirect_file: ?AST.Cmd.Redirect = redirect_file: {
            if (has_redirect) {
                if (self.match(.JSObjRef)) {
                    const obj_ref = self.prev().JSObjRef;
                    break :redirect_file .{ .jsbuf = AST.JSBuf.new(obj_ref) };
                }

                const redirect_file = try self.parse_atom() orelse {
                    try self.add_error("Redirection with no file", .{});
                    return ParseError.Expected;
                };
                break :redirect_file .{ .atom = redirect_file };
            }
            break :redirect_file null;
        };
        // TODO check for multiple redirects and error

        return .{ .cmd = .{
            .assigns = assigns.items[0..],
            .name_and_args = name_and_args.items[0..],
            .redirect = redirect,
            .redirect_file = redirect_file,
        } };
    }

    /// Try to parse an assignment. If no assignment could be parsed then return
    /// null and backtrack the parser state
    fn parse_assign(self: *Parser) !?AST.Assign {
        switch (self.peek()) {
            .Text => |txtrng| {
                const start_idx = self.current;
                _ = self.expect(.Text);
                const txt = self.text(txtrng);
                const var_decl: ?AST.Assign = var_decl: {
                    if (hasEqSign(txt)) |eq_idx| {
                        // If it starts with = then it's not valid assignment (e.g. `=FOO`)
                        if (eq_idx == 0) break :var_decl null;
                        const label = txt[0..eq_idx];
                        if (!isValidVarName(label)) {
                            break :var_decl null;
                        }

                        // `LABEL=` with nothing after the `=` inside this text
                        // token: the value is empty, or it is the atom that follows.
                        if (eq_idx == txt.len - 1) {
                            if (self.peek() == .Delimit) {
                                _ = self.expect_delimit();
                                break :var_decl .{
                                    .label = label,
                                    .value = .{ .simple = .{ .Text = "" } },
                                };
                            }
                            const atom = try self.parse_atom() orelse {
                                try self.add_error("Expected an atom", .{});
                                return ParseError.Expected;
                            };
                            break :var_decl .{
                                .label = label,
                                .value = atom,
                            };
                        }

                        const txt_value = txt[eq_idx + 1 .. txt.len];
                        _ = self.expect_delimit();
                        break :var_decl .{
                            .label = label,
                            .value = .{ .simple = .{ .Text = txt_value } },
                        };
                    }
                    break :var_decl null;
                };

                if (var_decl) |vd| {
                    return vd;
                }

                // Rollback
                self.current = start_idx;
                return null;
            },
            else => return null,
        }
    }

    /// Parses one word. Returns null when no atom starts at the current
    /// token, a `.simple` atom for a single piece, and a `.compound` atom
    /// (with brace/glob hints) for several adjacent pieces.
    fn parse_atom(self: *Parser) !?AST.Atom {
        // Room for exactly one SimpleAtom on the stack; further appends fall
        // back to `self.alloc`.
        var array_alloc = std.heap.stackFallback(@sizeOf(AST.SimpleAtom), self.alloc);
        var atoms = try std.ArrayList(AST.SimpleAtom).initCapacity(array_alloc.get(), 1);
        var has_brace_open = false;
        var has_brace_close = false;
        var has_comma = false;
        var has_glob_syntax = false;
        {
            while (switch (self.peek()) {
                .Delimit => brk: {
                    _ = self.expect(.Delimit);
                    break :brk false;
                },
                .Eof, .Semicolon, .Newline => false,
                else => |t| brk: {
                    if (self.inside_subshell != null and self.inside_subshell.?.closing_tok() == t) break :brk false;
                    break :brk true;
                },
            }) {
                const next = self.peek_n(1);
                const next_delimits = self.delimits(next);
                const peeked = self.peek();
                const should_break = next_delimits;
                switch (peeked) {
                    .Asterisk => {
                        has_glob_syntax = true;
                        _ = self.expect(.Asterisk);
                        try atoms.append(.asterisk);
                        if (next_delimits) {
                            _ = self.match(.Delimit);
                            break;
                        }
                    },
                    .DoubleAsterisk => {
                        has_glob_syntax = true;
                        _ = self.expect(.DoubleAsterisk);
                        try atoms.append(.double_asterisk);
                        if (next_delimits) {
                            _ = self.match(.Delimit);
                            break;
                        }
                    },
                    .BraceBegin => {
                        has_brace_open = true;
                        _ = self.expect(.BraceBegin);
                        try atoms.append(.brace_begin);
                        // TODO in this case we know it can't possibly be the beginning of a brace expansion so maybe its faster to just change it to text here
                        if (next_delimits) {
                            _ = self.match(.Delimit);
                            if (should_break) break;
                        }
                    },
                    .BraceEnd => {
                        has_brace_close = true;
                        _ = self.expect(.BraceEnd);
                        try atoms.append(.brace_end);
                        if (next_delimits) {
                            _ = self.match(.Delimit);
                            break;
                        }
                    },
                    .Comma => {
                        has_comma = true;
                        _ = self.expect(.Comma);
                        try atoms.append(.comma);
                        if (next_delimits) {
                            _ = self.match(.Delimit);
                            if (should_break) break;
                        }
                    },
                    .CmdSubstBegin => {
                        _ = self.expect(.CmdSubstBegin);
                        const is_quoted = self.match(.CmdSubstQuoted);
                        var subparser = self.make_subparser(.cmd_subst);
                        const script = try subparser.parse_impl();
                        try atoms.append(.{ .cmd_subst = .{
                            .script = script,
                            .quoted = is_quoted,
                        } });
                        self.continue_from_subparser(&subparser);
                        if (self.delimits(self.peek())) {
                            _ = self.match(.Delimit);
                            if (should_break) break;
                        }
                    },
                    .Text => |txtrng| {
                        _ = self.expect(.Text);
                        const txt = self.text(txtrng);
                        try atoms.append(.{ .Text = txt });
                        if (next_delimits) {
                            _ = self.match(.Delimit);
                            if (should_break) break;
                        }
                    },
                    .Var => |txtrng| {
                        _ = self.expect(.Var);
                        const txt = self.text(txtrng);
                        try atoms.append(.{ .Var = txt });
                        if (next_delimits) {
                            _ = self.match(.Delimit);
                            if (should_break) break;
                        }
                    },
                    .OpenParen, .CloseParen => {
                        try self.add_error("Unexpected token: `{s}`", .{if (peeked == .OpenParen) "(" else ")"});
                        return null;
                    },
                    else => return null,
                }
            }
        }

        return switch (atoms.items.len) {
            0 => null,
            1 => {
                std.debug.assert(atoms.capacity == 1);
                return AST.Atom.new_simple(atoms.items[0]);
            },
            else => .{ .compound = .{
                .atoms = atoms.items[0..atoms.items.len],
                .brace_expansion_hint = has_brace_open and has_brace_close and has_comma,
                .glob_hint = has_glob_syntax,
            } },
        };
    }

    /// Creates a `T` with the parser's allocator and initialises it to `val`.
    fn allocate(self: *const Parser, comptime T: type, val: T) !*T {
        const heap = try self.alloc.create(T);
        heap.* = val;
        return heap;
    }

    /// Resolves a token's text range against the string pool.
    fn text(self: *const Parser, range: Token.TextRange) []const u8 {
        return self.strpool[range.start..range.end];
    }

    /// Moves to the next token unless already at the end, then returns the
    /// token before the (possibly unchanged) position.
    fn advance(self: *Parser) Token {
        if (!self.is_at_end()) {
            self.current += 1;
        }
        return self.prev();
    }

    /// True at `Eof`, or at the closing token of the enclosing subshell.
    fn is_at_end(self: *Parser) bool {
        return self.peek() == .Eof or (self.inside_subshell != null and self.inside_subshell.?.closing_tok() == self.peek());
    }

    /// Consumes the current token, which must have tag `toktag`.
    fn expect(self: *Parser, toktag: TokenTag) Token {
        std.debug.assert(toktag == @as(TokenTag, self.peek()));
        if (self.check(toktag)) {
            return self.advance();
        }
        unreachable;
    }

    /// Consumes the current token, which must have one of `toktags`.
    fn expect_any(self: *Parser, toktags: []const TokenTag) Token {
        // std.debug.assert(toktag == @as(TokenTag, self.peek()));
        const peeked = self.peek();
        for (toktags) |toktag| {
            if (toktag == @as(TokenTag, peeked)) return self.advance();
        }
        unreachable;
    }

    /// True when `tok` ends a word: `Delimit`, `;`, newline, EOF, or the
    /// closing token of the enclosing subshell. Accepts the same set of
    /// tokens as `expect_delimit`.
    fn delimits(self: *Parser, tok: Token) bool {
        return tok == .Delimit or tok == .Semicolon or tok == .Newline or tok == .Eof or (self.inside_subshell != null and tok == self.inside_subshell.?.closing_tok());
    }

    /// Consumes the current token, which must be a word delimiter (see `delimits`).
    fn expect_delimit(self: *Parser) Token {
        std.debug.assert(self.delimits(self.peek()));
        if (self.check(.Delimit) or self.check(.Semicolon) or self.check(.Newline) or self.check(.Eof) or (self.inside_subshell != null and self.check(self.inside_subshell.?.closing_tok()))) {
            return self.advance();
        }
        unreachable;
    }

    /// Consumes token if it matches
    fn match(self: *Parser, toktag: TokenTag) bool {
        if (@as(TokenTag, self.peek()) == toktag) {
            _ = self.advance();
            return true;
        }
        return false;
    }

    /// Like `match`, for a comptime-known list of tags.
    fn match_any_comptime(self: *Parser, comptime toktags: []const TokenTag) bool {
        const peeked = @as(TokenTag, self.peek());
        inline for (toktags) |tag| {
            if (peeked == tag) {
                _ = self.advance();
                return true;
            }
        }
        return false;
    }

    /// Like `match`, for a runtime list of tags.
    fn match_any(self: *Parser, toktags: []const TokenTag) bool {
        const peeked = @as(TokenTag, self.peek());
        for (toktags) |tag| {
            if (peeked == tag) {
                _ = self.advance();
                return true;
            }
        }
        return false;
    }

    /// Tests the current token against a comptime-known list without consuming it.
    fn check_any_comptime(self: *Parser, comptime toktags: []const TokenTag) bool {
        const peeked = @as(TokenTag, self.peek());
        inline for (toktags) |tag| {
            if (peeked == tag) {
                return true;
            }
        }
        return false;
    }

    /// Tests the current token against a runtime list without consuming it.
    fn check_any(self: *Parser, toktags: []const TokenTag) bool {
        const peeked = @as(TokenTag, self.peek());
        for (toktags) |tag| {
            if (peeked == tag) {
                return true;
            }
        }
        return false;
    }

    fn check(self: *Parser, toktag: TokenTag) bool {
        return @as(TokenTag, self.peek()) == @as(TokenTag, toktag);
    }

    /// The current token.
    fn peek(self: *Parser) Token {
        return self.tokens[self.current];
    }

    /// The token `n` positions ahead, clamped to the last token.
    fn peek_n(self: *Parser, n: u32) Token {
        if (self.current + n >= self.tokens.len) {
            return self.tokens[self.tokens.len - 1];
        }

        return self.tokens[self.current + n];
    }

    /// The token just before the current position.
    fn prev(self: *Parser) Token {
        return self.tokens[self.current - 1];
    }

    /// Concatenates all recorded error messages, without a separator, into a
    /// buffer allocated with the parser's allocator.
    pub fn combineErrors(self: *Parser) []const u8 {
        const errors = self.errors.items[0..];
        const str = str: {
            const size = size: {
                var i: usize = 0;
                for (errors) |e| {
                    i += e.msg.len;
                }
                break :size i;
            };
            var buf = self.alloc.alloc(u8, size) catch bun.outOfMemory();
            var i: usize = 0;
            for (errors) |e| {
                @memcpy(buf[i .. i + e.msg.len], e.msg);
                i += e.msg.len;
            }
            break :str buf;
        };
        return str;
    }

    /// Formats a message with the parser's allocator and records it in `errors`.
    fn add_error(self: *Parser, comptime fmt: []const u8, args: anytype) !void {
        const error_msg = try std.fmt.allocPrint(self.alloc, fmt, args);
        try self.errors.append(.{ .msg = error_msg });
    }

    fn add_error_expected_pipeline_item(self: *Parser, kind: AST.Expr.Tag) !void {
        const error_msg = try std.fmt.allocPrint(self.alloc, "Expected a command, assignment, or subshell but got: {s}", .{@tagName(kind)});
        try self.errors.append(.{ .msg = error_msg });
    }
};

/// Tag type of `Token`.
pub const TokenTag = enum {
    Pipe,
    DoublePipe,
    Ampersand,
    DoubleAmpersand,
    Redirect,
    Dollar,
    Asterisk,
    DoubleAsterisk,
    Eq,
    Semicolon,
    Newline,
    // Comment,
    BraceBegin,
    Comma,
    BraceEnd,
    CmdSubstBegin,
    CmdSubstQuoted,
    CmdSubstEnd,
    OpenParen,
    CloseParen,
    Var,
    Text,
    JSObjRef,
    Delimit,
    Eof,
};

pub const Token = union(TokenTag) {
    /// |
    Pipe,
    /// ||
    DoublePipe,
    /// &
    Ampersand,
    /// &&
    DoubleAmpersand,

    Redirect: AST.Cmd.RedirectFlags,

    /// $
    Dollar,
    // `*`
    Asterisk,
    DoubleAsterisk,
    /// =
    Eq,
    /// ;
    Semicolon,
    /// \n (unescaped newline)
    Newline,

    BraceBegin,
    Comma,
    BraceEnd,
    CmdSubstBegin,
    /// When cmd subst is wrapped in quotes, then it should be interpreted as literal string, not word split-ed arguments to a cmd.
    /// We lose quotation context in the AST, so we don't know how to disambiguate that.
    /// So this is a quick hack to give the AST that context.
    ///
    /// This matches this shell behaviour:
    /// echo test$(echo "1 2") -> test1 2\n
    /// echo "test$(echo "1 2")" -> test1 2\n
    CmdSubstQuoted,
    CmdSubstEnd,
    OpenParen,
    CloseParen,

    Var: TextRange,
    Text: TextRange,
    JSObjRef: u32,

    Delimit,
    Eof,

    /// Half-open byte range `[start, end)` into the string pool.
    pub const TextRange = struct {
        start: u32,
        end: u32,
    };

    /// Returns a printable form of the token. Operators are shown in
    /// backticks; `Var` and `Text` return their slice of `strpool`.
    pub fn asHumanReadable(self: Token, strpool: []const u8) []const u8 {
        return switch (self) {
            .Pipe => "`|`",
            .DoublePipe => "`||`",
            .Ampersand => "`&`",
            .DoubleAmpersand => "`&&`",
            .Redirect => "`>`",
            .Dollar => "`$`",
            .Asterisk => "`*`",
            .DoubleAsterisk => "`**`",
            .Eq => "`=`",
            .Semicolon => "`;`",
            .Newline => "`\\n`",
            // Comment,
            .BraceBegin => "`{`",
            .Comma => "`,`",
            .BraceEnd => "`}`",
            .CmdSubstBegin => "`$(`",
            .CmdSubstQuoted => "CmdSubstQuoted",
            .CmdSubstEnd => "`)`",
            .OpenParen => "`(`",
            .CloseParen => "`)`",
            .Var => strpool[self.Var.start..self.Var.end],
            .Text => strpool[self.Text.start..self.Text.end],
            .JSObjRef => "JSObjRef",
            .Delimit => "Delimit",
            .Eof => "EOF",
        };
    }

    /// Prints the token with `std.debug.print`; `buf` is the string pool
    /// used to resolve `Var` / `Text` ranges.
    pub fn debug(self: Token, buf: []const u8) void {
        switch (self) {
            .Var => |txt| {
                std.debug.print("(var) {s}\n", .{buf[txt.start..txt.end]});
            },
            .Text => |txt| {
                std.debug.print("(txt) {s}\n", .{buf[txt.start..txt.end]});
            },
            else => {
                std.debug.print("{s}\n", .{@tagName(self)});
            },
        }
    }
};

pub const LexerAscii = NewLexer(.ascii);
pub const LexerUnicode = NewLexer(.wtf8);

/// Output of a lexer run: tokens, the string pool they index into, and any
/// lex errors.
pub const LexResult = struct {
    errors: []LexError,
    tokens: []const Token,
    strpool: []const u8,

    /// Concatenates all lex error messages, without a separator, into a
    /// buffer allocated from `arena`.
    pub fn combineErrors(this: *const LexResult, arena: Allocator) []const u8 {
        const errors = this.errors;
        const str = str: {
            const size = size: {
                var i: usize = 0;
                for (errors) |e| {
                    i += e.msg.len;
                }
                break :size i;
            };
            var buf = arena.alloc(u8, size) catch bun.outOfMemory();
            var i: usize = 0;
            for (errors) |e| {
                @memcpy(buf[i .. i + e.msg.len], e.msg);
                i += e.msg.len;
            }
            break :str buf;
        };
        return str;
    }
};

pub const LexError = struct {
    /// Allocated with lexer arena
    msg: []const u8,
};

pub const LEX_JS_OBJREF_PREFIX = "~__bun_";
pub const LEX_JS_STRING_PREFIX = "~__bunstr_";

pub fn NewLexer(comptime encoding: StringEncoding) type {
    const Chars = ShellCharIter(encoding);
    return struct {
        chars: Chars,

        /// Tell us the beginning of a "word", indexes into the string pool (`buf`)
        /// Anytime a word is added, this needs to be updated
        word_start: u32 = 0,

        /// Keeps track of the end of a "word", indexes into the string pool (`buf`),
        /// anytime characters are added to the string pool this needs to be updated
        j: u32 = 0,

        strpool: ArrayList(u8),
        tokens: ArrayList(Token),
        delimit_quote: bool = false,
        in_subshell: ?SubShellKind = null,
        errors: std.ArrayList(LexError),

        /// Contains a list of strings we need to escape
        /// Not owned by this struct
        string_refs: []bun.String,

        const SubShellKind = enum {
            /// (echo hi; echo hello)
            normal,
            /// `echo hi; echo hello`
            backtick,
            /// $(echo hi; echo hello)
            dollar,
        };

        const LexerError = error{
            OutOfMemory,
            Utf8CannotEncodeSurrogateHalf,
            Utf8InvalidStartByte,
            CodepointTooLarge,
        };

        pub const js_objref_prefix = "$__bun_";

        const State = Chars.State;
        const InputChar = Chars.InputChar;
        const BacktrackSnapshot = struct {
            chars: Chars,
            j: u32,
            word_start: u32,
            delimit_quote: bool,
        };

        pub fn new(alloc: Allocator, src: []const u8, strings_to_escape: []bun.String) @This() {
            return .{
                .chars = Chars.init(src),
                .tokens = ArrayList(Token).init(alloc),
                .strpool = ArrayList(u8).init(alloc),
                .errors = ArrayList(LexError).init(alloc),
                .string_refs = strings_to_escape,
            };
        }

        pub fn get_result(self: @This()) LexResult {
            return .{
                .tokens = self.tokens.items[0..],
                .strpool = self.strpool.items[0..],
                .errors = self.errors.items[0..],
            };
        }

        pub fn add_error(self: *@This(), msg: []const u8) void {
            const start =
self.strpool.items.len; self.strpool.appendSlice(msg) catch bun.outOfMemory(); const end = self.strpool.items.len; self.errors.append(.{ .msg = self.strpool.items[start..end] }) catch bun.outOfMemory(); } fn make_sublexer(self: *@This(), kind: SubShellKind) @This() { log("[lex] make sublexer", .{}); var sublexer = .{ .chars = self.chars, .strpool = self.strpool, .tokens = self.tokens, .errors = self.errors, .in_subshell = kind, .word_start = self.word_start, .j = self.j, .string_refs = self.string_refs, }; sublexer.chars.state = .Normal; return sublexer; } fn continue_from_sublexer(self: *@This(), sublexer: *@This()) void { log("[lex] drop sublexer", .{}); self.strpool = sublexer.strpool; self.tokens = sublexer.tokens; self.errors = sublexer.errors; self.chars = sublexer.chars; self.word_start = sublexer.word_start; self.j = sublexer.j; self.delimit_quote = sublexer.delimit_quote; } fn make_snapshot(self: *@This()) BacktrackSnapshot { return .{ .chars = self.chars, .j = self.j, .word_start = self.word_start, .delimit_quote = self.delimit_quote, }; } fn backtrack(self: *@This(), snap: BacktrackSnapshot) void { self.chars = snap.chars; self.j = snap.j; self.word_start = snap.word_start; self.delimit_quote = snap.delimit_quote; } fn last_tok_tag(self: *@This()) ?TokenTag { if (self.tokens.items.len == 0) return null; return @as(TokenTag, self.tokens.items[self.tokens.items.len - 1]); } pub fn lex(self: *@This()) LexerError!void { while (true) { const input = self.eat() orelse { try self.break_word(true); break; }; const char = input.char; const escaped = input.escaped; // Special token to denote substituted JS variables if (char == '~') { if (self.looksLikeJSStringRef()) { if (self.eatJSStringRef()) |bunstr| { try self.break_word(false); try self.handleJSStringRef(bunstr); continue; } } else if (self.looksLikeJSObjRef()) { if (self.eatJSObjRef()) |tok| { if (self.chars.state == .Double) { self.add_error("JS object reference not allowed in double quotes"); return; } 
try self.break_word(false);
                            try self.tokens.append(tok);
                            continue;
                        }
                    }
                }
                // Handle non-escaped chars:
                // 1. special syntax (operators, etc.)
                // 2. lexing state switchers (quotes)
                // 3. word breakers (spaces, etc.)
                //
                // `break :escaped` leaves this block so the character is
                // appended to the string pool as ordinary text; `continue`
                // means the character was fully handled here.
                else if (!escaped) escaped: {
                    switch (char) {
                        '#' => {
                            if (self.chars.state == .Single or self.chars.state == .Double) break :escaped;
                            // Only starts a comment at the beginning of input or after whitespace.
                            const whitespace_preceding =
                                if (self.chars.prev) |prev|
                                Chars.isWhitespace(prev)
                            else
                                true;
                            if (!whitespace_preceding) break :escaped;
                            try self.break_word(true);
                            self.eatComment();
                            continue;
                        },
                        ';' => {
                            if (self.chars.state == .Single or self.chars.state == .Double) break :escaped;
                            try self.break_word(true);
                            try self.tokens.append(.Semicolon);
                            continue;
                        },
                        '\n' => {
                            if (self.chars.state == .Single or self.chars.state == .Double) break :escaped;
                            try self.break_word(true);
                            try self.tokens.append(.Newline);
                            continue;
                        },

                        // glob asterisks
                        '*' => {
                            if (self.chars.state == .Single or self.chars.state == .Double) break :escaped;
                            if (self.peek()) |next| {
                                if (!next.escaped and next.char == '*') {
                                    _ = self.eat();
                                    try self.break_word(false);
                                    try self.tokens.append(.DoubleAsterisk);
                                    continue;
                                }
                            }
                            try self.break_word(false);
                            try self.tokens.append(.Asterisk);
                            continue;
                        },

                        // brace expansion syntax
                        '{' => {
                            if (self.chars.state == .Single or self.chars.state == .Double) break :escaped;
                            try self.break_word(false);
                            try self.tokens.append(.BraceBegin);
                            continue;
                        },
                        ',' => {
                            if (self.chars.state == .Single or self.chars.state == .Double) break :escaped;
                            try self.break_word(false);
                            try self.tokens.append(.Comma);
                            continue;
                        },
                        '}' => {
                            if (self.chars.state == .Single or self.chars.state == .Double) break :escaped;
                            try self.break_word(false);
                            try self.tokens.append(.BraceEnd);
                            continue;
                        },

                        // Command substitution
                        '`' => {
                            if (self.chars.state == .Single) break :escaped;
                            if (self.in_subshell == .backtick) {
                                // Closing backtick of the substitution this (sub)lexer was created for.
                                try self.break_word(true);
                                if (self.last_tok_tag()) |toktag| {
                                    if (toktag != .Delimit) try self.tokens.append(.Delimit);
                                }
                                try self.tokens.append(.CmdSubstEnd);
                                return;
                            } else {
                                try self.eat_subshell(.backtick);
                            }
                        },
                        // Command substitution/vars
                        '$' => {
                            if (self.chars.state == .Single) break :escaped;

                            const peeked = self.peek() orelse InputChar{ .char = 0 };
                            if (!peeked.escaped and peeked.char == '(') {
                                try self.break_word(false);
                                try self.eat_subshell(.dollar);
                                continue;
                            }

                            // const snapshot = self.make_snapshot();
                            // Handle variable
                            try self.break_word(false);
                            const var_tok = try self.eat_var();
                            // empty var
                            if (var_tok.start == var_tok.end) {
                                // No name followed the `$`: keep it as literal text.
                                try self.appendCharToStrPool('$');
                                try self.break_word(false);
                            } else {
                                try self.tokens.append(.{ .Var = var_tok });
                            }
                            self.word_start = self.j;
                            continue;
                        },
                        '(' => {
                            if (self.chars.state == .Single or self.chars.state == .Double) break :escaped;
                            try self.break_word(true);
                            try self.eat_subshell(.normal);
                            continue;
                        },
                        ')' => {
                            if (self.chars.state == .Single or self.chars.state == .Double) break :escaped;
                            if (self.in_subshell != .dollar and self.in_subshell != .normal) {
                                self.add_error("Unexpected ')'");
                                continue;
                            }

                            try self.break_word(true);
                            if (self.last_tok_tag()) |toktag| {
                                if (toktag != .Delimit) try self.tokens.append(.Delimit);
                            }
                            if (self.in_subshell == .dollar) {
                                try self.tokens.append(.CmdSubstEnd);
                            } else if (self.in_subshell == .normal) {
                                try self.tokens.append(.CloseParen);
                            }
                            return;
                        },

                        '0'...'9' => {
                            if (self.chars.state != .Normal) break :escaped;
                            // Try to read `<fd>>` / `<fd><`; if that fails, rewind and treat the digit as text.
                            const snapshot = self.make_snapshot();
                            if (self.eat_redirect(input)) |redirect| {
                                try self.break_word(true);
                                try self.tokens.append(.{ .Redirect = redirect });
                                continue;
                            }
                            self.backtrack(snapshot);
                            break :escaped;
                        },

                        // Operators
                        '|' => {
                            if (self.chars.state == .Single or self.chars.state == .Double) break :escaped;
                            try self.break_word(true);

                            const next = self.peek() orelse {
                                self.add_error("Unexpected EOF");
                                return;
                            };
                            if (!next.escaped and next.char == '&') {
                                self.add_error("Piping stdout and stderr (`|&`) is not supported yet. Please file an issue on GitHub.");
                                return;
                            }
                            if (next.escaped or next.char != '|') {
                                try self.tokens.append(.Pipe);
                            } else if (next.char == '|') {
                                _ = self.eat() orelse unreachable;
                                try self.tokens.append(.DoublePipe);
                            }
                            continue;
                        },
                        '>' => {
                            if (self.chars.state == .Single or self.chars.state == .Double) break :escaped;
                            try self.break_word_impl(true, false, true);
                            const redirect = self.eat_simple_redirect(.out);
                            try self.tokens.append(.{ .Redirect = redirect });
                            continue;
                        },
                        '<' => {
                            if (self.chars.state == .Single or self.chars.state == .Double) break :escaped;
                            try self.break_word_impl(true, false, true);
                            const redirect = self.eat_simple_redirect(.in);
                            try self.tokens.append(.{ .Redirect = redirect });
                            continue;
                        },
                        '&' => {
                            if (self.chars.state == .Single or self.chars.state == .Double) break :escaped;
                            try self.break_word(true);

                            const next = self.peek() orelse {
                                self.add_error("Unexpected EOF");
                                return;
                            };

                            if (next.char == '>' and !next.escaped) {
                                // `&>` or `&>>`
                                _ = self.eat();
                                const inner = if (self.eat_simple_redirect_operator(.out))
                                    AST.Cmd.RedirectFlags.@"&>>"()
                                else
                                    AST.Cmd.RedirectFlags.@"&>"();
                                try self.tokens.append(.{ .Redirect = inner });
                            } else if (next.escaped or next.char != '&') {
                                try self.tokens.append(.Ampersand);
                            } else if (next.char == '&') {
                                _ = self.eat() orelse unreachable;
                                try self.tokens.append(.DoubleAmpersand);
                            } else continue;
                        },

                        // 2. State switchers
                        '\'' => {
                            if (self.chars.state == .Single) {
                                self.chars.state = .Normal;
                                continue;
                            }
                            if (self.chars.state == .Normal) {
                                self.chars.state = .Single;
                                continue;
                            }
                            // Inside double quotes a single quote is plain text.
                            break :escaped;
                        },
                        '"' => {
                            if (self.chars.state == .Single) break :escaped;
                            if (self.chars.state == .Normal) {
                                try self.break_word(false);
                                self.chars.state = .Double;
                            } else if (self.chars.state == .Double) {
                                try self.break_word(false);
                                // self.delimit_quote = true;
                                self.chars.state = .Normal;
                            }
                            continue;
                        },

                        // 3. Word breakers
                        ' ' => {
                            if (self.chars.state == .Normal) {
                                try self.break_word_impl(true, true, false);
                                continue;
                            }
                            break :escaped;
                        },

                        else => break :escaped,
                    }
                    // Reached by switch arms that neither `continue` nor `break :escaped`.
                    continue;
                }

                // Ordinary (or escaped) character: becomes part of the current word.
                try self.appendCharToStrPool(char);
            }

            // End of input while still inside a subshell means it was never closed.
            if (self.in_subshell) |subshell_kind| {
                switch (subshell_kind) {
                    .dollar, .backtick => self.add_error("Unclosed command substitution"),
                    .normal => self.add_error("Unclosed subshell"),
                }
                return;
            }

            try self.tokens.append(.Eof);
        }

        /// Appends one character to the string pool and advances `j` by the
        /// number of bytes written.
        fn appendCharToStrPool(self: *@This(), char: Chars.CodepointType) !void {
            if (comptime encoding == .ascii) {
                try self.strpool.append(char);
                self.j += 1;
            } else {
                if (char <= 0x7F) {
                    try self.strpool.append(@intCast(char));
                    self.j += 1;
                    return;
                } else {
                    try self.appendUnicodeCharToStrPool(char);
                }
            }
        }

        /// Non-ASCII path of `appendCharToStrPool`: encodes `char` with
        /// `bun.strings.encodeWTF8Rune` and appends the resulting bytes.
        fn appendUnicodeCharToStrPool(self: *@This(), char: Chars.CodepointType) !void {
            @setCold(true);

            const ichar: i32 = @intCast(char);
            var bytes: [4]u8 = undefined;
            const n = bun.strings.encodeWTF8Rune(&bytes, ichar);
            self.j += n;
            try self.strpool.appendSlice(bytes[0..n]);
        }

        /// `break_word_impl` with both context flags set to false.
        fn break_word(self: *@This(), add_delimiter: bool) !void {
            return try self.break_word_impl(add_delimiter, false, false);
        }

        /// Ends the current word.
        ///
        /// If any bytes were collected since `word_start`, a `Text` token for
        /// `[word_start, j)` is appended, followed by `Delimit` when
        /// `add_delimiter` is set. If the word is empty but we are at a plain
        /// space or a redirect operator and the previous token is one of
        /// `Var`, `Text`, `BraceBegin`, `Comma`, `BraceEnd` or `CmdSubstEnd`,
        /// only a `Delimit` is appended. In both cases `word_start` is then
        /// moved to `j`.
        fn break_word_impl(self: *@This(), add_delimiter: bool, in_normal_space: bool, in_redirect_operator: bool) !void {
            const start: u32 = self.word_start;
            const end: u32 = self.j;
            if (start != end) {
                try self.tokens.append(.{ .Text = .{ .start = start, .end = end } });
                if (add_delimiter) {
                    try self.tokens.append(.Delimit);
                }
            } else if ((in_normal_space or in_redirect_operator) and self.tokens.items.len > 0 and
                switch (self.tokens.items[self.tokens.items.len - 1]) {
                .Var, .Text, .BraceBegin, .Comma, .BraceEnd, .CmdSubstEnd => true,
                else => false,
            }) {
                try self.tokens.append(.Delimit);
                self.delimit_quote = false;
            }
            self.word_start = self.j;
        }

        const RedirectDirection = enum { out, in };

        /// Called after a `>` or `<` has been eaten: consumes an optional
        /// second, identical character and returns the matching redirect flags.
        fn eat_simple_redirect(self: *@This(), dir: RedirectDirection) AST.Cmd.RedirectFlags {
            const is_double = self.eat_simple_redirect_operator(dir);

            if (is_double) {
                return
switch (dir) {
                    .out => AST.Cmd.RedirectFlags.@">>"(),
                    .in => AST.Cmd.RedirectFlags.@"<<"(),
                };
            }

            return switch (dir) {
                .out => AST.Cmd.RedirectFlags.@">"(),
                .in => AST.Cmd.RedirectFlags.@"<"(),
            };
        }

        /// Returns true (and consumes the character) if the next character
        /// doubles the operator for `dir`: `>` after `>` or `<` after `<`.
        /// Returns false without consuming anything otherwise, including for
        /// the mixed forms `<>` and `><` and for an escaped character.
        fn eat_simple_redirect_operator(self: *@This(), dir: RedirectDirection) bool {
            if (self.peek()) |peeked| {
                if (peeked.escaped) return false;
                switch (peeked.char) {
                    '>' => {
                        if (dir == .out) {
                            _ = self.eat();
                            return true;
                        }
                        return false;
                    },
                    '<' => {
                        if (dir == .in) {
                            _ = self.eat();
                            return true;
                        }
                        return false;
                    },
                    else => return false,
                }
            }
            return false;
        }

        /// Tries to read a redirect that starts with a file descriptor number
        /// (`0`, `1`, `2`) or with `&`, followed by `>`/`<` and an optional
        /// doubled operator. `first` is the already-eaten first character.
        ///
        /// Returns null when the input is not such a redirect. Characters may
        /// already have been consumed at that point, so callers must take a
        /// snapshot beforehand and backtrack on null.
        fn eat_redirect(self: *@This(), first: InputChar) ?AST.Cmd.RedirectFlags {
            var flags: AST.Cmd.RedirectFlags = .{};
            switch (first.char) {
                '0'...'9' => {
                    // Codepoint int casts are safe here because the digits are in the ASCII range
                    var count: usize = 1;
                    var buf: [32]u8 = [_]u8{@intCast(first.char)} ** 32;

                    while (self.peek()) |peeked| {
                        const char = peeked.char;
                        switch (char) {
                            '0'...'9' => {
                                // More digits than the buffer holds cannot be a
                                // file descriptor; bail out instead of writing
                                // past the end of `buf`.
                                if (count >= buf.len) return null;
                                _ = self.eat();
                                buf[count] = @intCast(char);
                                count += 1;
                                continue;
                            },
                            else => break,
                        }
                    }

                    const num = std.fmt.parseInt(usize, buf[0..count], 10) catch {
                        // This means the number was really large, meaning it
                        // probably was supposed to be a string
                        return null;
                    };

                    switch (num) {
                        0 => {
                            flags.stdin = true;
                        },
                        1 => {
                            flags.stdout = true;
                        },
                        2 => {
                            flags.stderr = true;
                        },
                        else => {
                            // FIXME support redirection to any arbitrary fd
                            log("redirection to fd {d} is invalid\n", .{num});
                            return null;
                        },
                    }
                },
                '&' => {
                    if (first.escaped) return null;
                    flags.stdout = true;
                    flags.stderr = true;
                    _ = self.eat();
                },
                else => return null,
            }

            var dir: RedirectDirection = .out;
            if (self.peek()) |input| {
                if (input.escaped) return null;
                switch (input.char) {
                    '>' => dir = .out,
                    '<' => dir = .in,
                    else => return null,
                }
                _ = self.eat();
            } else return null;

            const is_double = self.eat_simple_redirect_operator(dir);
            if (is_double) {
                flags.append = true;
            }

            return flags;
        }

        /// Assumes the first character of the literal has been eaten
        /// Backtracks and returns false if unsuccessful
        fn eat_literal(self: *@This(), comptime CodepointType: type, comptime literal: []const CodepointType) bool {
            const literal_skip_first = literal[1..];
            const snapshot = self.make_snapshot();
            const slice = self.eat_slice(CodepointType, literal_skip_first.len) orelse {
                self.backtrack(snapshot);
                return false;
            };

            if (std.mem.eql(CodepointType, &slice, literal_skip_first))
                return true;

            self.backtrack(snapshot);
            return false;
        }

        /// Reads a run of decimal digits and parses it as `usize`.
        ///
        /// Backtracks and returns null when no digit was read, when the run
        /// is longer than the 32-byte scratch buffer, or when parsing fails.
        /// Note that the loop uses `eat`, so on success the first non-digit
        /// character after the number has been consumed as well.
        fn eat_number_word(self: *@This()) ?usize {
            const snap = self.make_snapshot();
            var count: usize = 0;
            var buf: [32]u8 = [_]u8{0} ** 32;

            while (self.eat()) |result| {
                const char = result.char;
                switch (char) {
                    '0'...'9' => {
                        // Too many digits for the buffer (and for a usize).
                        if (count >= buf.len) {
                            self.backtrack(snap);
                            return null;
                        }
                        // Safe to cast here because 0-9 is in ASCII range
                        buf[count] = @intCast(char);
                        count += 1;
                        continue;
                    },
                    else => {
                        break;
                    },
                }
            }

            if (count == 0) {
                self.backtrack(snap);
                return null;
            }

            const num = std.fmt.parseInt(usize, buf[0..count], 10) catch {
                self.backtrack(snap);
                return null;
            };

            return num;
        }

        /// Lexes a nested subshell or command substitution of the given kind.
        ///
        /// Emits the opening token (`CmdSubstBegin`, plus `CmdSubstQuoted`
        /// when inside double quotes, or `OpenParen`), runs a sublexer until
        /// it returns, then resumes from where the sublexer stopped.
        fn eat_subshell(self: *@This(), kind: SubShellKind) !void {
            if (kind == .dollar) {
                // Eat the open paren
                _ = self.eat();
            }

            switch (kind) {
                .dollar, .backtick => {
                    try self.tokens.append(.CmdSubstBegin);
                    if (self.chars.state == .Double) {
                        try self.tokens.append(.CmdSubstQuoted);
                    }
                },
                .normal => try self.tokens.append(.OpenParen),
            }

            // The sublexer always starts in `.Normal`, and taking over its
            // cursor also takes over its quote state. Restore ours afterwards
            // so text following a substitution inside double quotes is still
            // lexed as quoted.
            const prev_quote_state = self.chars.state;
            var sublexer = self.make_sublexer(kind);
            try sublexer.lex();
            self.continue_from_sublexer(&sublexer);
            self.chars.state = prev_quote_state;
        }

        /// Appends the contents of `bunstr` to the string pool (converting
        /// UTF-16 to UTF-8 when needed) and advances `j` by the number of
        /// bytes added.
        fn appendStringToStrPool(self: *@This(), bunstr: bun.String) !void {
            const start = self.strpool.items.len;
            if (bunstr.is8Bit() or bunstr.isUTF8()) {
                try self.strpool.appendSlice(bunstr.byteSlice());
            } else {
                const utf16 = bunstr.utf16();
                const additional = bun.simdutf.simdutf__utf8_length_from_utf16le(utf16.ptr, utf16.len);
                try self.strpool.ensureUnusedCapacity(additional);
                try bun.strings.convertUTF16ToUTF8Append(&self.strpool, bunstr.utf16());
            }
            const end = self.strpool.items.len;
            self.j +=
@intCast(end - start);
        }

        /// Appends the referenced JS string to the string pool as part of the
        /// current word.
        fn handleJSStringRef(self: *@This(), bunstr: bun.String) !void {
            try self.appendStringToStrPool(bunstr);
        }

        /// True if the bytes at the cursor start with `LEX_JS_OBJREF_PREFIX`
        /// minus its first character (which the caller has already eaten)
        /// and at least one more byte follows.
        fn looksLikeJSObjRef(self: *@This()) bool {
            const bytes = self.chars.srcBytesAtCursor();
            if (LEX_JS_OBJREF_PREFIX.len - 1 >= bytes.len) return false;
            return std.mem.eql(u8, bytes[0 .. LEX_JS_OBJREF_PREFIX.len - 1], LEX_JS_OBJREF_PREFIX[1..]);
        }

        /// Same as `looksLikeJSObjRef`, for `LEX_JS_STRING_PREFIX`.
        fn looksLikeJSStringRef(self: *@This()) bool {
            const bytes = self.chars.srcBytesAtCursor();
            if (LEX_JS_STRING_PREFIX.len - 1 >= bytes.len) return false;
            return std.mem.eql(u8, bytes[0 .. LEX_JS_STRING_PREFIX.len - 1], LEX_JS_STRING_PREFIX[1..]);
        }

        /// Parses `<literal minus first char><decimal index>` at the cursor.
        ///
        /// On success the cursor is moved past the digits and the index is
        /// returned. Returns null if the prefix does not match, or (after
        /// recording an error) if there are no digits, too many digits, the
        /// number does not parse, or `validate` rejects it. `name` is only
        /// used in error messages.
        fn eatJSSubstitutionIdx(self: *@This(), comptime literal: []const u8, comptime name: []const u8, comptime validate: *const fn (*@This(), usize) bool) ?usize {
            const bytes = self.chars.srcBytesAtCursor();
            if (literal.len - 1 >= bytes.len) return null;
            if (std.mem.eql(u8, bytes[0 .. literal.len - 1], literal[1..])) {
                var i: usize = 0;
                var digit_buf: [32]u8 = undefined;
                var digit_buf_count: u8 = 0;

                i += literal.len - 1;

                while (i < bytes.len) : (i += 1) {
                    switch (bytes[i]) {
                        '0'...'9' => {
                            if (digit_buf_count >= digit_buf.len) {
                                // `ERROR_STR` already ends in ": ", so the digits
                                // follow it directly. `error_buf` is sized for the
                                // prefix, every buffered digit and the overflowing
                                // digit, which is exactly what is printed.
                                const ERROR_STR = "Invalid " ++ name ++ " (number too high): ";
                                var error_buf: [ERROR_STR.len + digit_buf.len + 1]u8 = undefined;
                                const error_msg = std.fmt.bufPrint(error_buf[0..], "{s}{s}{c}", .{ ERROR_STR, digit_buf[0..digit_buf_count], bytes[i] }) catch @panic("Should not happen");
                                self.add_error(error_msg);
                                return null;
                            }
                            digit_buf[digit_buf_count] = bytes[i];
                            digit_buf_count += 1;
                        },
                        else => break,
                    }
                }

                if (digit_buf_count == 0) {
                    self.add_error("Invalid " ++ name ++ " (no idx)");
                    return null;
                }

                const idx = std.fmt.parseInt(usize, digit_buf[0..digit_buf_count], 10) catch {
                    self.add_error("Invalid " ++ name ++ " ref ");
                    return null;
                };

                if (!validate(self, idx)) return null;

                // Bump the cursor
                brk: {
                    const new_idx = self.chars.cursorPos() + i;
                    // Make `prev` / `current` look as if the digits had been eaten one by one.
                    const prev_ascii_char: ?u7 = if (digit_buf_count == 1) null else @truncate(digit_buf[digit_buf_count - 2]);
                    const cur_ascii_char: u7 = @truncate(digit_buf[digit_buf_count - 1]);
                    if (comptime encoding == .ascii) {
                        self.chars.src.i = new_idx;
                        if (prev_ascii_char) |pc| self.chars.prev = .{ .char = pc };
                        self.chars.current = .{ .char = cur_ascii_char };
                        break :brk;
                    }
                    self.chars.src.cursor = CodepointIterator.Cursor{
                        .i = @intCast(new_idx),
                        .c = cur_ascii_char,
                        .width = 1,
                    };
                    self.chars.src.next_cursor = self.chars.src.cursor;
                    SrcUnicode.nextCursor(&self.chars.src.iter, &self.chars.src.next_cursor);
                    if (prev_ascii_char) |pc| self.chars.prev = .{ .char = pc };
                    self.chars.current = .{ .char = cur_ascii_char };
                }

                return idx;
            }
            return null;
        }

        /// __NOTE__: Do not store references to the returned bun.String, it does not have its ref count incremented
        fn eatJSStringRef(self: *@This()) ?bun.String {
            if (self.eatJSSubstitutionIdx(
                LEX_JS_STRING_PREFIX,
                "JS string ref",
                validateJSStringRefIdx,
            )) |idx| {
                return self.string_refs[idx];
            }
            return null;
        }

        /// Accepts `idx` only if it indexes into `string_refs`; records an
        /// error otherwise.
        fn validateJSStringRefIdx(self: *@This(), idx: usize) bool {
            if (idx >= self.string_refs.len) {
                self.add_error("Invalid JS string ref (out of bounds)");
                return false;
            }
            return true;
        }

        /// Reads a JS object reference at the cursor and returns it as a
        /// `JSObjRef` token, or null if there is none / it is invalid.
        fn eatJSObjRef(self: *@This()) ?Token {
            if (self.eatJSSubstitutionIdx(
                LEX_JS_OBJREF_PREFIX,
                "JS object ref",
                validateJSObjRefIdx,
            )) |idx| {
                return .{ .JSObjRef = @intCast(idx) };
            }
            return null;
        }

        /// Accepts `idx` only if it is below `maxInt(u32)`, so the cast in
        /// `eatJSObjRef` cannot overflow; records an error otherwise.
        fn validateJSObjRefIdx(self: *@This(), idx: usize) bool {
            if (idx >= std.math.maxInt(u32)) {
                self.add_error("Invalid JS object ref (out of bounds)");
                return false;
            }
            return true;
        }

        /// Reads a variable name following `$` into the string pool and
        /// returns its range. The range is empty (`start == end`) when the
        /// next character cannot start a name.
        fn eat_var(self: *@This()) !Token.TextRange {
            const start = self.j;
            var i: usize = 0;
            // Eat until special character
            while (self.peek()) |result| {
                defer i += 1;
                const char = result.char;
                const escaped = result.escaped;

                // The first character must be a letter or underscore.
                if (i == 0) {
                    switch (char) {
                        '=', '0'...'9' => return .{ .start = start, .end = self.j },
                        'a'...'z', 'A'...'Z', '_' => {},
                        else => return .{ .start = start, .end = self.j },
                    }
                }

                switch (char) {
                    '{', '}', ';', '\'', '\"', ' ', '|', '&', '>', ',', '$' => {
                        return .{ .start = start, .end = self.j };
                    },
                    else => {
                        // An unescaped terminator of the enclosing subshell ends
                        // the name. (An escaped one also ends it, via the `else`
                        // arm below, because it is not a valid name character.)
                        const closes_subshell =
                            (self.in_subshell == .dollar and char == ')') or
                            (self.in_subshell == .backtick and char == '`') or
                            (self.in_subshell == .normal and char == ')');
                        if (!escaped and closes_subshell) {
                            return .{ .start = start, .end = self.j };
                        }
                        switch (char) {
                            '0'...'9', 'a'...'z', 'A'...'Z', '_' => {
                                _ = self.eat() orelse unreachable;
                                try self.appendCharToStrPool(char);
                            },
                            else => return .{ .start = start, .end = self.j },
                        }
                    },
                }
            }
            return .{ .start = start, .end = self.j };
        }

        /// Consumes and returns the next input character, or null at end of input.
        fn eat(self: *@This()) ?InputChar {
            return self.chars.eat();
        }

        /// Consumes input up to and including the next unescaped newline
        /// (or to the end of input).
        fn eatComment(self: *@This()) void {
            while (self.eat()) |peeked| {
                if (peeked.escaped) {
                    continue;
                }
                if (peeked.char == '\n') break;
            }
        }

        /// Consumes the next `N` characters and returns them as an array.
        /// Returns null if the input ends first or a character does not fit
        /// in `CodepointType`; characters read before that stay consumed.
        fn eat_slice(self: *@This(), comptime CodepointType: type, comptime N: usize) ?[N]CodepointType {
            var slice = [_]CodepointType{0} ** N;
            var i: usize = 0;
            while (self.peek()) |result| {
                // If we passed in codepoint range that is equal to the source
                // string, or is greater than the codepoint range of source string than an int cast
                // will not panic
                if (CodepointType == Chars.CodepointType or std.math.maxInt(CodepointType) >= std.math.maxInt(Chars.CodepointType)) {
                    slice[i] = @intCast(result.char);
                } else {
                    // Otherwise the codepoint range is smaller than the source, so we need to check that the chars are valid
                    if (result.char > std.math.maxInt(CodepointType)) {
                        return null;
                    }
                    slice[i] = @intCast(result.char);
                }

                i += 1;
                _ = self.eat();
                if (i == N) {
                    return slice;
                }
            }

            return null;
        }

        /// Returns the next input character without consuming it.
        fn peek(self: *@This()) ?InputChar {
            return self.chars.peek();
        }

        /// Forwards to `Chars.read_char`.
        fn read_char(self: *@This()) ?InputChar {
            return self.chars.read_char();
        }

        /// Prints every token with its index to stderr.
        fn debug_tokens(self: *const @This()) void {
            std.debug.print("Tokens: \n", .{});
            for (self.tokens.items, 0..)
|tok, i| {
                std.debug.print("{d}: ", .{i});
                tok.debug(self.strpool.items[0..self.strpool.items.len]);
            }
        }
    };
}

pub const StringEncoding = enum { ascii, wtf8, utf16 };

/// Byte-indexed source used when the lexer is instantiated for `.ascii`.
const SrcAscii = struct {
    bytes: []const u8,
    /// Index of the next byte to read.
    i: usize,

    const IndexValue = packed struct {
        char: u7,
        escaped: bool = false,
    };

    fn init(bytes: []const u8) SrcAscii {
        return .{
            .bytes = bytes,
            .i = 0,
        };
    }

    /// Byte at the cursor, or null at end of input.
    inline fn index(this: *const SrcAscii) ?IndexValue {
        if (this.i >= this.bytes.len) return null;
        return .{ .char = @intCast(this.bytes[this.i]) };
    }

    /// Byte one past the cursor, or null if there is none.
    inline fn indexNext(this: *const SrcAscii) ?IndexValue {
        if (this.i + 1 >= this.bytes.len) return null;
        return .{ .char = @intCast(this.bytes[this.i + 1]) };
    }

    /// Advances by one byte, or by two when `escaped` is set.
    inline fn eat(this: *SrcAscii, escaped: bool) void {
        this.i += 1 + @as(u32, @intFromBool(escaped));
    }
};

/// Codepoint-indexed source used for the non-ASCII encodings. It keeps the
/// current codepoint (`cursor`) and the one after it (`next_cursor`).
const SrcUnicode = struct {
    iter: CodepointIterator,
    cursor: CodepointIterator.Cursor,
    next_cursor: CodepointIterator.Cursor,

    const IndexValue = packed struct {
        char: u29,
        width: u3 = 0,
    };

    /// Advances `cursor` by one codepoint; when the iterator is exhausted the
    /// cursor is moved past the end of the input instead.
    fn nextCursor(iter: *const CodepointIterator, cursor: *CodepointIterator.Cursor) void {
        if (!iter.next(cursor)) {
            // This will make `i > sourceBytes.len` so the condition in `index` will fail
            cursor.i = @intCast(iter.bytes.len + 1);
            cursor.width = 1;
            cursor.c = CodepointIterator.ZeroValue;
        }
    }

    /// Positions `cursor` on the first codepoint and `next_cursor` on the second.
    fn init(bytes: []const u8) SrcUnicode {
        var iter = CodepointIterator.init(bytes);
        var cursor = CodepointIterator.Cursor{};
        nextCursor(&iter, &cursor);
        var next_cursor: CodepointIterator.Cursor = cursor;
        nextCursor(&iter, &next_cursor);
        return .{ .iter = iter, .cursor = cursor, .next_cursor = next_cursor };
    }

    /// Codepoint at the cursor, or null when the cursor is past the input.
    inline fn index(this: *const SrcUnicode) ?IndexValue {
        if (this.cursor.width + this.cursor.i > this.iter.bytes.len) return null;
        return .{ .char = this.cursor.c, .width = this.cursor.width };
    }

    /// Codepoint after the cursor, or null when there is none.
    inline fn indexNext(this: *const SrcUnicode) ?IndexValue {
        if (this.next_cursor.width + this.next_cursor.i > this.iter.bytes.len) return null;
        return .{ .char = @intCast(this.next_cursor.c), .width = this.next_cursor.width };
    }

    /// Advances by one codepoint, or by two when `escaped` is set.
    inline fn eat(this: *SrcUnicode, escaped: bool) void {
        // eat two codepoints
        if (escaped) {
            nextCursor(&this.iter, &this.next_cursor);
            this.cursor = this.next_cursor;
            nextCursor(&this.iter, &this.next_cursor);
        } else {
            // eat one codepoint
            this.cursor = this.next_cursor;
            nextCursor(&this.iter, &this.next_cursor);
        }
    }
};

/// Builds the character iterator used by the lexer: a source for `encoding`
/// plus quote state and backslash-escape handling.
pub fn ShellCharIter(comptime encoding: StringEncoding) type {
    return struct {
        src: Src,
        /// Current quoting state; changes how `read_char` treats backslashes.
        state: State = .Normal,
        /// The character eaten before `current`.
        prev: ?InputChar = null,
        /// The most recently eaten character.
        current: ?InputChar = null,

        pub const Src = switch (encoding) {
            .ascii => SrcAscii,
            .wtf8, .utf16 => SrcUnicode,
        };

        pub const CodepointType = if (encoding == .ascii) u7 else u32;

        pub const InputChar = if (encoding == .ascii) SrcAscii.IndexValue else struct {
            char: u32,
            escaped: bool = false,
        };

        /// True for tab, carriage return, newline and space.
        pub fn isWhitespace(char: InputChar) bool {
            return switch (char.char) {
                '\t', '\r', '\n', ' ' => true,
                else => false,
            };
        }

        pub const State = enum {
            Normal,
            Single,
            Double,
        };

        pub fn init(bytes: []const u8) @This() {
            const src = if (comptime encoding == .ascii)
                SrcAscii.init(bytes)
            else
                SrcUnicode.init(bytes);

            return .{
                .src = src,
            };
        }

        /// The whole input.
        pub fn srcBytes(self: *@This()) []const u8 {
            if (comptime encoding == .ascii) return self.src.bytes;

            return self.src.iter.bytes;
        }

        /// The input from the cursor position to the end (empty at end of input).
        ///
        /// NOTE(review): the non-ASCII branch reads `self.src.iter.i`, while
        /// `SrcUnicode` advances `cursor` / `next_cursor` through a const
        /// pointer to `iter` — confirm `iter.i` really tracks the read position.
        pub fn srcBytesAtCursor(self: *@This()) []const u8 {
            const bytes = self.srcBytes();
            if (comptime encoding == .ascii) {
                if (self.src.i >= bytes.len) return "";
                return bytes[self.src.i..];
            }

            if (self.src.iter.i >= bytes.len) return "";
            return bytes[self.src.iter.i..];
        }

        /// Byte offset used as the cursor position (see the note on
        /// `srcBytesAtCursor` for the non-ASCII case).
        pub fn cursorPos(self: *@This()) usize {
            if (comptime encoding == .ascii) return self.src.i;
            return self.src.iter.i;
        }

        /// Reads the next character, records it in `prev` / `current` and
        /// advances the source (past the backslash too when it was escaped).
        pub fn eat(self: *@This()) ?InputChar {
            if (self.read_char()) |result| {
                self.prev = self.current;
                self.current = result;
                self.src.eat(result.escaped);
                return result;
            }
            return null;
        }

        /// Reads the next character without advancing.
        pub fn peek(self: *@This()) ?InputChar {
            if (self.read_char()) |result| {
                return result;
            }

            return null;
        }

        /// Decodes the character at the cursor, resolving a backslash escape
        /// according to the current quote state. Does not advance.
        pub fn read_char(self: *@This()) ?InputChar {
            const indexed_value = self.src.index() orelse
return null;
            var char = indexed_value.char;
            // Inside single quotes a backslash is an ordinary character.
            if (char != '\\' or self.state == .Single) return .{ .char = char };

            // Handle backslash
            switch (self.state) {
                .Normal => {
                    // A trailing backslash with nothing after it yields no character.
                    const peeked = self.src.indexNext() orelse return null;
                    char = peeked.char;
                },
                .Double => {
                    const peeked = self.src.indexNext() orelse return null;
                    switch (peeked.char) {
                        // Backslash only applies to these characters
                        '$', '`', '"', '\\', '\n', '#' => {
                            char = peeked.char;
                        },
                        // Otherwise the backslash itself is returned as a literal.
                        else => return .{ .char = char, .escaped = false },
                    }
                },
                else => unreachable,
            }

            return .{ .char = char, .escaped = true };
        }
    };
}

/// Only these characters are allowed:
/// - a-zA-Z
/// - _
/// - 0-9 (but can't be first char)
pub fn isValidVarName(var_name: []const u8) bool {
    if (isAllAscii(var_name)) return isValidVarNameAscii(var_name);

    if (var_name.len == 0) return false;
    var iter = CodepointIterator.init(var_name);
    var cursor = CodepointIterator.Cursor{};

    if (!iter.next(&cursor)) return false;

    switch (cursor.c) {
        '=', '0'...'9' => {
            return false;
        },
        'a'...'z', 'A'...'Z', '_' => {},
        else => return false,
    }

    while (iter.next(&cursor)) {
        switch (cursor.c) {
            '0'...'9', 'a'...'z', 'A'...'Z', '_' => {},
            else => return false,
        }
    }

    return true;
}

/// ASCII fast path of `isValidVarName`: checks the first byte separately,
/// then validates 16 bytes at a time with vector comparisons and hands the
/// remainder to `isValidVarNameSlowAscii`.
fn isValidVarNameAscii(var_name: []const u8) bool {
    if (var_name.len == 0) return false;
    switch (var_name[0]) {
        '=', '0'...'9' => {
            return false;
        },
        'a'...'z', 'A'...'Z', '_' => {},
        else => return false,
    }

    if (var_name.len - 1 < 16)
        return isValidVarNameSlowAscii(var_name);

    // Range bounds are inclusive, and the digit bounds are the characters
    // '0' and '9' (not the integers 0 and 9).
    const upper_a: @Vector(16, u8) = @splat('A');
    const upper_z: @Vector(16, u8) = @splat('Z');
    const lower_a: @Vector(16, u8) = @splat('a');
    const lower_z: @Vector(16, u8) = @splat('z');
    const zero: @Vector(16, u8) = @splat('0');
    const nine: @Vector(16, u8) = @splat('9');
    const underscore: @Vector(16, u8) = @splat('_');

    const BoolVec = @Vector(16, u1);

    var i: usize = 0;
    while (i + 16 <= var_name.len) : (i += 16) {
        const chars: @Vector(16, u8) = var_name[i..][0..16].*;

        const in_upper = @as(BoolVec, @bitCast(chars >= upper_a)) & @as(BoolVec, @bitCast(chars <= upper_z));
        const in_lower = @as(BoolVec, @bitCast(chars >= lower_a)) & @as(BoolVec, @bitCast(chars <= lower_z));
        const in_digit = @as(BoolVec, @bitCast(chars >= zero)) & @as(BoolVec, @bitCast(chars <= nine));
        const is_underscore = @as(BoolVec, @bitCast(chars == underscore));

        // Every lane must fall in at least one of the allowed classes.
        const merged = @as(@Vector(16, bool), @bitCast(in_upper | in_lower | in_digit | is_underscore));
        if (std.simd.countTrues(merged) != 16) return false;
    }

    return isValidVarNameSlowAscii(var_name[i..]);
}

/// Returns true if every byte is a letter, digit or underscore (also true
/// for an empty slice). Does not apply the first-character rule.
fn isValidVarNameSlowAscii(var_name: []const u8) bool {
    for (var_name) |c| {
        switch (c) {
            '0'...'9', 'a'...'z', 'A'...'Z', '_' => {},
            else => return false,
        }
    }
    return true;
}

var stderr_mutex = std.Thread.Mutex{};

/// Closes `fd`; a failure is only logged.
pub fn closefd(fd: bun.FileDescriptor) void {
    if (Syscall.close2(fd)) |err| {
        _ = err;
        log("ERR closefd: {d}\n", .{fd});
        // stderr_mutex.lock();
        // defer stderr_mutex.unlock();
        // const stderr = std.io.getStdErr().writer();
        // err.toSystemError().format("error", .{}, stderr) catch @panic("damn");
    }
}

/// Returns the byte index of the first `=` in `str`, or null if there is none.
pub fn hasEqSign(str: []const u8) ?u32 {
    if (isAllAscii(str)) {
        if (str.len < 16)
            return hasEqSignAsciiSlow(str);

        const needles: @Vector(16, u8) = @splat('=');

        var i: u32 = 0;
        while (i + 16 <= str.len) : (i += 16) {
            const haystack = str[i..][0..16].*;
            const result = haystack == needles;

            if (std.simd.firstTrue(result)) |idx| {
                return @intCast(i + idx);
            }
        }

        // Scan the tail that did not fill a whole vector.
        return i + (hasEqSignAsciiSlow(str[i..]) orelse return null);
    }

    // TODO actually i think that this can also use the simd stuff

    var iter = CodepointIterator.init(str);
    var cursor = CodepointIterator.Cursor{};
    while (iter.next(&cursor)) {
        if (cursor.c == '=') {
            return @intCast(cursor.i);
        }
    }

    return null;
}

/// Scalar scan for the first `=`; returns its index or null.
pub fn hasEqSignAsciiSlow(str: []const u8) ?u32 {
    for (str, 0..)
|c, i| if (c == '=') return @intCast(i);
    return null;
}

/// Iterates the key/value pairs of an environment map, wrapping each side in
/// a small type that can be passed to `std.fmt`.
pub const CmdEnvIter = struct {
    env: *const std.StringArrayHashMap([:0]const u8),
    iter: std.StringArrayHashMap([:0]const u8).Iterator,

    const Entry = struct {
        key: Key,
        value: Value,
    };

    const Value = struct {
        val: [:0]const u8,

        /// Writes the raw value.
        pub fn format(self: Value, comptime _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
            try writer.writeAll(self.val);
        }
    };

    const Key = struct {
        val: []const u8,

        /// Writes the raw key.
        pub fn format(self: Key, comptime _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
            try writer.writeAll(self.val);
        }

        pub fn eqlComptime(this: Key, comptime str: []const u8) bool {
            return bun.strings.eqlComptime(this.val, str);
        }
    };

    /// Creates an iterator positioned at the first entry of `env`.
    pub fn fromEnv(env: *const std.StringArrayHashMap([:0]const u8)) CmdEnvIter {
        const iter = env.iterator();
        return .{
            .env = env,
            .iter = iter,
        };
    }

    /// Number of entries in the underlying map.
    pub fn len(self: *const CmdEnvIter) usize {
        return self.env.unmanaged.entries.len;
    }

    /// Returns the next entry, or null when the map is exhausted.
    pub fn next(self: *CmdEnvIter) !?Entry {
        const entry = self.iter.next() orelse return null;
        return .{
            .key = .{ .val = entry.key_ptr.* },
            .value = .{ .val = entry.value_ptr.* },
        };
    }
};

const ExpansionStr = union(enum) {};

pub const Test = struct {
    /// Mirror of `Token` in which `Var` / `Text` carry the text itself
    /// instead of a range into a string pool.
    pub const TestToken = union(TokenTag) {
        // |
        Pipe,
        // ||
        DoublePipe,
        // &
        Ampersand,
        // &&
        DoubleAmpersand,

        // >
        Redirect: AST.Cmd.RedirectFlags,

        // $
        Dollar,
        // *
        Asterisk,
        DoubleAsterisk,
        // =
        Eq,
        Semicolon,
        Newline,

        BraceBegin,
        Comma,
        BraceEnd,
        CmdSubstBegin,
        CmdSubstQuoted,
        CmdSubstEnd,
        OpenParen,
        CloseParen,

        Var: []const u8,
        Text: []const u8,
        JSObjRef: u32,

        Delimit,
        Eof,

        /// Converts a lexer token, resolving text ranges against `buf`.
        pub fn from_real(the_token: Token, buf: []const u8) TestToken {
            switch (the_token) {
                .Var => |txt| return .{ .Var = buf[txt.start..txt.end] },
                .Text => |txt| return .{ .Text = buf[txt.start..txt.end] },
                .JSObjRef => |val| return .{ .JSObjRef = val },
                .Pipe => return .Pipe,
                .DoublePipe => return .DoublePipe,
                .Ampersand => return .Ampersand,
                .DoubleAmpersand => return .DoubleAmpersand,
                .Redirect => |r| return .{ .Redirect = r },
                .Dollar => return .Dollar,
                .Asterisk => return .Asterisk,
                .DoubleAsterisk => return .DoubleAsterisk,
                .Eq => return .Eq,
                .Semicolon => return .Semicolon,
                .Newline => return .Newline,
                .BraceBegin => return .BraceBegin,
                .Comma => return .Comma,
                .BraceEnd => return .BraceEnd,
                .CmdSubstBegin => return .CmdSubstBegin,
                .CmdSubstQuoted => return .CmdSubstQuoted,
                .CmdSubstEnd => return .CmdSubstEnd,
                .OpenParen => return .OpenParen,
                .CloseParen => return .CloseParen,
                .Delimit => return .Delimit,
                .Eof => return .Eof,
            }
        }
    };
};

/// Builds the shell script source from the pieces of a template call.
///
/// The string parts in `string_args` are appended to `out_script` verbatim,
/// interleaved with the values in `template_args`, which go through
/// `handleTemplateValue`. Returns false after throwing a JS exception when a
/// part cannot be used; returns true otherwise.
pub fn shellCmdFromJS(
    globalThis: *JSC.JSGlobalObject,
    string_args: JSValue,
    template_args: []const JSValue,
    out_jsobjs: *std.ArrayList(JSValue),
    jsstrings: *std.ArrayList(bun.String),
    out_script: *std.ArrayList(u8),
) !bool {
    var builder = ShellSrcBuilder.init(globalThis, out_script, jsstrings);
    var jsobjref_buf: [128]u8 = [_]u8{0} ** 128;

    var string_iter = string_args.arrayIterator(globalThis);
    var i: u32 = 0;
    const last = string_iter.len -| 1;
    while (string_iter.next()) |js_value| {
        defer i += 1;
        if (!try builder.appendJSValueStr(js_value, false)) {
            globalThis.throw("Shell script string contains invalid UTF-16", .{});
            return false;
        }
        // A value follows every string except the last one. The extra bounds
        // check guards against callers passing fewer values than that, which
        // would otherwise index past the end of `template_args`.
        if (i < last and i < template_args.len) {
            const template_value = template_args[i];
            if (!(try handleTemplateValue(globalThis, template_value, out_jsobjs, out_script, jsstrings, jsobjref_buf[0..]))) return false;
        }
    }
    return true;
}

/// Appends the script representation of one interpolated JS value to
/// `out_script`.
///
/// Values kept as JS objects are pushed to `out_jsobjs` (and protected) and
/// referenced from the script by index; string-like values are appended as
/// text or as an escaped string reference. Returns false after throwing a JS
/// exception when the value cannot be used. `jsobjref_buf` is scratch space
/// for formatting object references.
pub fn handleTemplateValue(
    globalThis: *JSC.JSGlobalObject,
    template_value: JSValue,
    out_jsobjs: *std.ArrayList(JSValue),
    out_script: *std.ArrayList(u8),
    jsstrings: *std.ArrayList(bun.String),
    jsobjref_buf: []u8,
) !bool {
    var builder = ShellSrcBuilder.init(globalThis, out_script, jsstrings);
    if (!template_value.isEmpty()) {
        if (template_value.asArrayBuffer(globalThis)) |array_buffer| {
            _ = array_buffer;
            const idx = out_jsobjs.items.len;
            // Protect only once the value is stored, so a failed append
            // does not leave it protected.
            try out_jsobjs.append(template_value);
            template_value.protect();
            const slice = try
std.fmt.bufPrint(jsobjref_buf[0..], "{s}{d}", .{ LEX_JS_OBJREF_PREFIX, idx });
            try out_script.appendSlice(slice);
            return true;
        }

        if (template_value.as(JSC.WebCore.Blob)) |blob| {
            // A file-backed blob with a path is interpolated as that path.
            if (blob.store) |store| {
                if (store.data == .file) {
                    if (store.data.file.pathlike == .path) {
                        const path = store.data.file.pathlike.path.slice();
                        if (!try builder.appendUTF8(path, true)) {
                            globalThis.throw("Shell script string contains invalid UTF-16", .{});
                            return false;
                        }
                        return true;
                    }
                }
            }

            const idx = out_jsobjs.items.len;
            // Protect only once the value is stored, so a failed append
            // does not leave it protected.
            try out_jsobjs.append(template_value);
            template_value.protect();
            const slice = try std.fmt.bufPrint(jsobjref_buf[0..], "{s}{d}", .{ LEX_JS_OBJREF_PREFIX, idx });
            try out_script.appendSlice(slice);
            return true;
        }

        if (JSC.WebCore.ReadableStream.fromJS(template_value, globalThis)) |rstream| {
            _ = rstream;

            const idx = out_jsobjs.items.len;
            try out_jsobjs.append(template_value);
            template_value.protect();
            const slice = try std.fmt.bufPrint(jsobjref_buf[0..], "{s}{d}", .{ LEX_JS_OBJREF_PREFIX, idx });
            try out_script.appendSlice(slice);
            return true;
        }

        if (template_value.as(JSC.WebCore.Response)) |req| {
            _ = req;

            const idx = out_jsobjs.items.len;
            try out_jsobjs.append(template_value);
            template_value.protect();
            const slice = try std.fmt.bufPrint(jsobjref_buf[0..], "{s}{d}", .{ LEX_JS_OBJREF_PREFIX, idx });
            try out_script.appendSlice(slice);
            return true;
        }

        if (template_value.isString()) {
            if (!try builder.appendJSValueStr(template_value, true)) {
                globalThis.throw("Shell script string contains invalid UTF-16", .{});
                return false;
            }
            return true;
        }

        if (template_value.jsType().isArray()) {
            // Elements are handled recursively and joined with single spaces.
            var array = template_value.arrayIterator(globalThis);
            const last = array.len -| 1;
            var i: u32 = 0;
            while (array.next()) |arr| : (i += 1) {
                if (!(try handleTemplateValue(globalThis, arr, out_jsobjs, out_script, jsstrings, jsobjref_buf))) return false;
                if (i < last) {
                    const str = bun.String.static(" ");
                    if (!try builder.appendBunStr(str, false)) return false;
                }
            }
            return true;
        }

        if (template_value.isObject()) {
            // An object with a truthy `raw` property is inserted without escaping.
            if (template_value.getTruthy(globalThis, "raw")) |maybe_str| {
                const bunstr = maybe_str.toBunString(globalThis);
                defer bunstr.deref();
                if (!try builder.appendBunStr(bunstr, false)) {
                    globalThis.throw("Shell script string contains invalid UTF-16", .{});
                    return false;
                }
                return true;
            }
        }

        if (template_value.isPrimitive()) {
            if (!try builder.appendJSValueStr(template_value, true)) {
                globalThis.throw("Shell script string contains invalid UTF-16", .{});
                return false;
            }
            return true;
        }

        if (template_value.implementsToString(globalThis)) {
            if (!try builder.appendJSValueStr(template_value, true)) {
                globalThis.throw("Shell script string contains invalid UTF-16", .{});
                return false;
            }
            return true;
        }

        globalThis.throw("Invalid JS object used in shell: {}, you might need to call `.toString()` on it", .{template_value.fmtString(globalThis)});
        return false;
    }

    return true;
}

/// Appends pieces of shell source to `outbuf`. Strings that need escaping
/// are not inlined: they are pushed to `jsstrs_to_escape` and replaced in the
/// source by `LEX_JS_STRING_PREFIX` followed by their index.
pub const ShellSrcBuilder = struct {
    globalThis: *JSC.JSGlobalObject,
    outbuf: *std.ArrayList(u8),
    jsstrs_to_escape: *std.ArrayList(bun.String),
    /// Scratch space for formatting a string reference.
    jsstr_ref_buf: [128]u8 = [_]u8{0} ** 128,

    pub fn init(
        globalThis: *JSC.JSGlobalObject,
        outbuf: *std.ArrayList(u8),
        jsstrs_to_escape: *std.ArrayList(bun.String),
    ) ShellSrcBuilder {
        return .{
            .globalThis = globalThis,
            .outbuf = outbuf,
            .jsstrs_to_escape = jsstrs_to_escape,
        };
    }

    /// Converts `jsval` to a string and appends it via `appendBunStr`.
    pub fn appendJSValueStr(this: *ShellSrcBuilder, jsval: JSValue, comptime allow_escape: bool) !bool {
        const bunstr = jsval.toBunString(this.globalThis);
        defer bunstr.deref();

        return try this.appendBunStr(bunstr, allow_escape);
    }

    /// Appends `bunstr` to the script. Returns false (appending nothing) if
    /// the string is not valid in its own encoding. With `allow_escape`, a
    /// string that needs escaping is added as a string reference instead of
    /// inline text.
    pub fn appendBunStr(
        this: *ShellSrcBuilder,
        bunstr: bun.String,
        comptime allow_escape: bool,
    ) !bool {
        const invalid = (bunstr.isUTF16() and !bun.simdutf.validate.utf16le(bunstr.utf16())) or (bunstr.isUTF8() and !bun.simdutf.validate.utf8(bunstr.byteSlice()));
        if (invalid) return false;
        if (allow_escape) {
            if (needsEscapeBunstr(bunstr)) {
                try this.appendJSStrRef(bunstr);
                return true;
            }
        }
        if (bunstr.isUTF16()) {
            try this.appendUTF16Impl(bunstr.utf16());
            return true;
        }
        if (bunstr.isUTF8() or bun.strings.isAllASCII(bunstr.byteSlice())) {
            try this.appendUTF8Impl(bunstr.byteSlice());
            return true;
        }
        try this.appendLatin1Impl(bunstr.byteSlice());
        return true;
    }

    /// Appends UTF-8 text to the script. Returns false (appending nothing)
    /// if `utf8` is not valid UTF-8. With `allow_escape`, text that needs
    /// escaping is added as a string reference instead of inline text.
    pub fn appendUTF8(this: *ShellSrcBuilder, utf8: []const u8, comptime allow_escape: bool) !bool {
        const valid = bun.simdutf.validate.utf8(utf8);
        if (!valid) return false;
        if (allow_escape) {
            if (needsEscapeUtf8AsciiLatin1(utf8)) {
                const bunstr = bun.String.createUTF8(utf8);
                defer bunstr.deref();
                try this.appendJSStrRef(bunstr);
                return true;
            }
        }

        try this.appendUTF8Impl(utf8);
        return true;
    }

    /// Converts `utf16` to UTF-8 and appends it to the output buffer.
    pub fn appendUTF16Impl(this: *ShellSrcBuilder, utf16: []const u16) !void {
        const size = bun.simdutf.simdutf__utf8_length_from_utf16le(utf16.ptr, utf16.len);
        try this.outbuf.ensureUnusedCapacity(size);
        try bun.strings.convertUTF16ToUTF8Append(this.outbuf, utf16);
    }

    /// Appends the bytes unchanged.
    pub fn appendUTF8Impl(this: *ShellSrcBuilder, utf8: []const u8) !void {
        try this.outbuf.appendSlice(utf8);
    }

    /// Converts Latin-1 text to UTF-8 and appends it to the output buffer.
    ///
    /// NOTE(review): the ASCII prefix is appended first and then the whole of
    /// `latin1` (prefix included) is passed to the conversion helper. If the
    /// helper writes its full input at the given offset, the prefix ends up
    /// in the output twice — confirm against
    /// `bun.strings.allocateLatin1IntoUTF8WithList`.
    pub fn appendLatin1Impl(this: *ShellSrcBuilder, latin1: []const u8) !void {
        const non_ascii_idx = bun.strings.firstNonASCII(latin1) orelse 0;

        if (non_ascii_idx > 0) {
            try this.appendUTF8Impl(latin1[0..non_ascii_idx]);
        }

        this.outbuf.* = try bun.strings.allocateLatin1IntoUTF8WithList(this.outbuf.*, this.outbuf.items.len, []const u8, latin1);
    }

    /// Writes a reference to `bunstr` (`LEX_JS_STRING_PREFIX` + index) into
    /// the script and stores the string, with its ref count incremented, in
    /// `jsstrs_to_escape`.
    pub fn appendJSStrRef(this: *ShellSrcBuilder, bunstr: bun.String) !void {
        const idx = this.jsstrs_to_escape.items.len;
        const str = std.fmt.bufPrint(this.jsstr_ref_buf[0..], "{s}{d}", .{ LEX_JS_STRING_PREFIX, idx }) catch {
            @panic("Impossible");
        };
        try this.outbuf.appendSlice(str);
        // Take the reference only once the string is stored, so a failed
        // append does not leak a ref count.
        try this.jsstrs_to_escape.append(bunstr);
        bunstr.ref();
    }
};

/// Characters that need to be escaped
const SPECIAL_CHARS = [_]u8{ '$', '>', '&', '|', '=', ';', '\n', '{', '}', ',', '(', ')', '\\', '\"', ' ', '\'' };
/// Characters that need to be backslashed inside double quotes
const BACKSLASHABLE_CHARS = [_]u8{ '$', '`', '"', '\\' };

pub fn
escapeBunStr(bunstr: bun.String, outbuf: *std.ArrayList(u8), comptime add_quotes: bool) !bool { // latin-1 or ascii if (bunstr.is8Bit()) { try escape8Bit(bunstr.byteSlice(), outbuf, add_quotes); return true; } if (bunstr.isUTF16()) { return try escapeUtf16(bunstr.utf16(), outbuf, add_quotes); } // Otherwise is utf-8 try escapeWTF8(bunstr.byteSlice(), outbuf, add_quotes); return true; } /// works for latin-1 and ascii pub fn escape8Bit(str: []const u8, outbuf: *std.ArrayList(u8), comptime add_quotes: bool) !void { try outbuf.ensureUnusedCapacity(str.len); if (add_quotes) try outbuf.append('\"'); loop: for (str) |c| { inline for (BACKSLASHABLE_CHARS) |spc| { if (spc == c) { try outbuf.appendSlice(&.{ '\\', c, }); continue :loop; } } try outbuf.append(c); } if (add_quotes) try outbuf.append('\"'); } pub fn escapeWTF8(str: []const u8, outbuf: *std.ArrayList(u8), comptime add_quotes: bool) !void { try outbuf.ensureUnusedCapacity(str.len); var bytes: [8]u8 = undefined; var n: u3 = if (add_quotes) bun.strings.encodeWTF8Rune(bytes[0..4], '"') else 0; if (add_quotes) try outbuf.appendSlice(bytes[0..n]); loop: for (str) |c| { inline for (BACKSLASHABLE_CHARS) |spc| { if (spc == c) { n = bun.strings.encodeWTF8Rune(bytes[0..4], '\\'); var next: [4]u8 = bytes[n..][0..4].*; n += bun.strings.encodeWTF8Rune(&next, @intCast(c)); try outbuf.appendSlice(bytes[0..n]); // try outbuf.appendSlice(&.{ // '\\', // c, // }); continue :loop; } } n = bun.strings.encodeWTF8Rune(bytes[0..4], @intCast(c)); try outbuf.appendSlice(bytes[0..n]); } if (add_quotes) { n = bun.strings.encodeWTF8Rune(bytes[0..4], '"'); try outbuf.appendSlice(bytes[0..n]); } } pub fn escapeUtf16(str: []const u16, outbuf: *std.ArrayList(u8), comptime add_quotes: bool) !bool { if (add_quotes) try outbuf.append('"'); const non_ascii = bun.strings.firstNonASCII16([]const u16, str) orelse 0; var cp_buf: [4]u8 = undefined; var i: usize = 0; loop: while (i < str.len) { const char: u32 = brk: { if (i < non_ascii) { i += 1; break 
:brk str[i]; } const ret = bun.strings.utf16Codepoint([]const u16, str[i..]); if (ret.fail) return false; i += ret.len; break :brk ret.code_point; }; inline for (BACKSLASHABLE_CHARS) |bchar| { if (@as(u32, @intCast(bchar)) == char) { try outbuf.appendSlice(&[_]u8{ '\\', @intCast(char) }); continue :loop; } } const len = bun.strings.encodeWTF8RuneT(&cp_buf, u32, char); try outbuf.appendSlice(cp_buf[0..len]); } if (add_quotes) try outbuf.append('"'); return true; } pub fn needsEscapeBunstr(bunstr: bun.String) bool { if (bunstr.isUTF16()) return needsEscapeUTF16(bunstr.utf16()); // Otherwise is utf-8, ascii, or latin-1 return needsEscapeUtf8AsciiLatin1(bunstr.byteSlice()); } pub fn needsEscapeUTF16Slow(str: []const u16) bool { for (str) |codeunit| { inline for (SPECIAL_CHARS) |spc| { if (@as(u16, @intCast(spc)) == codeunit) return true; } } return false; } pub fn needsEscapeUTF16(str: []const u16) bool { if (str.len < 64) return needsEscapeUTF16Slow(str); const needles = comptime brk: { var needles: [SPECIAL_CHARS.len]@Vector(8, u16) = undefined; for (SPECIAL_CHARS, 0..) |c, i| { needles[i] = @splat(@as(u16, @intCast(c))); } break :brk needles; }; var i: usize = 0; while (i + 8 <= str.len) : (i += 8) { const haystack: @Vector(8, u16) = str[i..][0..8].*; inline for (needles) |needle| { const result = haystack == needle; if (std.simd.firstTrue(result) != null) return true; } } if (i < str.len) return needsEscapeUTF16Slow(str[i..]); return false; } /// Checks for the presence of any char from `SPECIAL_CHARS` in `str`. This /// indicates the *possibility* that the string must be escaped, so it can have /// false positives, but it is faster than running the shell lexer through the /// input string for a more correct implementation. pub fn needsEscapeUtf8AsciiLatin1(str: []const u8) bool { if (str.len < 128) return needsEscapeUtf8AsciiLatin1Slow(str); const needles = comptime brk: { var needles: [SPECIAL_CHARS.len]@Vector(16, u8) = undefined; for (SPECIAL_CHARS, 0..) 
|c, i| { needles[i] = @splat(c); } break :brk needles; }; var i: usize = 0; while (i + 16 <= str.len) : (i += 16) { const haystack: @Vector(16, u8) = str[i..][0..16].*; inline for (needles) |needle| { const result = haystack == needle; if (std.simd.firstTrue(result) != null) return true; } } if (i < str.len) return needsEscapeUtf8AsciiLatin1Slow(str[i..]); return false; } pub fn needsEscapeUtf8AsciiLatin1Slow(str: []const u8) bool { for (str) |c| { inline for (SPECIAL_CHARS) |spc| { if (spc == c) return true; } } return false; }