fix(shell): non-ASCII characters in template literals (#12225)

Fix four bugs in Bun Shell's handling of non-ASCII characters: 1. Unicode lexer `srcBytesAtCursor`/`cursorPos` used `iter.i` (the decode-ahead position) instead of `cursor.i` (the current codepoint position), causing `__bunstr_` reference markers to not be found. 2. `bumpCursorAscii` set the Unicode cursor's codepoint to the last parsed digit instead of decoding the actual codepoint at the new position, causing stale characters to leak into output. 3. Template literal raw strings convert non-ASCII chars to `\uXXXX` escape sequences. Added `unescapeUnicodeRaw` to convert these back to actual Unicode characters before passing to the shell parser. 4. `appendLatin1Impl` appended the ASCII prefix of Latin-1 strings twice: once directly and once via `allocateLatin1IntoUTF8WithList` which already handles the full string. Co-Authored-By: Claude <noreply@anthropic.com>
2026-02-19 23:31:45 +00:00 · 2026-02-19 11:03:28 +00:00
4 changed files with 92 additions and 17 deletions
--- a/src/js/builtins/shell.ts
+++ b/src/js/builtins/shell.ts
@@ -299,9 +299,35 @@ export function createBunShellTemplateFunction(createShellInterpreter_, createPa
    }
  }

+  // Template literal raw strings convert non-ASCII characters to \uXXXX or \u{XXXX}
+  // escape sequences; unescapeUnicodeRaw converts them back to actual Unicode characters
+  // for the shell and returns `raw` itself when nothing changed. Raw (not cooked) strings
+  // are used so backslash sequences like \n are preserved for shell interpretation. The
+  // lookbehind only matches \u after an even number of backslashes (including zero), so
+  // \\uXXXX stays literal. NOTE(review): \u{...} above 10FFFF makes fromCodePoint throw.
+  const unicodeEscapeRegex = /(?<=(?:^|[^\\])(?:\\\\)*)\\u(?:\{([0-9a-fA-F]+)\}|([0-9a-fA-F]{4}))/g;
+  function unescapeUnicodeRaw(raw: readonly string[]): readonly string[] {
+    var needsResult = false;
+    const result = new Array(raw.length);
+    for (var i = 0; i < raw.length; i++) {
+      const s = raw[i];
+      // Fast path: a segment with no backslash has no escapes, so it is reused untouched
+      if (s.indexOf("\\") === -1) {
+        result[i] = s;
+        continue;
+      }
+      const replaced = s.replace(unicodeEscapeRegex, function (_match, bracedHex, quadHex) {
+        return String.fromCodePoint(parseInt(bracedHex || quadHex, 16));
+      });
+      if (replaced !== s) needsResult = true;
+      result[i] = replaced;
+    }
+    return needsResult ? result : raw;
+  }
+
  var BunShell = function BunShell(first, ...rest) {
    if (first?.raw === undefined) throw new Error("Please use '$' as a tagged template function: $`cmd arg1 arg2`");
-    const parsed_shell_script = createParsedShellScript(first.raw, rest);
+    const parsed_shell_script = createParsedShellScript(unescapeUnicodeRaw(first.raw), rest);

    const cwd = BunShell[cwdSymbol];
    const env = BunShell[envSymbol];
@@ -321,7 +347,7 @@ export function createBunShellTemplateFunction(createShellInterpreter_, createPa

    var Shell = function Shell(first, ...rest) {
      if (first?.raw === undefined) throw new Error("Please use '$' as a tagged template function: $`cmd arg1 arg2`");
-      const parsed_shell_script = createParsedShellScript(first.raw, rest);
+      const parsed_shell_script = createParsedShellScript(unescapeUnicodeRaw(first.raw), rest);

      const cwd = Shell[cwdSymbol];
      const env = Shell[envSymbol];
--- a/src/shell/shell.zig
+++ b/src/shell/shell.zig
@@ -3251,11 +3251,22 @@ pub fn NewLexer(comptime encoding: StringEncoding) type {
                self.chars.current = .{ .char = cur_ascii_char };
                return;
            }
+            // For Unicode, we need to properly decode the codepoint at new_idx
+            // rather than using the ASCII char from the substitution digits.
+            // Set cursor to just before new_idx so that next() advances to new_idx.
            self.chars.src.cursor = CodepointIterator.Cursor{
                .i = @intCast(new_idx),
-                .c = cur_ascii_char,
-                .width = 1,
+                .c = 0,
+                .width = 0,
            };
+            // next() computes pos = cursor.i + cursor.width = new_idx + 0 = new_idx,
+            // then reads the codepoint at new_idx and updates cursor.
+            if (!self.chars.src.iter.next(&self.chars.src.cursor)) {
+                // At end of input
+                self.chars.src.cursor.i = @intCast(self.chars.src.iter.bytes.len + 1);
+                self.chars.src.cursor.width = 1;
+                self.chars.src.cursor.c = CodepointIterator.ZeroValue;
+            }
            self.chars.src.next_cursor = self.chars.src.cursor;
            SrcUnicode.nextCursor(&self.chars.src.iter, &self.chars.src.next_cursor);
            if (prev_ascii_char) |pc| self.chars.prev = .{ .char = pc };
@@ -3602,13 +3613,13 @@ pub fn ShellCharIter(comptime encoding: StringEncoding) type {
                return bytes[self.src.i..];
            }

-            if (self.src.iter.i >= bytes.len) return "";
-            return bytes[self.src.iter.i..];
+            if (self.src.cursor.i >= bytes.len) return "";
+            return bytes[self.src.cursor.i..];
        }

        pub fn cursorPos(self: *@This()) usize {
            if (comptime encoding == .ascii) return self.src.i;
-            return self.src.iter.i;
+            return self.src.cursor.i;
        }

        pub fn eat(self: *@This()) ?InputChar {
@@ -4104,12 +4115,6 @@ pub const ShellSrcBuilder = struct {
    }

    pub fn appendLatin1Impl(this: *ShellSrcBuilder, latin1: []const u8) !void {
-        const non_ascii_idx = bun.strings.firstNonASCII(latin1) orelse 0;
-
-        if (non_ascii_idx > 0) {
-            try this.appendUTF8Impl(latin1[0..non_ascii_idx]);
-        }
-
        this.outbuf.* = try bun.strings.allocateLatin1IntoUTF8WithList(this.outbuf.*, this.outbuf.items.len, latin1);
    }

--- a/test/js/bun/shell/bunshell.test.ts
+++ b/test/js/bun/shell/bunshell.test.ts
@@ -294,10 +294,7 @@ describe("bunshell", () => {

    test("escape unicode", async () => {
      const { stdout } = await $`echo \\弟\\気`;
-      // TODO: Uncomment and replace after unicode in template tags is supported
-      // expect(stdout.toString("utf8")).toEqual(`\弟\気\n`);
-      // Set this here for now, because unicode in template tags while using .raw is broken, but should be fixed
-      expect(stdout.toString("utf8")).toEqual("\\u5F1F\\u6C17\n");
+      expect(stdout.toString("utf8")).toEqual("\\弟\\気\n");
    });

    /**
--- a/test/regression/issue/12225.test.ts
+++ b/test/regression/issue/12225.test.ts
@@ -0,0 +1,47 @@
+import { $ } from "bun";
+import { expect, test } from "bun:test";
+
+// https://github.com/oven-sh/bun/issues/12225
+
+test("non-ASCII interpolated value with special chars needing escape", async () => {
+  const rating = "3"; // Contains digit - needs escaping via __bunstr_ ref
+  const label = "檢視"; // Non-ASCII
+
+  const result = await $`echo key=${rating} ${label}`.text();
+  expect(result.trim()).toBe("key=3 檢視");
+});
+
+test("non-ASCII static template text", async () => {
+  const result = await $`echo 檢視`.text();
+  expect(result.trim()).toBe("檢視");
+});
+
+test("non-ASCII interpolated value without special chars", async () => {
+  const label = "檢視";
+  const result = await $`echo ${label}`.text();
+  expect(result.trim()).toBe("檢視");
+});
+
+test("mixed ASCII and non-ASCII with multiple interpolations", async () => {
+  const num = "42";
+  const text = "日本語";
+  const result = await $`echo ${num} hello ${text} world`.text();
+  expect(result.trim()).toBe("42 hello 日本語 world");
+});
+
+test("supplementary plane characters in static template", async () => {
+  // U+1D573 is outside BMP, uses \u{XXXX} in raw string
+  const result = await $`echo 𝕳ello`.text();
+  expect(result.trim()).toBe("𝕳ello");
+});
+
+test("backslash-escaped unicode in template preserved", async () => {
+  // \\弟 in source means literal backslash + 弟
+  const result = await $`echo \\弟\\気`.text();
+  expect(result.trim()).toBe("\\弟\\気");
+});
+
+test("latin-1 characters in static template", async () => {
+  const result = await $`echo café`.text();
+  expect(result.trim()).toBe("café");
+});