Compare commits

...

1 Commits

Author SHA1 Message Date
Claude Bot
633a8303f3 fix(shell): non-ASCII characters in template literals (#12225)
Fix four bugs in Bun Shell's handling of non-ASCII characters:

1. Unicode lexer `srcBytesAtCursor`/`cursorPos` used `iter.i` (the
   decode-ahead position) instead of `cursor.i` (the current codepoint
   position), causing `__bunstr_` reference markers to not be found.

2. `bumpCursorAscii` set the Unicode cursor's codepoint to the last
   parsed digit instead of decoding the actual codepoint at the new
   position, causing stale characters to leak into output.

3. Template literal raw strings convert non-ASCII chars to `\uXXXX`
   escape sequences. Added `unescapeUnicodeRaw` to convert these back
   to actual Unicode characters before passing to the shell parser.

4. `appendLatin1Impl` appended the ASCII prefix of Latin-1 strings
   twice: once directly and once via `allocateLatin1IntoUTF8WithList`
   which already handles the full string.

Co-Authored-By: Claude <noreply@anthropic.com>
2026-02-19 11:03:28 +00:00
4 changed files with 92 additions and 17 deletions

View File

@@ -299,9 +299,35 @@ export function createBunShellTemplateFunction(createShellInterpreter_, createPa
}
}
// Template literal raw strings convert non-ASCII characters to \uXXXX or \u{XXXX}
// escape sequences. These must be converted back to actual Unicode characters for
// the shell to handle them correctly. Raw strings (not cooked) are used so that
// backslash sequences like \n are preserved for shell interpretation.
//
// The lookbehind only admits a \u preceded by an even number of backslashes
// (including zero), so \\uXXXX (a literal backslash followed by "uXXXX") is preserved.
// Capture group 1 holds the hex digits of the braced form, group 2 those of the
// four-digit form; exactly one of them participates in any match.
const unicodeEscapeRegex = /(?<=(?:^|[^\\])(?:\\\\)*)\\u(?:\{([0-9a-fA-F]+)\}|([0-9a-fA-F]{4}))/g;

/**
 * Replaces \uXXXX and \u{X...} escape sequences in template raw strings with the
 * characters they denote.
 *
 * Escapes that do not denote a valid code point (above U+10FFFF) are left exactly
 * as written instead of throwing, so the text reaches the shell parser unchanged.
 *
 * @param raw The `raw` strings of a tagged template call.
 * @returns `raw` itself when nothing needed unescaping, otherwise a new array of the
 *          same length with the affected strings replaced.
 */
function unescapeUnicodeRaw(raw: readonly string[]): readonly string[] {
  // Copy of `raw`, created only once a string actually changes, so the common
  // all-ASCII case allocates nothing.
  var result: string[] | undefined;
  for (var i = 0; i < raw.length; i++) {
    const s = raw[i];
    // Fast path: without a backslash there are no escapes to process.
    if (s.indexOf("\\") === -1) continue;
    const replaced = s.replace(unicodeEscapeRegex, function (match: string, bracedHex?: string, quadHex?: string) {
      const codePoint = parseInt(bracedHex ?? quadHex ?? "", 16);
      // String.fromCodePoint throws a RangeError above U+10FFFF, and tagged templates
      // keep malformed escapes such as \u{110000} in their raw strings. Keep those
      // verbatim. The negated comparison also rejects NaN.
      if (!(codePoint <= 0x10ffff)) return match;
      return String.fromCodePoint(codePoint);
    });
    if (replaced === s) continue;
    if (result === undefined) result = raw.slice();
    result[i] = replaced;
  }
  return result ?? raw;
}
var BunShell = function BunShell(first, ...rest) {
if (first?.raw === undefined) throw new Error("Please use '$' as a tagged template function: $`cmd arg1 arg2`");
const parsed_shell_script = createParsedShellScript(first.raw, rest);
const parsed_shell_script = createParsedShellScript(unescapeUnicodeRaw(first.raw), rest);
const cwd = BunShell[cwdSymbol];
const env = BunShell[envSymbol];
@@ -321,7 +347,7 @@ export function createBunShellTemplateFunction(createShellInterpreter_, createPa
var Shell = function Shell(first, ...rest) {
if (first?.raw === undefined) throw new Error("Please use '$' as a tagged template function: $`cmd arg1 arg2`");
const parsed_shell_script = createParsedShellScript(first.raw, rest);
const parsed_shell_script = createParsedShellScript(unescapeUnicodeRaw(first.raw), rest);
const cwd = Shell[cwdSymbol];
const env = Shell[envSymbol];

View File

@@ -3251,11 +3251,22 @@ pub fn NewLexer(comptime encoding: StringEncoding) type {
self.chars.current = .{ .char = cur_ascii_char };
return;
}
// For Unicode, we need to properly decode the codepoint at new_idx
// rather than using the ASCII char from the substitution digits.
// Set cursor to just before new_idx so that next() advances to new_idx.
self.chars.src.cursor = CodepointIterator.Cursor{
.i = @intCast(new_idx),
.c = cur_ascii_char,
.width = 1,
.c = 0,
.width = 0,
};
// next() computes pos = cursor.i + cursor.width = new_idx + 0 = new_idx,
// then reads the codepoint at new_idx and updates cursor.
if (!self.chars.src.iter.next(&self.chars.src.cursor)) {
// At end of input
self.chars.src.cursor.i = @intCast(self.chars.src.iter.bytes.len + 1);
self.chars.src.cursor.width = 1;
self.chars.src.cursor.c = CodepointIterator.ZeroValue;
}
self.chars.src.next_cursor = self.chars.src.cursor;
SrcUnicode.nextCursor(&self.chars.src.iter, &self.chars.src.next_cursor);
if (prev_ascii_char) |pc| self.chars.prev = .{ .char = pc };
@@ -3602,13 +3613,13 @@ pub fn ShellCharIter(comptime encoding: StringEncoding) type {
return bytes[self.src.i..];
}
if (self.src.iter.i >= bytes.len) return "";
return bytes[self.src.iter.i..];
if (self.src.cursor.i >= bytes.len) return "";
return bytes[self.src.cursor.i..];
}
pub fn cursorPos(self: *@This()) usize {
if (comptime encoding == .ascii) return self.src.i;
return self.src.iter.i;
return self.src.cursor.i;
}
pub fn eat(self: *@This()) ?InputChar {
@@ -4104,12 +4115,6 @@ pub const ShellSrcBuilder = struct {
}
pub fn appendLatin1Impl(this: *ShellSrcBuilder, latin1: []const u8) !void {
const non_ascii_idx = bun.strings.firstNonASCII(latin1) orelse 0;
if (non_ascii_idx > 0) {
try this.appendUTF8Impl(latin1[0..non_ascii_idx]);
}
this.outbuf.* = try bun.strings.allocateLatin1IntoUTF8WithList(this.outbuf.*, this.outbuf.items.len, latin1);
}

View File

@@ -294,10 +294,7 @@ describe("bunshell", () => {
test("escape unicode", async () => {
const { stdout } = await $`echo \\\\`;
// TODO: Uncomment and replace after unicode in template tags is supported
// expect(stdout.toString("utf8")).toEqual(`\弟\気\n`);
// Set this here for now, because unicode in template tags while using .raw is broken, but should be fixed
expect(stdout.toString("utf8")).toEqual("\\u5F1F\\u6C17\n");
expect(stdout.toString("utf8")).toEqual("\\弟\\気\n");
});
/**

View File

@@ -0,0 +1,47 @@
import { $ } from "bun";
import { expect, test } from "bun:test";
// https://github.com/oven-sh/bun/issues/12225
// Regression test for issue #12225: a digit-containing interpolated value and a
// non-ASCII interpolated value in the same command must both be echoed intact.
test("non-ASCII interpolated value with special chars needing escape", async () => {
const rating = "3"; // Contains digit - needs escaping via __bunstr_ ref
const label = "檢視"; // Non-ASCII
const result = await $`echo key=${rating} ${label}`.text();
// trim() strips surrounding whitespace (such as a trailing newline) before comparing.
expect(result.trim()).toBe("key=3 檢視");
});
// Non-ASCII text written directly in the template (no interpolation) must be
// echoed back unchanged.
test("non-ASCII static template text", async () => {
const result = await $`echo 檢視`.text();
// trim() strips surrounding whitespace (such as a trailing newline) before comparing.
expect(result.trim()).toBe("檢視");
});
// A non-ASCII value passed only through interpolation must be echoed back unchanged.
test("non-ASCII interpolated value without special chars", async () => {
const label = "檢視";
const result = await $`echo ${label}`.text();
// trim() strips surrounding whitespace (such as a trailing newline) before comparing.
expect(result.trim()).toBe("檢視");
});
// Two interpolations (one all digits, one non-ASCII) interleaved with static ASCII
// words must come out in their original order with the text intact.
test("mixed ASCII and non-ASCII with multiple interpolations", async () => {
const num = "42";
const text = "日本語";
const result = await $`echo ${num} hello ${text} world`.text();
// trim() strips surrounding whitespace (such as a trailing newline) before comparing.
expect(result.trim()).toBe("42 hello 日本語 world");
});
// A character outside the Basic Multilingual Plane written directly in the template
// must be echoed back unchanged.
test("supplementary plane characters in static template", async () => {
// U+1D573 is outside BMP, uses \u{XXXX} in raw string
const result = await $`echo 𝕳ello`.text();
// trim() strips surrounding whitespace (such as a trailing newline) before comparing.
expect(result.trim()).toBe("𝕳ello");
});
// A doubled backslash in the template source is a literal backslash; the non-ASCII
// character that follows it must be echoed as-is rather than being consumed by, or
// turned into, an escape sequence.
test("backslash-escaped unicode in template preserved", async () => {
  // \\弟 in source means literal backslash + 弟
  const result = await $`echo \\弟\\気`.text();
  // The expected JS string "\\弟\\気" is one backslash before each character.
  expect(result.trim()).toBe("\\弟\\気");
});
// Static template text containing a non-ASCII Latin-1 character (é) must be echoed
// back unchanged.
test("latin-1 characters in static template", async () => {
const result = await $`echo café`.text();
// trim() strips surrounding whitespace (such as a trailing newline) before comparing.
expect(result.trim()).toBe("café");
});