Compare commits

...

2 Commits

Author SHA1 Message Date
Claude Bot
6757c4d176 refactor(test): use -e flag instead of tempDir for simplicity
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-07 05:59:09 +00:00
Claude Bot
415fa68903 fix(printer): avoid double backslash in regex with escaped non-ASCII
When the regex printer encounters a non-ASCII character that was
preceded by a backslash escape, it was incorrectly adding another
backslash before the unicode escape sequence, resulting in `\\uXXXX`
instead of `\uXXXX`.

For example, `/[\⁄]/` (backslash + U+2044 fraction slash) was being
printed as `/[\\u2044]/`, which changes the regex semantics from
matching the fraction slash to matching `\`, `u`, `2`, `0`, `4`, `4`
as separate characters.

This fix tracks whether the previous character was a backslash and
omits the leading backslash when generating unicode escape sequences
for escaped non-ASCII characters.

Closes #26785

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-07 05:41:31 +00:00
2 changed files with 162 additions and 22 deletions

View File

@@ -3229,6 +3229,10 @@ fn NewPrinter(
// Translate any non-ASCII to unicode escape sequences
var ascii_start: usize = 0;
var is_ascii = false;
// Track if the previous character was a backslash to avoid doubling
// backslashes when converting escaped non-ASCII chars to \uXXXX.
// Example: /[\⁄]/ (backslash + U+2044) should become /[\u2044]/ not /[\\u2044]/
var prev_was_backslash = false;
var iter = CodepointIterator.init(e.value);
var cursor = CodepointIterator.Cursor{};
while (iter.next(&cursor)) {
@@ -3238,6 +3242,7 @@ fn NewPrinter(
ascii_start = cursor.i;
is_ascii = true;
}
prev_was_backslash = (cursor.c == '\\' and !prev_was_backslash);
},
else => {
if (is_ascii) {
@@ -3247,14 +3252,26 @@ fn NewPrinter(
switch (cursor.c) {
0...0xFFFF => {
p.print([_]u8{
'\\',
'u',
hex_chars[cursor.c >> 12],
hex_chars[(cursor.c >> 8) & 15],
hex_chars[(cursor.c >> 4) & 15],
hex_chars[cursor.c & 15],
});
// If the previous char was a backslash, it's escaping
// this non-ASCII char, so we only need uXXXX (not \uXXXX).
if (prev_was_backslash) {
p.print(&[_]u8{
'u',
hex_chars[cursor.c >> 12],
hex_chars[(cursor.c >> 8) & 15],
hex_chars[(cursor.c >> 4) & 15],
hex_chars[cursor.c & 15],
});
} else {
p.print(&[_]u8{
'\\',
'u',
hex_chars[cursor.c >> 12],
hex_chars[(cursor.c >> 8) & 15],
hex_chars[(cursor.c >> 4) & 15],
hex_chars[cursor.c & 15],
});
}
},
else => |c| {
@@ -3262,22 +3279,41 @@ fn NewPrinter(
const lo = @as(usize, @intCast(first_high_surrogate + ((k >> 10) & 0x3FF)));
const hi = @as(usize, @intCast(first_low_surrogate + (k & 0x3FF)));
p.print(&[_]u8{
'\\',
'u',
hex_chars[lo >> 12],
hex_chars[(lo >> 8) & 15],
hex_chars[(lo >> 4) & 15],
hex_chars[lo & 15],
'\\',
'u',
hex_chars[hi >> 12],
hex_chars[(hi >> 8) & 15],
hex_chars[(hi >> 4) & 15],
hex_chars[hi & 15],
});
// If the previous char was a backslash, it's escaping
// this non-ASCII char, so we only need uXXXX (not \uXXXX).
if (prev_was_backslash) {
p.print(&[_]u8{
'u',
hex_chars[lo >> 12],
hex_chars[(lo >> 8) & 15],
hex_chars[(lo >> 4) & 15],
hex_chars[lo & 15],
'\\',
'u',
hex_chars[hi >> 12],
hex_chars[(hi >> 8) & 15],
hex_chars[(hi >> 4) & 15],
hex_chars[hi & 15],
});
} else {
p.print(&[_]u8{
'\\',
'u',
hex_chars[lo >> 12],
hex_chars[(lo >> 8) & 15],
hex_chars[(lo >> 4) & 15],
hex_chars[lo & 15],
'\\',
'u',
hex_chars[hi >> 12],
hex_chars[(hi >> 8) & 15],
hex_chars[(hi >> 4) & 15],
hex_chars[hi & 15],
});
}
},
}
prev_was_backslash = false;
},
}
}

View File

@@ -0,0 +1,104 @@
import { expect, test } from "bun:test";
import { bunEnv, bunExe } from "harness";
// https://github.com/oven-sh/bun/issues/26785
// Bun's regex printer was incorrectly handling backslash-escaped non-ASCII
// characters in regex literals. When a non-ASCII character is preceded by a
// backslash, the printer converts the character to `\uXXXX` format but was
// adding another backslash, resulting in `\\uXXXX` which breaks regex semantics.
// Regression test: a backslash-escaped non-ASCII character inside a regex
// literal must be printed as `\uXXXX` (one backslash), not `\\uXXXX`.
test("regex with backslash-escaped non-ASCII character matches correctly", async () => {
  await using proc = Bun.spawn({
    cmd: [
      bunExe(),
      "-e",
      // Inside this template literal `\\` is a single backslash in the child
      // script, so the regex the child sees is /[\⁄]/. The fraction slash must
      // stay a literal U+2044 character here: the printer only rewrites
      // non-ASCII source characters, which is the code path under test.
      `
        const R = /[\\⁄]/; // backslash + U+2044 (fraction slash)
        const testString = '³⁄₅₂ cup of stuff';
        const match = testString.match(R);
        // Should match the fraction slash character, not the letter 'u'
        console.log(JSON.stringify({
          source: R.source,
          match: match ? match[0] : null,
          index: match ? match.index : null
        }));
      `,
    ],
    env: bunEnv,
    stdout: "pipe",
    // Let the child's diagnostics surface directly in the test output.
    stderr: "inherit",
  });

  const [stdout, exitCode] = await Promise.all([proc.stdout.text(), proc.exited]);
  const result = JSON.parse(stdout.trim());

  // The source should have \u2044 (one backslash), not \\u2044 (two backslashes)
  expect(result.source).toBe("[\\u2044]");
  // Should match the fraction slash (U+2044) at index 1, not 'u' from 'cup'
  expect(result.match).toBe("\u2044");
  expect(result.index).toBe(1);
  // Checked last so that output mismatches are reported before a bare exit code.
  expect(exitCode).toBe(0);
});
// Regression test using the full fraction-matching regex from the original
// issue report; the separator class contains both `\/` and an escaped U+2044.
test("complex regex with backslash-escaped non-ASCII matches fractions", async () => {
  await using proc = Bun.spawn({
    cmd: [
      bunExe(),
      "-e",
      // `\\/` and `\\⁄` become `\/` and `\⁄` in the child script. The fraction
      // slash must remain a literal U+2044 character so the printer's
      // non-ASCII escaping path is exercised.
      `
        // Original regex from the issue
        const R = /[½⅓⅔¼¾⅕⅖⅗⅘⅙⅚⅐⅛⅜⅝⅞⅑⅒]|([⁰¹²³⁴⁵⁶⁷⁸⁹]+|[₀₁₂₃₄₅₆₇₈₉]+|[0-9]+)([\\/\\⁄])([⁰¹²³⁴⁵⁶⁷⁸⁹]+|[₀₁₂₃₄₅₆₇₈₉]+|[0-9]+)/;
        const m = '³⁄₅₂ cup of stuff'.match(R);
        console.log(JSON.stringify(m));
      `,
    ],
    env: bunEnv,
    stdout: "pipe",
    // Let the child's diagnostics surface directly in the test output.
    stderr: "inherit",
  });

  const [stdout, exitCode] = await Promise.all([proc.stdout.text(), proc.exited]);
  const result = JSON.parse(stdout.trim());

  // Should match the fraction "³⁄₅₂": numerator, separator (U+2044), denominator
  expect(result).not.toBeNull();
  expect(result[0]).toBe("³\u2044₅₂");
  expect(result[1]).toBe("³");
  expect(result[2]).toBe("\u2044");
  expect(result[3]).toBe("₅₂");
  // Checked last so that output mismatches are reported before a bare exit code.
  expect(exitCode).toBe(0);
});
// Control case: an unescaped non-ASCII character in a regex literal must keep
// working, i.e. the printer still emits a full `\uXXXX` escape for it.
test("regex with non-ASCII character without preceding backslash works", async () => {
  await using proc = Bun.spawn({
    cmd: [
      bunExe(),
      "-e",
      // The class holds a literal U+2044 (fraction slash) with no backslash
      // before it; it must stay a literal character to hit the printer's
      // non-ASCII escaping path.
      `
        // Non-ASCII character without a preceding backslash should still work
        const R = /[⁄]/;
        const testString = '³⁄₅₂ cup of stuff';
        const match = testString.match(R);
        console.log(JSON.stringify({
          match: match ? match[0] : null,
          index: match ? match.index : null
        }));
      `,
    ],
    env: bunEnv,
    stdout: "pipe",
    // Let the child's diagnostics surface directly in the test output.
    stderr: "inherit",
  });

  const [stdout, exitCode] = await Promise.all([proc.stdout.text(), proc.exited]);
  const result = JSON.parse(stdout.trim());

  // Should still match the fraction slash (U+2044) at index 1
  expect(result.match).toBe("\u2044");
  expect(result.index).toBe(1);
  // Checked last so that output mismatches are reported before a bare exit code.
  expect(exitCode).toBe(0);
});