Compare commits

...

1 Commits

Author SHA1 Message Date
Claude Bot
5571f5033e fix(transpiler): preserve unicode characters in string literals (#25169)
Unicode characters in the BMP range (≤ U+FFFF) are now preserved in
transpiled output instead of being escaped to \uXXXX sequences. This
fixes Function.toString() returning escaped unicode, which broke tools
like unplugin-vue-router that rely on reading source code from function
bodies.

Changes:
- js_printer.zig: Use ascii_only=false for string character printing
- VirtualMachine.zig: Detect non-ASCII UTF-8 and use cloneUTF8 instead
  of treating bytes as Latin-1
- ModuleLoader.zig: Use cloneUTF8 for proper UTF-8 handling

Note: Supplementary characters (> U+FFFF like 🎉) are still escaped as
surrogate pairs due to JSC's UTF-16 string storage. This is a known
limitation.

Closes #25169

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-28 10:42:55 +00:00
4 changed files with 125 additions and 16 deletions

View File

@@ -553,7 +553,10 @@ pub fn transpileSourceCode(
.allocator = null,
.source_code = brk: {
const written = printer.ctx.getWritten();
const result = cache.output_code orelse bun.String.cloneLatin1(written);
// Use cloneUTF8 to properly handle UTF-8 multi-byte sequences in string literals.
// cloneLatin1 would misinterpret UTF-8 bytes as Latin-1 codepoints.
// See: https://github.com/oven-sh/bun/issues/25169
const result = cache.output_code orelse bun.String.cloneUTF8(written);
if (written.len > 1024 * 1024 * 2 or jsc_vm.smol) {
printer.ctx.buffer.deinit();

View File

@@ -1495,16 +1495,37 @@ fn refCountedStringWithWasNew(this: *VirtualMachine, new: *bool, input_: []const
else
input_;
// Check if the input is all ASCII. If it contains non-ASCII bytes (UTF-8 multi-byte sequences),
// we need to convert to UTF-16 for proper JSC string storage.
// See: https://github.com/oven-sh/bun/issues/25169
const is_all_ascii = bun.strings.isAllASCII(input);
const ref = this.allocator.create(jsc.RefString) catch unreachable;
ref.* = jsc.RefString{
.allocator = this.allocator,
.ptr = input.ptr,
.len = input.len,
.impl = bun.String.createExternal(*jsc.RefString, input, true, ref, &freeRefString).value.WTFStringImpl,
.hash = hash,
.ctx = this,
.onBeforeDeinit = VirtualMachine.clearRefString,
};
if (is_all_ascii) {
// All ASCII - can use Latin-1 storage directly
ref.* = jsc.RefString{
.allocator = this.allocator,
.ptr = input.ptr,
.len = input.len,
.impl = bun.String.createExternal(*jsc.RefString, input, true, ref, &freeRefString).value.WTFStringImpl,
.hash = hash,
.ctx = this,
.onBeforeDeinit = VirtualMachine.clearRefString,
};
} else {
// Contains non-ASCII UTF-8 - need to convert to UTF-16 for JSC
// Use cloneUTF8 which properly handles UTF-8 to UTF-16 conversion
const str = bun.String.cloneUTF8(input);
ref.* = jsc.RefString{
.allocator = this.allocator,
.ptr = input.ptr,
.len = input.len,
.impl = str.value.WTFStringImpl,
.hash = hash,
.ctx = this,
.onBeforeDeinit = VirtualMachine.clearRefString,
};
}
entry.value_ptr.* = ref;
}
new.* = !entry.found_existing;

View File

@@ -1526,10 +1526,15 @@ fn NewPrinter(
/// Writes `text` to this printer's writer as the contents of a string literal delimited by
/// `quote`, forwarding to `writePreQuotedString` with the `.utf8` encoding tag.
/// `quote` must be `'`, `"` or a backtick; any other value reaches `unreachable`.
pub fn printStringCharactersUTF8(e: *Printer, text: []const u8, quote: u8) void {
const writer = e.writer.stdWriter();
// Use ascii_only=false to preserve unicode characters in string literals.
// This ensures Function.toString() returns strings with actual unicode instead of
// escape sequences like \u203C, which is important for code that uses toString()
// to extract source code (e.g., unplugin-vue-router).
// See: https://github.com/oven-sh/bun/issues/25169
// NOTE(review): this listing is a diff rendered without +/- markers. The three prongs that
// pass `ascii_only` appear to be the removed lines and the three that pass `false` their
// replacements; both sets cannot coexist in a single switch (duplicate prong values).
// NOTE(review): with `false` hard-coded, the printer's `ascii_only` setting no longer
// influences string contents written here — confirm no ASCII-only output path relies on it.
(switch (quote) {
'\'' => writePreQuotedString(text, @TypeOf(writer), writer, '\'', ascii_only, false, .utf8),
'"' => writePreQuotedString(text, @TypeOf(writer), writer, '"', ascii_only, false, .utf8),
'`' => writePreQuotedString(text, @TypeOf(writer), writer, '`', ascii_only, false, .utf8),
'\'' => writePreQuotedString(text, @TypeOf(writer), writer, '\'', false, false, .utf8),
'"' => writePreQuotedString(text, @TypeOf(writer), writer, '"', false, false, .utf8),
'`' => writePreQuotedString(text, @TypeOf(writer), writer, '`', false, false, .utf8),
else => unreachable,
// An empty `switch (err) {}` is only exhaustive when the error set is empty, so this
// handler compiles only if the write cannot actually fail.
}) catch |err| switch (err) {};
}
@@ -1537,10 +1542,12 @@ fn NewPrinter(
const slice = std.mem.sliceAsBytes(text);
const writer = e.writer.stdWriter();
// Use ascii_only=false to preserve unicode characters in string literals.
// See comment in printStringCharactersUTF8 above.
(switch (quote) {
'\'' => writePreQuotedString(slice, @TypeOf(writer), writer, '\'', ascii_only, false, .utf16),
'"' => writePreQuotedString(slice, @TypeOf(writer), writer, '"', ascii_only, false, .utf16),
'`' => writePreQuotedString(slice, @TypeOf(writer), writer, '`', ascii_only, false, .utf16),
'\'' => writePreQuotedString(slice, @TypeOf(writer), writer, '\'', false, false, .utf16),
'"' => writePreQuotedString(slice, @TypeOf(writer), writer, '"', false, false, .utf16),
'`' => writePreQuotedString(slice, @TypeOf(writer), writer, '`', false, false, .utf16),
else => unreachable,
}) catch |err| switch (err) {};
}

View File

@@ -0,0 +1,78 @@
// Test for https://github.com/oven-sh/bun/issues/25169
// Unicode characters in string literals should be preserved in Function.toString()
// instead of being escaped to \uXXXX sequences.
import { expect, test } from "bun:test";
test("Function.toString() preserves unicode in string literals", () => {
  // The literal is U+203C followed by the U+FE0F variation selector.
  function withEmoji() {
    return "‼️";
  }

  const printed = withEmoji.toString();

  // Neither code point may show up as a \uXXXX escape in the printed source.
  for (const escaped of ["\\u203C", "\\uFE0F"]) {
    expect(printed).not.toContain(escaped);
  }
  // The quoted literal itself must appear verbatim.
  expect(printed).toContain('"‼️"');
});
test("Function.toString() preserves unicode in template literals", () => {
  // Same characters as the string-literal case, but inside a template literal.
  function withTemplate() {
    return `Hello ‼️ World`;
  }

  const printed = withTemplate.toString();

  expect(printed).not.toContain("\\u203C");
  expect(printed).toContain("‼️");
});
test("Function.toString() preserves unicode in String.raw template", () => {
  // Mirrors the real-world shape reported for unplugin-vue-router:
  // a String.raw template with a unicode string literal interpolated into it.
  function getExportedTypeDeclarationsForRouteMap() {
    return String.raw`// Generated by test. ${"‼️"} DO NOT MODIFY`;
  }

  const printed = getExportedTypeDeclarationsForRouteMap.toString();

  expect(printed).not.toContain("\\u203C");
  // The interpolated literal must keep its characters unescaped.
  expect(printed).toContain('"‼️"');
});
test("Unicode string values are correct at runtime", () => {
  const emoji = "‼️";

  expect(emoji).toBe("‼️");
  // U+203C and U+FE0F are both BMP code points, so the string has length 2.
  expect(emoji.length).toBe(2);

  // Verify each code point individually, by index.
  const expectedCodePoints = [0x203c, 0xfe0f];
  expectedCodePoints.forEach((codePoint, index) => {
    expect(emoji.codePointAt(index)).toBe(codePoint);
  });
});
test("BMP unicode characters are preserved in strings", () => {
  // Every character below is in the Basic Multilingual Plane (code point <= 0xFFFF),
  // so each should survive Function.toString() unescaped.
  function bmpUnicode() {
    return "日本語 中文 한국어";
  }

  const printed = bmpUnicode.toString();

  for (const word of ["日本語", "中文", "한국어"]) {
    expect(printed).toContain(word);
  }
});
test("Supplementary characters (> U+FFFF) are escaped as surrogate pairs", () => {
  // 🎉 is U+1F389, outside the BMP, so UTF-16 needs a surrogate pair for it.
  // The printed source currently shows that pair as \uXXXX escapes; this test
  // pins the known limitation rather than the desired end state.
  function supplementaryChar() {
    return "🎉";
  }

  // The runtime value is unaffected by how the source is printed.
  expect(supplementaryChar()).toBe("🎉");

  // The source representation carries the escaped surrogate pair.
  const printed = supplementaryChar.toString();
  expect(printed).toContain("\\uD83C\\uDF89");
});