fix(transpiler): preserve unicode characters in string literals (#25169 )

Unicode characters in BMP range (≤ U+FFFF) are now preserved in transpiled output instead of being escaped to \uXXXX sequences. This fixes Function.toString() returning escaped unicode, which broke tools like unplugin-vue-router that rely on reading source code from function bodies. Changes: - js_printer.zig: Use ascii_only=false for string character printing - VirtualMachine.zig: Detect non-ASCII UTF-8 and use cloneUTF8 instead of treating bytes as Latin-1 - ModuleLoader.zig: Use cloneUTF8 for proper UTF-8 handling Note: Supplementary characters (> U+FFFF like 🎉) are still escaped as surrogate pairs due to JSC's UTF-16 string storage. This is a known limitation. Closes #25169 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2026-02-16 05:42:43 +00:00 · 2025-11-28 10:42:55 +00:00
4 changed files with 125 additions and 16 deletions
--- a/src/bun.js/ModuleLoader.zig
+++ b/src/bun.js/ModuleLoader.zig
@@ -553,7 +553,10 @@ pub fn transpileSourceCode(
                .allocator = null,
                .source_code = brk: {
                    const written = printer.ctx.getWritten();
-                    const result = cache.output_code orelse bun.String.cloneLatin1(written);
+                    // Use cloneUTF8 to properly handle UTF-8 multi-byte sequences in string literals.
+                    // cloneLatin1 would misinterpret UTF-8 bytes as Latin-1 codepoints.
+                    // See: https://github.com/oven-sh/bun/issues/25169
+                    const result = cache.output_code orelse bun.String.cloneUTF8(written);

                    if (written.len > 1024 * 1024 * 2 or jsc_vm.smol) {
                        printer.ctx.buffer.deinit();
--- a/src/bun.js/VirtualMachine.zig
+++ b/src/bun.js/VirtualMachine.zig
@@ -1495,16 +1495,37 @@ fn refCountedStringWithWasNew(this: *VirtualMachine, new: *bool, input_: []const
        else
            input_;

+        // Check if the input is all ASCII. If it contains non-ASCII bytes (UTF-8 multi-byte sequences),
+        // we need to convert to UTF-16 for proper JSC string storage.
+        // See: https://github.com/oven-sh/bun/issues/25169
+        const is_all_ascii = bun.strings.isAllASCII(input);
+
        const ref = this.allocator.create(jsc.RefString) catch unreachable;
-        ref.* = jsc.RefString{
-            .allocator = this.allocator,
-            .ptr = input.ptr,
-            .len = input.len,
-            .impl = bun.String.createExternal(*jsc.RefString, input, true, ref, &freeRefString).value.WTFStringImpl,
-            .hash = hash,
-            .ctx = this,
-            .onBeforeDeinit = VirtualMachine.clearRefString,
-        };
+        if (is_all_ascii) {
+            // All ASCII - can use Latin-1 storage directly
+            ref.* = jsc.RefString{
+                .allocator = this.allocator,
+                .ptr = input.ptr,
+                .len = input.len,
+                .impl = bun.String.createExternal(*jsc.RefString, input, true, ref, &freeRefString).value.WTFStringImpl,
+                .hash = hash,
+                .ctx = this,
+                .onBeforeDeinit = VirtualMachine.clearRefString,
+            };
+        } else {
+            // Contains non-ASCII UTF-8 - need to convert to UTF-16 for JSC
+            // Use cloneUTF8 which properly handles UTF-8 to UTF-16 conversion
+            const str = bun.String.cloneUTF8(input);
+            ref.* = jsc.RefString{
+                .allocator = this.allocator,
+                .ptr = input.ptr,
+                .len = input.len,
+                .impl = str.value.WTFStringImpl,
+                .hash = hash,
+                .ctx = this,
+                .onBeforeDeinit = VirtualMachine.clearRefString,
+            };
+        }
        entry.value_ptr.* = ref;
    }
    new.* = !entry.found_existing;
--- a/src/js_printer.zig
+++ b/src/js_printer.zig
@@ -1526,10 +1526,15 @@ fn NewPrinter(

        pub fn printStringCharactersUTF8(e: *Printer, text: []const u8, quote: u8) void {
            const writer = e.writer.stdWriter();
+            // Use ascii_only=false to preserve unicode characters in string literals.
+            // This ensures Function.toString() returns strings with actual unicode instead of
+            // escape sequences like \u203C, which is important for code that uses toString()
+            // to extract source code (e.g., unplugin-vue-router).
+            // See: https://github.com/oven-sh/bun/issues/25169
            (switch (quote) {
-                '\'' => writePreQuotedString(text, @TypeOf(writer), writer, '\'', ascii_only, false, .utf8),
-                '"' => writePreQuotedString(text, @TypeOf(writer), writer, '"', ascii_only, false, .utf8),
-                '`' => writePreQuotedString(text, @TypeOf(writer), writer, '`', ascii_only, false, .utf8),
+                '\'' => writePreQuotedString(text, @TypeOf(writer), writer, '\'', false, false, .utf8),
+                '"' => writePreQuotedString(text, @TypeOf(writer), writer, '"', false, false, .utf8),
+                '`' => writePreQuotedString(text, @TypeOf(writer), writer, '`', false, false, .utf8),
                else => unreachable,
            }) catch |err| switch (err) {};
        }
@@ -1537,10 +1542,12 @@ fn NewPrinter(
            const slice = std.mem.sliceAsBytes(text);

            const writer = e.writer.stdWriter();
+            // Use ascii_only=false to preserve unicode characters in string literals.
+            // See comment in printStringCharactersUTF8 above.
            (switch (quote) {
-                '\'' => writePreQuotedString(slice, @TypeOf(writer), writer, '\'', ascii_only, false, .utf16),
-                '"' => writePreQuotedString(slice, @TypeOf(writer), writer, '"', ascii_only, false, .utf16),
-                '`' => writePreQuotedString(slice, @TypeOf(writer), writer, '`', ascii_only, false, .utf16),
+                '\'' => writePreQuotedString(slice, @TypeOf(writer), writer, '\'', false, false, .utf16),
+                '"' => writePreQuotedString(slice, @TypeOf(writer), writer, '"', false, false, .utf16),
+                '`' => writePreQuotedString(slice, @TypeOf(writer), writer, '`', false, false, .utf16),
                else => unreachable,
            }) catch |err| switch (err) {};
        }
--- a/test/regression/issue/25169.test.ts
+++ b/test/regression/issue/25169.test.ts
@@ -0,0 +1,78 @@
+// Test for https://github.com/oven-sh/bun/issues/25169
+// Unicode characters in string literals should be preserved in Function.toString()
+// instead of being escaped to \uXXXX sequences.
+
+import { expect, test } from "bun:test";
+
+test("Function.toString() preserves unicode in string literals", () => {
+  // Simple string literal with emoji
+  function withEmoji() {
+    return "‼️";
+  }
+
+  const source = withEmoji.toString();
+  expect(source).toContain('"‼️"');
+  expect(source).not.toContain("\\u203C");
+  expect(source).not.toContain("\\uFE0F");
+});
+
+test("Function.toString() preserves unicode in template literals", () => {
+  // Template literal with unicode
+  function withTemplate() {
+    return `Hello ‼️ World`;
+  }
+
+  const source = withTemplate.toString();
+  expect(source).toContain("‼️");
+  expect(source).not.toContain("\\u203C");
+});
+
+test("Function.toString() preserves unicode in String.raw template", () => {
+  // This is the actual use case from unplugin-vue-router
+  function getExportedTypeDeclarationsForRouteMap() {
+    return String.raw`// Generated by test. ${"‼️"} DO NOT MODIFY`;
+  }
+
+  const source = getExportedTypeDeclarationsForRouteMap.toString();
+  // The interpolated string should preserve the emoji
+  expect(source).toContain('"‼️"');
+  expect(source).not.toContain("\\u203C");
+});
+
+test("Unicode string values are correct at runtime", () => {
+  const emoji = "‼️";
+  expect(emoji).toBe("‼️");
+  expect(emoji.length).toBe(2); // ‼️ is two code points: U+203C and U+FE0F
+
+  // Check the actual codepoints
+  expect(emoji.codePointAt(0)).toBe(0x203c);
+  expect(emoji.codePointAt(1)).toBe(0xfe0f);
+});
+
+test("BMP unicode characters are preserved in strings", () => {
+  // Test BMP (Basic Multilingual Plane) characters - codepoints <= 0xFFFF
+  // These should be preserved in Function.toString()
+  function bmpUnicode() {
+    return "日本語 中文 한국어";
+  }
+
+  const source = bmpUnicode.toString();
+  expect(source).toContain("日本語");
+  expect(source).toContain("中文");
+  expect(source).toContain("한국어");
+});
+
+test("Supplementary characters (> U+FFFF) are escaped as surrogate pairs", () => {
+  // Supplementary characters like emoji 🎉 (U+1F389) require surrogate pairs in UTF-16
+  // These are currently escaped to \uXXXX format due to how JSC stores strings
+  // This is a known limitation that may be addressed in the future
+  function supplementaryChar() {
+    return "🎉";
+  }
+
+  const source = supplementaryChar.toString();
+  // The emoji value should still work correctly at runtime
+  expect(supplementaryChar()).toBe("🎉");
+  // But in the source representation, it's escaped as surrogate pair
+  expect(source).toContain("\\uD83C\\uDF89");
+});