From ebc0cfeacde1fbad00ceb89c76f28dd0b01ffe70 Mon Sep 17 00:00:00 2001
From: robobun
Date: Mon, 20 Oct 2025 14:19:22 -0700
Subject: [PATCH] fix(yaml): double-quoted strings with '...' incorrectly trigger document end error (#23491)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What does this PR do?

Fixes #23489

The YAML parser was incorrectly treating `...` inside double-quoted strings as document end markers, causing parse errors for strings containing ellipsis, particularly affecting internationalized text.

### Example of the bug:

```yaml
balance: "👛 لا تمتلك محفظة... !"
```

This would fail with: `error: Unexpected document end`

### Root cause:

The bug was introduced in commit fcbd57ac48 which attempted to optimize document marker detection by using `self.line_indent == .none` instead of tracking newlines with a local flag. However, this check was incomplete - it didn't track whether we had just processed a newline character.

### The fix:

Restored the `nl` (newline) flag pattern from the single-quoted scanner and combined it with the `line_indent` check. Document markers `...` and `---` are now only recognized when **all** of these conditions are met:

1. We're after a newline (`nl == true`)
2. We're at column 0 (`self.line_indent == .none`)
3. Followed by whitespace or EOF

This allows `...` to appear freely in double-quoted strings while still correctly recognizing actual document end markers at the start of lines.

### How did you verify your code works?

1. Reproduced the original issue from #23489
2. Applied the fix and verified all test cases pass:
   - Original Arabic text with emoji: `"👛 لا تمتلك محفظة... !"`
   - Various `...` positions: start, middle, end
   - Both single and double quotes
   - Multiline strings with indented `...` (issue #22392)
3. Created regression test in `test/regression/issue/23489.test.ts`
4. 
Verified existing YAML tests still pass (514 pass, up from 513) cc @dylan-conway for review --------- Co-authored-by: Claude Bot Co-authored-by: Claude Co-authored-by: Jarred Sumner Co-authored-by: Dylan Conway --- src/interchange/yaml.zig | 17 +++++++-- test/js/bun/yaml/yaml.test.ts | 52 ++++++++++++++++++++++++++ test/regression/issue/23489.test.ts | 58 +++++++++++++++++++++++++++++ 3 files changed, 123 insertions(+), 4 deletions(-) create mode 100644 test/regression/issue/23489.test.ts diff --git a/src/interchange/yaml.zig b/src/interchange/yaml.zig index 28f7de06fa..0d887ee7b2 100644 --- a/src/interchange/yaml.zig +++ b/src/interchange/yaml.zig @@ -3117,7 +3117,7 @@ pub fn Parser(comptime enc: Encoding) type { 0 => return error.UnexpectedCharacter, '.' => { - if (nl and self.remainStartsWith("...") and self.isSWhiteOrBCharAt(3)) { + if (nl and self.line_indent == .none and self.remainStartsWith("...") and self.isSWhiteOrBCharAt(3)) { return error.UnexpectedDocumentEnd; } nl = false; @@ -3127,7 +3127,7 @@ pub fn Parser(comptime enc: Encoding) type { }, '-' => { - if (nl and self.remainStartsWith("---") and self.isSWhiteOrBCharAt(3)) { + if (nl and self.line_indent == .none and self.remainStartsWith("---") and self.isSWhiteOrBCharAt(3)) { return error.UnexpectedDocumentStart; } nl = false; @@ -3212,22 +3212,26 @@ pub fn Parser(comptime enc: Encoding) type { const scalar_indent = self.line_indent; var text: std.ArrayList(enc.unit()) = .init(self.allocator); + var nl = false; + next: switch (self.next()) { 0 => return error.UnexpectedCharacter, '.' 
=> { - if (self.line_indent == .none and self.remainStartsWith("...") and self.isSWhiteOrBCharAt(3)) { + if (nl and self.line_indent == .none and self.remainStartsWith("...") and self.isSWhiteOrBCharAt(3)) { return error.UnexpectedDocumentEnd; } + nl = false; try text.append('.'); self.inc(1); continue :next self.next(); }, '-' => { - if (self.line_indent == .none and self.remainStartsWith("---") and self.isSWhiteOrBCharAt(3)) { + if (nl and self.line_indent == .none and self.remainStartsWith("---") and self.isSWhiteOrBCharAt(3)) { return error.UnexpectedDocumentStart; } + nl = false; try text.append('-'); self.inc(1); continue :next self.next(); @@ -3248,12 +3252,14 @@ pub fn Parser(comptime enc: Encoding) type { return error.UnexpectedCharacter; } } + nl = true; continue :next self.next(); }, ' ', '\t', => { + nl = false; const off = self.pos; self.inc(1); self.skipSWhite(); @@ -3264,6 +3270,7 @@ pub fn Parser(comptime enc: Encoding) type { }, '"' => { + nl = false; self.inc(1); return .scalar(.{ .start = start, @@ -3280,6 +3287,7 @@ pub fn Parser(comptime enc: Encoding) type { }, '\\' => { + nl = false; self.inc(1); switch (self.next()) { '\r', @@ -3350,6 +3358,7 @@ pub fn Parser(comptime enc: Encoding) type { }, else => |c| { + nl = false; try text.append(c); self.inc(1); continue :next self.next(); diff --git a/test/js/bun/yaml/yaml.test.ts b/test/js/bun/yaml/yaml.test.ts index b121f5304d..2b451c9909 100644 --- a/test/js/bun/yaml/yaml.test.ts +++ b/test/js/bun/yaml/yaml.test.ts @@ -494,6 +494,58 @@ document: 2 expect(YAML.parse(yaml)).toEqual([{ document: 1 }, { document: 2 }]); }); + test("document markers in quoted strings", () => { + const inputs = [ + { expected: "hi ... hello", input: '"hi ... hello"' }, + { expected: "hi ... hello", input: "'hi ... hello'" }, + { expected: { foo: "hi ... hello" }, input: 'foo: "hi ... hello"' }, + { expected: { foo: "hi ... hello" }, input: "foo: 'hi ... hello'" }, + { + expected: "hi ... hello", + input: `"hi + ... 
+ hello"`, + }, + { + expected: "hi ... hello", + input: `'hi + ... + hello'`, + }, + { + expected: { foo: "hi ... hello" }, + input: `foo: "hi + ... + hello"`, + }, + { + expected: { foo: "hi ... hello" }, + input: `foo: 'hi + ... + hello'`, + }, + { + expected: { foo: { bar: "hi ... hello" } }, + input: `foo: + bar: "hi + ... + hello"`, + }, + { + expected: { foo: { bar: "hi ... hello" } }, + input: `foo: + bar: 'hi + ... + hello'`, + }, + ]; + + for (const { input, expected } of inputs) { + expect(YAML.parse(input)).toEqual(expected); + expect(YAML.parse(YAML.stringify(YAML.parse(input)))).toEqual(expected); + } + }); + test("handles multiline strings", () => { const yaml = ` literal: | diff --git a/test/regression/issue/23489.test.ts b/test/regression/issue/23489.test.ts new file mode 100644 index 0000000000..1a7ef60120 --- /dev/null +++ b/test/regression/issue/23489.test.ts @@ -0,0 +1,58 @@ +import { YAML } from "bun"; +import { expect, test } from "bun:test"; +import { bunEnv, bunExe, tempDir } from "harness"; + +test("YAML double-quoted strings with ... should not trigger document end error - issue #23489", () => { + // Test the original failing case with Arabic text and emoji + const yaml1 = 'balance_dont_have_wallet: "👛 لا تمتلك محفظة... !"'; + const result1 = YAML.parse(yaml1); + expect(result1).toEqual({ + balance_dont_have_wallet: "👛 لا تمتلك محفظة... !", + }); + + // Test various patterns of ... in double-quoted strings + const yaml2 = `test1: "this has ... dots" +test2: "... at start" +test3: "at end ..." +test4: "👛 ... with emoji"`; + const result2 = YAML.parse(yaml2); + expect(result2).toEqual({ + test1: "this has ... dots", + test2: "... at start", + test3: "at end ...", + test4: "👛 ... with emoji", + }); + + // Test that both single and double quotes work + const yaml3 = `single: 'this has ... dots' +double: "this has ... dots"`; + const result3 = YAML.parse(yaml3); + expect(result3).toEqual({ + single: "this has ... dots", + double: "this has ... 
dots", + }); +}); + +test("YAML import with double-quoted strings containing ... - issue #23489", async () => { + using dir = tempDir("yaml-ellipsis", { + "test.yml": 'balance: "👛 لا تمتلك محفظة... !"', + "test.ts": ` + import yaml from "./test.yml"; + console.log(JSON.stringify(yaml)); + `, + }); + + await using proc = Bun.spawn({ + cmd: [bunExe(), "test.ts"], + env: bunEnv, + cwd: String(dir), + stdout: "pipe", + stderr: "pipe", + }); + + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + + expect(stderr).not.toContain("Unexpected document end"); + expect(exitCode).toBe(0); + expect(stdout.trim()).toBe('{"balance":"👛 لا تمتلك محفظة... !"}'); +});