mirror of
https://github.com/oven-sh/bun
synced 2026-02-09 10:28:47 +00:00
## Summary
- Port md4c (CommonMark-compliant markdown parser) from C to Zig under
`src/md/`
- Three output modes:
  - `Bun.markdown.html(input, options?)` — render to HTML string
  - `Bun.markdown.render(input, callbacks?)` — render with custom callbacks for each element
  - `Bun.markdown.react(input, options?)` — render to a React Fragment element, directly usable as a component return value
- React element creation uses a cached JSC Structure with
`putDirectOffset` for fast allocation
- Component overrides in `react()`: pass tag names as options keys to
replace default HTML elements with custom components
- GFM extensions: tables, strikethrough, task lists, permissive
autolinks, disallowed raw HTML tag filter
- Wire up `.md` as a bundler loader (via explicit `{ type: "md" }`)
## JavaScript API
### `Bun.markdown.html(input, options?)`
Renders markdown to an HTML string:
```js
const html = Bun.markdown.html("# Hello **world**");
// "<h1>Hello <strong>world</strong></h1>\n"
Bun.markdown.html("## Hello", { headingIds: true });
// '<h2 id="hello">Hello</h2>\n'
```
### `Bun.markdown.render(input, callbacks?)`
Renders markdown with custom JavaScript callbacks for each element. Each
callback receives children as a string and optional metadata, and
returns a string:
```js
// Custom HTML with classes
const html = Bun.markdown.render("# Title\n\nHello **world**", {
heading: (children, { level }) => `<h${level} class="title">${children}</h${level}>`,
paragraph: (children) => `<p>${children}</p>`,
strong: (children) => `<b>${children}</b>`,
});
// ANSI terminal output
const ansi = Bun.markdown.render("# Hello\n\n**bold**", {
heading: (children) => `\x1b[1;4m${children}\x1b[0m\n`,
paragraph: (children) => children + "\n",
strong: (children) => `\x1b[1m${children}\x1b[22m`,
});
// Strip all formatting
const text = Bun.markdown.render("# Hello **world**", {
heading: (children) => children,
paragraph: (children) => children,
strong: (children) => children,
});
// "Hello world"
// Return null to omit elements
const result = Bun.markdown.render("# Title\n\n![logo](img.png)\n\nHello", {
image: () => null,
heading: (children) => children,
paragraph: (children) => children + "\n",
});
// "Title\nHello\n"
```
Parser options can be included alongside callbacks:
```js
Bun.markdown.render("Visit www.example.com", {
link: (children, { href }) => `[${children}](${href})`,
paragraph: (children) => children,
permissiveAutolinks: true,
});
```
### `Bun.markdown.react(input, options?)`
Returns a React Fragment element — use it directly as a component return
value:
```tsx
// Use as a component
function Markdown({ text }: { text: string }) {
return Bun.markdown.react(text);
}
// With custom components
function Heading({ children }: { children: React.ReactNode }) {
return <h1 className="title">{children}</h1>;
}
const element = Bun.markdown.react("# Hello", { h1: Heading });
// Server-side rendering
import { renderToString } from "react-dom/server";
const html = renderToString(Bun.markdown.react("# Hello **world**"));
// "<h1>Hello <strong>world</strong></h1>"
```
#### React 18 and older
By default, `react()` uses `Symbol.for('react.transitional.element')` as
the `$$typeof` symbol, which is what React 19 expects. For React 18 and
older, pass `reactVersion: 18`:
```tsx
const el = Bun.markdown.react("# Hello", { reactVersion: 18 });
```
### Component Overrides
Tag names can be overridden in `react()`:
```tsx
Bun.markdown.react(input, {
h1: MyHeading, // block elements
p: CustomParagraph,
a: CustomLink, // inline elements
img: CustomImage,
pre: CodeBlock,
// ... h1-h6, p, blockquote, ul, ol, li, pre, hr, html,
// table, thead, tbody, tr, th, td,
// em, strong, a, img, code, del, math, u, br
});
```
Boolean values are ignored (not treated as overrides), so parser options
like `{ strikethrough: true }` don't conflict with component overrides.
### Options
```js
Bun.markdown.html(input, {
tables: true, // GFM tables (default: true)
strikethrough: true, // ~~deleted~~ (default: true)
tasklists: true, // - [x] items (default: true)
headingIds: true, // Generate id attributes on headings
autolinkHeadings: true, // Wrap heading content in <a> tags
tagFilter: false, // GFM disallowed HTML tags
wikiLinks: false, // [[wiki]] links
latexMath: false, // $inline$ and $$display$$
underline: false, // __underline__ (instead of <strong>)
// ... and more
});
```
## Architecture
### Parser (`src/md/`)
The parser is split into focused modules using Zig's delegation pattern:
| Module | Purpose |
|--------|---------|
| `parser.zig` | Core `Parser` struct, state, and re-exported method delegation |
| `blocks.zig` | Block-level parsing: document processing, line analysis, block start/end |
| `containers.zig` | Container management: blockquotes, lists, list items |
| `inlines.zig` | Inline parsing: emphasis, code spans, HTML tags, entities |
| `links.zig` | Link/image resolution, reference links, autolink rendering |
| `autolinks.zig` | Permissive autolink detection (www, url, email) |
| `line_analysis.zig` | Line classification: headings, fences, HTML blocks, tables |
| `ref_defs.zig` | Reference definition parsing and lookup |
| `render_blocks.zig` | Block rendering dispatch (code, HTML, table blocks) |
| `html_renderer.zig` | HTML renderer implementing `Renderer` VTable |
| `types.zig` | Shared types: `Renderer` VTable, `BlockType`, `SpanType`, `TextType`, etc. |
### Renderer Abstraction
Parsing is decoupled from output via a `Renderer` VTable interface:
```zig
pub const Renderer = struct {
ptr: *anyopaque,
vtable: *const VTable,
pub const VTable = struct {
enterBlock: *const fn (...) void,
leaveBlock: *const fn (...) void,
enterSpan: *const fn (...) void,
leaveSpan: *const fn (...) void,
text: *const fn (...) void,
};
};
```
Four renderers are implemented:
- **`HtmlRenderer`** (`src/md/html_renderer.zig`) — produces HTML string
output
- **`JsCallbackRenderer`** (`src/bun.js/api/MarkdownObject.zig`) — calls
JS callbacks for each element, accumulates string output
- **`ParseRenderer`** (`src/bun.js/api/MarkdownObject.zig`) — builds
React element AST with `MarkedArgumentBuffer` for GC safety
- **`JSReactElement`** (`src/bun.js/bindings/JSReactElement.cpp`) — C++
fast path for React element creation using cached JSC Structure +
`putDirectOffset`
## Test plan
- [x] 792 spec tests pass (CommonMark, GFM tables, strikethrough,
tasklists, permissive autolinks, GFM tag filter, wiki links, coverage,
regressions)
- [x] 114 API tests pass (`html()`, `render()`, `react()`,
`renderToString` integration, component overrides)
- [x] 58 GFM compatibility tests pass
```
bun bd test test/js/bun/md/md-spec.test.ts # 792 pass
bun bd test test/js/bun/md/md-render-api.test.ts # 114 pass
bun bd test test/js/bun/md/gfm-compat.test.ts # 58 pass
```
🤖 Generated with [Claude Code](https://claude.com/claude-code)
---------
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Dylan Conway <dylan.conway567@gmail.com>
Co-authored-by: SUZUKI Sosuke <sosuke@bun.com>
Co-authored-by: robobun <robobun@oven.sh>
Co-authored-by: Claude Bot <claude-bot@bun.sh>
Co-authored-by: Kirill Markelov <kerusha.chubko@gmail.com>
Co-authored-by: Ciro Spaciari <ciro.spaciari@gmail.com>
Co-authored-by: Alistair Smith <hi@alistair.sh>
268 lines
7.8 KiB
TypeScript
268 lines
7.8 KiB
TypeScript
import { describe, expect, test } from "bun:test";
|
|
import { readFileSync } from "fs";
|
|
import { join } from "path";
|
|
|
|
// Base directory against which the spec file names listed further down are
// resolved (Bun's `import.meta.dir`, i.e. the directory of this test file).
const SPEC_DIR = import.meta.dir;
|
|
|
|
/** One input/expected-output pair extracted from a spec file. */
interface SpecExample {
  /** Markdown source of the example, with U+2192 arrows converted to tabs. */
  markdown: string;
  /** Expected HTML, with arrows converted to tabs and any trailing flag section removed. */
  expected: string;
  /** 1-based line number of the example's opening fence in the spec file. */
  line: number;
  /** Text of the most recent `#`, `##` or `###` heading seen before the example. */
  section: string;
  /** `--f…` flags listed after the expected HTML; empty when the example has none. */
  flags: string[];
}

/**
 * Parses a spec file into its examples.
 *
 * An example is delimited by a line starting with 32 backticks followed by
 * " example" and a closing line starting with 32 backticks. Inside it, a line
 * consisting of a lone "." separates the markdown input from the expected
 * HTML. Extension spec files may add a second lone "." after the HTML,
 * followed by one or more lines of `--f…` flags.
 *
 * @param path Path of the spec file; read synchronously as UTF-8.
 * @returns The examples in file order. CR and CRLF line endings are
 *   normalized to LF before parsing.
 * @throws Whatever `readFileSync` throws when the file cannot be read.
 */
function parseSpecFile(path: string): SpecExample[] {
  const content = readFileSync(path, "utf8").replace(/\r\n?/g, "\n");
  const lines = content.split("\n");
  const examples: SpecExample[] = [];
  const fence = "`".repeat(32);
  const opener = fence + " example";
  // U+2192 (→) in a spec file stands for a tab; turn it into a real one.
  const arrowsToTabs = (text: string): string => text.replace(/\u2192/g, "\t");

  let currentSection = "";
  let i = 0;

  while (i < lines.length) {
    const line = lines[i];

    // Track section headers (levels 1-3) so each example can be labelled.
    if (/^#{1,3} /.test(line)) {
      currentSection = line.replace(/^#+\s*/, "");
    }

    if (line.startsWith(opener)) {
      const startLine = i + 1; // 1-based line of the opening fence
      i++;

      // Markdown input runs until the lone "." separator.
      const mdLines: string[] = [];
      while (i < lines.length && lines[i] !== ".") {
        mdLines.push(lines[i]);
        i++;
      }
      i++; // skip the "."

      // Everything up to the closing fence is expected HTML (plus optional flags).
      const htmlLines: string[] = [];
      while (i < lines.length && !lines[i].startsWith(fence)) {
        htmlLines.push(lines[i]);
        i++;
      }

      // Extension spec files have a second "." followed by flags (e.g. "--ftables").
      // The "." may be the very first line when the expected HTML is empty, so
      // the separator is accepted at the start of the string as well as after
      // a newline.
      let expectedHtml = htmlLines.join("\n");
      let flags: string[] = [];
      const flagMatch = /(?:^|\n)\.\n((?:--[^\n]+\n?)+)$/.exec(expectedHtml);
      if (flagMatch) {
        expectedHtml = expectedHtml.slice(0, flagMatch.index);
        flags = flagMatch[1]
          .trim()
          .split("\n")
          .flatMap((flagLine: string) => flagLine.split(/\s+/)) // several flags may share a line
          .filter((f: string) => f.startsWith("--f"));
      }

      examples.push({
        markdown: arrowsToTabs(mdLines.join("\n")),
        expected: arrowsToTabs(expectedHtml),
        line: startLine,
        section: currentSection,
        flags,
      });
    }

    i++; // step past the current line (or the closing fence of an example)
  }

  return examples;
}
|
|
|
|
// Local alias for Bun's built-in markdown API; `renderMarkdown` calls its `html()` method.
const markdown = Bun.markdown;
|
|
|
|
/**
 * Renders markdown to HTML via `markdown.html`, turning spec-file flags into
 * parser options.
 *
 * Each flag has its 3-character `--f` prefix removed and its dashes replaced
 * with underscores; the resulting name is set to `true` on the options object.
 * The permissive-autolink flags are folded into a single `autolinks` option
 * instead: `true` for the umbrella flag, or an object with `url` / `www` /
 * `email` keys for the specific ones.
 *
 * @param md Markdown source; a trailing "\n" is appended before rendering.
 * @param flags Flags such as `--ftables`; omitted or empty means default options.
 * @returns The HTML string returned by `markdown.html`.
 */
function renderMarkdown(md: string, flags?: string[]): string {
  // Keys are computed from flag text at runtime, so the bag is deliberately
  // loosely typed rather than checked against the renderer's option type.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const options: Record<string, any> = {};

  // Turns on one autolink kind without downgrading an earlier "all kinds"
  // setting: the umbrella flag is the union of the three specific flags, so
  // the result must not depend on the order in which flags are listed.
  const enableAutolinkKind = (kind: "url" | "www" | "email"): void => {
    if (options.autolinks === true) return;
    if (typeof options.autolinks !== "object") options.autolinks = {};
    options.autolinks[kind] = true;
  };

  for (const flag of flags ?? []) {
    // Strip --f prefix, replace - with _
    const name = flag.slice(3).replace(/-/g, "_");

    // Map autolink flags to the compound option
    switch (name) {
      case "permissive_autolinks":
        options.autolinks = true;
        break;
      case "permissive_url_autolinks":
        enableAutolinkKind("url");
        break;
      case "permissive_www_autolinks":
        enableAutolinkKind("www");
        break;
      case "permissive_email_autolinks":
        enableAutolinkKind("email");
        break;
      default:
        options[name] = true;
    }
  }

  return markdown.html(md + "\n", options);
}
|
|
|
|
// Tags whose surrounding whitespace is insignificant for comparison purposes.
const BLOCK_TAGS: ReadonlySet<string> = new Set([
  "article",
  "header",
  "aside",
  "hgroup",
  "blockquote",
  "hr",
  "iframe",
  "body",
  "li",
  "map",
  "button",
  "object",
  "canvas",
  "ol",
  "caption",
  "output",
  "col",
  "p",
  "colgroup",
  "pre",
  "dd",
  "progress",
  "div",
  "section",
  "dl",
  "table",
  "td",
  "dt",
  "tbody",
  "embed",
  "textarea",
  "fieldset",
  "tfoot",
  "figcaption",
  "th",
  "figure",
  "thead",
  "footer",
  "tr",
  "form",
  "ul",
  "h1",
  "h2",
  "h3",
  "h4",
  "h5",
  "h6",
  "video",
  "script",
  "style",
]);

// Simple HTML tokenizer: CDATA, comments, declarations, processing
// instructions, tags, and text. The final lone "<" alternative catches a "<"
// that starts none of the above so it is kept as text instead of being dropped.
const HTML_TOKEN_RE = /<!\[CDATA\[.*?\]\]>|<!--.*?-->|<!\S[^>]*>|<\?[^>]*>|<\/?[a-zA-Z][^>]*\/?>|[^<]+|</gs;

// One attribute: name, optionally followed by a double-quoted, single-quoted
// or unquoted value.
const HTML_ATTR_RE = /([a-zA-Z_:][a-zA-Z0-9_.:-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;

// Normalize HTML for comparison, ported from md4c's normalize.py.
// This ignores insignificant output differences:
// - Whitespace around block-level tags is removed
// - Multiple whitespace chars collapsed to single space (outside <pre>)
// - Self-closing tags converted to open tags (<br /> → <br>)
// - Tag and attribute names are lowercased, attributes are sorted by name and
//   their values are always double-quoted
// - Newlines directly after <br> are removed
// Text inside <pre> is kept verbatim, including whitespace right before </pre>.
// CDATA sections, comments, declarations and processing instructions are
// copied through unchanged. The result is trimmed at both ends.
function normalizeHtml(html: string): string {
  let output = "";
  let lastType: "starttag" | "endtag" | "data" | "comment" | "decl" = "starttag";
  let lastTag = "";
  let inPre = false;

  for (const token of html.match(HTML_TOKEN_RE) ?? []) {
    if (token.startsWith("<![CDATA")) {
      output += token;
      lastType = "data";
    } else if (token.startsWith("<!--")) {
      output += token;
      lastType = "comment";
    } else if (token.startsWith("<!") || token.startsWith("<?")) {
      output += token;
      lastType = "decl";
    } else if (token.startsWith("</")) {
      // End tag
      const tag = token.slice(2, -1).trim().toLowerCase();
      if (tag === "pre") {
        // Leaving <pre>: its content is significant, so do not trim it.
        inPre = false;
      } else if (BLOCK_TAGS.has(tag)) {
        output = output.trimEnd();
      }
      output += `</${tag}>`;
      lastTag = tag;
      lastType = "endtag";
    } else if (token.length > 1 && token.startsWith("<")) {
      // Start tag (possibly self-closing)
      const selfClosing = token.endsWith("/>");
      const inner = token.slice(1, selfClosing ? -2 : -1).trim();
      const nameEnd = inner.search(/[\s\/]/);
      const tag = (nameEnd === -1 ? inner : inner.slice(0, nameEnd)).toLowerCase();

      if (tag === "pre") inPre = true;
      if (BLOCK_TAGS.has(tag)) output = output.trimEnd();

      // Parse attributes, then emit them in a canonical order and quoting.
      const attrStr = nameEnd === -1 ? "" : inner.slice(nameEnd).replace(/\/$/, "").trim();
      const attrs: [string, string | null][] = [];
      for (const m of attrStr.matchAll(HTML_ATTR_RE)) {
        attrs.push([m[1].toLowerCase(), m[2] ?? m[3] ?? m[4] ?? null]);
      }
      attrs.sort((a, b) => a[0].localeCompare(b[0]));

      output += `<${tag}`;
      for (const [key, value] of attrs) {
        output += ` ${key}`;
        if (value !== null) output += `="${value}"`;
      }
      output += ">";

      lastTag = tag;
      // Self-closing tags are treated as endtags for whitespace purposes
      lastType = selfClosing ? "endtag" : "starttag";
    } else {
      // Text data (including a stray "<" that did not start a tag)
      let data = token;
      const afterTag = lastType === "endtag" || lastType === "starttag";
      const afterBlockTag = afterTag && BLOCK_TAGS.has(lastTag);

      // normalize.py strips every leading newline after <br>, not just one.
      if (afterTag && lastTag === "br") data = data.replace(/^\n+/, "");
      if (!inPre) data = data.replace(/\s+/g, " ");
      if (afterBlockTag && !inPre) {
        if (lastType === "starttag") data = data.trimStart();
        else if (lastType === "endtag") data = data.trim();
      }

      output += data;
      lastType = "data";
    }
  }

  return output.trim();
}
|
|
|
|
// Spec suites to run: display name of the `describe` group and the spec file
// name, resolved relative to SPEC_DIR.
const specFiles = [
  { name: "CommonMark", file: "spec.txt" },
  { name: "GFM Tables", file: "spec-tables.txt" },
  { name: "GFM Strikethrough", file: "spec-strikethrough.txt" },
  { name: "GFM Tasklists", file: "spec-tasklists.txt" },
  { name: "Permissive Autolinks", file: "spec-permissive-autolinks.txt" },
  { name: "GFM", file: "spec-gfm.txt" },
  { name: "Coverage", file: "coverage.txt" },
  { name: "Regressions", file: "regressions.txt" },
];

// Register one `describe` group per spec file and one test per example. Each
// test renders the example's markdown with its flags and compares the
// normalized actual HTML against the normalized expected HTML.
for (const { name, file } of specFiles) {
  const specPath = join(SPEC_DIR, file);
  let examples: SpecExample[];
  try {
    examples = parseSpecFile(specPath);
  } catch (err: unknown) {
    // A spec file that is not present is skipped. Any other failure
    // (unreadable file, a bug in the parser) is rethrown so that a whole
    // suite cannot disappear without anyone noticing.
    const isMissingFile = typeof err === "object" && err !== null && "code" in err && err.code === "ENOENT";
    if (!isMissingFile) throw err;
    continue;
  }
  if (examples.length === 0) continue;

  describe(name, () => {
    examples.forEach((ex, index) => {
      test(`example ${index + 1} (line ${ex.line}): ${ex.section}`, () => {
        // An empty flag list means default options, same as passing none.
        const actual = renderMarkdown(ex.markdown, ex.flags);
        expect(normalizeHtml(actual)).toBe(normalizeHtml(ex.expected));
      });
    });
  });
}
|