Files
bun.sh/src/md/html_renderer.zig
Andrew Johnston 7f498a2e07 chore: improve markdown to html perf (#26644)
### What does this PR do?

I was looking at the [recent
support](https://github.com/oven-sh/bun/pull/26440) for markdown and did
some benchmarking against
[bindings](https://github.com/just-js/lo/blob/main/lib/md4c/api.js) i
created for my `lo` runtime to `md4c`. In some cases, Bun is quite a bit
slower, so i did a bit of digging and came up with this change. It uses
`indexOfAny` which should utilise `SIMD` where it's available to scan
ahead in the payload for characters that need escaping.

In
[benchmarks](https://gist.github.com/billywhizz/397f7929a8920c826c072139b695bb68#file-results-md)
I have done this results in anywhere from `3%` to `~15%` improvement in
throughput. The bigger the payload and the more space between entities
the bigger the gain afaict, which would make sense.

### How did you verify your code works?

It passes `test/js/bun/md/*.test.ts` running locally. Only tested on
macos. Can test on linux but I assume that will happen in CI anyway?

## main


![bun-main](https://github.com/user-attachments/assets/8b173b34-1f20-4e52-bb67-bb8b7e5658f3)

## patched


![bun-patch](https://github.com/user-attachments/assets/26bb600c-234c-4903-8f70-32f167481156)
2026-02-03 00:51:25 -08:00

715 lines
25 KiB
Zig

pub const HtmlRenderer = struct {
out: OutputBuffer,
allocator: Allocator,
src_text: []const u8,
image_nesting_level: u32 = 0,
saved_img_title: []const u8 = "",
tag_filter: bool = false,
tag_filter_raw_depth: u32 = 0,
autolink_headings: bool = false,
heading_buf: std.ArrayListUnmanaged(u8) = .{},
heading_tracker: helpers.HeadingIdTracker = helpers.HeadingIdTracker.init(false),
pub const OutputBuffer = struct {
list: std.ArrayListUnmanaged(u8),
allocator: Allocator,
oom: bool,
fn write(self: *OutputBuffer, data: []const u8) void {
if (self.oom) return;
self.list.appendSlice(self.allocator, data) catch {
self.oom = true;
};
}
fn writeByte(self: *OutputBuffer, b: u8) void {
if (self.oom) return;
self.list.append(self.allocator, b) catch {
self.oom = true;
};
}
};
pub fn init(allocator: Allocator, src_text: []const u8, render_opts: RenderOptions) HtmlRenderer {
return .{
.out = .{ .list = .{}, .allocator = allocator, .oom = false },
.allocator = allocator,
.src_text = src_text,
.tag_filter = render_opts.tag_filter,
.autolink_headings = render_opts.autolink_headings,
.heading_tracker = helpers.HeadingIdTracker.init(render_opts.heading_ids),
};
}
pub fn deinit(self: *HtmlRenderer) void {
self.out.list.deinit(self.allocator);
self.heading_buf.deinit(self.allocator);
self.heading_tracker.deinit(self.allocator);
}
pub fn toOwnedSlice(self: *HtmlRenderer) error{OutOfMemory}![]u8 {
if (self.out.oom) return error.OutOfMemory;
return self.out.list.toOwnedSlice(self.allocator);
}
pub fn renderer(self: *HtmlRenderer) Renderer {
return .{ .ptr = self, .vtable = &vtable };
}
pub const vtable: Renderer.VTable = .{
.enterBlock = enterBlockImpl,
.leaveBlock = leaveBlockImpl,
.enterSpan = enterSpanImpl,
.leaveSpan = leaveSpanImpl,
.text = textImpl,
};
// ========================================
// VTable implementation functions
// ========================================
fn enterBlockImpl(ptr: *anyopaque, block_type: BlockType, data: u32, flags: u32) bun.JSError!void {
const self: *HtmlRenderer = @ptrCast(@alignCast(ptr));
self.enterBlock(block_type, data, flags);
}
fn leaveBlockImpl(ptr: *anyopaque, block_type: BlockType, data: u32) bun.JSError!void {
const self: *HtmlRenderer = @ptrCast(@alignCast(ptr));
self.leaveBlock(block_type, data);
}
fn enterSpanImpl(ptr: *anyopaque, span_type: SpanType, detail: SpanDetail) bun.JSError!void {
const self: *HtmlRenderer = @ptrCast(@alignCast(ptr));
self.enterSpan(span_type, detail);
}
fn leaveSpanImpl(ptr: *anyopaque, span_type: SpanType) bun.JSError!void {
const self: *HtmlRenderer = @ptrCast(@alignCast(ptr));
self.leaveSpan(span_type);
}
fn textImpl(ptr: *anyopaque, text_type: TextType, content: []const u8) bun.JSError!void {
const self: *HtmlRenderer = @ptrCast(@alignCast(ptr));
self.text(text_type, content);
}
// ========================================
// Block rendering
// ========================================
pub fn enterBlock(self: *HtmlRenderer, block_type: BlockType, data: u32, flags: u32) void {
switch (block_type) {
.doc => {},
.quote => {
self.ensureNewline();
self.write("<blockquote>\n");
},
.ul => {
self.ensureNewline();
self.write("<ul>\n");
},
.ol => {
self.ensureNewline();
const start = data;
if (start == 1) {
self.write("<ol>\n");
} else {
self.write("<ol start=\"");
self.writeDecimal(start);
self.write("\">\n");
}
},
.li => {
const task_mark = types.taskMarkFromData(data);
if (task_mark != 0) {
self.write("<li class=\"task-list-item\">");
if (types.isTaskChecked(task_mark)) {
self.write("<input type=\"checkbox\" class=\"task-list-item-checkbox\" disabled checked>");
} else {
self.write("<input type=\"checkbox\" class=\"task-list-item-checkbox\" disabled>");
}
} else {
self.write("<li>");
}
},
.hr => {
self.ensureNewline();
self.write("<hr />\n");
},
.h => {
self.ensureNewline();
if (self.heading_tracker.enabled) {
self.heading_tracker.enterHeading();
} else {
const level = data;
const tag = switch (level) {
1 => "<h1>",
2 => "<h2>",
3 => "<h3>",
4 => "<h4>",
5 => "<h5>",
else => "<h6>",
};
self.write(tag);
}
},
.code => {
self.ensureNewline();
self.write("<pre><code");
if (flags & types.BLOCK_FENCED_CODE != 0) {
const info_beg = data;
// Find end of language token (first word of info string)
var lang_end = info_beg;
while (lang_end < self.src_text.len and !helpers.isBlank(self.src_text[lang_end]) and
!helpers.isNewline(self.src_text[lang_end]))
{
lang_end += 1;
}
if (lang_end > info_beg) {
self.write(" class=\"language-");
self.writeWithEntityDecoding(self.src_text[info_beg..lang_end]);
self.write("\"");
}
}
self.write(">");
},
.html => self.ensureNewline(),
.p => {
self.ensureNewline();
self.write("<p>");
},
.table => {
self.ensureNewline();
self.write("<table>\n");
},
.thead => self.write("<thead>\n"),
.tbody => self.write("<tbody>\n"),
.tr => self.write("<tr>"),
.th, .td => {
const tag = if (block_type == .th) "<th" else "<td";
self.write(tag);
const alignment = types.alignmentFromData(data);
if (types.alignmentName(alignment)) |name| {
self.write(" align=\"");
self.write(name);
self.write("\"");
}
self.write(">");
},
}
}
pub fn leaveBlock(self: *HtmlRenderer, block_type: BlockType, data: u32) void {
switch (block_type) {
.doc => {},
.quote => self.write("</blockquote>\n"),
.ul => self.write("</ul>\n"),
.ol => self.write("</ol>\n"),
.li => self.write("</li>\n"),
.hr => {},
.h => {
if (self.heading_tracker.leaveHeading(self.allocator)) |slug| {
// Write opening tag with id
self.out.write(switch (data) {
1 => "<h1",
2 => "<h2",
3 => "<h3",
4 => "<h4",
5 => "<h5",
else => "<h6",
});
self.out.write(" id=\"");
self.out.write(slug);
self.out.write("\">");
if (self.autolink_headings) {
self.out.write("<a href=\"#");
self.out.write(slug);
self.out.write("\">");
}
// Flush buffered heading content
self.out.write(self.heading_buf.items);
if (self.autolink_headings) {
self.out.write("</a>");
}
self.out.write(switch (data) {
1 => "</h1>\n",
2 => "</h2>\n",
3 => "</h3>\n",
4 => "</h4>\n",
5 => "</h5>\n",
else => "</h6>\n",
});
self.heading_buf.clearRetainingCapacity();
self.heading_tracker.clearAfterHeading();
} else {
const tag = switch (data) {
1 => "</h1>\n",
2 => "</h2>\n",
3 => "</h3>\n",
4 => "</h4>\n",
5 => "</h5>\n",
else => "</h6>\n",
};
self.write(tag);
}
},
.code => self.write("</code></pre>\n"),
.html => {},
.p => {
self.write("</p>\n");
},
.table => self.write("</table>\n"),
.thead => self.write("</thead>\n"),
.tbody => self.write("</tbody>\n"),
.tr => self.write("</tr>\n"),
.th => self.write("</th>"),
.td => self.write("</td>"),
}
}
// ========================================
// Span rendering
// ========================================
pub fn enterSpan(self: *HtmlRenderer, span_type: SpanType, detail: SpanDetail) void {
if (self.image_nesting_level > 0) {
if (span_type == .img) self.image_nesting_level += 1;
return;
}
switch (span_type) {
.em => self.write("<em>"),
.strong => self.write("<strong>"),
.u => self.write("<u>"),
.code => self.write("<code>"),
.del => self.write("<del>"),
.latexmath => self.write("<x-equation>"),
.latexmath_display => self.write("<x-equation type=\"display\">"),
.a => {
self.write("<a href=\"");
if (detail.permissive_autolink) {
// Permissive autolinks use HTML-escaping for href
if (detail.autolink_email) self.write("mailto:");
if (detail.autolink_www) self.write("http://");
self.writeHtmlEscaped(detail.href);
} else if (detail.autolink) {
// Standard autolinks: percent-encode only, no entity/escape processing
if (detail.autolink_email) self.write("mailto:");
self.writeUrlEscaped(detail.href);
} else {
// Regular links: full entity/escape processing
if (detail.autolink_email) self.write("mailto:");
self.writeUrlWithEscapes(detail.href);
}
self.write("\"");
if (detail.title.len > 0) {
self.write(" title=\"");
self.writeTitleWithEscapes(detail.title);
self.write("\"");
}
self.write(">");
},
.img => {
self.saved_img_title = detail.title;
self.write("<img src=\"");
self.writeUrlWithEscapes(detail.href);
self.write("\" alt=\"");
self.image_nesting_level += 1;
},
.wikilink => {
self.write("<x-wikilink data-target=\"");
self.writeHtmlEscaped(detail.href);
self.write("\">");
},
}
}
pub fn leaveSpan(self: *HtmlRenderer, span_type: SpanType) void {
if (self.image_nesting_level > 0) {
if (span_type == .img) {
self.image_nesting_level -= 1;
if (self.image_nesting_level == 0) {
self.write("\"");
if (self.saved_img_title.len > 0) {
self.write(" title=\"");
self.writeTitleWithEscapes(self.saved_img_title);
self.write("\"");
}
self.write(" />");
self.saved_img_title = "";
}
}
return;
}
switch (span_type) {
.em => self.write("</em>"),
.strong => self.write("</strong>"),
.u => self.write("</u>"),
.a => self.write("</a>"),
.code => self.write("</code>"),
.del => self.write("</del>"),
.latexmath => self.write("</x-equation>"),
.latexmath_display => self.write("</x-equation>"),
.wikilink => self.write("</x-wikilink>"),
.img => {}, // handled above
}
}
// ========================================
// Text rendering
// ========================================
pub fn text(self: *HtmlRenderer, text_type: TextType, content: []const u8) void {
const in_image = self.image_nesting_level > 0;
// Track plain text for slug generation when inside a heading
self.heading_tracker.trackText(text_type, content, self.allocator);
switch (text_type) {
.null_char => self.write("\xEF\xBF\xBD"),
.br => {
if (in_image) self.write(" ") else self.write("<br />\n");
},
.softbr => {
if (in_image) self.write(" ") else self.write("\n");
},
.html => {
if (self.tag_filter) {
// Track entry/exit of disallowed tag raw zones
self.updateTagFilterRawDepth(content);
self.writeHtmlWithTagFilter(content);
} else {
self.write(content);
}
},
.entity => self.writeEntity(content),
.code => {
// In code spans, newlines become spaces
var start: usize = 0;
for (content, 0..) |byte, j| {
if (byte == '\n') {
if (j > start) self.writeHtmlEscaped(content[start..j]);
self.write(" ");
start = j + 1;
}
}
if (start < content.len) self.writeHtmlEscaped(content[start..]);
},
else => {
// When inside a tag-filtered disallowed tag, emit text as raw
if (self.tag_filter and self.tag_filter_raw_depth > 0) {
self.write(content);
} else {
self.writeHtmlEscaped(content);
}
},
}
}
// ========================================
// HTML writing utilities
// ========================================
pub fn write(self: *HtmlRenderer, data: []const u8) void {
if (self.heading_tracker.in_heading) {
self.heading_buf.appendSlice(self.allocator, data) catch {
self.out.oom = true;
};
} else {
self.out.write(data);
}
}
fn writeByte(self: *HtmlRenderer, b: u8) void {
if (self.heading_tracker.in_heading) {
self.heading_buf.append(self.allocator, b) catch {
self.out.oom = true;
};
} else {
self.out.writeByte(b);
}
}
/// Track whether we're inside a disallowed tag's raw zone.
/// When an opening disallowed tag is seen, increment depth.
/// When a closing disallowed tag is seen, decrement depth.
fn updateTagFilterRawDepth(self: *HtmlRenderer, content: []const u8) void {
if (content.len < 2 or content[0] != '<') return;
if (content[1] == '/') {
// Closing tag
if (isDisallowedTag(content) and self.tag_filter_raw_depth > 0) {
self.tag_filter_raw_depth -= 1;
}
} else {
// Opening tag (not self-closing)
if (isDisallowedTag(content)) {
// Check if NOT self-closing (doesn't end with "/>")
if (content[content.len - 2] != '/' or content[content.len - 1] != '>') {
self.tag_filter_raw_depth += 1;
}
}
}
}
/// Write HTML content with GFM tag filter applied. Scans for disallowed
/// tags and replaces their leading `<` with `&lt;`.
fn writeHtmlWithTagFilter(self: *HtmlRenderer, content: []const u8) void {
var start: usize = 0;
var i: usize = 0;
while (i < content.len) {
if (content[i] == '<' and isDisallowedTag(content[i..])) {
// Write everything before this '<'
if (i > start) self.write(content[start..i]);
self.write("&lt;");
start = i + 1;
}
i += 1;
}
if (start < content.len) self.write(content[start..]);
}
fn ensureNewline(self: *HtmlRenderer) void {
if (self.heading_tracker.in_heading) {
const items = self.heading_buf.items;
if (items.len > 0 and items[items.len - 1] != '\n') {
self.heading_buf.append(self.allocator, '\n') catch {
self.out.oom = true;
};
}
} else {
const items = self.out.list.items;
if (items.len > 0 and items[items.len - 1] != '\n') {
self.out.writeByte('\n');
}
}
}
pub fn writeHtmlEscaped(self: *HtmlRenderer, txt: []const u8) void {
var i: usize = 0;
const needle = "&<>\"";
while (true) {
const next = std.mem.indexOfAny(u8, txt[i..], needle) orelse {
self.write(txt[i..]);
return;
};
const pos = i + next;
if (pos > i)
self.write(txt[i..pos]);
const c = txt[pos];
switch (c) {
'&' => self.write("&amp;"),
'<' => self.write("&lt;"),
'>' => self.write("&gt;"),
'"' => self.write("&quot;"),
else => unreachable,
}
i = pos + 1;
}
}
fn writeUrlEscaped(self: *HtmlRenderer, txt: []const u8) void {
for (txt) |byte| {
self.writeUrlByte(byte);
}
}
fn writeUrlByte(self: *HtmlRenderer, byte: u8) void {
switch (byte) {
'&' => self.write("&amp;"),
'\'' => self.write("&#x27;"),
'A'...'Z',
'a'...'z',
'0'...'9',
'-',
'.',
'_',
'~',
':',
'/',
'?',
'#',
'@',
'!',
'$',
'(',
')',
'*',
'+',
',',
';',
'=',
'%',
=> self.writeByte(byte),
else => {
var buf: [3]u8 = undefined;
buf[0] = '%';
buf[1] = hexDigit(byte >> 4);
buf[2] = hexDigit(byte & 0x0F);
self.write(&buf);
},
}
}
/// Write URL with backslash escape and entity processing.
fn writeUrlWithEscapes(self: *HtmlRenderer, txt: []const u8) void {
var i: usize = 0;
while (i < txt.len) {
if (txt[i] == '\\' and i + 1 < txt.len and helpers.isAsciiPunctuation(txt[i + 1])) {
self.writeUrlByte(txt[i + 1]);
i += 2;
} else if (txt[i] == '&') {
if (findEntityInText(txt, i)) |end_pos| {
self.writeEntityToUrl(txt[i..end_pos]);
i = end_pos;
} else {
self.write("&amp;");
i += 1;
}
} else {
self.writeUrlByte(txt[i]);
i += 1;
}
}
}
/// Write title attribute with backslash escape and entity processing (HTML-escaped).
fn writeTitleWithEscapes(self: *HtmlRenderer, txt: []const u8) void {
var i: usize = 0;
while (i < txt.len) {
if (txt[i] == '\\' and i + 1 < txt.len and helpers.isAsciiPunctuation(txt[i + 1])) {
self.writeHtmlEscaped(txt[i + 1 .. i + 2]);
i += 2;
} else if (txt[i] == '&') {
if (findEntityInText(txt, i)) |end_pos| {
self.writeEntity(txt[i..end_pos]);
i = end_pos;
} else {
self.write("&amp;");
i += 1;
}
} else {
self.writeHtmlEscaped(txt[i .. i + 1]);
i += 1;
}
}
}
/// Write text with entity and backslash escape decoding, then HTML-escape the result.
/// Used for code fence info strings where entities are recognized.
fn writeWithEntityDecoding(self: *HtmlRenderer, txt: []const u8) void {
var i: usize = 0;
while (i < txt.len) {
if (txt[i] == '&') {
if (findEntityInText(txt, i)) |end_pos| {
self.writeEntity(txt[i..end_pos]);
i = end_pos;
continue;
}
} else if (txt[i] == '\\' and i + 1 < txt.len and helpers.isAsciiPunctuation(txt[i + 1])) {
self.writeHtmlEscaped(txt[i + 1 .. i + 2]);
i += 2;
continue;
}
self.writeHtmlEscaped(txt[i .. i + 1]);
i += 1;
}
}
fn writeEntity(self: *HtmlRenderer, entity_text: []const u8) void {
var buf: [8]u8 = undefined;
if (helpers.decodeEntityToUtf8(entity_text, &buf)) |decoded| {
self.writeHtmlEscaped(decoded);
} else {
self.write(entity_text);
}
}
/// Decode an entity and write its UTF-8 bytes as percent-encoded URL bytes.
fn writeEntityToUrl(self: *HtmlRenderer, entity_text: []const u8) void {
var buf: [8]u8 = undefined;
if (helpers.decodeEntityToUtf8(entity_text, &buf)) |decoded| {
for (decoded) |b| self.writeUrlByte(b);
} else {
self.writeUrlEscaped(entity_text);
}
}
fn writeDecimal(self: *HtmlRenderer, value: u32) void {
var buf: [10]u8 = undefined;
var v = value;
var i: usize = buf.len;
if (v == 0) {
self.writeByte('0');
return;
}
while (v > 0) {
i -= 1;
buf[i] = @intCast('0' + v % 10);
v /= 10;
}
self.write(buf[i..]);
}
// ========================================
// Static helpers
// ========================================
fn hexDigit(v: u8) u8 {
return if (v < 10) '0' + v else 'A' + v - 10;
}
/// GFM 6.11: Check if HTML content starts with a disallowed tag.
/// Disallowed tags have their leading `<` replaced with `&lt;`.
fn isDisallowedTag(content: []const u8) bool {
// Must start with '<', optionally followed by '/'
if (content.len < 2 or content[0] != '<') return false;
const after_lt: usize = if (content[1] == '/') 2 else 1;
if (after_lt >= content.len) return false;
const disallowed = [_][]const u8{
"title", "textarea", "style", "xmp", "iframe",
"noembed", "noframes", "script", "plaintext",
};
inline for (disallowed) |tag| {
if (matchTagNameCI(content, after_lt, tag)) return true;
}
return false;
}
/// Case-insensitive match of tag name at `pos` in `content`.
/// After the name, the next char must be '>', '/', whitespace, or end of string.
fn matchTagNameCI(content: []const u8, pos: usize, tag: []const u8) bool {
if (pos + tag.len > content.len) return false;
if (!bun.strings.eqlCaseInsensitiveASCIIIgnoreLength(content[pos..][0..tag.len], tag)) return false;
// Check delimiter after tag name
const end = pos + tag.len;
if (end >= content.len) return true;
return switch (content[end]) {
'>', ' ', '\t', '\n', '/' => true,
else => false,
};
}
/// Find an entity in text starting at `start`. Delegates to helpers.findEntity.
fn findEntityInText(content: []const u8, start: usize) ?usize {
return helpers.findEntity(content, start);
}
};
const bun = @import("bun");
const helpers = @import("./helpers.zig");
const std = @import("std");
const Allocator = std.mem.Allocator;
const root = @import("./root.zig");
const RenderOptions = root.RenderOptions;
const types = @import("./types.zig");
const BlockType = types.BlockType;
const Renderer = types.Renderer;
const SpanDetail = types.SpanDetail;
const SpanType = types.SpanType;
const TextType = types.TextType;