Files
bun.sh/src/HTMLScanner.zig
pfg 83760fc446 Sort imports in all files (#21119)
Co-authored-by: taylor.fish <contact@taylor.fish>
2025-07-21 13:26:47 -07:00

309 lines
10 KiB
Zig

/// This file is itself the `HTMLScanner` struct: the fields and functions below are its members.
const HTMLScanner = @This();
/// Allocator used to copy import paths and to grow and free `import_records`.
allocator: std.mem.Allocator,
/// One record per URL attribute reported through `onTag`; starts empty.
import_records: ImportRecord.List = .{},
/// Log that receives HTML parse errors (see `onHTMLParseError`).
log: *logger.Log,
/// The HTML source being scanned; its path is used to resolve imports written without a leading "./".
source: *const logger.Source,
/// Creates a scanner with an empty import record list.
/// `log` and `source` are stored as given (borrowed pointers, not copied).
pub fn init(allocator: std.mem.Allocator, log: *logger.Log, source: *const logger.Source) HTMLScanner {
    const scanner: HTMLScanner = .{
        .source = source,
        .log = log,
        .allocator = allocator,
        .import_records = .{},
    };
    return scanner;
}
/// Frees every path string owned by the import records, then the record list itself.
///
/// Paths are allocated by `createImportRecord` with `dupeZ`, i.e. `text.len + 1` bytes
/// including the trailing NUL. They must be freed as sentinel-terminated slices so the
/// allocator is handed back the same size it allocated; freeing the plain `[]const u8`
/// view would under-report the size by one byte (and skip empty paths entirely).
pub fn deinit(this: *HTMLScanner) void {
    for (this.import_records.slice()) |*record| {
        const text = record.path.text;
        // Assumes every record still holds the NUL-terminated copy made by
        // `createImportRecord`; the `:0` slice asserts that in safe builds.
        const owned: [:0]const u8 = text.ptr[0..text.len :0];
        this.allocator.free(owned);
    }
    this.import_records.deinitWithAllocator(this.allocator);
}
/// Appends an import record of the given `kind` for `input_path`.
///
/// The path is normalized first:
///   - "/x/y" (leading slash) is joined onto the project's top-level directory.
///   - A bare name with a short extension (e.g. "App.tsx") is tried relative to the
///     directory of the HTML source and used only if that file exists.
///   - Anything else is kept verbatim.
/// The chosen path is copied with `this.allocator`; `deinit` releases it.
/// Returns an error if copying the path or growing the record list fails.
fn createImportRecord(this: *HTMLScanner, input_path: []const u8, kind: ImportKind) !void {
    // In HTML, sometimes people do /src/index.js
    // In that case, we don't want to use the absolute filesystem path, we want to use the path relative to the project root
    const path_to_use = if (input_path.len > 1 and input_path[0] == '/')
        bun.path.joinAbsString(bun.fs.FileSystem.instance.top_level_dir, &[_][]const u8{input_path[1..]}, .auto)
        // Check if imports to (e.g) "App.tsx" are actually relative imports w/o the "./"
        // NOTE(review): this skips any path whose first byte is '.' OR whose second byte is '/',
        // which is broader than "starts with ./" (e.g. "a/b.js" is skipped) — confirm this is intended.
    else if (input_path.len > 2 and input_path[0] != '.' and input_path[1] != '/') blk: {
        const index_of_dot = std.mem.lastIndexOfScalar(u8, input_path, '.') orelse break :blk input_path;
        // `ext` includes the dot, so only extensions of up to three characters qualify.
        const ext = input_path[index_of_dot..];
        if (ext.len > 4) break :blk input_path;
        // /foo/bar/index.html -> /foo/bar
        const dirname: []const u8 = std.fs.path.dirname(this.source.path.text) orelse break :blk input_path;
        const resolved = bun.path.joinAbsString(dirname, &[_][]const u8{input_path}, .auto);
        break :blk if (bun.sys.exists(resolved)) resolved else input_path;
    } else input_path;

    // Copy the path so the record never borrows from the caller's `input_path`.
    const owned_path = try this.allocator.dupeZ(u8, path_to_use);
    // If the push below fails the record is never stored, so `deinit` would not see
    // this string; release it here instead of leaking it.
    errdefer this.allocator.free(owned_path);

    const record = ImportRecord{
        .path = fs.Path.init(owned_path),
        .kind = kind,
        .range = logger.Range.None,
    };
    try this.import_records.push(this.allocator, record);
}
/// Scoped debug logger tagged `.HTMLScanner`; the tag handlers use it to print each matched selector and URL.
/// NOTE(review): the meaning of the second argument (`true`) is defined by `bun.Output.scoped` — confirm there.
const debug = bun.Output.scoped(.HTMLScanner, true);
/// Output callback handed to the rewriter by `HTMLProcessor.run`.
/// Scanning only collects imports, so the bytes are discarded.
pub fn onWriteHTML(_: *HTMLScanner, _: []const u8) void {}
/// Records `message` as an error on the scanner's log, attributed to the scanned
/// source with no specific location. Failing to add the error is treated as
/// out-of-memory and is fatal.
/// NOTE(review): `message` is passed through as-is; confirm whether `addError`
/// copies it, since the caller in `HTMLProcessor.run` releases the string afterwards.
pub fn onHTMLParseError(this: *HTMLScanner, message: []const u8) void {
    const location = logger.Loc.Empty;
    this.log.addError(this.source, location, message) catch bun.outOfMemory();
}
/// Visitor callback invoked by the tag handlers for every non-empty URL attribute.
/// Records `path` as an import of the given `kind`; the element and `url_attribute`
/// are not needed while scanning.
///
/// This callback cannot return an error. The fallible steps in `createImportRecord`
/// both go through the allocator, so a failure is handled the same way as in
/// `onHTMLParseError` (fatal out-of-memory) rather than silently dropping the import.
pub fn onTag(this: *HTMLScanner, _: *lol.Element, path: []const u8, url_attribute: []const u8, kind: ImportKind) void {
    _ = url_attribute;
    this.createImportRecord(path, kind) catch bun.outOfMemory();
}
/// `HTMLProcessor` specialized for the scanner; html/head/body tags are not visited.
const processor = HTMLProcessor(HTMLScanner, false);
/// Runs the processor over `input`, appending an import record for each
/// recognized URL attribute. Any error from the processor is returned to the caller.
pub fn scan(this: *HTMLScanner, input: []const u8) !void {
    return processor.run(this, input);
}
/// Returns a namespace that drives lol-html over an HTML document and reports every
/// recognized URL-bearing attribute to the visitor type `T`.
///
/// `T` must provide:
///   - `onTag(*T, *lol.Element, path, url_attribute, kind)`: called for each non-empty URL attribute.
///   - `onWriteHTML`: handed to the rewriter as its output callback.
///   - `onHTMLParseError(*T, message)`: called when `run` fails and lol-html reports a last error.
///   - `onBodyTag`, `onHeadTag`, `onHtmlTag`: only referenced when `visit_document_tags` is true.
pub fn HTMLProcessor(
    comptime T: type,
    /// When true, `body`, `head` and `html` elements are also routed to `T`
    comptime visit_document_tags: bool,
) type {
    return struct {
        /// Describes one family of elements whose attribute holds an importable URL.
        const TagHandler = struct {
            /// CSS selector that picks the elements
            selector: []const u8,
            /// Whether the element may carry text content that needs processing
            has_content: bool = false,
            /// Name of the attribute the URL is read from
            url_attribute: []const u8,
            /// Import kind reported for the URL
            kind: ImportKind,
            is_head_or_html: bool = false,
        };

        /// All selectors registered by `run`, in registration order.
        const tag_handlers = [_]TagHandler{
            // Scripts loaded through `src`
            .{ .selector = "script[src]", .url_attribute = "src", .kind = .stmt },
            // Stylesheets
            .{ .selector = "link[rel='stylesheet'][href]", .url_attribute = "href", .kind = .at },
            // Links declared `as='style'`
            .{ .selector = "link[as='style'][href]", .url_attribute = "href", .kind = .at },
            // Fonts
            .{ .selector = "link[as='font'][href], link[type^='font/'][href]", .url_attribute = "href", .kind = .url },
            // Links declared `as='image'`
            .{ .selector = "link[as='image'][href]", .url_attribute = "href", .kind = .url },
            // Links declared `as='video'` / `as='audio'`
            .{ .selector = "link[as='video'][href], link[as='audio'][href]", .url_attribute = "href", .kind = .url },
            // Workers
            .{ .selector = "link[as='worker'][href]", .url_attribute = "href", .kind = .stmt },
            // Manifests
            .{ .selector = "link[rel='manifest'][href]", .url_attribute = "href", .kind = .url },
            // Icons
            .{ .selector = "link[rel='icon'][href], link[rel='apple-touch-icon'][href]", .url_attribute = "href", .kind = .url },
            // <img src>
            .{ .selector = "img[src]", .url_attribute = "src", .kind = .url },
            // <img srcset>
            .{ .selector = "img[srcset]", .url_attribute = "srcset", .kind = .url },
            // <video src>
            .{ .selector = "video[src]", .url_attribute = "src", .kind = .url },
            // <video poster>
            .{ .selector = "video[poster]", .url_attribute = "poster", .kind = .url },
            // <audio src>
            .{ .selector = "audio[src]", .url_attribute = "src", .kind = .url },
            // <source src>
            .{ .selector = "source[src]", .url_attribute = "src", .kind = .url },
            // <source srcset>
            .{ .selector = "source[srcset]", .url_attribute = "srcset", .kind = .url },
            // Iframes (disabled)
            // .{ .selector = "iframe[src]", .url_attribute = "src", .kind = .url },
        };

        /// Number of selectors `run` may register: one per table entry, plus
        /// body/head/html when document tags are visited.
        const selector_capacity = tag_handlers.len + if (visit_document_tags) 3 else 0;

        /// Produces the element callback for one table entry. The callback reads the
        /// entry's URL attribute and, when it is present and non-empty, forwards it
        /// to `T.onTag`. It always returns false.
        fn generateHandlerForTag(comptime tag_info: TagHandler) fn (*T, *lol.Element) bool {
            return struct {
                pub fn handle(this: *T, element: *lol.Element) bool {
                    const attr_name = tag_info.url_attribute;
                    if (attr_name.len == 0) return false;

                    // A failed lookup is treated the same as a missing attribute.
                    const has_attr = element.hasAttribute(attr_name) catch false;
                    if (!has_attr) return false;

                    const attr_value = element.getAttribute(attr_name);
                    defer attr_value.deinit();

                    if (attr_value.len > 0) {
                        debug("{s} {s}", .{ tag_info.selector, attr_value.slice() });
                        T.onTag(this, element, attr_value.slice(), attr_name, tag_info.kind);
                    }
                    return false;
                }
            }.handle;
        }

        /// Registers every handler, builds a rewriter and feeds it `input`.
        /// If an error occurs once the error hook below is armed (building the
        /// rewriter, writing, or ending), lol-html's last error, when non-empty,
        /// is forwarded to `this.onHTMLParseError` before the error is returned.
        pub fn run(this: *T, input: []const u8) !void {
            var rewriter_builder = lol.HTMLRewriter.Builder.init();
            defer rewriter_builder.deinit();

            // Parsed selectors are kept here so they are released when `run` exits.
            var parsed_selectors: std.BoundedArray(*lol.HTMLSelector, selector_capacity) = .{};
            defer for (parsed_selectors.slice()) |parsed| {
                parsed.deinit();
            };

            // One element handler per table entry
            inline for (tag_handlers) |entry| {
                const entry_selector = try lol.HTMLSelector.parse(entry.selector);
                parsed_selectors.appendAssumeCapacity(entry_selector);
                try rewriter_builder.addElementContentHandlers(
                    entry_selector,
                    T,
                    comptime generateHandlerForTag(entry),
                    this,
                    void,
                    null,
                    null,
                    void,
                    null,
                    null,
                );
            }

            // Document-level tags go straight to the visitor's dedicated callbacks.
            if (visit_document_tags) {
                inline for (.{ "body", "head", "html" }, &.{ T.onBodyTag, T.onHeadTag, T.onHtmlTag }) |tag_name, callback| {
                    const document_selector = try lol.HTMLSelector.parse(tag_name);
                    parsed_selectors.appendAssumeCapacity(document_selector);
                    try rewriter_builder.addElementContentHandlers(
                        document_selector,
                        T,
                        callback,
                        this,
                        void,
                        null,
                        null,
                        void,
                        null,
                        null,
                    );
                }
            }

            const settings = lol.MemorySettings{
                .preallocated_parsing_buffer_size = @max(input.len / 4, 1024),
                .max_allowed_memory_usage = 1024 * 1024 * 10,
            };

            // Armed from here on: report lol-html's last error on any failure below.
            errdefer {
                const failure = lol.HTMLString.lastError();
                defer failure.deinit();
                if (failure.len > 0) {
                    this.onHTMLParseError(failure.slice());
                }
            }

            const NoopDone = struct {
                fn done(_: *T) void {}
            };

            var rewriter = try rewriter_builder.build(
                .UTF8,
                settings,
                false,
                T,
                this,
                T.onWriteHTML,
                NoopDone.done,
            );
            defer rewriter.deinit();

            try rewriter.write(input);
            try rewriter.end();
        }
    };
}
const lol = @import("./deps/lol-html.zig");
const std = @import("std");
const ImportKind = @import("./import_record.zig").ImportKind;
const ImportRecord = @import("./import_record.zig").ImportRecord;
const bun = @import("bun");
const fs = bun.fs;
const logger = bun.logger;