Files
bun.sh/src/css_scanner.zig
Jarred Sumner e1677bb774 Skeleton!
Former-commit-id: 6e2c6cd6ea
2021-06-16 16:23:02 -07:00

781 lines
30 KiB
Zig

const Fs = @import("fs.zig");
const std = @import("std");
usingnamespace @import("global.zig");
const options = @import("./options.zig");
const import_record = @import("import_record.zig");
const logger = @import("./logger.zig");
const Options = options;
const replacementCharacter = 0xFFFD;
/// One contiguous piece of the stylesheet, tagged with how it should be written out.
pub const Chunk = struct {
    /// Byte range of the entire chunk within the source contents.
    range: logger.Range,
    /// What the chunk holds: a `url(...)` token, an `@import` rule, or text to copy as-is.
    content: Content,

    /// Discriminant for `Content`.
    pub const Tag = enum {
        t_url,
        t_verbatim,
        t_import,
    };

    // Fields are declared in the same order as `Tag`.
    pub const Content = union(Tag) {
        t_url: TextContent,
        t_verbatim: Verbatim,
        t_import: Import,
    };

    /// Returns the bytes of `source` covered by this chunk's range.
    /// The range stores signed 32-bit offsets, so they are cast to `usize` before slicing.
    pub fn raw(chunk: *const Chunk, source: *const logger.Source) string {
        const offset = @intCast(usize, chunk.range.loc.start);
        const len = @intCast(usize, chunk.range.len);
        return source.contents[offset..][0..len];
    }

    /// A URL or string as it appeared in the source.
    pub const TextContent = struct {
        /// Which quote character (if any) surrounded the text.
        quote: Quote = .none,
        utf8: string,
        /// Cleared by the scanner when the token was malformed.
        valid: bool = true,
        /// Set by the scanner when the text contains a backslash escape.
        needs_decode_escape: bool = false,

        pub const Quote = enum {
            none,
            double,
            single,
        };
    };

    /// An `@import` rule.
    pub const Import = struct {
        /// True when the target is written as `url(...)` rather than as a bare string.
        url: bool = false,
        text: TextContent,
        supports: string = "",
        // @import can contain media queries and other stuff
        media_queries_str: string = "",
        /// Everything between the import target and the end of the rule.
        suffix: string = "",
    };

    /// Marker payload for text that is copied through unchanged.
    pub const Verbatim = struct {};
};
/// Token kinds for the CSS scanner.
/// NOTE(review): nothing in the visible part of this file references this enum;
/// the scanner below works with `Chunk.Tag` instead — confirm whether it is still needed.
pub const Token = enum {
t_end_of_file,
t_semicolon,
t_whitespace,
t_at_import,
t_url,
t_verbatim,
t_string,
t_bad_string,
};
// NOTE: despite the name, 0x0C is the ASCII form feed (FF) code point, not line feed (0x0A).
// The scanner matches it alongside '\n' and '\r' wherever it checks for line breaks or whitespace.
const escLineFeed = 0x0C;
// This is not a CSS parser.
// All this does is scan for URLs and @import statements
// Once found, it resolves & rewrites them
// Eventually, there will be a real CSS parser in here.
// But, no time yet.
pub fn NewScanner(
comptime WriterType: type,
) type {
return struct {
const Scanner = @This();
current: usize = 0,
start: usize = 0,
end: usize = 0,
log: *logger.Log,
has_newline_before: bool = false,
has_delimiter_before: bool = false,
allocator: *std.mem.Allocator,
source: *const logger.Source,
writer: WriterType,
codepoint: CodePoint = -1,
approximate_newline_count: usize = 0,
/// Builds a scanner over `source` that reports problems to `log` and emits chunks to `writer`.
/// All cursor fields keep their declared defaults.
pub fn init(log: *logger.Log, allocator: *std.mem.Allocator, writer: WriterType, source: *const logger.Source) Scanner {
    return Scanner{
        .log = log,
        .allocator = allocator,
        .source = source,
        .writer = writer,
    };
}
/// Returns the span from `scanner.start` to `scanner.end` as a `logger.Range`.
pub fn range(scanner: *Scanner) logger.Range {
    const begin = @intCast(i32, scanner.start);
    const length = @intCast(i32, scanner.end - scanner.start);
    return logger.Range{
        .loc = .{ .start = begin },
        .len = length,
    };
}
/// Advances to the next code point and counts it if it is a '\n'.
pub fn step(scanner: *Scanner) void {
    const cp = scanner.nextCodepoint();
    scanner.codepoint = cp;
    if (cp == '\n') {
        scanner.approximate_newline_count += 1;
    }
}
/// Returns the source bytes between `scanner.start` and `scanner.end`,
/// i.e. the same span that `range()` describes.
/// The original body was empty, which is invalid for a function returning `string`.
pub fn raw(scanner: *Scanner) string {
    return scanner.source.contents[scanner.start..scanner.end];
}
/// Reports whether the scanner sits on a backslash that starts an escape.
/// Peeks at the bytes handed back by `nextCodepointSlice(false)` without moving the cursor:
/// a single byte is always accepted, a multi-byte sequence must decode to a non-zero code point.
pub fn isValidEscape(scanner: *Scanner) bool {
    if (scanner.codepoint != '\\') return false;
    const bytes = scanner.nextCodepointSlice(false);
    switch (bytes.len) {
        1 => return true,
        2 => return (std.unicode.utf8Decode2(bytes) catch 0) > 0,
        3 => return (std.unicode.utf8Decode3(bytes) catch 0) > 0,
        4 => return (std.unicode.utf8Decode4(bytes) catch 0) > 0,
        // Empty slice (nothing to peek at) or an unexpected length.
        else => return false,
    }
}
/// Consumes a quoted string. Must be called while the scanner sits on the opening `quote`.
///
/// Returns the bytes between the two quotes (quotes excluded) and leaves the scanner on the
/// code point that follows the closing quote. Returns null, after logging
/// "Unterminated string token", when the input ends or an unescaped line break is reached first.
pub fn consumeString(scanner: *Scanner, comptime quote: CodePoint) ?string {
    // `current` already points past the opening quote, so this is the first content byte.
    const start = scanner.current;
    scanner.step();
    while (true) {
        switch (scanner.codepoint) {
            '\\' => {
                scanner.step();
                // A backslash at the very end of the input: let the EOF branch report it.
                if (scanner.codepoint == -1) continue;
                // Handle Windows CRLF
                if (scanner.codepoint == '\r') {
                    scanner.step();
                    if (scanner.codepoint == '\n') {
                        scanner.step();
                    }
                    continue;
                }
                // Otherwise, fall through to ignore the character after the backslash
            },
            -1, '\n', '\r', escLineFeed => {
                scanner.end = scanner.current;
                scanner.log.addRangeError(
                    scanner.source,
                    scanner.range(),
                    "Unterminated string token",
                ) catch unreachable;
                return null;
            },
            quote => {
                // `end` is the offset of the closing quote itself. Slice BEFORE stepping:
                // slicing up to `current` after the step would include the closing quote
                // and the code point that follows it.
                const contents = scanner.source.contents[start..scanner.end];
                scanner.step();
                return contents;
            },
            else => {},
        }
        scanner.step();
    }
    unreachable;
}
/// Consumes the body of an unquoted `url(...)` token.
///
/// May be called either on the "(" that opens the token or on the first code point after it.
/// On return the scanner sits on the code point following the closing ")" (or at end of input).
///
/// The returned `utf8` is the URL without surrounding whitespace and without the closing ")".
/// `needs_decode_escape` is set when a backslash was seen. `valid` is cleared, and an error is
/// logged, when the token is malformed: quotes or "(" inside the URL, whitespace in the middle,
/// a non-printable code point, an invalid escape, or a missing ")". Quoted URLs are therefore
/// reported as malformed by this function.
pub fn consumeURL(scanner: *Scanner) Chunk.TextContent {
    var text = Chunk.TextContent{ .utf8 = "" };
    const contents = scanner.source.contents;

    // The callers in this file hand over the scanner while it is still on the "(".
    // Without this step the "(" would immediately hit the error branch below.
    if (scanner.codepoint == '(') {
        scanner.step();
    }
    // Whitespace between "(" and the URL is not part of the URL.
    while (isWhitespace(scanner.codepoint)) {
        scanner.step();
    }

    // `end` is the offset of the code point currently being looked at.
    const start = scanner.end;
    validURL: while (true) {
        switch (scanner.codepoint) {
            ')' => {
                // Slice before stepping so the ")" is not part of the URL.
                text.utf8 = contents[start..scanner.end];
                scanner.step();
                return text;
            },
            -1 => {
                const loc = logger.Loc{ .start = @intCast(i32, scanner.end) };
                scanner.log.addError(scanner.source, loc, "Expected \")\" to end URL token") catch {};
                text.valid = false;
                return text;
            },
            ' ', '\t', '\n', '\r', escLineFeed => {
                // Whitespace ends the URL; only more whitespace and then ")" may follow.
                const url_end = scanner.end;
                scanner.step();
                while (isWhitespace(scanner.codepoint)) {
                    scanner.step();
                }
                if (scanner.codepoint != ')') {
                    const loc = logger.Loc{ .start = @intCast(i32, scanner.end) };
                    scanner.log.addError(scanner.source, loc, "Expected \")\" to end URL token") catch {};
                    break :validURL;
                }
                text.utf8 = contents[start..url_end];
                scanner.step();
                return text;
            },
            '"', '\'', '(' => {
                const r = logger.Range{ .loc = logger.Loc{ .start = @intCast(i32, start) }, .len = @intCast(i32, scanner.end - start) };
                scanner.log.addRangeError(scanner.source, r, "Expected \")\" to end URL token") catch {};
                break :validURL;
            },
            '\\' => {
                text.needs_decode_escape = true;
                if (!scanner.isValidEscape()) {
                    const loc = logger.Loc{
                        .start = @intCast(i32, scanner.end),
                    };
                    scanner.log.addError(scanner.source, loc, "Expected \")\" to end URL token") catch {};
                    break :validURL;
                }
                // consumeEscape leaves the scanner on the code point after the escape,
                // so the loop must not step again.
                _ = scanner.consumeEscape();
            },
            else => {
                if (isNonPrintable(scanner.codepoint)) {
                    const r = logger.Range{
                        .loc = logger.Loc{
                            .start = @intCast(i32, start),
                        },
                        .len = 1,
                    };
                    scanner.log.addRangeError(scanner.source, r, "Invalid escape") catch {};
                    break :validURL;
                }
                scanner.step();
            },
        }
    }

    text.valid = false;
    // Consume the remnants of a bad url
    while (true) {
        switch (scanner.codepoint) {
            ')' => {
                text.utf8 = contents[start..scanner.end];
                scanner.step();
                return text;
            },
            -1 => {
                // Nothing left to step over; clamp in case the cursor ran past the input.
                const stop = std.math.min(scanner.end, contents.len);
                text.utf8 = if (start <= stop) contents[start..stop] else "";
                return text;
            },
            '\\' => {
                text.needs_decode_escape = true;
                if (scanner.isValidEscape()) {
                    _ = scanner.consumeEscape();
                    // Already positioned after the escape; stepping again could skip a ")".
                    continue;
                }
            },
            else => {},
        }
        scanner.step();
    }
    return text;
}
/// Scans the source from the current position to the end of input and hands it to
/// `scanner.writer.writeChunk` as a sequence of chunks:
///   * `t_url` for each `url(` that follows a ':' or ',' delimiter,
///   * `t_import` for each `@import ` rule (target plus everything up to and including ';'),
///   * `t_verbatim` for all the text in between.
///
/// Returns `error.SyntaxError` for an unterminated string, a malformed `@import` target, or
/// an `@import` without a terminating ';'. Errors from the writer are propagated.
pub fn next(scanner: *Scanner) !void {
    scanner.has_newline_before = scanner.end == 0;
    scanner.has_delimiter_before = false;
    scanner.step();
    restart: while (true) {
        // A fresh pending chunk starts at the code point currently under the cursor.
        var chunk = Chunk{
            .range = logger.Range{
                .loc = .{ .start = @intCast(i32, scanner.end) },
                .len = 0,
            },
            .content = .{
                .t_verbatim = .{},
            },
        };
        scanner.start = scanner.end;
        toplevel: while (true) {
            // We only care about two things.
            // 1. url()
            // 2. @import
            // To correctly parse url(), we need to verify that it is preceded by a colon or a comma.
            // We also need to parse strings and comments, or else we risk resolving comments like this /* url(hi.jpg) */
            switch (scanner.codepoint) {
                -1 => {
                    chunk.range.len = @intCast(i32, scanner.end) - chunk.range.loc.start;
                    chunk.content = .{ .t_verbatim = .{} };
                    try scanner.writer.writeChunk(chunk);
                    return;
                },
                '\t', '\n', '\r', escLineFeed => {
                    // Fall through to the step at the bottom of the loop. A bare `continue`
                    // here would re-dispatch the same code point forever.
                    scanner.has_newline_before = true;
                },
                // Ensure whitespace doesn't affect scanner.has_delimiter_before
                ' ' => {},
                ':', ',' => {
                    scanner.has_delimiter_before = true;
                },
                // this is a little hacky, but it should work since we're not parsing scopes
                '{', '}', ';' => {
                    scanner.has_delimiter_before = false;
                },
                'u', 'U' => {
                    // url() always appears on the property value side
                    // so we should ignore it if it's part of a different token
                    if (!scanner.has_delimiter_before) {
                        scanner.step();
                        continue :toplevel;
                    }
                    // `end` is the offset of the "u" itself (`current` is already one past it).
                    const url_start = scanner.end;
                    scanner.step();
                    switch (scanner.codepoint) {
                        'r', 'R' => {},
                        // Not "ur": re-dispatch the code point we stopped on.
                        else => continue :toplevel,
                    }
                    scanner.step();
                    switch (scanner.codepoint) {
                        'l', 'L' => {},
                        else => continue :toplevel,
                    }
                    scanner.step();
                    if (scanner.codepoint != '(') {
                        continue :toplevel;
                    }
                    // The scanner is still on the "(" when consumeURL is entered.
                    const url_text = scanner.consumeURL();

                    // flush the pending chunk
                    chunk.range.len = @intCast(i32, url_start) - chunk.range.loc.start;
                    chunk.content = .{ .t_verbatim = .{} };
                    try scanner.writer.writeChunk(chunk);

                    chunk.range.loc.start = @intCast(i32, url_start);
                    chunk.range.len = @intCast(i32, scanner.end) - chunk.range.loc.start;
                    // Replace the whole union value; assigning to `.t_url` directly would
                    // write an inactive field while the tag still says `t_verbatim`.
                    chunk.content = .{ .t_url = url_text };
                    try scanner.writer.writeChunk(chunk);
                    scanner.has_delimiter_before = false;
                    continue :restart;
                },
                '@' => {
                    const start = scanner.end;
                    // Match "import " one code point at a time. On a mismatch, the code point
                    // we stopped on is re-dispatched by the outer switch.
                    scanner.step();
                    if (scanner.codepoint != 'i') continue :toplevel;
                    scanner.step();
                    if (scanner.codepoint != 'm') continue :toplevel;
                    scanner.step();
                    if (scanner.codepoint != 'p') continue :toplevel;
                    scanner.step();
                    if (scanner.codepoint != 'o') continue :toplevel;
                    scanner.step();
                    if (scanner.codepoint != 'r') continue :toplevel;
                    scanner.step();
                    if (scanner.codepoint != 't') continue :toplevel;
                    scanner.step();
                    if (scanner.codepoint != ' ') continue :toplevel;

                    // Now that we know to expect an import url, we flush the chunk
                    chunk.range.len = @intCast(i32, start) - chunk.range.loc.start;
                    chunk.content = .{ .t_verbatim = .{} };
                    try scanner.writer.writeChunk(chunk);

                    // Don't write the .start until we know it's an @import rule
                    // We want to avoid messing with other rules
                    scanner.start = start;
                    // The import chunk begins at the "@", not where the flushed text began.
                    chunk.range.loc.start = @intCast(i32, start);

                    // "Imported rules must precede all other types of rule"
                    // https://developer.mozilla.org/en-US/docs/Web/CSS/@import
                    // @import url;
                    // @import url list-of-media-queries;
                    // @import url supports( supports-query );
                    // @import url supports( supports-query ) list-of-media-queries;
                    while (isWhitespace(scanner.codepoint)) {
                        scanner.step();
                    }
                    var import = Chunk.Import{
                        .text = .{
                            .utf8 = "",
                        },
                    };
                    switch (scanner.codepoint) {
                        // Mixed-case url() is supported: uRL(), URl(), ...
                        'u', 'U' => {
                            scanner.step();
                            switch (scanner.codepoint) {
                                'r', 'R' => {},
                                else => {
                                    scanner.log.addError(
                                        scanner.source,
                                        logger.Loc{ .start = @intCast(i32, scanner.end) },
                                        "Expected @import to start with a string or url()",
                                    ) catch {};
                                    return error.SyntaxError;
                                },
                            }
                            scanner.step();
                            switch (scanner.codepoint) {
                                'l', 'L' => {},
                                else => {
                                    scanner.log.addError(
                                        scanner.source,
                                        logger.Loc{ .start = @intCast(i32, scanner.end) },
                                        "Expected @import to start with a \", ' or url()",
                                    ) catch {};
                                    return error.SyntaxError;
                                },
                            }
                            scanner.step();
                            if (scanner.codepoint != '(') {
                                scanner.log.addError(
                                    scanner.source,
                                    logger.Loc{ .start = @intCast(i32, scanner.end) },
                                    "Expected \"(\" in @import url",
                                ) catch {};
                                return error.SyntaxError;
                            }
                            import.text = scanner.consumeURL();
                            // Record the url() form so the writer emits it as url(...) again.
                            import.url = true;
                        },
                        '"' => {
                            import.text.quote = .double;
                            if (scanner.consumeString('"')) |str| {
                                import.text.utf8 = str;
                            } else {
                                return error.SyntaxError;
                            }
                        },
                        '\'' => {
                            import.text.quote = .single;
                            if (scanner.consumeString('\'')) |str| {
                                import.text.utf8 = str;
                            } else {
                                return error.SyntaxError;
                            }
                        },
                        else => {
                            scanner.log.addError(
                                scanner.source,
                                logger.Loc{ .start = @intCast(i32, scanner.end) },
                                "Expected @import to start with a string or url()",
                            ) catch {};
                            return error.SyntaxError;
                        },
                    }

                    // Everything from here through the ";" is kept as the rule's suffix.
                    const suffix_start = scanner.end;
                    get_suffix: while (true) {
                        switch (scanner.codepoint) {
                            ';' => {
                                scanner.step();
                                import.suffix = scanner.source.contents[suffix_start..scanner.end];
                                scanner.has_delimiter_before = false;
                                break :get_suffix;
                            },
                            -1 => {
                                scanner.log.addError(
                                    scanner.source,
                                    logger.Loc{ .start = @intCast(i32, scanner.end) },
                                    "Expected \";\" at end of @import",
                                ) catch {};
                                // Stop here: stepping at end of input never leaves this loop.
                                return error.SyntaxError;
                            },
                            else => {},
                        }
                        scanner.step();
                    }

                    chunk.range.len = @intCast(i32, scanner.end) - chunk.range.loc.start;
                    chunk.content = .{ .t_import = import };
                    try scanner.writer.writeChunk(chunk);
                    continue :restart;
                },
                // We don't actually care what the values are here, we just want to avoid confusing strings for URLs.
                '\'' => {
                    scanner.has_delimiter_before = false;
                    if (scanner.consumeString('\'') == null) {
                        return error.SyntaxError;
                    }
                    // consumeString already moved past the closing quote; stepping again
                    // would silently drop the code point that follows the string.
                    continue :toplevel;
                },
                '"' => {
                    scanner.has_delimiter_before = false;
                    if (scanner.consumeString('"') == null) {
                        return error.SyntaxError;
                    }
                    continue :toplevel;
                },
                // Skip comments, so that quotes and url() inside /* ... */ are ignored.
                '/' => {
                    scanner.step();
                    // A lone "/" is not a comment: re-dispatch whatever follows it.
                    if (scanner.codepoint != '*') continue :toplevel;
                    scanner.step();
                    // An unterminated comment simply runs to end of input, where the
                    // EOF branch flushes the pending chunk.
                    while (scanner.codepoint != -1) {
                        const was_star = scanner.codepoint == '*';
                        scanner.step();
                        if (was_star and scanner.codepoint == '/') {
                            scanner.step();
                            break;
                        }
                    }
                    continue :toplevel;
                },
                else => {
                    scanner.has_delimiter_before = false;
                },
            }
            scanner.step();
        }
    }
}
/// Consumes an escape sequence and returns the code point it denotes.
///
/// The first `step()` moves off the code point the scanner was on when called (the callers in
/// this file invoke it while on the backslash). Up to six hex digits are read, followed by one
/// optional whitespace code point; any other code point is returned as-is.
/// Zero, surrogates (U+D800..U+DFFF) and values above U+10FFFF map to U+FFFD, as does an
/// escape cut off by the end of input. On return the scanner is on the code point after the escape.
pub fn consumeEscape(scanner: *Scanner) CodePoint {
    scanner.step();
    const c = scanner.codepoint;
    if (isHex(c)) |first_digit| {
        var hex = first_digit;
        scanner.step();
        value: {
            // At most five more digits, for six in total.
            comptime var i: usize = 0;
            inline while (i < 5) : (i += 1) {
                if (isHex(scanner.codepoint)) |digit| {
                    scanner.step();
                    hex = hex * 16 + digit;
                } else {
                    break :value;
                }
            }
            break :value;
        }
        if (isWhitespace(scanner.codepoint)) {
            scanner.step();
        }
        return switch (hex) {
            // U+10FFFF itself is the largest valid code point, so the invalid range starts at 0x110000.
            0, 0xD800...0xDFFF, 0x110000...std.math.maxInt(CodePoint) => replacementCharacter,
            else => hex,
        };
    }
    if (c == -1) return replacementCharacter;
    scanner.step();
    return c;
}
/// Returns the bytes of the code point that starts at `it.current`, or "" when there is none
/// (end of input, a sequence length of zero, or a sequence cut off by the end of input).
///
/// With `advance == true` the cursor moves: `it.end` becomes the offset where the code point
/// starts and `it.current` the offset just past it (never beyond the end of the input).
/// With `advance == false` this is a pure peek and no field is modified.
inline fn nextCodepointSlice(it: *Scanner, comptime advance: bool) []const u8 {
    @setRuntimeSafety(false);
    const contents = it.source.contents;
    // Never read the lead byte past the end of the input.
    if (it.current >= contents.len) {
        if (advance) {
            it.end = it.current;
        }
        return "";
    }
    const cp_len = strings.utf8ByteSequenceLength(contents[it.current]);
    const begin = it.current;
    const stop = begin + cp_len;
    if (advance) {
        it.end = begin;
        it.current = std.math.min(stop, contents.len);
    }
    // Slice from where the code point starts. Deriving the slice from `it.current` only works
    // after advancing; in peek mode it would yield the bytes BEFORE the cursor.
    return if (stop <= contents.len) contents[begin..stop] else "";
}
/// Advances past the next code point and returns it.
/// Returns -1 when `nextCodepointSlice` yields no bytes (e.g. at end of input).
/// A multi-byte sequence that fails to decode yields U+FFFD: the bytes come straight from the
/// file being scanned, so malformed UTF-8 must not be treated as unreachable.
pub inline fn nextCodepoint(it: *Scanner) CodePoint {
    const slice = it.nextCodepointSlice(true);
    @setRuntimeSafety(false);
    return switch (slice.len) {
        0 => -1,
        1 => @intCast(CodePoint, slice[0]),
        2 => @intCast(CodePoint, std.unicode.utf8Decode2(slice) catch replacementCharacter),
        3 => @intCast(CodePoint, std.unicode.utf8Decode3(slice) catch replacementCharacter),
        4 => @intCast(CodePoint, std.unicode.utf8Decode4(slice) catch replacementCharacter),
        else => unreachable,
    };
}
};
}
/// True for space, tab, '\n', '\r' and the 0x0C code point (`escLineFeed`).
fn isWhitespace(c: CodePoint) bool {
    if (c == ' ' or c == '\t') return true;
    if (c == '\n' or c == '\r') return true;
    return c == escLineFeed;
}
/// True for the control code points 0x00-0x08, 0x0B, 0x0E-0x1F and 0x7F.
fn isNonPrintable(c: CodePoint) bool {
    if (c < 0) return false;
    if (c <= 0x08) return true;
    if (c == 0x0B or c == 0x7F) return true;
    return c >= 0x0E and c <= 0x1F;
}
/// Returns the numeric value (0-15) of `c` when it is a hexadecimal digit in either case,
/// and null otherwise.
pub fn isHex(c: CodePoint) ?CodePoint {
    if (c >= '0' and c <= '9') return c - '0';
    if (c >= 'a' and c <= 'f') return c - 'a' + 10;
    if (c >= 'A' and c <= 'F') return c - 'A' + 10;
    return null;
}
pub fn NewWriter(
comptime WriterType: type,
comptime LinkerType: type,
comptime import_path_format: Options.BundleOptions.ImportPathFormat,
) type {
return struct {
const Writer = @This();
const Scanner = NewScanner(*Writer);
ctx: WriterType,
linker: LinkerType,
source: *const logger.Source,
written: usize = 0,
/// Creates a writer for `source` that emits through `ctx` and resolves paths with `linker`.
/// The written-byte counter starts at zero.
pub fn init(
    source: *const logger.Source,
    ctx: WriterType,
    linker: LinkerType,
) Writer {
    return Writer{
        .source = source,
        .linker = linker,
        .ctx = ctx,
        .written = 0,
    };
}
/// Creates a scanner over `writer.source` that feeds its chunks back into this writer,
/// then runs it. Any error from the scan is returned to the caller.
pub fn run(writer: *Writer, log: *logger.Log, allocator: *std.mem.Allocator) !void {
    var css_scanner = Scanner.init(log, allocator, writer, writer.source);
    return css_scanner.next();
}
/// Writes `str`, wrapped in the given quote character when `quote` is not `.none`,
/// and adds every byte written to `writer.written`.
fn writeString(writer: *Writer, str: string, quote: Chunk.TextContent.Quote) !void {
    if (quote == .none) {
        try writer.ctx.writeAll(str);
        writer.written += str.len;
        return;
    }

    const delimiter: string = if (quote == .single) "'" else "\"";

    try writer.ctx.writeAll(delimiter);
    writer.written += delimiter.len;
    try writer.ctx.writeAll(str);
    writer.written += str.len;
    try writer.ctx.writeAll(delimiter);
    writer.written += delimiter.len;
}
/// Writes `url_str` as a `url(...)` token, using the quote style recorded in `text.quote`,
/// and adds every byte written to `writer.written`.
fn writeURL(writer: *Writer, url_str: string, text: Chunk.TextContent) !void {
    const opener: string = switch (text.quote) {
        .none => "url(",
        .single => "url('",
        .double => "url(\"",
    };
    const closer: string = switch (text.quote) {
        .none => ")",
        .single => "')",
        .double => "\")",
    };

    try writer.ctx.writeAll(opener);
    writer.written += opener.len;
    try writer.ctx.writeAll(url_str);
    writer.written += url_str.len;
    try writer.ctx.writeAll(closer);
    writer.written += closer.len;
}
/// Emits one chunk produced by the scanner.
///   * `t_url`: the URL is passed to `linker.resolveCSS` and written back as `url(...)`.
///   * `t_import`: the target is passed to `linker.resolveCSS`, then "@import ", the target
///     (as `url(...)` or as a string), the suffix and a newline are written.
///   * `t_verbatim`: the chunk's byte range is copied through unchanged, using
///     `ctx.copyFileRange` when `WriterType` declares it and `ctx.writeAll` otherwise.
/// `writer.written` is advanced by the number of bytes emitted.
pub fn writeChunk(writer: *Writer, chunk: Chunk) !void {
    switch (chunk.content) {
        .t_url => |url_text| {
            const resolved = try writer.linker.resolveCSS(
                writer.source.path,
                url_text.utf8,
                chunk.range,
                import_record.ImportKind.url,
                import_path_format,
            );
            try writer.writeURL(resolved, url_text);
        },
        .t_import => |rule| {
            const resolved = try writer.linker.resolveCSS(
                writer.source.path,
                rule.text.utf8,
                chunk.range,
                import_record.ImportKind.at,
                import_path_format,
            );

            const keyword = "@import ";
            try writer.ctx.writeAll(keyword);
            writer.written += keyword.len;

            if (rule.url) {
                try writer.writeURL(resolved, rule.text);
            } else {
                try writer.writeString(resolved, rule.text.quote);
            }

            try writer.ctx.writeAll(rule.suffix);
            writer.written += rule.suffix.len;
            try writer.ctx.writeAll("\n");
            writer.written += 1;
        },
        .t_verbatim => {
            // Counted on scope exit, whichever branch below runs.
            defer writer.written += @intCast(usize, chunk.range.len);
            if (comptime std.meta.trait.hasFn("copyFileRange")(WriterType)) {
                // Second argument is start + length of the range.
                try writer.ctx.copyFileRange(
                    @intCast(usize, chunk.range.loc.start),
                    @intCast(usize, chunk.range.len + chunk.range.loc.start),
                );
            } else {
                const offset = @intCast(usize, chunk.range.loc.start);
                const len = @intCast(usize, chunk.range.len);
                try writer.ctx.writeAll(writer.source.contents[offset..][0..len]);
            }
        },
    }
}
};
}