Former-commit-id: 4d6a8f598a
This commit is contained in:
Jarred Sumner
2021-05-18 20:06:08 -07:00
parent 7396fae4e2
commit 2fac623977
7 changed files with 789 additions and 89 deletions

View File

@@ -22,6 +22,7 @@ pub const isWindows = std.Target.current.os.tag == .windows;
pub const enableTracing = true;
pub const isDebug = std.builtin.Mode.Debug == std.builtin.mode;
pub const isTest = std.builtin.is_test;
pub const Output = struct {
var source: *Source = undefined;

View File

@@ -1058,16 +1058,20 @@ pub const Stmt = struct {
}
pub fn empty() Stmt {
return Stmt.init(&Stmt.None, logger.Loc.Empty);
return Stmt.init(Stmt.None, logger.Loc.Empty);
}
var None = S.Empty{};
pub fn init(origData: anytype, loc: logger.Loc) Stmt {
if (@typeInfo(@TypeOf(origData)) != .Pointer) {
if (@typeInfo(@TypeOf(origData)) != .Pointer and @TypeOf(origData) != S.Empty) {
@compileError("Stmt.init needs a pointer.");
}
if (@TypeOf(origData) == S.Empty) {
return Stmt{ .loc = loc, .data = Data{ .s_empty = S.Empty{} } };
}
switch (@TypeOf(origData.*)) {
S.Block => {
return Stmt.comptime_init("s_block", S.Block, origData, loc);
@@ -1210,7 +1214,7 @@ pub const Stmt = struct {
return Stmt.comptime_alloc(allocator, "s_do_while", S.DoWhile, origData, loc);
},
S.Empty => {
return Stmt.comptime_alloc(allocator, "s_empty", S.Empty, origData, loc);
return Stmt{ .loc = loc, .data = Data{ .s_empty = S.Empty{} } };
},
S.Enum => {
return Stmt.comptime_alloc(allocator, "s_enum", S.Enum, origData, loc);
@@ -1336,7 +1340,7 @@ pub const Stmt = struct {
s_debugger: *S.Debugger,
s_directive: *S.Directive,
s_do_while: *S.DoWhile,
s_empty: *S.Empty,
s_empty: S.Empty,
s_enum: *S.Enum,
s_export_clause: *S.ExportClause,
s_export_default: *S.ExportDefault,
@@ -1382,7 +1386,12 @@ pub const Stmt = struct {
pub const Expr = struct {
loc: logger.Loc,
data: Data,
/// Replace this expression with a no-op placeholder (`E.Missing`),
/// preserving the original source location for diagnostics.
/// Does not mutate the receiver, hence the const pointer.
pub fn toEmpty(expr: *const Expr) Expr {
return Expr{ .data = .{ .e_missing = E.Missing{} }, .loc = expr.loc };
}
/// Whether this expression is the `E.Missing` placeholder (as produced by
/// `toEmpty`). Read-only, hence the const pointer.
pub fn isEmpty(expr: *const Expr) bool {
return std.meta.activeTag(expr.data) == .e_missing;
}
pub const Query = struct { expr: Expr, loc: logger.Loc };
pub fn getProperty(expr: *const Expr, name: string) ?Query {
@@ -1829,9 +1838,7 @@ pub const Expr = struct {
return Expr{ .loc = loc, .data = Data{ .e_jsx_element = dat } };
},
E.Missing => {
var dat = allocator.create(E.Missing) catch unreachable;
dat.* = st;
return Expr{ .loc = loc, .data = Data{ .e_missing = dat } };
return Expr{ .loc = loc, .data = Data{ .e_missing = E.Missing{} } };
},
E.Number => {
var dat = allocator.create(E.Number) catch unreachable;
@@ -2460,7 +2467,7 @@ pub const Expr = struct {
e_import_identifier: *E.ImportIdentifier,
e_private_identifier: *E.PrivateIdentifier,
e_jsx_element: *E.JSXElement,
e_missing: *E.Missing,
e_missing: E.Missing,
e_number: *E.Number,
e_big_int: *E.BigInt,
e_object: *E.Object,

View File

@@ -191,25 +191,258 @@ pub const Lexer = struct {
}
}
fn parseStringLiteral(lexer: *LexerType) !void {
var quote: CodePoint = lexer.code_point;
var needs_slow_path = false;
var suffixLen: usize = 1;
pub fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, buf: anytype) !void {
var iter = CodepointIterator{ .bytes = text[start..], .i = 0 };
const start_length = buf.items.len;
while (iter.nextCodepoint()) |c| {
const width = iter.width;
if (quote != '`') {
lexer.token = T.t_string_literal;
} else if (lexer.rescan_close_brace_as_template_token) {
lexer.token = T.t_template_tail;
} else {
lexer.token = T.t_no_substitution_template_literal;
switch (c) {
'\r' => {
// From the specification:
//
// 11.8.6.1 Static Semantics: TV and TRV
//
// TV excludes the code units of LineContinuation while TRV includes
// them. <CR><LF> and <CR> LineTerminatorSequences are normalized to
// <LF> for both TV and TRV. An explicit EscapeSequence is needed to
// include a <CR> or <CR><LF> sequence.
// Convert '\r\n' into '\n'
if (iter.i < text.len and text[iter.i] == '\n') {
iter.i += 1;
}
// Convert '\r' into '\n'
buf.append('\n') catch unreachable;
continue;
},
'\\' => {
const c2 = iter.nextCodepoint() orelse return;
const width2 = iter.width;
switch (c2) {
'b' => {
buf.append(std.mem.readIntNative(u16, "\\b")) catch unreachable;
continue;
},
'f' => {
buf.append(std.mem.readIntNative(u16, "\\f")) catch unreachable;
continue;
},
'n' => {
buf.append(std.mem.readIntNative(u16, "\\n")) catch unreachable;
continue;
},
'r' => {
buf.append(std.mem.readIntNative(u16, "\\r")) catch unreachable;
continue;
},
't' => {
buf.append(std.mem.readIntNative(u16, "\\t")) catch unreachable;
continue;
},
'v' => {
if (lexer.json_options != null) {
lexer.end = start + iter.i - width2;
try lexer.syntaxError();
}
buf.append(std.mem.readIntNative(u16, "\\v")) catch unreachable;
continue;
},
'0'...'7' => {
try lexer.addUnsupportedSyntaxError("Legacy octal literals are not supported.");
},
'8', '9' => {
try lexer.addUnsupportedSyntaxError("Legacy octal literals are not supported.");
},
'x' => {
if (lexer.json_options != null) {
lexer.end = start + iter.i - width2;
try lexer.syntaxError();
}
var value: CodePoint = 0;
var c3: CodePoint = 0;
var width3: u3 = 0;
comptime var j: usize = 0;
inline while (j < 2) : (j += 1) {
c3 = iter.nextCodepoint() orelse return lexer.syntaxError();
width3 = iter.width;
switch (c3) {
'0'...'9' => {
value = value * 16 | (c3 - '0');
},
'a'...'f' => {
value = value * 16 | (c3 + 10 - 'a');
},
'A'...'F' => {
value = value * 16 | (c3 + 10 - 'A');
},
else => {
lexer.end = start + iter.i - width3;
return lexer.syntaxError();
},
}
}
iter.c = value;
},
'u' => {
// We're going to make this an i64 so we don't risk integer overflows
// when people do weird things
var value: i64 = 0;
var c3 = iter.nextCodepoint() orelse return lexer.syntaxError();
var width3 = iter.width;
// variable-length
if (c3 == '{') {
if (lexer.json_options != null) {
lexer.end = start + iter.i - width2;
try lexer.syntaxError();
}
const hex_start = iter.i - width - width2 - width3;
var is_first = true;
var is_out_of_range = false;
variableLength: while (true) {
c3 = iter.nextCodepoint() orelse break :variableLength;
switch (c3) {
'0'...'9' => {
value = value * 16 | (c3 - '0');
},
'a'...'f' => {
value = value * 16 | (c3 + 10 - 'a');
},
'A'...'F' => {
value = value * 16 | (c3 + 10 - 'A');
},
'}' => {
if (is_first) {
lexer.end = start + iter.i - width3;
return lexer.syntaxError();
}
break :variableLength;
},
else => {
lexer.end = start + iter.i - width3;
return lexer.syntaxError();
},
}
// '\U0010FFFF
// copied from golang utf8.MaxRune
if (value > 1114111) {
is_out_of_range = true;
}
is_first = false;
}
if (is_out_of_range) {
try lexer.addRangeError(
.{ .loc = .{ .start = @intCast(i32, start + hex_start) }, .len = @intCast(i32, (iter.i - hex_start)) },
"Unicode escape sequence is out of range",
.{},
true,
);
return;
}
// fixed-length
} else {
// Fixed-length
comptime var j: usize = 0;
inline while (j < 4) : (j += 1) {
switch (c3) {
'0'...'9' => {
value = value * 16 | (c3 - '0');
},
'a'...'f' => {
value = value * 16 | (c3 + 10 - 'a');
},
'A'...'F' => {
value = value * 16 | (c3 + 10 - 'A');
},
else => {
lexer.end = start + iter.i - width3;
return lexer.syntaxError();
},
}
if (j < 3) {
c3 = iter.nextCodepoint() orelse return lexer.syntaxError();
width3 = iter.width;
}
}
}
iter.c = @truncate(CodePoint, value);
},
'\r' => {
if (lexer.json_options != null) {
lexer.end = start + iter.i - width2;
try lexer.syntaxError();
}
// Ignore line continuations. A line continuation is not an escaped newline.
if (iter.i < text.len and text[iter.i + 1] == '\n') {
// Make sure Windows CRLF counts as a single newline
iter.i += 1;
}
continue;
},
'\n', 0x2028, 0x2029 => {
if (lexer.json_options != null) {
lexer.end = start + iter.i - width2;
try lexer.syntaxError();
}
// Ignore line continuations. A line continuation is not an escaped newline.
continue;
},
else => {
if (lexer.json_options != null) {
switch (c2) {
'"', '\\', '/' => {},
else => {
lexer.end = start + iter.i - width2;
try lexer.syntaxError();
},
}
}
iter.c = c2;
},
}
},
else => {},
}
if (iter.c <= 0xFFFF) {
buf.append(@intCast(u16, c)) catch unreachable;
} else {
iter.c -= 0x10000;
buf.ensureUnusedCapacity(2) catch unreachable;
buf.appendAssumeCapacity(@intCast(u16, 0xD800 + ((iter.c >> 10) & 0x3FF)));
buf.appendAssumeCapacity(@intCast(u16, 0xDC00 + (iter.c & 0x3FF)));
}
}
try lexer.step();
}
pub const InnerStringLiteral = packed struct { suffix_len: u3, needs_slow_path: bool };
fn parseStringLiteralInnter(lexer: *LexerType, comptime quote: CodePoint) !InnerStringLiteral {
var needs_slow_path = false;
var suffix_len: u3 = 1;
stringLiteral: while (true) {
switch (lexer.code_point) {
'\\' => {
needs_slow_path = true;
try lexer.step();
// Skip slow path for \n in a string literal
// This is pretty common, shows up in e.g. React
// Example code: array.split("\n")
// We don't need to decode as UTF16 for that. We know it's just a newline char.
needs_slow_path = lexer.code_point != 'n';
// Handle Windows CRLF
if (lexer.code_point == '\r' and lexer.json_options != null) {
@@ -245,7 +478,7 @@ pub const Lexer = struct {
if (quote == '`') {
try lexer.step();
if (lexer.code_point == '{') {
suffixLen = 2;
suffix_len = 2;
try lexer.step();
if (lexer.rescan_close_brace_as_template_token) {
lexer.token = T.t_template_middle;
@@ -257,12 +490,15 @@ pub const Lexer = struct {
continue :stringLiteral;
}
},
// exit condition
quote => {
try lexer.step();
break;
},
else => {
if (quote == lexer.code_point) {
try lexer.step();
break :stringLiteral;
}
// Non-ASCII strings need the slow path
if (lexer.code_point >= 0x80) {
needs_slow_path = true;
@@ -274,19 +510,41 @@ pub const Lexer = struct {
try lexer.step();
}
return InnerStringLiteral{ .needs_slow_path = needs_slow_path, .suffix_len = suffix_len };
}
fn parseStringLiteral(lexer: *LexerType) !void {
var quote: CodePoint = lexer.code_point;
if (quote != '`') {
lexer.token = T.t_string_literal;
} else if (lexer.rescan_close_brace_as_template_token) {
lexer.token = T.t_template_tail;
} else {
lexer.token = T.t_no_substitution_template_literal;
}
try lexer.step();
var string_literal_details = switch (quote) {
'`' => try lexer.parseStringLiteralInnter('`'),
'\'' => try lexer.parseStringLiteralInnter('\''),
'"' => try lexer.parseStringLiteralInnter('"'),
else => unreachable,
};
// Reset string literal
lexer.string_literal_slice = lexer.source.contents[lexer.start + 1 .. lexer.end - suffixLen];
lexer.string_literal_is_ascii = !needs_slow_path;
lexer.string_literal_slice = lexer.source.contents[lexer.start + 1 .. lexer.end - string_literal_details.suffix_len];
lexer.string_literal_is_ascii = !string_literal_details.needs_slow_path;
lexer.string_literal_buffer.shrinkRetainingCapacity(0);
if (needs_slow_path) {
lexer.string_literal_buffer.ensureTotalCapacity(lexer.string_literal_slice.len) catch unreachable;
var slice = lexer.string_literal_buffer.allocatedSlice();
lexer.string_literal_buffer.items = slice[0..strings.toUTF16Buf(lexer.string_literal_slice, slice)];
if (string_literal_details.needs_slow_path) {
lexer.string_literal_buffer.ensureUnusedCapacity(lexer.string_literal_slice.len) catch unreachable;
try lexer.decodeEscapeSequences(0, lexer.string_literal_slice, &lexer.string_literal_buffer);
}
if (quote == '\'' and lexer.json_options != null) {
try lexer.addRangeError(lexer.range(), "JSON strings must use double quotes", .{}, true);
}
// for (text)
// // if (needs_slow_path) {
// // // Slow path
@@ -333,8 +591,131 @@ pub const Lexer = struct {
return Error.SyntaxError;
}
pub fn scanIdentifierWithEscapes(self: *LexerType) !void {
try self.addUnsupportedSyntaxError("escape sequence");
pub const IdentifierKind = enum { normal, private };
pub const ScanResult = struct { token: T, contents: string };
threadlocal var small_escape_sequence_buffer: [4096]u16 = undefined;
// A minimal ArrayList(u16) look-alike backed by a caller-supplied fixed
// buffer, so `decodeEscapeSequences` can write into a thread-local buffer
// through the same `buf.append(...)` interface it uses for a real ArrayList.
//
// BUG FIX: the debug assertions were inverted (`items.len < i`), which meant
// they failed on the very first append in debug builds. They now check that
// the write stays inside the backing buffer.
const FakeArrayList16 = struct {
items: []u16,
i: usize = 0,
/// Append one code unit; asserts (debug builds only) that capacity remains.
pub fn append(fake: *FakeArrayList16, value: u16) !void {
std.debug.assert(fake.i < fake.items.len);
fake.items[fake.i] = value;
fake.i += 1;
}
/// Same as `append`; capacity was supposedly ensured by the caller.
pub fn appendAssumeCapacity(fake: *FakeArrayList16, value: u16) void {
std.debug.assert(fake.i < fake.items.len);
fake.items[fake.i] = value;
fake.i += 1;
}
/// No-op capacity check: asserts `int` more items fit in the fixed buffer.
pub fn ensureUnusedCapacity(fake: *FakeArrayList16, int: anytype) !void {
std.debug.assert(fake.i + int <= fake.items.len);
}
};
threadlocal var large_escape_sequence_list: std.ArrayList(u16) = undefined;
threadlocal var large_escape_sequence_list_loaded: bool = false;
// This is an edge case that doesn't really exist in the wild, so it doesn't
// need to be as fast as possible.
// Slow path for identifiers that contain "\u..." escape sequences.
// Scans to the end of the identifier, decodes the escapes to UTF-16 via
// `decodeEscapeSequences`, validates that the decoded text is still a legal
// identifier, and classifies the token: decoded keywords become
// .t_escaped_keyword (not the keyword token itself — see the note near the
// bottom). For `kind == .private` the leading '#' is stripped before
// validation.
pub fn scanIdentifierWithEscapes(lexer: *LexerType, comptime kind: IdentifierKind) !ScanResult {
var result = ScanResult{ .token = .t_end_of_file, .contents = "" };
// First pass: scan over the identifier to see how long it is
while (true) {
// Scan a unicode escape sequence. There is at least one because that's
// what caused us to get on this slow path in the first place.
if (lexer.code_point == '\\') {
try lexer.step();
if (lexer.code_point != 'u') {
try lexer.syntaxError();
}
try lexer.step();
if (lexer.code_point == '{') {
// Variable-length escape, e.g. "\u{10000}": hex digits until '}'.
try lexer.step();
while (lexer.code_point != '}') {
switch (lexer.code_point) {
'0'...'9', 'a'...'f', 'A'...'F' => {
try lexer.step();
},
else => {
try lexer.syntaxError();
},
}
}
try lexer.step();
} else {
// Fixed-length escape: exactly four hex digits, e.g. "\u0076".
comptime var j: usize = 0;
inline while (j < 4) : (j += 1) {
switch (lexer.code_point) {
'0'...'9', 'a'...'f', 'A'...'F' => {
try lexer.step();
},
else => {
try lexer.syntaxError();
},
}
}
}
continue;
}
if (!isIdentifierContinue(lexer.code_point)) {
break;
}
try lexer.step();
}
// Second pass: re-use our existing escape sequence parser
var original_text = lexer.raw();
if (original_text.len < 1024) {
// Short identifiers decode into a fixed thread-local buffer (no allocation).
var buf = FakeArrayList16{ .items = &small_escape_sequence_buffer, .i = 0 };
try lexer.decodeEscapeSequences(lexer.start, original_text, &buf);
result.contents = lexer.utf16ToString(buf.items[0..buf.i]);
} else {
// Longer identifiers fall back to a lazily-initialized thread-local list.
if (!large_escape_sequence_list_loaded) {
large_escape_sequence_list = try std.ArrayList(u16).initCapacity(lexer.allocator, original_text.len);
large_escape_sequence_list_loaded = true;
}
large_escape_sequence_list.shrinkRetainingCapacity(0);
try lexer.decodeEscapeSequences(lexer.start, original_text, &large_escape_sequence_list);
result.contents = lexer.utf16ToString(large_escape_sequence_list.items);
}
var identifier = result.contents;
if (kind == .private) {
// Private names begin with '#'; drop it before validating the identifier.
identifier = result.contents[1..];
}
if (!isIdentifier(identifier)) {
try lexer.addRangeError(
.{ .loc = logger.usize2Loc(lexer.start), .len = @intCast(i32, lexer.end - lexer.start) },
"Invalid identifier: \"{s}\"",
.{result.contents},
true,
);
}
result.contents = identifier;
// Escaped keywords are not allowed to work as actual keywords, but they are
// allowed wherever we allow identifiers or keywords. For example:
//
// // This is an error (equivalent to "var var;")
// var \u0076\u0061\u0072;
//
// // This is an error (equivalent to "var foo;" except for this rule)
// \u0076\u0061\u0072 foo;
//
// // This is fine (equivalent to "foo.var;")
// foo.\u0076\u0061\u0072;
//
result.token = if (Keywords.has(result.contents)) .t_escaped_keyword else .t_identifier;
// const text = lexer.decodeEscapeSequences(lexer.start, lexer.raw(), )
return result;
}
pub fn debugInfo(self: *LexerType) void {
@@ -462,31 +843,46 @@ pub const Lexer = struct {
'#' => {
if (lexer.start == 0 and lexer.source.contents[1] == '!') {
try lexer.addUnsupportedSyntaxError("#!hashbang is not supported yet.");
return;
}
try lexer.step();
if (!isIdentifierStart(lexer.code_point)) {
try lexer.syntaxError();
}
try lexer.step();
if (isIdentifierStart(lexer.code_point)) {
try lexer.step();
while (isIdentifierContinue(lexer.code_point)) {
// "#!/usr/bin/env node"
lexer.token = .t_hashbang;
hashbang: while (true) {
try lexer.step();
switch (lexer.code_point) {
'\r', '\n', 0x2028, 0x2029 => {
break :hashbang;
},
-1 => {
break :hashbang;
},
else => {},
}
}
lexer.identifier = lexer.raw();
} else {
try lexer.step();
if (lexer.code_point == '\\') {
try lexer.scanIdentifierWithEscapes();
const scan_result = try lexer.scanIdentifierWithEscapes(.private);
lexer.identifier = scan_result.contents;
lexer.token = T.t_private_identifier;
// lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier);
} else {
lexer.token = T.t_private_identifier;
lexer.identifier = lexer.raw();
if (!isIdentifierStart(lexer.code_point)) {
try lexer.syntaxError();
}
try lexer.step();
while (isIdentifierContinue(lexer.code_point)) {
try lexer.step();
}
if (lexer.code_point == '\\') {
const scan_result = try lexer.scanIdentifierWithEscapes(.private);
lexer.identifier = scan_result.contents;
lexer.token = T.t_private_identifier;
} else {
lexer.token = T.t_private_identifier;
lexer.identifier = lexer.raw();
}
break;
}
break;
}
},
'\r', '\n', 0x2028, 0x2029 => {
@@ -966,7 +1362,9 @@ pub const Lexer = struct {
}
if (lexer.code_point == '\\') {
try lexer.scanIdentifierWithEscapes();
const scan_result = try lexer.scanIdentifierWithEscapes(.normal);
lexer.identifier = scan_result.contents;
lexer.token = scan_result.token;
} else {
const contents = lexer.raw();
lexer.identifier = contents;
@@ -975,8 +1373,9 @@ pub const Lexer = struct {
},
'\\' => {
// TODO: normal
try lexer.scanIdentifierWithEscapes();
const scan_result = try lexer.scanIdentifierWithEscapes(.normal);
lexer.identifier = scan_result.contents;
lexer.token = scan_result.token;
},
'.', '0'...'9' => {
@@ -996,8 +1395,9 @@ pub const Lexer = struct {
try lexer.step();
}
if (lexer.code_point == '\\') {
// lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier);
const scan_result = try lexer.scanIdentifierWithEscapes(.normal);
lexer.identifier = scan_result.contents;
lexer.token = scan_result.token;
} else {
lexer.token = T.t_identifier;
lexer.identifier = lexer.raw();
@@ -2143,26 +2543,114 @@ pub fn isIdentifierUTF16(text: JavascriptString) bool {
return true;
}
/// Forward iterator over the Unicode codepoints of a UTF-8 byte slice.
/// NOTE(review): assumes the input is valid UTF-8 — every decode error is
/// swallowed with `catch unreachable`, which is a panic in safe builds and
/// undefined behavior in ReleaseFast. Confirm callers only feed it
/// already-validated source text.
pub const CodepointIterator = struct {
bytes: []const u8,
// Byte offset of the next codepoint to decode.
i: usize,
// Width in bytes (1-4) of the most recently decoded codepoint.
width: u3 = 0,
// The most recently decoded codepoint.
c: CodePoint = 0,
/// Advance past the next codepoint and return its raw bytes, or null at end.
pub fn nextCodepointSlice(it: *CodepointIterator) ?[]const u8 {
if (it.i >= it.bytes.len) {
return null;
}
const cp_len = std
.unicode.utf8ByteSequenceLength(it.bytes[it.i]) catch unreachable;
it.i += cp_len;
return it.bytes[it.i - cp_len .. it.i];
}
/// Decode and return the next codepoint (also stored in `it.c` / `it.width`),
/// or null at end of input.
pub fn nextCodepoint(it: *CodepointIterator) ?CodePoint {
const slice = it.nextCodepointSlice() orelse return null;
it.width = @intCast(u3, slice.len);
it.c = switch (it.width) {
1 => @intCast(CodePoint, slice[0]),
2 => @intCast(CodePoint, std.unicode.utf8Decode2(slice) catch unreachable),
3 => @intCast(CodePoint, std.unicode.utf8Decode3(slice) catch unreachable),
4 => @intCast(CodePoint, std.unicode.utf8Decode4(slice) catch unreachable),
else => unreachable,
};
return it.c;
}
/// Look ahead at the next n codepoints without advancing the iterator.
/// If fewer than n codepoints are available, then return the remainder of the string.
pub fn peek(it: *CodepointIterator, n: usize) []const u8 {
// `nextCodepointSlice` only mutates `i`; the defer rolls it back so the
// iterator position is unchanged on every return path.
const original_i = it.i;
defer it.i = original_i;
var end_ix = original_i;
var found: usize = 0;
while (found < n) : (found += 1) {
const next_codepoint = it.nextCodepointSlice() orelse return it.bytes[original_i..];
end_ix += next_codepoint.len;
}
return it.bytes[original_i..end_ix];
}
};
// TODO: implement this to actually work right
// this fn is a stub!
pub fn rangeOfIdentifier(source: *const Source, loc: logger.Loc) logger.Range {
const text = source.contents[loc.toUsize()..];
var r = logger.Range{ .loc = loc, .len = 0 };
const offset = @intCast(usize, loc.start);
var i: usize = 0;
for (source.contents[offset..]) |c| {
if (isIdentifierStart(@as(CodePoint, c))) {
for (source.contents[offset + i ..]) |c_| {
if (!isIdentifierContinue(c_)) {
r.len = std.math.lossyCast(i32, i);
return r;
if (text.len == 0) {
return r;
}
var iter = CodepointIterator{ .bytes = text, .i = 0 };
var c = @intCast(CodePoint, iter.nextCodepoint() orelse unreachable);
// Handle private names
if (c == '#') {
c = @intCast(CodePoint, iter.nextCodepoint() orelse {
r.len = 1;
return r;
});
}
if (isIdentifierStart(c) or c == '\\') {
defer r.len = @intCast(i32, iter.i);
while (iter.nextCodepoint()) |code_point| {
if (code_point == '\\') {
// Search for the end of the identifier
// Skip over bracketed unicode escapes such as "\u{10000}"
if (iter.i + 2 < text.len and text[iter.i + 1] == 'u' and text[iter.i + 2] == '{') {
iter.i += 2;
while (iter.i < text.len) {
if (text[iter.i] == '}') {
iter.i += 1;
break;
}
iter.i += 1;
}
}
i += 1;
} else if (!isIdentifierContinue(code_point)) {
return r;
}
}
i += 1;
}
// const offset = @intCast(usize, loc.start);
// var i: usize = 0;
// for (text) |c| {
// if (isIdentifierStart(@as(CodePoint, c))) {
// for (source.contents[offset + i ..]) |c_| {
// if (!isIdentifierContinue(c_)) {
// r.len = std.math.lossyCast(i32, i);
// return r;
// }
// i += 1;
// }
// }
// i += 1;
// }
return r;
}

View File

@@ -201,7 +201,7 @@ pub const StrictModeReservedWords = std.ComptimeStringMap(bool, .{
.{ "yield", true },
});
pub const CodePoint = i22;
pub const CodePoint = i32;
pub const PropertyModifierKeyword = enum {
p_abstract,

View File

@@ -520,6 +520,155 @@ pub const SideEffects = enum {
}
}
// Trim the parts of `expr` whose value is unused, keeping only what may have
// side effects. Returns null when the whole expression can be discarded, or a
// (possibly smaller) replacement expression otherwise.
// NOTE(review): "simpify" is a typo for "simplify"; the name is kept because
// call sites elsewhere in this file use it.
pub fn simpifyUnusedExpr(p: *P, expr: Expr) ?Expr {
switch (expr.data) {
// Pure values: dropping them can never change program behavior.
.e_null, .e_undefined, .e_missing, .e_boolean, .e_number, .e_big_int, .e_string, .e_this, .e_reg_exp, .e_function, .e_arrow, .e_import_meta => {
return null;
},
.e_dot => |dot| {
// Property accesses flagged as side-effect free can be dropped.
if (dot.can_be_removed_if_unused) {
return null;
}
},
.e_identifier => |ident| {
if (ident.must_keep_due_to_with_stmt) {
return expr;
}
// Bound identifiers can't throw a ReferenceError, so they can go.
if (ident.can_be_removed_if_unused or p.symbols.items[ident.ref.inner_index].kind != .unbound) {
return null;
}
},
.e_if => |__if__| {
// Simplify both arms in place, then drop the ternary entirely if
// neither arm has side effects.
__if__.yes = simpifyUnusedExpr(p, __if__.yes) orelse __if__.yes.toEmpty();
__if__.no = simpifyUnusedExpr(p, __if__.no) orelse __if__.no.toEmpty();
// "foo() ? 1 : 2" => "foo()"
if (__if__.yes.isEmpty() and __if__.no.isEmpty()) {
return simpifyUnusedExpr(p, __if__.test_);
}
},
.e_call => |call| {
// A call that has been marked "__PURE__" can be removed if all arguments
// can be removed. The annotation causes us to ignore the target.
if (call.can_be_unwrapped_if_unused) {
return Expr.joinAllWithComma(call.args, p.allocator);
}
},
.e_binary => |bin| {
switch (bin.op) {
// We can simplify "==" and "!=" even though they can call "toString" and/or
// "valueOf" if we can statically determine that the types of both sides are
// primitives. In that case there won't be any chance for user-defined
// "toString" and/or "valueOf" to be called.
.bin_loose_eq, .bin_loose_ne => {
if (isPrimitiveWithSideEffects(bin.left.data) and isPrimitiveWithSideEffects(bin.right.data)) {
return Expr.joinWithComma(simpifyUnusedExpr(p, bin.left) orelse bin.left.toEmpty(), simpifyUnusedExpr(p, bin.right) orelse bin.right.toEmpty(), p.allocator);
}
},
else => {},
}
},
.e_new => |call| {
// A constructor call that has been marked "__PURE__" can be removed if all arguments
// can be removed. The annotation causes us to ignore the target.
if (call.can_be_unwrapped_if_unused) {
return Expr.joinAllWithComma(call.args, p.allocator);
}
},
else => {},
}
// Default: keep the expression unchanged.
return expr;
}
// If this is in a dead branch, then we want to trim as much dead code as we
// can. Everything can be trimmed except for hoisted declarations ("var" and
// "function"), which affect the parent scope. For example:
//
// function foo() {
// if (false) { var x; }
// x = 1;
// }
//
// We can't trim the entire branch as dead or calling foo() will incorrectly
// assign to a global variable instead.
// The main goal here is to trim conditionals
// Returns true when `stmt` must be preserved even inside dead control flow
// (e.g. the body of "if (false) { ... }"), because hoisted "var"
// declarations escape the dead branch and affect the enclosing scope.
// Compound statements recurse so only the hoisted parts survive.
pub fn shouldKeepStmtInDeadControlFlow(stmt: Stmt) bool {
switch (stmt.data) {
.s_empty, .s_expr, .s_throw, .s_return, .s_break, .s_continue, .s_class, .s_debugger => {
// Omit these statements entirely
return false;
},
.s_local => |local| {
// Only "var" declarations hoist out of the dead branch and must be
// kept; "let"/"const" are block-scoped and can be dropped.
// BUG FIX: this previously returned `local.kind != .k_var`, which
// kept let/const and discarded var — the opposite of the intent
// documented in the (removed) commented-out code here.
return local.kind == .k_var;
},
.s_block => |block| {
// Keep the block only if it contains something worth keeping.
for (block.stmts) |child| {
if (shouldKeepStmtInDeadControlFlow(child)) {
return true;
}
}
return false;
},
.s_if => |_if_| {
if (shouldKeepStmtInDeadControlFlow(_if_.yes)) {
return true;
}
const no = _if_.no orelse return false;
return shouldKeepStmtInDeadControlFlow(no);
},
.s_while => |__while__| {
return shouldKeepStmtInDeadControlFlow(__while__.body);
},
.s_do_while => |__while__| {
return shouldKeepStmtInDeadControlFlow(__while__.body);
},
.s_for => |__for__| {
if (__for__.init) |init_| {
if (shouldKeepStmtInDeadControlFlow(init_)) {
return true;
}
}
return shouldKeepStmtInDeadControlFlow(__for__.body);
},
.s_for_in => |__for__| {
return shouldKeepStmtInDeadControlFlow(__for__.init) or shouldKeepStmtInDeadControlFlow(__for__.body);
},
.s_for_of => |__for__| {
return shouldKeepStmtInDeadControlFlow(__for__.init) or shouldKeepStmtInDeadControlFlow(__for__.body);
},
.s_label => |label| {
return shouldKeepStmtInDeadControlFlow(label.stmt);
},
else => {
// Conservatively keep anything we don't explicitly model.
return true;
},
}
}
pub const Equality = struct { equal: bool = false, ok: bool = false };
// Returns "equal, ok". If "ok" is false, then nothing is known about the two
@@ -642,9 +791,10 @@ pub const SideEffects = enum {
.bin_comma => {
return isPrimitiveWithSideEffects(e.right.data);
},
else => {},
}
},
.e_if => {
.e_if => |e| {
return isPrimitiveWithSideEffects(e.yes.data) and isPrimitiveWithSideEffects(e.no.data);
},
else => {},
@@ -1283,6 +1433,14 @@ pub const Parser = struct {
var result: js_ast.Result = undefined;
if (self.p) |p| {
// Consume a leading hashbang comment
var hashbang: string = "";
if (p.lexer.token == .t_hashbang) {
hashbang = p.lexer.identifier;
try p.lexer.next();
}
// Parse the file in the first pass, but do not bind symbols
var opts = ParseStatementOptions{ .is_module_scope = true };
debugl("<p.parseStmtsUpTo>");
@@ -1499,8 +1657,8 @@ const ParseStatementOptions = struct {
var e_missing_data = E.Missing{};
var s_missing = S.Empty{};
var nullExprData = Expr.Data{ .e_missing = &e_missing_data };
var nullStmtData = Stmt.Data{ .s_empty = &s_missing };
var nullExprData = Expr.Data{ .e_missing = e_missing_data };
var nullStmtData = Stmt.Data{ .s_empty = s_missing };
pub const Prefill = struct {
pub const StringLiteral = struct {
pub var Key = [3]u16{ 'k', 'e', 'y' };
@@ -1523,10 +1681,10 @@ pub const Prefill = struct {
pub var BMissing = B{ .b_missing = &BMissing_ };
pub var BMissing_ = B.Missing{};
pub var EMissing = Expr.Data{ .e_missing = &EMissing_ };
pub var EMissing = Expr.Data{ .e_missing = EMissing_ };
pub var EMissing_ = E.Missing{};
pub var SEmpty = Stmt.Data{ .s_empty = &SEmpty_ };
pub var SEmpty = Stmt.Data{ .s_empty = SEmpty_ };
pub var SEmpty_ = S.Empty{};
pub var Filename = Expr.Data{ .e_string = &Prefill.String.Filename };
@@ -4032,7 +4190,7 @@ pub const P = struct {
const name = p.lexer.identifier;
var emiss = E.Missing{};
// Parse either an async function, an async expression, or a normal expression
var expr: Expr = Expr{ .loc = loc, .data = Expr.Data{ .e_missing = &emiss } };
var expr: Expr = Expr{ .loc = loc, .data = Expr.Data{ .e_missing = emiss } };
if (is_identifier and strings.eqlComptime(p.lexer.raw(), "async")) {
var async_range = p.lexer.range();
try p.lexer.next();
@@ -4589,7 +4747,7 @@ pub const P = struct {
const name = p.lexer.identifier;
const loc = p.lexer.loc();
const e_str = p.lexer.toEString();
const e_str = E.String{ .utf8 = name };
if (!p.lexer.isIdentifierOrKeyword()) {
try p.lexer.expect(.t_identifier);
@@ -7262,7 +7420,7 @@ pub const P = struct {
}
return p.e(E.Array{
.items = items.toOwnedSlice(),
.comma_after_spread = comma_after_spread,
.comma_after_spread = comma_after_spread.toNullable(),
.is_single_line = is_single_line,
}, loc);
},
@@ -7325,7 +7483,7 @@ pub const P = struct {
}
return p.e(E.Object{
.properties = properties.toOwnedSlice(),
.comma_after_spread = comma_after_spread,
.comma_after_spread = comma_after_spread.toNullable(),
.is_single_line = is_single_line,
}, loc);
},
@@ -9707,11 +9865,8 @@ pub const P = struct {
.s_expr => |data| {
p.stmt_expr_value = data.value.data;
data.value = p.visitExpr(data.value);
// TODO:
// if (p.options.mangle_syntax) {
// }
// simplify unused
data.value = SideEffects.simpifyUnusedExpr(p, data.value) orelse data.value.toEmpty();
},
.s_throw => |data| {
data.value = p.visitExpr(data.value);
@@ -10622,9 +10777,10 @@ pub const P = struct {
// Save the current control-flow liveness. This represents if we are
// currently inside an "if (false) { ... }" block.
var old_is_control_flow_dead = p.is_control_flow_dead;
defer p.is_control_flow_dead = old_is_control_flow_dead;
// visit all statements first
var visited = List(Stmt).init(p.allocator);
var visited = try List(Stmt).initCapacity(p.allocator, stmts.items.len);
var before = List(Stmt).init(p.allocator);
var after = List(Stmt).init(p.allocator);
defer before.deinit();
@@ -10657,8 +10813,21 @@ pub const P = struct {
try p.visitAndAppendStmt(list, stmt);
}
p.is_control_flow_dead = old_is_control_flow_dead;
try stmts.resize(visited.items.len + before.items.len + after.items.len);
var visited_count = visited.items.len;
if (p.is_control_flow_dead) {
var end: usize = 0;
for (visited.items) |item, i| {
if (!SideEffects.shouldKeepStmtInDeadControlFlow(item)) {
continue;
}
visited.items[end] = item;
end += 1;
}
visited_count = end;
}
try stmts.resize(visited_count + before.items.len + after.items.len);
var i: usize = 0;
for (before.items) |item| {
@@ -10666,7 +10835,8 @@ pub const P = struct {
i += 1;
}
for (visited.items) |item| {
const visited_slice = visited.items[0..visited_count];
for (visited_slice) |item| {
stmts.items[i] = item;
i += 1;
}

View File

@@ -199,24 +199,54 @@ pub fn NewPrinter(comptime ascii_only: bool) type {
p.js.appendChar(str) catch unreachable;
},
string => {
if (isDebug or isTest) {
if (str[0] == 0 or (str[0] == '\\' and str[1] == '0')) {
Global.panic("Attempted to print null char", .{});
}
}
p.js.append(str) catch unreachable;
},
u8 => {
if (isDebug or isTest) {
if (str == 0) {
Global.panic("Attempted to print null char", .{});
}
}
p.js.appendChar(str) catch unreachable;
},
u16 => {
if (isDebug or isTest) {
if (str == 0) {
Global.panic("Attempted to print null char", .{});
}
}
p.js.appendChar(@intCast(u8, str)) catch unreachable;
},
u21 => {
if (isDebug or isTest) {
if (str == 0) {
Global.panic("Attempted to print null char", .{});
}
}
p.js.appendChar(@intCast(u8, str)) catch unreachable;
},
else => {
if (isDebug or isTest) {
if (str[0] == 0 or (str[0] == '\\' and str[1] == '0')) {
Global.panic("Attempted to print null char", .{});
}
}
p.js.append(@as(string, str)) catch unreachable;
},
}
}
// Append `str` to the output without growing the buffer (capacity must have
// been ensured by the caller). In debug/test builds, panic if the string
// starts with a NUL byte or a literal "\0" escape, which would corrupt output.
// BUG FIX: the guard previously indexed str[0]/str[1] unconditionally, which
// is out-of-bounds for empty or one-byte strings; it now checks the length.
pub fn unsafePrint(p: *Printer, str: string) void {
if (isDebug or isTest) {
if (str.len > 0 and (str[0] == 0 or (str.len > 1 and str[0] == '\\' and str[1] == '0'))) {
Global.panic("Attempted to print null char", .{});
}
}
p.js.appendAssumeCapacity(str);
}

View File

@@ -30,8 +30,12 @@ pub const Kind = enum {
pub const Loc = packed struct {
start: i32 = -1,
/// Convert the -1 sentinel ("no location") into an optional.
/// Read-only, so take a const pointer (consistent with `toUsize` below).
pub fn toNullable(loc: *const Loc) ?Loc {
return if (loc.start == -1) null else loc.*;
}
// TODO: remove this stupidity
pub fn toUsize(self: *Loc) usize {
pub fn toUsize(self: *const Loc) usize {
return @intCast(usize, self.start);
}