Fix parsing/printing unicode identifiers, switch to UTF-8 for prefilled strings, remove eagerly loading identifier_name

2026-02-11 03:18:53 +00:00 · 2021-09-20 22:37:22 -07:00
parent 90cd3bf4cb
commit ab565f1a3c
17 changed files with 2039 additions and 180 deletions
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -649,105 +649,111 @@ pub fn utf8ByteSequenceLength(first_byte: u8) u3 {
    };
 }

-pub const CodepointIterator = struct {
-    bytes: []const u8,
-    i: usize,
-    width: u3 = 0,
-    c: CodePoint = 0,
+pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: comptime_int) type {
+    return struct {
+        const Iterator = @This();
+        bytes: []const u8,
+        i: usize,
+        width: u3 = 0,
+        c: CodePointType = 0,

-    inline fn nextCodepointSlice(it: *CodepointIterator) []const u8 {
-        @setRuntimeSafety(false);
-
-        const cp_len = utf8ByteSequenceLength(it.bytes[it.i]);
-        it.i += cp_len;
-
-        return if (!(it.i > it.bytes.len)) it.bytes[it.i - cp_len .. it.i] else "";
-    }
-
-    pub fn needsUTF8Decoding(slice: string) bool {
-        var it = CodepointIterator{ .bytes = slice, .i = 0 };
-
-        while (true) {
-            const part = it.nextCodepointSlice();
-            it.width = @intCast(u3, part.len);
+        inline fn nextCodepointSlice(it: *Iterator) []const u8 {
            @setRuntimeSafety(false);
-            switch (it.width) {
-                0 => return false,
-                1 => continue,
-                else => return true,
-            }
+
+            const cp_len = utf8ByteSequenceLength(it.bytes[it.i]);
+            it.i += cp_len;
+
+            return if (!(it.i > it.bytes.len)) it.bytes[it.i - cp_len .. it.i] else "";
        }
-    }

-    pub fn scanUntilQuotedValueOrEOF(iter: *CodepointIterator, comptime quote: CodePoint) usize {
-        @setRuntimeSafety(false);
+        pub fn needsUTF8Decoding(slice: string) bool {
+            var it = Iterator{ .bytes = slice, .i = 0 };

-        while (iter.c > -1) {
-            if (!switch (iter.nextCodepoint()) {
-                quote => false,
-                '\\' => brk: {
-                    if (iter.nextCodepoint() == quote) {
-                        continue;
-                    }
-                    break :brk true;
-                },
-                else => true,
-            }) {
-                return iter.i + 1;
+            while (true) {
+                const part = it.nextCodepointSlice();
+                it.width = @intCast(u3, part.len);
+                @setRuntimeSafety(false);
+                switch (it.width) {
+                    0 => return false,
+                    1 => continue,
+                    else => return true,
+                }
            }
        }

-        return iter.i;
-    }
+        pub fn scanUntilQuotedValueOrEOF(iter: *Iterator, comptime quote: CodePointType) usize {
+            @setRuntimeSafety(false);

-    pub fn nextCodepoint(it: *CodepointIterator) CodePoint {
-        const slice = it.nextCodepointSlice();
-        it.width = @intCast(u3, slice.len);
-        @setRuntimeSafety(false);
+            while (iter.c > -1) {
+                if (!switch (iter.nextCodepoint()) {
+                    quote => false,
+                    '\\' => brk: {
+                        if (iter.nextCodepoint() == quote) {
+                            continue;
+                        }
+                        break :brk true;
+                    },
+                    else => true,
+                }) {
+                    return iter.i + 1;
+                }
+            }

-        it.c = switch (it.width) {
-            0 => -1,
-            1 => @intCast(CodePoint, slice[0]),
-            2 => @intCast(CodePoint, std.unicode.utf8Decode2(slice) catch unreachable),
-            3 => @intCast(CodePoint, std.unicode.utf8Decode3(slice) catch unreachable),
-            4 => @intCast(CodePoint, std.unicode.utf8Decode4(slice) catch unreachable),
-            else => unreachable,
-        };
-
-        return it.c;
-    }
-
-    pub fn nextCodepointNoReturn(it: *CodepointIterator) void {
-        const slice = it.nextCodepointSlice();
-        it.width = @intCast(u3, slice.len);
-        @setRuntimeSafety(false);
-
-        it.c = switch (it.width) {
-            0 => -1,
-            1 => @intCast(CodePoint, slice[0]),
-            2 => @intCast(CodePoint, std.unicode.utf8Decode2(slice) catch unreachable),
-            3 => @intCast(CodePoint, std.unicode.utf8Decode3(slice) catch unreachable),
-            4 => @intCast(CodePoint, std.unicode.utf8Decode4(slice) catch unreachable),
-            else => unreachable,
-        };
-    }
-
-    /// Look ahead at the next n codepoints without advancing the iterator.
-    /// If fewer than n codepoints are available, then return the remainder of the string.
-    pub fn peek(it: *CodepointIterator, n: usize) []const u8 {
-        const original_i = it.i;
-        defer it.i = original_i;
-
-        var end_ix = original_i;
-        var found: usize = 0;
-        while (found < n) : (found += 1) {
-            const next_codepoint = it.nextCodepointSlice() orelse return it.bytes[original_i..];
-            end_ix += next_codepoint.len;
+            return iter.i;
        }

-        return it.bytes[original_i..end_ix];
-    }
-};
+        pub fn nextCodepoint(it: *Iterator) CodePointType {
+            const slice = it.nextCodepointSlice();
+            it.width = @intCast(u3, slice.len);
+            @setRuntimeSafety(false);
+
+            it.c = switch (it.width) {
+                0 => zeroValue,
+                1 => @intCast(CodePointType, slice[0]),
+                2 => @intCast(CodePointType, std.unicode.utf8Decode2(slice) catch unreachable),
+                3 => @intCast(CodePointType, std.unicode.utf8Decode3(slice) catch unreachable),
+                4 => @intCast(CodePointType, std.unicode.utf8Decode4(slice) catch unreachable),
+                else => unreachable,
+            };
+
+            return it.c;
+        }
+
+        pub fn nextCodepointNoReturn(it: *Iterator) void {
+            const slice = it.nextCodepointSlice();
+            it.width = @intCast(u3, slice.len);
+            @setRuntimeSafety(false);
+
+            it.c = switch (it.width) {
+                0 => zeroValue,
+                1 => @intCast(CodePointType, slice[0]),
+                2 => @intCast(CodePointType, std.unicode.utf8Decode2(slice) catch unreachable),
+                3 => @intCast(CodePointType, std.unicode.utf8Decode3(slice) catch unreachable),
+                4 => @intCast(CodePointType, std.unicode.utf8Decode4(slice) catch unreachable),
+                else => unreachable,
+            };
+        }
+
+        /// Look ahead at the next n codepoints without advancing the iterator.
+        /// If fewer than n codepoints are available, then return the remainder of the string.
+        pub fn peek(it: *Iterator, n: usize) []const u8 {
+            const original_i = it.i;
+            defer it.i = original_i;
+
+            var end_ix = original_i;
+            var found: usize = 0;
+            while (found < n) : (found += 1) {
+                const next_codepoint = it.nextCodepointSlice() orelse return it.bytes[original_i..];
+                end_ix += next_codepoint.len;
+            }
+
+            return it.bytes[original_i..end_ix];
+        }
+    };
+}
+
+pub const CodepointIterator = NewCodePointIterator(CodePoint, -1);
+pub const UnsignedCodepointIterator = NewCodePointIterator(u32, 0);

 test "join" {
    var string_list = &[_]string{ "abc", "def", "123", "hello" };