Better unicode identifier start / continue check (#15455)

2026-02-02 15:08:46 +00:00 · 2024-12-25 23:02:46 -08:00
parent d4c0432a5f
commit 145a7fd92e
8 changed files with 320 additions and 2397 deletions
--- a/misctools/gen-unicode-table.js
+++ b/misctools/gen-unicode-table.js
@@ -1,172 +0,0 @@
-// Thank you @evanw for this code!!!
-const fs = require("fs");
-const path = require("path");
-
-// ES5 reference: https://es5.github.io/
-//
-// A conforming implementation of this International standard shall interpret
-// characters in conformance with the Unicode Standard, Version 3.0 or later
-// and ISO/IEC 10646-1 with either UCS-2 or UTF-16 as the adopted encoding
-// form, implementation level 3. If the adopted ISO/IEC 10646-1 subset is not
-// otherwise specified, it is presumed to be the BMP subset, collection 300.
-//
-// UnicodeLetter: any character in the Unicode categories “Uppercase letter (Lu)”,
-// “Lowercase letter (Ll)”, “Titlecase letter (Lt)”, “Modifier letter (Lm)”,
-// “Other letter (Lo)”, or “Letter number (Nl)”.
-const idStartES5 = []
-  .concat(
-    require("@unicode/unicode-3.0.0/General_Category/Uppercase_Letter/code-points"),
-    require("@unicode/unicode-3.0.0/General_Category/Lowercase_Letter/code-points"),
-    require("@unicode/unicode-3.0.0/General_Category/Titlecase_Letter/code-points"),
-    require("@unicode/unicode-3.0.0/General_Category/Modifier_Letter/code-points"),
-    require("@unicode/unicode-3.0.0/General_Category/Other_Letter/code-points"),
-
-    // The "letter number" category is not included because old versions of Safari
-    // had a bug where they didn't include it. This means it does not match ES5.
-    // We need to make sure we escape these characters so Safari can read them.
-    // See https://github.com/evanw/esbuild/issues/1349 for more information.
-    // require('@unicode/unicode-3.0.0/General_Category/Letter_Number/code-points'),
-  )
-  .sort((a, b) => a - b);
-
-// UnicodeCombiningMark: any character in the Unicode categories “Non-spacing mark (Mn)”
-// or “Combining spacing mark (Mc)”
-// UnicodeDigit: any character in the Unicode category “Decimal number (Nd)”
-// UnicodeConnectorPunctuation: any character in the Unicode category “Connector punctuation (Pc)”
-const idContinueES5 = idStartES5
-  .concat(
-    require("@unicode/unicode-3.0.0/General_Category/Nonspacing_Mark/code-points"),
-    require("@unicode/unicode-3.0.0/General_Category/Spacing_Mark/code-points"),
-    require("@unicode/unicode-3.0.0/General_Category/Decimal_Number/code-points"),
-    require("@unicode/unicode-3.0.0/General_Category/Connector_Punctuation/code-points"),
-  )
-  .sort((a, b) => a - b);
-
-// ESNext reference: https://tc39.es/ecma262/
-//
-// A conforming implementation of ECMAScript must interpret source text input
-// in conformance with the Unicode Standard, Version 5.1.0 or later and ISO/IEC
-// 10646. If the adopted ISO/IEC 10646-1 subset is not otherwise specified, it
-// is presumed to be the Unicode set, collection 10646.
-//
-// UnicodeIDStart: any Unicode code point with the Unicode property “ID_Start”
-const idStartESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Start/code-points");
-const idStartESNextSet = new Set(idStartESNext);
-
-// UnicodeIDContinue: any Unicode code point with the Unicode property “ID_Continue”
-const idContinueESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Continue/code-points");
-const idContinueESNextSet = new Set(idContinueESNext);
-
-// These identifiers are valid in both ES5 and ES6+ (i.e. an intersection of both)
-const idStartES5AndESNext = idStartES5.filter(n => idStartESNextSet.has(n));
-const idContinueES5AndESNext = idContinueES5.filter(n => idContinueESNextSet.has(n));
-
-// These identifiers are valid in either ES5 or ES6+ (i.e. a union of both)
-const idStartES5OrESNext = [...new Set(idStartES5.concat(idStartESNext))].sort((a, b) => a - b);
-const idContinueES5OrESNext = [...new Set(idContinueES5.concat(idContinueESNext))].sort((a, b) => a - b);
-
-function generateRangeTable(codePoints) {
-  let lines = [];
-  let index = 0;
-  let latinOffset = 0;
-
-  while (latinOffset < codePoints.length && codePoints[latinOffset] <= 0xff) {
-    latinOffset++;
-  }
-
-  lines.push(`RangeTable.init(`, `   ${latinOffset},`, `  &[_]R16Range{`);
-
-  // 16-bit code points
-  while (index < codePoints.length && codePoints[index] < 0x1000) {
-    let start = codePoints[index];
-    index++;
-    while (index < codePoints.length && codePoints[index] < 0x1000 && codePoints[index] === codePoints[index - 1] + 1) {
-      index++;
-    }
-    let end = codePoints[index - 1];
-    lines.push(`  .{0x${start.toString(16)}, 0x${end.toString(16)}},`);
-  }
-
-  lines.push(`  },`, `&[_]R32Range{`);
-
-  // 32-bit code points
-  while (index < codePoints.length) {
-    let start = codePoints[index];
-    index++;
-    while (index < codePoints.length && codePoints[index] === codePoints[index - 1] + 1) {
-      index++;
-    }
-    let end = codePoints[index - 1];
-    lines.push(`    .{0x${start.toString(16)}, 0x${end.toString(16)}},`);
-  }
-
-  lines.push(`  },`, `);`);
-  return lines.join("\n");
-}
-
-function generateBigSwitchStatement(codePoints) {
-  let lines = [];
-  let index = 0;
-  let latinOffset = 0;
-
-  while (latinOffset < codePoints.length && codePoints[latinOffset] <= 0xff) {
-    latinOffset++;
-  }
-
-  lines.push(`return switch(codepoint) {`);
-
-  // 16-bit code points
-  while (index < codePoints.length && codePoints[index] < 0x1000) {
-    let start = codePoints[index];
-    index++;
-    while (index < codePoints.length && codePoints[index] < 0x1000 && codePoints[index] === codePoints[index - 1] + 1) {
-      index++;
-    }
-    let end = codePoints[index - 1];
-    lines.push(`0x${start.toString(16)}...0x${end.toString(16)},`);
-  }
-
-  // 32-bit code points
-  while (index < codePoints.length) {
-    let start = codePoints[index];
-    index++;
-    while (index < codePoints.length && codePoints[index] === codePoints[index - 1] + 1) {
-      index++;
-    }
-    let end = codePoints[index - 1];
-    lines.push(` 0x${start.toString(16)}...0x${end.toString(16)},`);
-  }
-
-  lines.push(` => true, 
-    else => false  
-};`);
-  return lines.join("\n");
-}
-
-fs.writeFileSync(
-  path.join(__dirname, "..", "src", "js_lexer", "unicode.zig"),
-  `// This file was automatically generated by ${path.basename(__filename)}. Do not edit.
-
-  const RangeTable = @import("./range_table.zig");
-
-
-// ES5 || ESNext
-pub const id_start = ${generateRangeTable(idStartES5OrESNext)}
-
-// ES5 || ESNext
-pub const id_continue = ${generateRangeTable(idContinueES5OrESNext)}
-
-pub const printable_id_start = ${generateRangeTable(idStartESNext)}
-pub const printable_id_continue = ${generateRangeTable(idContinueESNext)}
-
-pub fn isIdentifierStart(comptime Codepoint: type, codepoint: Codepoint) bool{
-  ${generateBigSwitchStatement(idStartES5OrESNext)}
-}
-
-pub fn isIdentifierContinue(comptime Codepoint: type, codepoint: Codepoint) bool{
-  ${generateBigSwitchStatement(idContinueES5OrESNext)}
-}
-
-
-`,
-);
--- a/misctools/gen-unicode-table.ts
+++ b/misctools/gen-unicode-table.ts
@@ -0,0 +1,108 @@
+import { Generator, Context } from "./unicode-generator";
+
+// Create sets for fast lookups. Each set is queried once per code point by the
+// `check` predicates defined in main().
+
+// ES5 identifier-start characters: Unicode 3.0.0 general categories Lu, Ll, Lt,
+// Lm and Lo. The Letter_Number (Nl) category is not part of this set.
+// NOTE(review): the previous generator script left Nl out on purpose for
+// compatibility with old Safari versions (esbuild issue #1349); presumably the
+// same reasoning applies here -- confirm.
+const idStartES5Set = new Set([
+  ...require("@unicode/unicode-3.0.0/General_Category/Uppercase_Letter/code-points"),
+  ...require("@unicode/unicode-3.0.0/General_Category/Lowercase_Letter/code-points"),
+  ...require("@unicode/unicode-3.0.0/General_Category/Titlecase_Letter/code-points"),
+  ...require("@unicode/unicode-3.0.0/General_Category/Modifier_Letter/code-points"),
+  ...require("@unicode/unicode-3.0.0/General_Category/Other_Letter/code-points"),
+]);
+
+// ES5 identifier-continue characters: everything in the start set plus the
+// Unicode 3.0.0 categories Mn, Mc, Nd and Pc.
+const idContinueES5Set = new Set([
+  ...idStartES5Set,
+  ...require("@unicode/unicode-3.0.0/General_Category/Nonspacing_Mark/code-points"),
+  ...require("@unicode/unicode-3.0.0/General_Category/Spacing_Mark/code-points"),
+  ...require("@unicode/unicode-3.0.0/General_Category/Decimal_Number/code-points"),
+  ...require("@unicode/unicode-3.0.0/General_Category/Connector_Punctuation/code-points"),
+]);
+
+// ESNext identifier characters: the ID_Start / ID_Continue binary properties
+// as published in Unicode 15.1.0.
+const idStartESNextSet = new Set(require("@unicode/unicode-15.1.0/Binary_Property/ID_Start/code-points"));
+const idContinueESNextSet = new Set(require("@unicode/unicode-15.1.0/Binary_Property/ID_Continue/code-points"));
+
+// Exclude known problematic codepoints: U+30FB (KATAKANA MIDDLE DOT) and
+// U+FF65 (HALFWIDTH KATAKANA MIDDLE DOT). They are subtracted from the
+// ID_Continue set by the isIDContinueESNext predicate in main().
+// NOTE(review): the reason these two are treated as a mistake is not recorded
+// in this file -- add a reference if one is known.
+const ID_Continue_mistake = new Set([0x30fb, 0xff65]);
+
+/**
+ * Packs a list of flags into 64-bit words, least-significant bit first.
+ *
+ * Entry `k` of `bits` becomes bit `k % 64` of word `k / 64`; any truthy entry
+ * sets the bit. A trailing partial group is zero-padded in its high bits, and
+ * an empty input yields an empty array.
+ */
+function bitsToU64Array(bits: number[]): bigint[] {
+  const words: bigint[] = [];
+  for (let base = 0; base < bits.length; base += 64) {
+    const group = bits.slice(base, base + 64);
+    const word = group.reduce<bigint>(
+      (acc, flag, offset) => (flag ? acc | (1n << BigInt(offset)) : acc),
+      0n,
+    );
+    words.push(word);
+  }
+  return words;
+}
+
+/**
+ * Generates the Zig source for one membership test: a `pub fn <name>(cp: u21) bool`
+ * plus the `<table>` struct holding its lookup data.
+ *
+ * @param table   Name of the emitted Zig struct that holds `stage1` / `stage2`.
+ * @param name    Name of the emitted Zig function.
+ * @param checkFn Predicate evaluated for every code point 0..0x10FFFF.
+ * @returns The Zig source text for the function and its table.
+ */
+async function generateTable(table: string, name: string, checkFn: (cp: number) => boolean) {
+  const context: Context<boolean> = {
+    get: (cp: number) => checkFn(cp),
+    eql: (a: boolean, b: boolean) => a === b,
+  };
+
+  const generator = new Generator(context);
+  const tables = await generator.generate();
+
+  // tables.stage2 holds indices into tables.stage3, not the booleans themselves.
+  // Resolve each index to its value before packing so the emitted bits stay
+  // correct even when the first value seen (code point 0) is `true` and
+  // therefore occupies index 0.
+  const memberBits = tables.stage2.map(index => (tables.stage3[index] ? 1 : 0));
+  const stage2Words = bitsToU64Array(memberBits)
+    .map(word => word.toString())
+    .join(",");
+
+  return `
+pub fn ${name}(cp: u21) bool {
+    if (cp > 0x10FFFF) return false;
+    const high = cp >> 8;
+    const low = cp & 0xFF;
+    const stage2_idx = ${table}.stage1[high];
+    const bit_pos = stage2_idx + low;
+    const u64_idx = bit_pos >> 6;
+    const bit_idx = @as(u6, @intCast(bit_pos & 63));
+    return (${table}.stage2[u64_idx] & (@as(u64, 1) << bit_idx)) != 0;
+}
+const ${table} = struct {
+    pub const stage1 = [_]u16{${tables.stage1.join(",")}};
+    pub const stage2 = [_]u64{${stage2Words}};
+};
+
+`;
+}
+
+/**
+ * Generates the four identifier lookup tables (ES5 and ESNext, start and
+ * continue) and prints the combined Zig source to stdout.
+ */
+async function main() {
+  // Each entry: [emitted Zig function name, emitted table name, membership predicate].
+  const specs: Array<[string, string, (cp: number) => boolean]> = [
+    ["isIDStartES5", "idStartES5", (cp: number) => idStartES5Set.has(cp)],
+    ["isIDContinueES5", "idContinueES5", (cp: number) => idContinueES5Set.has(cp)],
+    ["isIDStartESNext", "idStartESNext", (cp: number) => idStartESNextSet.has(cp)],
+    [
+      "isIDContinueESNext",
+      "idContinueESNext",
+      (cp: number) => idContinueESNextSet.has(cp) && !ID_Continue_mistake.has(cp),
+    ],
+  ];
+
+  const sections = await Promise.all(
+    specs.map(async ([fnName, tableName, predicate]) => {
+      const body = await generateTable(tableName, fnName, predicate);
+      return `
+/// ${fnName} checks if a codepoint is valid in the ${fnName} category
+${body}`;
+    }),
+  );
+
+  const joined = sections.join("\n\n");
+  console.log(`/// This file is auto-generated. Do not edit.
+
+${joined}`);
+}
+
+main();
--- a/misctools/package.json
+++ b/misctools/package.json
@@ -5,7 +5,10 @@
  "license": "MIT",
  "devDependencies": {
    "@unicode/unicode-13.0.0": "^1.2.1",
-    "@unicode/unicode-3.0.0": "^1.2.1",
+    "@unicode/unicode-3.0.0": "^1.6.5",
    "semver": "^7.3.7"
+  },
+  "dependencies": {
+    "@unicode/unicode-15.1.0": "^1.6.5"
  }
 }
--- a/misctools/unicode-generator.ts
+++ b/misctools/unicode-generator.ts
@@ -0,0 +1,138 @@
+import crypto from "crypto";
+
+// Types to mirror Zig's structures
+/**
+ * Callbacks that tell the Generator which value each code point maps to.
+ */
+interface Context<Elem> {
+  // Value for `codepoint`; may be returned directly or as a promise (generate() awaits it).
+  get(codepoint: number): Promise<Elem> | Elem;
+  // Equality test used to de-duplicate values when they are collected into stage3.
+  eql(a: Elem, b: Elem): boolean;
+}
+
+/**
+ * Result of Generator.generate().
+ */
+interface Tables<Elem> {
+  // One entry per 256-code-point block: the offset in stage2 where that block's data starts.
+  stage1: number[];
+  // Concatenated unique blocks; every entry is an index into stage3.
+  stage2: number[];
+  // Distinct values returned by Context.get, in first-seen order.
+  stage3: Elem[];
+}
+
+/**
+ * Builds a block-deduplicated lookup table covering every code point from
+ * 0 to 0x10FFFF.
+ *
+ * Code points are processed in blocks of BLOCK_SIZE (256). Each code point's
+ * value is interned into `stage3`; a block is the list of those stage3 indices.
+ * Identical blocks are stored once in `stage2`, and `stage1` records, per
+ * block, the offset of that block's data inside `stage2`.
+ */
+class Generator<Elem> {
+  private static readonly BLOCK_SIZE = 256;
+  private readonly ctx: Context<Elem>;
+  // SHA-256 digest of a block's contents -> offset of that block in stage2.
+  private readonly blockMap = new Map<string, number>();
+
+  constructor(ctx: Context<Elem>) {
+    this.ctx = ctx;
+  }
+
+  /** Returns the hex SHA-256 digest of the block's entries viewed as 16-bit values. */
+  private hashBlock(block: number[]): string {
+    const hash = crypto.createHash("sha256");
+    hash.update(Buffer.from(new Uint16Array(block).buffer));
+    return hash.digest("hex");
+  }
+
+  /**
+   * Walks every code point, asks the context for its value and builds the
+   * three tables.
+   *
+   * @returns The stage1 / stage2 / stage3 tables.
+   * @throws Error if more than 0x10000 distinct values are seen or if a block
+   *         offset in stage2 exceeds 0xFFFF.
+   */
+  async generate(): Promise<Tables<Elem>> {
+    // Offsets cached by an earlier call point into that call's stage2 array,
+    // so start every run with an empty cache.
+    this.blockMap.clear();
+
+    const stage1: number[] = [];
+    const stage2: number[] = [];
+    const stage3: Elem[] = [];
+
+    let block = new Array(Generator.BLOCK_SIZE).fill(0);
+    let blockLen = 0;
+
+    // Maximum Unicode codepoint is 0x10FFFF
+    for (let cp = 0; cp <= 0x10ffff; cp++) {
+      // Get the mapping for this codepoint
+      const elem = await this.ctx.get(cp);
+
+      // Find or add the element in stage3
+      let blockIdx = stage3.findIndex(item => this.ctx.eql(item, elem));
+      if (blockIdx === -1) {
+        blockIdx = stage3.length;
+        stage3.push(elem);
+      }
+
+      if (blockIdx > 0xffff) {
+        throw new Error("Block index too large");
+      }
+
+      // Add to current block
+      block[blockLen] = blockIdx;
+      blockLen++;
+
+      // Check if we need to finalize this block
+      if (blockLen < Generator.BLOCK_SIZE && cp !== 0x10ffff) {
+        continue;
+      }
+
+      // Fill remaining block space with zeros if needed
+      if (blockLen < Generator.BLOCK_SIZE) {
+        block.fill(0, blockLen);
+      }
+
+      // Get or create stage2 index for this block
+      const blockHash = this.hashBlock(block);
+      let stage2Idx = this.blockMap.get(blockHash);
+
+      if (stage2Idx === undefined) {
+        stage2Idx = stage2.length;
+        this.blockMap.set(blockHash, stage2Idx);
+        stage2.push(...block.slice(0, blockLen));
+      }
+
+      // stage1 is emitted as u16, so a block offset must fit in 16 bits.
+      if (stage2Idx > 0xffff) {
+        throw new Error("Stage2 index too large");
+      }
+
+      // Add mapping to stage1
+      stage1.push(stage2Idx);
+
+      // Reset block
+      block = new Array(Generator.BLOCK_SIZE).fill(0);
+      blockLen = 0;
+    }
+
+    return { stage1, stage2, stage3 };
+  }
+
+  /**
+   * Generates Zig code for the lookup tables.
+   *
+   * @param tableName    Name of the emitted Zig function that returns the table struct.
+   * @param tables       Tables produced by generate().
+   * @param elemToString Renders one stage3 element as a Zig expression.
+   * @returns The Zig source text.
+   */
+  static writeZig<Elem>(tableName: string, tables: Tables<Elem>, elemToString: (elem: Elem) => string): string {
+    // stage2 entries are stage3 indices, so u8 is only wide enough while there
+    // are at most 256 distinct elements; generate() allows up to 0x10000.
+    const stage2Type = tables.stage3.length > 256 ? "u16" : "u8";
+
+    let output = `/// Auto-generated. Do not edit.\n`;
+    output += `fn ${tableName}(comptime Elem: type) type {\n`;
+    output += "    return struct {\n";
+
+    // Stage 1
+    output += `pub const stage1: [${tables.stage1.length}]u16 = .{`;
+    output += tables.stage1.join(",");
+    output += "};\n\n";
+
+    // Stage 2
+    output += `pub const stage2: [${tables.stage2.length}]${stage2Type} = .{`;
+    output += tables.stage2.join(",");
+    output += "};\n\n";
+
+    // Stage 3
+    output += `pub const stage3: [${tables.stage3.length}]Elem = .{`;
+    output += tables.stage3.map(elemToString).join(",");
+    output += "};\n";
+
+    output += "    };\n}\n";
+    return output;
+  }
+}
+
+// Example usage: builds tables for a context that maps every code point to the
+// same string and prints the resulting Zig source. Nothing in this file calls it.
+async function example() {
+  // Example context that maps codepoints to their category
+  const ctx: Context<string> = {
+    get: async (cp: number) => {
+      // This would normally look up the actual Unicode category
+      return "Lu";
+    },
+    eql: (a: string, b: string) => a === b,
+  };
+
+  const generator = new Generator(ctx);
+  const tables = await generator.generate();
+
+  // Generate Zig code. writeZig takes the table name first; without it the
+  // tables object would be bound to the wrong parameter.
+  const zigCode = Generator.writeZig("ExampleTable", tables, (elem: string) => `"${elem}"`);
+  console.log(zigCode);
+}
+
+export { Generator, type Context, type Tables };
--- a/src/js_lexer.zig
+++ b/src/js_lexer.zig
@@ -3043,18 +3043,10 @@ pub const Lexer = NewLexer(.{});

 const JSIdentifier = @import("./js_lexer/identifier.zig");
+/// Reports whether `codepoint` may begin a JavaScript identifier by delegating
+/// to `JSIdentifier.isIdentifierStart`; after this change the same
+/// implementation is used whether or not the target is Wasm.
 pub inline fn isIdentifierStart(codepoint: i32) bool {
-    if (comptime Environment.isWasm) {
-        return JSIdentifier.JumpTable.isIdentifierStart(codepoint);
-    }
-
-    return JSIdentifier.Bitset.isIdentifierStart(codepoint);
+    return JSIdentifier.isIdentifierStart(codepoint);
 }
+/// Reports whether `codepoint` may appear after the first character of a
+/// JavaScript identifier. Note the naming difference: this delegates to
+/// `JSIdentifier.isIdentifierPart`.
 pub inline fn isIdentifierContinue(codepoint: i32) bool {
-    if (comptime Environment.isWasm) {
-        return JSIdentifier.JumpTable.isIdentifierPart(codepoint);
-    }
-
-    return JSIdentifier.Bitset.isIdentifierPart(codepoint);
+    return JSIdentifier.isIdentifierPart(codepoint);
 }

 pub fn isWhitespace(codepoint: CodePoint) bool {
--- a/src/js_lexer/identifier.zig
+++ b/src/js_lexer/identifier.zig
--- a/src/js_lexer/identifier_cache.zig
+++ b/src/js_lexer/identifier_cache.zig
@@ -1,22 +0,0 @@
-const std = @import("std");
-const bun = @import("root").bun;
-const identifier_data = @import("./identifier_data.zig");
-
-pub const CachedBitset = extern struct {
-    range: [2]i32,
-    len: u32,
-
-    pub fn fromFile(comptime filename: anytype) CachedBitset {
-        return comptime @as(CachedBitset, @bitCast(bun.asByteSlice(@embedFile(filename)).ptr[0..@sizeOf(CachedBitset)].*));
-    }
-};
-
-pub fn setMasks(masks: [*:0]const u8, comptime MaskType: type, masky: MaskType) void {
-    const FieldInfo: std.builtin.Type.StructField = std.meta.fieldInfo(MaskType, "masks");
-    masky.masks = @as(masks, @bitCast(FieldInfo.type));
-}
-
-pub const id_start_meta = identifier_data.id_start_cached;
-pub const id_continue_meta = identifier_data.id_continue_cached;
-pub const id_start = identifier_data.id_start;
-pub const id_continue = identifier_data.id_continue;
--- a/src/js_lexer/identifier_data.zig
+++ b/src/js_lexer/identifier_data.zig