Better unicode identifier start / continue check (#15455)

2026-02-02 15:08:46 +00:00 · 2024-12-25 23:02:46 -08:00
parent d4c0432a5f
commit 145a7fd92e
8 changed files with 320 additions and 2397 deletions
--- a/misctools/gen-unicode-table.js
+++ b/misctools/gen-unicode-table.js
@@ -1,172 +0,0 @@
 // Thank you @evanw for this code!!!
 const fs = require("fs");
 const path = require("path");
 // ES5 reference: https://es5.github.io/
 //
 // A conforming implementation of this International standard shall interpret
 // characters in conformance with the Unicode Standard, Version 3.0 or later
 // and ISO/IEC 10646-1 with either UCS-2 or UTF-16 as the adopted encoding
 // form, implementation level 3. If the adopted ISO/IEC 10646-1 subset is not
 // otherwise specified, it is presumed to be the BMP subset, collection 300.
 //
 // UnicodeLetter: any character in the Unicode categories “Uppercase letter (Lu)”,
 // “Lowercase letter (Ll)”, “Titlecase letter (Lt)”, “Modifier letter (Lm)”,
 // “Other letter (Lo)”, or “Letter number (Nl)”.
 const idStartES5 = []
  .concat(
    require("@unicode/unicode-3.0.0/General_Category/Uppercase_Letter/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Lowercase_Letter/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Titlecase_Letter/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Modifier_Letter/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Other_Letter/code-points"),
    // The "letter number" category is not included because old versions of Safari
    // had a bug where they didn't include it. This means it does not match ES5.
    // We need to make sure we escape these characters so Safari can read them.
    // See https://github.com/evanw/esbuild/issues/1349 for more information.
    // require('@unicode/unicode-3.0.0/General_Category/Letter_Number/code-points'),
  )
  .sort((a, b) => a - b);
 // UnicodeCombiningMark: any character in the Unicode categories “Non-spacing mark (Mn)”
 // or “Combining spacing mark (Mc)”
 // UnicodeDigit: any character in the Unicode category “Decimal number (Nd)”
 // UnicodeConnectorPunctuation: any character in the Unicode category “Connector punctuation (Pc)”
 const idContinueES5 = idStartES5
  .concat(
    require("@unicode/unicode-3.0.0/General_Category/Nonspacing_Mark/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Spacing_Mark/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Decimal_Number/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Connector_Punctuation/code-points"),
  )
  .sort((a, b) => a - b);
 // ESNext reference: https://tc39.es/ecma262/
 //
 // A conforming implementation of ECMAScript must interpret source text input
 // in conformance with the Unicode Standard, Version 5.1.0 or later and ISO/IEC
 // 10646. If the adopted ISO/IEC 10646-1 subset is not otherwise specified, it
 // is presumed to be the Unicode set, collection 10646.
 //
 // UnicodeIDStart: any Unicode code point with the Unicode property “ID_Start”
 const idStartESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Start/code-points");
 const idStartESNextSet = new Set(idStartESNext);
 // UnicodeIDContinue: any Unicode code point with the Unicode property “ID_Continue”
 const idContinueESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Continue/code-points");
 const idContinueESNextSet = new Set(idContinueESNext);
 // Intersection: code points accepted by the ES5 tables AND the ESNext tables.
 // Filtering the (already sorted) ES5 arrays keeps the result in ascending order.
 const idStartES5AndESNext = idStartES5.filter(codePoint => idStartESNextSet.has(codePoint));
 const idContinueES5AndESNext = idContinueES5.filter(codePoint => idContinueESNextSet.has(codePoint));
 // Union: code points accepted by the ES5 tables OR the ESNext tables.
 // A Set removes duplicates; the numeric comparator restores ascending order.
 const idStartES5OrESNext = Array.from(new Set([...idStartES5, ...idStartESNext])).sort((lhs, rhs) => lhs - rhs);
 const idContinueES5OrESNext = Array.from(new Set([...idContinueES5, ...idContinueESNext])).sort(
  (lhs, rhs) => lhs - rhs,
 );
 /**
  * Renders a `RangeTable.init(...)` Zig expression for a list of code points.
  *
  * Consecutive code points are collapsed into inclusive `{start, end}` pairs.
  * Pairs that start below 0x1000 are emitted into the `R16Range` list and all
  * remaining pairs into the `R32Range` list; a run never crosses that boundary.
  * The first argument of the emitted call is the number of leading code points
  * that are <= 0xff.
  *
  * NOTE(review): the split point is 0x1000 although the tables are labelled
  * 16-bit / 32-bit; 0x10000 may have been intended. Left unchanged because it
  * affects the generated output - confirm before touching.
  *
  * @param {number[]} codePoints code points, expected in ascending order
  * @returns {string} the generated Zig source, lines joined with "\n"
  */
 function generateRangeTable(codePoints) {
  const total = codePoints.length;
  const hex = value => `0x${value.toString(16)}`;

  // Count the leading entries that fall inside Latin-1.
  const firstNonLatin = codePoints.findIndex(codePoint => !(codePoint <= 0xff));
  const latinOffset = firstNonLatin === -1 ? total : firstNonLatin;

  const out = [`RangeTable.init(`, `   ${latinOffset},`, `  &[_]R16Range{`];

  let cursor = 0;
  // Consumes one run of consecutive code points starting at `cursor` and
  // returns its inclusive bounds. `mayExtend` limits which values may join.
  const takeRun = mayExtend => {
    const first = codePoints[cursor];
    cursor += 1;
    while (cursor < total && mayExtend(codePoints[cursor]) && codePoints[cursor] === codePoints[cursor - 1] + 1) {
      cursor += 1;
    }
    return [first, codePoints[cursor - 1]];
  };

  // Low table: every run that starts (and stays) below 0x1000.
  const isLow = codePoint => codePoint < 0x1000;
  while (cursor < total && isLow(codePoints[cursor])) {
    const [first, last] = takeRun(isLow);
    out.push(`  .{${hex(first)}, ${hex(last)}},`);
  }

  out.push(`  },`, `&[_]R32Range{`);

  // High table: whatever is left.
  while (cursor < total) {
    const [first, last] = takeRun(() => true);
    out.push(`    .{${hex(first)}, ${hex(last)}},`);
  }

  out.push(`  },`, `);`);
  return out.join("\n");
 }
 /**
  * Renders the body of a Zig function as one `switch (codepoint)` expression
  * that yields `true` for every listed code point and `false` otherwise.
  *
  * Consecutive code points are collapsed into `0xA...0xB` range prongs. Runs
  * that start below 0x1000 are emitted first (and never extend past 0x1000);
  * the remaining runs follow, each prefixed with one space.
  *
  * Note: an empty `codePoints` list produces a switch with no prong before
  * `=> true`, which is not valid Zig; callers are expected to pass data.
  *
  * @param {number[]} codePoints code points, expected in ascending order
  * @returns {string} the generated Zig source, lines joined with "\n"
  */
 function generateBigSwitchStatement(codePoints) {
  // The previous version also counted Latin-1 entries into a `latinOffset`
  // local that was never read; that dead loop has been dropped.
  const lines = [`return switch(codepoint) {`];
  let index = 0;
  // Runs below 0x1000
  while (index < codePoints.length && codePoints[index] < 0x1000) {
    const start = codePoints[index];
    index++;
    while (index < codePoints.length && codePoints[index] < 0x1000 && codePoints[index] === codePoints[index - 1] + 1) {
      index++;
    }
    const end = codePoints[index - 1];
    lines.push(`0x${start.toString(16)}...0x${end.toString(16)},`);
  }
  // Everything else
  while (index < codePoints.length) {
    const start = codePoints[index];
    index++;
    while (index < codePoints.length && codePoints[index] === codePoints[index - 1] + 1) {
      index++;
    }
    const end = codePoints[index - 1];
    lines.push(` 0x${start.toString(16)}...0x${end.toString(16)},`);
  }
  // Same bytes as the original multi-line template literal, spelled with
  // escapes so the significant trailing/leading whitespace is visible.
  lines.push(" => true, \n    else => false  \n };");
  return lines.join("\n");
 }
 // Emit the generated Zig module to ../src/js_lexer/unicode.zig (relative to
 // this script's directory), overwriting any existing file.
 //
 // The module contains:
 //   - id_start / id_continue: range tables built from the ES5-or-ESNext unions
 //   - printable_id_start / printable_id_continue: range tables built from the
 //     ESNext-only lists
 //   - isIdentifierStart / isIdentifierContinue: switch-based predicates built
 //     from the ES5-or-ESNext unions
 //
 // Everything inside the template literal is written verbatim, so its
 // whitespace is part of the output.
 fs.writeFileSync(
  path.join(__dirname, "..", "src", "js_lexer", "unicode.zig"),
  `// This file was automatically generated by ${path.basename(__filename)}. Do not edit.
  const RangeTable = @import("./range_table.zig");
 // ES5 || ESNext
 pub const id_start = ${generateRangeTable(idStartES5OrESNext)}
 // ES5 || ESNext
 pub const id_continue = ${generateRangeTable(idContinueES5OrESNext)}
 pub const printable_id_start = ${generateRangeTable(idStartESNext)}
 pub const printable_id_continue = ${generateRangeTable(idContinueESNext)}
 pub fn isIdentifierStart(comptime Codepoint: type, codepoint: Codepoint) bool{
  ${generateBigSwitchStatement(idStartES5OrESNext)}
 }
 pub fn isIdentifierContinue(comptime Codepoint: type, codepoint: Codepoint) bool{
  ${generateBigSwitchStatement(idContinueES5OrESNext)}
 }
 `,
 );
--- a/misctools/gen-unicode-table.ts
+++ b/misctools/gen-unicode-table.ts
@@ -0,0 +1,108 @@
 import { Generator, Context } from "./unicode-generator";
 // Create sets for fast lookups
 const idStartES5Set = new Set([
  ...require("@unicode/unicode-3.0.0/General_Category/Uppercase_Letter/code-points"),
  ...require("@unicode/unicode-3.0.0/General_Category/Lowercase_Letter/code-points"),
  ...require("@unicode/unicode-3.0.0/General_Category/Titlecase_Letter/code-points"),
  ...require("@unicode/unicode-3.0.0/General_Category/Modifier_Letter/code-points"),
  ...require("@unicode/unicode-3.0.0/General_Category/Other_Letter/code-points"),
 ]);
 const idContinueES5Set = new Set([
  ...idStartES5Set,
  ...require("@unicode/unicode-3.0.0/General_Category/Nonspacing_Mark/code-points"),
  ...require("@unicode/unicode-3.0.0/General_Category/Spacing_Mark/code-points"),
  ...require("@unicode/unicode-3.0.0/General_Category/Decimal_Number/code-points"),
  ...require("@unicode/unicode-3.0.0/General_Category/Connector_Punctuation/code-points"),
 ]);
 const idStartESNextSet = new Set(require("@unicode/unicode-15.1.0/Binary_Property/ID_Start/code-points"));
 const idContinueESNextSet = new Set(require("@unicode/unicode-15.1.0/Binary_Property/ID_Continue/code-points"));
 // Exclude known problematic codepoints
 const ID_Continue_mistake = new Set([0x30fb, 0xff65]);
 /**
  * Packs a sequence of truthy/falsy flags into 64-bit words.
  *
  * Flag `k` lands in word `floor(k / 64)` at bit `k % 64` (least significant
  * bit first). The final word is zero-padded when the input length is not a
  * multiple of 64; an empty input yields an empty array.
  *
  * @param bits flags to pack; any truthy entry sets the corresponding bit
  * @returns one bigint per started group of 64 flags
  */
 function bitsToU64Array(bits: number[]): bigint[] {
  const wordCount = Math.ceil(bits.length / 64);
  const words: bigint[] = new Array<bigint>(wordCount).fill(0n);
  for (let position = 0; position < bits.length; position++) {
    if (!bits[position]) continue;
    const wordIndex = Math.floor(position / 64);
    words[wordIndex] = words[wordIndex] | (1n << BigInt(position % 64));
  }
  return words;
 }
 /**
  * Builds the Zig source for one membership predicate plus its lookup tables.
  *
  * The emitted code is a `pub fn <name>(cp: u21) bool` that looks up a block
  * offset in `<table>.stage1` (indexed by `cp >> 8`), adds the low 8 bits and
  * tests the resulting bit in `<table>.stage2`, a bitset packed into `u64`s.
  *
  * @param table   name of the generated Zig struct holding `stage1` / `stage2`
  * @param name    name of the generated Zig predicate function
  * @param checkFn returns true when a code point belongs to the set
  * @returns the generated Zig source text
  */
 async function generateTable(table: string, name: string, checkFn: (cp: number) => boolean) {
  const context: Context<boolean> = {
    get: (cp: number) => checkFn(cp),
    eql: (a: boolean, b: boolean) => a === b,
  };
  const generator = new Generator(context);
  const tables = await generator.generate();
  // `tables.stage2` stores indices into `tables.stage3`, not the booleans
  // themselves. Packing the raw indices only works when index 0 happens to be
  // `false` (i.e. code point 0 is not a member). Resolve each index through
  // stage3 so the bitset is correct for any predicate.
  const membershipBits = tables.stage2.map(elemIndex => (tables.stage3[elemIndex] ? 1 : 0));
  const stage2Words = bitsToU64Array(membershipBits)
    .map(n => n.toString())
    .join(",");
  return `
 pub fn ${name}(cp: u21) bool {
    if (cp > 0x10FFFF) return false;
    const high = cp >> 8;
    const low = cp & 0xFF;
    const stage2_idx = ${table}.stage1[high];
    const bit_pos = stage2_idx + low;
    const u64_idx = bit_pos >> 6;
    const bit_idx = @as(u6, @intCast(bit_pos & 63));
    return (${table}.stage2[u64_idx] & (@as(u64, 1) << bit_idx)) != 0;
 }
 const ${table} = struct {
    pub const stage1 = [_]u16{${tables.stage1.join(",")}};
    pub const stage2 = [_]u64{${stage2Words}};
 };
 `;
 }
 /**
  * Generates the four identifier predicates (ES5 / ESNext, start / continue)
  * and prints the combined Zig source to stdout.
  */
 async function main() {
  // [generated Zig function name, generated table struct name, membership test]
  const specs: Array<[string, string, (cp: number) => boolean]> = [
    ["isIDStartES5", "idStartES5", cp => idStartES5Set.has(cp)],
    ["isIDContinueES5", "idContinueES5", cp => idContinueES5Set.has(cp)],
    ["isIDStartESNext", "idStartESNext", cp => idStartESNextSet.has(cp)],
    // ESNext ID_Continue minus the code points listed in ID_Continue_mistake.
    ["isIDContinueESNext", "idContinueESNext", cp => idContinueESNextSet.has(cp) && !ID_Continue_mistake.has(cp)],
  ];

  // Kick off all generators, then wait for them in declaration order.
  const pending = specs.map(async ([fnName, tableName, predicate]) => {
    const zigSource = await generateTable(tableName, fnName, predicate);
    return "\n /// " + fnName + " checks if a codepoint is valid in the " + fnName + " category\n " + zigSource;
  });
  const sections = await Promise.all(pending);

  console.log("/// This file is auto-generated. Do not edit.\n " + sections.join("\n\n"));
 }
 main();
--- a/misctools/package.json
+++ b/misctools/package.json
@@ -5,7 +5,10 @@
  "license": "MIT",
  "devDependencies": {
    "@unicode/unicode-13.0.0": "^1.2.1",
-    "@unicode/unicode-3.0.0": "^1.2.1",
+    "@unicode/unicode-3.0.0": "^1.6.5",
    "semver": "^7.3.7"
  },
  "dependencies": {
    "@unicode/unicode-15.1.0": "^1.6.5"
  }
 }
--- a/misctools/unicode-generator.ts
+++ b/misctools/unicode-generator.ts
@@ -0,0 +1,138 @@
 import crypto from "crypto";
 // Types to mirror Zig's structures
 /**
  * Callbacks the table generator needs for an element type `Elem`.
  */
 interface Context<Elem> {
  /** Returns the element associated with `codepoint`; may be synchronous or a Promise. */
  get(codepoint: number): Promise<Elem> | Elem;
  /** Returns true when `a` and `b` should be treated as the same element. */
  eql(a: Elem, b: Elem): boolean;
 }
 /**
  * Output of `Generator.generate()`: a three-stage lookup table.
  */
 interface Tables<Elem> {
  /** One entry per 256-code-point block: the offset of that block's data in `stage2`. */
  stage1: number[];
  /** Concatenated, de-duplicated blocks; each value is an index into `stage3`. */
  stage2: number[];
  /** The distinct elements, in order of first appearance. */
  stage3: Elem[];
 }
 /**
  * Builds a three-stage lookup table over the whole Unicode code space
  * (0..0x10FFFF), de-duplicating identical 256-code-point blocks.
  *
  * - `stage3` holds each distinct element once.
  * - `stage2` holds, per code point of every unique block, an index into `stage3`.
  * - `stage1` maps a block number (`cp >> 8`) to that block's offset in `stage2`.
  */
 class Generator<Elem> {
  /** Number of code points per block. */
  private static readonly BLOCK_SIZE = 256;
  private readonly ctx: Context<Elem>;
  /** Block content hash -> offset of that block inside stage2. */
  private readonly blockMap = new Map<string, number>();
  constructor(ctx: Context<Elem>) {
    this.ctx = ctx;
  }
  /** Returns a SHA-256 hex digest of the block's values (encoded as 16-bit integers). */
  private hashBlock(block: number[]): string {
    const hash = crypto.createHash("sha256");
    hash.update(Buffer.from(new Uint16Array(block).buffer));
    return hash.digest("hex");
  }
  /**
   * Walks every code point, asks the context for its element and builds the
   * three tables.
   *
   * @throws Error when more than 0x10000 distinct elements are seen, or when
   *         a block offset into stage2 does not fit in 16 bits.
   */
  async generate(): Promise<Tables<Elem>> {
    // Offsets cached by an earlier run refer to that run's stage2 array;
    // start from a clean map so repeated calls stay correct.
    this.blockMap.clear();
    const stage1: number[] = [];
    const stage2: number[] = [];
    const stage3: Elem[] = [];
    let block = new Array(Generator.BLOCK_SIZE).fill(0);
    let blockLen = 0;
    // Maximum Unicode codepoint is 0x10FFFF
    for (let cp = 0; cp <= 0x10ffff; cp++) {
      // Get the mapping for this codepoint
      const elem = await this.ctx.get(cp);
      // Find or add the element in stage3
      let blockIdx = stage3.findIndex(item => this.ctx.eql(item, elem));
      if (blockIdx === -1) {
        blockIdx = stage3.length;
        stage3.push(elem);
      }
      if (blockIdx > 0xffff) {
        throw new Error("Block index too large");
      }
      // Add to current block
      block[blockLen] = blockIdx;
      blockLen++;
      // Keep filling until the block is complete or the code space ends
      if (blockLen < Generator.BLOCK_SIZE && cp !== 0x10ffff) {
        continue;
      }
      // Fill remaining block space with zeros if needed
      if (blockLen < Generator.BLOCK_SIZE) {
        block.fill(0, blockLen);
      }
      // Get or create stage2 index for this block
      const blockHash = this.hashBlock(block);
      let stage2Idx = this.blockMap.get(blockHash);
      if (stage2Idx === undefined) {
        stage2Idx = stage2.length;
        this.blockMap.set(blockHash, stage2Idx);
        stage2.push(...block.slice(0, blockLen));
      }
      if (stage2Idx > 0xffff) {
        throw new Error("Stage2 index too large");
      }
      // Add mapping to stage1
      stage1.push(stage2Idx);
      // Reset block
      block = new Array(Generator.BLOCK_SIZE).fill(0);
      blockLen = 0;
    }
    return { stage1, stage2, stage3 };
  }
  /**
   * Generates Zig code for the lookup tables: a function `tableName` that
   * takes `comptime Elem: type` and returns a struct with `stage1`, `stage2`
   * and `stage3` constants.
   *
   * @param tableName    name of the generated Zig function
   * @param tables       tables as returned by `generate()`
   * @param elemToString renders one stage3 element as a Zig expression
   */
  static writeZig<Elem>(tableName: string, tables: Tables<Elem>, elemToString: (elem: Elem) => string): string {
    // generate() allows stage3 indices up to 0xffff, so `u8` is only valid
    // while every stage2 value fits in a byte; widen to `u16` otherwise.
    const stage2Type = tables.stage2.some(value => value > 0xff) ? "u16" : "u8";
    let output = `/// Auto-generated. Do not edit.\n`;
    output += `fn ${tableName}(comptime Elem: type) type {\n`;
    output += "    return struct {\n";
    // Stage 1
    output += `pub const stage1: [${tables.stage1.length}]u16 = .{`;
    output += tables.stage1.join(",");
    output += "};\n\n";
    // Stage 2
    output += `pub const stage2: [${tables.stage2.length}]${stage2Type} = .{`;
    output += tables.stage2.join(",");
    output += "};\n\n";
    // Stage 3
    output += `pub const stage3: [${tables.stage3.length}]Elem = .{`;
    output += tables.stage3.map(elemToString).join(",");
    output += "};\n";
    output += "    };\n}\n";
    return output;
  }
 }
 // Example usage:
 /**
  * Demonstrates the generator end to end: builds tables for a dummy context
  * that maps every code point to the string "Lu", renders them as Zig and
  * prints the result.
  */
 async function example() {
  // Example context that maps codepoints to their category
  const ctx: Context<string> = {
    get: async (cp: number) => {
      // This would normally look up the actual Unicode category
      return "Lu";
    },
    eql: (a: string, b: string) => a === b,
  };
  const generator = new Generator(ctx);
  const tables = await generator.generate();
  // Generate Zig code. writeZig takes (tableName, tables, elemToString); the
  // table name was previously missing, which shifted every argument by one.
  const zigCode = Generator.writeZig("ExampleTable", tables, (elem: string) => `"${elem}"`);
  console.log(zigCode);
 }
 export { Generator, type Context, type Tables };
--- a/src/js_lexer.zig
+++ b/src/js_lexer.zig
@@ -3043,18 +3043,10 @@ pub const Lexer = NewLexer(.{});
 const JSIdentifier = @import("./js_lexer/identifier.zig");
 pub inline fn isIdentifierStart(codepoint: i32) bool {
-    if (comptime Environment.isWasm) {
+    return JSIdentifier.isIdentifierStart(codepoint);
        return JSIdentifier.JumpTable.isIdentifierStart(codepoint);
    }
    return JSIdentifier.Bitset.isIdentifierStart(codepoint);
 }
 pub inline fn isIdentifierContinue(codepoint: i32) bool {
-    if (comptime Environment.isWasm) {
+    return JSIdentifier.isIdentifierPart(codepoint);
        return JSIdentifier.JumpTable.isIdentifierPart(codepoint);
    }
    return JSIdentifier.Bitset.isIdentifierPart(codepoint);
 }
 pub fn isWhitespace(codepoint: CodePoint) bool {
--- a/src/js_lexer/identifier.zig
+++ b/src/js_lexer/identifier.zig
--- a/src/js_lexer/identifier_cache.zig
+++ b/src/js_lexer/identifier_cache.zig
@@ -1,22 +0,0 @@
 const std = @import("std");
 const bun = @import("root").bun;
 const identifier_data = @import("./identifier_data.zig");
 /// Fixed-layout (`extern struct`) header consisting of a two-element `i32`
 /// array `range` followed by a `u32` `len`.
 /// NOTE(review): presumably describes a precomputed identifier bitset (the
 /// code point range it covers and its length) - confirm against whatever
 /// writes the embedded file.
 pub const CachedBitset = extern struct {
    range: [2]i32,
    len: u32,
    /// Reinterprets the first `@sizeOf(CachedBitset)` bytes of the file
    /// embedded via `@embedFile(filename)` as a `CachedBitset`. The whole
    /// expression is evaluated at compile time.
    pub fn fromFile(comptime filename: anytype) CachedBitset {
        return comptime @as(CachedBitset, @bitCast(bun.asByteSlice(@embedFile(filename)).ptr[0..@sizeOf(CachedBitset)].*));
    }
 };
 /// Assigns to the `masks` field of `masky`, looking up that field's declared
 /// type on `MaskType` via `std.meta.fieldInfo`.
 /// NOTE(review): `@as` takes the destination type first, but here the value
 /// `masks` is in the type position and the type `FieldInfo.type` is what gets
 /// bit-cast; `masky` is also received by value. This looks like it would not
 /// compile if the function were ever referenced - confirm whether it is dead
 /// code before relying on it.
 pub fn setMasks(masks: [*:0]const u8, comptime MaskType: type, masky: MaskType) void {
    // Field metadata (including the field's type) for `MaskType.masks`.
    const FieldInfo: std.builtin.Type.StructField = std.meta.fieldInfo(MaskType, "masks");
    masky.masks = @as(masks, @bitCast(FieldInfo.type));
 }
 pub const id_start_meta = identifier_data.id_start_cached;
 pub const id_continue_meta = identifier_data.id_continue_cached;
 pub const id_start = identifier_data.id_start;
 pub const id_continue = identifier_data.id_continue;
--- a/src/js_lexer/identifier_data.zig
+++ b/src/js_lexer/identifier_data.zig