Better unicode identifier start / continue check (#15455)

This commit is contained in:
Jarred Sumner
2024-12-25 23:02:46 -08:00
committed by GitHub
parent d4c0432a5f
commit 145a7fd92e
8 changed files with 320 additions and 2397 deletions

View File

@@ -1,172 +0,0 @@
// Thank you @evanw for this code!!!
const fs = require("fs");
const path = require("path");
// ES5 reference: https://es5.github.io/
//
// A conforming implementation of this International standard shall interpret
// characters in conformance with the Unicode Standard, Version 3.0 or later
// and ISO/IEC 10646-1 with either UCS-2 or UTF-16 as the adopted encoding
// form, implementation level 3. If the adopted ISO/IEC 10646-1 subset is not
// otherwise specified, it is presumed to be the BMP subset, collection 300.
//
// UnicodeLetter: any character in the Unicode categories “Uppercase letter (Lu)”,
// “Lowercase letter (Ll)”, “Titlecase letter (Lt)”, “Modifier letter (Lm)”,
// “Other letter (Lo)”, or “Letter number (Nl)”.
// Code points allowed as the first character of an ES5 identifier, in ascending order.
const idStartES5 = []
.concat(
require("@unicode/unicode-3.0.0/General_Category/Uppercase_Letter/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Lowercase_Letter/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Titlecase_Letter/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Modifier_Letter/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Other_Letter/code-points"),
// The "letter number" category is not included because old versions of Safari
// had a bug where they didn't include it. This means it does not match ES5.
// We need to make sure we escape these characters so Safari can read them.
// See https://github.com/evanw/esbuild/issues/1349 for more information.
// require('@unicode/unicode-3.0.0/General_Category/Letter_Number/code-points'),
)
// Explicit numeric comparator: the default sort compares elements as strings.
.sort((a, b) => a - b);
// UnicodeCombiningMark: any character in the Unicode categories “Non-spacing mark (Mn)”
// or “Combining spacing mark (Mc)”
// UnicodeDigit: any character in the Unicode category “Decimal number (Nd)”
// UnicodeConnectorPunctuation: any character in the Unicode category “Connector punctuation (Pc)”
// `concat` returns a new array, so the sort below does not reorder idStartES5 itself.
const idContinueES5 = idStartES5
.concat(
require("@unicode/unicode-3.0.0/General_Category/Nonspacing_Mark/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Spacing_Mark/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Decimal_Number/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Connector_Punctuation/code-points"),
)
.sort((a, b) => a - b);
// ESNext reference: https://tc39.es/ecma262/
//
// A conforming implementation of ECMAScript must interpret source text input
// in conformance with the Unicode Standard, Version 5.1.0 or later and ISO/IEC
// 10646. If the adopted ISO/IEC 10646-1 subset is not otherwise specified, it
// is presumed to be the Unicode set, collection 10646.
//
// UnicodeIDStart: any Unicode code point with the Unicode property “ID_Start”
const idStartESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Start/code-points");
// Set copy of the list, used for the membership tests in the intersection below.
const idStartESNextSet = new Set(idStartESNext);
// UnicodeIDContinue: any Unicode code point with the Unicode property “ID_Continue”
const idContinueESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Continue/code-points");
// Set copy of the list, used for the membership tests in the intersection below.
const idContinueESNextSet = new Set(idContinueESNext);
// These identifiers are valid in both ES5 and ES6+ (i.e. an intersection of both)
// NOTE(review): neither intersection table is referenced by the file-writing call
// visible in this script — confirm whether they are still needed.
const idStartES5AndESNext = idStartES5.filter(n => idStartESNextSet.has(n));
const idContinueES5AndESNext = idContinueES5.filter(n => idContinueESNextSet.has(n));
// These identifiers are valid in either ES5 or ES6+ (i.e. a union of both)
// The Set removes code points present in both inputs; the sort restores ascending order.
const idStartES5OrESNext = [...new Set(idStartES5.concat(idStartESNext))].sort((a, b) => a - b);
const idContinueES5OrESNext = [...new Set(idContinueES5.concat(idContinueESNext))].sort((a, b) => a - b);
/**
 * Renders an ascending list of code points as the source text of a
 * `RangeTable.init(...)` call.
 *
 * Consecutive code points are collapsed into inclusive `.{lo, hi}` ranges.
 * Ranges whose code points are all below 0x1000 are written to the
 * `R16Range` list; every remaining range is written to the `R32Range` list.
 * The first argument of the emitted call is the number of leading code
 * points that are <= 0xff.
 *
 * @param {number[]} codePoints Code points sorted in ascending order.
 * @returns {string} The generated lines joined with "\n".
 */
function generateRangeTable(codePoints) {
  const total = codePoints.length;
  const renderRange = (lo, hi) => ` .{0x${lo.toString(16)}, 0x${hi.toString(16)}},`;

  // Count the leading code points that fall inside Latin-1.
  let latinOffset = 0;
  for (; latinOffset < total && codePoints[latinOffset] <= 0xff; latinOffset++);

  // Shared cursor: the second pass resumes exactly where the first one stopped.
  let cursor = 0;
  const takeRuns = accepts => {
    const rendered = [];
    while (cursor < total && accepts(codePoints[cursor])) {
      const first = codePoints[cursor];
      let last = first;
      cursor += 1;
      // Extend the run while the next code point is accepted and adjacent.
      while (cursor < total && accepts(codePoints[cursor]) && codePoints[cursor] === last + 1) {
        last = codePoints[cursor];
        cursor += 1;
      }
      rendered.push(renderRange(first, last));
    }
    return rendered;
  };

  const narrowRanges = takeRuns(cp => cp < 0x1000);
  const wideRanges = takeRuns(() => true);

  return [
    `RangeTable.init(`,
    ` ${latinOffset},`,
    ` &[_]R16Range{`,
    ...narrowRanges,
    ` },`,
    `&[_]R32Range{`,
    ...wideRanges,
    ` },`,
    `);`,
  ].join("\n");
}
/**
 * Renders an ascending list of code points as a Zig
 * `return switch(codepoint) { ... => true, else => false };` statement.
 *
 * Consecutive code points are collapsed into inclusive `lo...hi` cases.
 * A run never spans the 0x1000 boundary: code points below 0x1000 are
 * grouped in a first pass and the rest in a second pass. The second pass
 * prefixes each case with a single space, which is the only difference
 * in the emitted text.
 *
 * @param {number[]} codePoints Code points sorted in ascending order.
 * @returns {string} The generated lines joined with "\n".
 */
function generateBigSwitchStatement(codePoints) {
  const lines = [`return switch(codepoint) {`];
  let index = 0;

  // First pass: code points below 0x1000.
  while (index < codePoints.length && codePoints[index] < 0x1000) {
    const start = codePoints[index];
    index++;
    while (index < codePoints.length && codePoints[index] < 0x1000 && codePoints[index] === codePoints[index - 1] + 1) {
      index++;
    }
    const end = codePoints[index - 1];
    lines.push(`0x${start.toString(16)}...0x${end.toString(16)},`);
  }

  // Second pass: everything that remains.
  while (index < codePoints.length) {
    const start = codePoints[index];
    index++;
    while (index < codePoints.length && codePoints[index] === codePoints[index - 1] + 1) {
      index++;
    }
    const end = codePoints[index - 1];
    lines.push(` 0x${start.toString(16)}...0x${end.toString(16)},`);
  }

  lines.push(` => true,
else => false
};`);
  return lines.join("\n");
}
// Destination of the generated Zig source, resolved relative to this script.
const unicodeZigPath = path.join(__dirname, "..", "src", "js_lexer", "unicode.zig");
// Full text of the generated file: four range tables followed by two
// switch-based predicate functions.
const unicodeZigSource = `// This file was automatically generated by ${path.basename(__filename)}. Do not edit.
const RangeTable = @import("./range_table.zig");
// ES5 || ESNext
pub const id_start = ${generateRangeTable(idStartES5OrESNext)}
// ES5 || ESNext
pub const id_continue = ${generateRangeTable(idContinueES5OrESNext)}
pub const printable_id_start = ${generateRangeTable(idStartESNext)}
pub const printable_id_continue = ${generateRangeTable(idContinueESNext)}
pub fn isIdentifierStart(comptime Codepoint: type, codepoint: Codepoint) bool{
${generateBigSwitchStatement(idStartES5OrESNext)}
}
pub fn isIdentifierContinue(comptime Codepoint: type, codepoint: Codepoint) bool{
${generateBigSwitchStatement(idContinueES5OrESNext)}
}
`;
fs.writeFileSync(unicodeZigPath, unicodeZigSource);

View File

@@ -0,0 +1,108 @@
import { Generator, Context } from "./unicode-generator";
// Create sets for fast lookups
// ES5 identifier-start code points: the union of the Unicode 3.0.0 letter
// categories Lu, Ll, Lt, Lm and Lo. The Letter_Number category is not part
// of this union.
const idStartES5Set = new Set([
...require("@unicode/unicode-3.0.0/General_Category/Uppercase_Letter/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Lowercase_Letter/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Titlecase_Letter/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Modifier_Letter/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Other_Letter/code-points"),
]);
// ES5 identifier-continue code points: everything in idStartES5Set plus the
// Unicode 3.0.0 categories Mn, Mc, Nd and Pc.
const idContinueES5Set = new Set([
...idStartES5Set,
...require("@unicode/unicode-3.0.0/General_Category/Nonspacing_Mark/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Spacing_Mark/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Decimal_Number/code-points"),
...require("@unicode/unicode-3.0.0/General_Category/Connector_Punctuation/code-points"),
]);
// ESNext sets come from the Unicode 15.1.0 ID_Start / ID_Continue binary properties.
const idStartESNextSet = new Set(require("@unicode/unicode-15.1.0/Binary_Property/ID_Start/code-points"));
const idContinueESNextSet = new Set(require("@unicode/unicode-15.1.0/Binary_Property/ID_Continue/code-points"));
// Exclude known problematic codepoints
// These two code points are subtracted from the ESNext ID_Continue check in main().
// NOTE(review): presumably U+30FB and U+FF65 (the katakana middle dots) — confirm
// against the ECMAScript grammar before changing this list.
const ID_Continue_mistake = new Set([0x30fb, 0xff65]);
/**
 * Packs a sequence of truthy/falsy entries into 64-bit words.
 *
 * Entry `i` becomes bit `i % 64` (counting from the least significant bit)
 * of word `Math.floor(i / 64)`. A trailing partial group is padded with
 * zero bits. An empty input produces an empty array.
 *
 * @param bits Entries to pack; any truthy value sets the bit.
 * @returns One bigint per group of 64 entries.
 */
function bitsToU64Array(bits: number[]): bigint[] {
  const wordCount = Math.ceil(bits.length / 64);
  return Array.from({ length: wordCount }, (_unused, wordIndex) => {
    const base = wordIndex * 64;
    return bits
      .slice(base, base + 64)
      .reduce((word, bit, offset) => (bit ? word | (1n << BigInt(offset)) : word), 0n);
  });
}
/**
 * Generates the Zig source for one code point predicate together with its
 * two-stage bitset lookup table.
 *
 * @param table Name of the emitted Zig struct that holds `stage1` / `stage2`.
 * @param name Name of the emitted Zig predicate function.
 * @param checkFn Returns true for code points that belong to the set.
 * @returns Zig source text for the function and its table.
 */
async function generateTable(table: string, name: string, checkFn: (cp: number) => boolean) {
  const context: Context<boolean> = {
    get: (cp: number) => checkFn(cp),
    eql: (a: boolean, b: boolean) => a === b,
  };
  const generator = new Generator(context);
  const tables = await generator.generate();
  // stage2 stores indices into stage3, not the booleans themselves. The index
  // only equals the bit value when the first value the generator sees
  // (code point 0) is `false`, so resolve every index through stage3 before
  // packing instead of relying on that ordering.
  const stage2Bits = tables.stage2.map(stage3Index => (tables.stage3[stage3Index] ? 1 : 0));
  const stage2Words = bitsToU64Array(stage2Bits)
    .map(n => n.toString())
    .join(",");
  return `
pub fn ${name}(cp: u21) bool {
if (cp > 0x10FFFF) return false;
const high = cp >> 8;
const low = cp & 0xFF;
const stage2_idx = ${table}.stage1[high];
const bit_pos = stage2_idx + low;
const u64_idx = bit_pos >> 6;
const bit_idx = @as(u6, @intCast(bit_pos & 63));
return (${table}.stage2[u64_idx] & (@as(u64, 1) << bit_idx)) != 0;
}
const ${table} = struct {
pub const stage1 = [_]u16{${tables.stage1.join(",")}};
pub const stage2 = [_]u64{${stage2Words}};
};
`;
}
/**
 * Generates all four identifier predicates (ES5 / ESNext, start / continue)
 * and prints the combined Zig source to stdout.
 */
async function main() {
  const specs = [
    {
      name: "isIDStartES5",
      table: "idStartES5",
      check: (cp: number) => idStartES5Set.has(cp),
    },
    {
      name: "isIDContinueES5",
      table: "idContinueES5",
      check: (cp: number) => idContinueES5Set.has(cp),
    },
    {
      name: "isIDStartESNext",
      table: "idStartESNext",
      check: (cp: number) => idStartESNextSet.has(cp),
    },
    {
      name: "isIDContinueESNext",
      table: "idContinueESNext",
      // The ESNext continue set deliberately leaves out the excluded code points.
      check: (cp: number) => idContinueESNextSet.has(cp) && !ID_Continue_mistake.has(cp),
    },
  ];

  // Every table is started up front; Promise.all keeps the output in spec order.
  const pending = specs.map(spec =>
    generateTable(spec.table, spec.name, spec.check).then(
      code => `
/// ${spec.name} checks if a codepoint is valid in the ${spec.name} category
${code}`,
    ),
  );
  const sections = await Promise.all(pending);
  const body = sections.join("\n\n");

  console.log(`/// This file is auto-generated. Do not edit.
${body}`);
}
main();

View File

@@ -5,7 +5,10 @@
"license": "MIT", "license": "MIT",
"devDependencies": { "devDependencies": {
"@unicode/unicode-13.0.0": "^1.2.1", "@unicode/unicode-13.0.0": "^1.2.1",
"@unicode/unicode-3.0.0": "^1.2.1", "@unicode/unicode-3.0.0": "^1.6.5",
"semver": "^7.3.7" "semver": "^7.3.7"
},
"dependencies": {
"@unicode/unicode-15.1.0": "^1.6.5"
} }
} }

View File

@@ -0,0 +1,138 @@
import crypto from "crypto";
// Types to mirror Zig's structures
/**
 * Callbacks that tell the generator what value each code point maps to.
 */
interface Context<Elem> {
// Returns the value for one code point; may be synchronous or return a Promise.
get(codepoint: number): Promise<Elem> | Elem;
// Equality test the generator uses to decide whether a value is already in stage3.
eql(a: Elem, b: Elem): boolean;
}
/**
 * Result of Generator.generate(): a three-level lookup structure.
 * The value for a code point `cp` is `stage3[stage2[stage1[cp >> 8] + (cp & 0xff)]]`.
 */
interface Tables<Elem> {
// One entry per 256-code-point block: the offset in stage2 where that block starts.
stage1: number[];
// Concatenation of the distinct blocks; each entry is an index into stage3.
stage2: number[];
// Distinct values, in the order they were first encountered.
stage3: Elem[];
}
/**
 * Builds a three-stage lookup table covering every Unicode code point
 * (0 through 0x10FFFF).
 *
 * Code points are grouped into blocks of 256. Identical blocks are stored
 * once in `stage2`; `stage1` maps each block number to the offset of its
 * block in `stage2`; `stage2` entries are indices into `stage3`, which holds
 * the distinct values returned by the context.
 */
class Generator<Elem> {
  private static readonly BLOCK_SIZE = 256;
  private readonly ctx: Context<Elem>;
  // Maps the hash of a block to the stage2 offset where that block was stored.
  private readonly blockMap = new Map<string, number>();

  constructor(ctx: Context<Elem>) {
    this.ctx = ctx;
  }

  /** Returns a hex SHA-256 digest of the block, used as the de-duplication key. */
  private hashBlock(block: number[]): string {
    const hash = crypto.createHash("sha256");
    // Entries are stage3 indices (checked to be <= 0xffff), so 16 bits each is lossless.
    hash.update(Buffer.from(new Uint16Array(block).buffer));
    return hash.digest("hex");
  }

  /**
   * Queries the context for every code point and returns the lookup tables.
   *
   * Safe to call more than once on the same instance: each call starts from
   * an empty de-duplication map.
   *
   * @throws Error when a stage3 index or a stage2 offset exceeds 0xffff.
   */
  async generate(): Promise<Tables<Elem>> {
    // Offsets recorded by a previous run refer to that run's stage2 array.
    // Without this reset, a second call would treat every block as already
    // emitted and return an empty stage2 with stale stage1 offsets.
    this.blockMap.clear();

    const stage1: number[] = [];
    const stage2: number[] = [];
    const stage3: Elem[] = [];
    let block = new Array(Generator.BLOCK_SIZE).fill(0);
    let blockLen = 0;
    // Maximum Unicode codepoint is 0x10FFFF
    for (let cp = 0; cp <= 0x10ffff; cp++) {
      // Get the mapping for this codepoint
      const elem = await this.ctx.get(cp);
      // Find or add the element in stage3
      let blockIdx = stage3.findIndex(item => this.ctx.eql(item, elem));
      if (blockIdx === -1) {
        blockIdx = stage3.length;
        stage3.push(elem);
      }
      if (blockIdx > 0xffff) {
        throw new Error("Block index too large");
      }
      // Add to current block
      block[blockLen] = blockIdx;
      blockLen++;
      // Check if we need to finalize this block
      if (blockLen < Generator.BLOCK_SIZE && cp !== 0x10ffff) {
        continue;
      }
      // Fill remaining block space with zeros if needed
      if (blockLen < Generator.BLOCK_SIZE) {
        block.fill(0, blockLen);
      }
      // Get or create stage2 index for this block
      const blockHash = this.hashBlock(block);
      let stage2Idx = this.blockMap.get(blockHash);
      if (stage2Idx === undefined) {
        stage2Idx = stage2.length;
        this.blockMap.set(blockHash, stage2Idx);
        stage2.push(...block.slice(0, blockLen));
      }
      if (stage2Idx > 0xffff) {
        throw new Error("Stage2 index too large");
      }
      // Add mapping to stage1
      stage1.push(stage2Idx);
      // Reset block
      block = new Array(Generator.BLOCK_SIZE).fill(0);
      blockLen = 0;
    }
    return { stage1, stage2, stage3 };
  }

  /**
   * Generates Zig code for the lookup tables.
   *
   * @param tableName Name of the emitted Zig function that returns the table struct.
   * @param tables Tables produced by generate().
   * @param elemToString Renders one stage3 element as a Zig expression.
   * @returns Zig source text.
   */
  static writeZig<Elem>(tableName: string, tables: Tables<Elem>, elemToString: (elem: Elem) => string): string {
    // stage2 entries are stage3 indices. generate() allows up to 0x10000
    // distinct elements, so u8 is only wide enough while stage3 has at most
    // 256 entries; use u16 beyond that.
    const stage2Type = tables.stage3.length <= 0x100 ? "u8" : "u16";

    let output = `/// Auto-generated. Do not edit.\n`;
    output += `fn ${tableName}(comptime Elem: type) type {\n`;
    output += " return struct {\n";
    // Stage 1
    output += `pub const stage1: [${tables.stage1.length}]u16 = .{`;
    output += tables.stage1.join(",");
    output += "};\n\n";
    // Stage 2
    output += `pub const stage2: [${tables.stage2.length}]${stage2Type} = .{`;
    output += tables.stage2.join(",");
    output += "};\n\n";
    // Stage 3
    output += `pub const stage3: [${tables.stage3.length}]Elem = .{`;
    output += tables.stage3.map(elemToString).join(",");
    output += "};\n";
    output += " };\n}\n";
    return output;
  }
}
// Example usage:
/**
 * Demonstrates the generator end to end: maps every code point to the
 * constant string "Lu", builds the tables and prints the Zig source.
 */
async function example() {
  // Example context that maps codepoints to their category
  const ctx: Context<string> = {
    get: async (cp: number) => {
      // This would normally look up the actual Unicode category
      return "Lu";
    },
    eql: (a: string, b: string) => a === b,
  };
  const generator = new Generator(ctx);
  const tables = await generator.generate();
  // Generate Zig code. writeZig takes the table name as its first argument;
  // it was previously omitted, which shifted `tables` into that position.
  const zigCode = Generator.writeZig("ExampleTable", tables, (elem: string) => `"${elem}"`);
  console.log(zigCode);
}
export { Generator, type Context, type Tables };

View File

@@ -3043,18 +3043,10 @@ pub const Lexer = NewLexer(.{});
const JSIdentifier = @import("./js_lexer/identifier.zig"); const JSIdentifier = @import("./js_lexer/identifier.zig");
pub inline fn isIdentifierStart(codepoint: i32) bool { pub inline fn isIdentifierStart(codepoint: i32) bool {
if (comptime Environment.isWasm) { return JSIdentifier.isIdentifierStart(codepoint);
return JSIdentifier.JumpTable.isIdentifierStart(codepoint);
}
return JSIdentifier.Bitset.isIdentifierStart(codepoint);
} }
pub inline fn isIdentifierContinue(codepoint: i32) bool { pub inline fn isIdentifierContinue(codepoint: i32) bool {
if (comptime Environment.isWasm) { return JSIdentifier.isIdentifierPart(codepoint);
return JSIdentifier.JumpTable.isIdentifierPart(codepoint);
}
return JSIdentifier.Bitset.isIdentifierPart(codepoint);
} }
pub fn isWhitespace(codepoint: CodePoint) bool { pub fn isWhitespace(codepoint: CodePoint) bool {

File diff suppressed because one or more lines are too long

View File

@@ -1,22 +0,0 @@
const std = @import("std");
const bun = @import("root").bun;
const identifier_data = @import("./identifier_data.zig");
/// Fixed-layout record that is reinterpreted from the leading bytes of an
/// embedded file.
pub const CachedBitset = extern struct {
    range: [2]i32,
    len: u32,

    /// Embeds `filename` at compile time and bit-casts its first
    /// `@sizeOf(CachedBitset)` bytes into a `CachedBitset`.
    pub fn fromFile(comptime filename: anytype) CachedBitset {
        const header_len = @sizeOf(CachedBitset);
        return comptime blk: {
            const raw = bun.asByteSlice(@embedFile(filename));
            break :blk @as(CachedBitset, @bitCast(raw.ptr[0..header_len].*));
        };
    }
};
/// Intended to store `masks` into the `masks` field of `masky`.
///
/// NOTE(review): this body looks like it cannot compile if the function is
/// ever instantiated, so it is presumably unused — confirm before relying on it.
pub fn setMasks(masks: [*:0]const u8, comptime MaskType: type, masky: MaskType) void {
// Look up the declaration of the field named "masks" on MaskType.
const FieldInfo: std.builtin.Type.StructField = std.meta.fieldInfo(MaskType, "masks");
// NOTE(review): `@as` takes the destination type first, but here the value
// `masks` is in the type position and the type `FieldInfo.type` is the operand;
// the two arguments appear to be swapped. `masky` is also a by-value parameter,
// and Zig function parameters are immutable, so this assignment is suspect too.
masky.masks = @as(masks, @bitCast(FieldInfo.type));
}
// Re-exports of declarations from ./identifier_data.zig under the names used by this module.
pub const id_start_meta = identifier_data.id_start_cached;
pub const id_continue_meta = identifier_data.id_continue_cached;
pub const id_start = identifier_data.id_start;
pub const id_continue = identifier_data.id_continue;

File diff suppressed because one or more lines are too long