mirror of
https://github.com/oven-sh/bun
synced 2026-02-02 15:08:46 +00:00
Better unicode identifier start / continue check (#15455)
This commit is contained in:
@@ -1,172 +0,0 @@
|
||||
// Thank you @evanw for this code!!!
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
|
||||
// ES5 reference: https://es5.github.io/
|
||||
//
|
||||
// A conforming implementation of this International standard shall interpret
|
||||
// characters in conformance with the Unicode Standard, Version 3.0 or later
|
||||
// and ISO/IEC 10646-1 with either UCS-2 or UTF-16 as the adopted encoding
|
||||
// form, implementation level 3. If the adopted ISO/IEC 10646-1 subset is not
|
||||
// otherwise specified, it is presumed to be the BMP subset, collection 300.
|
||||
//
|
||||
// UnicodeLetter: any character in the Unicode categories “Uppercase letter (Lu)”,
|
||||
// “Lowercase letter (Ll)”, “Titlecase letter (Lt)”, “Modifier letter (Lm)”,
|
||||
// “Other letter (Lo)”, or “Letter number (Nl)”.
|
||||
const idStartES5 = []
|
||||
.concat(
|
||||
require("@unicode/unicode-3.0.0/General_Category/Uppercase_Letter/code-points"),
|
||||
require("@unicode/unicode-3.0.0/General_Category/Lowercase_Letter/code-points"),
|
||||
require("@unicode/unicode-3.0.0/General_Category/Titlecase_Letter/code-points"),
|
||||
require("@unicode/unicode-3.0.0/General_Category/Modifier_Letter/code-points"),
|
||||
require("@unicode/unicode-3.0.0/General_Category/Other_Letter/code-points"),
|
||||
|
||||
// The "letter number" category is not included because old versions of Safari
|
||||
// had a bug where they didn't include it. This means it does not match ES5.
|
||||
// We need to make sure we escape these characters so Safari can read them.
|
||||
// See https://github.com/evanw/esbuild/issues/1349 for more information.
|
||||
// require('@unicode/unicode-3.0.0/General_Category/Letter_Number/code-points'),
|
||||
)
|
||||
.sort((a, b) => a - b);
|
||||
|
||||
// UnicodeCombiningMark: any character in the Unicode categories “Non-spacing mark (Mn)”
|
||||
// or “Combining spacing mark (Mc)”
|
||||
// UnicodeDigit: any character in the Unicode category “Decimal number (Nd)”
|
||||
// UnicodeConnectorPunctuation: any character in the Unicode category “Connector punctuation (Pc)”
|
||||
const idContinueES5 = idStartES5
|
||||
.concat(
|
||||
require("@unicode/unicode-3.0.0/General_Category/Nonspacing_Mark/code-points"),
|
||||
require("@unicode/unicode-3.0.0/General_Category/Spacing_Mark/code-points"),
|
||||
require("@unicode/unicode-3.0.0/General_Category/Decimal_Number/code-points"),
|
||||
require("@unicode/unicode-3.0.0/General_Category/Connector_Punctuation/code-points"),
|
||||
)
|
||||
.sort((a, b) => a - b);
|
||||
|
||||
// ESNext reference: https://tc39.es/ecma262/
|
||||
//
|
||||
// A conforming implementation of ECMAScript must interpret source text input
|
||||
// in conformance with the Unicode Standard, Version 5.1.0 or later and ISO/IEC
|
||||
// 10646. If the adopted ISO/IEC 10646-1 subset is not otherwise specified, it
|
||||
// is presumed to be the Unicode set, collection 10646.
|
||||
//
|
||||
// UnicodeIDStart: any Unicode code point with the Unicode property “ID_Start”
|
||||
const idStartESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Start/code-points");
|
||||
const idStartESNextSet = new Set(idStartESNext);
|
||||
|
||||
// UnicodeIDContinue: any Unicode code point with the Unicode property “ID_Continue”
|
||||
const idContinueESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Continue/code-points");
|
||||
const idContinueESNextSet = new Set(idContinueESNext);
|
||||
|
||||
// These identifiers are valid in both ES5 and ES6+ (i.e. an intersection of both)
|
||||
const idStartES5AndESNext = idStartES5.filter(n => idStartESNextSet.has(n));
|
||||
const idContinueES5AndESNext = idContinueES5.filter(n => idContinueESNextSet.has(n));
|
||||
|
||||
// These identifiers are valid in either ES5 or ES6+ (i.e. a union of both)
|
||||
const idStartES5OrESNext = [...new Set(idStartES5.concat(idStartESNext))].sort((a, b) => a - b);
|
||||
const idContinueES5OrESNext = [...new Set(idContinueES5.concat(idContinueESNext))].sort((a, b) => a - b);
|
||||
|
||||
/**
 * Renders a list of code points as the source text of a `RangeTable.init(...)`
 * call made of two range lists (`R16Range` followed by `R32Range`).
 *
 * Consecutive code points are collapsed into inclusive `.{start, end}` pairs.
 * The input is walked in order, so it is expected to be sorted ascending.
 *
 * @param {number[]} codePoints code points to encode
 * @returns {string} the generated lines joined with "\n"
 */
function generateRangeTable(codePoints) {
  const total = codePoints.length;

  // Count the leading entries that fall in the Latin-1 range (<= 0xff); the
  // count is emitted as the first argument of RangeTable.init.
  let latinOffset = 0;
  while (latinOffset < total && codePoints[latinOffset] <= 0xff) {
    latinOffset += 1;
  }

  const out = [`RangeTable.init(`, ` ${latinOffset},`, ` &[_]R16Range{`];
  let cursor = 0;

  // Consumes code points from `cursor` for as long as `belongs` accepts them,
  // emitting one line per run of consecutive values.
  const emitRuns = belongs => {
    while (cursor < total && belongs(codePoints[cursor])) {
      const first = codePoints[cursor];
      let last = first;
      cursor += 1;
      while (cursor < total && belongs(codePoints[cursor]) && codePoints[cursor] === last + 1) {
        last = codePoints[cursor];
        cursor += 1;
      }
      out.push(` .{0x${first.toString(16)}, 0x${last.toString(16)}},`);
    }
  };

  // First list. NOTE(review): the cut-off is 0x1000 although the list is named
  // R16Range; everything from 0x1000 upwards goes to the second list. Kept as-is.
  emitRuns(cp => cp < 0x1000);

  out.push(` },`, `&[_]R32Range{`);

  // Second list: everything that is left.
  emitRuns(() => true);

  out.push(` },`, `);`);
  return out.join("\n");
}
|
||||
|
||||
/**
 * Renders a list of code points as the body of a Zig function that returns
 * whether `codepoint` is in the list, using one `switch` whose single prong
 * lists every run of consecutive code points as an inclusive `a...b` range.
 *
 * The input is walked in order, so it is expected to be sorted ascending.
 *
 * @param {number[]} codePoints code points to encode
 * @returns {string} the generated statement(s) joined with "\n"
 */
function generateBigSwitchStatement(codePoints) {
  // A switch prong with no items (` => true`) is not valid Zig, so an empty
  // set is rendered as a constant answer instead.
  if (codePoints.length === 0) {
    return `return false;`;
  }

  const lines = [`return switch(codepoint) {`];
  let index = 0;

  // Code points below 0x1000 (emitted without a leading space).
  while (index < codePoints.length && codePoints[index] < 0x1000) {
    const start = codePoints[index];
    index++;
    while (index < codePoints.length && codePoints[index] < 0x1000 && codePoints[index] === codePoints[index - 1] + 1) {
      index++;
    }
    const end = codePoints[index - 1];
    lines.push(`0x${start.toString(16)}...0x${end.toString(16)},`);
  }

  // Everything else (emitted with a leading space).
  while (index < codePoints.length) {
    const start = codePoints[index];
    index++;
    while (index < codePoints.length && codePoints[index] === codePoints[index - 1] + 1) {
      index++;
    }
    const end = codePoints[index - 1];
    lines.push(` 0x${start.toString(16)}...0x${end.toString(16)},`);
  }

  // Close the prong that all ranges above belong to, then the fallback.
  lines.push(` => true,\nelse => false\n};`);
  return lines.join("\n");
}
|
||||
|
||||
fs.writeFileSync(
|
||||
path.join(__dirname, "..", "src", "js_lexer", "unicode.zig"),
|
||||
`// This file was automatically generated by ${path.basename(__filename)}. Do not edit.
|
||||
|
||||
const RangeTable = @import("./range_table.zig");
|
||||
|
||||
|
||||
// ES5 || ESNext
|
||||
pub const id_start = ${generateRangeTable(idStartES5OrESNext)}
|
||||
|
||||
// ES5 || ESNext
|
||||
pub const id_continue = ${generateRangeTable(idContinueES5OrESNext)}
|
||||
|
||||
pub const printable_id_start = ${generateRangeTable(idStartESNext)}
|
||||
pub const printable_id_continue = ${generateRangeTable(idContinueESNext)}
|
||||
|
||||
pub fn isIdentifierStart(comptime Codepoint: type, codepoint: Codepoint) bool{
|
||||
${generateBigSwitchStatement(idStartES5OrESNext)}
|
||||
}
|
||||
|
||||
pub fn isIdentifierContinue(comptime Codepoint: type, codepoint: Codepoint) bool{
|
||||
${generateBigSwitchStatement(idContinueES5OrESNext)}
|
||||
}
|
||||
|
||||
|
||||
`,
|
||||
);
|
||||
108
misctools/gen-unicode-table.ts
Normal file
108
misctools/gen-unicode-table.ts
Normal file
@@ -0,0 +1,108 @@
|
||||
import { Generator, Context } from "./unicode-generator";
|
||||
|
||||
// Create sets for fast lookups
|
||||
const idStartES5Set = new Set([
|
||||
...require("@unicode/unicode-3.0.0/General_Category/Uppercase_Letter/code-points"),
|
||||
...require("@unicode/unicode-3.0.0/General_Category/Lowercase_Letter/code-points"),
|
||||
...require("@unicode/unicode-3.0.0/General_Category/Titlecase_Letter/code-points"),
|
||||
...require("@unicode/unicode-3.0.0/General_Category/Modifier_Letter/code-points"),
|
||||
...require("@unicode/unicode-3.0.0/General_Category/Other_Letter/code-points"),
|
||||
]);
|
||||
|
||||
const idContinueES5Set = new Set([
|
||||
...idStartES5Set,
|
||||
...require("@unicode/unicode-3.0.0/General_Category/Nonspacing_Mark/code-points"),
|
||||
...require("@unicode/unicode-3.0.0/General_Category/Spacing_Mark/code-points"),
|
||||
...require("@unicode/unicode-3.0.0/General_Category/Decimal_Number/code-points"),
|
||||
...require("@unicode/unicode-3.0.0/General_Category/Connector_Punctuation/code-points"),
|
||||
]);
|
||||
|
||||
const idStartESNextSet = new Set(require("@unicode/unicode-15.1.0/Binary_Property/ID_Start/code-points"));
|
||||
const idContinueESNextSet = new Set(require("@unicode/unicode-15.1.0/Binary_Property/ID_Continue/code-points"));
|
||||
|
||||
// Exclude known problematic codepoints
|
||||
const ID_Continue_mistake = new Set([0x30fb, 0xff65]);
|
||||
|
||||
/**
 * Packs an array of truthy/falsy flags into 64-bit words.
 *
 * Flag `k` ends up in word `k >> 6` at bit position `k & 63` (least
 * significant bit first). The final word is zero-padded when `bits.length`
 * is not a multiple of 64.
 *
 * @param bits one entry per bit; any truthy value sets the bit
 * @returns the packed words as non-negative bigints
 */
function bitsToU64Array(bits: number[]): bigint[] {
  const wordCount = Math.ceil(bits.length / 64);
  const words: bigint[] = new Array<bigint>(wordCount).fill(0n);

  for (let position = 0; position < bits.length; position++) {
    if (!bits[position]) continue;
    const wordIndex = position >> 6;
    words[wordIndex] = words[wordIndex] | (1n << BigInt(position & 63));
  }

  return words;
}
|
||||
|
||||
/**
 * Builds the Zig source for one code point predicate: a `pub fn` that does a
 * two-level table lookup, plus the struct holding its `stage1` / `stage2`
 * tables.
 *
 * @param table   name of the generated Zig struct that holds the tables
 * @param name    name of the generated Zig lookup function
 * @param checkFn membership test evaluated for every code point
 * @returns the generated Zig source text
 */
async function generateTable(table: string, name: string, checkFn: (cp: number) => boolean) {
  const context: Context<boolean> = {
    get: (cp: number) => checkFn(cp),
    eql: (a: boolean, b: boolean) => a === b,
  };

  const generator = new Generator(context);
  const tables = await generator.generate();

  // `tables.stage2` holds indices into `tables.stage3`, not booleans. Which
  // index means "true" depends on which value the generator saw first, so
  // resolve every index through stage3 before packing it as a bit.
  const bits = tables.stage2.map(elemIdx => (tables.stage3[elemIdx] ? 1 : 0));
  const stage2Words = bitsToU64Array(bits)
    .map(n => n.toString())
    .join(",");

  // stage1 is indexed by `cp >> 8` and yields the bit offset of that block in
  // stage2; the low 8 bits of `cp` select the bit inside the block.
  return `
pub fn ${name}(cp: u21) bool {
    if (cp > 0x10FFFF) return false;
    const high = cp >> 8;
    const low = cp & 0xFF;
    const stage2_idx = ${table}.stage1[high];
    const bit_pos = stage2_idx + low;
    const u64_idx = bit_pos >> 6;
    const bit_idx = @as(u6, @intCast(bit_pos & 63));
    return (${table}.stage2[u64_idx] & (@as(u64, 1) << bit_idx)) != 0;
}
const ${table} = struct {
    pub const stage1 = [_]u16{${tables.stage1.join(",")}};
    pub const stage2 = [_]u64{${stage2Words}};
};

`;
}
|
||||
|
||||
/**
 * Generates the four identifier lookup functions (ES5 and ESNext, start and
 * continue) and prints the resulting Zig source to stdout.
 */
async function main() {
  // Each entry: [Zig function name, Zig table struct name, membership test].
  const specs: Array<[string, string, (cp: number) => boolean]> = [
    ["isIDStartES5", "idStartES5", cp => idStartES5Set.has(cp)],
    ["isIDContinueES5", "idContinueES5", cp => idContinueES5Set.has(cp)],
    ["isIDStartESNext", "idStartESNext", cp => idStartESNextSet.has(cp)],
    [
      "isIDContinueESNext",
      "idContinueESNext",
      // The ESNext continue set drops the code points listed in ID_Continue_mistake.
      cp => idContinueESNextSet.has(cp) && !ID_Continue_mistake.has(cp),
    ],
  ];

  const pending = specs.map(([name, table, check]) =>
    generateTable(table, name, check).then(
      code => "\n/// " + name + " checks if a codepoint is valid in the " + name + " category\n" + code,
    ),
  );
  const sections = await Promise.all(pending);

  const header = "/// This file is auto-generated. Do not edit.";
  console.log(header + "\n\n" + sections.join("\n\n"));
}
|
||||
|
||||
main();
|
||||
@@ -5,7 +5,10 @@
|
||||
"license": "MIT",
|
||||
"devDependencies": {
|
||||
"@unicode/unicode-13.0.0": "^1.2.1",
|
||||
"@unicode/unicode-3.0.0": "^1.2.1",
|
||||
"@unicode/unicode-3.0.0": "^1.6.5",
|
||||
"semver": "^7.3.7"
|
||||
},
|
||||
"dependencies": {
|
||||
"@unicode/unicode-15.1.0": "^1.6.5"
|
||||
}
|
||||
}
|
||||
138
misctools/unicode-generator.ts
Normal file
138
misctools/unicode-generator.ts
Normal file
@@ -0,0 +1,138 @@
|
||||
import crypto from "crypto";
|
||||
|
||||
// Types to mirror Zig's structures
|
||||
/** Callbacks the generator uses to classify code points. */
interface Context<Elem> {
  /** Returns the element associated with `codepoint`; may be synchronous or async. */
  get(codepoint: number): Promise<Elem> | Elem;
  /** Returns true when two elements are equal and may share one stage3 slot. */
  eql(a: Elem, b: Elem): boolean;
}

/** Result of `Generator.generate()`. */
interface Tables<Elem> {
  /** One entry per 256-code-point block: the offset of that block in `stage2`. */
  stage1: number[];
  /** Concatenated unique blocks; each entry is an index into `stage3`. */
  stage2: number[];
  /** The distinct elements, in order of first appearance. */
  stage3: Elem[];
}

/**
 * Builds three-stage lookup tables over the whole Unicode code point range
 * (0 ..= 0x10FFFF), de-duplicating identical 256-entry blocks.
 */
class Generator<Elem> {
  private static readonly BLOCK_SIZE = 256;
  private readonly ctx: Context<Elem>;
  // Maps a block's content hash to the offset where it was stored in stage2.
  private readonly blockMap = new Map<string, number>();

  constructor(ctx: Context<Elem>) {
    this.ctx = ctx;
  }

  /** Hashes a block's contents (as 16-bit values) so equal blocks get equal keys. */
  private hashBlock(block: number[]): string {
    const hash = crypto.createHash("sha256");
    hash.update(Buffer.from(new Uint16Array(block).buffer));
    return hash.digest("hex");
  }

  /**
   * Walks every code point, asks the context for its element and builds the tables.
   *
   * @returns the generated `stage1` / `stage2` / `stage3` arrays
   * @throws Error when a stage3 index or a stage2 offset exceeds 0xffff
   */
  async generate(): Promise<Tables<Elem>> {
    // Offsets recorded by an earlier run refer to that run's stage2 array;
    // drop them so calling generate() again starts from a clean state.
    this.blockMap.clear();

    const stage1: number[] = [];
    const stage2: number[] = [];
    const stage3: Elem[] = [];

    let block = new Array(Generator.BLOCK_SIZE).fill(0);
    let blockLen = 0;

    // Maximum Unicode codepoint is 0x10FFFF
    for (let cp = 0; cp <= 0x10ffff; cp++) {
      // Get the mapping for this codepoint
      const elem = await this.ctx.get(cp);

      // Find or add the element in stage3
      let blockIdx = stage3.findIndex(item => this.ctx.eql(item, elem));
      if (blockIdx === -1) {
        blockIdx = stage3.length;
        stage3.push(elem);
      }

      if (blockIdx > 0xffff) {
        throw new Error("Block index too large");
      }

      // Add to current block
      block[blockLen] = blockIdx;
      blockLen++;

      // Check if we need to finalize this block
      if (blockLen < Generator.BLOCK_SIZE && cp !== 0x10ffff) {
        continue;
      }

      // Fill remaining block space with zeros if needed
      if (blockLen < Generator.BLOCK_SIZE) {
        block.fill(0, blockLen);
      }

      // Get or create stage2 index for this block
      const blockHash = this.hashBlock(block);
      let stage2Idx = this.blockMap.get(blockHash);

      if (stage2Idx === undefined) {
        stage2Idx = stage2.length;
        this.blockMap.set(blockHash, stage2Idx);
        stage2.push(...block.slice(0, blockLen));
      }

      if (stage2Idx > 0xffff) {
        throw new Error("Stage2 index too large");
      }

      // Add mapping to stage1
      stage1.push(stage2Idx);

      // Reset block
      block = new Array(Generator.BLOCK_SIZE).fill(0);
      blockLen = 0;
    }

    return { stage1, stage2, stage3 };
  }

  /**
   * Generates Zig code for the lookup tables.
   *
   * @param tableName    name of the generated Zig function that returns the table struct
   * @param tables       tables produced by `generate()`
   * @param elemToString renders one stage3 element as a Zig expression
   * @returns the Zig source text
   */
  static writeZig<Elem>(tableName: string, tables: Tables<Elem>, elemToString: (elem: Elem) => string): string {
    // stage2 entries are stage3 indices. generate() allows up to 0x10000
    // distinct elements, so only use u8 when every index fits in a byte.
    const stage2Type = tables.stage3.length > 0x100 ? "u16" : "u8";

    let output = `/// Auto-generated. Do not edit.\n`;
    output += `fn ${tableName}(comptime Elem: type) type {\n`;
    output += " return struct {\n";

    // Stage 1
    output += `pub const stage1: [${tables.stage1.length}]u16 = .{`;
    output += tables.stage1.join(",");
    output += "};\n\n";

    // Stage 2
    output += `pub const stage2: [${tables.stage2.length}]${stage2Type} = .{`;
    output += tables.stage2.join(",");
    output += "};\n\n";

    // Stage 3
    output += `pub const stage3: [${tables.stage3.length}]Elem = .{`;
    output += tables.stage3.map(elemToString).join(",");
    output += "};\n";

    output += " };\n}\n";
    return output;
  }
}
|
||||
|
||||
// Example usage:
|
||||
/**
 * Example usage: builds tables for a context that maps every code point to
 * the same placeholder category and prints the resulting Zig source.
 */
async function example() {
  // Example context that maps codepoints to their category
  const ctx: Context<string> = {
    get: async (_cp: number) => {
      // This would normally look up the actual Unicode category
      return "Lu";
    },
    eql: (a: string, b: string) => a === b,
  };

  const generator = new Generator(ctx);
  const tables = await generator.generate();

  // Generate Zig code. writeZig takes the table name first; leaving it out
  // shifts every argument by one and fails at runtime.
  const zigCode = Generator.writeZig("ExampleTables", tables, (elem: string) => `"${elem}"`);
  console.log(zigCode);
}
|
||||
|
||||
export { Generator, type Context, type Tables };
|
||||
@@ -3043,18 +3043,10 @@ pub const Lexer = NewLexer(.{});
|
||||
|
||||
const JSIdentifier = @import("./js_lexer/identifier.zig");
|
||||
pub inline fn isIdentifierStart(codepoint: i32) bool {
|
||||
if (comptime Environment.isWasm) {
|
||||
return JSIdentifier.JumpTable.isIdentifierStart(codepoint);
|
||||
}
|
||||
|
||||
return JSIdentifier.Bitset.isIdentifierStart(codepoint);
|
||||
return JSIdentifier.isIdentifierStart(codepoint);
|
||||
}
|
||||
pub inline fn isIdentifierContinue(codepoint: i32) bool {
|
||||
if (comptime Environment.isWasm) {
|
||||
return JSIdentifier.JumpTable.isIdentifierPart(codepoint);
|
||||
}
|
||||
|
||||
return JSIdentifier.Bitset.isIdentifierPart(codepoint);
|
||||
return JSIdentifier.isIdentifierPart(codepoint);
|
||||
}
|
||||
|
||||
pub fn isWhitespace(codepoint: CodePoint) bool {
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -1,22 +0,0 @@
|
||||
const std = @import("std");
|
||||
const bun = @import("root").bun;
|
||||
const identifier_data = @import("./identifier_data.zig");
|
||||
|
||||
pub const CachedBitset = extern struct {
|
||||
range: [2]i32,
|
||||
len: u32,
|
||||
|
||||
pub fn fromFile(comptime filename: anytype) CachedBitset {
|
||||
return comptime @as(CachedBitset, @bitCast(bun.asByteSlice(@embedFile(filename)).ptr[0..@sizeOf(CachedBitset)].*));
|
||||
}
|
||||
};
|
||||
|
||||
pub fn setMasks(masks: [*:0]const u8, comptime MaskType: type, masky: MaskType) void {
|
||||
const FieldInfo: std.builtin.Type.StructField = std.meta.fieldInfo(MaskType, "masks");
|
||||
masky.masks = @as(masks, @bitCast(FieldInfo.type));
|
||||
}
|
||||
|
||||
pub const id_start_meta = identifier_data.id_start_cached;
|
||||
pub const id_continue_meta = identifier_data.id_continue_cached;
|
||||
pub const id_start = identifier_data.id_start;
|
||||
pub const id_continue = identifier_data.id_continue;
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user