Files
bun.sh/misctools/gen-unicode-table.js
Colin McDonnell f7f1b60444 Add bun-types, add typechecking, add child_process types (#1475)
* Add bun-types to packages

* Improve typing

* Fix types in tests

* Fix dts tests

* Run formatter

* Fix all type errors

* Add strict mode, fix type errors

* Add ffi changes

* Move workflows to root

* Add workflows

* Remove labeler

* Add child_process types

* Fix synthetic defaults issue

* Remove docs

* Move scripts

* Run prettier

* Include examples in typechecking

* captureStackTrace types

* moved captureStackTrace types to globals

* Address reviews

Co-authored-by: Colin McDonnell <colinmcd@alum.mit.edu>
Co-authored-by: Dylan Conway <dylan.conway567@gmail.com>
2022-11-09 15:40:40 -08:00

195 lines
6.7 KiB
JavaScript

// Thank you @evanw for this code!!!
const fs = require("fs");
const path = require("path");
// ES5 reference: https://es5.github.io/
//
// A conforming implementation of this International standard shall interpret
// characters in conformance with the Unicode Standard, Version 3.0 or later
// and ISO/IEC 10646-1 with either UCS-2 or UTF-16 as the adopted encoding
// form, implementation level 3. If the adopted ISO/IEC 10646-1 subset is not
// otherwise specified, it is presumed to be the BMP subset, collection 300.
//
// UnicodeLetter: any character in the Unicode categories “Uppercase letter (Lu)”,
// “Lowercase letter (Ll)”, “Titlecase letter (Lt)”, “Modifier letter (Lm)”,
// “Other letter (Lo)”, or “Letter number (Nl)”.
const idStartES5 = []
.concat(
require("@unicode/unicode-3.0.0/General_Category/Uppercase_Letter/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Lowercase_Letter/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Titlecase_Letter/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Modifier_Letter/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Other_Letter/code-points"),
// The "letter number" category is not included because old versions of Safari
// had a bug where they didn't include it. This means it does not match ES5.
// We need to make sure we escape these characters so Safari can read them.
// See https://github.com/evanw/esbuild/issues/1349 for more information.
// require('@unicode/unicode-3.0.0/General_Category/Letter_Number/code-points'),
)
.sort((a, b) => a - b);
// UnicodeCombiningMark: any character in the Unicode categories “Non-spacing mark (Mn)”
// or “Combining spacing mark (Mc)”
// UnicodeDigit: any character in the Unicode category “Decimal number (Nd)”
// UnicodeConnectorPunctuation: any character in the Unicode category “Connector punctuation (Pc)”
const idContinueES5 = idStartES5
.concat(
require("@unicode/unicode-3.0.0/General_Category/Nonspacing_Mark/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Spacing_Mark/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Decimal_Number/code-points"),
require("@unicode/unicode-3.0.0/General_Category/Connector_Punctuation/code-points"),
)
.sort((a, b) => a - b);
// ESNext reference: https://tc39.es/ecma262/
//
// A conforming implementation of ECMAScript must interpret source text input
// in conformance with the Unicode Standard, Version 5.1.0 or later and ISO/IEC
// 10646. If the adopted ISO/IEC 10646-1 subset is not otherwise specified, it
// is presumed to be the Unicode set, collection 10646.
//
// UnicodeIDStart: any Unicode code point with the Unicode property “ID_Start”
const idStartESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Start/code-points");
const idStartESNextSet = new Set(idStartESNext);
// UnicodeIDContinue: any Unicode code point with the Unicode property “ID_Continue”
const idContinueESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Continue/code-points");
const idContinueESNextSet = new Set(idContinueESNext);
// These identifiers are valid in both ES5 and ES6+ (i.e. an intersection of both)
const idStartES5AndESNext = idStartES5.filter((n) => idStartESNextSet.has(n));
const idContinueES5AndESNext = idContinueES5.filter((n) =>
idContinueESNextSet.has(n),
);
// These identifiers are valid in either ES5 or ES6+ (i.e. a union of both)
const idStartES5OrESNext = [...new Set(idStartES5.concat(idStartESNext))].sort(
(a, b) => a - b,
);
const idContinueES5OrESNext = [
...new Set(idContinueES5.concat(idContinueESNext)),
].sort((a, b) => a - b);
function generateRangeTable(codePoints) {
let lines = [];
let index = 0;
let latinOffset = 0;
while (latinOffset < codePoints.length && codePoints[latinOffset] <= 0xff) {
latinOffset++;
}
lines.push(`RangeTable.init(`, ` ${latinOffset},`, ` &[_]R16Range{`);
// 16-bit code points
while (index < codePoints.length && codePoints[index] < 0x1000) {
let start = codePoints[index];
index++;
while (
index < codePoints.length &&
codePoints[index] < 0x1000 &&
codePoints[index] === codePoints[index - 1] + 1
) {
index++;
}
let end = codePoints[index - 1];
lines.push(` .{0x${start.toString(16)}, 0x${end.toString(16)}},`);
}
lines.push(` },`, `&[_]R32Range{`);
// 32-bit code points
while (index < codePoints.length) {
let start = codePoints[index];
index++;
while (
index < codePoints.length &&
codePoints[index] === codePoints[index - 1] + 1
) {
index++;
}
let end = codePoints[index - 1];
lines.push(` .{0x${start.toString(16)}, 0x${end.toString(16)}},`);
}
lines.push(` },`, `);`);
return lines.join("\n");
}
function generateBigSwitchStatement(codePoints) {
let lines = [];
let index = 0;
let latinOffset = 0;
while (latinOffset < codePoints.length && codePoints[latinOffset] <= 0xff) {
latinOffset++;
}
lines.push(`return switch(codepoint) {`);
// 16-bit code points
while (index < codePoints.length && codePoints[index] < 0x1000) {
let start = codePoints[index];
index++;
while (
index < codePoints.length &&
codePoints[index] < 0x1000 &&
codePoints[index] === codePoints[index - 1] + 1
) {
index++;
}
let end = codePoints[index - 1];
lines.push(`0x${start.toString(16)}...0x${end.toString(16)},`);
}
// 32-bit code points
while (index < codePoints.length) {
let start = codePoints[index];
index++;
while (
index < codePoints.length &&
codePoints[index] === codePoints[index - 1] + 1
) {
index++;
}
let end = codePoints[index - 1];
lines.push(` 0x${start.toString(16)}...0x${end.toString(16)},`);
}
lines.push(` => true,
else => false
};`);
return lines.join("\n");
}
fs.writeFileSync(
path.join(__dirname, "..", "src", "js_lexer", "unicode.zig"),
`// This file was automatically generated by ${path.basename(
__filename,
)}. Do not edit.
const RangeTable = @import("./range_table.zig");
// ES5 || ESNext
pub const id_start = ${generateRangeTable(idStartES5OrESNext)}
// ES5 || ESNext
pub const id_continue = ${generateRangeTable(idContinueES5OrESNext)}
pub const printable_id_start = ${generateRangeTable(idStartESNext)}
pub const printable_id_continue = ${generateRangeTable(idContinueESNext)}
pub fn isIdentifierStart(comptime Codepoint: type, codepoint: Codepoint) bool{
${generateBigSwitchStatement(idStartES5OrESNext)}
}
pub fn isIdentifierContinue(comptime Codepoint: type, codepoint: Codepoint) bool{
${generateBigSwitchStatement(idContinueES5OrESNext)}
}
`,
);