mirror of
https://github.com/oven-sh/bun
synced 2026-02-02 15:08:46 +00:00
feat(unicode): migrate grapheme breaking to uucode with GB9c support (#26376)
## Summary Replace Bun's outdated grapheme breaking implementation with [Ghostty's approach](https://github.com/ghostty-org/ghostty/tree/main/src/unicode) using the [uucode](https://github.com/jacobsandlund/uucode) library. This adds proper **GB9c (Indic Conjunct Break)** support — Devanagari and other Indic script conjuncts now correctly form single grapheme clusters. ## Motivation The previous implementation used a `GraphemeBoundaryClass` enum with only 12 values and a 2-bit `BreakState` (just `extended_pictographic` and `regional_indicator` flags). It had no support for Unicode's GB9c rule, meaning Indic conjunct sequences (consonant + virama + consonant) were incorrectly split into multiple grapheme clusters. ## Architecture ### Runtime (zero uucode dependency, two table lookups) ``` codepoint → [3-level LUT] → GraphemeBreakNoControl enum (u5, 17 values) (state, gb1, gb2) → [8KB precomputed array] → (break_result, new_state) ``` The full grapheme break algorithm (GB6-GB13, GB9c, GB11, GB999) runs only at **comptime** to populate the 8KB decision array. At runtime it's pure table lookups. ### File Layout ``` src/deps/uucode/ ← Vendored library (MIT, build-time only) src/unicode/uucode/ ← Build-time integration ├── uucode_config.zig ← What Unicode properties to generate ├── grapheme_gen.zig ← Generator: queries uucode → writes tables ├── lut.zig ← 3-level lookup table generator └── CLAUDE.md ← Maintenance docs src/string/immutable/ ← Runtime (no uucode dependency) ├── grapheme.zig ← Grapheme break API + comptime decisions ├── grapheme_tables.zig ← Pre-generated tables (committed, ~91KB source) └── visible.zig ← Width calculation (2 lines changed) scripts/update-uucode.sh ← Update vendored uucode + regenerate ``` ### Key Types | Type | Size | Values | |------|------|--------| | `GraphemeBreakNoControl` | u5 | 17 (adds `indic_conjunct_break_{consonant,linker,extend}`, `emoji_modifier_base`, `zwnj`, etc.) | | `BreakState` | u3 | 5 (`default`, `regional_indicator`, `extended_pictographic`, `indic_conjunct_break_consonant`, `indic_conjunct_break_linker`) | ### Binary Size The tables store only the `GraphemeBreakNoControl` enum per codepoint (not width or emoji properties, which visible.zig handles separately): - stage1: 8192 × u16 = **16KB** (maps high byte → stage2 offset) - stage2: 27392 × u8 = **27KB** (maps to stage3 index; max value is 16) - stage3: 17 × u5 = **~17 bytes** (one per enum value) - Precomputed decisions: **8KB** - **Total: ~51KB** (vs previous ~70KB+) ## How to Regenerate Tables ```bash # After updating src/deps/uucode/: ./scripts/update-uucode.sh # Or manually: vendor/zig/zig build generate-grapheme-tables ``` Normal builds never run the generator — they use the committed `grapheme_tables.zig`. ## Testing ```bash bun bd test test/js/bun/util/stringWidth.test.ts ``` New test cases verify Devanagari conjuncts (GB9c): - `क्ष` (Ka+Virama+Ssa) → single cluster, width 2 - `क्ष` (Ka+Virama+ZWJ+Ssa) → single cluster, width 2 - `क्क्क` (Ka+Virama+Ka+Virama+Ka) → single cluster, width 3 --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
140
build.zig
140
build.zig
@@ -459,6 +459,146 @@ pub fn build(b: *Build) !void {
|
||||
// const run = b.addRunArtifact(exe);
|
||||
// step.dependOn(&run.step);
|
||||
}
|
||||
|
||||
// zig build generate-grapheme-tables
|
||||
// Regenerates src/string/immutable/grapheme_tables.zig from the vendored uucode.
|
||||
// Run this when updating src/deps/uucode. Normal builds use the committed file.
|
||||
{
|
||||
const step = b.step("generate-grapheme-tables", "Regenerate grapheme property tables from vendored uucode");
|
||||
|
||||
// --- Phase 1: Build uucode tables (separate module graph, no tables dependency) ---
|
||||
const bt_config_mod = b.createModule(.{
|
||||
.root_source_file = b.path("src/deps/uucode/src/config.zig"),
|
||||
.target = b.graph.host,
|
||||
});
|
||||
const bt_types_mod = b.createModule(.{
|
||||
.root_source_file = b.path("src/deps/uucode/src/types.zig"),
|
||||
.target = b.graph.host,
|
||||
});
|
||||
bt_types_mod.addImport("config.zig", bt_config_mod);
|
||||
bt_config_mod.addImport("types.zig", bt_types_mod);
|
||||
|
||||
const bt_config_x_mod = b.createModule(.{
|
||||
.root_source_file = b.path("src/deps/uucode/src/x/config.x.zig"),
|
||||
.target = b.graph.host,
|
||||
});
|
||||
const bt_types_x_mod = b.createModule(.{
|
||||
.root_source_file = b.path("src/deps/uucode/src/x/types.x.zig"),
|
||||
.target = b.graph.host,
|
||||
});
|
||||
bt_types_x_mod.addImport("config.x.zig", bt_config_x_mod);
|
||||
bt_config_x_mod.addImport("types.x.zig", bt_types_x_mod);
|
||||
bt_config_x_mod.addImport("types.zig", bt_types_mod);
|
||||
bt_config_x_mod.addImport("config.zig", bt_config_mod);
|
||||
|
||||
const bt_build_config_mod = b.createModule(.{
|
||||
.root_source_file = b.path("src/unicode/uucode/uucode_config.zig"),
|
||||
.target = b.graph.host,
|
||||
});
|
||||
bt_build_config_mod.addImport("types.zig", bt_types_mod);
|
||||
bt_build_config_mod.addImport("config.zig", bt_config_mod);
|
||||
bt_build_config_mod.addImport("types.x.zig", bt_types_x_mod);
|
||||
bt_build_config_mod.addImport("config.x.zig", bt_config_x_mod);
|
||||
|
||||
const build_tables_mod = b.createModule(.{
|
||||
.root_source_file = b.path("src/deps/uucode/src/build/tables.zig"),
|
||||
.target = b.graph.host,
|
||||
.optimize = .Debug,
|
||||
});
|
||||
build_tables_mod.addImport("config.zig", bt_config_mod);
|
||||
build_tables_mod.addImport("build_config", bt_build_config_mod);
|
||||
build_tables_mod.addImport("types.zig", bt_types_mod);
|
||||
|
||||
const build_tables_exe = b.addExecutable(.{
|
||||
.name = "uucode_build_tables",
|
||||
.root_module = build_tables_mod,
|
||||
.use_llvm = true,
|
||||
});
|
||||
const run_build_tables = b.addRunArtifact(build_tables_exe);
|
||||
run_build_tables.setCwd(b.path("src/deps/uucode"));
|
||||
const tables_path = run_build_tables.addOutputFileArg("tables.zig");
|
||||
|
||||
// --- Phase 2: Build grapheme-gen with full uucode (separate module graph) ---
|
||||
const rt_config_mod = b.createModule(.{
|
||||
.root_source_file = b.path("src/deps/uucode/src/config.zig"),
|
||||
.target = b.graph.host,
|
||||
});
|
||||
const rt_types_mod = b.createModule(.{
|
||||
.root_source_file = b.path("src/deps/uucode/src/types.zig"),
|
||||
.target = b.graph.host,
|
||||
});
|
||||
rt_types_mod.addImport("config.zig", rt_config_mod);
|
||||
rt_config_mod.addImport("types.zig", rt_types_mod);
|
||||
|
||||
const rt_config_x_mod = b.createModule(.{
|
||||
.root_source_file = b.path("src/deps/uucode/src/x/config.x.zig"),
|
||||
.target = b.graph.host,
|
||||
});
|
||||
const rt_types_x_mod = b.createModule(.{
|
||||
.root_source_file = b.path("src/deps/uucode/src/x/types.x.zig"),
|
||||
.target = b.graph.host,
|
||||
});
|
||||
rt_types_x_mod.addImport("config.x.zig", rt_config_x_mod);
|
||||
rt_config_x_mod.addImport("types.x.zig", rt_types_x_mod);
|
||||
rt_config_x_mod.addImport("types.zig", rt_types_mod);
|
||||
rt_config_x_mod.addImport("config.zig", rt_config_mod);
|
||||
|
||||
const rt_build_config_mod = b.createModule(.{
|
||||
.root_source_file = b.path("src/unicode/uucode/uucode_config.zig"),
|
||||
.target = b.graph.host,
|
||||
});
|
||||
rt_build_config_mod.addImport("types.zig", rt_types_mod);
|
||||
rt_build_config_mod.addImport("config.zig", rt_config_mod);
|
||||
rt_build_config_mod.addImport("types.x.zig", rt_types_x_mod);
|
||||
rt_build_config_mod.addImport("config.x.zig", rt_config_x_mod);
|
||||
|
||||
const rt_tables_mod = b.createModule(.{
|
||||
.root_source_file = tables_path,
|
||||
.target = b.graph.host,
|
||||
});
|
||||
rt_tables_mod.addImport("types.zig", rt_types_mod);
|
||||
rt_tables_mod.addImport("types.x.zig", rt_types_x_mod);
|
||||
rt_tables_mod.addImport("config.zig", rt_config_mod);
|
||||
rt_tables_mod.addImport("build_config", rt_build_config_mod);
|
||||
|
||||
const rt_get_mod = b.createModule(.{
|
||||
.root_source_file = b.path("src/deps/uucode/src/get.zig"),
|
||||
.target = b.graph.host,
|
||||
});
|
||||
rt_get_mod.addImport("types.zig", rt_types_mod);
|
||||
rt_get_mod.addImport("tables", rt_tables_mod);
|
||||
rt_types_mod.addImport("get.zig", rt_get_mod);
|
||||
|
||||
const uucode_mod = b.createModule(.{
|
||||
.root_source_file = b.path("src/deps/uucode/src/root.zig"),
|
||||
.target = b.graph.host,
|
||||
});
|
||||
uucode_mod.addImport("types.zig", rt_types_mod);
|
||||
uucode_mod.addImport("config.zig", rt_config_mod);
|
||||
uucode_mod.addImport("types.x.zig", rt_types_x_mod);
|
||||
uucode_mod.addImport("tables", rt_tables_mod);
|
||||
uucode_mod.addImport("get.zig", rt_get_mod);
|
||||
|
||||
// grapheme_gen executable
|
||||
const gen_exe = b.addExecutable(.{
|
||||
.name = "grapheme-gen",
|
||||
.root_module = b.createModule(.{
|
||||
.root_source_file = b.path("src/unicode/uucode/grapheme_gen.zig"),
|
||||
.target = b.graph.host,
|
||||
.optimize = .Debug,
|
||||
.imports = &.{
|
||||
.{ .name = "uucode", .module = uucode_mod },
|
||||
},
|
||||
}),
|
||||
.use_llvm = true,
|
||||
});
|
||||
|
||||
const run_gen = b.addRunArtifact(gen_exe);
|
||||
const gen_output = run_gen.captureStdOut();
|
||||
|
||||
const install = b.addInstallFile(gen_output, "../src/string/immutable/grapheme_tables.zig");
|
||||
step.dependOn(&install.step);
|
||||
}
|
||||
}
|
||||
|
||||
const TargetDescription = struct {
|
||||
|
||||
Reference in New Issue
Block a user