feat(unicode): migrate grapheme breaking to uucode with GB9c support (#26376)

## Summary

Replace Bun's outdated grapheme breaking implementation with [Ghostty's
approach](https://github.com/ghostty-org/ghostty/tree/main/src/unicode)
using the [uucode](https://github.com/jacobsandlund/uucode) library.
This adds proper **GB9c (Indic Conjunct Break)** support — Devanagari
and other Indic script conjuncts now correctly form single grapheme
clusters.

## Motivation

The previous implementation used a `GraphemeBoundaryClass` enum with
only 12 values and a 2-bit `BreakState` (just `extended_pictographic`
and `regional_indicator` flags). It had no support for Unicode's GB9c
rule, meaning Indic conjunct sequences (consonant + virama + consonant)
were incorrectly split into multiple grapheme clusters.

## Architecture

### Runtime (zero uucode dependency, two table lookups)

```
codepoint → [3-level LUT] → GraphemeBreakNoControl enum (u5, 17 values)
(state, gb1, gb2) → [8KB precomputed array] → (break_result, new_state)
```

The full grapheme break algorithm (GB6-GB13, GB9c, GB11, GB999) runs
only at **comptime** to populate the 8KB decision array. At runtime it's
pure table lookups.

### File Layout

```
src/deps/uucode/              ← Vendored library (MIT, build-time only)
src/unicode/uucode/           ← Build-time integration
  ├── uucode_config.zig       ← What Unicode properties to generate
  ├── grapheme_gen.zig        ← Generator: queries uucode → writes tables
  ├── lut.zig                 ← 3-level lookup table generator
  └── CLAUDE.md               ← Maintenance docs
src/string/immutable/         ← Runtime (no uucode dependency)
  ├── grapheme.zig            ← Grapheme break API + comptime decisions
  ├── grapheme_tables.zig     ← Pre-generated tables (committed, ~91KB source)
  └── visible.zig             ← Width calculation (2 lines changed)
scripts/update-uucode.sh      ← Update vendored uucode + regenerate
```

### Key Types

| Type | Size | Values |
|------|------|--------|
| `GraphemeBreakNoControl` | u5 | 17 (adds `indic_conjunct_break_{consonant,linker,extend}`, `emoji_modifier_base`, `zwnj`, etc.) |
| `BreakState` | u3 | 5 (`default`, `regional_indicator`, `extended_pictographic`, `indic_conjunct_break_consonant`, `indic_conjunct_break_linker`) |

### Binary Size

The tables store only the `GraphemeBreakNoControl` enum per codepoint
(not width or emoji properties, which visible.zig handles separately):

- stage1: 8192 × u16 = **16KB** (maps the high bits of the codepoint → stage2 offset)
- stage2: 27392 × u8 = **27KB** (maps to stage3 index; max value is 16)
- stage3: 17 × u5 = **~17 bytes** (one per enum value)
- Precomputed decisions: **8KB**
- **Total: ~51KB** (vs previous ~70KB+)

## How to Regenerate Tables

```bash
# After updating src/deps/uucode/:
./scripts/update-uucode.sh

# Or manually:
vendor/zig/zig build generate-grapheme-tables
```

Normal builds never run the generator — they use the committed
`grapheme_tables.zig`.

## Testing

```bash
bun bd test test/js/bun/util/stringWidth.test.ts
```

New test cases verify Devanagari conjuncts (GB9c):
- `क्ष` (Ka+Virama+Ssa) → single cluster, width 2
- `क्‍ष` (Ka+Virama+ZWJ+Ssa) → single cluster, width 2
- `क्क्क` (Ka+Virama+Ka+Virama+Ka) → single cluster, width 3

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
Jarred Sumner
2026-01-23 00:07:06 -08:00
committed by GitHub
parent 3b1c3bfe97
commit 86d4d87beb
45 changed files with 114381 additions and 1679 deletions

140
build.zig
View File

@@ -459,6 +459,146 @@ pub fn build(b: *Build) !void {
// const run = b.addRunArtifact(exe);
// step.dependOn(&run.step);
}
// zig build generate-grapheme-tables
// Regenerates src/string/immutable/grapheme_tables.zig from the vendored uucode.
// Run this when updating src/deps/uucode. Normal builds use the committed file.
//
// Two host executables are chained:
//   1. uucode_build_tables writes uucode's tables.zig to a build-system-managed
//      output path (passed to it as an output-file argument).
//   2. grapheme-gen imports uucode backed by that tables.zig; its stdout is
//      captured and copied over the committed grapheme_tables.zig.
{
    const step = b.step("generate-grapheme-tables", "Regenerate grapheme property tables from vendored uucode");

    // The config/types module set that both phases need. Each phase creates
    // its own instance: phase 2 adds a "get.zig" import (which in turn imports
    // "tables") to its types module, and the table builder from phase 1 must
    // not depend on the tables it is about to generate.
    const UucodeCoreModules = struct {
        config: *Build.Module,
        types: *Build.Module,
        config_x: *Build.Module,
        types_x: *Build.Module,
        build_config: *Build.Module,

        /// Creates a module rooted at `root_path` (relative to the build root)
        /// that targets the host machine.
        fn hostModule(owner: *Build, root_path: []const u8) *Build.Module {
            return owner.createModule(.{
                .root_source_file = owner.path(root_path),
                .target = owner.graph.host,
            });
        }

        /// Creates a fresh, fully cross-wired set of the five core modules.
        fn create(owner: *Build) @This() {
            const mods: @This() = .{
                .config = hostModule(owner, "src/deps/uucode/src/config.zig"),
                .types = hostModule(owner, "src/deps/uucode/src/types.zig"),
                .config_x = hostModule(owner, "src/deps/uucode/src/x/config.x.zig"),
                .types_x = hostModule(owner, "src/deps/uucode/src/x/types.x.zig"),
                .build_config = hostModule(owner, "src/unicode/uucode/uucode_config.zig"),
            };

            // config.zig and types.zig import each other.
            mods.types.addImport("config.zig", mods.config);
            mods.config.addImport("types.zig", mods.types);

            // The ".x" extension modules import each other and the base pair.
            mods.types_x.addImport("config.x.zig", mods.config_x);
            mods.config_x.addImport("types.x.zig", mods.types_x);
            mods.config_x.addImport("types.zig", mods.types);
            mods.config_x.addImport("config.zig", mods.config);

            // Bun's uucode_config.zig sees all four of the above.
            mods.build_config.addImport("types.zig", mods.types);
            mods.build_config.addImport("config.zig", mods.config);
            mods.build_config.addImport("types.x.zig", mods.types_x);
            mods.build_config.addImport("config.x.zig", mods.config_x);

            return mods;
        }
    };

    // --- Phase 1: Build uucode tables (separate module graph, no tables dependency) ---
    const bt_mods = UucodeCoreModules.create(b);

    const build_tables_mod = b.createModule(.{
        .root_source_file = b.path("src/deps/uucode/src/build/tables.zig"),
        .target = b.graph.host,
        .optimize = .Debug,
    });
    build_tables_mod.addImport("config.zig", bt_mods.config);
    build_tables_mod.addImport("build_config", bt_mods.build_config);
    build_tables_mod.addImport("types.zig", bt_mods.types);

    const build_tables_exe = b.addExecutable(.{
        .name = "uucode_build_tables",
        .root_module = build_tables_mod,
        .use_llvm = true,
    });
    const run_build_tables = b.addRunArtifact(build_tables_exe);
    // NOTE(review): runs inside the vendored uucode directory, presumably so the
    // table builder can locate its Unicode data files by relative path — confirm.
    run_build_tables.setCwd(b.path("src/deps/uucode"));
    const tables_path = run_build_tables.addOutputFileArg("tables.zig");

    // --- Phase 2: Build grapheme-gen with full uucode (separate module graph) ---
    const rt_mods = UucodeCoreModules.create(b);

    // The tables.zig produced by phase 1 becomes the "tables" module.
    const rt_tables_mod = b.createModule(.{
        .root_source_file = tables_path,
        .target = b.graph.host,
    });
    rt_tables_mod.addImport("types.zig", rt_mods.types);
    rt_tables_mod.addImport("types.x.zig", rt_mods.types_x);
    rt_tables_mod.addImport("config.zig", rt_mods.config);
    rt_tables_mod.addImport("build_config", rt_mods.build_config);

    const rt_get_mod = b.createModule(.{
        .root_source_file = b.path("src/deps/uucode/src/get.zig"),
        .target = b.graph.host,
    });
    rt_get_mod.addImport("types.zig", rt_mods.types);
    rt_get_mod.addImport("tables", rt_tables_mod);
    // Only the phase 2 types module gets this import; it is what ties the
    // phase 2 graph to the generated tables.
    rt_mods.types.addImport("get.zig", rt_get_mod);

    const uucode_mod = b.createModule(.{
        .root_source_file = b.path("src/deps/uucode/src/root.zig"),
        .target = b.graph.host,
    });
    uucode_mod.addImport("types.zig", rt_mods.types);
    uucode_mod.addImport("config.zig", rt_mods.config);
    uucode_mod.addImport("types.x.zig", rt_mods.types_x);
    uucode_mod.addImport("tables", rt_tables_mod);
    uucode_mod.addImport("get.zig", rt_get_mod);

    // grapheme_gen executable
    const gen_exe = b.addExecutable(.{
        .name = "grapheme-gen",
        .root_module = b.createModule(.{
            .root_source_file = b.path("src/unicode/uucode/grapheme_gen.zig"),
            .target = b.graph.host,
            .optimize = .Debug,
            .imports = &.{
                .{ .name = "uucode", .module = uucode_mod },
            },
        }),
        .use_llvm = true,
    });
    const run_gen = b.addRunArtifact(gen_exe);
    const gen_output = run_gen.captureStdOut();

    // Copy the generated file into the source tree. The destination is relative
    // to the build root, so it does not depend on the install prefix (an
    // install-file step with a "../src/..." path only reaches the source tree
    // when the default prefix is in use).
    const update_tables = b.addUpdateSourceFiles();
    update_tables.addCopyFileToSource(gen_output, "src/string/immutable/grapheme_tables.zig");
    step.dependOn(&update_tables.step);
}
}
const TargetDescription = struct {