#!/usr/bin/env bun /** * Compact String Table Generator * * Generates a Zig enum that stores multiple strings in a contiguous buffer with * reduced per-string overhead compared to individual string slices. * * Instead of storing each string as a separate slice (16 bytes each), this packs * the string metadata into enum values using bit fields. The actual string data * is stored in a single static array. * * ## How it works: * * 1. Groups strings by length for uniform spacing within each group * 2. Stores position and length group in bit-packed enum values * 3. Uses non-power-of-2 integers (u9, u12, etc.) to minimize enum size * * ## Usage: * * ```bash * # Input: newline-delimited strings * echo -e "application/json\\ntext/html\\ntext/plain" > strings.txt * * # Generate Zig code * bun src/codegen/generate-compact-string-table.ts strings.txt output.zig MyStrings * ``` * * ## Trade-offs: * * - Reduces memory overhead from 16 bytes to 1-2 bytes per string * - O(1) string access through length-based grouping * - Uniform spacing within groups may include some padding * - Requires build-time code generation * */ import { writeFileSync } from "fs"; const args = process.argv.slice(2); if (args.length < 3) { console.error("Usage: generate-compact-string-table.ts [namespace]"); console.error("Provide strings via stdin, one per line"); process.exit(1); } const [inputPath, outputPath, enumName, namespace] = args; interface StringEntry { name: string; value: string; offset: number; length: number; } interface PackedString { value: string; offset: number; entries: StringEntry[]; } function escapeZigIdentifier(name: string): string { // Always use @"..." syntax for consistency return `@"${name.replace(/\\/g, "\\\\").replace(/"/g, '\\"')}"`; } function escapeZigString(str: string): string { return str .replace(/\\/g, "\\\\") .replace(/"/g, '\\"') .replace(/\n/g, "\\n") .replace(/\r/g, "\\r") .replace(/\t/g, "\\t") .replace(/[\x00-\x1F\x7F-\x9F]/g, char => { const code = char.charCodeAt(0); return `\\x${code.toString(16).padStart(2, "0")}`; }); } function findSmallestIntType(maxValue: number): string { if (maxValue <= 0xff) return "u8"; if (maxValue <= 0xffff) return "u16"; if (maxValue <= 0xffffffff) return "u32"; return "u64"; } interface LengthGroup { length: number; strings: { name: string; value: string }[]; startOffset: number; } function optimizeStringPacking(strings: { name: string; value: string }[]): { entries: StringEntry[]; packedData: string; lengthGroups: LengthGroup[]; } { // First, identify unique string values and do substring deduplication const uniqueValues = new Map(); // value -> [names] for (const { name, value } of strings) { if (!uniqueValues.has(value)) { uniqueValues.set(value, []); } uniqueValues.get(value)!.push(name); } // Sort unique values by length descending for substring detection const sortedValues = Array.from(uniqueValues.keys()).sort((a, b) => b.length - a.length); // Find which strings can be substrings of others const substringMap = new Map(); const rootStrings: string[] = []; for (const value of sortedValues) { let foundAsSubstring = false; // Check if this string is a substring of any root string for (const root of rootStrings) { const index = root.indexOf(value); if (index !== -1) { substringMap.set(value, { parent: root, offset: index }); foundAsSubstring = true; break; } } if (!foundAsSubstring) { // This is a new root string, check if it contains any existing roots const newRoot = value; const toRemove: number[] = []; for (let i = 0; i < rootStrings.length; i++) { const existingRoot = rootStrings[i]; const index = newRoot.indexOf(existingRoot); if (index !== -1) { // Update all strings that were substrings of the old root for (const [substr, info] of substringMap.entries()) { if (info.parent === existingRoot) { substringMap.set(substr, { parent: newRoot, offset: index + info.offset, }); } } // The existing root is now a substring of the new root substringMap.set(existingRoot, { parent: newRoot, offset: index }); toRemove.push(i); } } // Remove absorbed roots for (let i = toRemove.length - 1; i >= 0; i--) { rootStrings.splice(toRemove[i], 1); } rootStrings.push(newRoot); } } // Now reorganize by length groups for uniform spacing const lengthMap = new Map(); for (const [value, names] of uniqueValues.entries()) { const length = value.length; if (!lengthMap.has(length)) { lengthMap.set(length, []); } lengthMap.get(length)!.push({ value, names }); } // Sort length groups by frequency const lengthGroups: LengthGroup[] = Array.from(lengthMap.entries()) .sort((a, b) => { // Group with more total strings comes first (smaller index = fewer bits) const aCount = a[1].reduce((sum, v) => sum + v.names.length, 0); const bCount = b[1].reduce((sum, v) => sum + v.names.length, 0); if (aCount !== bCount) return bCount - aCount; return a[0] - b[0]; }) .map(([length, values]) => ({ length, strings: values.flatMap(v => v.names.map(name => ({ name, value: v.value }))), startOffset: 0, })); // Build packed data with uniform spacing within length groups let packedData = ""; const entries: StringEntry[] = []; for (const group of lengthGroups) { group.startOffset = packedData.length; // Get unique values in this length group const uniqueInGroup = new Map(); for (const { name, value } of group.strings) { if (!uniqueInGroup.has(value)) { uniqueInGroup.set(value, []); } uniqueInGroup.get(value)!.push(name); } // Sort values for consistent ordering const sortedValues = Array.from(uniqueInGroup.keys()).sort(); // Pack values uniformly for (let i = 0; i < sortedValues.length; i++) { const value = sortedValues[i]; const offset = group.startOffset + i * group.length; // Get the actual string data to pack let sourceData: string; if (substringMap.has(value)) { // This is a substring of another string const { parent, offset: parentOffset } = substringMap.get(value)!; // We need to ensure the parent data is available sourceData = value; // We'll still pack it directly for uniform spacing } else { sourceData = value; } packedData += sourceData; // Create entries for all names with this value for (const name of uniqueInGroup.get(value)!) { entries.push({ name, value, offset, length: group.length, }); } } } return { entries, packedData, lengthGroups }; } export function generateCompactStringTable( enumName: string, strings: { name: string; value: string }[], namespace?: string, ): string { if (strings.length === 0) { throw new Error("No strings provided"); } const { entries, packedData, lengthGroups } = optimizeStringPacking(strings); // Create lookup tables for length groups const lengthGroupMap = new Map(); const uniqueLengths: number[] = []; const groupStartOffsets: number[] = []; const groupCounts: number[] = []; lengthGroups.forEach((group, index) => { const uniqueInGroup = new Set(group.strings.map(s => s.value)).size; lengthGroupMap.set(group.length, { index, startOffset: group.startOffset, count: uniqueInGroup, }); uniqueLengths.push(group.length); groupStartOffsets.push(group.startOffset); groupCounts.push(uniqueInGroup); }); // For each entry, calculate its position within its length group const entryPositions = new Map(); for (const group of lengthGroups) { // Get unique values in this length group (must match packing logic exactly) const uniqueInGroup = new Map(); for (const { name, value } of group.strings) { if (!uniqueInGroup.has(value)) { uniqueInGroup.set(value, []); } uniqueInGroup.get(value)!.push(name); } // Sort values for consistent ordering (must match packing logic exactly) const sortedValues = Array.from(uniqueInGroup.keys()).sort(); // Assign positions sortedValues.forEach((value, position) => { for (const name of uniqueInGroup.get(value)!) { entryPositions.set(name, { groupIndex: lengthGroupMap.get(group.length)!.index, positionInGroup: position, }); } }); } // Calculate bits needed const lengthGroupBits = Math.ceil(Math.log2(lengthGroups.length || 1)); const maxPositionInGroup = Math.max(...groupCounts); const positionBits = Math.ceil(Math.log2(maxPositionInGroup || 1)); const actualPackedBits = lengthGroupBits + positionBits; // Use exact bit size for the enum const packedIntType = `u${actualPackedBits}`; // Sort entries by name for stable output entries.sort((a, b) => a.name.localeCompare(b.name)); let output = `//! Generated by generate-compact-string-table.ts //! Do not edit manually //! To regenerate, run: //! \`\`\` //! bun run src/codegen/generate-compact-string-table.ts ${inputPath} ${outputPath} ${enumName} //! \`\`\` `; if (namespace) { output += `\npub const ${namespace} = struct {\n`; } output += `pub const ${enumName} = enum(${packedIntType}) { const LengthGroupBits = ${lengthGroupBits}; const PositionBits = ${positionBits}; const PackedInt = ${packedIntType}; pub const Packed = packed struct (PackedInt) { length_group: u${lengthGroupBits}, position: u${positionBits}, }; const _bytes = "${escapeZigString(packedData)}"; const _lengths = [_]${findSmallestIntType(Math.max(...uniqueLengths))}{${uniqueLengths.join(", ")}}; const _group_start_offsets = [_]${findSmallestIntType(Math.max(...groupStartOffsets))}{${groupStartOffsets.join(", ")}}; `; // Generate enum fields // Sort entries by name for stable output entries.sort((a, b) => a.name.localeCompare(b.name)); for (const entry of entries) { const pos = entryPositions.get(entry.name)!; const packedValue = pos.groupIndex | (pos.positionInGroup << lengthGroupBits); output += ` ${escapeZigIdentifier(entry.name)} = ${packedValue},\n`; } output += ` pub fn slice(this: ${enumName}) []const u8 { const p: Packed = @bitCast(@as(PackedInt, @intFromEnum(this))); const length: usize = _lengths[p.length_group]; const offset = @as(usize, _group_start_offsets[p.length_group]) + @as(usize, p.position) * @as(usize, length); return _bytes[offset..][0..length]; } pub fn len(this: ${enumName}) usize { const p: Packed = @bitCast(@as(PackedInt, @intFromEnum(this))); return _lengths[p.length_group]; } pub fn ptr(this: ${enumName}) [*]const u8 { const p: Packed = @bitCast(@as(PackedInt, @intFromEnum(this))); const length: usize = _lengths[p.length_group]; const offset = @as(usize, _group_start_offsets[p.length_group]) + @as(usize, p.position) * @as(usize, length); return _bytes[offset..].ptr; } pub const count = ${entries.length}; pub const all = &[_]${enumName}{ ${entries.map(entry => ` .${escapeZigIdentifier(entry.name)},`).join("\n")} }; }; `; if (namespace) { output += `};\n`; } output += `\nconst std = @import("std");\n`; return output; } // CLI interface // Read strings from stdin const input = await Bun.file(inputPath).text(); const strings: { name: string; value: string }[] = []; for (const line of input.trim().split("\n")) { if (!line) continue; // Each line is just a string value, use it as both name and value strings.push({ name: line.trim(), value: line.trim() }); } if (strings.length === 0) { console.error("No valid strings provided"); process.exit(1); } try { const output = generateCompactStringTable(enumName, strings, namespace); writeFileSync(outputPath, output); // Print statistics const totalOriginalSize = strings.reduce((sum, s) => sum + s.value.length, 0); const packedSize = output.match(/const _bytes = "(.*?)"/s)?.[1]?.length ?? 0; const lengthsMatch = output.match(/const _lengths = \[_\][^{]+\{([^}]+)\}/); const lengthGroupsCount = lengthsMatch ? lengthsMatch[1].split(",").filter(s => s.trim()).length : 0; // Calculate actual memory usage const naiveMemory = strings.length * 16; // Each []const u8 is 16 bytes const actualBits = parseInt(output.match(/const LengthGroupBits = (\d+)/)?.[1] ?? "0") + parseInt(output.match(/const PositionBits = (\d+)/)?.[1] ?? "0"); const ourEnumSize = Math.ceil(actualBits / 8); const ourTotalMemory = strings.length * ourEnumSize; // Each enum value console.log(`Generated ${outputPath}`); console.log(` Strings: ${strings.length}`); console.log(` Length groups: ${lengthGroupsCount}`); console.log(` Packed bits: ${actualBits} (u${actualBits})`); console.log(` Packed data: ${packedSize} bytes`); console.log(` String deduplication: ${((1 - packedSize / totalOriginalSize) * 100).toFixed(1)}% saved`); console.log( ` Memory per value: ${ourEnumSize} bytes vs 16 bytes (${((1 - ourEnumSize / 16) * 100).toFixed(1)}% saved)`, ); console.log(` Total memory: ${ourTotalMemory} bytes vs ${naiveMemory} bytes`); } catch (error) { console.error("Error:", error); process.exit(1); }