Shrink MimeType list (#21004)

Co-authored-by: Jarred-Sumner <709451+Jarred-Sumner@users.noreply.github.com> Co-authored-by: Meghan Denny <meghan@bun.sh>
2026-02-09 10:28:47 +00:00 · 2025-07-15 22:37:09 -07:00
parent 32ce9a3890
commit d8b37bf408
8 changed files with 8784 additions and 3491 deletions
--- a/cmake/sources/JavaScriptCodegenSources.txt
+++ b/cmake/sources/JavaScriptCodegenSources.txt
@@ -10,6 +10,7 @@ src/codegen/class-definitions.ts
 src/codegen/client-js.ts
 src/codegen/create-hash-table.ts
 src/codegen/generate-classes.ts
+src/codegen/generate-compact-string-table.ts
 src/codegen/generate-js2native.ts
 src/codegen/generate-jssink.ts
 src/codegen/generate-node-errors.ts
--- a/cmake/sources/ZigSources.txt
+++ b/cmake/sources/ZigSources.txt
@@ -538,6 +538,7 @@ src/http/HTTPThread.zig
 src/http/InitError.zig
 src/http/InternalState.zig
 src/http/Method.zig
+src/http/mime_type_list_enum.zig
 src/http/MimeType.zig
 src/http/ProxyTunnel.zig
 src/http/SendFile.zig
--- a/src/bun.js/rare_data.zig
+++ b/src/bun.js/rare_data.zig
@@ -135,7 +135,11 @@ pub fn mimeTypeFromString(this: *RareData, allocator: std.mem.Allocator, str: []
        ) catch bun.outOfMemory();
    }

-    return this.mime_types.?.get(str);
+    if (this.mime_types.?.get(str)) |entry| {
+        return bun.http.MimeType.Compact.from(entry).toMimeType();
+    }
+
+    return null;
 }

 pub const HotMap = struct {
--- a/src/bun.js/webcore/Blob.zig
+++ b/src/bun.js/webcore/Blob.zig
@@ -518,7 +518,7 @@ pub fn fromURLSearchParams(
    };
    search_params.toString(URLSearchParamsConverter, &converter, URLSearchParamsConverter.convert);
    var store = Blob.Store.init(converter.buf, allocator);
-    store.mime_type = MimeType.all.@"application/x-www-form-urlencoded";
+    store.mime_type = MimeType.Compact.from(.@"application/x-www-form-urlencoded").toMimeType();

    var blob = Blob.initWithStore(store, globalThis);
    blob.content_type = store.mime_type.value;
--- a/src/codegen/generate-compact-string-table.ts
+++ b/src/codegen/generate-compact-string-table.ts
@@ -0,0 +1,430 @@
+#!/usr/bin/env bun
+
+/**
+ * Compact String Table Generator
+ *
+ * Generates a Zig enum that stores multiple strings in a contiguous buffer with
+ * reduced per-string overhead compared to individual string slices.
+ *
+ * Instead of storing each string as a separate slice (16 bytes each), this packs
+ * the string metadata into enum values using bit fields. The actual string data
+ * is stored in a single static array.
+ *
+ * ## How it works:
+ *
+ * 1. Groups strings by length for uniform spacing within each group
+ * 2. Stores position and length group in bit-packed enum values
+ * 3. Uses non-power-of-2 integers (u9, u12, etc.) to minimize enum size
+ *
+ * ## Usage:
+ *
+ * ```bash
+ * # Input: newline-delimited strings
+ * echo -e "application/json\\ntext/html\\ntext/plain" > strings.txt
+ *
+ * # Generate Zig code
+ * bun src/codegen/generate-compact-string-table.ts strings.txt output.zig MyStrings
+ * ```
+ *
+ * ## Trade-offs:
+ *
+ * - Reduces memory overhead from 16 bytes to 1-2 bytes per string
+ * - O(1) string access through length-based grouping
+ * - Uniform spacing within groups may include some padding
+ * - Requires build-time code generation
+ *
+ */
+
+import { writeFileSync } from "fs";
+
+// Positional CLI arguments: everything after the runtime and the script path.
+const args = process.argv.slice(2);
+
+if (args.length < 3) {
+  console.error("Usage: generate-compact-string-table.ts <input.txt> <output.zig> <enum-name> [namespace]");
+  // The strings are read from the <input.txt> file, not from stdin.
+  console.error("<input.txt> must contain the strings, one per line");
+  process.exit(1);
+}
+
+// `namespace` is optional; it is undefined when only three arguments are given.
+const [inputPath, outputPath, enumName, namespace] = args;
+
+/** One named string together with where its bytes sit in the packed buffer. */
+interface StringEntry {
+  /** Name emitted as the Zig enum field (after identifier escaping). */
+  name: string;
+  /** The string contents. */
+  value: string;
+  /** Start index of `value` within the packed data string. */
+  offset: number;
+  /** Length of the group this entry was packed in, which equals `value.length`. */
+  length: number;
+}
+
+/**
+ * A packed string and the entries that refer to it.
+ *
+ * NOTE(review): nothing in the visible part of this file references this
+ * interface; confirm whether it is still needed.
+ */
+interface PackedString {
+  value: string;
+  offset: number;
+  entries: StringEntry[];
+}
+
+/**
+ * Wraps `name` in Zig's quoted-identifier syntax (`@"..."`), putting a
+ * backslash in front of every backslash and double quote it contains.
+ */
+function escapeZigIdentifier(name: string): string {
+  // Quote unconditionally, even when the name would be a valid bare identifier.
+  const escaped = name.replace(/[\\"]/g, "\\$&");
+  return '@"' + escaped + '"';
+}
+
+/**
+ * Escapes `str` for use between the double quotes of a Zig string literal.
+ *
+ * Backslash, double quote, newline, carriage return and tab get their
+ * two-character escapes; every other character in the ranges 0x00-0x1F and
+ * 0x7F-0x9F becomes a two-digit lowercase `\xNN` escape. All other characters
+ * pass through untouched.
+ */
+function escapeZigString(str: string): string {
+  const shorthand = new Map<string, string>([
+    ["\\", "\\\\"],
+    ['"', '\\"'],
+    ["\n", "\\n"],
+    ["\r", "\\r"],
+    ["\t", "\\t"],
+  ]);
+
+  // A single pass is enough: no replacement text contains a character that
+  // would itself need escaping a second time.
+  return str.replace(/[\\"\x00-\x1F\x7F-\x9F]/g, ch => {
+    const short = shorthand.get(ch);
+    if (short !== undefined) {
+      return short;
+    }
+    const hex = ch.charCodeAt(0).toString(16).padStart(2, "0");
+    return `\\x${hex}`;
+  });
+}
+
+/**
+ * Picks the narrowest of Zig's `u8`, `u16`, `u32` and `u64` whose range
+ * includes `maxValue`; anything above the `u32` range yields `u64`.
+ */
+function findSmallestIntType(maxValue: number): string {
+  const candidates: [number, string][] = [
+    [0xff, "u8"],
+    [0xffff, "u16"],
+    [0xffffffff, "u32"],
+  ];
+  const fit = candidates.find(([limit]) => maxValue <= limit);
+  return fit ? fit[1] : "u64";
+}
+
+/** All strings that share one length, packed back to back in the buffer. */
+interface LengthGroup {
+  /** The length shared by every string value in this group. */
+  length: number;
+  /** One item per name; several names may carry the same value. */
+  strings: { name: string; value: string }[];
+  /** Index in the packed data where this group begins; 0 until packing assigns it. */
+  startOffset: number;
+}
+
+/**
+ * Lays the strings out in one buffer, grouped by length so that every slot
+ * inside a group has the same width.
+ *
+ * Names that share a value share a slot. Groups are ordered by how many names
+ * they hold (most first, ties broken by shorter length), and the distinct
+ * values inside a group are ordered by the default `Array.prototype.sort`
+ * comparison. Every distinct value is stored verbatim; no substring sharing
+ * between values is attempted, because that would break the uniform spacing.
+ *
+ * @param strings Name/value pairs to pack.
+ * @returns `entries` (one per input name, carrying its offset and length),
+ *   `packedData` (the concatenated buffer) and `lengthGroups` (with
+ *   `startOffset` filled in).
+ */
+function optimizeStringPacking(strings: { name: string; value: string }[]): {
+  entries: StringEntry[];
+  packedData: string;
+  lengthGroups: LengthGroup[];
+} {
+  // Names per distinct value, in first-seen order.
+  const namesByValue = new Map<string, string[]>();
+  for (const { name, value } of strings) {
+    const names = namesByValue.get(value);
+    if (names) {
+      names.push(name);
+    } else {
+      namesByValue.set(value, [name]);
+    }
+  }
+
+  // Distinct values bucketed by their length, still in first-seen order.
+  const valuesByLength = new Map<number, string[]>();
+  for (const value of namesByValue.keys()) {
+    const bucket = valuesByLength.get(value.length);
+    if (bucket) {
+      bucket.push(value);
+    } else {
+      valuesByLength.set(value.length, [value]);
+    }
+  }
+
+  // Total number of names carried by a bucket of values.
+  const nameCount = (values: string[]): number =>
+    values.reduce((sum, value) => sum + namesByValue.get(value)!.length, 0);
+
+  // Groups holding more names come first so they receive the smaller indices.
+  const lengthGroups: LengthGroup[] = Array.from(valuesByLength.entries())
+    .sort(([aLength, aValues], [bLength, bValues]) => nameCount(bValues) - nameCount(aValues) || aLength - bLength)
+    .map(([length, values]) => ({
+      length,
+      strings: values.flatMap(value => namesByValue.get(value)!.map(name => ({ name, value }))),
+      startOffset: 0,
+    }));
+
+  // Append each group's distinct values back to back.
+  let packedData = "";
+  const entries: StringEntry[] = [];
+
+  for (const group of lengthGroups) {
+    group.startOffset = packedData.length;
+
+    // Sort a copy so the position of a value inside its group is deterministic.
+    const sortedValues = [...valuesByLength.get(group.length)!].sort();
+
+    sortedValues.forEach((value, position) => {
+      const offset = group.startOffset + position * group.length;
+      packedData += value;
+
+      // Every name that maps to this value points at the same slot.
+      for (const name of namesByValue.get(value)!) {
+        entries.push({ name, value, offset, length: group.length });
+      }
+    });
+  }
+
+  return { entries, packedData, lengthGroups };
+}
+
+/**
+ * Renders the Zig source of a compact string-table enum.
+ *
+ * Each name becomes one enum field whose integer value packs the index of its
+ * length group (low bits) and its position inside that group (high bits). The
+ * generated `slice`, `len` and `ptr` functions decode that value against the
+ * shared `_bytes` buffer.
+ *
+ * @param enumName Name of the generated Zig enum.
+ * @param strings Name/value pairs. A name repeated with the same value is
+ *   emitted once.
+ * @param namespace Optional Zig struct to wrap the enum in.
+ * @returns The generated Zig source text.
+ * @throws Error when `strings` is empty, or when one name is given two
+ *   different values (a Zig enum cannot declare the same field twice).
+ */
+export function generateCompactStringTable(
+  enumName: string,
+  strings: { name: string; value: string }[],
+  namespace?: string,
+): string {
+  if (strings.length === 0) {
+    throw new Error("No strings provided");
+  }
+
+  // Every name becomes exactly one enum field, so collapse exact repeats and
+  // reject conflicting ones instead of emitting Zig that cannot compile.
+  const valueByName = new Map<string, string>();
+  const uniqueStrings: { name: string; value: string }[] = [];
+  for (const entry of strings) {
+    if (!valueByName.has(entry.name)) {
+      valueByName.set(entry.name, entry.value);
+      uniqueStrings.push(entry);
+    } else if (valueByName.get(entry.name) !== entry.value) {
+      throw new Error(`Duplicate name ${JSON.stringify(entry.name)} has conflicting values`);
+    }
+  }
+
+  const { entries, packedData, lengthGroups } = optimizeStringPacking(uniqueStrings);
+
+  // Per-group lookup tables, indexed by the group's position in `lengthGroups`.
+  const uniqueLengths = lengthGroups.map(group => group.length);
+  const groupStartOffsets = lengthGroups.map(group => group.startOffset);
+  const groupCounts: number[] = [];
+
+  // For each name: which group it is in and which slot of that group it uses.
+  const entryPositions = new Map<string, { groupIndex: number; positionInGroup: number }>();
+
+  lengthGroups.forEach((group, groupIndex) => {
+    // Distinct values in default sort order (must match the packing logic exactly).
+    const sortedValues = Array.from(new Set(group.strings.map(s => s.value))).sort();
+    groupCounts.push(sortedValues.length);
+
+    const positionByValue = new Map<string, number>();
+    sortedValues.forEach((value, position) => positionByValue.set(value, position));
+
+    for (const { name, value } of group.strings) {
+      entryPositions.set(name, { groupIndex, positionInGroup: positionByValue.get(value)! });
+    }
+  });
+
+  // Calculate bits needed
+  const lengthGroupBits = Math.ceil(Math.log2(lengthGroups.length || 1));
+  const maxPositionInGroup = Math.max(...groupCounts);
+  const positionBits = Math.ceil(Math.log2(maxPositionInGroup || 1));
+  const actualPackedBits = lengthGroupBits + positionBits;
+
+  // Use exact bit size for the enum
+  const packedIntType = `u${actualPackedBits}`;
+
+  // Sort entries by name for stable output
+  entries.sort((a, b) => a.name.localeCompare(b.name));
+
+  let output = `//! Generated by generate-compact-string-table.ts
+//! Do not edit manually
+//! To regenerate, run:
+//! \`\`\`
+//!   bun run src/codegen/generate-compact-string-table.ts ${inputPath} ${outputPath} ${enumName}
+//! \`\`\`
+`;
+
+  if (namespace) {
+    output += `\npub const ${namespace} = struct {\n`;
+  }
+
+  output += `pub const ${enumName} = enum(${packedIntType}) {
+    const LengthGroupBits = ${lengthGroupBits};
+    const PositionBits = ${positionBits};
+    const PackedInt = ${packedIntType};
+    
+    pub const Packed = packed struct (PackedInt) {
+        length_group: u${lengthGroupBits},
+        position: u${positionBits},
+    };
+    
+    const _bytes = "${escapeZigString(packedData)}";
+    const _lengths = [_]${findSmallestIntType(Math.max(...uniqueLengths))}{${uniqueLengths.join(", ")}};
+    const _group_start_offsets = [_]${findSmallestIntType(Math.max(...groupStartOffsets))}{${groupStartOffsets.join(", ")}};
+    
+`;
+
+  // Generate enum fields: group index in the low bits, position above it,
+  // matching the field order of the generated `Packed` struct.
+  for (const entry of entries) {
+    const pos = entryPositions.get(entry.name)!;
+    const packedValue = pos.groupIndex | (pos.positionInGroup << lengthGroupBits);
+    output += `    ${escapeZigIdentifier(entry.name)} = ${packedValue},\n`;
+  }
+
+  output += `
+    pub fn slice(this: ${enumName}) []const u8 {
+        const p: Packed = @bitCast(@as(PackedInt, @intFromEnum(this)));
+        const length: usize = _lengths[p.length_group];
+        const offset = @as(usize, _group_start_offsets[p.length_group]) + @as(usize, p.position) * @as(usize, length);
+        return _bytes[offset..][0..length];
+    }
+    
+    pub fn len(this: ${enumName}) usize {
+        const p: Packed = @bitCast(@as(PackedInt, @intFromEnum(this)));
+        return _lengths[p.length_group];
+    }
+    
+    pub fn ptr(this: ${enumName}) [*]const u8 {
+        const p: Packed = @bitCast(@as(PackedInt, @intFromEnum(this)));
+        const length: usize = _lengths[p.length_group];
+        const offset = @as(usize, _group_start_offsets[p.length_group]) + @as(usize, p.position) * @as(usize, length);
+        return _bytes[offset..].ptr;
+    }
+    
+    pub const count = ${entries.length};
+    pub const all = &[_]${enumName}{
+${entries.map(entry => `        .${escapeZigIdentifier(entry.name)},`).join("\n")}
+    };
+};
+`;
+
+  if (namespace) {
+    output += `};\n`;
+  }
+
+  output += `\nconst std = @import("std");\n`;
+
+  return output;
+}
+
+// CLI interface
+
+// Read the newline-delimited strings from the input file
+const input = await Bun.file(inputPath).text();
+const strings: { name: string; value: string }[] = [];
+
+for (const line of input.split("\n")) {
+  // Trim before the emptiness check so that whitespace-only lines, and the
+  // "\r" left behind by blank lines in CRLF files, do not become empty entries.
+  const value = line.trim();
+  if (!value) continue;
+  // Each line is just a string value, use it as both name and value
+  strings.push({ name: value, value });
+}
+
+if (strings.length === 0) {
+  console.error("No valid strings provided");
+  process.exit(1);
+}
+
+try {
+  const output = generateCompactStringTable(enumName, strings, namespace);
+  writeFileSync(outputPath, output);
+
+  // Print statistics, recovered by pattern-matching the generated source
+  const totalOriginalSize = strings.reduce((sum, s) => sum + s.value.length, 0);
+  const packedSize = output.match(/const _bytes = "(.*?)"/s)?.[1]?.length ?? 0;
+  const lengthsMatch = output.match(/const _lengths = \[_\][^{]+\{([^}]+)\}/);
+  const lengthGroupsCount = lengthsMatch ? lengthsMatch[1].split(",").filter(s => s.trim()).length : 0;
+
+  // Calculate actual memory usage
+  const naiveMemory = strings.length * 16; // Each []const u8 is 16 bytes
+  const actualBits =
+    parseInt(output.match(/const LengthGroupBits = (\d+)/)?.[1] ?? "0") +
+    parseInt(output.match(/const PositionBits = (\d+)/)?.[1] ?? "0");
+  const ourEnumSize = Math.ceil(actualBits / 8);
+  const ourTotalMemory = strings.length * ourEnumSize; // Each enum value
+
+  console.log(`Generated ${outputPath}`);
+  console.log(`  Strings: ${strings.length}`);
+  console.log(`  Length groups: ${lengthGroupsCount}`);
+  console.log(`  Packed bits: ${actualBits} (u${actualBits})`);
+  console.log(`  Packed data: ${packedSize} bytes`);
+  console.log(`  String deduplication: ${((1 - packedSize / totalOriginalSize) * 100).toFixed(1)}% saved`);
+  console.log(
+    `  Memory per value: ${ourEnumSize} bytes vs 16 bytes (${((1 - ourEnumSize / 16) * 100).toFixed(1)}% saved)`,
+  );
+  console.log(`  Total memory: ${ourTotalMemory} bytes vs ${naiveMemory} bytes`);
+} catch (error) {
+  console.error("Error:", error);
+  process.exit(1);
+}
--- a/src/http/MimeType.zig
+++ b/src/http/MimeType.zig
--- a/src/http/mime_type_list.txt
+++ b/src/http/mime_type_list.txt
--- a/src/http/mime_type_list_enum.zig
+++ b/src/http/mime_type_list_enum.zig