improve Bun.stringWidth's algorithm (#9022)

* improve Bun.stringWidth's algorithm * add a bunch more tests from string-width package * make typescript happy * undo typescript changes * use better #define check for debug mode * properly handle latin1 width tests * support grapheme clusters * fix trailing newline * visibleUTF16WidthFn- add fast path for leading ascii * add firstNonASCII16IgnoreMin * fix firstNonASCII16CheckMin * vectorize visibleUTF16WidthFn * support emoji variation selector * expose stringWidth in release mode too * vectorize visibleLatin1Width * support ambiguousIsNarrow option * add typescript definition for stringWidth
2026-02-09 18:38:55 +00:00 · 2024-02-22 19:16:17 -08:00
parent 22c25fad92
commit ed339b367d
13 changed files with 2238 additions and 100 deletions
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -8,6 +8,7 @@ const bun = @import("root").bun;
 pub const joiner = @import("./string_joiner.zig");
 const log = bun.Output.scoped(.STR, true);
 const js_lexer = @import("./js_lexer.zig");
+const grapheme = @import("./grapheme.zig");

 pub const Encoding = enum {
    ascii,
@@ -4208,6 +4209,10 @@ pub fn firstNonASCII16(comptime Slice: type, slice: Slice) ?u32 {
    return firstNonASCII16CheckMin(Slice, slice, true);
 }

+pub fn firstNonASCII16IgnoreMin(comptime Slice: type, slice: Slice) ?u32 {
+    return firstNonASCII16CheckMin(Slice, slice, false);
+}
+
 /// Get the line number and the byte offsets of `line_range_count` above the desired line number
 /// The final element is the end index of the desired line
 const LineRange = struct {
@@ -4351,11 +4356,11 @@ pub fn getLinesInText(text: []const u8, line: u32, comptime line_range_count: us

 pub fn firstNonASCII16CheckMin(comptime Slice: type, slice: Slice, comptime check_min: bool) ?u32 {
    var remaining = slice;
+    const remaining_start = remaining.ptr;

-    if (comptime Environment.enableSIMD and Environment.isNative) {
+    if (Environment.enableSIMD and Environment.isNative) {
        const end_ptr = remaining.ptr + remaining.len - (remaining.len % ascii_u16_vector_size);
        if (remaining.len > ascii_u16_vector_size) {
-            const remaining_start = remaining.ptr;
            while (remaining.ptr != end_ptr) {
                const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*;
                const max_value = @reduce(.Max, vec);
@@ -4371,24 +4376,21 @@ pub fn firstNonASCII16CheckMin(comptime Slice: type, slice: Slice, comptime chec
                        // it does it element-wise for every single u8 on the vector
                        // instead of doing the SIMD instructions
                        // it removes a loop, but probably is slower in the end
-                        const cmp = @as(AsciiVectorU16U1, @bitCast(vec > max_u16_ascii)) |
-                            @as(AsciiVectorU16U1, @bitCast(vec < min_u16_ascii));
+                        const cmp = @as(AsciiVectorU16U1, @bitCast(vec > max_u16_ascii)) | @as(AsciiVectorU16U1, @bitCast(vec < min_u16_ascii));
                        const bitmask: u8 = @as(u8, @bitCast(cmp));
                        const first = @ctz(bitmask);

-                        return @as(u32, @intCast(@as(u32, first) +
-                            @as(u32, @intCast(slice.len - remaining.len))));
+                        return @as(u32, @intCast(@as(u32, first) + @as(u32, @intCast(slice.len - remaining.len))));
                    }
-                } else if (comptime !check_min) {
+                } else {
                    if (max_value > 127) {
-                        remaining.len -= (@intFromPtr(remaining.ptr) - @intFromPtr(remaining_start)) / 2;
-
                        const cmp = vec > max_u16_ascii;
                        const bitmask: u8 = @as(u8, @bitCast(cmp));
-                        const first = @ctz(bitmask);
+                        const index_of_first_nonascii_in_vector = @ctz(bitmask);

-                        return @as(u32, @intCast(@as(u32, first) +
-                            @as(u32, @intCast(slice.len - remaining.len))));
+                        const offset_of_vector_in_input = (@intFromPtr(remaining.ptr) - @intFromPtr(remaining_start)) / 2;
+
+                        return @intCast(offset_of_vector_in_input + index_of_first_nonascii_in_vector);
                    }
                }

@@ -4398,22 +4400,19 @@ pub fn firstNonASCII16CheckMin(comptime Slice: type, slice: Slice, comptime chec
        }
    }

+    var i: usize = (@intFromPtr(remaining.ptr) - @intFromPtr(remaining_start)) / 2;
    if (comptime check_min) {
-        var i: usize = 0;
        for (remaining) |char| {
            if (char > 127 or char < 0x20) {
                return @as(u32, @truncate(i));
            }
-
            i += 1;
        }
    } else {
-        var i: usize = 0;
        for (remaining) |char| {
            if (char > 127) {
                return @as(u32, @truncate(i));
            }
-
            i += 1;
        }
    }
@@ -5799,11 +5798,241 @@ pub fn isFullWidthCodepointType(comptime T: type, cp: T) bool {
    };
 }

-pub fn visibleCodepointWidth(cp: anytype) u3 {
-    return visibleCodepointWidthType(@TypeOf(cp), cp);
+pub fn isAmgiguousCodepointType(comptime T: type, cp: T) bool {
+    return switch (cp) {
+        0xA1,
+        0xA4,
+        0xA7,
+        0xA8,
+        0xAA,
+        0xAD,
+        0xAE,
+        0xB0...0xB4,
+        0xB6...0xBA,
+        0xBC...0xBF,
+        0xC6,
+        0xD0,
+        0xD7,
+        0xD8,
+        0xDE...0xE1,
+        0xE6,
+        0xE8...0xEA,
+        0xEC,
+        0xED,
+        0xF0,
+        0xF2,
+        0xF3,
+        0xF7...0xFA,
+        0xFC,
+        0xFE,
+        0x101,
+        0x111,
+        0x113,
+        0x11B,
+        0x126,
+        0x127,
+        0x12B,
+        0x131...0x133,
+        0x138,
+        0x13F...0x142,
+        0x144,
+        0x148...0x14B,
+        0x14D,
+        0x152,
+        0x153,
+        0x166,
+        0x167,
+        0x16B,
+        0x1CE,
+        0x1D0,
+        0x1D2,
+        0x1D4,
+        0x1D6,
+        0x1D8,
+        0x1DA,
+        0x1DC,
+        0x251,
+        0x261,
+        0x2C4,
+        0x2C7,
+        0x2C9...0x2CB,
+        0x2CD,
+        0x2D0,
+        0x2D8...0x2DB,
+        0x2DD,
+        0x2DF,
+        0x300...0x36F,
+        0x391...0x3A1,
+        0x3A3...0x3A9,
+        0x3B1...0x3C1,
+        0x3C3...0x3C9,
+        0x401,
+        0x410...0x44F,
+        0x451,
+        0x2010,
+        0x2013...0x2016,
+        0x2018,
+        0x2019,
+        0x201C,
+        0x201D,
+        0x2020...0x2022,
+        0x2024...0x2027,
+        0x2030,
+        0x2032,
+        0x2033,
+        0x2035,
+        0x203B,
+        0x203E,
+        0x2074,
+        0x207F,
+        0x2081...0x2084,
+        0x20AC,
+        0x2103,
+        0x2105,
+        0x2109,
+        0x2113,
+        0x2116,
+        0x2121,
+        0x2122,
+        0x2126,
+        0x212B,
+        0x2153,
+        0x2154,
+        0x215B...0x215E,
+        0x2160...0x216B,
+        0x2170...0x2179,
+        0x2189,
+        0x2190...0x2199,
+        0x21B8,
+        0x21B9,
+        0x21D2,
+        0x21D4,
+        0x21E7,
+        0x2200,
+        0x2202,
+        0x2203,
+        0x2207,
+        0x2208,
+        0x220B,
+        0x220F,
+        0x2211,
+        0x2215,
+        0x221A,
+        0x221D...0x2220,
+        0x2223,
+        0x2225,
+        0x2227...0x222C,
+        0x222E,
+        0x2234...0x2237,
+        0x223C,
+        0x223D,
+        0x2248,
+        0x224C,
+        0x2252,
+        0x2260,
+        0x2261,
+        0x2264...0x2267,
+        0x226A,
+        0x226B,
+        0x226E,
+        0x226F,
+        0x2282,
+        0x2283,
+        0x2286,
+        0x2287,
+        0x2295,
+        0x2299,
+        0x22A5,
+        0x22BF,
+        0x2312,
+        0x2460...0x24E9,
+        0x24EB...0x254B,
+        0x2550...0x2573,
+        0x2580...0x258F,
+        0x2592...0x2595,
+        0x25A0,
+        0x25A1,
+        0x25A3...0x25A9,
+        0x25B2,
+        0x25B3,
+        0x25B6,
+        0x25B7,
+        0x25BC,
+        0x25BD,
+        0x25C0,
+        0x25C1,
+        0x25C6...0x25C8,
+        0x25CB,
+        0x25CE...0x25D1,
+        0x25E2...0x25E5,
+        0x25EF,
+        0x2605,
+        0x2606,
+        0x2609,
+        0x260E,
+        0x260F,
+        0x261C,
+        0x261E,
+        0x2640,
+        0x2642,
+        0x2660,
+        0x2661,
+        0x2663...0x2665,
+        0x2667...0x266A,
+        0x266C,
+        0x266D,
+        0x266F,
+        0x269E,
+        0x269F,
+        0x26BF,
+        0x26C6...0x26CD,
+        0x26CF...0x26D3,
+        0x26D5...0x26E1,
+        0x26E3,
+        0x26E8,
+        0x26E9,
+        0x26EB...0x26F1,
+        0x26F4,
+        0x26F6...0x26F9,
+        0x26FB,
+        0x26FC,
+        0x26FE,
+        0x26FF,
+        0x273D,
+        0x2776...0x277F,
+        0x2B56...0x2B59,
+        0x3248...0x324F,
+        0xE000...0xF8FF,
+        0xFE00...0xFE0F,
+        0xFFFD,
+        0x1F100...0x1F10A,
+        0x1F110...0x1F12D,
+        0x1F130...0x1F169,
+        0x1F170...0x1F18D,
+        0x1F18F,
+        0x1F190,
+        0x1F19B...0x1F1AC,
+        0xE0100...0xE01EF,
+        0xF0000...0xFFFFD,
+        0x100000...0x10FFFD,
+        => true,
+        else => false,
+    };
 }

-pub fn visibleCodepointWidthType(comptime T: type, cp: T) usize {
+pub fn visibleCodepointWidth(cp: u32, ambiguousAsWide: bool) u3 {
+    return visibleCodepointWidthType(u32, cp, ambiguousAsWide);
+}
+
+pub fn visibleCodepointWidthMaybeEmoji(cp: u32, maybe_emoji: bool, ambiguousAsWide: bool) u3 {
+    // UCHAR_EMOJI=57,
+    if (maybe_emoji and icu_hasBinaryProperty(cp, 57)) {
+        return 2;
+    }
+    return visibleCodepointWidth(cp, ambiguousAsWide);
+}
+
+pub fn visibleCodepointWidthType(comptime T: type, cp: T, ambiguousAsWide: bool) u3 {
    if (isZeroWidthCodepointType(T, cp)) {
        return 0;
    }
@@ -5811,46 +6040,55 @@ pub fn visibleCodepointWidthType(comptime T: type, cp: T) usize {
    if (isFullWidthCodepointType(T, cp)) {
        return 2;
    }
+    if (ambiguousAsWide and isAmgiguousCodepointType(T, cp)) {
+        return 2;
+    }

    return 1;
 }

 pub const visible = struct {
-    fn visibleASCIIWidth(input_: anytype) usize {
+    // Ref: https://cs.stanford.edu/people/miles/iso8859.html
+    fn visibleLatin1Width(input_: []const u8) usize {
        var length: usize = 0;
        var input = input_;
-
-        if (comptime Environment.enableSIMD) {
-            // https://zig.godbolt.org/z/hxhjncvq7
-            const ElementType = std.meta.Child(@TypeOf(input_));
-            const simd = 16 / @sizeOf(ElementType);
-            if (input.len >= simd) {
-                const input_end = input.ptr + input.len - (input.len % simd);
-                while (input.ptr != input_end) {
-                    const chunk: @Vector(simd, ElementType) = input[0..simd].*;
-                    input = input[simd..];
-
-                    const cmp: @Vector(simd, ElementType) = @splat(0x1f);
-                    const match1: @Vector(simd, u1) = @bitCast(chunk >= cmp);
-                    const match: @Vector(simd, ElementType) = match1;
-
-                    length += @reduce(.Add, match);
-                }
-            }
-
-            // this is a deliberate compiler optimization
-            // it disables auto-vectorizing the "input" for loop.
-            if (!(input.len < simd)) unreachable;
-        }
-
-        for (input) |c| {
-            length += if (c > 0x1f) 1 else 0;
+        const input_end_ptr = input.ptr + input.len - (input.len % 16);
+        var input_ptr = input.ptr;
+        while (input_ptr != input_end_ptr) {
+            const input_chunk: [16]u8 = input_ptr[0..16].*;
+            const sums: @Vector(16, u8) = [16]u8{
+                visibleLatin1WidthScalar(input_chunk[0]),
+                visibleLatin1WidthScalar(input_chunk[1]),
+                visibleLatin1WidthScalar(input_chunk[2]),
+                visibleLatin1WidthScalar(input_chunk[3]),
+                visibleLatin1WidthScalar(input_chunk[4]),
+                visibleLatin1WidthScalar(input_chunk[5]),
+                visibleLatin1WidthScalar(input_chunk[6]),
+                visibleLatin1WidthScalar(input_chunk[7]),
+                visibleLatin1WidthScalar(input_chunk[8]),
+                visibleLatin1WidthScalar(input_chunk[9]),
+                visibleLatin1WidthScalar(input_chunk[10]),
+                visibleLatin1WidthScalar(input_chunk[11]),
+                visibleLatin1WidthScalar(input_chunk[12]),
+                visibleLatin1WidthScalar(input_chunk[13]),
+                visibleLatin1WidthScalar(input_chunk[14]),
+                visibleLatin1WidthScalar(input_chunk[15]),
+            };
+            length += @reduce(.Add, sums);
+            input_ptr += 16;
        }
+        input.len %= 16;
+        input.ptr = input_ptr;

+        for (input) |byte| length += visibleLatin1WidthScalar(byte);
        return length;
    }

-    fn visibleASCIIWidthExcludeANSIColors(input_: anytype) usize {
+    fn visibleLatin1WidthScalar(c: u8) u1 {
+        return if ((c >= 127 and c <= 159) or c < 32) 0 else 1;
+    }
+
+    fn visibleLatin1WidthExcludeANSIColors(input_: anytype) usize {
        var length: usize = 0;
        var input = input_;

@@ -5858,7 +6096,7 @@ pub const visible = struct {
        const indexFn = if (comptime ElementType == u8) strings.indexOfCharUsize else strings.indexOfChar16Usize;

        while (indexFn(input, '\x1b')) |i| {
-            length += visibleASCIIWidth(input[0..i]);
+            length += visibleLatin1Width(input[0..i]);
            input = input[i..];

            if (input.len < 3) return length;
@@ -5871,7 +6109,7 @@ pub const visible = struct {
            }
        }

-        length += visibleASCIIWidth(input);
+        length += visibleLatin1Width(input);

        return length;
    }
@@ -5895,7 +6133,7 @@ pub const visible = struct {
            };

            const cp = decodeWTF8RuneTMultibyte(&cp_bytes, skip, u32, unicode_replacement);
-            len += visibleCodepointWidthType(u32, cp);
+            len += visibleCodepointWidth(cp, false);

            bytes = bytes[@min(i + skip, bytes.len)..];
        }
@@ -5905,51 +6143,115 @@ pub const visible = struct {
        return len;
    }

-    fn visibleUTF16WidthFn(input: []const u16, comptime asciiFn: anytype) usize {
-        var bytes = input;
+    fn visibleUTF16WidthFn(input_: []const u16, exclude_ansi_colors: bool, ambiguousAsWide: bool) usize {
+        var input = input_;
        var len: usize = 0;
-        while (bun.strings.firstNonASCII16CheckMin([]const u16, bytes, false)) |i| {
-            len += asciiFn(bytes[0..i]);
-            bytes = bytes[i..];
+        var prev: ?u21 = 0;
+        var break_state = grapheme.BreakState{};
+        var break_start: u21 = 0;
+        var saw_1b = false;
+        var saw_bracket = false;
+        var stretch_len: usize = 0;

-            const utf8 = utf16CodepointWithFFFD([]const u16, bytes);
-            len += visibleCodepointWidthType(u32, utf8.code_point);
-            bytes = bytes[@min(@as(usize, utf8.len), bytes.len)..];
+        while (true) {
+            {
+                const idx = firstNonASCII16IgnoreMin([]const u16, input) orelse input.len;
+                for (0..idx) |j| {
+                    const cp = input[j];
+                    defer prev = cp;
+
+                    if (saw_bracket) {
+                        if (cp == 'm') {
+                            saw_1b = false;
+                            saw_bracket = false;
+                            stretch_len = 0;
+                            continue;
+                        }
+                        stretch_len += visibleCodepointWidth(cp, ambiguousAsWide);
+                        continue;
+                    }
+                    if (saw_1b) {
+                        if (cp == '[') {
+                            saw_bracket = true;
+                            stretch_len = 0;
+                            continue;
+                        }
+                        len += visibleCodepointWidth(cp, ambiguousAsWide);
+                        continue;
+                    }
+                    if (!exclude_ansi_colors or cp != 0x1b) {
+                        if (prev) |prev_| {
+                            const should_break = grapheme.graphemeBreak(prev_, cp, &break_state);
+                            if (should_break) {
+                                len += visibleCodepointWidthMaybeEmoji(break_start, cp == 0xFE0F, ambiguousAsWide);
+                                break_start = cp;
+                            } else {
+                                //
+                            }
+                        } else {
+                            len += visibleCodepointWidth(cp, ambiguousAsWide);
+                            break_start = cp;
+                        }
+                        continue;
+                    }
+                    saw_1b = true;
+                    continue;
+                }
+                len += stretch_len;
+                input = input[idx..];
+            }
+            if (input.len == 0) break;
+            const replacement = utf16CodepointWithFFFD([]const u16, input);
+            defer input = input[replacement.len..];
+            if (replacement.fail) continue;
+            const cp: u21 = @intCast(replacement.code_point);
+            defer prev = cp;
+
+            if (prev) |prev_| {
+                const should_break = grapheme.graphemeBreak(prev_, cp, &break_state);
+                if (should_break) {
+                    len += visibleCodepointWidthMaybeEmoji(break_start, cp == 0xFE0F, ambiguousAsWide);
+                    break_start = cp;
+                }
+            } else {
+                len += visibleCodepointWidth(cp, ambiguousAsWide);
+                break_start = cp;
+            }
+        }
+        if (break_start > 0) {
+            len += visibleCodepointWidthMaybeEmoji(break_start, (prev orelse 0) == 0xFE0F, ambiguousAsWide);
        }
-
-        len += asciiFn(bytes);
-
        return len;
    }

    fn visibleLatin1WidthFn(input: []const u8) usize {
-        return visibleASCIIWidth(input);
+        return visibleLatin1Width(input);
    }

    pub const width = struct {
-        pub fn ascii(input: []const u8) usize {
-            return visibleASCIIWidth(input);
+        pub fn latin1(input: []const u8) usize {
+            return visibleLatin1Width(input);
        }

        pub fn utf8(input: []const u8) usize {
-            return visibleUTF8WidthFn(input, visibleASCIIWidth);
+            return visibleUTF8WidthFn(input, visibleLatin1Width);
        }

-        pub fn utf16(input: []const u16) usize {
-            return visibleUTF16WidthFn(input, visibleASCIIWidth);
+        pub fn utf16(input: []const u16, ambiguousAsWide: bool) usize {
+            return visibleUTF16WidthFn(input, false, ambiguousAsWide);
        }

        pub const exclude_ansi_colors = struct {
-            pub fn ascii(input: []const u8) usize {
-                return visibleASCIIWidthExcludeANSIColors(input);
+            pub fn latin1(input: []const u8) usize {
+                return visibleLatin1WidthExcludeANSIColors(input);
            }

            pub fn utf8(input: []const u8) usize {
-                return visibleUTF8WidthFn(input, visibleASCIIWidthExcludeANSIColors);
+                return visibleUTF8WidthFn(input, visibleLatin1WidthExcludeANSIColors);
            }

-            pub fn utf16(input: []const u16) usize {
-                return visibleUTF16WidthFn(input, visibleASCIIWidthExcludeANSIColors);
+            pub fn utf16(input: []const u16, ambiguousAsWide: bool) usize {
+                return visibleUTF16WidthFn(input, true, ambiguousAsWide);
            }
        };
    };
@@ -5994,3 +6296,6 @@ pub fn withoutSuffixComptime(input: []const u8, comptime suffix: []const u8) []c
    }
    return input;
 }
+
+// extern "C" bool icu_hasBinaryProperty(UChar32 cp, unsigned int prop)
+extern fn icu_hasBinaryProperty(c: u32, which: c_uint) bool;