lldb: pretty printing for bun.String, ZigString, WTFStringImpl (#21685)

### What does this PR do? This PR adds lldb pretty printing support for `bun.String`, `ZigString` and `WTFStringImpl` so you don't have to click through so many fields to see what the actual string value is.
2026-02-02 15:08:46 +00:00 · 2025-08-07 17:51:33 -07:00
parent 74b1462ad4
commit a9a7526ed1
2 changed files with 384 additions and 3 deletions
--- a/misctools/lldb/README.md
+++ b/misctools/lldb/README.md
@@ -0,0 +1,121 @@
+# LLDB Pretty Printers for Bun
+
+This directory contains LLDB pretty printers for various Bun data structures to improve the debugging experience.
+
+## Files
+
+- `bun_pretty_printer.py` - Pretty printers for Bun-specific types (bun.String, WTFStringImpl, ZigString, BabyList, etc.)
+- `lldb_pretty_printers.py` - Pretty printers for Zig language types from the Zig project
+- `lldb_webkit.py` - Pretty printers for WebKit/JavaScriptCore types
+- `init.lldb` - LLDB initialization commands
+
+## Supported Types
+
+### bun.String Types
+- `bun.String` (or just `String`) - The main Bun string type
+- `WTFStringImpl` - WebKit string implementation (Latin1/UTF16)
+- `ZigString` - Zig string type (UTF8/Latin1/UTF16 with pointer tagging)
+
+### Display Format
+
+The pretty printers show string content directly, with additional metadata:
+
+```
+# bun.String examples:
+[latin1] "Hello, World!"          # Regular ZigString
+[utf8] "UTF-8 String 🎉"          # UTF-8 encoded
+[latin1 static] "Static content"  # Static string
+""                                # Empty string
+<dead>                            # Dead/invalid string
+
+# WTFStringImpl examples:
+[latin1] "WebKit String"          # Encoding tag, then the actual string content
+
+# ZigString examples:
+[utf16 global] "Some text"        # UTF16 globally allocated
+[latin1] "ASCII text"             # Latin1 encoded
+```
+
+## Usage
+
+### Option 1: Manual Loading
+In your LLDB session:
+```lldb
+command script import /path/to/bun/misctools/lldb/bun_pretty_printer.py
+```
+
+### Option 2: Add to ~/.lldbinit
+Add the following line to your `~/.lldbinit` file to load automatically:
+```lldb
+command script import /path/to/bun/misctools/lldb/bun_pretty_printer.py
+```
+
+### Option 3: Use init.lldb
+```lldb
+command source /path/to/bun/misctools/lldb/init.lldb
+```
+
+## Testing
+
+To test the pretty printers:
+
+1. Build a debug version of Bun:
+```bash
+bun bd
+```
+
+2. Create a test file that uses bun.String types
+
+3. Debug with LLDB:
+```bash
+lldb ./build/debug/bun-debug
+(lldb) command script import misctools/lldb/bun_pretty_printer.py
+(lldb) breakpoint set --file your_test.zig --line <line_number>
+(lldb) run your_test.zig
+(lldb) frame variable
+```
+
+## Implementation Details
+
+### ZigString Pointer Tagging
+ZigString uses pointer tagging in the upper bits:
+- Bit 63: 1 = UTF16, 0 = UTF8/Latin1
+- Bit 62: 1 = Globally allocated (mimalloc)
+- Bit 61: 1 = UTF8 encoding
+
+The pretty printer automatically detects and handles these tags.
+
+### WTFStringImpl Encoding
+WTFStringImpl uses flags in `m_hashAndFlags`:
+- Bit 2 (s_hashFlag8BitBuffer): 1 = Latin1, 0 = UTF16
+
+### bun.String Tag Union
+bun.String is a tagged union with these variants:
+- Dead (0): Invalid/freed string
+- WTFStringImpl (1): WebKit string
+- ZigString (2): Regular Zig string
+- StaticZigString (3): Static/immortal string
+- Empty (4): Empty string ""
+
+## Troubleshooting
+
+If the pretty printers don't work:
+
+1. Verify the Python script loaded:
+```lldb
+(lldb) script print("Python works")
+```
+
+2. Check if the category is enabled:
+```lldb
+(lldb) type category list
+```
+
+3. Enable the Bun category manually:
+```lldb
+(lldb) type category enable bun
+```
+
+4. For debugging the pretty printer itself, check for exceptions:
+- The pretty printers catch exceptions and return `<error>` (the `bun.String` printer returns `<error: message>` with the exception text)
+- Modify the code to print exceptions for debugging
--- a/misctools/lldb/bun_pretty_printer.py
+++ b/misctools/lldb/bun_pretty_printer.py
@@ -10,8 +10,8 @@ class bun_BabyList_SynthProvider:

        try:
            self.ptr = self.value.GetChildMemberWithName('ptr')
-            self.len = self.value.GetChildMemberWithName('len').unsigned
-            self.cap = self.value.GetChildMemberWithName('cap').unsigned
+            self.len = self.value.GetChildMemberWithName('len').GetValueAsUnsigned()
+            self.cap = self.value.GetChildMemberWithName('cap').GetValueAsUnsigned()
            self.elem_type = self.ptr.type.GetPointeeType()
            self.elem_size = self.elem_type.size
        except:
@@ -46,7 +46,7 @@ def bun_BabyList_SummaryProvider(value, _=None):
        value = value.GetNonSyntheticValue()
        len_val = value.GetChildMemberWithName('len')
        cap_val = value.GetChildMemberWithName('cap')
-        return 'len=%d cap=%d' % (len_val.unsigned, cap_val.unsigned)
+        return 'len=%d cap=%d' % (len_val.GetValueAsUnsigned(), cap_val.GetValueAsUnsigned())
    except:
        return 'len=? cap=?'

@@ -67,6 +67,241 @@ def add(debugger, *, category, regex=False, type, identifier=None, synth=False,
            type
        ))

+def WTFStringImpl_SummaryProvider(value, _=None):
+    """Summarize a WTFStringImpl (or a pointer to one) as `[encoding] "text"`.
+
+    The value's `m_length`, `m_hashAndFlags` and `m_ptr` members are read,
+    the character buffer is fetched from the debuggee's memory and decoded
+    as latin1 or UTF-16LE depending on the 8-bit-buffer flag.
+
+    Args:
+        value: the lldb value to summarize; pointer values are dereferenced.
+        _: unused second argument passed by lldb to summary callbacks.
+
+    Returns:
+        A summary string. Long strings are cut to 200 characters and
+        reported as `[encoding N chars (size)] "text..." <truncated>`.
+        Failures yield `<null ptr>`, `<read error: ...>` or `<error>`.
+    """
+    try:
+        # Always work on the raw value, not a synthetic wrapper.
+        value = value.GetNonSyntheticValue()
+
+        # Summarize the pointee when handed a pointer.
+        struct = value.deref if value.type.IsPointerType() else value
+
+        m_length = struct.GetChildMemberWithName('m_length').GetValueAsUnsigned()
+        m_hashAndFlags = struct.GetChildMemberWithName('m_hashAndFlags').GetValueAsUnsigned()
+        m_ptr = struct.GetChildMemberWithName('m_ptr')
+
+        # Bit 2 of m_hashAndFlags: set = 8-bit (latin1), clear = 16-bit (utf16).
+        s_hashFlag8BitBuffer = 1 << 2
+        is_8bit = (m_hashAndFlags & s_hashFlag8BitBuffer) != 0
+        encoding = 'latin1' if is_8bit else 'utf16'
+
+        if m_length == 0:
+            return '[%s] ""' % encoding
+
+        MAX_BYTES = 1024 * 1024  # cap on bytes read from the debuggee (1MB)
+        MAX_DISPLAY_CHARS = 200  # cap on characters shown in the summary
+
+        bytes_per_char = 1 if is_8bit else 2
+        total_bytes = m_length * bytes_per_char
+        truncated = total_bytes > MAX_BYTES
+        if truncated:
+            # Round down so a 16-bit code unit is never split by the read.
+            bytes_to_read = (MAX_BYTES // bytes_per_char) * bytes_per_char
+        else:
+            bytes_to_read = total_bytes
+
+        # The members of m_ptr are named 'latin1' and 'utf16', which are
+        # exactly the encoding labels used in the summary.
+        ptr_addr = m_ptr.GetChildMemberWithName(encoding).GetValueAsUnsigned()
+        if not ptr_addr:
+            return '[%s] <null ptr>' % encoding
+
+        error = lldb.SBError()
+        byte_data = value.process.ReadMemory(ptr_addr, bytes_to_read, error)
+        if not error.Success():
+            return '[%s] <read error: %s>' % (encoding, error)
+
+        string_val = byte_data.decode('latin1' if is_8bit else 'utf-16le', errors='replace')
+
+        # Cut to the display limit BEFORE escaping: slicing afterwards could
+        # split an escape pair such as '\\n', and would flag short strings
+        # as truncated merely because escaping made them longer.
+        display_truncated = truncated or len(string_val) > MAX_DISPLAY_CHARS
+        string_val = string_val[:MAX_DISPLAY_CHARS]
+
+        # Escape special characters (backslash first so it is not doubled).
+        string_val = string_val.replace('\\', '\\\\')
+        string_val = string_val.replace('"', '\\"')
+        string_val = string_val.replace('\n', '\\n')
+        string_val = string_val.replace('\r', '\\r')
+        string_val = string_val.replace('\t', '\\t')
+
+        if not display_truncated:
+            return '[%s] "%s"' % (encoding, string_val)
+
+        # Truncated output also reports the full length and byte size.
+        size_info = ' %d chars' % m_length
+        if total_bytes >= 1024 * 1024:
+            size_info += ' (%.1fMB)' % (total_bytes / (1024.0 * 1024.0))
+        elif total_bytes >= 1024:
+            size_info += ' (%.1fKB)' % (total_bytes / 1024.0)
+        return '[%s%s] "%s..." <truncated>' % (encoding, size_info, string_val)
+    except Exception:
+        # Best effort: a summary provider must not raise into lldb, but
+        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
+        return '<error>'
+
+def ZigString_SummaryProvider(value, _=None):
+    """Summarize a ZigString as `[encoding flags] "text"`.
+
+    The `_unsafe_ptr_do_not_use` and `len` members are read. The upper bits
+    of the pointer are treated as tags (bit 63 = utf16, bit 62 = global,
+    bit 61 = utf8); the pointer is masked to its lower 53 bits before the
+    string bytes are read from the debuggee's memory.
+
+    Args:
+        value: the lldb value to summarize.
+        _: unused second argument passed by lldb to summary callbacks.
+
+    Returns:
+        A summary string. Long strings are cut to 200 characters and
+        reported as `[encoding flags N chars (size)] "text..." <truncated>`.
+        Failures yield `[...] <read error>` or `<error>`.
+    """
+    try:
+        # Always work on the raw value, not a synthetic wrapper.
+        value = value.GetNonSyntheticValue()
+
+        ptr = value.GetChildMemberWithName('_unsafe_ptr_do_not_use').GetValueAsUnsigned()
+        length = value.GetChildMemberWithName('len').GetValueAsUnsigned()
+
+        # Tag bits stored in the top of the pointer.
+        is_16bit = (ptr & (1 << 63)) != 0
+        is_global = (ptr & (1 << 62)) != 0
+        is_utf8 = (ptr & (1 << 61)) != 0
+
+        encoding = 'utf16' if is_16bit else ('utf8' if is_utf8 else 'latin1')
+        flags = ' global' if is_global else ''
+
+        if length == 0:
+            return '[%s%s] ""' % (encoding, flags)
+
+        # Untag the pointer (keep only the lower 53 bits).
+        untagged_ptr = ptr & ((1 << 53) - 1)
+
+        MAX_BYTES = 1024 * 1024  # cap on bytes read from the debuggee (1MB)
+        MAX_DISPLAY_CHARS = 200  # cap on characters shown in the summary
+
+        # `len` is counted in 16-bit units for utf16, otherwise in bytes.
+        bytes_per_char = 2 if is_16bit else 1
+        total_bytes = length * bytes_per_char
+        truncated = total_bytes > MAX_BYTES
+        if truncated:
+            # Round down so a 16-bit code unit is never split by the read.
+            bytes_to_read = (MAX_BYTES // bytes_per_char) * bytes_per_char
+        else:
+            bytes_to_read = total_bytes
+
+        error = lldb.SBError()
+        byte_data = value.process.ReadMemory(untagged_ptr, bytes_to_read, error)
+        if not error.Success():
+            return '[%s%s] <read error>' % (encoding, flags)
+
+        if is_16bit:
+            codec = 'utf-16le'
+        elif is_utf8:
+            codec = 'utf-8'
+        else:
+            codec = 'latin1'
+        string_val = byte_data.decode(codec, errors='replace')
+
+        # Cut to the display limit BEFORE escaping: slicing afterwards could
+        # split an escape pair such as '\\n', and would flag short strings
+        # as truncated merely because escaping made them longer.
+        display_truncated = truncated or len(string_val) > MAX_DISPLAY_CHARS
+        string_val = string_val[:MAX_DISPLAY_CHARS]
+
+        # Escape special characters (backslash first so it is not doubled).
+        string_val = string_val.replace('\\', '\\\\')
+        string_val = string_val.replace('"', '\\"')
+        string_val = string_val.replace('\n', '\\n')
+        string_val = string_val.replace('\r', '\\r')
+        string_val = string_val.replace('\t', '\\t')
+
+        if not display_truncated:
+            return '[%s%s] "%s"' % (encoding, flags, string_val)
+
+        # Truncated output also reports the full length and byte size.
+        size_info = ' %d chars' % length
+        if total_bytes >= 1024 * 1024:
+            size_info += ' (%.1fMB)' % (total_bytes / (1024.0 * 1024.0))
+        elif total_bytes >= 1024:
+            size_info += ' (%.1fKB)' % (total_bytes / 1024.0)
+        return '[%s%s%s] "%s..." <truncated>' % (encoding, flags, size_info, string_val)
+    except Exception:
+        # Best effort: a summary provider must not raise into lldb, but
+        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
+        return '<error>'
+
+def bun_String_SummaryProvider(value, _=None):
+    """Summarize a bun.String by dispatching on its `tag` member.
+
+    Tag values map to Dead (0), WTFStringImpl (1), ZigString (2),
+    StaticZigString (3) and Empty (4). WTFStringImpl and (Static)ZigString
+    payloads are read from the `value` member and delegated to
+    WTFStringImpl_SummaryProvider / ZigString_SummaryProvider.
+
+    Args:
+        value: the lldb value to summarize.
+        _: unused second argument passed by lldb; forwarded to delegates.
+
+    Returns:
+        `""` for Empty, `<dead>` for Dead, the delegate's summary for the
+        string variants (with ` static` added to the bracketed header for
+        StaticZigString), or a `<...>` diagnostic when a field is missing,
+        the tag is unknown, or an exception is raised.
+    """
+    try:
+        # Always work on the raw value, not a synthetic wrapper.
+        value = value.GetNonSyntheticValue()
+
+        # Kept so the diagnostic below can show which type lldb matched.
+        type_name = value.GetTypeName()
+
+        tag = value.GetChildMemberWithName('tag')
+        if not tag or not tag.IsValid():
+            # Try alternate field name
+            tag = value.GetChildMemberWithName('Tag')
+            if not tag or not tag.IsValid():
+                return '<no tag field in type: %s>' % type_name
+
+        tag_value = tag.GetValueAsUnsigned()
+
+        # Map tag values to names
+        tag_names = {
+            0: 'Dead',
+            1: 'WTFStringImpl',
+            2: 'ZigString',
+            3: 'StaticZigString',
+            4: 'Empty',
+        }
+        tag_name = tag_names.get(tag_value, 'Unknown')
+
+        if tag_name == 'Empty':
+            return '""'
+        if tag_name == 'Dead':
+            return '<dead>'
+        if tag_name == 'Unknown':
+            return '<unknown tag %d>' % tag_value
+
+        # Every remaining variant keeps its payload in the `value` union,
+        # under a member named after the tag.
+        value_union = value.GetChildMemberWithName('value')
+        if not value_union or not value_union.IsValid():
+            return '<no value field>'
+
+        payload = value_union.GetChildMemberWithName(tag_name)
+        if not payload or not payload.IsValid():
+            return '<no %s field>' % tag_name
+
+        if tag_name == 'WTFStringImpl':
+            return WTFStringImpl_SummaryProvider(payload, _)
+
+        result = ZigString_SummaryProvider(payload, _)
+        if tag_name == 'StaticZigString':
+            # Only touch the first ']' (the one closing the metadata header);
+            # replacing every ']' would corrupt string content containing ']'.
+            result = result.replace(']', ' static]', 1)
+        return result
+    except Exception as e:
+        return '<error: %s>' % str(e)
+
 def __lldb_init_module(debugger, _=None):
    # Initialize Bun Category
    debugger.HandleCommand('type category define --language c99 bun')
@@ -74,5 +309,30 @@ def __lldb_init_module(debugger, _=None):
    # Initialize Bun Data Structures
    add(debugger, category='bun', regex=True, type='^baby_list\\.BabyList\\(.*\\)$', identifier='bun_BabyList', synth=True, expand=True, summary=True)
    
+    # Add WTFStringImpl pretty printer - try multiple possible type names
+    add(debugger, category='bun', type='WTFStringImpl', identifier='WTFStringImpl', summary=True)
+    add(debugger, category='bun', type='*WTFStringImplStruct', identifier='WTFStringImpl', summary=True)
+    add(debugger, category='bun', type='string.WTFStringImpl', identifier='WTFStringImpl', summary=True)
+    add(debugger, category='bun', type='string.WTFStringImplStruct', identifier='WTFStringImpl', summary=True)
+    add(debugger, category='bun', type='*string.WTFStringImplStruct', identifier='WTFStringImpl', summary=True)
+    
+    # Add ZigString pretty printer - try multiple possible type names
+    add(debugger, category='bun', type='ZigString', identifier='ZigString', summary=True)
+    add(debugger, category='bun', type='bun.js.bindings.ZigString', identifier='ZigString', summary=True)
+    add(debugger, category='bun', type='bindings.ZigString', identifier='ZigString', summary=True)
+    
+    # Add bun.String pretty printer - try multiple possible type names
+    add(debugger, category='bun', type='String', identifier='bun_String', summary=True)
+    add(debugger, category='bun', type='bun.String', identifier='bun_String', summary=True)
+    add(debugger, category='bun', type='string.String', identifier='bun_String', summary=True)
+    add(debugger, category='bun', type='BunString', identifier='bun_String', summary=True)
+    add(debugger, category='bun', type='bun::String', identifier='bun_String', summary=True)
+    add(debugger, category='bun', type='bun::string::String', identifier='bun_String', summary=True)
+    
+    # Try regex patterns for more flexible matching
+    add(debugger, category='bun', regex=True, type='.*String$', identifier='bun_String', summary=True)
+    add(debugger, category='bun', regex=True, type='.*WTFStringImpl.*', identifier='WTFStringImpl', summary=True)
+    add(debugger, category='bun', regex=True, type='.*ZigString.*', identifier='ZigString', summary=True)
+    
    # Enable the category
    debugger.HandleCommand('type category enable bun')