Rename some instances of latin1 to cp1252 (#22059)

In JS, `new TextDecoder("latin1").decode(...)` uses cp1252. In Python, latin1 is half-width utf-16 (each byte maps directly to the code point with the same value). In our code, latin1 typically refers to half-width utf-16 because JavaScriptCore uses that for most strings, but sometimes it refers to cp1252. Rename the cp1252 functions so that they are called cp1252.

Also fixes an issue where `Buffer.from` with utf-16le would sometimes output the wrong value:

```js
$> bun -p "Buffer.from('\x80', 'utf-16le')"
<Buffer ac 20>
$> node -p "Buffer.from('\x80', 'utf-16le')"
<Buffer 80 00>
$> bun-debug -p "Buffer.from('\x80', 'utf-16le')"
<Buffer 80 00>
```
2026-02-11 19:38:58 +00:00 · 2025-08-28 17:28:38 -07:00
parent edea077947
commit c69ed120e9
5 changed files with 58 additions and 24 deletions
--- a/src/string/immutable/unicode.zig
+++ b/src/string/immutable/unicode.zig
@@ -839,7 +839,7 @@ pub fn elementLengthLatin1IntoUTF8(slice: []const u8) usize {
    return bun.simdutf.length.utf8.from.latin1(slice);
 }

-pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult {
+pub fn copyCP1252IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult {
    var buf = buf_;
    var latin1 = latin1_;
    while (buf.len > 0 and latin1.len > 0) {
@@ -853,7 +853,7 @@ pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: t
        latin1 = latin1[to_write..];
        buf = buf[to_write..];
        if (latin1.len > 0 and buf.len >= 1) {
-            buf[0] = latin1ToCodepointBytesAssumeNotASCII16(latin1[0]);
+            buf[0] = cp1252ToCodepointBytesAssumeNotASCII16(latin1[0]);
            latin1 = latin1[1..];
            buf = buf[1..];
        }
@@ -865,13 +865,15 @@ pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: t
    };
 }

-pub fn elementLengthLatin1IntoUTF16(comptime Type: type, latin1_: Type) usize {
-    // latin1 is always at most 1 UTF-16 code unit long
-    if (comptime std.meta.Child([]const u16) == Type) {
-        return latin1_.len;
-    }
+pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult {
+    const len = @min(buf_.len, latin1_.len);
+    for (buf_[0..len], latin1_[0..len]) |*out, in| out.* = in;
+    return .{ .read = @as(u32, @truncate(len)), .written = @as(u32, @truncate(len)) };
+}

-    return bun.simdutf.length.utf16.from.latin1(latin1_);
+pub fn elementLengthCP1252IntoUTF16(comptime Type: type, cp1252_: Type) usize {
+    // cp1252 is always at most 1 UTF-16 code unit long
+    return cp1252_.len;
 }

 pub fn eqlUtf16(comptime self: string, other: []const u16) bool {
@@ -1629,14 +1631,14 @@ pub fn convertUTF16toUTF8InBuffer(
    return buf[0..result];
 }

-pub fn latin1ToCodepointAssumeNotASCII(char: u8, comptime CodePointType: type) CodePointType {
+pub fn cp1252ToCodepointAssumeNotASCII(char: u8, comptime CodePointType: type) CodePointType {
    return @as(
        CodePointType,
-        @intCast(latin1ToCodepointBytesAssumeNotASCII16(char)),
+        @intCast(cp1252ToCodepointBytesAssumeNotASCII16(char)),
    );
 }

-const latin1_to_utf16_conversion_table = [256]u16{
+const cp1252_to_utf16_conversion_table = [256]u16{
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, // 00-07
    0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, // 10-17
@@ -1677,8 +1679,8 @@ pub fn latin1ToCodepointBytesAssumeNotASCII(char: u32) [2]u8 {
    return bytes[0..2].*;
 }

-pub fn latin1ToCodepointBytesAssumeNotASCII16(char: u32) u16 {
-    return latin1_to_utf16_conversion_table[@as(u8, @truncate(char))];
+pub fn cp1252ToCodepointBytesAssumeNotASCII16(char: u32) u16 {
+    return cp1252_to_utf16_conversion_table[@as(u8, @truncate(char))];
 }

 /// Copy a UTF-16 string as UTF-8 into `buf`