some of buffer

2026-02-12 11:59:00 +00:00 · 2022-04-25 07:09:18 -07:00
parent 4b4df5095e
commit 2c6e5c3fc4
43 changed files with 3645 additions and 87 deletions
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -719,10 +719,18 @@ pub fn copyU8IntoU16WithAlignment(comptime alignment: u21, output_: []align(alig
    if (comptime Environment.allow_assert) {
        std.debug.assert(input.len <= output.len);
    }
-    while (input.len >= word) {
-        appendUTF8MachineWordToUTF16MachineWordUnaligned(alignment, output[0..word], input[0..word]);
-        output = output[word..];
-        input = input[word..];
+
+    // un-aligned data access is slow
+    // so we attempt to align the data
+    while (!std.mem.isAligned(@ptrToInt(output.ptr), @alignOf(u16)) and input.len >= word) {
+        output[0] = input[0];
+        output = output[1..];
+        input = input[1..];
+    }
+
+    if (std.mem.isAligned(@ptrToInt(output.ptr), @alignOf(u16)) and input.len > 0) {
+        copyU8IntoU16(@alignCast(@alignOf(u16), output.ptr)[0..output.len], input);
+        return;
    }

    for (input) |c, i| {
@@ -758,28 +766,33 @@ pub fn copyU8IntoU16WithAlignment(comptime alignment: u21, output_: []align(alig
 // }

 pub inline fn copyU16IntoU8(output_: []u8, comptime InputType: type, input_: InputType) void {
-    var output = output_;
-    var input = input_;
    if (comptime Environment.allow_assert) {
-        std.debug.assert(input.len <= output.len);
+        std.debug.assert(input_.len <= output_.len);
    }

-    // on X64, this is 4
-    // on WASM, this is 2
-    const machine_word_length = comptime @sizeOf(usize) / @sizeOf(u16);
+    if (comptime !JSC.is_bindgen) {
+        JSC.WTF.copyLCharsFromUCharSource(output_.ptr, InputType, input_);
+    } else {
+        var output = output_;
+        var input = input_;

-    while (input.len >= machine_word_length) {
-        comptime var machine_word_i: usize = 0;
-        inline while (machine_word_i < machine_word_length) : (machine_word_i += 1) {
-            output[machine_word_i] = @intCast(u8, input[machine_word_i]);
+        // on X64, this is 4
+        // on WASM, this is 2
+        const machine_word_length = comptime @sizeOf(usize) / @sizeOf(u16);
+
+        while (input.len >= machine_word_length) {
+            comptime var machine_word_i: usize = 0;
+            inline while (machine_word_i < machine_word_length) : (machine_word_i += 1) {
+                output[machine_word_i] = @intCast(u8, input[machine_word_i]);
+            }
+
+            output = output[machine_word_length..];
+            input = input[machine_word_length..];
        }

-        output = output[machine_word_length..];
-        input = input[machine_word_length..];
-    }
-
-    for (input) |c, i| {
-        output[i] = @intCast(u8, c);
+        for (input) |c, i| {
+            output[i] = @intCast(u8, c);
+        }
    }
 }

@@ -972,7 +985,7 @@ pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, comptime Type: type,
                if (first < 16) {
                    latin1 = latin1[(comptime count * ascii_vector_size)..];
                    list.items.len += (comptime count * ascii_vector_size);
-                    try list.appendSlice(latin1[0..first]);
+                    list.appendSliceAssumeCapacity(latin1[0..first]);
                    latin1 = latin1[first..];
                    break_outer = true;
                    break :outer;
@@ -1152,6 +1165,29 @@ pub fn copyLatin1IntoUTF8(buf_: []u8, comptime Type: type, latin1_: Type) Encode
    };
 }

+const JSC = @import("javascript_core");
+
+pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult {
+    var buf = buf_;
+    var latin1 = latin1_;
+    while (buf.len > 0 and latin1.len > 0) {
+        var to_write = strings.firstNonASCII(latin1) orelse @truncate(u32, latin1.len);
+        strings.copyU8IntoU16WithAlignment(std.meta.alignment(Buffer), buf, latin1[0..to_write]);
+        latin1 = latin1[to_write..];
+        buf = buf[to_write..];
+        if (latin1.len > 0 and buf.len >= 2) {
+            buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII16(latin1[0]);
+            latin1 = latin1[1..];
+            buf = buf[2..];
+        }
+    }
+
+    return .{
+        .read = @truncate(u32, buf_.len - buf.len),
+        .written = @truncate(u32, latin1_.len - latin1.len),
+    };
+}
+
 test "copyLatin1IntoUTF8" {
    var input: string = "hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!";
    var output = std.mem.zeroes([500]u8);
@@ -1172,13 +1208,21 @@ pub fn latin1ToCodepointAssumeNotASCII(char: u8, comptime CodePointType: type) C
    );
 }

-pub fn latin1ToCodepointBytesAssumeNotASCII(char: u32) [2]u8 {
-    return [2]u8{
-        @truncate(u8, 0xc0 | char >> 6),
-        @truncate(u8, 0x80 | (char & 0x3f)),
+pub fn latin1ToCodepointBytesAssumeNotASCIIWIthCharType(comptime Char: type, char: u32) [2]Char {
+    return [2]Char{
+        @as(Char, @truncate(u8, 0xc0 | char >> 6)),
+        @as(Char, @truncate(u8, 0x80 | (char & 0x3f))),
    };
 }

+pub fn latin1ToCodepointBytesAssumeNotASCII(char: u32) [2]u8 {
+    return latin1ToCodepointBytesAssumeNotASCIIWIthCharType(u8, char);
+}
+
+pub fn latin1ToCodepointBytesAssumeNotASCII16(char: u32) [2]u16 {
+    return latin1ToCodepointBytesAssumeNotASCIIWIthCharType(u16, char);
+}
+
 pub fn copyUTF16IntoUTF8(buf: []u8, comptime Type: type, utf16: Type) EncodeIntoResult {
    var remaining = buf;
    var utf16_remaining = utf16;
@@ -1661,11 +1705,72 @@ pub fn indexOfNotChar(slice: []const u8, char: u8) ?u32 {
    return null;
 }

+const hex_table: [255]u8 = brk: {
+    var values: [255]u8 = [_]u8{0} ** 255;
+    values['0'] = 0;
+    values['1'] = 1;
+    values['2'] = 2;
+    values['3'] = 3;
+    values['4'] = 4;
+    values['5'] = 5;
+    values['6'] = 6;
+    values['7'] = 7;
+    values['8'] = 8;
+    values['9'] = 9;
+    values['A'] = 10;
+    values['B'] = 11;
+    values['C'] = 12;
+    values['D'] = 13;
+    values['E'] = 14;
+    values['F'] = 15;
+    values['a'] = 10;
+    values['b'] = 11;
+    values['c'] = 12;
+    values['d'] = 13;
+    values['e'] = 14;
+    values['f'] = 15;
+
+    break :brk values;
+};
+
+pub fn decodeHexToBytes(destination: []u8, comptime Char: type, source: []const Char) usize {
+    var remain = destination;
+    var input = source;
+
+    while (input.len > 1 and remain.len > 0) {
+        const int = input[0..2].*;
+        const a = hex_table[@truncate(u8, int[0])];
+        const b = hex_table[@truncate(u8, int[1])];
+        if (a == 255 or b == 255) {
+            break;
+        }
+        remain[0] = a << 4 | b;
+        remain = remain[1..];
+        input = input[2..];
+    }
+
+    return destination.len - remain.len;
+}
+
+test "decodeHexToBytes" {
+    var buffer = std.mem.zeroes([1024]u8);
+    for (buffer) |_, i| {
+        buffer[i] = @truncate(u8, i % 256);
+    }
+    var written: [2048]u8 = undefined;
+    var hex = std.fmt.bufPrint(&written, "{}", .{std.fmt.fmtSliceHexLower(&buffer)}) catch unreachable;
+    var good: [4096]u8 = undefined;
+    var ours_buf: [4096]u8 = undefined;
+    var match = try std.fmt.hexToBytes(good[0..1024], hex);
+    var ours = decodeHexToBytes(&ours_buf, hex);
+    try std.testing.expectEqualSlices(u8, match, ours_buf[0..ours]);
+    try std.testing.expectEqualSlices(u8, &buffer, ours_buf[0..ours]);
+}
+
 pub fn trimLeadingChar(slice: []const u8, char: u8) []const u8 {
    if (indexOfNotChar(slice, char)) |i| {
        return slice[i..];
    }
-
    return "";
 }

@@ -2028,9 +2133,10 @@ pub fn containsNonBmpCodePoint(text: string) bool {
 }

 // this is std.mem.trim except it doesn't forcibly change the slice to be const
-pub fn trim(slice: anytype, values_to_strip: []const u8) @TypeOf(slice) {
+pub fn trim(slice: anytype, comptime values_to_strip: []const u8) @TypeOf(slice) {
    var begin: usize = 0;
    var end: usize = slice.len;
+
    while (begin < end and std.mem.indexOfScalar(u8, values_to_strip, slice[begin]) != null) : (begin += 1) {}
    while (end > begin and std.mem.indexOfScalar(u8, values_to_strip, slice[end - 1]) != null) : (end -= 1) {}
    return slice[begin..end];