diff --git a/src/ast/parse.zig b/src/ast/parse.zig index 2582d5bd40..e3f7dd170d 100644 --- a/src/ast/parse.zig +++ b/src/ast/parse.zig @@ -587,7 +587,7 @@ pub fn Parse( var estr = try p.lexer.toEString(); if (estr.isUTF8()) { return estr.slice8(); - } else if (strings.toUTF8AllocWithTypeWithoutInvalidSurrogatePairs(p.lexer.allocator, []const u16, estr.slice16())) |alias_utf8| { + } else if (strings.toUTF8AllocWithTypeWithoutInvalidSurrogatePairs(p.lexer.allocator, estr.slice16())) |alias_utf8| { return alias_utf8; } else |err| { const r = p.source.rangeOfString(loc); diff --git a/src/bun.js/ConsoleObject.zig b/src/bun.js/ConsoleObject.zig index 6b27a3adc7..f017857c97 100644 --- a/src/bun.js/ConsoleObject.zig +++ b/src/bun.js/ConsoleObject.zig @@ -1720,7 +1720,7 @@ pub const Formatter = struct { } pub inline fn write16Bit(self: *@This(), input: []const u16) void { - bun.fmt.formatUTF16Type([]const u16, input, self.ctx) catch { + bun.fmt.formatUTF16Type(input, self.ctx) catch { self.failed = true; }; } @@ -2166,7 +2166,7 @@ pub const Formatter = struct { writer.writeAll(slice); } else if (!str.isEmpty()) { // slow path - const buf = strings.allocateLatin1IntoUTF8(bun.default_allocator, []const u8, str.latin1()) catch &[_]u8{}; + const buf = strings.allocateLatin1IntoUTF8(bun.default_allocator, str.latin1()) catch &[_]u8{}; if (buf.len > 0) { defer bun.default_allocator.free(buf); writer.writeAll(buf); diff --git a/src/bun.js/bindings/ZigString.zig b/src/bun.js/bindings/ZigString.zig index f55da9cfdb..da23004d69 100644 --- a/src/bun.js/bindings/ZigString.zig +++ b/src/bun.js/bindings/ZigString.zig @@ -132,7 +132,7 @@ pub const ZigString = extern struct { pub fn isAllASCII(this: ZigString) bool { if (this.is16Bit()) { - return strings.firstNonASCII16([]const u16, this.utf16SliceAligned()) == null; + return strings.firstNonASCII16(this.utf16SliceAligned()) == null; } return strings.isAllASCII(this.slice()); @@ -224,7 +224,7 @@ pub const ZigString = extern struct { } if (this.is16Bit()) { - return strings.elementLengthUTF16IntoUTF8([]const u16, this.utf16SliceAligned()); + return strings.elementLengthUTF16IntoUTF8(this.utf16SliceAligned()); } return bun.webcore.encoding.byteLengthU8(this.slice().ptr, this.slice().len, .utf8); @@ -236,9 +236,9 @@ pub const ZigString = extern struct { var list = std.ArrayList(u8).init(allocator); list = if (this.is16Bit()) - try strings.toUTF8ListWithType(list, []const u16, this.utf16SliceAligned()) + try strings.toUTF8ListWithType(list, this.utf16SliceAligned()) else - try strings.allocateLatin1IntoUTF8WithList(list, 0, []const u8, this.slice()); + try strings.allocateLatin1IntoUTF8WithList(list, 0, this.slice()); if (list.capacity > list.items.len) { list.items.ptr[list.items.len] = 0; @@ -258,9 +258,9 @@ pub const ZigString = extern struct { var list = std.ArrayList(u8).init(allocator); list = if (this.is16Bit()) - try strings.toUTF8ListWithType(list, []const u16, this.utf16SliceAligned()) + try strings.toUTF8ListWithType(list, this.utf16SliceAligned()) else - try strings.allocateLatin1IntoUTF8WithList(list, 0, []const u8, this.slice()); + try strings.allocateLatin1IntoUTF8WithList(list, 0, this.slice()); return list.toOwnedSliceSentinel(0); } @@ -588,7 +588,7 @@ pub const ZigString = extern struct { } if (self.is16Bit()) { - try bun.fmt.formatUTF16Type(@TypeOf(self.utf16Slice()), self.utf16Slice(), writer); + try bun.fmt.formatUTF16Type(self.utf16SliceAligned(), writer); return; } diff --git a/src/bun.js/node/node_process.zig b/src/bun.js/node/node_process.zig index b78d0d04da..d77bce1233 100644 --- a/src/bun.js/node/node_process.zig +++ b/src/bun.js/node/node_process.zig @@ -303,7 +303,7 @@ pub fn Bun__Process__editWindowsEnvVar(k: bun.String, v: bun.String) callconv(.C var buf2 = bun.handleOom(allocator.alloc(u16, v.utf16ByteLength() + 1)); defer allocator.free(buf2); const len1: usize = switch (wtf1.is8Bit()) { - true => bun.strings.copyLatin1IntoUTF16([]u16, buf1, []const u8, wtf1.latin1Slice()).written, + true => bun.strings.copyLatin1IntoUTF16([]u16, buf1, wtf1.latin1Slice()).written, false => b: { @memcpy(buf1[0..wtf1.length()], wtf1.utf16Slice()); break :b wtf1.length(); @@ -314,7 +314,7 @@ pub fn Bun__Process__editWindowsEnvVar(k: bun.String, v: bun.String) callconv(.C if (v.tag == .Empty) break :str (&[_]u16{0})[0..0 :0]; const wtf2 = v.value.WTFStringImpl; const len2: usize = switch (wtf2.is8Bit()) { - true => bun.strings.copyLatin1IntoUTF16([]u16, buf2, []const u8, wtf2.latin1Slice()).written, + true => bun.strings.copyLatin1IntoUTF16([]u16, buf2, wtf2.latin1Slice()).written, false => b: { @memcpy(buf2[0..wtf2.length()], wtf2.utf16Slice()); break :b wtf2.length(); diff --git a/src/bun.js/test/pretty_format.zig b/src/bun.js/test/pretty_format.zig index 05bfe8f3d1..78fa89f33e 100644 --- a/src/bun.js/test/pretty_format.zig +++ b/src/bun.js/test/pretty_format.zig @@ -628,7 +628,7 @@ pub const JestPrettyFormat = struct { } pub inline fn write16Bit(self: *@This(), input: []const u16) void { - bun.fmt.formatUTF16Type([]const u16, input, self.ctx) catch { + bun.fmt.formatUTF16Type(input, self.ctx) catch { self.failed = true; }; } @@ -999,7 +999,7 @@ pub const JestPrettyFormat = struct { writer.writeAll(str.slice()); } else if (str.len > 0) { // slow path - const buf = strings.allocateLatin1IntoUTF8(bun.default_allocator, []const u8, str.slice()) catch &[_]u8{}; + const buf = strings.allocateLatin1IntoUTF8(bun.default_allocator, str.slice()) catch &[_]u8{}; if (buf.len > 0) { defer bun.default_allocator.free(buf); writer.writeAll(buf); diff --git a/src/bun.js/webcore/Sink.zig b/src/bun.js/webcore/Sink.zig index 6ad9e07c9a..1093826834 100644 --- a/src/bun.js/webcore/Sink.zig +++ b/src/bun.js/webcore/Sink.zig @@ -79,7 +79,7 @@ pub const UTF8Fallback = struct { if (stack_size >= str.len * 2) { var buf: [stack_size]u8 = undefined; - const copied = bun.strings.copyUTF16IntoUTF8Impl(&buf, []const u16, str, true); + const copied = bun.strings.copyUTF16IntoUTF8Impl(&buf, str, true); bun.assert(copied.written <= stack_size); bun.assert(copied.read <= stack_size); if (input.isDone()) { diff --git a/src/bun.js/webcore/TextDecoder.zig b/src/bun.js/webcore/TextDecoder.zig index 715b3d246a..0953a5a1c5 100644 --- a/src/bun.js/webcore/TextDecoder.zig +++ b/src/bun.js/webcore/TextDecoder.zig @@ -202,10 +202,10 @@ fn decodeSlice(this: *TextDecoder, globalThis: *jsc.JSGlobalObject, buffer_slice // However, this is also what WebKit seems to do. // // => The reason we need to encode it is because TextDecoder "latin1" is actually CP1252, while WebKit latin1 is 8-bit utf-16 - const out_length = strings.elementLengthCP1252IntoUTF16([]const u8, buffer_slice); + const out_length = strings.elementLengthCP1252IntoUTF16(buffer_slice); const bytes = try bun.default_allocator.alloc(u16, out_length); - const out = strings.copyCP1252IntoUTF16([]u16, bytes, []const u8, buffer_slice); + const out = strings.copyCP1252IntoUTF16(bytes, buffer_slice); return ZigString.toExternalU16(bytes.ptr, out.written, globalThis); }, EncodingLabel.@"UTF-8" => { diff --git a/src/bun.js/webcore/TextEncoder.zig b/src/bun.js/webcore/TextEncoder.zig index 1cff9e5561..351c671381 100644 --- a/src/bun.js/webcore/TextEncoder.zig +++ b/src/bun.js/webcore/TextEncoder.zig @@ -17,7 +17,7 @@ pub export fn TextEncoder__encode8( const slice = ptr[0..len]; if (slice.len <= buf.len / 2) { - const result = strings.copyLatin1IntoUTF8(&buf, []const u8, slice); + const result = strings.copyLatin1IntoUTF8(&buf, slice); const uint8array = jsc.JSValue.createUninitializedUint8Array(globalThis, result.written) catch return .zero; bun.assert(result.written <= buf.len); bun.assert(result.read == slice.len); @@ -26,7 +26,7 @@ pub export fn TextEncoder__encode8( @memcpy(array_buffer.byteSlice()[0..result.written], buf[0..result.written]); return uint8array; } else { - const bytes = strings.allocateLatin1IntoUTF8(globalThis.bunVM().allocator, []const u8, slice) catch { + const bytes = strings.allocateLatin1IntoUTF8(globalThis.bunVM().allocator, slice) catch { return globalThis.throwOutOfMemoryValue(); }; bun.assert(bytes.len >= slice.len); @@ -53,7 +53,7 @@ pub export fn TextEncoder__encode16( // max utf16 -> utf8 length if (slice.len <= buf.len / 4) { - const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice); + const result = strings.copyUTF16IntoUTF8(&buf, slice); if (result.read == 0 or result.written == 0) { const uint8array = jsc.JSValue.createUninitializedUint8Array(globalThis, 3) catch return .zero; const array_buffer = uint8array.asArrayBuffer(globalThis).?; @@ -71,7 +71,6 @@ pub export fn TextEncoder__encode16( } else { const bytes = strings.toUTF8AllocWithType( bun.default_allocator, - @TypeOf(slice), slice, ) catch { return globalThis.toInvalidArguments("Out of memory", .{}); @@ -99,7 +98,7 @@ pub export fn c( // max utf16 -> utf8 length if (slice.len <= buf.len / 4) { - const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice); + const result = strings.copyUTF16IntoUTF8(&buf, slice); if (result.read == 0 or result.written == 0) { const uint8array = jsc.JSValue.createUninitializedUint8Array(globalThis, 3) catch return .zero; const array_buffer = uint8array.asArrayBuffer(globalThis).?; @@ -117,7 +116,6 @@ pub export fn c( } else { const bytes = strings.toUTF8AllocWithType( bun.default_allocator, - @TypeOf(slice), slice, ) catch { return globalThis.throwOutOfMemoryValue(); @@ -136,7 +134,7 @@ const RopeStringEncoder = struct { pub fn append8(it: *jsc.JSString.Iterator, ptr: [*]const u8, len: u32) callconv(.C) void { var this = bun.cast(*RopeStringEncoder, it.data.?); - const result = strings.copyLatin1IntoUTF8StopOnNonASCII(this.buf[this.tail..], []const u8, ptr[0..len], true); + const result = strings.copyLatin1IntoUTF8StopOnNonASCII(this.buf[this.tail..], ptr[0..len], true); if (result.read == std.math.maxInt(u32) and result.written == std.math.maxInt(u32)) { it.stop = 1; this.any_non_ascii = true; @@ -151,7 +149,7 @@ const RopeStringEncoder = struct { } pub fn write8(it: *jsc.JSString.Iterator, ptr: [*]const u8, len: u32, offset: u32) callconv(.C) void { var this = bun.cast(*RopeStringEncoder, it.data.?); - const result = strings.copyLatin1IntoUTF8StopOnNonASCII(this.buf[offset..], []const u8, ptr[0..len], true); + const result = strings.copyLatin1IntoUTF8StopOnNonASCII(this.buf[offset..], ptr[0..len], true); if (result.read == std.math.maxInt(u32) and result.written == std.math.maxInt(u32)) { it.stop = 1; this.any_non_ascii = true; @@ -222,7 +220,7 @@ pub export fn TextEncoder__encodeInto16( ) u64 { const output = buf_ptr[0..buf_len]; const input = input_ptr[0..input_len]; - var result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, []const u16, input); + var result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, input); if (output.len >= 3 and (result.read == 0 or result.written == 0)) { const replacement_char = [_]u8{ 239, 191, 189 }; @memcpy(buf_ptr[0..replacement_char.len], &replacement_char); @@ -242,7 +240,7 @@ pub export fn TextEncoder__encodeInto8( const output = buf_ptr[0..buf_len]; const input = input_ptr[0..input_len]; const result: strings.EncodeIntoResult = - strings.copyLatin1IntoUTF8(output, []const u8, input); + strings.copyLatin1IntoUTF8(output, input); const sized: [2]u32 = .{ result.read, result.written }; return @bitCast(sized); } diff --git a/src/bun.js/webcore/TextEncoderStreamEncoder.zig b/src/bun.js/webcore/TextEncoderStreamEncoder.zig index e15f1971b5..3820bbac25 100644 --- a/src/bun.js/webcore/TextEncoderStreamEncoder.zig +++ b/src/bun.js/webcore/TextEncoderStreamEncoder.zig @@ -74,7 +74,7 @@ fn encodeLatin1(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, var remain = input; while (remain.len > 0) { - const result = strings.copyLatin1IntoUTF8(buffer.unusedCapacitySlice(), []const u8, remain); + const result = strings.copyLatin1IntoUTF8(buffer.unusedCapacitySlice(), remain); buffer.items.len += result.written; remain = remain[result.read..]; @@ -123,7 +123,7 @@ fn encodeUTF16(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, i this.pending_lead_surrogate = null; const maybe_trail = remain[0]; if (strings.u16IsTrail(maybe_trail)) { - const converted = strings.utf16CodepointWithFFFD([]const u16, &.{ lead, maybe_trail }); + const converted = strings.utf16CodepointWithFFFD(&.{ lead, maybe_trail }); // shouldn't fail because `u16IsTrail` is true and `pending_lead_surrogate` is always // a valid lead. bun.debugAssert(!converted.fail); @@ -164,7 +164,7 @@ fn encodeUTF16(this: *TextEncoderStreamEncoder, globalObject: *JSGlobalObject, i switch (result.status) { else => { // Slow path: there was invalid UTF-16, so we need to convert it without simdutf. - const lead_surrogate = strings.toUTF8ListWithTypeBun(&buf, []const u16, remain, true) catch { + const lead_surrogate = strings.toUTF8ListWithTypeBun(&buf, remain, true) catch { buf.deinit(); return globalObject.throwOutOfMemoryValue(); }; diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig index 65f7874196..b78cdcfc38 100644 --- a/src/bun.js/webcore/encoding.zig +++ b/src/bun.js/webcore/encoding.zig @@ -36,7 +36,7 @@ export fn Bun__encoding__byteLengthLatin1AsUTF8(input: [*]const u8, len: usize) // TODO(@190n) handle unpaired surrogates export fn Bun__encoding__byteLengthUTF16AsUTF8(input: [*]const u16, len: usize) usize { - return strings.elementLengthUTF16IntoUTF8([]const u16, input[0..len]); + return strings.elementLengthUTF16IntoUTF8(input[0..len]); } export fn Bun__encoding__constructFromLatin1(globalObject: *JSGlobalObject, input: [*]const u8, len: usize, encoding: u8) JSValue { @@ -285,7 +285,7 @@ pub fn writeU8(input: [*]const u8, len: usize, to_ptr: [*]u8, to_len: usize, com }, .utf8 => { // need to encode - return strings.copyLatin1IntoUTF8(to_ptr[0..to_len], []const u8, input[0..len]).written; + return strings.copyLatin1IntoUTF8(to_ptr[0..to_len], input[0..len]).written; }, // encode latin1 into UTF16 .ucs2, .utf16le => { @@ -296,13 +296,13 @@ pub fn writeU8(input: [*]const u8, len: usize, to_ptr: [*]u8, to_len: usize, com const buf = input[0..len]; const output = @as([*]u16, @ptrCast(@alignCast(to_ptr)))[0 .. to_len / 2]; - const written = strings.copyLatin1IntoUTF16([]u16, output, []const u8, buf).written; + const written = strings.copyLatin1IntoUTF16([]u16, output, buf).written; return written * 2; } else { const buf = input[0..len]; const output = @as([*]align(1) u16, @ptrCast(to_ptr))[0 .. to_len / 2]; - const written = strings.copyLatin1IntoUTF16([]align(1) u16, output, []const u8, buf).written; + const written = strings.copyLatin1IntoUTF16([]align(1) u16, output, buf).written; return written * 2; } }, @@ -331,7 +331,7 @@ pub fn byteLengthU8(input: [*]const u8, len: usize, comptime encoding: Encoding) }, .ucs2, .utf16le => { - return strings.elementLengthUTF8IntoUTF16([]const u8, input[0..len]) * 2; + return strings.elementLengthUTF8IntoUTF16(input[0..len]) * 2; }, .hex => { @@ -361,7 +361,6 @@ pub fn writeU16(input: [*]const u16, len: usize, to: [*]u8, to_len: usize, compt .utf8 => { return strings.copyUTF16IntoUTF8Impl( to[0..to_len], - []const u16, input[0..len], allow_partial_write, ).written; @@ -433,13 +432,13 @@ pub fn constructFromU8(input: [*]const u8, len: usize, allocator: std.mem.Alloca }, .utf8 => { // need to encode - return strings.allocateLatin1IntoUTF8(allocator, []const u8, input[0..len]) catch return &[_]u8{}; + return strings.allocateLatin1IntoUTF8(allocator, input[0..len]) catch return &[_]u8{}; }, // encode latin1 into UTF16 // return as bytes .ucs2, .utf16le => { var to = allocator.alloc(u16, len) catch return &[_]u8{}; - _ = strings.copyLatin1IntoUTF16([]u16, to, []const u8, input[0..len]); + _ = strings.copyLatin1IntoUTF16([]u16, to, input[0..len]); return std.mem.sliceAsBytes(to[0..len]); }, @@ -469,7 +468,7 @@ pub fn constructFromU16(input: [*]const u16, len: usize, allocator: std.mem.Allo switch (comptime encoding) { .utf8 => { - return strings.toUTF8AllocWithType(allocator, []const u16, input[0..len]) catch return &[_]u8{}; + return strings.toUTF8AllocWithType(allocator, input[0..len]) catch return &[_]u8{}; }, .latin1, .buffer, .ascii => { var to = allocator.alloc(u8, len) catch return &[_]u8{}; diff --git a/src/bun.zig b/src/bun.zig index 7e24954e94..032e843178 100644 --- a/src/bun.zig +++ b/src/bun.zig @@ -1315,7 +1315,7 @@ pub fn getFdPath(fd: FileDescriptor, buf: *bun.PathBuffer) ![]u8 { if (comptime Environment.isWindows) { var wide_buf: WPathBuffer = undefined; const wide_slice = try windows.GetFinalPathNameByHandle(fd.native(), .{}, wide_buf[0..]); - const res = strings.copyUTF16IntoUTF8(buf[0..], @TypeOf(wide_slice), wide_slice); + const res = strings.copyUTF16IntoUTF8(buf[0..], wide_slice); return buf[0..res.written]; } diff --git a/src/cli/pack_command.zig b/src/cli/pack_command.zig index 5c2b7e3d0e..0a57afb944 100644 --- a/src/cli/pack_command.zig +++ b/src/cli/pack_command.zig @@ -2591,7 +2591,7 @@ pub const bindings = struct { const pathname_string = if (bun.Environment.isWindows) blk: { const pathname_w = archive_entry.pathnameW(); const list = std.ArrayList(u8).init(bun.default_allocator); - var result = bun.handleOom(bun.strings.toUTF8ListWithType(list, []const u16, pathname_w)); + var result = bun.handleOom(bun.strings.toUTF8ListWithType(list, pathname_w)); defer result.deinit(); break :blk String.cloneUTF8(result.items); } else String.cloneUTF8(archive_entry.pathname()); diff --git a/src/collections/baby_list.zig b/src/collections/baby_list.zig index 57feab74a6..0ecf3b9df2 100644 --- a/src/collections/baby_list.zig +++ b/src/collections/baby_list.zig @@ -400,63 +400,39 @@ pub fn BabyList(comptime Type: type) type { @compileError("Unsupported for type " ++ @typeName(Type)); const initial = this.len; const old = this.listManaged(allocator); - const new = try strings.allocateLatin1IntoUTF8WithList(old, old.items.len, []const u8, str); + const new = try strings.allocateLatin1IntoUTF8WithList(old, old.items.len, str); this.update(new); return this.len - initial; } - /// This method is available only for `BabyList(u8)`. + /// This method is available only for `BabyList(u8)`. Invalid characters are replaced with + /// replacement character pub fn writeUTF16(this: *Self, allocator: std.mem.Allocator, str: []const u16) OOM!u32 { if ((comptime safety_checks) and str.len > 0) this.assertOwned(); if (comptime Type != u8) @compileError("Unsupported for type " ++ @typeName(Type)); - var list_ = this.listManaged(allocator); - const initial = this.len; - outer: { - defer this.update(list_); - const trimmed = bun.simdutf.trim.utf16(str); - if (trimmed.len == 0) - break :outer; - const available_len = (list_.capacity - list_.items.len); + const initial_len = this.len; - // maximum UTF-16 length is 3 times the UTF-8 length + 2 - // only do the pass over the input length if we may not have enough space - const out_len = if (available_len <= (trimmed.len * 3 + 2)) - bun.simdutf.length.utf8.from.utf16.le(trimmed) + var list_ = this.listManaged(allocator); + { + defer this.update(list_); + + // Maximum UTF-16 length is 3 times the UTF-8 length + 2 + const length_estimate = if (list_.unusedCapacitySlice().len <= (str.len * 3 + 2)) + // This length is an estimate. `str` isn't validated and might contain invalid + // sequences. If it does simdutf will assume they require 2 characters instead + // of 3. + bun.simdutf.length.utf8.from.utf16.le(str) else str.len; - if (out_len == 0) - break :outer; + try list_.ensureUnusedCapacity(length_estimate); - // intentionally over-allocate a little - try list_.ensureTotalCapacity(list_.items.len + out_len); - - var remain = str; - while (remain.len > 0) { - const orig_len = list_.items.len; - - const slice_ = list_.items.ptr[orig_len..list_.capacity]; - const result = strings.copyUTF16IntoUTF8WithBufferImpl( - slice_, - []const u16, - remain, - trimmed, - out_len, - // FIXME: Unclear whether or not we should allow - // incomplete UTF-8 sequences. If you are solving a bug - // with invalid UTF-8 sequences, this may be the - // culprit... - true, - ); - remain = remain[result.read..]; - list_.items.len += @as(usize, result.written); - if (result.read == 0 or result.written == 0) break; - } + try strings.convertUTF16ToUTF8Append(&list_, str); } - return this.len - initial; + return this.len - initial_len; } /// This method is available only for `BabyList(u8)`. diff --git a/src/fmt.zig b/src/fmt.zig index a2938ab668..a3faac9a6d 100644 --- a/src/fmt.zig +++ b/src/fmt.zig @@ -254,7 +254,7 @@ fn getSharedBuffer() []u8 { } threadlocal var shared_temp_buffer_ptr: ?*SharedTempBuffer = null; -pub fn formatUTF16Type(comptime Slice: type, slice_: Slice, writer: anytype) !void { +pub fn formatUTF16Type(slice_: []const u16, writer: anytype) !void { var chunk = getSharedBuffer(); // Defensively ensure recursion doesn't cause the buffer to be overwritten in-place @@ -272,7 +272,7 @@ pub fn formatUTF16Type(comptime Slice: type, slice_: Slice, writer: anytype) !vo var slice = slice_; while (slice.len > 0) { - const result = strings.copyUTF16IntoUTF8(chunk, Slice, slice); + const result = strings.copyUTF16IntoUTF8(chunk, slice); if (result.read == 0 or result.written == 0) break; try writer.writeAll(chunk[0..result.written]); @@ -280,7 +280,7 @@ pub fn formatUTF16Type(comptime Slice: type, slice_: Slice, writer: anytype) !vo } } -pub fn formatUTF16TypeWithPathOptions(comptime Slice: type, slice_: Slice, writer: anytype, opts: PathFormatOptions) !void { +pub fn formatUTF16TypeWithPathOptions(slice_: []const u16, writer: anytype, opts: PathFormatOptions) !void { var chunk = getSharedBuffer(); // Defensively ensure recursion doesn't cause the buffer to be overwritten in-place @@ -298,7 +298,7 @@ pub fn formatUTF16TypeWithPathOptions(comptime Slice: type, slice_: Slice, write var slice = slice_; while (slice.len > 0) { - const result = strings.copyUTF16IntoUTF8(chunk, Slice, slice); + const result = strings.copyUTF16IntoUTF8(chunk, slice); if (result.read == 0 or result.written == 0) break; @@ -356,9 +356,9 @@ pub const FormatUTF16 = struct { path_fmt_opts: ?PathFormatOptions = null, pub fn format(self: @This(), comptime _: []const u8, _: anytype, writer: anytype) !void { if (self.path_fmt_opts) |opts| { - try formatUTF16TypeWithPathOptions([]const u16, self.buf, writer, opts); + try formatUTF16TypeWithPathOptions(self.buf, writer, opts); } else { - try formatUTF16Type([]const u16, self.buf, writer); + try formatUTF16Type(self.buf, writer); } } }; @@ -465,7 +465,7 @@ pub fn formatLatin1(slice_: []const u8, writer: anytype) !void { try writer.writeAll(slice[0..i]); slice = slice[i..]; } - const result = strings.copyLatin1IntoUTF8(chunk, @TypeOf(slice), slice[0..@min(chunk.len, slice.len)]); + const result = strings.copyLatin1IntoUTF8(chunk, slice[0..@min(chunk.len, slice.len)]); if (result.read == 0 or result.written == 0) break; try writer.writeAll(chunk[0..result.written]); diff --git a/src/http/websocket_client.zig b/src/http/websocket_client.zig index 8db05a0b6c..b340b8fc56 100644 --- a/src/http/websocket_client.zig +++ b/src/http/websocket_client.zig @@ -743,9 +743,9 @@ pub fn NewWebSocketClient(comptime ssl: bool) type { const content_to_compress: []const u8 = switch (bytes) { .utf16 => |utf16| brk: { // Convert UTF16 to UTF8 for compression - const content_byte_len: usize = strings.elementLengthUTF16IntoUTF8([]const u16, utf16); + const content_byte_len: usize = strings.elementLengthUTF16IntoUTF8(utf16); temp_buffer = allocator.alloc(u8, content_byte_len) catch return false; - const encode_result = strings.copyUTF16IntoUTF8(temp_buffer.?, []const u16, utf16); + const encode_result = strings.copyUTF16IntoUTF8(temp_buffer.?, utf16); break :brk temp_buffer.?[0..encode_result.written]; }, .latin1 => |latin1| brk: { @@ -757,7 +757,7 @@ pub fn NewWebSocketClient(comptime ssl: bool) type { } temp_buffer = allocator.alloc(u8, content_byte_len) catch return false; - const encode_result = strings.copyLatin1IntoUTF8(temp_buffer.?, []const u8, latin1); + const encode_result = strings.copyLatin1IntoUTF8(temp_buffer.?, latin1); break :brk temp_buffer.?[0..encode_result.written]; }, .bytes => |b| b, @@ -1430,7 +1430,7 @@ const Copy = union(enum) { pub fn len(this: @This(), byte_len: *usize) usize { switch (this) { .utf16 => { - byte_len.* = strings.elementLengthUTF16IntoUTF8([]const u16, this.utf16); + byte_len.* = strings.elementLengthUTF16IntoUTF8(this.utf16); return WebsocketHeader.frameSizeIncludingMask(byte_len.*); }, .latin1 => { @@ -1486,7 +1486,7 @@ const Copy = union(enum) { switch (this) { .utf16 => |utf16| { header.len = WebsocketHeader.packLength(content_byte_len); - const encode_into_result = strings.copyUTF16IntoUTF8Impl(to_mask, []const u16, utf16, true); + const encode_into_result = strings.copyUTF16IntoUTF8Impl(to_mask, utf16, true); bun.assert(@as(usize, encode_into_result.written) == content_byte_len); bun.assert(@as(usize, encode_into_result.read) == utf16.len); header.len = WebsocketHeader.packLength(encode_into_result.written); @@ -1496,7 +1496,7 @@ const Copy = union(enum) { Mask.fill(globalThis, buf[mask_offset..][0..4], to_mask[0..content_byte_len], to_mask[0..content_byte_len]); }, .latin1 => |latin1| { - const encode_into_result = strings.copyLatin1IntoUTF8(to_mask, []const u8, latin1); + const encode_into_result = strings.copyLatin1IntoUTF8(to_mask, latin1); bun.assert(@as(usize, encode_into_result.written) == content_byte_len); // latin1 can contain non-ascii diff --git a/src/install/PackageInstall.zig b/src/install/PackageInstall.zig index 3446d35094..c7ec749484 100644 --- a/src/install/PackageInstall.zig +++ b/src/install/PackageInstall.zig @@ -1279,7 +1279,7 @@ pub const PackageInstall = struct { _ = node_fs_for_package_installer.mkdirRecursiveOSPathImpl(void, {}, fullpath, 0, false); } - const res = strings.copyUTF16IntoUTF8(dest_buf[0..], []const u16, wbuf[0..i]); + const res = strings.copyUTF16IntoUTF8(dest_buf[0..], wbuf[0..i]); var offset: usize = res.written; if (dest_buf[offset - 1] != std.fs.path.sep_windows) { dest_buf[offset] = std.fs.path.sep_windows; diff --git a/src/js_lexer.zig b/src/js_lexer.zig index 80f9ad0afa..386d785532 100644 --- a/src/js_lexer.zig +++ b/src/js_lexer.zig @@ -2085,7 +2085,7 @@ fn NewLexer_( defer lexer.temp_buffer_u16.clearRetainingCapacity(); try lexer.temp_buffer_u16.ensureUnusedCapacity(lexer.string_literal_raw_content.len); try lexer.decodeEscapeSequences(lexer.string_literal_start, lexer.string_literal_raw_content, std.ArrayList(u16), &lexer.temp_buffer_u16); - const first_non_ascii = strings.firstNonASCII16([]const u16, lexer.temp_buffer_u16.items); + const first_non_ascii = strings.firstNonASCII16(lexer.temp_buffer_u16.items); // prefer to store an ascii e.string rather than a utf-16 one. ascii takes less memory, and `+` folding is not yet supported on utf-16. if (first_non_ascii != null) { return js_ast.E.String.init(try lexer.allocator.dupe(u16, lexer.temp_buffer_u16.items)); @@ -2172,7 +2172,7 @@ fn NewLexer_( } pub fn utf16ToString(noalias lexer: *const LexerType, js: JavascriptString) !string { - return try strings.toUTF8AllocWithType(lexer.allocator, []const u16, js); + return try strings.toUTF8AllocWithType(lexer.allocator, js); } pub fn nextInsideJSXElement(noalias lexer: *LexerType) !void { lexer.assertNotJSON(); diff --git a/src/libarchive/libarchive.zig b/src/libarchive/libarchive.zig index 6d7d3ae172..c535b5b525 100644 --- a/src/libarchive/libarchive.zig +++ b/src/libarchive/libarchive.zig @@ -345,7 +345,7 @@ pub const Archiver = struct { if (appender.needs_first_dirname) { if (comptime Environment.isWindows) { const list = std.ArrayList(u8).init(default_allocator); - var result = try strings.toUTF8ListWithType(list, []const u16, pathname[0..pathname.len]); + var result = try strings.toUTF8ListWithType(list, pathname[0..pathname.len]); // onFirstDirectoryName copies the contents of pathname to another buffer, safe to free defer result.deinit(); appender.onFirstDirectoryName(strings.withoutTrailingSlash(result.items)); diff --git a/src/shell/shell.zig b/src/shell/shell.zig index 7759256915..5c3883f05d 100644 --- a/src/shell/shell.zig +++ b/src/shell/shell.zig @@ -3059,7 +3059,7 @@ pub fn NewLexer(comptime encoding: StringEncoding) type { if (non_ascii_idx > 0) { try self.strpool.appendSlice(bytes[0..non_ascii_idx]); } - self.strpool = try bun.strings.allocateLatin1IntoUTF8WithList(self.strpool, self.strpool.items.len, []const u8, bytes[non_ascii_idx..]); + self.strpool = try bun.strings.allocateLatin1IntoUTF8WithList(self.strpool, self.strpool.items.len, bytes[non_ascii_idx..]); } } const end = self.strpool.items.len; @@ -3929,7 +3929,7 @@ pub const ShellSrcBuilder = struct { try this.appendUTF8Impl(latin1[0..non_ascii_idx]); } - this.outbuf.* = try bun.strings.allocateLatin1IntoUTF8WithList(this.outbuf.*, this.outbuf.items.len, []const u8, latin1); + this.outbuf.* = try bun.strings.allocateLatin1IntoUTF8WithList(this.outbuf.*, this.outbuf.items.len, latin1); } pub fn appendJSStrRef(this: *ShellSrcBuilder, bunstr: bun.String) bun.OOM!void { @@ -3993,7 +3993,7 @@ pub fn escape8Bit(str: []const u8, outbuf: *std.ArrayList(u8), comptime add_quot pub fn escapeUtf16(str: []const u16, outbuf: *std.ArrayList(u8), comptime add_quotes: bool) !struct { is_invalid: bool = false } { if (add_quotes) try outbuf.append('"'); - const non_ascii = bun.strings.firstNonASCII16([]const u16, str) orelse 0; + const non_ascii = bun.strings.firstNonASCII16(str) orelse 0; var cp_buf: [4]u8 = undefined; var i: usize = 0; @@ -4003,7 +4003,7 @@ pub fn escapeUtf16(str: []const u16, outbuf: *std.ArrayList(u8), comptime add_qu defer i += 1; break :brk str[i]; } - const ret = bun.strings.utf16Codepoint([]const u16, str[i..]); + const ret = bun.strings.utf16Codepoint(str[i..]); if (ret.fail) return .{ .is_invalid = true }; i += ret.len; break :brk ret.code_point; diff --git a/src/string.zig b/src/string.zig index 17d70e05e5..cb1ed9d85d 100644 --- a/src/string.zig +++ b/src/string.zig @@ -183,7 +183,7 @@ pub const String = extern struct { pub fn cloneUTF16(bytes: []const u16) String { if (bytes.len == 0) return String.empty; - if (bun.strings.firstNonASCII16([]const u16, bytes) == null) { + if (bun.strings.firstNonASCII16(bytes) == null) { return validateRefCount(bun.cpp.BunString__fromUTF16ToLatin1(bytes.ptr, bytes.len)); } return validateRefCount(bun.cpp.BunString__fromUTF16(bytes.ptr, bytes.len)); diff --git a/src/string/StringBuilder.zig b/src/string/StringBuilder.zig index ae89ee4f74..da901af971 100644 --- a/src/string/StringBuilder.zig +++ b/src/string/StringBuilder.zig @@ -40,7 +40,7 @@ pub fn count16(this: *StringBuilder, slice: []const u16) void { } pub fn count16Z(this: *StringBuilder, slice: [:0]const u16) void { - const result = bun.strings.elementLengthUTF16IntoUTF8([:0]const u16, slice); + const result = bun.strings.elementLengthUTF16IntoUTF8(slice); this.cap += result + 1; } @@ -59,7 +59,7 @@ pub fn append16(this: *StringBuilder, slice: []const u16, fallback_allocator: st return buf[0..result.count :0]; } else { var list = std.ArrayList(u8).init(fallback_allocator); - var out = bun.strings.toUTF8ListWithTypeBun(&list, []const u16, slice, false) catch return null; + var out = bun.strings.toUTF8ListWithTypeBun(&list, slice, false) catch return null; out.append(0) catch return null; return out.items[0 .. out.items.len - 1 :0]; } diff --git a/src/string/immutable.zig b/src/string/immutable.zig index 2f577f69b5..07d99d292b 100644 --- a/src/string/immutable.zig +++ b/src/string/immutable.zig @@ -1682,7 +1682,7 @@ pub fn getLinesInText(text: []const u8, line: u32, comptime line_range_count: us return results; } -pub fn firstNonASCII16(comptime Slice: type, slice: Slice) ?u32 { +pub fn firstNonASCII16(slice: []const u16) ?u32 { var remaining = slice; const remaining_start = remaining.ptr; @@ -2244,10 +2244,8 @@ pub const copyLatin1IntoUTF8 = unicode.copyLatin1IntoUTF8; pub const copyLatin1IntoUTF8StopOnNonASCII = unicode.copyLatin1IntoUTF8StopOnNonASCII; pub const copyU16IntoU8 = unicode.copyU16IntoU8; pub const copyU8IntoU16 = unicode.copyU8IntoU16; -pub const copyU8IntoU16WithAlignment = unicode.copyU8IntoU16WithAlignment; pub const copyUTF16IntoUTF8 = unicode.copyUTF16IntoUTF8; pub const copyUTF16IntoUTF8Impl = unicode.copyUTF16IntoUTF8Impl; -pub const copyUTF16IntoUTF8WithBuffer = unicode.copyUTF16IntoUTF8WithBuffer; pub const copyUTF16IntoUTF8WithBufferImpl = unicode.copyUTF16IntoUTF8WithBufferImpl; pub const decodeCheck = unicode.decodeCheck; pub const decodeWTF8RuneT = unicode.decodeWTF8RuneT; @@ -2264,7 +2262,6 @@ pub const isAllASCII = unicode.isAllASCII; pub const isValidUTF8 = unicode.isValidUTF8; pub const isValidUTF8WithoutSIMD = unicode.isValidUTF8WithoutSIMD; pub const cp1252ToCodepointAssumeNotASCII = unicode.cp1252ToCodepointAssumeNotASCII; -pub const cp1252ToCodepointBytesAssumeNotASCII = unicode.cp1252ToCodepointBytesAssumeNotASCII; pub const cp1252ToCodepointBytesAssumeNotASCII16 = unicode.cp1252ToCodepointBytesAssumeNotASCII16; pub const literal = unicode.literal; pub const nonASCIISequenceLength = unicode.nonASCIISequenceLength; @@ -2312,7 +2309,6 @@ pub const escapeHTMLForUTF16Input = escapeHTML_.escapeHTMLForUTF16Input; pub const addNTPathPrefix = paths_.addNTPathPrefix; pub const addNTPathPrefixIfNeeded = paths_.addNTPathPrefixIfNeeded; pub const addLongPathPrefix = paths_.addLongPathPrefix; -pub const addLongPathPrefixIfNeeded = paths_.addLongPathPrefixIfNeeded; pub const charIsAnySlash = paths_.charIsAnySlash; pub const cloneNormalizingSeparators = paths_.cloneNormalizingSeparators; pub const fromWPath = paths_.fromWPath; diff --git a/src/string/immutable/escapeHTML.zig b/src/string/immutable/escapeHTML.zig index 7ef3e45935..7e44565a8a 100644 --- a/src/string/immutable/escapeHTML.zig +++ b/src/string/immutable/escapeHTML.zig @@ -433,7 +433,7 @@ pub fn escapeHTMLForUTF16Input(allocator: std.mem.Allocator, utf16: []const u16) break :lazy; }, 128...std.math.maxInt(u16) => { - const cp = utf16Codepoint([]const u16, remaining[i..]); + const cp = utf16Codepoint(remaining[i..]); i += @as(u16, cp.len); }, else => { @@ -470,7 +470,7 @@ pub fn escapeHTMLForUTF16Input(allocator: std.mem.Allocator, utf16: []const u16) i += 1; }, 128...std.math.maxInt(u16) => { - const cp = utf16Codepoint([]const u16, remaining[i..]); + const cp = utf16Codepoint(remaining[i..]); buf.appendSlice(remaining[i..][0..@as(usize, cp.len)]) catch unreachable; i += @as(u16, cp.len); @@ -528,7 +528,7 @@ pub fn escapeHTMLForUTF16Input(allocator: std.mem.Allocator, utf16: []const u16) i += 1; }, 128...std.math.maxInt(u16) => { - const cp = utf16Codepoint([]const u16, remaining[i..]); + const cp = utf16Codepoint(remaining[i..]); buf.appendSlice(remaining[i..][0..@as(usize, cp.len)]) catch unreachable; i += @as(u16, cp.len); @@ -569,7 +569,7 @@ pub fn escapeHTMLForUTF16Input(allocator: std.mem.Allocator, utf16: []const u16) break :scan_and_allocate_lazily; }, 128...std.math.maxInt(u16) => { - const cp = utf16Codepoint([]const u16, ptr[0..if (ptr + 1 == end) 1 else 2]); + const cp = utf16Codepoint(ptr[0..if (ptr + 1 == end) 1 else 2]); ptr += @as(u16, cp.len); }, @@ -603,7 +603,7 @@ pub fn escapeHTMLForUTF16Input(allocator: std.mem.Allocator, utf16: []const u16) ptr += 1; }, 128...std.math.maxInt(u16) => { - const cp = utf16Codepoint([]const u16, ptr[0..if (ptr + 1 == end) 1 else 2]); + const cp = utf16Codepoint(ptr[0..if (ptr + 1 == end) 1 else 2]); buf.appendSlice(ptr[0..@as(usize, cp.len)]) catch unreachable; ptr += @as(u16, cp.len); diff --git a/src/string/immutable/paths.zig b/src/string/immutable/paths.zig index 079d6f5cf4..8cd11483b7 100644 --- a/src/string/immutable/paths.zig +++ b/src/string/immutable/paths.zig @@ -39,7 +39,7 @@ pub fn isWindowsAbsolutePathMissingDriveLetter(comptime T: type, chars: []const pub fn fromWPath(buf: []u8, utf16: []const u16) [:0]const u8 { bun.unsafeAssert(buf.len > 0); const to_copy = trimPrefixComptime(u16, utf16, bun.windows.long_path_prefix); - const encode_into_result = copyUTF16IntoUTF8(buf[0 .. buf.len - 1], []const u16, to_copy); + const encode_into_result = copyUTF16IntoUTF8(buf[0 .. buf.len - 1], to_copy); bun.unsafeAssert(encode_into_result.written < buf.len); buf[encode_into_result.written] = 0; return buf[0..encode_into_result.written :0]; diff --git a/src/string/immutable/unicode.zig b/src/string/immutable/unicode.zig index 06a92236f7..15af6e87d2 100644 --- a/src/string/immutable/unicode.zig +++ b/src/string/immutable/unicode.zig @@ -258,26 +258,26 @@ pub fn codepointSize(comptime R: type, r: R) u3_fast { }; } -pub fn convertUTF16ToUTF8(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) OOM!std.ArrayList(u8) { +pub fn convertUTF16ToUTF8(list_: std.ArrayList(u8), utf16: []const u16) OOM!std.ArrayList(u8) { var list = list_; const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le( utf16, - list.items.ptr[0..list.capacity], + list.allocatedSlice(), ); if (result.status == .surrogate) { // Slow path: there was invalid UTF-16, so we need to convert it without simdutf. - return toUTF8ListWithTypeBun(&list, Type, utf16, false); + return toUTF8ListWithTypeBun(&list, utf16, false); } list.items.len = result.count; return list; } -pub fn convertUTF16ToUTF8WithoutInvalidSurrogatePairs(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) !std.ArrayList(u8) { +pub fn convertUTF16ToUTF8WithoutInvalidSurrogatePairs(list_: std.ArrayList(u8), utf16: []const u16) error{SurrogatePair}!std.ArrayList(u8) { var list = list_; const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le( utf16, - list.items.ptr[0..list.capacity], + list.allocatedSlice(), ); if (result.status == .surrogate) { return error.SurrogatePair; @@ -287,55 +287,55 @@ pub fn convertUTF16ToUTF8WithoutInvalidSurrogatePairs(list_: std.ArrayList(u8), return list; } -pub fn convertUTF16ToUTF8Append(list: *std.ArrayList(u8), utf16: []const u16) !void { +pub fn convertUTF16ToUTF8Append(list: *std.ArrayList(u8), utf16: []const u16) OOM!void { const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le( utf16, - list.items.ptr[list.items.len..list.capacity], + list.unusedCapacitySlice(), ); if (result.status == .surrogate) { // Slow path: there was invalid UTF-16, so we need to convert it without simdutf. - _ = try toUTF8ListWithTypeBun(list, []const u16, utf16, false); + _ = try toUTF8ListWithTypeBun(list, utf16, false); return; } list.items.len += result.count; } -pub fn toUTF8AllocWithTypeWithoutInvalidSurrogatePairs(allocator: std.mem.Allocator, comptime Type: type, utf16: Type) ![]u8 { - if (bun.FeatureFlags.use_simdutf and comptime Type == []const u16) { +pub fn toUTF8AllocWithTypeWithoutInvalidSurrogatePairs(allocator: std.mem.Allocator, utf16: []const u16) OOM![]u8 { + if (bun.FeatureFlags.use_simdutf) { const length = bun.simdutf.length.utf8.from.utf16.le(utf16); // add 16 bytes of padding for SIMDUTF var list = try std.ArrayList(u8).initCapacity(allocator, length + 16); - list = try convertUTF16ToUTF8(list, Type, utf16); + list = try convertUTF16ToUTF8(list, utf16); return list.items; } var list = try std.ArrayList(u8).initCapacity(allocator, utf16.len); - list = try toUTF8ListWithType(list, Type, utf16); + list = try toUTF8ListWithType(list, utf16); return list.items; } -pub fn toUTF8AllocWithType(allocator: std.mem.Allocator, comptime Type: type, utf16: Type) OOM![]u8 { - if (bun.FeatureFlags.use_simdutf and comptime Type == []const u16) { +pub fn toUTF8AllocWithType(allocator: std.mem.Allocator, utf16: []const u16) OOM![]u8 { + if (bun.FeatureFlags.use_simdutf) { const length = bun.simdutf.length.utf8.from.utf16.le(utf16); // add 16 bytes of padding for SIMDUTF var list = try std.ArrayList(u8).initCapacity(allocator, length + 16); - list = try convertUTF16ToUTF8(list, Type, utf16); + list = try convertUTF16ToUTF8(list, utf16); return list.items; } var list = try std.ArrayList(u8).initCapacity(allocator, utf16.len); - list = try toUTF8ListWithType(list, Type, utf16); + list = try toUTF8ListWithType(list, utf16); return list.items; } -pub fn toUTF8ListWithType(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) OOM!std.ArrayList(u8) { - if (bun.FeatureFlags.use_simdutf and comptime Type == []const u16) { +pub fn toUTF8ListWithType(list_: std.ArrayList(u8), utf16: []const u16) OOM!std.ArrayList(u8) { + if (bun.FeatureFlags.use_simdutf) { var list = list_; const length = bun.simdutf.length.utf8.from.utf16.le(utf16); try list.ensureTotalCapacityPrecise(length + 16); - const buf = try convertUTF16ToUTF8(list, Type, utf16); + const buf = try convertUTF16ToUTF8(list, utf16); // Commenting out because `convertUTF16ToUTF8` may convert to WTF-8 // which uses 3 bytes for invalid surrogates, causing the length to not @@ -364,7 +364,7 @@ pub fn toUTF8FromLatin1(allocator: std.mem.Allocator, latin1: []const u8) !?std. return null; const list = try std.ArrayList(u8).initCapacity(allocator, latin1.len); - return try allocateLatin1IntoUTF8WithList(list, 0, []const u8, latin1); + return try allocateLatin1IntoUTF8WithList(list, 0, latin1); } pub fn toUTF8FromLatin1Z(allocator: std.mem.Allocator, latin1: []const u8) !?std.ArrayList(u8) { @@ -372,20 +372,20 @@ pub fn toUTF8FromLatin1Z(allocator: std.mem.Allocator, latin1: []const u8) !?std return null; const list = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 1); - var list1 = try allocateLatin1IntoUTF8WithList(list, 0, []const u8, latin1); + var list1 = try allocateLatin1IntoUTF8WithList(list, 0, latin1); try list1.append(0); return list1; } -pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), comptime Type: type, utf16: Type, comptime skip_trailing_replacement: bool) OOM!(if (skip_trailing_replacement) ?u16 else std.ArrayList(u8)) { +pub fn toUTF8ListWithTypeBun(list: *std.ArrayList(u8), utf16: []const u16, comptime skip_trailing_replacement: bool) OOM!(if (skip_trailing_replacement) ?u16 else std.ArrayList(u8)) { var utf16_remaining = utf16; - while (firstNonASCII16(Type, utf16_remaining)) |i| { + while (firstNonASCII16(utf16_remaining)) |i| { const to_copy = utf16_remaining[0..i]; utf16_remaining = utf16_remaining[i..]; const token = utf16_remaining[0]; - const replacement = utf16CodepointWithFFFDAndFirstInputChar(Type, token, utf16_remaining); + const replacement = utf16CodepointWithFFFDAndFirstInputChar(token, utf16_remaining); utf16_remaining = utf16_remaining[replacement.len..]; const count: usize = replacement.utf8Width(); @@ -433,7 +433,7 @@ pub const EncodeIntoResult = struct { /// The number of u8s we wrote to the utf-8 buffer written: u32 = 0, }; -pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, comptime Type: type, latin1_: Type) ![]u8 { +pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, latin1_: []const u8) ![]u8 { if (comptime bun.FeatureFlags.latin1_is_now_ascii) { var out = try allocator.alloc(u8, latin1_.len); @memcpy(out[0..latin1_.len], latin1_); @@ -441,11 +441,11 @@ pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, comptime Type: type, } const list = try std.ArrayList(u8).initCapacity(allocator, latin1_.len); - var foo = try allocateLatin1IntoUTF8WithList(list, 0, Type, latin1_); + var foo = try allocateLatin1IntoUTF8WithList(list, 0, latin1_); return try foo.toOwnedSlice(); } -pub fn allocateLatin1IntoUTF8WithList(list_: std.ArrayList(u8), offset_into_list: usize, comptime Type: type, latin1_: Type) OOM!std.ArrayList(u8) { +pub fn allocateLatin1IntoUTF8WithList(list_: std.ArrayList(u8), offset_into_list: usize, latin1_: []const u8) OOM!std.ArrayList(u8) { var latin1 = latin1_; var i: usize = offset_into_list; var list = list_; @@ -688,11 +688,11 @@ pub fn convertUTF8BytesIntoUTF16(bytes: []const u8) UTF16Replacement { return convertUTF8BytesIntoUTF16WithLength(&sequence, sequence_length, bytes.len); } -pub fn copyLatin1IntoUTF8(buf_: []u8, comptime Type: type, latin1_: Type) EncodeIntoResult { - return copyLatin1IntoUTF8StopOnNonASCII(buf_, Type, latin1_, false); +pub fn copyLatin1IntoUTF8(buf_: []u8, latin1_: []const u8) EncodeIntoResult { + return copyLatin1IntoUTF8StopOnNonASCII(buf_, latin1_, false); } -pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_: Type, comptime stop: bool) EncodeIntoResult { +pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, latin1_: []const u8, comptime stop: bool) EncodeIntoResult { if (comptime bun.FeatureFlags.latin1_is_now_ascii) { const to_copy = @as(u32, @truncate(@min(buf_.len, latin1_.len))); @memcpy(buf_[0..to_copy], latin1_[0..to_copy]); @@ -839,16 +839,12 @@ pub fn elementLengthLatin1IntoUTF8(slice: []const u8) usize { return bun.simdutf.length.utf8.from.latin1(slice); } -pub fn copyCP1252IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult { +pub fn copyCP1252IntoUTF16(buf_: []u16, latin1_: []const u8) EncodeIntoResult { var buf = buf_; var latin1 = latin1_; while (buf.len > 0 and latin1.len > 0) { const to_write = strings.firstNonASCII(latin1) orelse @as(u32, @truncate(@min(latin1.len, buf.len))); - if (comptime std.meta.alignment(Buffer) != @alignOf(u16)) { - strings.copyU8IntoU16WithAlignment(std.meta.alignment(Buffer), buf, latin1[0..to_write]); - } else { - strings.copyU8IntoU16(buf, latin1[0..to_write]); - } + strings.copyU8IntoU16(buf, latin1[0..to_write]); latin1 = latin1[to_write..]; buf = buf[to_write..]; @@ -865,13 +861,13 @@ pub fn copyCP1252IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: t }; } -pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, comptime Type: type, latin1_: Type) EncodeIntoResult { +pub fn copyLatin1IntoUTF16(comptime Buffer: type, buf_: Buffer, latin1_: []const u8) EncodeIntoResult { const len = @min(buf_.len, latin1_.len); for (buf_[0..len], latin1_[0..len]) |*out, in| out.* = in; return .{ .read = @as(u32, @truncate(len)), .written = @as(u32, @truncate(len)) }; } -pub fn elementLengthCP1252IntoUTF16(comptime Type: type, cp1252_: Type) usize { +pub fn elementLengthCP1252IntoUTF16(cp1252_: []const u8) usize { // cp1252 is always at most 1 UTF-16 code unit long return cp1252_.len; } @@ -885,7 +881,7 @@ pub fn eqlUtf16(comptime self: string, other: []const u16) bool { } pub fn toUTF8Alloc(allocator: std.mem.Allocator, js: []const u16) OOM![]u8 { - return try toUTF8AllocWithType(allocator, []const u16, js); + return try toUTF8AllocWithType(allocator, js); } pub fn toUTF8AllocZ(allocator: std.mem.Allocator, js: []const u16) OOM![:0]u8 { @@ -924,55 +920,6 @@ pub fn copyU8IntoU16(output_: []u16, input_: []const u8) callconv(bun.callconv_i } } -pub fn copyU8IntoU16WithAlignment(comptime alignment: u21, output_: []align(alignment) u16, input_: []const u8) void { - var output = output_; - var input = input_; - const word = @sizeOf(usize) / 2; - if (comptime Environment.allow_assert) assert(input.len <= output.len); - - // un-aligned data access is slow - // so we attempt to align the data - while (!std.mem.isAligned(@intFromPtr(output.ptr), @alignOf(u16)) and input.len >= word) { - output[0] = input[0]; - output = output[1..]; - input = input[1..]; - } - - if (std.mem.isAligned(@intFromPtr(output.ptr), @alignOf(u16)) and input.len > 0) { - copyU8IntoU16(@as([*]u16, @alignCast(output.ptr))[0..output.len], input); - return; - } - - for (input, 0..) |c, i| { - output[i] = c; - } -} - -// pub fn copy(output_: []u8, input_: []const u8) callconv(bun.callconv_inline) void { -// var output = output_; -// var input = input_; -// if (comptime Environment.allow_assert) assert(input.len <= output.len); - -// if (input.len > @sizeOf(usize) * 4) { -// comptime var i: usize = 0; -// inline while (i < 4) : (i += 1) { -// appendUTF8MachineWord(output[i * @sizeOf(usize) ..][0..@sizeOf(usize)], input[i * @sizeOf(usize) ..][0..@sizeOf(usize)]); -// } -// output = output[4 * @sizeOf(usize) ..]; -// input = input[4 * @sizeOf(usize) ..]; -// } - -// while (input.len >= @sizeOf(usize)) { -// appendUTF8MachineWord(output[0..@sizeOf(usize)], input[0..@sizeOf(usize)]); -// output = output[@sizeOf(usize)..]; -// input = input[@sizeOf(usize)..]; -// } - -// for (input) |c, i| { -// output[i] = c; -// } -// } - pub inline fn copyU16IntoU8(output: []u8, input: []align(1) const u16) void { if (comptime Environment.allow_assert) assert(input.len <= output.len); const count = @min(input.len, output.len); @@ -1372,11 +1319,11 @@ pub fn toUTF16AllocMaybeBuffered( return .{ output.items, .{0} ** 3, 0 }; } -pub fn utf16CodepointWithFFFD(comptime Type: type, input: Type) UTF16Replacement { - return utf16CodepointWithFFFDAndFirstInputChar(Type, input[0], input); +pub fn utf16CodepointWithFFFD(input: []const u16) UTF16Replacement { + return utf16CodepointWithFFFDAndFirstInputChar(input[0], input); } -fn utf16CodepointWithFFFDAndFirstInputChar(comptime Type: type, char: std.meta.Elem(Type), input: Type) UTF16Replacement { +fn utf16CodepointWithFFFDAndFirstInputChar(char: u16, input: []const u16) UTF16Replacement { const c0 = @as(u21, char); if (c0 & ~@as(u21, 0x03ff) == 0xd800) { @@ -1412,7 +1359,7 @@ fn utf16CodepointWithFFFDAndFirstInputChar(comptime Type: type, char: std.meta.E } } -pub fn utf16Codepoint(comptime Type: type, input: Type) UTF16Replacement { +pub fn utf16Codepoint(input: []const u16) UTF16Replacement { const c0 = @as(u21, input[0]); if (c0 & ~@as(u21, 0x03ff) == 0xd800) { @@ -1686,34 +1633,28 @@ pub fn cp1252ToCodepointBytesAssumeNotASCII16(char: u32) u16 { /// Copy a UTF-16 string as UTF-8 into `buf` /// /// This may not encode everything if `buf` is not big enough. -pub fn copyUTF16IntoUTF8(buf: []u8, comptime Type: type, utf16: Type) EncodeIntoResult { - return copyUTF16IntoUTF8Impl(buf, Type, utf16, false); +pub fn copyUTF16IntoUTF8(buf: []u8, utf16: []const u16) EncodeIntoResult { + return copyUTF16IntoUTF8Impl(buf, utf16, false); } /// See comment on `copyUTF16IntoUTF8WithBufferImpl` on what `allow_truncated_utf8_sequence` should do -pub fn copyUTF16IntoUTF8Impl(buf: []u8, comptime Type: type, utf16: Type, comptime allow_truncated_utf8_sequence: bool) EncodeIntoResult { - if (comptime Type == []const u16) { - if (bun.FeatureFlags.use_simdutf) { - if (utf16.len == 0) - return .{ .read = 0, .written = 0 }; - const trimmed = bun.simdutf.trim.utf16(utf16); - if (trimmed.len == 0) - return .{ .read = 0, .written = 0 }; +pub fn copyUTF16IntoUTF8Impl(buf: []u8, utf16: []const u16, comptime allow_truncated_utf8_sequence: bool) EncodeIntoResult { + if (bun.FeatureFlags.use_simdutf) { + if (utf16.len == 0) + return .{ .read = 0, .written = 0 }; + const trimmed = bun.simdutf.trim.utf16(utf16); + if (trimmed.len == 0) + return .{ .read = 0, .written = 0 }; - const out_len = if (buf.len <= (trimmed.len * 3 + 2)) - bun.simdutf.length.utf8.from.utf16.le(trimmed) - else - buf.len; + const out_len = if (buf.len <= (trimmed.len * 3 + 2)) + bun.simdutf.length.utf8.from.utf16.le(trimmed) + else + buf.len; - return copyUTF16IntoUTF8WithBufferImpl(buf, Type, utf16, trimmed, out_len, allow_truncated_utf8_sequence); - } + return copyUTF16IntoUTF8WithBufferImpl(buf, utf16, out_len, allow_truncated_utf8_sequence); } - return copyUTF16IntoUTF8WithBufferImpl(buf, Type, utf16, utf16, utf16.len, allow_truncated_utf8_sequence); -} - -pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type, trimmed: Type, out_len: usize) EncodeIntoResult { - return copyUTF16IntoUTF8WithBufferImpl(buf, Type, utf16, trimmed, out_len, false); + return copyUTF16IntoUTF8WithBufferImpl(buf, utf16, utf16.len, allow_truncated_utf8_sequence); } /// Q: What does the `allow_truncated_utf8_sequence` parameter do? @@ -1731,29 +1672,27 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type, /// buffer.fill("\u0222"); /// expect(buffer[0]).toBe(0xc8); /// ``` -pub fn copyUTF16IntoUTF8WithBufferImpl(buf: []u8, comptime Type: type, utf16: Type, trimmed: Type, out_len: usize, comptime allow_truncated_utf8_sequence: bool) EncodeIntoResult { +pub fn copyUTF16IntoUTF8WithBufferImpl(buf: []u8, utf16: []const u16, out_len: usize, comptime allow_truncated_utf8_sequence: bool) EncodeIntoResult { var remaining = buf; var utf16_remaining = utf16; var ended_on_non_ascii = false; brk: { - if (comptime Type == []const u16) { - if (bun.FeatureFlags.use_simdutf) { - log("UTF16 {d} -> UTF8 {d}", .{ utf16.len, out_len }); - if (remaining.len >= out_len) { - const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(trimmed, remaining); - if (result.status == .surrogate) break :brk; + if (bun.FeatureFlags.use_simdutf) { + log("UTF16 {d} -> UTF8 {d}", .{ utf16.len, out_len }); + if (remaining.len >= out_len) { + const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(utf16, remaining); + if (result.status == .surrogate) break :brk; - return EncodeIntoResult{ - .read = @as(u32, @truncate(trimmed.len)), - .written = @as(u32, @truncate(result.count)), - }; - } + return EncodeIntoResult{ + .read = @as(u32, @truncate(utf16.len)), + .written = @as(u32, @truncate(result.count)), + }; } } } - while (firstNonASCII16(Type, utf16_remaining)) |i| { + while (firstNonASCII16(utf16_remaining)) |i| { const end = @min(i, remaining.len); if (end > 0) copyU16IntoU8(remaining, utf16_remaining[0..end]); remaining = remaining[end..]; @@ -1762,7 +1701,7 @@ pub fn copyUTF16IntoUTF8WithBufferImpl(buf: []u8, comptime Type: type, utf16: Ty if (@min(utf16_remaining.len, remaining.len) == 0) break; - const replacement = utf16CodepointWithFFFD(Type, utf16_remaining); + const replacement = utf16CodepointWithFFFD(utf16_remaining); const width: usize = replacement.utf8Width(); bun.assert(width > 1); @@ -1836,7 +1775,7 @@ pub fn copyUTF16IntoUTF8WithBufferImpl(buf: []u8, comptime Type: type, utf16: Ty }; } -pub fn elementLengthUTF16IntoUTF8(comptime Type: type, utf16: Type) usize { +pub fn elementLengthUTF16IntoUTF8(utf16: []const u16) usize { if (bun.FeatureFlags.use_simdutf) { return bun.simdutf.length.utf8.from.utf16.le(utf16); } @@ -1844,12 +1783,12 @@ pub fn elementLengthUTF16IntoUTF8(comptime Type: type, utf16: Type) usize { var utf16_remaining = utf16; var count: usize = 0; - while (firstNonASCII16(Type, utf16_remaining)) |i| { + while (firstNonASCII16(utf16_remaining)) |i| { count += i; utf16_remaining = utf16_remaining[i..]; - const replacement = utf16Codepoint(Type, utf16_remaining); + const replacement = utf16Codepoint(utf16_remaining); count += replacement.utf8Width(); utf16_remaining = utf16_remaining[replacement.len..]; @@ -1858,7 +1797,7 @@ pub fn elementLengthUTF16IntoUTF8(comptime Type: type, utf16: Type) usize { return count + utf16_remaining.len; } -pub fn elementLengthUTF8IntoUTF16(comptime Type: type, utf8: Type) usize { +pub fn elementLengthUTF8IntoUTF16(utf8: []const u8) usize { var utf8_remaining = utf8; var count: usize = 0; @@ -1871,7 +1810,7 @@ pub fn elementLengthUTF8IntoUTF16(comptime Type: type, utf8: Type) usize { utf8_remaining = utf8_remaining[i..]; - const replacement = utf16Codepoint(Type, utf8_remaining); + const replacement = utf16Codepoint(utf8_remaining); count += replacement.len; utf8_remaining = utf8_remaining[@min(replacement.utf8Width(), utf8_remaining.len)..]; diff --git a/src/string/immutable/visible.zig b/src/string/immutable/visible.zig index 582475b640..a2fea60879 100644 --- a/src/string/immutable/visible.zig +++ b/src/string/immutable/visible.zig @@ -714,7 +714,7 @@ pub const visible = struct { while (true) { { - const idx = firstNonASCII16([]const u16, input) orelse input.len; + const idx = firstNonASCII16(input) orelse input.len; for (0..idx) |j| { const cp = input[j]; defer prev = cp; @@ -760,7 +760,7 @@ pub const visible = struct { input = input[idx..]; } if (input.len == 0) break; - const replacement = utf16CodepointWithFFFD([]const u16, input); + const replacement = utf16CodepointWithFFFD(input); defer input = input[replacement.len..]; if (replacement.fail) continue; const cp: u21 = @intCast(replacement.code_point); diff --git a/src/string/wtf.zig b/src/string/wtf.zig index c0afcbed5d..8587ce0308 100644 --- a/src/string/wtf.zig +++ b/src/string/wtf.zig @@ -200,7 +200,7 @@ pub const WTFStringImplStruct = extern struct { return if (input.len > 0) jsc.WebCore.encoding.byteLengthU8(input.ptr, input.len, .utf8) else 0; } else { const input = this.utf16Slice(); - return if (input.len > 0) bun.strings.elementLengthUTF16IntoUTF8([]const u16, input) else 0; + return if (input.len > 0) bun.strings.elementLengthUTF16IntoUTF8(input) else 0; } } diff --git a/src/watcher/WindowsWatcher.zig b/src/watcher/WindowsWatcher.zig index 3ee59eff56..a2b8459c38 100644 --- a/src/watcher/WindowsWatcher.zig +++ b/src/watcher/WindowsWatcher.zig @@ -217,7 +217,7 @@ pub fn watchLoopCycle(this: *bun.Watcher) bun.sys.Maybe(void) { const item_paths = this.watchlist.items(.file_path); log("number of watched items: {d}", .{item_paths.len}); while (iter.next()) |event| { - const convert_res = bun.strings.copyUTF16IntoUTF8(buf[base_idx..], []const u16, event.filename); + const convert_res = bun.strings.copyUTF16IntoUTF8(buf[base_idx..], event.filename); const eventpath = buf[0 .. base_idx + convert_res.written]; log("watcher update event: (filename: {s}, action: {s}", .{ eventpath, @tagName(event.action) }); diff --git a/test/js/node/process/process-stdio-invalid-utf16.test.ts b/test/js/node/process/process-stdio-invalid-utf16.test.ts new file mode 100644 index 0000000000..128bb75f39 --- /dev/null +++ b/test/js/node/process/process-stdio-invalid-utf16.test.ts @@ -0,0 +1,405 @@ +import { describe, expect, test } from "bun:test"; +import { bunEnv, bunExe, tempDir } from "harness"; + +describe.concurrent.each(["stdout", "stderr"])("process.%s.write with invalid UTF-16", stream => { + test("single unpaired high surrogate (D800)", async () => { + using dir = tempDir("stdio-utf16", { + "test.js": ` + process.${stream}.write(String.fromCharCode(0xD800)); + process.${stream}.write("\\n"); + ${stream === "stdout" ? "" : 'console.log("Done");'} + `, + }); + + await using proc = Bun.spawn({ + cmd: [bunExe(), "test.js"], + env: bunEnv, + cwd: String(dir), + stdout: "pipe", + stderr: "pipe", + }); + + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + + expect(exitCode).toBe(0); + if (stream === "stdout") { + expect(stdout).toBe("�\n"); + } else { + expect(stdout).toBe("Done\n"); + expect(stderr).toBe("�\n"); + } + }); + + test("single unpaired low surrogate (DC00)", async () => { + using dir = tempDir("stdio-utf16", { + "test.js": ` + process.${stream}.write(String.fromCharCode(0xDC00)); + process.${stream}.write("\\n"); + ${stream === "stdout" ? "" : 'console.log("Done");'} + `, + }); + + await using proc = Bun.spawn({ + cmd: [bunExe(), "test.js"], + env: bunEnv, + cwd: String(dir), + stdout: "pipe", + stderr: "pipe", + }); + + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + + expect(exitCode).toBe(0); + if (stream === "stdout") { + expect(stdout).toBe("�\n"); + } else { + expect(stdout).toBe("Done\n"); + expect(stderr).toBe("�\n"); + } + }); + + test("trailing unpaired high surrogate should not duplicate output", async () => { + // This was the main bug: strings ending with high surrogates (D800-DBFF) + // would duplicate the output ~32 times + using dir = tempDir("stdio-utf16", { + "test.js": ` + process.${stream}.write("Help" + String.fromCharCode(0xD800)); + process.${stream}.write("\\n"); + process.${stream}.write("Test" + String.fromCharCode(0xDBFF)); + process.${stream}.write("\\n"); + ${stream === "stdout" ? "" : 'console.log("Done");'} + `, + }); + + await using proc = Bun.spawn({ + cmd: [bunExe(), "test.js"], + env: bunEnv, + cwd: String(dir), + stdout: "pipe", + stderr: "pipe", + }); + + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + + expect(exitCode).toBe(0); + + const output = stream === "stdout" ? stdout : stderr; + expect(output).toBe("Help�\nTest�\n"); + + // Also verify no duplication + expect((output.match(/Help/g) || []).length).toBe(1); + expect((output.match(/Test/g) || []).length).toBe(1); + + if (stream === "stderr") { + expect(stdout).toBe("Done\n"); + } + }); + + test("trailing unpaired low surrogate", async () => { + using dir = tempDir("stdio-utf16", { + "test.js": ` + process.${stream}.write("Hello" + String.fromCharCode(0xDC00)); + process.${stream}.write("\\n"); + process.${stream}.write("World" + String.fromCharCode(0xDFFF)); + process.${stream}.write("\\n"); + ${stream === "stdout" ? "" : 'console.log("Done");'} + `, + }); + + await using proc = Bun.spawn({ + cmd: [bunExe(), "test.js"], + env: bunEnv, + cwd: String(dir), + stdout: "pipe", + stderr: "pipe", + }); + + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + + expect(exitCode).toBe(0); + if (stream === "stdout") { + expect(stdout).toBe("Hello�\nWorld�\n"); + } else { + expect(stdout).toBe("Done\n"); + expect(stderr).toBe("Hello�\nWorld�\n"); + } + }); + + test("leading unpaired surrogates", async () => { + using dir = tempDir("stdio-utf16", { + "test.js": ` + process.${stream}.write(String.fromCharCode(0xD800) + "Hello"); + process.${stream}.write("\\n"); + process.${stream}.write(String.fromCharCode(0xDC00) + "World"); + process.${stream}.write("\\n"); + ${stream === "stdout" ? "" : 'console.log("Done");'} + `, + }); + + await using proc = Bun.spawn({ + cmd: [bunExe(), "test.js"], + env: bunEnv, + cwd: String(dir), + stdout: "pipe", + stderr: "pipe", + }); + + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + + expect(exitCode).toBe(0); + if (stream === "stdout") { + expect(stdout).toBe("�Hello\n�World\n"); + } else { + expect(stdout).toBe("Done\n"); + expect(stderr).toBe("�Hello\n�World\n"); + } + }); + + test("unpaired surrogates at both ends", async () => { + using dir = tempDir("stdio-utf16", { + "test.js": ` + process.${stream}.write(String.fromCharCode(0xD800) + "Middle" + String.fromCharCode(0xDC00)); + process.${stream}.write("\\n"); + process.${stream}.write(String.fromCharCode(0xDC00) + "Text" + String.fromCharCode(0xD800)); + process.${stream}.write("\\n"); + ${stream === "stdout" ? "" : 'console.log("Done");'} + `, + }); + + await using proc = Bun.spawn({ + cmd: [bunExe(), "test.js"], + env: bunEnv, + cwd: String(dir), + stdout: "pipe", + stderr: "pipe", + }); + + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + + expect(exitCode).toBe(0); + if (stream === "stdout") { + expect(stdout).toBe("�Middle�\n�Text�\n"); + } else { + expect(stdout).toBe("Done\n"); + expect(stderr).toBe("�Middle�\n�Text�\n"); + } + }); + + test("multiple unpaired high surrogates", async () => { + using dir = tempDir("stdio-utf16", { + "test.js": ` + // Multiple high surrogates only + process.${stream}.write(String.fromCharCode(0xD800, 0xD801, 0xD802)); + process.${stream}.write("\\n"); + // Text with multiple trailing high surrogates + process.${stream}.write("Test" + String.fromCharCode(0xD800, 0xD801, 0xD802)); + process.${stream}.write("\\n"); + ${stream === "stdout" ? "" : 'console.log("Done");'} + `, + }); + + await using proc = Bun.spawn({ + cmd: [bunExe(), "test.js"], + env: bunEnv, + cwd: String(dir), + stdout: "pipe", + stderr: "pipe", + }); + + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + + expect(exitCode).toBe(0); + if (stream === "stdout") { + expect(stdout).toBe("���\nTest���\n"); + } else { + expect(stdout).toBe("Done\n"); + expect(stderr).toBe("���\nTest���\n"); + } + }); + + test("multiple unpaired low surrogates", async () => { + using dir = tempDir("stdio-utf16", { + "test.js": ` + // Multiple low surrogates only + process.${stream}.write(String.fromCharCode(0xDC00, 0xDC01, 0xDC02)); + process.${stream}.write("\\n"); + // Text with multiple trailing low surrogates + process.${stream}.write("Test" + String.fromCharCode(0xDC00, 0xDC01, 0xDC02)); + process.${stream}.write("\\n"); + ${stream === "stdout" ? "" : 'console.log("Done");'} + `, + }); + + await using proc = Bun.spawn({ + cmd: [bunExe(), "test.js"], + env: bunEnv, + cwd: String(dir), + stdout: "pipe", + stderr: "pipe", + }); + + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + + expect(exitCode).toBe(0); + if (stream === "stdout") { + expect(stdout).toBe("���\nTest���\n"); + } else { + expect(stdout).toBe("Done\n"); + expect(stderr).toBe("���\nTest���\n"); + } + }); + + test("valid surrogate pairs are preserved", async () => { + using dir = tempDir("stdio-utf16", { + "test.js": ` + // Valid surrogate pair (𝄞 - musical symbol) + process.${stream}.write(String.fromCharCode(0xD834, 0xDD1E)); + process.${stream}.write("\\n"); + // Valid pair with unpaired surrogates + process.${stream}.write( + String.fromCharCode(0xD800) + + String.fromCharCode(0xD834, 0xDD1E) + + String.fromCharCode(0xDC00) + ); + process.${stream}.write("\\n"); + ${stream === "stdout" ? "" : 'console.log("Done");'} + `, + }); + + await using proc = Bun.spawn({ + cmd: [bunExe(), "test.js"], + env: bunEnv, + cwd: String(dir), + stdout: "pipe", + stderr: "pipe", + }); + + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + + expect(exitCode).toBe(0); + if (stream === "stdout") { + expect(stdout).toBe("𝄞\n�𝄞�\n"); + } else { + expect(stdout).toBe("Done\n"); + expect(stderr).toBe("𝄞\n�𝄞�\n"); + } + }); + + test("surrogate pair combinations", async () => { + using dir = tempDir("stdio-utf16", { + "test.js": ` + // D800,D801,DC00: D800 is unpaired, D801+DC00 forms valid pair + process.${stream}.write(String.fromCharCode(0xD800, 0xD801, 0xDC00)); + process.${stream}.write("\\n"); + // DC00,D800,DC01,D801: DC00 unpaired, D800+DC01 valid, D801 unpaired + process.${stream}.write(String.fromCharCode(0xDC00, 0xD800, 0xDC01, 0xD801)); + process.${stream}.write("\\n"); + // Two high surrogates (both unpaired) + process.${stream}.write(String.fromCharCode(0xD800, 0xD801)); + process.${stream}.write("\\n"); + // Two low surrogates (both unpaired) + process.${stream}.write(String.fromCharCode(0xDC00, 0xDC01)); + process.${stream}.write("\\n"); + ${stream === "stdout" ? "" : 'console.log("Done");'} + `, + }); + + await using proc = Bun.spawn({ + cmd: [bunExe(), "test.js"], + env: bunEnv, + cwd: String(dir), + stdout: "pipe", + stderr: "pipe", + }); + + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + + expect(exitCode).toBe(0); + + const expectedOutput = + "�" + + String.fromCharCode(0xd801, 0xdc00) + + "\n" + + "�" + + String.fromCharCode(0xd800, 0xdc01) + + "�\n" + + "��\n" + + "��\n"; + + if (stream === "stdout") { + expect(stdout).toBe(expectedOutput); + } else { + expect(stdout).toBe("Done\n"); + expect(stderr).toBe(expectedOutput); + } + }); + + test("large strings with trailing unpaired surrogates", async () => { + using dir = tempDir("stdio-utf16", { + "test.js": ` + // Large string to test buffer boundaries + const largeStr = "A".repeat(10000) + String.fromCharCode(0xD800); + process.${stream}.write(largeStr); + process.${stream}.write("\\n"); + ${stream === "stdout" ? "" : 'console.log("Done");'} + `, + }); + + await using proc = Bun.spawn({ + cmd: [bunExe(), "test.js"], + env: bunEnv, + cwd: String(dir), + stdout: "pipe", + stderr: "pipe", + }); + + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + + expect(exitCode).toBe(0); + + const output = stream === "stdout" ? stdout : stderr; + + // Should be exactly 10000 A's plus one replacement character + const aCount = (output.match(/A/g) || []).length; + expect(aCount).toBe(10000); + expect(output.endsWith("�\n")).toBe(true); + + if (stream === "stderr") { + expect(stdout).toBe("Done\n"); + } + }); + + test("empty string and edge cases", async () => { + using dir = tempDir("stdio-utf16", { + "test.js": ` + // Empty string + process.${stream}.write(""); + process.${stream}.write("\\n"); + // Single char before/after unpaired + process.${stream}.write("A" + String.fromCharCode(0xD800)); + process.${stream}.write("\\n"); + process.${stream}.write(String.fromCharCode(0xD800) + "B"); + process.${stream}.write("\\n"); + ${stream === "stdout" ? "" : 'console.log("Done");'} + `, + }); + + await using proc = Bun.spawn({ + cmd: [bunExe(), "test.js"], + env: bunEnv, + cwd: String(dir), + stdout: "pipe", + stderr: "pipe", + }); + + const [stdout, stderr, exitCode] = await Promise.all([proc.stdout.text(), proc.stderr.text(), proc.exited]); + + expect(exitCode).toBe(0); + if (stream === "stdout") { + expect(stdout).toBe("\nA�\n�B\n"); + } else { + expect(stdout).toBe("Done\n"); + expect(stderr).toBe("\nA�\n�B\n"); + } + }); +}); diff --git a/test/js/web/encoding/text-encoder.test.js b/test/js/web/encoding/text-encoder.test.js index af7aff0efe..150271f3ba 100644 --- a/test/js/web/encoding/text-encoder.test.js +++ b/test/js/web/encoding/text-encoder.test.js @@ -280,6 +280,205 @@ describe("TextEncoder", () => { }); }); + describe("comprehensive invalid UTF-16 edge cases", () => { + it("should handle trailing unpaired high surrogates", () => { + const encoder = new TextEncoder(); + + // Single trailing high surrogate + const test1 = "Hello" + String.fromCharCode(0xd800); + const encoded1 = encoder.encode(test1); + const decoded1 = new TextDecoder().decode(encoded1); + expect(decoded1).toBe("Hello\uFFFD"); + + // Multiple trailing high surrogates + const test2 = "Hello" + String.fromCharCode(0xd800, 0xd801, 0xdbff); + const encoded2 = encoder.encode(test2); + const decoded2 = new TextDecoder().decode(encoded2); + expect(decoded2).toBe("Hello\uFFFD\uFFFD\uFFFD"); + }); + + it("should handle trailing unpaired low surrogates", () => { + const encoder = new TextEncoder(); + + // Single trailing low surrogate + const test1 = "World" + String.fromCharCode(0xdc00); + const encoded1 = encoder.encode(test1); + const decoded1 = new TextDecoder().decode(encoded1); + expect(decoded1).toBe("World\uFFFD"); + + // Multiple trailing low surrogates + const test2 = "World" + String.fromCharCode(0xdc00, 0xdc01, 0xdfff); + const encoded2 = encoder.encode(test2); + const decoded2 = new TextDecoder().decode(encoded2); + expect(decoded2).toBe("World\uFFFD\uFFFD\uFFFD"); + }); + + it("should handle leading unpaired surrogates", () => { + const encoder = new TextEncoder(); + + // Leading high surrogate + const test1 = String.fromCharCode(0xd800) + "Hello"; + const encoded1 = encoder.encode(test1); + const decoded1 = new TextDecoder().decode(encoded1); + expect(decoded1).toBe("\uFFFDHello"); + + // Leading low surrogate + const test2 = String.fromCharCode(0xdc00) + "World"; + const encoded2 = encoder.encode(test2); + const decoded2 = new TextDecoder().decode(encoded2); + expect(decoded2).toBe("\uFFFDWorld"); + }); + + it("should handle mixed valid and invalid surrogates", () => { + const encoder = new TextEncoder(); + + // Valid emoji followed by unpaired high surrogate + const test1 = "🌍" + String.fromCharCode(0xd800); + const encoded1 = encoder.encode(test1); + const decoded1 = new TextDecoder().decode(encoded1); + expect(decoded1).toBe("🌍\uFFFD"); + + // Unpaired low surrogate followed by valid emoji + const test2 = String.fromCharCode(0xdc00) + "🌍"; + const encoded2 = encoder.encode(test2); + const decoded2 = new TextDecoder().decode(encoded2); + expect(decoded2).toBe("\uFFFD🌍"); + + // Alternating valid and invalid + const test3 = "A" + String.fromCharCode(0xd800) + "B" + String.fromCharCode(0xdc00) + "C"; + const encoded3 = encoder.encode(test3); + const decoded3 = new TextDecoder().decode(encoded3); + expect(decoded3).toBe("A\uFFFDB\uFFFDC"); + }); + + it("should handle strings with only unpaired surrogates", () => { + const encoder = new TextEncoder(); + + // Only unpaired high surrogates + const test1 = String.fromCharCode(0xd800, 0xd801, 0xd802); + const encoded1 = encoder.encode(test1); + const decoded1 = new TextDecoder().decode(encoded1); + expect(decoded1).toBe("\uFFFD\uFFFD\uFFFD"); + + // Only unpaired low surrogates + const test2 = String.fromCharCode(0xdc00, 0xdc01, 0xdc02); + const encoded2 = encoder.encode(test2); + const decoded2 = new TextDecoder().decode(encoded2); + expect(decoded2).toBe("\uFFFD\uFFFD\uFFFD"); + + // Mixed unpaired surrogates + const test3 = String.fromCharCode(0xdc00, 0xd800, 0xdc01, 0xd801); + const encoded3 = encoder.encode(test3); + const decoded3 = new TextDecoder().decode(encoded3); + expect(decoded3).toBe("\uFFFD\uD800\uDC01\uFFFD"); + }); + + it("should handle invalid surrogate pairs", () => { + const encoder = new TextEncoder(); + + // High surrogate not followed by low surrogate + const test1 = String.fromCharCode(0xd800, 0x0041); // High surrogate + 'A' + const encoded1 = encoder.encode(test1); + const decoded1 = new TextDecoder().decode(encoded1); + expect(decoded1).toBe("\uFFFDA"); + + // Low surrogate not preceded by high surrogate + const test2 = String.fromCharCode(0x0041, 0xdc00); // 'A' + low surrogate + const encoded2 = encoder.encode(test2); + const decoded2 = new TextDecoder().decode(encoded2); + expect(decoded2).toBe("A\uFFFD"); + + // Two high surrogates in a row + const test3 = String.fromCharCode(0xd800, 0xd801); + const encoded3 = encoder.encode(test3); + const decoded3 = new TextDecoder().decode(encoded3); + expect(decoded3).toBe("\uFFFD\uFFFD"); + + // Two low surrogates in a row + const test4 = String.fromCharCode(0xdc00, 0xdc01); + const encoded4 = encoder.encode(test4); + const decoded4 = new TextDecoder().decode(encoded4); + expect(decoded4).toBe("\uFFFD\uFFFD"); + }); + + it("should handle edge case buffer boundaries with invalid UTF-16", () => { + const encoder = new TextEncoder(); + + // Large string ending with unpaired surrogate + const largeStr = "A".repeat(100000) + String.fromCharCode(0xd800); + const encoded = encoder.encode(largeStr); + const decoded = new TextDecoder().decode(encoded); + expect(decoded.length).toBe(100001); // 100000 'A's + 1 replacement char + expect(decoded.endsWith("\uFFFD")).toBe(true); + + // Large string with unpaired surrogates scattered throughout + let scatteredStr = ""; + for (let i = 0; i < 1000; i++) { + scatteredStr += "Hello"; + if (i % 100 === 0) { + scatteredStr += String.fromCharCode(0xd800); + } + } + const encoded2 = encoder.encode(scatteredStr); + const decoded2 = new TextDecoder().decode(encoded2); + expect(decoded2).toContain("\uFFFD"); + }); + + it("should handle encodeInto with insufficient buffer for replacement characters", () => { + const encoder = new TextEncoder(); + + // Unpaired surrogate needs 3 bytes for U+FFFD, but buffer is too small + const str = String.fromCharCode(0xd800); + const buffer1 = new Uint8Array(2); // Too small for U+FFFD + const result1 = encoder.encodeInto(str, buffer1); + expect(result1.read).toBe(0); // Should not read the surrogate + expect(result1.written).toBe(0); // Should not write anything + + // Buffer exactly the right size + const buffer2 = new Uint8Array(3); // Exact size for U+FFFD + const result2 = encoder.encodeInto(str, buffer2); + expect(result2.read).toBe(1); // Should read the surrogate + expect(result2.written).toBe(3); // Should write U+FFFD + expect(Array.from(buffer2)).toEqual([0xef, 0xbf, 0xbd]); // U+FFFD in UTF-8 + + // Multiple unpaired surrogates with limited buffer + const str2 = String.fromCharCode(0xd800, 0xdc00); + const buffer3 = new Uint8Array(3); // Only room for one replacement + const result3 = encoder.encodeInto(str2, buffer3); + expect(result3.read).toBe(1); // Should only read first surrogate + expect(result3.written).toBe(3); // Should write one U+FFFD + }); + + it("should handle boundary surrogates correctly", () => { + const encoder = new TextEncoder(); + + // Maximum high surrogate + const test1 = String.fromCharCode(0xdbff); + const encoded1 = encoder.encode(test1); + const decoded1 = new TextDecoder().decode(encoded1); + expect(decoded1).toBe("\uFFFD"); + + // Maximum low surrogate + const test2 = String.fromCharCode(0xdfff); + const encoded2 = encoder.encode(test2); + const decoded2 = new TextDecoder().decode(encoded2); + expect(decoded2).toBe("\uFFFD"); + + // Valid surrogate pair at boundaries + const test3 = String.fromCharCode(0xdbff, 0xdfff); // Maximum valid surrogate pair + const encoded3 = encoder.encode(test3); + expect(encoded3.length).toBe(4); // Should encode to 4 bytes + const decoded3 = new TextDecoder().decode(encoded3); + expect(decoded3).toBe(String.fromCharCode(0xdbff, 0xdfff)); // Should preserve the valid pair + + // Just outside surrogate range (valid BMP characters) + const test4 = String.fromCharCode(0xd7ff, 0xe000); // Last char before surrogates, first after + const encoded4 = encoder.encode(test4); + const decoded4 = new TextDecoder().decode(encoded4); + expect(decoded4).toBe(String.fromCharCode(0xd7ff, 0xe000)); // Should preserve both + }); + }); + it("should encode utf-16 rope text", () => { gcTrace(true); var textReal = `❤️ Red Heart ✨ Sparkles 🔥 Fire`;