bun.sh/src/libarchive/libarchive.zig
robobun 70fa6af355 feat: add Bun.Archive API for creating and extracting tarballs (#25665)
## Summary

- Adds new `Bun.Archive` API for working with tar archives (usage sketch below)
- `Bun.Archive.from(data)` - Create archive from object, Blob, TypedArray, or ArrayBuffer
- `Bun.Archive.write(path, data, compress?)` - Write archive to disk (async)
- `archive.extract(path)` - Extract to directory, returns `Promise<number>` (file count)
- `archive.blob(compress?)` - Get archive as Blob (async)
- `archive.bytes(compress?)` - Get archive as Uint8Array (async)
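
A minimal usage sketch assembled from the signatures above; the exact `compress` values and option shapes are assumptions, not confirmed by this diff:

```ts
// Hedged sketch: passing "gzip" as the compress argument is an assumption.
const data = { "hello.txt": "Hello, tar!" };

// Create an in-memory archive and materialize it two ways.
const archive = Bun.Archive.from(data);
const blob = await archive.blob(); // Blob containing the tarball
const bytes = await archive.bytes("gzip"); // Uint8Array, gzip-compressed

// Or write straight to disk, then extract and count the files written.
await Bun.Archive.write("out.tar.gz", data, "gzip");
const fileCount = await Bun.Archive.from(bytes).extract("./extracted");
```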

Key implementation details:
- Uses existing libarchive bindings for tarball creation/extraction via `extractToDisk`
- Uses libdeflate for gzip compression
- Immediate byte copying for GC safety (no JSValue protection, no `hasPendingActivity`)
- Async operations run on worker pool threads with proper VM reference handling
- Growing memory buffer via `archive_write_open2` callbacks for efficient tarball creation

## Test plan

- [x] 65 comprehensive tests covering:
  - Normal operations (create, extract, blob, bytes, write)
  - GC safety (unreferenced archives, mutation isolation)  
  - Error handling (invalid args, corrupted data, I/O errors)
  - Edge cases (large files, many files, special characters, path normalization)
  - Concurrent operations

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Bot <claude-bot@bun.sh>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Dylan Conway <dylan.conway567@gmail.com>
Co-authored-by: Jarred Sumner <jarred@jarredsumner.com>
2026-01-09 00:33:35 -08:00

628 lines
28 KiB
Zig

// @link "../deps/libarchive.a"

pub const lib = @import("./libarchive-bindings.zig");
const Archive = lib.Archive;

pub const Seek = enum(c_int) {
    set = std.posix.SEEK_SET,
    current = std.posix.SEEK_CUR,
    end = std.posix.SEEK_END,
};
pub const BufferReadStream = struct {
    const Stream = @This();

    buf: []const u8,
    pos: usize = 0,
    block_size: usize = 16384,

    archive: *Archive,
    reading: bool = false,

    pub fn init(this: *BufferReadStream, buf: []const u8) void {
        this.* = BufferReadStream{
            .buf = buf,
            .pos = 0,
            .archive = Archive.readNew(),
            .reading = false,
        };
    }

    pub fn deinit(this: *BufferReadStream) void {
        _ = this.archive.readClose();
        _ = this.archive.readFree();
    }

    pub fn openRead(this: *BufferReadStream) Archive.Result {
        // lib.archive_read_set_open_callback(this.archive, this.);
        // _ = lib.archive_read_set_read_callback(this.archive, archive_read_callback);
        // _ = lib.archive_read_set_seek_callback(this.archive, archive_seek_callback);
        // _ = lib.archive_read_set_skip_callback(this.archive, archive_skip_callback);
        // _ = lib.archive_read_set_close_callback(this.archive, archive_close_callback);
        // // lib.archive_read_set_switch_callback(this.archive, this.archive_s);
        // _ = lib.archive_read_set_callback_data(this.archive, this);

        _ = this.archive.readSupportFormatTar();
        _ = this.archive.readSupportFormatGnutar();
        _ = this.archive.readSupportFilterGzip();

        // Ignore zeroed blocks in the archive, which occur when multiple tar
        // archives have been concatenated together. Without this option, only
        // the contents of the first concatenated archive would be read.
        _ = this.archive.readSetOptions("read_concatenated_archives");
        // _ = lib.archive_read_support_filter_none(this.archive);

        const rc = this.archive.readOpenMemory(this.buf);
        this.reading = @intFromEnum(rc) > -1;
        // _ = lib.archive_read_support_compression_all(this.archive);

        return rc;
    }

    pub inline fn bufLeft(this: BufferReadStream) []const u8 {
        return this.buf[this.pos..];
    }

    pub inline fn fromCtx(ctx: *anyopaque) *Stream {
        return @as(*Stream, @ptrCast(@alignCast(ctx)));
    }

    pub fn archive_close_callback(
        _: *Archive,
        _: *anyopaque,
    ) callconv(.c) c_int {
        return 0;
    }

    pub fn archive_read_callback(
        _: *Archive,
        ctx_: *anyopaque,
        buffer: [*c]*const anyopaque,
    ) callconv(.c) lib.la_ssize_t {
        var this = fromCtx(ctx_);

        const remaining = this.bufLeft();
        if (remaining.len == 0) return 0;

        // Hand libarchive at most one block per call, pointing directly into
        // the caller's buffer (no copy).
        const diff = @min(remaining.len, this.block_size);
        buffer.* = remaining[0..diff].ptr;
        this.pos += diff;
        return @as(isize, @intCast(diff));
    }

    pub fn archive_skip_callback(
        _: *Archive,
        ctx_: *anyopaque,
        offset: lib.la_int64_t,
    ) callconv(.c) lib.la_int64_t {
        var this = fromCtx(ctx_);

        const buflen = @as(isize, @intCast(this.buf.len));
        const pos = @as(isize, @intCast(this.pos));

        const proposed = pos + offset;
        const new_pos = @min(@max(proposed, 0), buflen - 1);
        this.pos = @as(usize, @intCast(new_pos));
        // Report how far we actually moved, which may be less than requested.
        return new_pos - pos;
    }

    pub fn archive_seek_callback(
        _: *Archive,
        ctx_: *anyopaque,
        offset: lib.la_int64_t,
        whence: c_int,
    ) callconv(.c) lib.la_int64_t {
        var this = fromCtx(ctx_);

        const buflen = @as(isize, @intCast(this.buf.len));
        const pos = @as(isize, @intCast(this.pos));

        switch (@as(Seek, @enumFromInt(whence))) {
            Seek.current => {
                const new_pos = @max(@min(pos + offset, buflen - 1), 0);
                this.pos = @as(usize, @intCast(new_pos));
                return new_pos;
            },
            Seek.end => {
                const new_pos = @max(@min(buflen - offset, buflen), 0);
                this.pos = @as(usize, @intCast(new_pos));
                return new_pos;
            },
            Seek.set => {
                const new_pos = @max(@min(offset, buflen - 1), 0);
                this.pos = @as(usize, @intCast(new_pos));
                return new_pos;
            },
        }
    }

    // pub fn archive_write_callback(
    //     archive: *Archive,
    //     ctx_: *anyopaque,
    //     buffer: *const anyopaque,
    //     len: usize,
    // ) callconv(.c) lib.la_ssize_t {
    //     var this = fromCtx(ctx_);
    // }

    // pub fn archive_close_callback(
    //     archive: *Archive,
    //     ctx_: *anyopaque,
    // ) callconv(.c) c_int {
    //     var this = fromCtx(ctx_);
    // }

    // pub fn archive_free_callback(
    //     archive: *Archive,
    //     ctx_: *anyopaque,
    // ) callconv(.c) c_int {
    //     var this = fromCtx(ctx_);
    // }

    // pub fn archive_switch_callback(
    //     archive: *Archive,
    //     ctx1: *anyopaque,
    //     ctx2: *anyopaque,
    // ) callconv(.c) c_int {
    //     var this = fromCtx(ctx1);
    //     var that = fromCtx(ctx2);
    // }
};
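
// Hedged usage sketch (illustrative, not part of the original file): how the
// extraction paths below drive a BufferReadStream over an in-memory tarball.
// `tar_bytes` is a stand-in name for caller-provided data.
//
//   var stream: BufferReadStream = undefined;
//   stream.init(tar_bytes);
//   defer stream.deinit();
//   _ = stream.openRead();
//
//   var entry: *Archive.Entry = undefined;
//   while (true) {
//       switch (stream.archive.readNextHeader(&entry)) {
//           .eof => break,
//           .retry => continue,
//           .failed, .fatal => return error.Fail,
//           else => {}, // inspect entry.pathname(), entry.size(), entry.filetype()
//       }
//   }
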
pub const Archiver = struct {
    // impl: *lib.archive = undefined,
    // buf: []const u8 = undefined,
    // dir: FileDescriptorType = 0,

    pub const Context = struct {
        pluckers: []Plucker = &[_]Plucker{},
        overwrite_list: bun.StringArrayHashMap(void),
        all_files: EntryMap,

        pub const EntryMap = std.ArrayHashMap(u64, [*c]u8, U64Context, false);

        pub const U64Context = struct {
            pub fn hash(_: @This(), k: u64) u32 {
                return @as(u32, @truncate(k));
            }
            pub fn eql(_: @This(), a: u64, b: u64, _: usize) bool {
                return a == b;
            }
        };
    };

    pub const Plucker = struct {
        contents: MutableString,
        filename_hash: u64 = 0,
        found: bool = false,
        fd: FileDescriptorType,

        pub fn init(filepath: bun.OSPathSlice, estimated_size: usize, allocator: std.mem.Allocator) !Plucker {
            return Plucker{
                .contents = try MutableString.init(allocator, estimated_size),
                .filename_hash = bun.hash(std.mem.sliceAsBytes(filepath)),
                .fd = .invalid,
                .found = false,
            };
        }
    };
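
    // Hedged usage sketch (illustrative): a Plucker captures one file's bytes
    // in memory while the archive is being extracted, so a caller can read it
    // without a second pass. `file_path` and `allocator` are stand-in names.
    //
    //   var plucker = try Plucker.init(file_path, 2048, allocator);
    //   ctx.pluckers = (&plucker)[0..1];
    //   // After extractToDir(...), plucker.found / plucker.contents hold the result.
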
    pub fn getOverwritingFileList(
        file_buffer: []const u8,
        root: []const u8,
        ctx: *Archiver.Context,
        comptime FilePathAppender: type,
        appender: FilePathAppender,
        comptime depth_to_skip: usize,
    ) !void {
        var entry: *Archive.Entry = undefined;

        var stream: BufferReadStream = undefined;
        stream.init(file_buffer);
        defer stream.deinit();
        _ = stream.openRead();
        const archive = stream.archive;

        const dir: std.fs.Dir = brk: {
            const cwd = std.fs.cwd();

            // If the destination doesn't exist, skip the whole thing, since
            // there is nothing to overwrite.
            if (std.fs.path.isAbsolute(root)) {
                break :brk std.fs.openDirAbsolute(root, .{}) catch return;
            } else {
                break :brk cwd.openDir(root, .{}) catch return;
            }
        };

        loop: while (true) {
            const r = archive.readNextHeader(&entry);

            switch (r) {
                .eof => break :loop,
                .retry => continue :loop,
                .failed, .fatal => return error.Fail,
                else => {
                    // Do not use the utf8 name here; it would require us to pull
                    // in libiconv (though we should probably validate the utf8
                    // here nonetheless).
                    var pathname = entry.pathname();
                    var tokenizer = std.mem.tokenizeScalar(u8, bun.asByteSlice(pathname), std.fs.path.sep);
                    comptime var depth_i: usize = 0;

                    inline while (depth_i < depth_to_skip) : (depth_i += 1) {
                        if (tokenizer.next() == null) continue :loop;
                    }

                    const pathname_ = tokenizer.rest();
                    pathname = std.mem.sliceTo(pathname_.ptr[0..pathname_.len :0], 0);
                    const dirname = std.mem.trim(u8, std.fs.path.dirname(bun.asByteSlice(pathname)) orelse "", std.fs.path.sep_str);

                    const size: usize = @intCast(@max(entry.size(), 0));
                    if (size > 0) {
                        var opened = dir.openFileZ(pathname, .{ .mode = .write_only }) catch continue :loop;
                        defer opened.close();
                        const stat_size = try opened.getEndPos();

                        if (stat_size > 0) {
                            const is_already_top_level = dirname.len == 0;
                            const path_to_use_: string = brk: {
                                const __pathname: string = bun.asByteSlice(pathname);
                                if (is_already_top_level) break :brk __pathname;

                                const index = std.mem.indexOfScalar(u8, __pathname, std.fs.path.sep).?;
                                break :brk __pathname[0..index];
                            };
                            var temp_buf: [1024]u8 = undefined;
                            bun.copy(u8, &temp_buf, path_to_use_);
                            var path_to_use: string = temp_buf[0..path_to_use_.len];

                            if (!is_already_top_level) {
                                temp_buf[path_to_use_.len] = std.fs.path.sep;
                                path_to_use = temp_buf[0 .. path_to_use_.len + 1];
                            }

                            const overwrite_entry = try ctx.overwrite_list.getOrPut(path_to_use);
                            if (!overwrite_entry.found_existing) {
                                overwrite_entry.key_ptr.* = try appender.append(@TypeOf(path_to_use), path_to_use);
                            }
                        }
                    }
                },
            }
        }
    }
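
    // Hedged usage sketch (illustrative): before extracting, collect the
    // top-level entries of `tar_bytes` that would overwrite non-empty files
    // already present under `dest`. `appender` stands in for any type with an
    // `append(comptime T, value)` method that copies the path somewhere stable.
    //
    //   try Archiver.getOverwritingFileList(tar_bytes, dest, &ctx, @TypeOf(appender), appender, 1);
    //   // ctx.overwrite_list now holds the colliding top-level paths.
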
    pub const ExtractOptions = struct {
        depth_to_skip: usize,
        close_handles: bool = true,
        log: bool = false,
        npm: bool = false,
    };
    pub fn extractToDir(
        file_buffer: []const u8,
        dir: std.fs.Dir,
        ctx: ?*Archiver.Context,
        comptime ContextType: type,
        appender: ContextType,
        options: ExtractOptions,
    ) !u32 {
        var entry: *Archive.Entry = undefined;

        var stream: BufferReadStream = undefined;
        stream.init(file_buffer);
        defer stream.deinit();
        _ = stream.openRead();
        const archive = stream.archive;
        var count: u32 = 0;
        const dir_fd = dir.fd;

        var normalized_buf: bun.OSPathBuffer = undefined;

        // Passed by pointer so readDataIntoFd can turn these off when the
        // platform or file descriptor doesn't support them.
        var use_pwrite = Environment.isPosix;
        var use_lseek = true;

        loop: while (true) {
            const r = archive.readNextHeader(&entry);

            switch (r) {
                .eof => break :loop,
                .retry => continue :loop,
                .failed, .fatal => return error.Fail,
                else => {
                    // TODO:
                    // Due to path separator replacement and other copies that happen internally, libarchive changes the
                    // storage type of paths on Windows to wide character strings. Using `archive_entry_pathname` or
                    // `archive_entry_pathname_utf8` on a wide character string will return null if there are non-ascii
                    // characters (this can be seen by installing @fastify/send, which has a path "@fastify\send\test\fixtures\snow ☃").
                    //
                    // Ideally, we find a way to tell libarchive to not convert the strings to wide characters and also to not
                    // replace path separators. We can do both of these with our own normalization and utf8/utf16 string conversion code.
                    var pathname: bun.OSPathSliceZ = if (comptime Environment.isWindows)
                        entry.pathnameW()
                    else
                        entry.pathname();

                    if (comptime ContextType != void and @hasDecl(std.meta.Child(ContextType), "onFirstDirectoryName")) {
                        if (appender.needs_first_dirname) {
                            if (comptime Environment.isWindows) {
                                const list = std.array_list.Managed(u8).init(default_allocator);
                                var result = try strings.toUTF8ListWithType(list, pathname[0..pathname.len]);
                                // onFirstDirectoryName copies the contents of pathname to another buffer, safe to free
                                defer result.deinit();
                                appender.onFirstDirectoryName(strings.withoutTrailingSlash(result.items));
                            } else {
                                appender.onFirstDirectoryName(strings.withoutTrailingSlash(bun.asByteSlice(pathname)));
                            }
                        }
                    }

                    const kind = bun.sys.kindFromMode(entry.filetype());

                    if (options.npm) {
                        // Ignore entries other than files (`true` can only be returned if the type is file).
                        // https://github.com/npm/cli/blob/93883bb6459208a916584cad8c6c72a315cf32af/node_modules/pacote/lib/fetcher.js#L419-L441
                        if (kind != .file) continue;

                        // TODO: .npmignore, or .gitignore if it doesn't exist
                        // https://github.com/npm/cli/blob/93883bb6459208a916584cad8c6c72a315cf32af/node_modules/pacote/lib/fetcher.js#L434
                    }

                    // Strip the leading `depth_to_skip` components, then normalize the path.
                    var tokenizer = std.mem.tokenizeScalar(bun.OSPathChar, pathname, '/');
                    for (0..options.depth_to_skip) |_| {
                        if (tokenizer.next() == null) continue :loop;
                    }

                    const rest = tokenizer.rest();
                    pathname = rest.ptr[0..rest.len :0];

                    const normalized = bun.path.normalizeBufT(bun.OSPathChar, pathname, &normalized_buf, .auto);
                    normalized_buf[normalized.len] = 0;
                    const path: [:0]bun.OSPathChar = normalized_buf[0..normalized.len :0];
                    if (path.len == 0 or (path.len == 1 and path[0] == '.')) continue;

                    if (options.npm and Environment.isWindows) {
                        // When writing files on Windows, translate the characters to their
                        // 0xf000 higher-encoded versions.
                        // https://github.com/isaacs/node-tar/blob/0510c9ea6d000c40446d56674a7efeec8e72f052/lib/winchars.js
                        var remain = path;
                        if (strings.startsWithWindowsDriveLetterT(bun.OSPathChar, remain)) {
                            // don't encode `:` from the drive letter
                            // https://github.com/npm/cli/blob/93883bb6459208a916584cad8c6c72a315cf32af/node_modules/tar/lib/unpack.js#L327
                            remain = remain[2..];
                        }
                        for (remain) |*char| {
                            switch (char.*) {
                                '|', '<', '>', '?', ':' => char.* += 0xf000,
                                else => {},
                            }
                        }
                    }

                    const path_slice: bun.OSPathSlice = path.ptr[0..path.len];
                    if (options.log) {
                        Output.prettyln(" {f}", .{bun.fmt.fmtOSPath(path_slice, .{})});
                    }

                    count += 1;

                    switch (kind) {
                        .directory => {
                            var mode = @as(i32, @intCast(entry.perm()));

                            // If dirs are readable, then they should be listable.
                            // https://github.com/npm/node-tar/blob/main/lib/mode-fix.js
                            if ((mode & 0o400) != 0)
                                mode |= 0o100;
                            if ((mode & 0o40) != 0)
                                mode |= 0o10;
                            if ((mode & 0o4) != 0)
                                mode |= 0o1;

                            if (comptime Environment.isWindows) {
                                try bun.MakePath.makePath(u16, dir, path);
                            } else {
                                std.posix.mkdiratZ(dir_fd, pathname, @intCast(mode)) catch |err| {
                                    // It's possible for some tarballs to return a directory twice, with and
                                    // without `./` in the beginning. So if it already exists, continue to the
                                    // next entry.
                                    if (err == error.PathAlreadyExists or err == error.NotDir) continue;
                                    bun.makePath(dir, std.fs.path.dirname(path_slice) orelse return err) catch {};
                                    std.posix.mkdiratZ(dir_fd, pathname, 0o777) catch {};
                                };
                            }
                        },
                        .sym_link => {
                            const link_target = entry.symlink();

                            if (Environment.isPosix) {
                                bun.sys.symlinkat(link_target, .fromNative(dir_fd), path).unwrap() catch |err| brk: {
                                    switch (err) {
                                        error.EPERM, error.ENOENT => {
                                            dir.makePath(std.fs.path.dirname(path_slice) orelse return err) catch {};
                                            break :brk try bun.sys.symlinkat(link_target, .fromNative(dir_fd), path).unwrap();
                                        },
                                        else => return err,
                                    }
                                };
                            }
                        },
                        .file => {
                            // First, https://github.com/npm/cli/blob/feb54f7e9a39bd52519221bae4fafc8bc70f235e/node_modules/pacote/lib/fetcher.js#L65-L66
                            // this.fmode = opts.fmode || 0o666
                            //
                            // then https://github.com/npm/cli/blob/feb54f7e9a39bd52519221bae4fafc8bc70f235e/node_modules/pacote/lib/fetcher.js#L402-L411
                            //
                            // We simplify and turn it into `entry.mode || 0o666` because we aren't accepting a umask or fmask option.
                            const mode: bun.Mode = if (comptime Environment.isWindows) 0 else @intCast(entry.perm() | 0o666);

                            const flags = bun.O.WRONLY | bun.O.CREAT | bun.O.TRUNC;

                            const file_handle_native: bun.FD = if (Environment.isWindows)
                                switch (bun.sys.openatWindows(.fromNative(dir_fd), path, flags, 0)) {
                                    .result => |fd| fd,
                                    .err => |e| switch (e.errno) {
                                        @intFromEnum(bun.sys.E.PERM),
                                        @intFromEnum(bun.sys.E.NOENT),
                                        => brk: {
                                            bun.MakePath.makePath(u16, dir, bun.Dirname.dirname(u16, path_slice) orelse return bun.errnoToZigErr(e.errno)) catch {};
                                            break :brk try bun.sys.openatWindows(.fromNative(dir_fd), path, flags, 0).unwrap();
                                        },
                                        else => return bun.errnoToZigErr(e.errno),
                                    },
                                }
                            else
                                .fromStdFile(dir.createFileZ(path, .{
                                    .truncate = true,
                                    .mode = mode,
                                }) catch |err| switch (err) {
                                    error.AccessDenied, error.FileNotFound => brk: {
                                        dir.makePath(std.fs.path.dirname(path_slice) orelse return err) catch {};
                                        break :brk try dir.createFileZ(path, .{
                                            .truncate = true,
                                            .mode = mode,
                                        });
                                    },
                                    else => return err,
                                });

                            const file_handle = brk: {
                                errdefer _ = file_handle_native.close();
                                break :brk try file_handle_native.makeLibUVOwned();
                            };

                            var plucked_file = false;
                            defer if (options.close_handles and !plucked_file) {
                                // On Windows, AV hangs these closes really badly.
                                // 'bun i @mui/icons-material' takes like 20 seconds to extract,
                                // mostly spent waiting for things to close.
                                //
                                // Using Async.Closer defers closing the file to a different thread,
                                // which can make the NtSetInformationFile call fail.
                                //
                                // Using async closing doesn't actually improve end-user performance,
                                // probably because our process is still waiting on AV to do its thing.
                                //
                                // So this approach does not actually solve the problem, it just
                                // defers the close to a different thread. And since we are already
                                // on a worker thread, that doesn't help us.
                                file_handle.close();
                            };

                            const size: usize = @intCast(@max(entry.size(), 0));
                            if (size > 0) {
                                if (ctx) |ctx_| {
                                    const hash: u64 = if (ctx_.pluckers.len > 0)
                                        bun.hash(std.mem.sliceAsBytes(path_slice))
                                    else
                                        @as(u64, 0);

                                    if (comptime ContextType != void and @hasDecl(std.meta.Child(ContextType), "appendMutable")) {
                                        const result = ctx.?.all_files.getOrPutAdapted(hash, Context.U64Context{}) catch unreachable;
                                        if (!result.found_existing) {
                                            result.value_ptr.* = (try appender.appendMutable(@TypeOf(path_slice), path_slice)).ptr;
                                        }
                                    }

                                    for (ctx_.pluckers) |*plucker_| {
                                        if (plucker_.filename_hash == hash) {
                                            try plucker_.contents.inflate(size);
                                            plucker_.contents.list.expandToCapacity();
                                            const read = archive.readData(plucker_.contents.list.items);
                                            try plucker_.contents.inflate(@as(usize, @intCast(read)));
                                            plucker_.found = read > 0;
                                            plucker_.fd = file_handle;
                                            plucked_file = true;
                                            continue :loop;
                                        }
                                    }
                                }

                                // archive_read_data_into_fd reads in chunks of 1 MB:
                                // #define MAX_WRITE (1024 * 1024)
                                if (comptime Environment.isLinux) {
                                    if (size > 1_000_000) {
                                        bun.sys.preallocate_file(
                                            file_handle.cast(),
                                            0,
                                            @intCast(size),
                                        ) catch {};
                                    }
                                }

                                var retries_remaining: u8 = 5;
                                possibly_retry: while (retries_remaining != 0) : (retries_remaining -= 1) {
                                    switch (archive.readDataIntoFd(file_handle, &use_pwrite, &use_lseek)) {
                                        .eof => break :loop,
                                        .ok => break :possibly_retry,
                                        .retry => {
                                            if (options.log) {
                                                Output.err("libarchive error", "extracting {f}, retry {d} / {d}", .{
                                                    bun.fmt.fmtOSPath(path_slice, .{}),
                                                    retries_remaining,
                                                    5,
                                                });
                                            }
                                        },
                                        else => {
                                            if (options.log) {
                                                const archive_error = bun.sliceTo(lib.Archive.errorString(@ptrCast(archive)), 0);
                                                Output.err("libarchive error", "extracting {f}: {s}", .{
                                                    bun.fmt.fmtOSPath(path_slice, .{}),
                                                    archive_error,
                                                });
                                            }
                                            return error.Fail;
                                        },
                                    }
                                }
                            }
                        },
                        else => {},
                    }
                },
            }
        }

        return count;
    }
    pub fn extractToDisk(
        file_buffer: []const u8,
        root: []const u8,
        ctx: ?*Archiver.Context,
        comptime FilePathAppender: type,
        appender: FilePathAppender,
        comptime options: ExtractOptions,
    ) !u32 {
        var dir: std.fs.Dir = brk: {
            const cwd = std.fs.cwd();
            cwd.makePath(root) catch {};

            if (std.fs.path.isAbsolute(root)) {
                break :brk try std.fs.openDirAbsolute(root, .{});
            } else {
                break :brk try cwd.openDir(root, .{});
            }
        };
        defer if (comptime options.close_handles) dir.close();

        return try extractToDir(file_buffer, dir, ctx, FilePathAppender, appender, options);
    }
};
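
// Hedged usage sketch (illustrative, assumed call site): extract a gzipped,
// npm-style tarball, stripping the leading "package/" path component.
// `tarball_bytes` is a stand-in name for caller-provided data.
//
//   const file_count = try Archiver.extractToDisk(
//       tarball_bytes, // []const u8; gzip is detected by the read filters above
//       "node_modules/foo", // destination, created if missing
//       null, // no Context: nothing to pluck, no overwrite tracking
//       void,
//       {},
//       .{ .depth_to_skip = 1 },
//   );
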
const string = []const u8;
const std = @import("std");
const bun = @import("bun");
const Environment = bun.Environment;
const FileDescriptorType = bun.FileDescriptor;
const MutableString = bun.MutableString;
const Output = bun.Output;
const c = bun.c;
const default_allocator = bun.default_allocator;
const strings = bun.strings;