diff --git a/bench/snippets/archive.mjs b/bench/snippets/archive.mjs index 778a2e888b..b2f661f11a 100644 --- a/bench/snippets/archive.mjs +++ b/bench/snippets/archive.mjs @@ -364,6 +364,109 @@ group("write .tar.gz to disk (100 small files)", () => { } }); +// ============================================================================ +// Get files array from archive (files() method) benchmarks +// ============================================================================ + +// Helper to get files array from node-tar (reads all entries into memory) +async function getFilesArrayNodeTar(buffer) { + return new Promise((resolve, reject) => { + const files = new Map(); + let pending = 0; + let closed = false; + + const maybeResolve = () => { + if (closed && pending === 0) { + resolve(files); + } + }; + + const unpack = new Unpack({ + onReadEntry: entry => { + if (entry.type === "File") { + pending++; + const chunks = []; + entry.on("data", chunk => chunks.push(chunk)); + entry.on("end", () => { + const content = Buffer.concat(chunks); + // Create a File-like object similar to Bun.Archive.files() + files.set(entry.path, new Blob([content])); + pending--; + maybeResolve(); + }); + } + entry.resume(); // Drain the entry + }, + }); + unpack.on("close", () => { + closed = true; + maybeResolve(); + }); + unpack.on("error", reject); + unpack.end(buffer); + }); +} + +group("files() - get all files as Map (3 small files)", () => { + bench("node-tar", async () => { + await getFilesArrayNodeTar(smallTarBuffer); + }); + + if (hasBunArchive) { + bench("Bun.Archive.files()", async () => { + await Bun.Archive.from(smallBunArchive).files(); + }); + } +}); + +group("files() - get all files as Map (3 x 100KB files)", () => { + bench("node-tar", async () => { + await getFilesArrayNodeTar(largeTarBuffer); + }); + + if (hasBunArchive) { + bench("Bun.Archive.files()", async () => { + await Bun.Archive.from(largeBunArchive).files(); + }); + } +}); + +group("files() - get all files as Map (100 small files)", () => { + bench("node-tar", async () => { + await getFilesArrayNodeTar(manyFilesTarBuffer); + }); + + if (hasBunArchive) { + bench("Bun.Archive.files()", async () => { + await Bun.Archive.from(manyFilesBunArchive).files(); + }); + } +}); + +group("files() - get all files as Map from .tar.gz (3 small files)", () => { + bench("node-tar", async () => { + await getFilesArrayNodeTar(smallTarGzBuffer); + }); + + if (hasBunArchive) { + bench("Bun.Archive.files()", async () => { + await Bun.Archive.from(smallBunArchiveGz).files(); + }); + } +}); + +group("files() - get all files as Map from .tar.gz (100 small files)", () => { + bench("node-tar", async () => { + await getFilesArrayNodeTar(manyFilesTarGzBuffer); + }); + + if (hasBunArchive) { + bench("Bun.Archive.files()", async () => { + await Bun.Archive.from(manyFilesBunArchiveGz).files(); + }); + } +}); + await run(); // Cleanup diff --git a/docs/docs.json b/docs/docs.json index 8e5ed5c538..172438965f 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -121,6 +121,7 @@ "/runtime/file-io", "/runtime/streams", "/runtime/binary-data", + "/runtime/archive", "/runtime/sql", "/runtime/sqlite", "/runtime/s3", diff --git a/docs/runtime/archive.mdx b/docs/runtime/archive.mdx new file mode 100644 index 0000000000..a88a473711 --- /dev/null +++ b/docs/runtime/archive.mdx @@ -0,0 +1,444 @@ +--- +title: Archive +description: Create and extract tar archives with Bun's fast native implementation +--- + +Bun provides a fast, native implementation for working with tar archives through 
`Bun.Archive`. It supports creating archives from in-memory data, extracting archives to disk, and reading archive contents without extraction. + +## Quickstart + +**Create an archive from files:** + +```ts +const archive = Bun.Archive.from({ + "hello.txt": "Hello, World!", + "data.json": JSON.stringify({ foo: "bar" }), + "nested/file.txt": "Nested content", +}); + +// Write to disk +await Bun.Archive.write("bundle.tar", archive); +``` + +**Extract an archive:** + +```ts +const tarball = await Bun.file("package.tar.gz").bytes(); +const archive = Bun.Archive.from(tarball); +const entryCount = await archive.extract("./output"); +console.log(`Extracted ${entryCount} entries`); +``` + +**Read archive contents without extracting:** + +```ts +const tarball = await Bun.file("package.tar.gz").bytes(); +const archive = Bun.Archive.from(tarball); +const files = await archive.files(); + +for (const [path, file] of files) { + console.log(`${path}: ${await file.text()}`); +} +``` + +## Creating Archives + +Use `Bun.Archive.from()` to create an archive from an object where keys are file paths and values are file contents: + +```ts +const archive = Bun.Archive.from({ + "README.md": "# My Project", + "src/index.ts": "console.log('Hello');", + "package.json": JSON.stringify({ name: "my-project" }), +}); +``` + +File contents can be: + +- **Strings** - Text content +- **Blobs** - Binary data +- **ArrayBufferViews** (e.g., `Uint8Array`) - Raw bytes +- **ArrayBuffers** - Raw binary data + +```ts +const data = "binary data"; +const arrayBuffer = new ArrayBuffer(8); + +const archive = Bun.Archive.from({ + "text.txt": "Plain text", + "blob.bin": new Blob([data]), + "bytes.bin": new Uint8Array([1, 2, 3, 4]), + "buffer.bin": arrayBuffer, +}); +``` + +### Writing Archives to Disk + +Use `Bun.Archive.write()` to create and write an archive in one operation: + +```ts +// Write uncompressed tar +await Bun.Archive.write("output.tar", { + "file1.txt": "content1", + "file2.txt": "content2", +}); + +// Write gzipped tar +const files = { "src/index.ts": "console.log('Hello');" }; +await Bun.Archive.write("output.tar.gz", files, "gzip"); +``` + +### Getting Archive Bytes + +Get the archive data as bytes or a Blob: + +```ts +const files = { "hello.txt": "Hello, World!" }; +const archive = Bun.Archive.from(files); + +// As Uint8Array +const bytes = await archive.bytes(); + +// As Blob +const blob = await archive.blob(); + +// With gzip compression +const gzippedBytes = await archive.bytes("gzip"); +const gzippedBlob = await archive.blob("gzip"); +``` + +## Extracting Archives + +### From Existing Archive Data + +Create an archive from existing tar/tar.gz data: + +```ts +// From a file +const tarball = await Bun.file("package.tar.gz").bytes(); +const archiveFromFile = Bun.Archive.from(tarball); +``` + +```ts +// From a fetch response +const response = await fetch("https://example.com/archive.tar.gz"); +const archiveFromFetch = Bun.Archive.from(await response.blob()); +``` + +### Extracting to Disk + +Use `.extract()` to write all files to a directory: + +```ts +const tarball = await Bun.file("package.tar.gz").bytes(); +const archive = Bun.Archive.from(tarball); +const count = await archive.extract("./extracted"); +console.log(`Extracted ${count} entries`); +``` + +The target directory is created automatically if it doesn't exist. Existing files are overwritten. The returned count includes files, directories, and symlinks (on POSIX systems). 
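+
+As a quick sanity check, the entry count can be compared with `files()` (described below), which returns only regular files. A small sketch reusing the archive above:
+
+```ts
+const entryCount = await archive.extract("./extracted");
+const fileMap = await archive.files();
+// entryCount also counts directories and symlinks, so fileMap.size <= entryCount
+console.log(`${fileMap.size} regular files out of ${entryCount} extracted entries`);
+```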
+
+**Note**: On Windows, symbolic links in archives are always skipped during extraction. Bun does not attempt to create them regardless of privilege level. On Linux and macOS, symlinks are extracted normally.
+
+**Security note**: Bun.Archive validates paths during extraction, rejecting absolute paths (POSIX `/`, Windows drive letters like `C:\` or `C:/`, and UNC paths like `\\server\share`). Path traversal components (`..`) are normalized away (e.g., `dir/sub/../file` becomes `dir/file`) to prevent directory escape attacks.
+
+### Filtering Extracted Files
+
+Use glob patterns to extract only specific files. Patterns are matched against archive entry paths normalized to use forward slashes (`/`). Positive patterns specify what to include, and negative patterns (prefixed with `!`) specify what to exclude. Negative patterns are applied after positive patterns; if only negative patterns are given, every entry that does not match one of them is extracted:
+
+```ts
+const tarball = await Bun.file("package.tar.gz").bytes();
+const archive = Bun.Archive.from(tarball);
+
+// Extract only TypeScript files
+const tsCount = await archive.extract("./extracted", { glob: "**/*.ts" });
+
+// Extract files from multiple directories
+const multiCount = await archive.extract("./extracted", {
+  glob: ["src/**", "lib/**"],
+});
+```
+
+Use negative patterns (prefixed with `!`) to exclude files. When mixing positive and negative patterns, entries must match at least one positive pattern and not match any negative pattern:
+
+```ts
+// Extract everything except node_modules
+const distCount = await archive.extract("./extracted", {
+  glob: ["**", "!node_modules/**"],
+});
+
+// Extract source files but exclude tests
+const srcCount = await archive.extract("./extracted", {
+  glob: ["src/**", "!**/*.test.ts", "!**/__tests__/**"],
+});
+```
+
+## Reading Archive Contents
+
+### Get All Files
+
+Use `.files()` to get archive contents as a `Map` of `File` objects without extracting to disk. Unlike `extract()`, which processes all entry types, `files()` returns only regular files (no directories):
+
+```ts
+const tarball = await Bun.file("package.tar.gz").bytes();
+const archive = Bun.Archive.from(tarball);
+const files = await archive.files();
+
+for (const [path, file] of files) {
+  console.log(`${path}: ${file.size} bytes`);
+  console.log(await file.text());
+}
+```
+
+Each `File` object includes:
+
+- `name` - The file path within the archive (always uses forward slashes `/` as separators)
+- `size` - File size in bytes
+- `lastModified` - Modification timestamp
+- Standard `Blob` methods: `text()`, `arrayBuffer()`, `stream()`, etc.
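+
+For example, an individual file can be looked up by path and its metadata inspected (assuming the archive contains a `package.json`):
+
+```ts
+const pkg = files.get("package.json");
+if (pkg) {
+  console.log(pkg.name, pkg.size, new Date(pkg.lastModified));
+}
+```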
+
+**Note**: `files()` loads file contents into memory. For large archives, consider using `extract()` to write directly to disk instead.
+
+### Error Handling
+
+Archive operations can fail due to corrupted data, I/O errors, or invalid paths. Use try/catch to handle these cases:
+
+```ts
+try {
+  const tarball = await Bun.file("package.tar.gz").bytes();
+  const archive = Bun.Archive.from(tarball);
+  const count = await archive.extract("./output");
+  console.log(`Extracted ${count} entries`);
+} catch (e: unknown) {
+  if (e instanceof Error) {
+    const error = e as Error & { code?: string };
+    if (error.code === "EACCES") {
+      console.error("Permission denied");
+    } else if (error.code === "ENOSPC") {
+      console.error("Disk full");
+    } else {
+      console.error("Archive error:", error.message);
+    }
+  } else {
+    console.error("Archive error:", String(e));
+  }
+}
+```
+
+Common error scenarios:
+
+- **Corrupted/truncated archives** - `Archive.from()` loads the archive data; errors may be deferred until read/extract operations
+- **Permission denied** - `extract()` throws if the target directory is not writable
+- **Disk full** - `extract()` throws if there's insufficient space
+- **Invalid paths** - Operations throw for malformed file paths
+
+The count returned by `extract()` includes all successfully written entries (files, directories, and symlinks on POSIX systems).
+
+**Security note**: Bun.Archive automatically validates paths during extraction. Absolute paths (POSIX `/`, Windows drive letters, UNC paths) and unsafe symlink targets are rejected. Path traversal components (`..`) are normalized away to prevent directory escape.
+
+For additional security with untrusted archives, you can enumerate and validate paths before extraction:
+
+```ts
+const archive = Bun.Archive.from(untrustedData);
+const files = await archive.files();
+
+// Optional: Custom validation for additional checks
+for (const [path] of files) {
+  // Example: Reject hidden files
+  if (path.startsWith(".") || path.includes("/.")) {
+    throw new Error(`Hidden file rejected: ${path}`);
+  }
+  // Example: Whitelist specific directories
+  if (!path.startsWith("src/") && !path.startsWith("lib/")) {
+    throw new Error(`Unexpected path: ${path}`);
+  }
+}
+
+// Extract to a controlled destination
+await archive.extract("./safe-output");
+```
+
+When using `files()` with a glob pattern, an empty `Map` is returned if no files match:
+
+```ts
+const matches = await archive.files("*.nonexistent");
+if (matches.size === 0) {
+  console.log("No matching files found");
+}
+```
+
+### Filtering with Glob Patterns
+
+Pass a glob pattern to filter which files are returned:
+
+```ts
+// Get only TypeScript files
+const tsFiles = await archive.files("**/*.ts");
+
+// Get files in src directory
+const srcFiles = await archive.files("src/*");
+
+// Get all JSON files (recursive)
+const jsonFiles = await archive.files("**/*.json");
+
+// Get multiple file types with array of patterns
+const codeFiles = await archive.files(["**/*.ts", "**/*.js"]);
+```
+
+Supported glob patterns (subset of [Bun.Glob](/docs/api/glob) syntax):
+
+- `*` - Match any characters except `/`
+- `**` - Match any characters including `/`
+- `?` - Match single character
+- `[abc]` - Match character set
+- `{a,b}` - Match alternatives
+- `!pattern` - Exclude files matching pattern (negation); applied after positive patterns. With only negative patterns, every file except the excluded ones is returned.
+
+See [Bun.Glob](/docs/api/glob) for the full glob syntax including escaping and advanced patterns.
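+
+Negative patterns compose with positive patterns the same way as in `extract()`; a short sketch based on the semantics above:
+
+```ts
+// All source files except tests
+const srcOnly = await archive.files(["src/**", "!**/*.test.ts"]);
+
+// Only negative patterns: every file except the excluded ones
+const nonTests = await archive.files(["!**/*.test.ts"]);
+```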
+
+## Compression
+
+Bun.Archive supports gzip compression for both reading and writing:
+
+```ts
+// Reading: automatically detects gzip
+const gzippedTarball = await Bun.file("archive.tar.gz").bytes();
+const archive = Bun.Archive.from(gzippedTarball);
+
+// Writing: specify compression
+const files = { "hello.txt": "Hello, World!" };
+await Bun.Archive.write("output.tar.gz", files, "gzip");
+
+// Getting bytes: specify compression
+const gzippedBytes = await archive.bytes("gzip");
+```
+
+The compression argument accepts:
+
+- `"gzip"` - Enable gzip compression
+- `true` - Same as `"gzip"`
+- `false` or `undefined` - No compression
+
+## Examples
+
+### Bundle Project Files
+
+```ts
+import { Glob } from "bun";
+
+// Collect source files
+const files: Record<string, string> = {};
+const glob = new Glob("src/**/*.ts");
+
+for await (const path of glob.scan(".")) {
+  // Normalize path separators to forward slashes for cross-platform compatibility
+  const archivePath = path.replaceAll("\\", "/");
+  files[archivePath] = await Bun.file(path).text();
+}
+
+// Add package.json
+files["package.json"] = await Bun.file("package.json").text();
+
+// Create compressed archive
+await Bun.Archive.write("bundle.tar.gz", files, "gzip");
+```
+
+### Extract and Process npm Package
+
+```ts
+const response = await fetch("https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz");
+const archive = Bun.Archive.from(await response.blob());
+
+// Get package.json
+const files = await archive.files("package/package.json");
+const packageJson = files.get("package/package.json");
+
+if (packageJson) {
+  const pkg = JSON.parse(await packageJson.text());
+  console.log(`Package: ${pkg.name}@${pkg.version}`);
+}
+```
+
+### Create Archive from Directory
+
+```ts
+import { readdir } from "node:fs/promises";
+import { join } from "node:path";
+
+async function archiveDirectory(dir: string): Promise<Bun.Archive> {
+  const files: Record<string, Blob> = {};
+
+  async function walk(currentDir: string, prefix: string = "") {
+    const entries = await readdir(currentDir, { withFileTypes: true });
+
+    for (const entry of entries) {
+      const fullPath = join(currentDir, entry.name);
+      const archivePath = prefix ? `${prefix}/${entry.name}` : entry.name;
+
+      if (entry.isDirectory()) {
+        await walk(fullPath, archivePath);
+      } else {
+        files[archivePath] = Bun.file(fullPath);
+      }
+    }
+  }
+
+  await walk(dir);
+  return Bun.Archive.from(files);
+}
+
+const archive = await archiveDirectory("./my-project");
+await Bun.Archive.write("my-project.tar.gz", archive, "gzip");
+```
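+
+### Repack a Filtered Archive
+
+`files()` output can be fed back into archive creation, since each `File` is a `Blob`. A small sketch (the `src/**` layout is assumed for illustration):
+
+```ts
+const tarball = await Bun.file("package.tar.gz").bytes();
+const srcFiles = await Bun.Archive.from(tarball).files("src/**");
+
+// Map<string, File> -> Record<string, Blob>, which Archive.write() accepts
+await Bun.Archive.write("src-only.tar.gz", Object.fromEntries(srcFiles), "gzip");
+```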
+
+## Reference
+
+> **Note**: The following type signatures are simplified for documentation purposes. See [`packages/bun-types/bun.d.ts`](https://github.com/oven-sh/bun/blob/main/packages/bun-types/bun.d.ts) for the full type definitions.
+
+```ts
+type ArchiveCompression = "gzip" | boolean;
+
+type ArchiveInput =
+  | Record<string, string | Blob | Bun.ArrayBufferView | ArrayBufferLike>
+  | Blob
+  | Bun.ArrayBufferView
+  | ArrayBufferLike;
+
+interface ArchiveExtractOptions {
+  /** Glob pattern(s) to filter extraction. Supports negative patterns with "!" prefix. */
+  glob?: string | readonly string[];
+}
+
+class Archive {
+  /**
+   * Create an Archive from input data
+   */
+  static from(data: ArchiveInput): Archive;
+
+  /**
+   * Write an archive directly to disk
+   */
+  static write(path: string, data: ArchiveInput | Archive, compress?: ArchiveCompression): Promise<void>;
+
+  /**
+   * Extract archive to a directory
+   * @returns Number of entries extracted (files, directories, and symlinks)
+   */
+  extract(path: string, options?: ArchiveExtractOptions): Promise<number>;
+
+  /**
+   * Get archive as a Blob
+   */
+  blob(compress?: ArchiveCompression): Promise<Blob>;
+
+  /**
+   * Get archive as a Uint8Array
+   */
+  bytes(compress?: ArchiveCompression): Promise<Uint8Array<ArrayBuffer>>;
+
+  /**
+   * Get archive contents as File objects (regular files only, no directories)
+   */
+  files(glob?: string | readonly string[]): Promise<Map<string, File>>;
+}
+```
diff --git a/packages/bun-types/bun.d.ts b/packages/bun-types/bun.d.ts
index 34dda03dc7..38493f9b77 100644
--- a/packages/bun-types/bun.d.ts
+++ b/packages/bun-types/bun.d.ts
@@ -6965,6 +6965,296 @@ declare module "bun" {
     match(str: string): boolean;
   }
 
+  /**
+   * Input data for creating an archive. Can be:
+   * - An object mapping paths to file contents (string, Blob, TypedArray, or ArrayBuffer)
+   * - A Blob containing existing archive data
+   * - A TypedArray or ArrayBuffer containing existing archive data
+   */
+  type ArchiveInput = Record<string, string | Blob | ArrayBufferView | ArrayBufferLike> | Blob | ArrayBufferView | ArrayBufferLike;
+
+  /**
+   * Compression format for archive output.
+   * - `"gzip"` - Compress with gzip
+   * - `true` - Same as `"gzip"`
+   * - `false` - Explicitly disable compression (no compression)
+   * - `undefined` - No compression (default behavior when omitted)
+   *
+   * Both `false` and `undefined` result in no compression; `false` can be used
+   * to explicitly indicate "no compression" in code where the intent should be clear.
+   */
+  type ArchiveCompression = "gzip" | boolean;
+
+  /**
+   * Options for extracting archive contents.
+   */
+  interface ArchiveExtractOptions {
+    /**
+     * Glob pattern(s) to filter which entries are extracted.
+     * Uses the same syntax as {@link Bun.Glob}, including support for wildcards (`*`, `**`),
+     * character classes (`[abc]`), alternation (`{a,b}`), and negation (`!pattern`).
+     *
+     * Patterns are matched against archive entry paths normalized to use forward slashes (`/`),
+     * regardless of the host operating system. Always write patterns using `/` as the separator.
+     *
+     * - Positive patterns: Only entries matching at least one pattern will be extracted.
+     * - Negative patterns (prefixed with `!`): Entries matching these patterns will be excluded.
+     *   Negative patterns are applied after positive patterns.
+     *
+     * If not specified, all entries are extracted.
+     *
+     * @example
+     * ```ts
+     * // Extract only TypeScript files
+     * await archive.extract("./out", { glob: "**" + "/*.ts" });
+     *
+     * // Extract files from multiple directories
+     * await archive.extract("./out", { glob: ["src/**", "lib/**"] });
+     *
+     * // Exclude node_modules using negative pattern
+     * await archive.extract("./out", { glob: ["**", "!node_modules/**"] });
+     *
+     * // Extract source files but exclude tests
+     * await archive.extract("./out", { glob: ["src/**", "!**" + "/*.test.ts"] });
+     * ```
+     */
+    glob?: string | readonly string[];
+  }
+
+  /**
+   * A class for creating and extracting tar archives with optional gzip compression.
+   *
+   * `Bun.Archive` provides a fast, native implementation for working with tar archives.
+   * It supports creating archives from in-memory data or extracting existing archives
+   * to disk or memory.
+   *
+   * @example
+   * **Create an archive from an object:**
+   * ```ts
+   * const archive = Bun.Archive.from({
+   *   "hello.txt": "Hello, World!",
+   *   "data.json": JSON.stringify({ foo: "bar" }),
+   *   "binary.bin": new Uint8Array([1, 2, 3, 4]),
+   * });
+   * ```
+   *
+   * @example
+   * **Extract an archive to disk:**
+   * ```ts
+   * const archive = Bun.Archive.from(tarballBytes);
+   * const entryCount = await archive.extract("./output");
+   * console.log(`Extracted ${entryCount} entries`);
+   * ```
+   *
+   * @example
+   * **Get archive contents as a Map of File objects:**
+   * ```ts
+   * const archive = Bun.Archive.from(tarballBytes);
+   * const entries = await archive.files();
+   * for (const [path, file] of entries) {
+   *   console.log(path, await file.text());
+   * }
+   * ```
+   *
+   * @example
+   * **Write a gzipped archive directly to disk:**
+   * ```ts
+   * await Bun.Archive.write("bundle.tar.gz", {
+   *   "src/index.ts": sourceCode,
+   *   "package.json": packageJson,
+   * }, "gzip");
+   * ```
+   */
+  export class Archive {
+    /**
+     * Create an `Archive` instance from input data.
+     *
+     * @param data - The input data for the archive:
+     * - **Object**: Creates a new tarball with the object's keys as file paths and values as file contents
+     * - **Blob/TypedArray/ArrayBuffer**: Wraps existing archive data (tar or tar.gz)
+     *
+     * @returns A new `Archive` instance
+     *
+     * @example
+     * **From an object (creates new tarball):**
+     * ```ts
+     * const archive = Bun.Archive.from({
+     *   "hello.txt": "Hello, World!",
+     *   "nested/file.txt": "Nested content",
+     * });
+     * ```
+     *
+     * @example
+     * **From existing archive data:**
+     * ```ts
+     * const response = await fetch("https://example.com/package.tar.gz");
+     * const archive = Bun.Archive.from(await response.blob());
+     * ```
+     */
+    static from(data: ArchiveInput): Archive;
+
+    /**
+     * Create and write an archive directly to disk in one operation.
+     *
+     * This is more efficient than creating an archive and then writing it separately,
+     * as it streams the data directly to disk.
+     *
+     * @param path - The file path to write the archive to
+     * @param data - The input data for the archive (same as `Archive.from()`)
+     * @param compress - Optional compression: `"gzip"`, `true` for gzip, or `false`/`undefined` for none
+     *
+     * @returns A promise that resolves when the write is complete
+     *
+     * @example
+     * **Write uncompressed tarball:**
+     * ```ts
+     * await Bun.Archive.write("output.tar", {
+     *   "file1.txt": "content1",
+     *   "file2.txt": "content2",
+     * });
+     * ```
+     *
+     * @example
+     * **Write gzipped tarball:**
+     * ```ts
+     * await Bun.Archive.write("output.tar.gz", files, "gzip");
+     * ```
+     */
+    static write(path: string, data: ArchiveInput | Archive, compress?: ArchiveCompression): Promise<void>;
+
+    /**
+     * Extract the archive contents to a directory on disk.
+     *
+     * Creates the target directory and any necessary parent directories if they don't exist.
+     * Existing files will be overwritten.
+     *
+     * @param path - The directory path to extract to
+     * @param options - Optional extraction options
+     * @param options.glob - Glob pattern(s) to filter entries (positive patterns include, negative patterns starting with `!` exclude)
+     * @returns A promise that resolves with the number of entries extracted (files, directories, and symlinks)
+     *
+     * @example
+     * **Extract all entries:**
+     * ```ts
+     * const archive = Bun.Archive.from(tarballBytes);
+     * const count = await archive.extract("./extracted");
+     * console.log(`Extracted ${count} entries`);
+     * ```
+     *
+     * @example
+     * **Extract only TypeScript files:**
+     * ```ts
+     * const count = await archive.extract("./src", { glob: "**" + "/*.ts" });
+     * ```
+     *
+     * @example
+     * **Extract everything except tests:**
+     * ```ts
+     * const count = await archive.extract("./dist", { glob: ["**", "!**" + "/*.test.*"] });
+     * ```
+     *
+     * @example
+     * **Extract source files but exclude tests:**
+     * ```ts
+     * const count = await archive.extract("./output", {
+     *   glob: ["src/**", "lib/**", "!**" + "/*.test.ts", "!**" + "/__tests__/**"]
+     * });
+     * ```
+     */
+    extract(path: string, options?: ArchiveExtractOptions): Promise<number>;
+
+    /**
+     * Get the archive contents as a `Blob`.
+     *
+     * @param compress - Optional compression: `"gzip"`, `true` for gzip, or `false`/`undefined` for none
+     * @returns A promise that resolves with the archive data as a Blob
+     *
+     * @example
+     * **Get uncompressed tarball:**
+     * ```ts
+     * const blob = await archive.blob();
+     * ```
+     *
+     * @example
+     * **Get gzipped tarball:**
+     * ```ts
+     * const gzippedBlob = await archive.blob("gzip");
+     * ```
+     */
+    blob(compress?: ArchiveCompression): Promise<Blob>;
+
+    /**
+     * Get the archive contents as a `Uint8Array`.
+     *
+     * @param compress - Optional compression: `"gzip"`, `true` for gzip, or `false`/`undefined` for none
+     * @returns A promise that resolves with the archive data as a Uint8Array
+     *
+     * @example
+     * **Get uncompressed tarball bytes:**
+     * ```ts
+     * const bytes = await archive.bytes();
+     * ```
+     *
+     * @example
+     * **Get gzipped tarball bytes:**
+     * ```ts
+     * const gzippedBytes = await archive.bytes("gzip");
+     * ```
+     */
+    bytes(compress?: ArchiveCompression): Promise<Uint8Array<ArrayBuffer>>;
+
+    /**
+     * Get the archive contents as a `Map` of `File` objects.
+     *
+     * Each file in the archive is returned as a `File` object with:
+     * - `name`: The file path within the archive
+     * - `lastModified`: The file's modification time from the archive
+     * - Standard Blob methods (`text()`, `arrayBuffer()`, `stream()`, etc.)
+     *
+     * Only regular files are included; directories are not returned.
+     * File contents are loaded into memory, so for large archives consider using `extract()` instead.
+     *
+     * @param glob - Optional glob pattern(s) to filter files. Supports the same syntax as {@link Bun.Glob},
+     * including negation patterns (prefixed with `!`). Patterns are matched against paths normalized
+     * to use forward slashes (`/`).
+     * @returns A promise that resolves with a Map where keys are file paths (always using forward slashes `/` as separators) and values are File objects
+     *
+     * @example
+     * **Get all files:**
+     * ```ts
+     * const entries = await archive.files();
+     * for (const [path, file] of entries) {
+     *   console.log(`${path}: ${file.size} bytes`);
+     * }
+     * ```
+     *
+     * @example
+     * **Filter by glob pattern:**
+     * ```ts
+     * const tsFiles = await archive.files("**" + "/*.ts");
+     * const srcFiles = await archive.files(["src/**", "lib/**"]);
+     * ```
+     *
+     * @example
+     * **Exclude files with negative patterns:**
+     * ```ts
+     * // Get all source files except tests
+     * const srcFiles = await archive.files(["src/**", "!**" + "/*.test.ts"]);
+     * ```
+     *
+     * @example
+     * **Read file contents:**
+     * ```ts
+     * const entries = await archive.files();
+     * const readme = entries.get("README.md");
+     * if (readme) {
+     *   console.log(await readme.text());
+     * }
+     * ```
+     */
+    files(glob?: string | readonly string[]): Promise<Map<string, File>>;
+  }
+
   /**
    * Generate a UUIDv7, which is a sequential ID based on the current timestamp with a random component.
    *
diff --git a/src/bun.js/api/Archive.classes.ts b/src/bun.js/api/Archive.classes.ts
index 113c688a07..64c35caa44 100644
--- a/src/bun.js/api/Archive.classes.ts
+++ b/src/bun.js/api/Archive.classes.ts
@@ -20,7 +20,7 @@ export default [
     proto: {
       extract: {
         fn: "extract",
-        length: 1,
+        length: 2,
       },
       blob: {
         fn: "blob",
diff --git a/src/bun.js/api/Archive.zig b/src/bun.js/api/Archive.zig
index 5bf7a07f66..f8a0803f26 100644
--- a/src/bun.js/api/Archive.zig
+++ b/src/bun.js/api/Archive.zig
@@ -283,11 +283,13 @@ fn parseCompressArg(globalThis: *jsc.JSGlobalObject, arg: jsc.JSValue) bun.JSErr
     return globalThis.throwInvalidArguments("Archive: compress argument must be 'gzip', a boolean, or undefined", .{});
 }
 
-/// Instance method: archive.extract(path)
+/// Instance method: archive.extract(path, options?)
 /// Extracts the archive to the given path
+/// Options:
+///   - glob: string | string[] - Only extract files matching the glob pattern(s). Supports negative patterns with "!".
 /// Returns Promise with count of extracted files
 pub fn extract(this: *Archive, globalThis: *jsc.JSGlobalObject, callframe: *jsc.CallFrame) bun.JSError!jsc.JSValue {
-    const path_arg = callframe.argumentsAsArray(1)[0];
+    const path_arg, const options_arg = callframe.argumentsAsArray(2);
     if (path_arg == .zero or !path_arg.isString()) {
         return globalThis.throwInvalidArguments("Archive.extract requires a path argument", .{});
     }
@@ -295,7 +297,86 @@ pub fn extract(this: *Archive, globalThis: *jsc.JSGlobalObject, callframe: *jsc.
     const path_slice = try path_arg.toSlice(globalThis, bun.default_allocator);
     defer path_slice.deinit();
 
-    return startExtractTask(globalThis, this.store, path_slice.slice());
+    // Parse options
+    var glob_patterns: ?[]const []const u8 = null;
+    errdefer {
+        if (glob_patterns) |patterns| freePatterns(patterns);
+    }
+
+    if (!options_arg.isUndefinedOrNull()) {
+        if (!options_arg.isObject()) {
+            return globalThis.throwInvalidArguments("Archive.extract: second argument must be an options object", .{});
+        }
+
+        // Parse glob option
+        if (try options_arg.getTruthy(globalThis, "glob")) |glob_val| {
+            glob_patterns = try parsePatternArg(globalThis, glob_val, "Archive.extract", "glob");
+        }
+    }
+
+    return startExtractTask(globalThis, this.store, path_slice.slice(), glob_patterns);
+}
+
+/// Parse a string or array of strings into a pattern list.
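+/// Pattern strings are duplicated with bun.default_allocator; the caller owns the returned
+/// slice and releases it with freePatterns().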
+/// Returns null for empty strings or empty arrays (treated as "no filter"). +fn parsePatternArg(globalThis: *jsc.JSGlobalObject, arg: jsc.JSValue, api_name: []const u8, name: []const u8) bun.JSError!?[]const []const u8 { + const allocator = bun.default_allocator; + + // Single string + if (arg.isString()) { + const str_slice = try arg.toSlice(globalThis, allocator); + defer str_slice.deinit(); + // Empty string = no filter + if (str_slice.len == 0) return null; + const pattern = allocator.dupe(u8, str_slice.slice()) catch return error.OutOfMemory; + errdefer allocator.free(pattern); + const patterns = allocator.alloc([]const u8, 1) catch return error.OutOfMemory; + patterns[0] = pattern; + return patterns; + } + + // Array of strings + if (arg.jsType() == .Array) { + const len = try arg.getLength(globalThis); + // Empty array = no filter + if (len == 0) return null; + + var patterns = std.ArrayList([]const u8).initCapacity(allocator, @intCast(len)) catch return error.OutOfMemory; + errdefer { + for (patterns.items) |p| allocator.free(p); + patterns.deinit(allocator); + } + + // Use index-based iteration for safety (avoids issues if array mutates) + var i: u32 = 0; + while (i < len) : (i += 1) { + const item = try arg.getIndex(globalThis, i); + if (!item.isString()) { + return globalThis.throwInvalidArguments("{s}: {s} array must contain only strings", .{ api_name, name }); + } + const str_slice = try item.toSlice(globalThis, allocator); + defer str_slice.deinit(); + // Skip empty strings in array + if (str_slice.len == 0) continue; + const pattern = allocator.dupe(u8, str_slice.slice()) catch return error.OutOfMemory; + patterns.appendAssumeCapacity(pattern); + } + + // If all strings were empty, treat as no filter + if (patterns.items.len == 0) { + patterns.deinit(allocator); + return null; + } + + return patterns.toOwnedSlice(allocator) catch return error.OutOfMemory; + } + + return globalThis.throwInvalidArguments("{s}: {s} must be a string or array of strings", .{ api_name, name }); +} + +fn freePatterns(patterns: []const []const u8) void { + for (patterns) |p| bun.default_allocator.free(p); + bun.default_allocator.free(patterns); } /// Instance method: archive.blob(compress?) 
@@ -319,19 +400,14 @@ pub fn bytes(this: *Archive, globalThis: *jsc.JSGlobalObject, callframe: *jsc.Ca pub fn files(this: *Archive, globalThis: *jsc.JSGlobalObject, callframe: *jsc.CallFrame) bun.JSError!jsc.JSValue { const glob_arg = callframe.argument(0); - var glob_pattern: ?[]const u8 = null; + var glob_patterns: ?[]const []const u8 = null; + errdefer if (glob_patterns) |patterns| freePatterns(patterns); if (!glob_arg.isUndefinedOrNull()) { - if (!glob_arg.isString()) { - return globalThis.throwInvalidArguments("Archive.files: argument must be a string glob pattern or undefined", .{}); - } - const glob_slice = try glob_arg.toSlice(globalThis, bun.default_allocator); - defer glob_slice.deinit(); - glob_pattern = try bun.default_allocator.dupe(u8, glob_slice.slice()); + glob_patterns = try parsePatternArg(globalThis, glob_arg, "Archive.files", "glob"); } - errdefer if (glob_pattern) |p| bun.default_allocator.free(p); - return startFilesTask(globalThis, this.store, glob_pattern); + return startFilesTask(globalThis, this.store, glob_patterns); } // ============================================================================ @@ -427,9 +503,21 @@ const ExtractContext = struct { store: *jsc.WebCore.Blob.Store, path: []const u8, + glob_patterns: ?[]const []const u8, result: Result = .{ .err = error.ReadError }, fn run(this: *ExtractContext) Result { + // If we have glob patterns, use filtered extraction + if (this.glob_patterns != null) { + const count = extractToDiskFiltered( + this.store.sharedView(), + this.path, + this.glob_patterns, + ) catch return .{ .err = error.ReadError }; + return .{ .success = count }; + } + + // Otherwise use the fast path without filtering const count = libarchive.Archiver.extractToDisk( this.store.sharedView(), this.path, @@ -451,12 +539,18 @@ const ExtractContext = struct { fn deinit(this: *ExtractContext) void { this.store.deref(); bun.default_allocator.free(this.path); + if (this.glob_patterns) |patterns| freePatterns(patterns); } }; pub const ExtractTask = AsyncTask(ExtractContext); -fn startExtractTask(globalThis: *jsc.JSGlobalObject, store: *jsc.WebCore.Blob.Store, path: []const u8) bun.JSError!jsc.JSValue { +fn startExtractTask( + globalThis: *jsc.JSGlobalObject, + store: *jsc.WebCore.Blob.Store, + path: []const u8, + glob_patterns: ?[]const []const u8, +) bun.JSError!jsc.JSValue { const path_copy = try bun.default_allocator.dupe(u8, path); errdefer bun.default_allocator.free(path_copy); @@ -466,6 +560,7 @@ fn startExtractTask(globalThis: *jsc.JSGlobalObject, store: *jsc.WebCore.Blob.St const task = try ExtractTask.create(globalThis, .{ .store = store, .path = path_copy, + .glob_patterns = glob_patterns, }); const promise_js = task.promise.value(); @@ -652,7 +747,7 @@ const FilesContext = struct { }; store: *jsc.WebCore.Blob.Store, - glob_pattern: ?[]const u8, + glob_patterns: ?[]const []const u8, result: Result = .{ .err = error.ReadError }, fn cloneErrorString(archive: *libarchive.lib.Archive) ?[*:0]u8 { @@ -685,8 +780,9 @@ const FilesContext = struct { if (entry.filetype() != @intFromEnum(lib.FileType.regular)) continue; const pathname = entry.pathnameUtf8(); - if (this.glob_pattern) |pattern| { - if (!bun.glob.match(pattern, pathname).matches()) continue; + // Apply glob pattern filtering (supports both positive and negative patterns) + if (this.glob_patterns) |patterns| { + if (!matchGlobPatterns(patterns, pathname)) continue; } const size: usize = @intCast(@max(entry.size(), 0)); @@ -747,20 +843,21 @@ const FilesContext = struct { fn deinit(this: 
*FilesContext) void { this.result.deinit(); this.store.deref(); - if (this.glob_pattern) |p| bun.default_allocator.free(p); + if (this.glob_patterns) |patterns| freePatterns(patterns); } }; pub const FilesTask = AsyncTask(FilesContext); -fn startFilesTask(globalThis: *jsc.JSGlobalObject, store: *jsc.WebCore.Blob.Store, glob_pattern: ?[]const u8) bun.JSError!jsc.JSValue { +fn startFilesTask(globalThis: *jsc.JSGlobalObject, store: *jsc.WebCore.Blob.Store, glob_patterns: ?[]const []const u8) bun.JSError!jsc.JSValue { store.ref(); errdefer store.deref(); - errdefer if (glob_pattern) |p| bun.default_allocator.free(p); + // Ownership: On error, caller's errdefer frees glob_patterns. + // On success, ownership transfers to FilesContext, which frees them in deinit(). const task = try FilesTask.create(globalThis, .{ .store = store, - .glob_pattern = glob_pattern, + .glob_patterns = glob_patterns, }); const promise_js = task.promise.value(); @@ -799,6 +896,213 @@ fn compressGzip(data: []const u8) ![]u8 { return bun.default_allocator.realloc(output, result.written) catch output[0..result.written]; } +/// Check if a path is safe (no absolute paths or path traversal) +fn isSafePath(pathname: []const u8) bool { + // Reject empty paths + if (pathname.len == 0) return false; + + // Reject absolute paths + if (pathname[0] == '/' or pathname[0] == '\\') return false; + + // Check for Windows drive letters (e.g., "C:") + if (pathname.len >= 2 and pathname[1] == ':') return false; + + // Reject paths with ".." components + var iter = std.mem.splitScalar(u8, pathname, '/'); + while (iter.next()) |component| { + if (std.mem.eql(u8, component, "..")) return false; + // Also check Windows-style separators + var win_iter = std.mem.splitScalar(u8, component, '\\'); + while (win_iter.next()) |win_component| { + if (std.mem.eql(u8, win_component, "..")) return false; + } + } + + return true; +} + +/// Match a path against multiple glob patterns with support for negative patterns. +/// Positive patterns: at least one must match for the path to be included. +/// Negative patterns (starting with "!"): if any matches, the path is excluded. +/// Returns true if the path should be included, false if excluded. +fn matchGlobPatterns(patterns: []const []const u8, pathname: []const u8) bool { + var has_positive_patterns = false; + var matches_positive = false; + + for (patterns) |pattern| { + // Check if it's a negative pattern + if (pattern.len > 0 and pattern[0] == '!') { + // Negative pattern - if it matches, exclude the file + const neg_pattern = pattern[1..]; + if (neg_pattern.len > 0 and bun.glob.match(neg_pattern, pathname).matches()) { + return false; + } + } else { + // Positive pattern - at least one must match + has_positive_patterns = true; + if (bun.glob.match(pattern, pathname).matches()) { + matches_positive = true; + } + } + } + + // If there are no positive patterns, include everything (that wasn't excluded) + // If there are positive patterns, at least one must match + return !has_positive_patterns or matches_positive; +} + +/// Extract archive to disk with glob pattern filtering. +/// Supports negative patterns with "!" prefix (e.g., "!node_modules/**"). 
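+/// If only negative patterns are given, every entry not matching a negative pattern is extracted.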
+fn extractToDiskFiltered( + file_buffer: []const u8, + root: []const u8, + glob_patterns: ?[]const []const u8, +) !u32 { + const lib = libarchive.lib; + const archive = lib.Archive.readNew(); + defer _ = archive.readFree(); + configureArchiveReader(archive); + + if (archive.readOpenMemory(file_buffer) != .ok) { + return error.ReadError; + } + + // Open/create target directory using bun.sys + const cwd = bun.FD.cwd(); + cwd.makePath(u8, root) catch {}; + const dir_fd: bun.FD = brk: { + if (std.fs.path.isAbsolute(root)) { + break :brk bun.sys.openA(root, bun.O.RDONLY | bun.O.DIRECTORY, 0).unwrap() catch return error.OpenError; + } else { + break :brk bun.sys.openatA(cwd, root, bun.O.RDONLY | bun.O.DIRECTORY, 0).unwrap() catch return error.OpenError; + } + }; + defer _ = dir_fd.close(); + + var count: u32 = 0; + var entry: *lib.Archive.Entry = undefined; + + while (archive.readNextHeader(&entry) == .ok) { + const pathname = entry.pathnameUtf8(); + + // Validate path safety (reject absolute paths, path traversal) + if (!isSafePath(pathname)) continue; + + // Apply glob pattern filtering. Supports negative patterns with "!" prefix. + // Positive patterns: at least one must match + // Negative patterns: if any matches, the file is excluded + if (glob_patterns) |patterns| { + if (!matchGlobPatterns(patterns, pathname)) continue; + } + + const filetype = entry.filetype(); + const kind = bun.sys.kindFromMode(filetype); + + switch (kind) { + .directory => { + dir_fd.makePath(u8, pathname) catch |err| switch (err) { + // Directory already exists - don't count as extracted + error.PathAlreadyExists => continue, + else => continue, + }; + count += 1; + }, + .file => { + const size: usize = @intCast(@max(entry.size(), 0)); + // Sanitize permissions: use entry perms masked to 0o777, or default 0o644 + const entry_perm = entry.perm(); + const mode: bun.Mode = if (entry_perm != 0) + @intCast(entry_perm & 0o777) + else + 0o644; + + // Create parent directories if needed (ignore expected errors) + if (std.fs.path.dirname(pathname)) |parent_dir| { + dir_fd.makePath(u8, parent_dir) catch |err| switch (err) { + // Expected: directory already exists + error.PathAlreadyExists => {}, + // Permission errors: skip this file, will fail at openat + error.AccessDenied => {}, + // Other errors: skip, will fail at openat + else => {}, + }; + } + + // Create and write the file using bun.sys + const file_fd: bun.FD = bun.sys.openat( + dir_fd, + pathname, + bun.O.WRONLY | bun.O.CREAT | bun.O.TRUNC, + mode, + ).unwrap() catch continue; + + var write_success = true; + if (size > 0) { + // Read archive data and write to file + var remaining = size; + var buf: [64 * 1024]u8 = undefined; + while (remaining > 0) { + const to_read = @min(remaining, buf.len); + const read = archive.readData(buf[0..to_read]); + if (read <= 0) { + write_success = false; + break; + } + const bytes_read: usize = @intCast(read); + // Write all bytes, handling partial writes + var written: usize = 0; + while (written < bytes_read) { + const w = file_fd.write(buf[written..bytes_read]).unwrap() catch { + write_success = false; + break; + }; + if (w == 0) { + write_success = false; + break; + } + written += w; + } + if (!write_success) break; + remaining -= bytes_read; + } + } + _ = file_fd.close(); + + if (write_success) { + count += 1; + } else { + // Remove partial file on failure + _ = dir_fd.unlinkat(pathname); + } + }, + .sym_link => { + const link_target = entry.symlink(); + // Validate symlink target is also safe + if (!isSafePath(link_target)) 
continue; + // Symlinks are only extracted on POSIX systems (Linux/macOS). + // On Windows, symlinks are skipped since they require elevated privileges. + if (bun.Environment.isPosix) { + bun.sys.symlinkat(link_target, dir_fd, pathname).unwrap() catch |err| { + switch (err) { + error.EPERM, error.ENOENT => { + if (std.fs.path.dirname(pathname)) |parent| { + dir_fd.makePath(u8, parent) catch {}; + } + _ = bun.sys.symlinkat(link_target, dir_fd, pathname).unwrap() catch continue; + }, + else => continue, + } + }; + count += 1; + } + }, + else => {}, + } + } + + return count; +} + const libarchive = @import("../../libarchive/libarchive.zig"); const libdeflate = @import("../../deps/libdeflate.zig"); const std = @import("std"); diff --git a/test/js/bun/archive.test.ts b/test/js/bun/archive.test.ts index cc99f0c0f6..58882c07d4 100644 --- a/test/js/bun/archive.test.ts +++ b/test/js/bun/archive.test.ts @@ -1158,6 +1158,138 @@ describe("Bun.Archive", () => { }); }); + describe("extract with glob patterns", () => { + test("extracts only files matching glob pattern", async () => { + const archive = Bun.Archive.from({ + "src/index.ts": "export {}", + "src/utils.ts": "export {}", + "src/types.d.ts": "declare {}", + "test/index.test.ts": "test()", + "README.md": "# Hello", + "package.json": "{}", + }); + + using dir = tempDir("archive-glob-pattern", {}); + const count = await archive.extract(String(dir), { glob: "**/*.ts" }); + + // Should extract 4 .ts files (including .d.ts and .test.ts) + expect(count).toBe(4); + expect(await Bun.file(join(String(dir), "src/index.ts")).exists()).toBe(true); + expect(await Bun.file(join(String(dir), "src/utils.ts")).exists()).toBe(true); + expect(await Bun.file(join(String(dir), "src/types.d.ts")).exists()).toBe(true); + expect(await Bun.file(join(String(dir), "test/index.test.ts")).exists()).toBe(true); + expect(await Bun.file(join(String(dir), "README.md")).exists()).toBe(false); + expect(await Bun.file(join(String(dir), "package.json")).exists()).toBe(false); + }); + + test("extracts files matching any of multiple glob patterns", async () => { + const archive = Bun.Archive.from({ + "src/index.ts": "export {}", + "lib/utils.js": "module.exports = {}", + "test/test.ts": "test()", + "README.md": "# Hello", + }); + + using dir = tempDir("archive-multi-glob", {}); + const count = await archive.extract(String(dir), { glob: ["src/**", "lib/**"] }); + + expect(count).toBe(2); + expect(await Bun.file(join(String(dir), "src/index.ts")).exists()).toBe(true); + expect(await Bun.file(join(String(dir), "lib/utils.js")).exists()).toBe(true); + expect(await Bun.file(join(String(dir), "test/test.ts")).exists()).toBe(false); + expect(await Bun.file(join(String(dir), "README.md")).exists()).toBe(false); + }); + + test("excludes files matching negative pattern", async () => { + const archive = Bun.Archive.from({ + "src/index.ts": "export {}", + "src/index.test.ts": "test()", + "src/utils.ts": "export {}", + "src/utils.test.ts": "test()", + }); + + using dir = tempDir("archive-negative-pattern", {}); + // Use negative pattern to exclude test files + const count = await archive.extract(String(dir), { glob: ["**", "!**/*.test.ts"] }); + + expect(count).toBe(2); + expect(await Bun.file(join(String(dir), "src/index.ts")).exists()).toBe(true); + expect(await Bun.file(join(String(dir), "src/utils.ts")).exists()).toBe(true); + expect(await Bun.file(join(String(dir), "src/index.test.ts")).exists()).toBe(false); + expect(await Bun.file(join(String(dir), 
"src/utils.test.ts")).exists()).toBe(false); + }); + + test("excludes files matching any of multiple negative patterns", async () => { + const archive = Bun.Archive.from({ + "src/index.ts": "export {}", + "src/index.test.ts": "test()", + "__tests__/helper.ts": "helper", + "node_modules/pkg/index.js": "module", + }); + + using dir = tempDir("archive-multi-negative", {}); + const count = await archive.extract(String(dir), { + glob: ["**", "!**/*.test.ts", "!__tests__/**", "!node_modules/**"], + }); + + expect(count).toBe(1); + expect(await Bun.file(join(String(dir), "src/index.ts")).exists()).toBe(true); + expect(await Bun.file(join(String(dir), "src/index.test.ts")).exists()).toBe(false); + expect(await Bun.file(join(String(dir), "__tests__/helper.ts")).exists()).toBe(false); + expect(await Bun.file(join(String(dir), "node_modules/pkg/index.js")).exists()).toBe(false); + }); + + test("combines positive and negative glob patterns", async () => { + const archive = Bun.Archive.from({ + "src/index.ts": "export {}", + "src/index.test.ts": "test()", + "src/utils.ts": "export {}", + "lib/helper.ts": "helper", + "lib/helper.test.ts": "test()", + "README.md": "# Hello", + }); + + using dir = tempDir("archive-glob-and-negative", {}); + const count = await archive.extract(String(dir), { + glob: ["src/**", "lib/**", "!**/*.test.ts"], + }); + + expect(count).toBe(3); + expect(await Bun.file(join(String(dir), "src/index.ts")).exists()).toBe(true); + expect(await Bun.file(join(String(dir), "src/utils.ts")).exists()).toBe(true); + expect(await Bun.file(join(String(dir), "lib/helper.ts")).exists()).toBe(true); + expect(await Bun.file(join(String(dir), "src/index.test.ts")).exists()).toBe(false); + expect(await Bun.file(join(String(dir), "lib/helper.test.ts")).exists()).toBe(false); + expect(await Bun.file(join(String(dir), "README.md")).exists()).toBe(false); + }); + + test("extracts all files when no patterns are provided", async () => { + const archive = Bun.Archive.from({ + "file1.txt": "content1", + "file2.txt": "content2", + }); + + using dir = tempDir("archive-no-patterns", {}); + const count = await archive.extract(String(dir), {}); + + expect(count).toBe(2); + expect(await Bun.file(join(String(dir), "file1.txt")).exists()).toBe(true); + expect(await Bun.file(join(String(dir), "file2.txt")).exists()).toBe(true); + }); + + test("returns 0 when no files match glob pattern", async () => { + const archive = Bun.Archive.from({ + "file.txt": "content", + "other.md": "markdown", + }); + + using dir = tempDir("archive-no-match", {}); + const count = await archive.extract(String(dir), { glob: "**/*.ts" }); + + expect(count).toBe(0); + }); + }); + describe("concurrent operations", () => { test("multiple extract operations run correctly", async () => { const archive = Bun.Archive.from({