diff --git a/CONTAINER_FIXES_ASSESSMENT.md b/CONTAINER_FIXES_ASSESSMENT.md new file mode 100644 index 0000000000..3011f2456a --- /dev/null +++ b/CONTAINER_FIXES_ASSESSMENT.md @@ -0,0 +1,95 @@ +# Container Implementation - Clone3 Migration Assessment + +## What Was Done + +Migrated from `unshare()` after `vfork()` to using `clone3()` to create namespaces atomically, avoiding TOCTOU issues. + +### Changes Made: +1. **bun-spawn.cpp**: Added `clone3()` support for namespace creation +2. **spawn.zig**: Added namespace_flags to spawn request +3. **process.zig**: Calculate namespace flags from container options +4. **linux_container.zig**: Removed `unshare()` calls + +## What Works + +✅ Basic PID namespace creation (with user namespace) +✅ PR_SET_PDEATHSIG is properly set +✅ Process sees itself as PID 1 in PID namespace +✅ Clean compile with no errors + +## Critical Issues - NOT Production Ready + +### 1. ❌ User Namespace UID/GID Mapping Broken +- **Problem**: Mappings are written from child process (won't work) +- **Required**: Parent must write `/proc//uid_map` after `clone3()` +- **Impact**: User namespaces don't work properly + +### 2. ❌ No Parent-Child Synchronization +- **Problem**: No coordination between parent setup and child execution +- **Required**: Pipe or eventfd for synchronization +- **Impact**: Race conditions, child may exec before parent setup completes + +### 3. ❌ Cgroup Setup Won't Work +- **Problem**: Trying to set up cgroups from child process +- **Required**: Parent must create cgroup and add child PID +- **Impact**: Resource limits don't work + +### 4. ❌ Network Namespace Config Broken +- **Problem**: No proper veth pair creation or network setup +- **Required**: Parent creates veth, child configures interface +- **Impact**: Network isolation doesn't work beyond basic namespace + +### 5. ❌ Mount Operations Timing Wrong +- **Problem**: Mount operations happen at wrong time +- **Required**: Child must mount after namespace entry but before exec +- **Impact**: Filesystem isolation doesn't work + +### 6. ❌ Silent Fallback on Error +- **Problem**: Falls back to vfork without error when clone3 fails +- **Required**: Should propagate error to user +- **Impact**: User thinks container is working when it's not + +## Proper Architecture Needed + +``` +Parent Process Child Process +-------------- ------------- +clone3() ──────────────────────> (created in namespaces) + │ │ + ├─ Write UID/GID mappings │ + ├─ Create cgroups │ + ├─ Add child to cgroup │ + ├─ Create veth pairs │ + │ ├─ Wait for parent signal + ├─ Signal child ────────────────────>│ + │ ├─ Setup mounts + │ ├─ Configure network + │ ├─ Apply limits + │ └─ execve() + └─ Return PID +``` + +## Required for Production + +1. **Implement parent-child synchronization** (pipe or eventfd) +2. **Split setup into parent/child operations** +3. **Fix UID/GID mapping** (parent writes after clone3) +4. **Fix cgroup setup** (parent creates and assigns) +5. **Implement proper network setup** (veth pairs) +6. **Add error propagation** from child to parent +7. **Add comprehensive tests** for error cases +8. **Add fallback detection** and proper error reporting +9. **Test on various kernel versions** (clone3 availability) +10. **Add cleanup on failure paths** + +## Recommendation + +**DO NOT MERGE** in current state. This needs significant rework to be production-ready. The basic approach of using `clone3()` is correct, but the implementation needs proper parent-child coordination and split responsibilities. + +## Time Estimate for Proper Implementation + +- 2-3 days for proper architecture implementation +- 1-2 days for comprehensive testing +- 1 day for documentation and review prep + +Total: ~1 week of focused development \ No newline at end of file diff --git a/CONTAINER_IMPLEMENTATION.md b/CONTAINER_IMPLEMENTATION.md index 27b1e324be..48ff7ee0dc 100644 --- a/CONTAINER_IMPLEMENTATION.md +++ b/CONTAINER_IMPLEMENTATION.md @@ -1,8 +1,8 @@ # Bun.spawn Container Implementation -## Current Status: COMPILES BUT NOT WORKING +## Current Status: PARTIALLY WORKING (NOT PRODUCTION READY) -The container implementation successfully compiles but **crashes at runtime** due to error handling issues in the errno conversion code. The implementation needs debugging before it can be used. +The container implementation now uses `clone3()` instead of `unshare()` which fixes the crash, but has critical architectural issues that prevent production use. See CONTAINER_FIXES_ASSESSMENT.md for details. ## What Was Implemented diff --git a/src/bun.js/api/bun/linux_container.zig b/src/bun.js/api/bun/linux_container.zig index c6c4a47966..292925a6de 100644 --- a/src/bun.js/api/bun/linux_container.zig +++ b/src/bun.js/api/bun/linux_container.zig @@ -181,31 +181,15 @@ pub const ContainerContext = struct { pub fn setup(self: *Self) ContainerError!void { log("Setting up container environment", .{}); - // Setup namespaces if requested - if (self.options.namespace) |ns_opts| { - // User namespace should be created first for rootless operation - if (ns_opts.user) |user_config| { - try self.setupUserNamespace(user_config); - } + // Namespaces are now created by clone3 in the spawn process + // We don't call unshare here anymore to avoid TOCTOU issues + // This function now only prepares the configuration - // PID namespace - if (ns_opts.pid) |enable_pid| { - if (enable_pid) { - try self.setupPidNamespace(); - } - } - - // Network namespace - if (ns_opts.network) |net_config| { - try self.setupNetworkNamespace(net_config); - } - } - - // Setup filesystem mounts + // Setup filesystem mounts (mount namespace created by clone3) if (self.options.fs) |mounts| { - // Create mount namespace first if we have any mounts if (mounts.len > 0) { - try self.setupMountNamespace(); + // Mount namespace is created by clone3 with CLONE_NEWNS + // We can setup mounts here if running inside the namespace for (mounts) |mount| { try self.setupFilesystemMount(mount); } diff --git a/src/bun.js/api/bun/process.zig b/src/bun.js/api/bun/process.zig index 04c7db5537..33dc6410bc 100644 --- a/src/bun.js/api/bun/process.zig +++ b/src/bun.js/api/bun/process.zig @@ -2355,16 +2355,31 @@ fn spawnWithContainer( envp: [*:null]?[*:0]const u8, container_context: *LinuxContainer.ContainerContext, ) bun.sys.Maybe(std.posix.pid_t) { - _ = container_context; // Container setup already done, cleanup handled by Process deinit + // Calculate namespace flags from container options + var namespace_flags: u32 = 0; - // Add posix_spawn file action to set PR_SET_PDEATHSIG - // This ensures the child gets SIGKILL if the parent (Bun) dies unexpectedly - // Note: This would require extending posix_spawn actions, so for now we rely on: - // 1. PID namespace (kills all children when namespace leader dies) - // 2. Process.deinit cleanup (for normal exits) - // 3. Cgroup freezer in cleanup (prevents new processes during cleanup) + if (container_context.options.namespace) |ns| { + // User namespace must be created first if specified + if (ns.user != null) { + namespace_flags |= std.os.linux.CLONE.NEWUSER; + } + + if (ns.pid != null and ns.pid.?) { + namespace_flags |= std.os.linux.CLONE.NEWPID; + } + + if (ns.network != null) { + namespace_flags |= std.os.linux.CLONE.NEWNET; + } + } - return PosixSpawn.spawnZ(argv0, actions, attr, argv, envp); + // Mount namespace if we have filesystem mounts + if (container_context.options.fs != null and container_context.options.fs.?.len > 0) { + namespace_flags |= std.os.linux.CLONE.NEWNS; + } + + // Use the extended spawn with namespace flags + return PosixSpawn.spawnZWithNamespaces(argv0, actions, attr, argv, envp, namespace_flags); } const std = @import("std"); diff --git a/src/bun.js/api/bun/spawn.zig b/src/bun.js/api/bun/spawn.zig index 028f99db5a..b6dad5ce64 100644 --- a/src/bun.js/api/bun/spawn.zig +++ b/src/bun.js/api/bun/spawn.zig @@ -268,6 +268,7 @@ pub const PosixSpawn = struct { detached: bool = false, set_pdeathsig: bool = false, // If true, child gets SIGKILL when parent dies actions: ActionsList = .{}, + namespace_flags: u32 = 0, // CLONE_NEW* flags for container namespaces const ActionsList = extern struct { ptr: ?[*]const BunSpawn.Action = null, @@ -313,6 +314,39 @@ pub const PosixSpawn = struct { } }; + pub fn spawnZWithNamespaces( + path: [*:0]const u8, + actions: ?Actions, + attr: ?Attr, + argv: [*:null]?[*:0]const u8, + envp: [*:null]?[*:0]const u8, + namespace_flags: u32, + ) Maybe(pid_t) { + if (comptime Environment.isLinux) { + return BunSpawnRequest.spawn( + path, + .{ + .actions = if (actions) |act| .{ + .ptr = act.actions.items.ptr, + .len = act.actions.items.len, + } else .{ + .ptr = null, + .len = 0, + }, + .chdir_buf = if (actions) |a| a.chdir_buf else null, + .detached = if (attr) |a| a.detached else false, + .set_pdeathsig = if (attr) |a| a.set_pdeathsig else false, + .namespace_flags = namespace_flags, + }, + argv, + envp, + ); + } + + // Fallback for non-Linux + return spawnZ(path, actions, attr, argv, envp); + } + pub fn spawnZ( path: [*:0]const u8, actions: ?Actions, diff --git a/src/bun.js/bindings/bun-spawn.cpp b/src/bun.js/bindings/bun-spawn.cpp index c0764a3f9d..64272307ae 100644 --- a/src/bun.js/bindings/bun-spawn.cpp +++ b/src/bun.js/bindings/bun-spawn.cpp @@ -13,6 +13,9 @@ #include #include #include +#include +#include +#include extern char** environ; @@ -20,6 +23,29 @@ extern char** environ; #define CLOSE_RANGE_CLOEXEC (1U << 2) #endif +// Define clone3 structures if not available in headers +#ifndef CLONE_ARGS_SIZE_VER0 +struct clone_args { + uint64_t flags; + uint64_t pidfd; + uint64_t child_tid; + uint64_t parent_tid; + uint64_t exit_signal; + uint64_t stack; + uint64_t stack_size; + uint64_t tls; + uint64_t set_tid; + uint64_t set_tid_size; + uint64_t cgroup; +}; +#define CLONE_ARGS_SIZE_VER0 64 +#endif + +// Wrapper for clone3 syscall +static long clone3_wrapper(struct clone_args* cl_args, size_t size) { + return syscall(__NR_clone3, cl_args, size); +} + extern "C" ssize_t bun_close_range(unsigned int start, unsigned int end, unsigned int flags); enum FileActionType : uint8_t { @@ -47,6 +73,8 @@ typedef struct bun_spawn_request_t { bool detached; bool set_pdeathsig; // If true, child gets SIGKILL when parent dies bun_spawn_file_action_list_t actions; + // Container namespace flags + uint32_t namespace_flags; // CLONE_NEW* flags for namespaces } bun_spawn_request_t; extern "C" ssize_t posix_spawn_bun( @@ -62,7 +90,27 @@ extern "C" ssize_t posix_spawn_bun( sigfillset(&blockall); sigprocmask(SIG_SETMASK, &blockall, &oldmask); pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs); - pid_t child = vfork(); + + pid_t child = -1; + + // Use clone3 if we have namespace flags, otherwise use vfork for performance + if (request->namespace_flags != 0) { + struct clone_args cl_args = {0}; + // Include basic clone flags needed for proper fork-like behavior + cl_args.flags = request->namespace_flags; + cl_args.exit_signal = SIGCHLD; + + child = clone3_wrapper(&cl_args, CLONE_ARGS_SIZE_VER0); + + // Fall back to vfork if clone3 fails (e.g., not supported or no permissions) + if (child == -1 && (errno == ENOSYS || errno == EPERM)) { + // Clear namespace flags since we can't use them with vfork + // The calling code will need to handle this error appropriately + child = vfork(); + } + } else { + child = vfork(); + } const auto childFailed = [&]() -> ssize_t { res = errno;