[publish images] [BAPI-2103] use gdb and bun.report code to print stack traces upon crash in CI (#21143)

### What does this PR do?

Closes #13012

On Linux, when any Bun process spawned by `runner.node.mjs` crashes, we
run GDB in batch mode to print a backtrace from the core file.

And on all platforms, we run a mini `bun.report` server which collects
crashes reported by any Bun process executed during the tests, and after
each test `runner.node.mjs` fetches and prints any new crashes from the
server.

<details>
<summary>example 1</summary>

```
#0  crash_handler.crash () at crash_handler.zig:1513
#1  0x0000000002cf4020 in crash_handler.crashHandler (reason=..., error_return_trace=0x0, begin_addr=...) at crash_handler.zig:479
#2  0x0000000002cefe25 in crash_handler.handleSegfaultPosix (sig=<optimized out>, info=<optimized out>) at crash_handler.zig:800
#3  0x00000000045a1124 in WTF::jscSignalHandler (sig=11, info=0x7ffe044e30b0, ucontext=0x0) at vendor/WebKit/Source/WTF/wtf/threads/Signals.cpp:548
#4  <signal handler called>
#5  JSC::JSCell::type (this=0x0) at vendor/WebKit/Source/JavaScriptCore/runtime/JSCellInlines.h:137
#6  JSC::JSObject::getOwnNonIndexPropertySlot (this=0x150bc914fe18, vm=..., structure=0x150a0102de50, propertyName=..., slot=...) at vendor/WebKit/Source/JavaScriptCore/runtime/JSObject.h:1348
#7  JSC::JSObject::getPropertySlot<false> (this=0x150bc914fe18, globalObject=0x150b864e0088, propertyName=..., slot=...) at vendor/WebKit/Source/JavaScriptCore/runtime/JSObject.h:1433
#8  JSC::JSValue::getPropertySlot (this=0x7ffe044e4880, globalObject=0x150b864e0088, propertyName=..., slot=...) at vendor/WebKit/Source/JavaScriptCore/runtime/JSCJSValueInlines.h:1108
#9  JSC::JSValue::get (this=0x7ffe044e4880, globalObject=0x150b864e0088, propertyName=..., slot=...) at vendor/WebKit/Source/JavaScriptCore/runtime/JSCJSValueInlines.h:1065
#10 JSC::LLInt::performLLIntGetByID (bytecodeIndex=..., codeBlock=0x150b861e7740, globalObject=0x150b864e0088, baseValue=..., ident=..., metadata=...) at vendor/WebKit/Source/JavaScriptCore/llint/LLIntSlowPaths.cpp:878
#11 0x0000000004d7b055 in llint_slow_path_get_by_id (callFrame=0x7ffe044e4ab0, pc=0x150bc92ea0e7) at vendor/WebKit/Source/JavaScriptCore/llint/LLIntSlowPaths.cpp:946
#12 0x0000000003dd6042 in llint_op_get_by_id ()
#13 0x0000000000000000 in ?? ()
```

</details>

<details>
<summary>example 2</summary>

```
  #0  crash_handler.crash () at crash_handler.zig:1513
  #1  0x0000000002c5db80 in crash_handler.crashHandler (reason=..., error_return_trace=0x0, begin_addr=...) at crash_handler.zig:479
  #2  0x0000000002c59f60 in crash_handler.handleSegfaultPosix (sig=<optimized out>, info=<optimized out>) at crash_handler.zig:800
  #3  0x00000000042ecc88 in WTF::jscSignalHandler (sig=11, info=0xfffff60141b0, ucontext=0xfffff6014230) at vendor/WebKit/Source/WTF/wtf/threads/Signals.cpp:548
  #4  <signal handler called>
  #5  bun.js.api.FFIObject.Reader.u8 (globalObject=0x4000554e0088) at /var/lib/buildkite-agent/builds/ip-172-31-75-92/bun/bun/src/bun.js/api/FFIObject.zig:65
  #6  bun.js.jsc.host_fn.toJSHostCall__anon_1711576 (globalThis=0x4000554e0088, args=...) at /var/lib/buildkite-agent/builds/ip-172-31-75-92/bun/bun/src/bun.js/jsc/host_fn.zig:97
  #7  bun.js.jsc.host_fn.DOMCall("Reader"[0..6],bun.js.api.FFIObject.Reader,"u8"[0..2],.{ .reads = .{ ... }, .writes = .{ ... } }).slowpath (globalObject=0x4000554e0088, thisValue=70370172175040, arguments_ptr=0xfffff6015460, arguments_len=1) at /var/lib/buildkite-agent/builds/ip-172-31-75-92/bun/bun/src/bun.js/jsc/host_fn.zig:490
  #8  0x000040003419003c in ?? ()
  #9  0x0000400055173440 in ?? ()
```

</details>

I used GDB rather than LLDB (which is what the branch name suggests) because
GDB seems to produce more useful stack traces with musl libc.

- [x] on linux, use gdb to print a backtrace from the core dump of the main
bun process when it crashes
- [x] on linux, use gdb to print backtraces from all new core dumps (this
includes crashed bun subprocesses spawned by the test)
- [x] on all platforms, use a mini bun.report server to print a
self-reported trace (depends on oven-sh/bun.report#15; for now our
package.json points to a commit on the branch of that repo)
- [x] fix trying to fetch stack traces too early on windows
- [x] use output groups so the traces show up alongside the log for the
specific test instead of having to find it in the logs from the entire
run
- [x] get oven-sh/bun.report#15 merged, and point to a bun.report commit
on the main branch instead of the PR branch in package.json

### How did you verify your code works?

Manually, and in CI with a crashing test.

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
190n
2025-07-24 23:32:54 -07:00
committed by GitHub
parent 72a6278b3f
commit 97a530d832
5 changed files with 371 additions and 31 deletions

View File

@@ -30,6 +30,8 @@ import {
import { readFile } from "node:fs/promises";
import { availableParallelism, userInfo } from "node:os";
import { basename, dirname, extname, join, relative, sep } from "node:path";
import { createInterface } from "node:readline";
import { setTimeout as setTimeoutPromise } from "node:timers/promises";
import { parseArgs } from "node:util";
import pLimit from "./p-limit.mjs";
import {
@@ -159,6 +161,10 @@ const { values: options, positionals: filters } = parseArgs({
type: "boolean",
default: false,
},
["fail-on-coredump-or-report"]: {
type: "boolean",
default: false, // STAB-861
},
},
});
@@ -177,6 +183,25 @@ if (options["quiet"]) {
isQuiet = true;
}
// Directory where the kernel writes core files; stays undefined unless
// --coredump-upload was requested.
let coresDir;
if (options["coredump-upload"]) {
  // bootstrap.sh points this sysctl at /var/bun-cores-$distro-$release-$arch
  const corePattern = await spawnSafe({ command: "sysctl", args: ["-n", "kernel.core_pattern"] });
  if (!corePattern.ok) {
    throw new Error(`Failed to check core_pattern: ${corePattern.error}`);
  }
  // a leading "|" means the kernel pipes cores to a program instead of writing files
  if (corePattern.stdout.startsWith("|")) {
    throw new Error("cores are being piped not saved");
  }
  // strip the file-name template: /foo/bar/%e-%p.core -> /foo/bar
  coresDir = dirname(corePattern.stdout);
}
let remapPort = undefined;
/**
* @typedef {Object} TestExpectation
* @property {string} filename
@@ -450,6 +475,40 @@ async function runTests() {
}
if (!failedResults.length) {
// bun install has succeeded
const { promise: portPromise, resolve: portResolve } = Promise.withResolvers();
const { promise: errorPromise, resolve: errorResolve } = Promise.withResolvers();
console.log("run in", cwd);
let exiting = false;
const server = spawn(execPath, ["run", "ci-remap-server", execPath, cwd, getCommit()], {
stdio: ["ignore", "pipe", "inherit"],
cwd, // run in main repo
env: { ...process.env, BUN_DEBUG_QUIET_LOGS: "1", NO_COLOR: "1" },
});
server.unref();
server.on("error", errorResolve);
server.on("exit", (code, signal) => {
if (!exiting && (code !== 0 || signal !== null)) errorResolve(signal ? signal : "code " + code);
});
process.on("exit", () => {
exiting = true;
server.kill();
});
const lines = createInterface(server.stdout);
lines.on("line", line => {
portResolve({ port: parseInt(line) });
});
const result = await Promise.race([portPromise, errorPromise, setTimeoutPromise(5000, "timeout")]);
if (typeof result.port != "number") {
server.kill();
console.warn("ci-remap server did not start:", result);
} else {
console.log("crash reports parsed on port", result.port);
remapPort = result.port;
}
await Promise.all(
tests.map(testPath =>
limit(() => {
@@ -469,7 +528,7 @@ async function runTests() {
env.BUN_JSC_validateExceptionChecks = "1";
}
return runTest(title, async () => {
const { ok, error, stdout } = await spawnBun(execPath, {
const { ok, error, stdout, crashes } = await spawnBun(execPath, {
cwd: cwd,
args: [
subcommand,
@@ -482,7 +541,8 @@ async function runTests() {
stderr: parallelism > 1 ? () => {} : chunk => pipeTestStdout(process.stderr, chunk),
});
const mb = 1024 ** 3;
const stdoutPreview = stdout.slice(0, mb).split("\n").slice(0, 50).join("\n");
let stdoutPreview = stdout.slice(0, mb).split("\n").slice(0, 50).join("\n");
if (crashes) stdoutPreview += crashes;
return {
testPath: title,
ok: ok,
@@ -642,19 +702,6 @@ async function runTests() {
if (options["coredump-upload"]) {
try {
// this sysctl is set in bootstrap.sh to /var/bun-cores-$distro-$release-$arch
const sysctl = await spawnSafe({ command: "sysctl", args: ["-n", "kernel.core_pattern"] });
let coresDir = sysctl.stdout;
if (sysctl.ok) {
if (coresDir.startsWith("|")) {
throw new Error("cores are being piped not saved");
}
// change /foo/bar/%e-%p.core to /foo/bar
coresDir = dirname(sysctl.stdout);
} else {
throw new Error(`Failed to check core_pattern: ${sysctl.error}`);
}
const coresDirBase = dirname(coresDir);
const coresDirName = basename(coresDir);
const coreFileNames = readdirSync(coresDir);
@@ -760,6 +807,7 @@ async function runTests() {
* @property {number} timestamp
* @property {number} duration
* @property {string} stdout
* @property {number} [pid]
*/
/**
@@ -921,6 +969,7 @@ async function spawnSafe(options) {
stdout: buffer,
timestamp: timestamp || Date.now(),
duration: duration || 0,
pid: subprocess?.pid,
};
}
@@ -949,10 +998,16 @@ function getCombinedPath(execPath) {
return _combinedPath;
}
/**
 * Result of spawning Bun: every field of SpawnResult, plus any crash output
 * (GDB backtraces and/or reported crash traces) gathered for the run.
 * @typedef {SpawnResult & { crashes?: string }} SpawnBunResult
 */
/**
* @param {string} execPath Path to bun binary
* @param {SpawnOptions} options
* @returns {Promise<SpawnResult>}
* @returns {Promise<SpawnBunResult>}
*/
async function spawnBun(execPath, { args, cwd, timeout, env, stdout, stderr }) {
const path = getCombinedPath(execPath);
@@ -971,11 +1026,13 @@ async function spawnBun(execPath, { args, cwd, timeout, env, stdout, stderr }) {
BUN_DEBUG_QUIET_LOGS: "1",
BUN_GARBAGE_COLLECTOR_LEVEL: "1",
BUN_JSC_randomIntegrityAuditRate: "1.0",
BUN_ENABLE_CRASH_REPORTING: "0", // change this to '1' if https://github.com/oven-sh/bun/issues/13012 is implemented
BUN_RUNTIME_TRANSPILER_CACHE_PATH: "0",
BUN_INSTALL_CACHE_DIR: tmpdirPath,
SHELLOPTS: isWindows ? "igncr" : undefined, // ignore "\r" on Windows
TEST_TMPDIR: tmpdirPath, // Used in Node.js tests.
...(typeof remapPort == "number"
? { BUN_CRASH_REPORT_URL: `http://localhost:${remapPort}` }
: { BUN_ENABLE_CRASH_REPORTING: "0" }),
};
if (basename(execPath).includes("asan")) {
@@ -999,7 +1056,8 @@ async function spawnBun(execPath, { args, cwd, timeout, env, stdout, stderr }) {
bunEnv["TEMP"] = tmpdirPath;
}
try {
return await spawnSafe({
const existingCores = options["coredump-upload"] ? readdirSync(coresDir) : [];
const result = await spawnSafe({
command: execPath,
args,
cwd,
@@ -1008,6 +1066,85 @@ async function spawnBun(execPath, { args, cwd, timeout, env, stdout, stderr }) {
stdout,
stderr,
});
const newCores = options["coredump-upload"] ? readdirSync(coresDir).filter(c => !existingCores.includes(c)) : [];
let crashes = "";
if (options["coredump-upload"] && (result.signalCode !== null || newCores.length > 0)) {
// warn if the main PID crashed and we don't have a core
if (result.signalCode !== null && !newCores.some(c => c.endsWith(`${result.pid}.core`))) {
crashes += `main process killed by ${result.signalCode} but no core file found\n`;
}
if (options["fail-on-coredump-or-report"] && newCores.length > 0) {
result.ok = false;
if (!isAlwaysFailure(result.error)) result.error = "core dumped";
}
for (const coreName of newCores) {
const corePath = join(coresDir, coreName);
let out = "";
const gdb = await spawnSafe({
command: "gdb",
args: ["-batch", `--eval-command=bt`, "--core", corePath, execPath],
timeout: 240_000,
stderr: () => {},
stdout(text) {
out += text;
},
});
if (!gdb.ok) {
crashes += `failed to get backtrace from GDB: ${gdb.error}\n`;
} else {
crashes += `======== Stack trace from GDB for ${coreName}: ========\n`;
for (const line of out.split("\n")) {
// filter GDB output since it is pretty verbose
if (
line.startsWith("Program terminated") ||
line.startsWith("#") || // gdb backtrace lines start with #0, #1, etc.
line.startsWith("[Current thread is")
) {
crashes += line + "\n";
}
}
}
}
}
// Skip this if the remap server didn't work or if Bun exited normally
// (tests in which a subprocess crashed should at least set exit code 1)
if (typeof remapPort == "number" && result.exitCode !== 0) {
try {
// When Bun crashes, it exits before the subcommand it runs to upload the crash report has necessarily finished.
// So wait a little bit to make sure that the crash report has at least started uploading
// (once the server sees the /ack request then /traces will wait for any crashes to finish processing)
await setTimeoutPromise(500);
const response = await fetch(`http://localhost:${remapPort}/traces`);
if (!response.ok || response.status !== 200) throw new Error(`server responded with code ${response.status}`);
const traces = await response.json();
if (traces.length > 0) {
if (options["fail-on-coredump-or-report"]) {
result.ok = false;
if (!isAlwaysFailure(result.error)) result.error = "crash reported";
}
crashes += `${traces.length} crashes reported during this test\n`;
for (const t of traces) {
if (t.failed_parse) {
crashes += "Trace string failed to parse:\n";
crashes += t.failed_parse + "\n";
} else if (t.failed_remap) {
crashes += "Parsed trace failed to remap:\n";
crashes += JSON.stringify(t.failed_remap, null, 2) + "\n";
} else {
crashes += "================\n";
crashes += t.remap + "\n";
}
}
}
} catch (e) {
crashes += "failed to fetch traces: " + e.toString() + "\n";
}
}
if (crashes.length > 0) result.crashes = crashes;
return result;
} finally {
try {
rmSync(tmpdirPath, { recursive: true, force: true });
@@ -1093,7 +1230,7 @@ async function spawnBunTest(execPath, testPath, options = { cwd }) {
env.BUN_JSC_validateExceptionChecks = "1";
}
const { ok, error, stdout } = await spawnBun(execPath, {
const { ok, error, stdout, crashes } = await spawnBun(execPath, {
args: isReallyTest ? testArgs : [...args, absPath],
cwd: options["cwd"],
timeout: isReallyTest ? timeout : 30_000,
@@ -1101,7 +1238,8 @@ async function spawnBunTest(execPath, testPath, options = { cwd }) {
stdout: options.stdout,
stderr: options.stderr,
});
const { tests, errors, stdout: stdoutPreview } = parseTestStdout(stdout, testPath);
let { tests, errors, stdout: stdoutPreview } = parseTestStdout(stdout, testPath);
if (crashes) stdoutPreview += crashes;
// If we generated a JUnit file and we're on BuildKite, upload it immediately
if (junitFilePath && isReallyTest && isBuildkite && cliOptions["junit-upload"]) {
@@ -1276,11 +1414,12 @@ function parseTestStdout(stdout, testPath) {
* @returns {Promise<TestResult>}
*/
async function spawnBunInstall(execPath, options) {
const { ok, error, stdout, duration } = await spawnBun(execPath, {
let { ok, error, stdout, duration, crashes } = await spawnBun(execPath, {
args: ["install"],
timeout: testTimeout,
...options,
});
if (crashes) stdout += crashes;
const relativePath = relative(cwd, options.cwd);
const testPath = join(relativePath, "package.json");
const status = ok ? "pass" : "fail";
@@ -2015,7 +2154,9 @@ function isAlwaysFailure(error) {
error.includes("segmentation fault") ||
error.includes("illegal instruction") ||
error.includes("sigtrap") ||
error.includes("error: addresssanitizer")
error.includes("error: addresssanitizer") ||
error.includes("core dumped") ||
error.includes("crash reported")
);
}