Compare commits

...

12 Commits

Author SHA1 Message Date
autofix-ci[bot]
a07c319887 [autofix.ci] apply automated fixes 2025-12-30 17:44:28 +00:00
Sosuke Suzuki
f160cc91a2 fix(jsonl): add RELEASE_AND_RETURN for proper exception scope handling 2025-12-31 02:42:50 +09:00
Sosuke Suzuki
7a831b71c9 fix(jsonl): replace UNLIKELY macro with [[unlikely]] for Windows compatibility 2025-12-31 01:52:55 +09:00
autofix-ci[bot]
26ea6a1251 [autofix.ci] apply automated fixes 2025-12-30 16:44:22 +00:00
Sosuke Suzuki
afe0c17dae perf(jsonl): add ASCII fast path using LiteralParser<Latin1Character>
- Use charactersAreAllASCII() for fast SIMD ASCII detection
- For ASCII-only data, parse directly as Latin1 to skip UTF-16 conversion
- Use memchr for SIMD-optimized newline search
- Fall back to UTF-16 path for non-ASCII content
2025-12-31 01:38:04 +09:00
Sosuke Suzuki
765b34992a perf(jsonl): use Bun's SIMD-accelerated UTF-8 conversion
Replace WTF::String::fromUTF8() with Bun__encoding__toStringUTF8()
for ~12x faster UTF-8 to UTF-16 conversion.

Before: UTF-8 conversion took 68% of total time (35.9ms for 16MB file)
After:  UTF-8 conversion takes 10% of total time (2.9ms for 16MB file)

Native JSONL parsing is now on par with JS blob.text() + JSON.parse().
2025-12-31 00:33:44 +09:00
Sosuke Suzuki
0312ac0285 perf(jsonl): add ASCII fast path to skip UTF-8 conversion
For ASCII-only lines (common in JSON), create StringView directly from
raw bytes as Latin1 instead of converting through WTF::String::fromUTF8().

Uses word-sized operations to quickly check if line is ASCII by testing
8 bytes at a time for high bit.

Also removes timing instrumentation that was adding ~30% overhead.
2025-12-30 22:15:35 +09:00
Sosuke Suzuki
295162ab0c perf(jsonl): pure C++ implementation with MarkedArgumentBuffer
Move all JSONL parsing logic to C++ for better performance:
- Use MarkedArgumentBuffer for GC-safe value collection
- Scan newlines with memchr (typically SIMD-optimized)
- Handle BOM, CRLF, whitespace-only lines in C++
- Simplify Zig code from ~70 lines to ~15 lines

Performance improvements:
- Small objects 100k lines: 2.41x faster (was 1.24x)
- Small strings 500k lines: 2.48x faster (was JS 1.07x faster)
- Session files (large JSON): now competitive with JS
2025-12-30 19:46:04 +09:00
Sosuke Suzuki
caeb43f3cd bench: add JSONL parsing benchmarks
- jsonl-comparison.ts: Compare native vs JS parsing with file I/O
- jsonl-memory-bench.ts: In-memory benchmark without I/O variance
- jsonl-tiny-objects-bench.ts: Stress test with many small objects
- jsonl-timing-test.js: Simple timing test
2025-12-30 18:50:07 +09:00
Sosuke Suzuki
7555710057 perf(jsonl): batch parse all lines in single C++ call
Reduce Zig↔C++ boundary crossings from N (number of lines) to 1
by parsing all JSON lines in a single Bun__parseJSONLines call.

This eliminates per-line overhead from Strong.Optional GC wrappers
and improves performance for files with many small JSON objects.
2025-12-30 18:47:42 +09:00
Sosuke Suzuki
ec4c0bf888 fix(jsonl): use Strong.Optional for GC safety
Address CodeRabbit review: wrap JSValue in jsc.Strong.Optional when
storing in ArrayList to prevent GC from collecting parsed JSON objects
during the parsing loop.
2025-12-30 17:15:18 +09:00
Claude Bot
25f334805b feat(io): add Bun.file().jsonl() for parsing JSONL files
Add a new `jsonl()` method to Blob that parses JSONL (JSON Lines) format
and returns `Promise<T[]>`. This provides a high-performance way to parse
JSONL files.

Features:
- Returns Promise<T[]> with parsed JSON objects
- Automatically removes BOM
- Skips empty lines and whitespace-only lines
- Silently skips lines with JSON parse errors
- Handles both LF and CRLF line endings
- Handles files with or without trailing newline
- Works with Bun.file() and Blob

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-30 07:42:57 +00:00
9 changed files with 841 additions and 0 deletions

View File

@@ -0,0 +1,170 @@
// Benchmark comparing Bun.file().jsonl() vs TypeScript implementation
import { bench, group, run } from "mitata";
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
import path from "node:path";
// User's TypeScript implementation
// PowerShell 5.x and some Windows tools prepend a UTF-8 BOM; drop it if present.
const UTF8_BOM = "\ufeff";
function stripBOM(content: string): string {
  if (content.startsWith(UTF8_BOM)) {
    return content.slice(1);
  }
  return content;
}
// Reference JSONL reader built on node:fs: read the whole file, split on '\n',
// JSON.parse each non-blank line, and log (but skip) malformed lines.
async function readJSONLFile<T>(filePath: string): Promise<T[]> {
  try {
    let content = await readFile(filePath, "utf8");
    if (!content.trim()) return [];
    // Strip BOM from the beginning of the file - PowerShell 5.x adds BOM to UTF-8 files
    content = stripBOM(content);
    const entries: T[] = [];
    for (const line of content.split("\n")) {
      if (!line.trim()) continue;
      try {
        entries.push(JSON.parse(line) as T);
      } catch (err) {
        console.error(`Error parsing line in ${filePath}: ${err}`);
      }
    }
    return entries;
  } catch (err) {
    console.error(`Error opening file ${filePath}: ${err}`);
    return [];
  }
}
// Thin wrapper over the native Bun.file().jsonl() implementation under test.
async function readJSONLFileBunJSONL<T>(filePath: string): Promise<T[]> {
  return await Bun.file(filePath).jsonl();
}
// Alternative TypeScript implementation using Bun.file().text()
// Alternative TypeScript implementation using Bun.file().text()
async function readJSONLFileBunText<T>(filePath: string): Promise<T[]> {
  try {
    let content = await Bun.file(filePath).text();
    if (!content.trim()) return [];
    content = stripBOM(content);
    const out: T[] = [];
    for (const line of content.split("\n")) {
      if (!line.trim()) continue;
      try {
        out.push(JSON.parse(line) as T);
      } catch {
        // invalid lines are silently dropped, matching the native behavior
      }
    }
    return out;
  } catch {
    return [];
  }
}
// Setup test data directory
const BENCH_DIR = path.join(import.meta.dir, ".jsonl-bench-data");
// Shape of one synthetic benchmark record.
interface TestRecord {
  id: number;
  name: string;
  email: string;
  timestamp: number;
  data: { key: string; value: number };
}

// Build the record for index `i`; all fields derive from `i` except the
// wall-clock timestamp.
function generateRecord(i: number): TestRecord {
  const record: TestRecord = {
    id: i,
    name: `User_${i}`,
    email: `user${i}@example.com`,
    timestamp: Date.now(),
    data: { key: `key_${i}`, value: i * 100 },
  };
  return record;
}
// Create every fixture file the benchmark groups below read:
// plain files of several sizes, a BOM-prefixed file, a file mixing blank /
// whitespace-only / invalid lines, and a CRLF-terminated file.
async function setup() {
  await mkdir(BENCH_DIR, { recursive: true });
  // Generate test files of various sizes
  const sizes = [10, 100, 1000, 10000, 100000];
  for (const size of sizes) {
    const lines: string[] = [];
    for (let i = 0; i < size; i++) {
      lines.push(JSON.stringify(generateRecord(i)));
    }
    // join + trailing newline => one JSON document per line (JSONL)
    await writeFile(path.join(BENCH_DIR, `data-${size}.jsonl`), lines.join("\n") + "\n");
  }
  // File with BOM
  const bomContent = "\ufeff" + [0, 1, 2].map(i => JSON.stringify(generateRecord(i))).join("\n") + "\n";
  await writeFile(path.join(BENCH_DIR, "data-bom.jsonl"), bomContent);
  // File with empty lines and invalid JSON
  const mixedContent = [
    JSON.stringify(generateRecord(0)),
    "",
    " ",
    "invalid json here",
    JSON.stringify(generateRecord(1)),
    "\t\t",
    JSON.stringify(generateRecord(2)),
  ].join("\n");
  await writeFile(path.join(BENCH_DIR, "data-mixed.jsonl"), mixedContent);
  // File with CRLF
  const crlfContent = [0, 1, 2].map(i => JSON.stringify(generateRecord(i))).join("\r\n") + "\r\n";
  await writeFile(path.join(BENCH_DIR, "data-crlf.jsonl"), crlfContent);
  console.log("Setup complete. Test files created in:", BENCH_DIR);
}
// Delete the benchmark data directory and everything inside it.
async function cleanup() {
  await rm(BENCH_DIR, { force: true, recursive: true });
}
// Create fixtures, register every benchmark group, run them, then clean up.
async function runBenchmarks() {
  await setup();
  const lineCounts = [10, 100, 1000, 10000, 100000];
  for (const lineCount of lineCounts) {
    const file = path.join(BENCH_DIR, `data-${lineCount}.jsonl`);
    group(`JSONL parsing (${lineCount} lines)`, () => {
      bench("Bun.file().jsonl() [native]", async () => {
        await readJSONLFileBunJSONL(file);
      });
      bench("readJSONLFile (node:fs)", async () => {
        await readJSONLFile(file);
      });
      bench("readJSONLFile (Bun.file)", async () => {
        await readJSONLFileBunText(file);
      });
    });
  }
  // Edge cases
  group("Edge cases - BOM handling", () => {
    const bomFile = path.join(BENCH_DIR, "data-bom.jsonl");
    bench("Bun.file().jsonl() [native]", async () => {
      await readJSONLFileBunJSONL(bomFile);
    });
    bench("readJSONLFile (node:fs)", async () => {
      await readJSONLFile(bomFile);
    });
  });
  await run();
  await cleanup();
}
runBenchmarks().catch(console.error);

View File

@@ -0,0 +1,132 @@
// Benchmark JSONL parsing performance without file I/O overhead
import { bench, group, run } from "mitata";
// Shape of one synthetic benchmark record.
interface TestRecord {
  id: number;
  name: string;
  email: string;
  timestamp: number;
  data: { key: string; value: number };
}

// Build the record for index `i`; everything but the timestamp is derived
// deterministically from `i`.
function generateRecord(i: number): TestRecord {
  const record: TestRecord = {
    id: i,
    name: `User_${i}`,
    email: `user${i}@example.com`,
    timestamp: Date.now(),
    data: { key: `key_${i}`, value: i * 100 },
  };
  return record;
}
// Serialize `lineCount` records as newline-delimited JSON with a trailing '\n'.
function generateJSONLContent(lineCount: number): string {
  const rows: string[] = [];
  for (let i = 0; i < lineCount; i++) {
    rows.push(JSON.stringify(generateRecord(i)));
  }
  return rows.join("\n") + "\n";
}
// TypeScript implementation using Blob.text()
// TypeScript implementation using Blob.text()
async function parseJSONLWithText<T>(blob: Blob): Promise<T[]> {
  const content = await blob.text();
  if (!content.trim()) return [];
  const parsed: T[] = [];
  for (const line of content.split("\n")) {
    if (!line.trim()) continue;
    try {
      parsed.push(JSON.parse(line) as T);
    } catch {
      // lines that are not valid JSON are skipped
    }
  }
  return parsed;
}
// Native Bun.file().jsonl() equivalent via Blob
// A Blob extended with Bun's non-standard jsonl() method. Casting through this
// structural type (instead of `any`) keeps the call and return type checked.
type JSONLBlob = Blob & { jsonl<T>(): Promise<T[]> };

/**
 * Native Bun.file().jsonl() equivalent via Blob.
 * Only valid under Bun, where Blob.prototype.jsonl exists.
 */
async function parseJSONLNative<T>(blob: Blob): Promise<T[]> {
  return (blob as JSONLBlob).jsonl<T>();
}
// Sync-like TypeScript implementation (text is already available)
// Sync-like TypeScript implementation (text is already available)
function parseJSONLSync<T>(content: string): T[] {
  if (!content.trim()) return [];
  const values: T[] = [];
  for (const line of content.split("\n")) {
    if (!line.trim()) continue;
    try {
      values.push(JSON.parse(line) as T);
    } catch {
      // malformed lines are skipped
    }
  }
  return values;
}
// Register all in-memory benchmark groups and run them.
// NOTE(review): `textContent` is awaited BEFORE the group is registered on
// purpose — mitata group callbacks are synchronous, so the sync benchmark needs
// the text materialized up front. Keep that ordering.
async function runBenchmarks() {
  const sizes = [100, 1000, 10000, 100000];
  for (const size of sizes) {
    const content = generateJSONLContent(size);
    const blob = new Blob([content]);
    // Pre-warm the blob text for sync comparison
    const textContent = await blob.text();
    group(`JSONL parsing ${size} lines (in-memory)`, () => {
      bench("Blob.jsonl() [native]", async () => {
        // Create new blob each time to avoid caching effects
        const b = new Blob([content]);
        await parseJSONLNative(b);
      });
      bench("Blob.text() + JS parse", async () => {
        const b = new Blob([content]);
        await parseJSONLWithText(b);
      });
      bench("String split + JSON.parse (sync)", () => {
        parseJSONLSync(textContent);
      });
    });
  }
  // Test with varying line lengths
  group("JSONL with large objects (1000 lines)", () => {
    const largeObjects = Array.from({ length: 1000 }, (_, i) => ({
      id: i,
      name: `User_${i}`,
      description: "A".repeat(500), // 500 char string
      metadata: {
        key1: "value1",
        key2: "value2",
        key3: "value3",
        nested: { a: 1, b: 2, c: 3 },
      },
    }));
    const content = largeObjects.map(o => JSON.stringify(o)).join("\n") + "\n";
    // Content is already a string here, so no pre-warm await is needed.
    const textContent = content;
    bench("Blob.jsonl() [native]", async () => {
      const b = new Blob([content]);
      await parseJSONLNative(b);
    });
    bench("Blob.text() + JS parse", async () => {
      const b = new Blob([content]);
      await parseJSONLWithText(b);
    });
    bench("String split + JSON.parse (sync)", () => {
      parseJSONLSync(textContent);
    });
  });
  await run();
}
runBenchmarks().catch(console.error);

View File

@@ -0,0 +1,43 @@
// Test script for JSONL timing
const fs = require("fs");
const path = require("path");
const testDir = path.join(__dirname, ".jsonl-bench-data");
// Create test data if it doesn't exist
// (one JSONL fixture per size; each line is a small JSON object keyed by index)
if (!fs.existsSync(testDir)) {
  fs.mkdirSync(testDir, { recursive: true });
  const sizes = [1000, 10000, 100000];
  for (const size of sizes) {
    const lines = [];
    for (let i = 0; i < size; i++) {
      lines.push(
        JSON.stringify({
          id: i,
          name: `User_${i}`,
          email: `user${i}@example.com`,
          timestamp: Date.now(),
          data: { key: `key_${i}`, value: i * 100 },
        }),
      );
    }
    // trailing "\n" gives the canonical one-document-per-line JSONL layout
    fs.writeFileSync(path.join(testDir, `data-${size}.jsonl`), lines.join("\n") + "\n");
    console.log(`Created data-${size}.jsonl`);
  }
}
// Run tests
// Parse each generated fixture with the native jsonl() and report item counts.
async function main() {
  for (const size of [1000, 10000, 100000]) {
    const filePath = path.join(testDir, `data-${size}.jsonl`);
    console.log(`\n>>> Testing ${size} lines...`);
    const result = await Bun.file(filePath).jsonl();
    console.log(`Result: ${result.length} items parsed`);
  }
}

View File

@@ -0,0 +1,120 @@
// Benchmark designed to maximize native implementation advantage
// Small JSON objects = minimal parse time, maximum boundary-crossing overhead ratio
import { bench, group, run } from "mitata";
// Generate tiny JSON objects - minimal parse time per object
// Generate tiny JSON objects - minimal parse time per object
function generateTinyJSONL(lineCount: number): string {
  const rows = Array.from({ length: lineCount }, (_, i) => `{"i":${i}}`);
  return rows.join("\n") + "\n";
}
// Even smaller - just numbers
// Even smaller - just numbers, one per line
function generateNumbersJSONL(lineCount: number): string {
  const rows = Array.from({ length: lineCount }, (_, i) => String(i));
  return rows.join("\n") + "\n";
}
// Small strings
// Small quoted strings, one per line
function generateStringsJSONL(lineCount: number): string {
  const rows = Array.from({ length: lineCount }, (_, i) => `"s${i}"`);
  return rows.join("\n") + "\n";
}
// TypeScript implementation
// TypeScript implementation: Blob.text() then per-line JSON.parse
async function parseJSONLWithText<T>(blob: Blob): Promise<T[]> {
  const content = await blob.text();
  if (!content.trim()) return [];
  const parsed: T[] = [];
  for (const line of content.split("\n")) {
    if (!line.trim()) continue;
    try {
      parsed.push(JSON.parse(line) as T);
    } catch {
      // invalid JSON lines are skipped
    }
  }
  return parsed;
}
// Native Blob.jsonl()
// A Blob extended with Bun's non-standard jsonl() method. Casting through this
// structural type (instead of `any`) keeps the call and return type checked.
type BlobWithJSONL = Blob & { jsonl<T>(): Promise<T[]> };

/**
 * Native Blob.jsonl(). Only valid under Bun, where the extension exists.
 */
async function parseJSONLNative<T>(blob: Blob): Promise<T[]> {
  return (blob as BlobWithJSONL).jsonl<T>();
}
// Register three families of groups (tiny objects, bare numbers, small
// strings) at increasing line counts, then run mitata. Each bench rebuilds its
// Blob per iteration so Blob-internal caching cannot skew results.
async function runBenchmarks() {
  console.log("=== Native vs JS: Small Objects Benchmark ===");
  console.log("Goal: Maximize boundary-crossing overhead ratio\n");
  // Test with very high line counts and tiny objects
  const sizes = [10_000, 50_000, 100_000, 500_000, 1_000_000];
  // Tiny objects: {"i":N}
  for (const size of sizes) {
    const content = generateTinyJSONL(size);
    const sizeKB = (content.length / 1024).toFixed(1);
    group(`Tiny objects {"i":N} - ${size / 1000}k lines (${sizeKB} KB)`, () => {
      bench("Blob.jsonl() [native]", async () => {
        const b = new Blob([content]);
        await parseJSONLNative(b);
      });
      bench("Blob.text() + JS parse", async () => {
        const b = new Blob([content]);
        await parseJSONLWithText(b);
      });
    });
  }
  // Plain numbers - absolute minimum parse time
  const numberSizes = [100_000, 500_000, 1_000_000];
  for (const size of numberSizes) {
    const content = generateNumbersJSONL(size);
    const sizeKB = (content.length / 1024).toFixed(1);
    group(`Plain numbers - ${size / 1000}k lines (${sizeKB} KB)`, () => {
      bench("Blob.jsonl() [native]", async () => {
        const b = new Blob([content]);
        await parseJSONLNative(b);
      });
      bench("Blob.text() + JS parse", async () => {
        const b = new Blob([content]);
        await parseJSONLWithText(b);
      });
    });
  }
  // Small strings (reuses numberSizes — intentional, same line counts)
  for (const size of numberSizes) {
    const content = generateStringsJSONL(size);
    const sizeKB = (content.length / 1024).toFixed(1);
    group(`Small strings "sN" - ${size / 1000}k lines (${sizeKB} KB)`, () => {
      bench("Blob.jsonl() [native]", async () => {
        const b = new Blob([content]);
        await parseJSONLNative(b);
      });
      bench("Blob.text() + JS parse", async () => {
        const b = new Blob([content]);
        await parseJSONLWithText(b);
      });
    });
  }
  await run();
}
runBenchmarks().catch(console.error);

View File

@@ -60,7 +60,9 @@
#include "JavaScriptCore/JSModuleRecord.h"
#include "JavaScriptCore/JSNativeStdFunction.h"
#include "JavaScriptCore/JSONObject.h"
#include "JavaScriptCore/LiteralParser.h"
#include "JavaScriptCore/JSObject.h"
#include <wtf/text/ASCIIFastPath.h>
#include "JavaScriptCore/JSSet.h"
#include "JavaScriptCore/Strong.h"
#include "JavaScriptCore/JSSetIterator.h"
@@ -2195,6 +2197,189 @@ extern "C" JSC::EncodedJSValue ZigString__toJSONObject(const ZigString* strPtr,
return JSValue::encode(result);
}
// Parse JSONL content entirely in C++ - no Zig offset/length arrays needed.
// Forward declaration for Bun's optimized UTF-8 to string conversion
extern "C" JSC::EncodedJSValue Bun__encoding__toStringUTF8(const uint8_t* input, size_t len, JSC::JSGlobalObject* globalObject);
// Helper to find newline in byte array using memchr (SIMD-optimized)
// Locate the next '\n' in [start, end) via memchr (typically SIMD-accelerated).
// Returns the absolute index into `data`, or WTF::notFound when absent.
static inline size_t findNewline(const uint8_t* data, size_t start, size_t end)
{
    if (start >= end)
        return notFound;
    const uint8_t* hit = static_cast<const uint8_t*>(memchr(data + start, '\n', end - start));
    return hit ? static_cast<size_t>(hit - data) : notFound;
}
// Check if a line is whitespace-only (for 8-bit data)
static inline bool isWhitespaceOnlyLine8(const Latin1Character* data, size_t start, size_t len)
{
Latin1Character firstChar = data[start];
if (firstChar != ' ' && firstChar != '\t') return false;
for (size_t i = start; i < start + len; i++) {
Latin1Character c = data[i];
if (c != ' ' && c != '\t') return false;
}
return true;
}
// Uses MarkedArgumentBuffer for GC-safe value collection.
// Optimized: For ASCII-only data, parses directly from UTF-8 using LiteralParser<Latin1Character>
// to avoid UTF-16 conversion overhead.
// Parse a UTF-8 JSONL buffer into a JS array of values.
// Contract (mirrors the JS-side tests): strips a UTF-8 BOM, splits on '\n',
// tolerates CRLF, skips empty / space-tab-only lines, and silently skips
// lines that fail to parse. Parsed values are held in a MarkedArgumentBuffer
// so the GC cannot collect them before the final array is constructed.
extern "C" JSC::EncodedJSValue Bun__parseJSONLFromBlob(
    JSC::JSGlobalObject* globalObject,
    const uint8_t* data,
    size_t size)
{
    auto& vm = globalObject->vm();
    auto scope = DECLARE_THROW_SCOPE(vm);
    // Handle BOM (Byte Order Mark)
    size_t offset = 0;
    if (size >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
        offset = 3; // UTF-8 BOM
    }
    // Empty (or BOM-only) input -> empty array.
    if (size <= offset) {
        RELEASE_AND_RETURN(scope, JSValue::encode(constructEmptyArray(globalObject, nullptr)));
    }
    const uint8_t* contentStart = data + offset;
    size_t contentSize = size - offset;
    // Use MarkedArgumentBuffer for GC-safe collection of parsed values
    MarkedArgumentBuffer args;
    // Check if content is ASCII-only (fast SIMD check)
    std::span<const uint8_t> contentSpan(contentStart, contentSize);
    bool isAllASCII = charactersAreAllASCII(contentSpan);
    if (isAllASCII) {
        // Fast path: ASCII-only data can be parsed directly as Latin1
        // UTF-8 ASCII bytes are identical to Latin1 encoding
        const Latin1Character* latin1Data = reinterpret_cast<const Latin1Character*>(contentStart);
        size_t pos = 0;
        while (pos < contentSize) {
            // Find newline
            size_t newlinePos = findNewline(contentStart, pos, contentSize);
            size_t lineEnd = (newlinePos == notFound) ? contentSize : newlinePos;
            // Handle CRLF
            if (lineEnd > pos && latin1Data[lineEnd - 1] == '\r') {
                lineEnd--;
            }
            size_t lineLen = lineEnd - pos;
            if (lineLen > 0 && !isWhitespaceOnlyLine8(latin1Data, pos, lineLen)) {
                // Use LiteralParser directly with Latin1 data (8-bit fast path)
                std::span<const Latin1Character> lineSpan(latin1Data + pos, lineLen);
                LiteralParser<Latin1Character, JSONReviverMode::Disabled> parser(globalObject, lineSpan, StrictJSON);
                JSValue parsed = parser.tryLiteralParse();
                // Invalid lines are skipped: clear any pending parse exception so
                // the remaining lines (and the throw scope) stay usable.
                if (scope.exception()) {
                    scope.clearException();
                } else if (parsed) {
                    args.append(parsed);
                }
            }
            // Advance past the newline (or terminate on the final, unterminated line).
            pos = (newlinePos == notFound) ? contentSize : newlinePos + 1;
        }
    } else {
        // Slow path: Contains non-ASCII, need UTF-16 conversion
        JSValue jsStringValue = JSValue::decode(Bun__encoding__toStringUTF8(contentStart, contentSize, globalObject));
        // NOTE(review): no RETURN_IF_EXCEPTION after the conversion call — this
        // assumes Bun__encoding__toStringUTF8 signals failure via a non-string
        // result rather than a pending exception; confirm against its impl.
        if (!jsStringValue || !jsStringValue.isString()) {
            return JSValue::encode(constructEmptyArray(globalObject, nullptr));
        }
        JSString* jsString = jsCast<JSString*>(jsStringValue);
        auto fullString = jsString->value(globalObject);
        RETURN_IF_EXCEPTION(scope, {});
        StringView fullView = fullString;
        size_t pos = 0;
        size_t length = fullView.length();
        // Check if the converted string is 8-bit (Latin1)
        // Even with non-ASCII UTF-8, if all chars fit in Latin1, we can use 8-bit path
        bool use8BitPath = fullView.is8Bit();
        while (pos < length) {
            size_t newlinePos = fullView.find('\n', pos);
            size_t lineEnd = (newlinePos == notFound) ? length : newlinePos;
            // Handle CRLF
            if (lineEnd > pos && fullView[lineEnd - 1] == '\r') {
                lineEnd--;
            }
            size_t lineLen = lineEnd - pos;
            if (lineLen > 0) {
                // Whitespace-only detection: only space and tab count, matching
                // the ASCII fast path above.
                bool isWhitespaceOnly = false;
                if (use8BitPath) {
                    Latin1Character firstChar = fullView.span8()[pos];
                    if (firstChar == ' ' || firstChar == '\t') {
                        isWhitespaceOnly = true;
                        for (size_t i = pos; i < pos + lineLen; i++) {
                            Latin1Character c = fullView.span8()[i];
                            if (c != ' ' && c != '\t') {
                                isWhitespaceOnly = false;
                                break;
                            }
                        }
                    }
                } else {
                    UChar firstChar = fullView[pos];
                    if (firstChar == ' ' || firstChar == '\t') {
                        isWhitespaceOnly = true;
                        for (size_t i = pos; i < pos + lineLen; i++) {
                            UChar c = fullView[i];
                            if (c != ' ' && c != '\t') {
                                isWhitespaceOnly = false;
                                break;
                            }
                        }
                    }
                }
                if (!isWhitespaceOnly) {
                    JSValue parsed;
                    if (use8BitPath) {
                        // Use LiteralParser directly with 8-bit data
                        std::span<const Latin1Character> lineSpan(fullView.span8().data() + pos, lineLen);
                        LiteralParser<Latin1Character, JSONReviverMode::Disabled> parser(globalObject, lineSpan, StrictJSON);
                        parsed = parser.tryLiteralParse();
                    } else {
                        // Use LiteralParser with 16-bit data
                        std::span<const char16_t> lineSpan(fullView.span16().data() + pos, lineLen);
                        LiteralParser<char16_t, JSONReviverMode::Disabled> parser(globalObject, lineSpan, StrictJSON);
                        parsed = parser.tryLiteralParse();
                    }
                    // Same skip-on-error policy as the fast path.
                    if (scope.exception()) {
                        scope.clearException();
                    } else if (parsed) {
                        args.append(parsed);
                    }
                }
            }
            pos = (newlinePos == notFound) ? length : newlinePos + 1;
        }
    }
    // MarkedArgumentBuffer signals allocation failure via hasOverflowed().
    if (args.hasOverflowed()) [[unlikely]] {
        throwOutOfMemoryError(globalObject, scope);
        return {};
    }
    RELEASE_AND_RETURN(scope, JSValue::encode(constructArray(globalObject, static_cast<ArrayAllocationProfile*>(nullptr), args)));
}
// We used to just throw "Out of memory" as a regular Error with that string.
//
// But JSC has some different handling for out of memory errors. So we should

View File

@@ -61,6 +61,7 @@ CPP_DECL JSC::EncodedJSValue ZigString__toRangeErrorInstance(const ZigString* ar
CPP_DECL JSC::EncodedJSValue ZigString__toSyntaxErrorInstance(const ZigString* arg0, JSC::JSGlobalObject* arg1);
CPP_DECL JSC::EncodedJSValue ZigString__toTypeErrorInstance(const ZigString* arg0, JSC::JSGlobalObject* arg1);
CPP_DECL JSC::EncodedJSValue ZigString__toValueGC(const ZigString* arg0, JSC::JSGlobalObject* arg1);
CPP_DECL JSC::EncodedJSValue Bun__parseJSONLFromBlob(JSC::JSGlobalObject* arg0, const uint8_t* arg1, size_t arg2);
CPP_DECL WebCore::DOMURL* WebCore__DOMURL__cast_(JSC::EncodedJSValue JSValue0, JSC::VM* arg1);
CPP_DECL BunString WebCore__DOMURL__fileSystemPath(WebCore::DOMURL* arg0, int* errorCode);
CPP_DECL void WebCore__DOMURL__href_(WebCore::DOMURL* arg0, ZigString* arg1);

View File

@@ -3602,6 +3602,62 @@ pub fn toJSONWithBytes(this: *Blob, global: *JSGlobalObject, raw_bytes: []const
return ZigString.init(buf).toJSONObject(global);
}
// ===== JSONL Support =====
/// JS-visible entry point for Blob.prototype.jsonl() (bound as async in the
/// generated class table). The call frame is unused; delegates to the
/// share-lifetime wrapper below.
pub fn getJSONL(
    this: *Blob,
    globalThis: *jsc.JSGlobalObject,
    _: *jsc.CallFrame,
) bun.JSError!jsc.JSValue {
    return this.getJSONLShare(globalThis);
}
/// Wraps toJSONL in a JSPromise with the `.share` lifetime.
/// The store is ref'd/deref'd around the call so the backing bytes cannot be
/// freed while parsing is in flight.
pub fn getJSONLShare(
    this: *Blob,
    globalObject: *jsc.JSGlobalObject,
) bun.JSTerminated!jsc.JSValue {
    const store = this.store;
    if (store) |st| st.ref();
    defer if (store) |st| st.deref();
    return jsc.JSPromise.wrap(globalObject, lifetimeWrap(toJSONL, .share), .{ this, globalObject });
}
/// Dispatch by blob backing: file-backed and S3-backed blobs are read
/// asynchronously with toJSONLWithBytes as the continuation; in-memory blobs
/// parse their shared view directly.
pub fn toJSONL(this: *Blob, global: *JSGlobalObject, comptime lifetime: Lifetime) bun.JSError!JSValue {
    if (this.needsToReadFile()) {
        return this.doReadFile(toJSONLWithBytes, global);
    }
    if (this.isS3()) {
        return this.doReadFromS3(toJSONLWithBytes, global);
    }
    const view_ = this.sharedView();
    return toJSONLWithBytes(this, global, view_, lifetime);
}
// Pure C++ JSONL parsing - all processing happens in C++ for efficiency
extern fn Bun__parseJSONLFromBlob(
    globalObject: *JSGlobalObject,
    data: [*]const u8,
    size: usize,
) JSValue;

/// Parse `raw_bytes` as JSONL and return a JS array of the parsed values.
/// With a `.temporary` lifetime the bytes are owned here and freed on exit.
pub fn toJSONLWithBytes(_: *Blob, global: *JSGlobalObject, raw_bytes: []const u8, comptime lifetime: Lifetime) bun.JSError!JSValue {
    defer if (comptime lifetime == .temporary) bun.default_allocator.free(@constCast(raw_bytes));
    if (raw_bytes.len == 0) {
        return jsc.JSArray.createEmpty(global, 0);
    }
    // All processing (BOM handling, line scanning, JSON parsing) happens in C++
    const result = Bun__parseJSONLFromBlob(global, raw_bytes.ptr, raw_bytes.len);
    // C++ returns .zero on exception
    if (result == .zero) {
        return error.JSError;
    }
    return result;
}
pub fn toFormDataWithBytes(this: *Blob, global: *JSGlobalObject, buf: []u8, comptime _: Lifetime) JSValue {
var encoder = this.getFormDataEncoding() orelse return {
return ZigString.init("Invalid encoding").toErrorInstance(global);

View File

@@ -152,6 +152,7 @@ export default [
proto: {
text: { fn: "getText", async: true },
json: { fn: "getJSON", async: true },
jsonl: { fn: "getJSONL", async: true },
arrayBuffer: { fn: "getArrayBuffer", async: true },
slice: { fn: "getSlice", length: 2 },
stream: { fn: "getStream", length: 1 },

View File

@@ -0,0 +1,133 @@
import { describe, expect, test } from "bun:test";
import { tempDir } from "harness";
// End-to-end tests for the native JSONL parser, exercised through real files
// (tempDir from the test harness) and through Blob directly. These pin the
// contract the C++ implementation must keep: BOM stripping, CRLF, missing
// trailing newline, blank/whitespace/invalid-line skipping, and every JSON
// value kind (object, array, string, number, null, boolean, nested, unicode).
describe("Bun.file().jsonl()", () => {
  test("parses basic JSONL file", async () => {
    using dir = tempDir("jsonl-basic", {
      "data.jsonl": '{"a":1}\n{"b":2}\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });
  test("returns empty array for empty file", async () => {
    using dir = tempDir("jsonl-empty-file", {
      "data.jsonl": "",
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([]);
  });
  test("handles CRLF line endings", async () => {
    using dir = tempDir("jsonl-crlf", {
      "data.jsonl": '{"a":1}\r\n{"b":2}\r\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });
  test("handles last line without newline", async () => {
    using dir = tempDir("jsonl-no-trailing", {
      "data.jsonl": '{"a":1}\n{"b":2}',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });
  test("skips empty lines", async () => {
    using dir = tempDir("jsonl-empty-lines", {
      "data.jsonl": '{"a":1}\n\n{"b":2}\n\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });
  test("skips whitespace-only lines", async () => {
    // Only spaces and tabs count as whitespace in the native implementation.
    using dir = tempDir("jsonl-whitespace-lines", {
      "data.jsonl": '{"a":1}\n \n{"b":2}\n\t\t\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });
  test("skips invalid JSON lines", async () => {
    // Parse failures are swallowed silently — no error, no placeholder entry.
    using dir = tempDir("jsonl-invalid", {
      "data.jsonl": '{"a":1}\ninvalid json\n{"b":2}\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });
  test("handles BOM", async () => {
    using dir = tempDir("jsonl-bom", {
      "data.jsonl": '\ufeff{"a":1}\n{"b":2}\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });
  test("handles arrays as JSON values", async () => {
    using dir = tempDir("jsonl-arrays", {
      "data.jsonl": '[1,2,3]\n["a","b"]\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([
      [1, 2, 3],
      ["a", "b"],
    ]);
  });
  test("handles strings as JSON values", async () => {
    using dir = tempDir("jsonl-strings", {
      "data.jsonl": '"hello"\n"world"\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual(["hello", "world"]);
  });
  test("handles numbers as JSON values", async () => {
    using dir = tempDir("jsonl-numbers", {
      "data.jsonl": "42\n3.14\n-100\n",
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([42, 3.14, -100]);
  });
  test("handles null and boolean values", async () => {
    using dir = tempDir("jsonl-primitives", {
      "data.jsonl": "null\ntrue\nfalse\n",
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([null, true, false]);
  });
  test("handles nested objects", async () => {
    using dir = tempDir("jsonl-nested", {
      "data.jsonl": '{"user":{"name":"John","age":30}}\n{"data":[1,2,3]}\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ user: { name: "John", age: 30 } }, { data: [1, 2, 3] }]);
  });
  test("handles unicode content", async () => {
    // Escaped unicode stays ASCII on disk but decodes to non-ASCII values,
    // which exercises the parser's non-ASCII output handling.
    using dir = tempDir("jsonl-unicode", {
      "data.jsonl": '{"emoji":"\\ud83d\\ude00"}\n{"japanese":"\\u3053\\u3093\\u306b\\u3061\\u306f"}\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ emoji: "\ud83d\ude00" }, { japanese: "\u3053\u3093\u306b\u3061\u306f" }]);
  });
  test("works with Blob directly", async () => {
    const blob = new Blob(['{"a":1}\n{"b":2}\n']);
    const result = await blob.jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });
  test("handles single line without newline", async () => {
    using dir = tempDir("jsonl-single", {
      "data.jsonl": '{"only":"one"}',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ only: "one" }]);
  });
});