Mirror of https://github.com/oven-sh/bun
Synced 2026-02-17 06:12:08 +00:00

Compare commits: claude/fix...claude/bun
12 commits
Commits:
a07c319887
f160cc91a2
7a831b71c9
26ea6a1251
afe0c17dae
765b34992a
0312ac0285
295162ab0c
caeb43f3cd
7555710057
ec4c0bf888
25f334805b
170  bench/snippets/jsonl-comparison.ts  (new file)
@@ -0,0 +1,170 @@
// Benchmark comparing Bun.file().jsonl() vs TypeScript implementation
import { bench, group, run } from "mitata";
import { mkdir, readFile, rm, writeFile } from "node:fs/promises";
import path from "node:path";

// User's TypeScript implementation
const UTF8_BOM = "\ufeff";

function stripBOM(content: string): string {
  return content.startsWith(UTF8_BOM) ? content.slice(1) : content;
}

async function readJSONLFile<T>(filePath: string): Promise<T[]> {
  try {
    let content = await readFile(filePath, "utf8");
    if (!content.trim()) return [];

    // Strip BOM from the beginning of the file - PowerShell 5.x adds BOM to UTF-8 files
    content = stripBOM(content);

    return content
      .split("\n")
      .filter(line => line.trim())
      .map(line => {
        try {
          return JSON.parse(line) as T;
        } catch (err) {
          console.error(`Error parsing line in ${filePath}: ${err}`);
          return null;
        }
      })
      .filter((entry): entry is T => entry !== null);
  } catch (err) {
    console.error(`Error opening file ${filePath}: ${err}`);
    return [];
  }
}

async function readJSONLFileBunJSONL<T>(filePath: string): Promise<T[]> {
  const result = await Bun.file(filePath).jsonl();
  return result;
}

// Alternative TypeScript implementation using Bun.file().text()
async function readJSONLFileBunText<T>(filePath: string): Promise<T[]> {
  try {
    let content = await Bun.file(filePath).text();
    if (!content.trim()) return [];

    content = stripBOM(content);

    return content
      .split("\n")
      .filter(line => line.trim())
      .map(line => {
        try {
          return JSON.parse(line) as T;
        } catch {
          return null;
        }
      })
      .filter((entry): entry is T => entry !== null);
  } catch {
    return [];
  }
}

// Setup test data directory
const BENCH_DIR = path.join(import.meta.dir, ".jsonl-bench-data");

interface TestRecord {
  id: number;
  name: string;
  email: string;
  timestamp: number;
  data: { key: string; value: number };
}

function generateRecord(i: number): TestRecord {
  return {
    id: i,
    name: `User_${i}`,
    email: `user${i}@example.com`,
    timestamp: Date.now(),
    data: { key: `key_${i}`, value: i * 100 },
  };
}

async function setup() {
  await mkdir(BENCH_DIR, { recursive: true });

  // Generate test files of various sizes
  const sizes = [10, 100, 1000, 10000, 100000];

  for (const size of sizes) {
    const lines: string[] = [];
    for (let i = 0; i < size; i++) {
      lines.push(JSON.stringify(generateRecord(i)));
    }
    await writeFile(path.join(BENCH_DIR, `data-${size}.jsonl`), lines.join("\n") + "\n");
  }

  // File with BOM
  const bomContent = "\ufeff" + [0, 1, 2].map(i => JSON.stringify(generateRecord(i))).join("\n") + "\n";
  await writeFile(path.join(BENCH_DIR, "data-bom.jsonl"), bomContent);

  // File with empty lines and invalid JSON
  const mixedContent = [
    JSON.stringify(generateRecord(0)),
    "",
    " ",
    "invalid json here",
    JSON.stringify(generateRecord(1)),
    "\t\t",
    JSON.stringify(generateRecord(2)),
  ].join("\n");
  await writeFile(path.join(BENCH_DIR, "data-mixed.jsonl"), mixedContent);

  // File with CRLF
  const crlfContent = [0, 1, 2].map(i => JSON.stringify(generateRecord(i))).join("\r\n") + "\r\n";
  await writeFile(path.join(BENCH_DIR, "data-crlf.jsonl"), crlfContent);

  console.log("Setup complete. Test files created in:", BENCH_DIR);
}

async function cleanup() {
  await rm(BENCH_DIR, { recursive: true, force: true });
}

async function runBenchmarks() {
  await setup();

  const sizes = [10, 100, 1000, 10000, 100000];

  for (const size of sizes) {
    const filePath = path.join(BENCH_DIR, `data-${size}.jsonl`);

    group(`JSONL parsing (${size} lines)`, () => {
      bench("Bun.file().jsonl() [native]", async () => {
        await readJSONLFileBunJSONL(filePath);
      });

      bench("readJSONLFile (node:fs)", async () => {
        await readJSONLFile(filePath);
      });

      bench("readJSONLFile (Bun.file)", async () => {
        await readJSONLFileBunText(filePath);
      });
    });
  }

  // Edge cases
  group("Edge cases - BOM handling", () => {
    const filePath = path.join(BENCH_DIR, "data-bom.jsonl");

    bench("Bun.file().jsonl() [native]", async () => {
      await readJSONLFileBunJSONL(filePath);
    });

    bench("readJSONLFile (node:fs)", async () => {
      await readJSONLFile(filePath);
    });
  });

  await run();
  await cleanup();
}

runBenchmarks().catch(console.error);
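Note: like the other mitata suites in bench/snippets, this file is presumably invoked directly with Bun (e.g. `bun bench/snippets/jsonl-comparison.ts`); setup() writes its fixtures into a .jsonl-bench-data directory next to the script, and cleanup() removes them once run() completes.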
132  bench/snippets/jsonl-memory-bench.ts  (new file)
@@ -0,0 +1,132 @@
// Benchmark JSONL parsing performance without file I/O overhead
import { bench, group, run } from "mitata";

interface TestRecord {
  id: number;
  name: string;
  email: string;
  timestamp: number;
  data: { key: string; value: number };
}

function generateRecord(i: number): TestRecord {
  return {
    id: i,
    name: `User_${i}`,
    email: `user${i}@example.com`,
    timestamp: Date.now(),
    data: { key: `key_${i}`, value: i * 100 },
  };
}

function generateJSONLContent(lineCount: number): string {
  const lines: string[] = [];
  for (let i = 0; i < lineCount; i++) {
    lines.push(JSON.stringify(generateRecord(i)));
  }
  return lines.join("\n") + "\n";
}

// TypeScript implementation using Blob.text()
async function parseJSONLWithText<T>(blob: Blob): Promise<T[]> {
  const content = await blob.text();
  if (!content.trim()) return [];

  return content
    .split("\n")
    .filter(line => line.trim())
    .map(line => {
      try {
        return JSON.parse(line) as T;
      } catch {
        return null;
      }
    })
    .filter((entry): entry is T => entry !== null);
}

// Native Bun.file().jsonl() equivalent via Blob
async function parseJSONLNative<T>(blob: Blob): Promise<T[]> {
  return (blob as any).jsonl();
}

// Sync-like TypeScript implementation (text is already available)
function parseJSONLSync<T>(content: string): T[] {
  if (!content.trim()) return [];

  return content
    .split("\n")
    .filter(line => line.trim())
    .map(line => {
      try {
        return JSON.parse(line) as T;
      } catch {
        return null;
      }
    })
    .filter((entry): entry is T => entry !== null);
}

async function runBenchmarks() {
  const sizes = [100, 1000, 10000, 100000];

  for (const size of sizes) {
    const content = generateJSONLContent(size);
    const blob = new Blob([content]);

    // Pre-warm the blob text for sync comparison
    const textContent = await blob.text();

    group(`JSONL parsing ${size} lines (in-memory)`, () => {
      bench("Blob.jsonl() [native]", async () => {
        // Create new blob each time to avoid caching effects
        const b = new Blob([content]);
        await parseJSONLNative(b);
      });

      bench("Blob.text() + JS parse", async () => {
        const b = new Blob([content]);
        await parseJSONLWithText(b);
      });

      bench("String split + JSON.parse (sync)", () => {
        parseJSONLSync(textContent);
      });
    });
  }

  // Test with varying line lengths
  group("JSONL with large objects (1000 lines)", () => {
    const largeObjects = Array.from({ length: 1000 }, (_, i) => ({
      id: i,
      name: `User_${i}`,
      description: "A".repeat(500), // 500 char string
      metadata: {
        key1: "value1",
        key2: "value2",
        key3: "value3",
        nested: { a: 1, b: 2, c: 3 },
      },
    }));
    const content = largeObjects.map(o => JSON.stringify(o)).join("\n") + "\n";
    const textContent = content;

    bench("Blob.jsonl() [native]", async () => {
      const b = new Blob([content]);
      await parseJSONLNative(b);
    });

    bench("Blob.text() + JS parse", async () => {
      const b = new Blob([content]);
      await parseJSONLWithText(b);
    });

    bench("String split + JSON.parse (sync)", () => {
      parseJSONLSync(textContent);
    });
  });

  await run();
}

runBenchmarks().catch(console.error);
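The `(blob as any).jsonl()` cast above is presumably needed because the ambient Blob type declarations don't yet include the jsonl() method this change introduces. A hypothetical type augmentation (an assumption for illustration; the real typing belongs in Bun's generated definitions) would let the cast be dropped:

// Hypothetical augmentation - not part of this diff
declare global {
  interface Blob {
    jsonl(): Promise<unknown[]>;
  }
}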
43  bench/snippets/jsonl-timing-test.js  (new file)
@@ -0,0 +1,43 @@
// Test script for JSONL timing
const fs = require("fs");
const path = require("path");

const testDir = path.join(__dirname, ".jsonl-bench-data");

// Create test data if it doesn't exist
if (!fs.existsSync(testDir)) {
  fs.mkdirSync(testDir, { recursive: true });

  const sizes = [1000, 10000, 100000];
  for (const size of sizes) {
    const lines = [];
    for (let i = 0; i < size; i++) {
      lines.push(
        JSON.stringify({
          id: i,
          name: `User_${i}`,
          email: `user${i}@example.com`,
          timestamp: Date.now(),
          data: { key: `key_${i}`, value: i * 100 },
        }),
      );
    }
    fs.writeFileSync(path.join(testDir, `data-${size}.jsonl`), lines.join("\n") + "\n");
    console.log(`Created data-${size}.jsonl`);
  }
}

// Run tests
async function main() {
  const sizes = [1000, 10000, 100000];

  for (const size of sizes) {
    const filePath = path.join(testDir, `data-${size}.jsonl`);
    console.log(`\n>>> Testing ${size} lines...`);

    const result = await Bun.file(filePath).jsonl();
    console.log(`Result: ${result.length} items parsed`);
  }
}

main().catch(console.error);
120  bench/snippets/jsonl-tiny-objects-bench.ts  (new file)
@@ -0,0 +1,120 @@
// Benchmark designed to maximize native implementation advantage
// Small JSON objects = minimal parse time, maximum boundary-crossing overhead ratio
import { bench, group, run } from "mitata";

// Generate tiny JSON objects - minimal parse time per object
function generateTinyJSONL(lineCount: number): string {
  const lines: string[] = [];
  for (let i = 0; i < lineCount; i++) {
    lines.push(`{"i":${i}}`);
  }
  return lines.join("\n") + "\n";
}

// Even smaller - just numbers
function generateNumbersJSONL(lineCount: number): string {
  const lines: string[] = [];
  for (let i = 0; i < lineCount; i++) {
    lines.push(String(i));
  }
  return lines.join("\n") + "\n";
}

// Small strings
function generateStringsJSONL(lineCount: number): string {
  const lines: string[] = [];
  for (let i = 0; i < lineCount; i++) {
    lines.push(`"s${i}"`);
  }
  return lines.join("\n") + "\n";
}

// TypeScript implementation
async function parseJSONLWithText<T>(blob: Blob): Promise<T[]> {
  const content = await blob.text();
  if (!content.trim()) return [];

  return content
    .split("\n")
    .filter(line => line.trim())
    .map(line => {
      try {
        return JSON.parse(line) as T;
      } catch {
        return null;
      }
    })
    .filter((entry): entry is T => entry !== null);
}

// Native Blob.jsonl()
async function parseJSONLNative<T>(blob: Blob): Promise<T[]> {
  return (blob as any).jsonl();
}

async function runBenchmarks() {
  console.log("=== Native vs JS: Small Objects Benchmark ===");
  console.log("Goal: Maximize boundary-crossing overhead ratio\n");

  // Test with very high line counts and tiny objects
  const sizes = [10_000, 50_000, 100_000, 500_000, 1_000_000];

  // Tiny objects: {"i":N}
  for (const size of sizes) {
    const content = generateTinyJSONL(size);
    const sizeKB = (content.length / 1024).toFixed(1);

    group(`Tiny objects {"i":N} - ${size / 1000}k lines (${sizeKB} KB)`, () => {
      bench("Blob.jsonl() [native]", async () => {
        const b = new Blob([content]);
        await parseJSONLNative(b);
      });

      bench("Blob.text() + JS parse", async () => {
        const b = new Blob([content]);
        await parseJSONLWithText(b);
      });
    });
  }

  // Plain numbers - absolute minimum parse time
  const numberSizes = [100_000, 500_000, 1_000_000];
  for (const size of numberSizes) {
    const content = generateNumbersJSONL(size);
    const sizeKB = (content.length / 1024).toFixed(1);

    group(`Plain numbers - ${size / 1000}k lines (${sizeKB} KB)`, () => {
      bench("Blob.jsonl() [native]", async () => {
        const b = new Blob([content]);
        await parseJSONLNative(b);
      });

      bench("Blob.text() + JS parse", async () => {
        const b = new Blob([content]);
        await parseJSONLWithText(b);
      });
    });
  }

  // Small strings
  for (const size of numberSizes) {
    const content = generateStringsJSONL(size);
    const sizeKB = (content.length / 1024).toFixed(1);

    group(`Small strings "sN" - ${size / 1000}k lines (${sizeKB} KB)`, () => {
      bench("Blob.jsonl() [native]", async () => {
        const b = new Blob([content]);
        await parseJSONLNative(b);
      });

      bench("Blob.text() + JS parse", async () => {
        const b = new Blob([content]);
        await parseJSONLWithText(b);
      });
    });
  }

  await run();
}

runBenchmarks().catch(console.error);
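Why tiny records maximize the ratio this file targets: a line like {"i":123456} is roughly 12 bytes, so even the 1,000,000-line case is only about 12 MB of input yet yields a million JS values. Per-line parse cost is nearly constant and tiny, so any fixed per-value cost at the native-to-JS boundary becomes the dominant term in the total, which is exactly what these groups are constructed to expose.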
@@ -60,7 +60,9 @@
#include "JavaScriptCore/JSModuleRecord.h"
#include "JavaScriptCore/JSNativeStdFunction.h"
#include "JavaScriptCore/JSONObject.h"
#include "JavaScriptCore/LiteralParser.h"
#include "JavaScriptCore/JSObject.h"
#include <wtf/text/ASCIIFastPath.h>
#include "JavaScriptCore/JSSet.h"
#include "JavaScriptCore/Strong.h"
#include "JavaScriptCore/JSSetIterator.h"
@@ -2195,6 +2197,189 @@ extern "C" JSC::EncodedJSValue ZigString__toJSONObject(const ZigString* strPtr,
    return JSValue::encode(result);
}

// Parse JSONL content entirely in C++ - no Zig offset/length arrays needed.
// Forward declaration for Bun's optimized UTF-8 to string conversion
extern "C" JSC::EncodedJSValue Bun__encoding__toStringUTF8(const uint8_t* input, size_t len, JSC::JSGlobalObject* globalObject);

// Helper to find newline in byte array using memchr (SIMD-optimized)
static inline size_t findNewline(const uint8_t* data, size_t start, size_t end)
{
    if (start >= end) return notFound;
    const void* result = memchr(data + start, '\n', end - start);
    if (result) {
        return static_cast<const uint8_t*>(result) - data;
    }
    return notFound;
}

// Check if a line is whitespace-only (for 8-bit data)
static inline bool isWhitespaceOnlyLine8(const Latin1Character* data, size_t start, size_t len)
{
    Latin1Character firstChar = data[start];
    if (firstChar != ' ' && firstChar != '\t') return false;
    for (size_t i = start; i < start + len; i++) {
        Latin1Character c = data[i];
        if (c != ' ' && c != '\t') return false;
    }
    return true;
}

// Uses MarkedArgumentBuffer for GC-safe value collection.
// Optimized: For ASCII-only data, parses directly from UTF-8 using LiteralParser<Latin1Character>
// to avoid UTF-16 conversion overhead.
extern "C" JSC::EncodedJSValue Bun__parseJSONLFromBlob(
    JSC::JSGlobalObject* globalObject,
    const uint8_t* data,
    size_t size)
{
    auto& vm = globalObject->vm();
    auto scope = DECLARE_THROW_SCOPE(vm);

    // Handle BOM (Byte Order Mark)
    size_t offset = 0;
    if (size >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
        offset = 3; // UTF-8 BOM
    }

    if (size <= offset) {
        RELEASE_AND_RETURN(scope, JSValue::encode(constructEmptyArray(globalObject, nullptr)));
    }

    const uint8_t* contentStart = data + offset;
    size_t contentSize = size - offset;

    // Use MarkedArgumentBuffer for GC-safe collection of parsed values
    MarkedArgumentBuffer args;

    // Check if content is ASCII-only (fast SIMD check)
    std::span<const uint8_t> contentSpan(contentStart, contentSize);
    bool isAllASCII = charactersAreAllASCII(contentSpan);

    if (isAllASCII) {
        // Fast path: ASCII-only data can be parsed directly as Latin1
        // UTF-8 ASCII bytes are identical to Latin1 encoding
        const Latin1Character* latin1Data = reinterpret_cast<const Latin1Character*>(contentStart);
        size_t pos = 0;

        while (pos < contentSize) {
            // Find newline
            size_t newlinePos = findNewline(contentStart, pos, contentSize);
            size_t lineEnd = (newlinePos == notFound) ? contentSize : newlinePos;

            // Handle CRLF
            if (lineEnd > pos && latin1Data[lineEnd - 1] == '\r') {
                lineEnd--;
            }

            size_t lineLen = lineEnd - pos;

            if (lineLen > 0 && !isWhitespaceOnlyLine8(latin1Data, pos, lineLen)) {
                // Use LiteralParser directly with Latin1 data (8-bit fast path)
                std::span<const Latin1Character> lineSpan(latin1Data + pos, lineLen);
                LiteralParser<Latin1Character, JSONReviverMode::Disabled> parser(globalObject, lineSpan, StrictJSON);
                JSValue parsed = parser.tryLiteralParse();

                if (scope.exception()) {
                    scope.clearException();
                } else if (parsed) {
                    args.append(parsed);
                }
            }

            pos = (newlinePos == notFound) ? contentSize : newlinePos + 1;
        }
    } else {
        // Slow path: Contains non-ASCII, need UTF-16 conversion
        JSValue jsStringValue = JSValue::decode(Bun__encoding__toStringUTF8(contentStart, contentSize, globalObject));

        if (!jsStringValue || !jsStringValue.isString()) {
            return JSValue::encode(constructEmptyArray(globalObject, nullptr));
        }

        JSString* jsString = jsCast<JSString*>(jsStringValue);
        auto fullString = jsString->value(globalObject);
        RETURN_IF_EXCEPTION(scope, {});

        StringView fullView = fullString;
        size_t pos = 0;
        size_t length = fullView.length();

        // Check if the converted string is 8-bit (Latin1)
        // Even with non-ASCII UTF-8, if all chars fit in Latin1, we can use 8-bit path
        bool use8BitPath = fullView.is8Bit();

        while (pos < length) {
            size_t newlinePos = fullView.find('\n', pos);
            size_t lineEnd = (newlinePos == notFound) ? length : newlinePos;

            if (lineEnd > pos && fullView[lineEnd - 1] == '\r') {
                lineEnd--;
            }

            size_t lineLen = lineEnd - pos;

            if (lineLen > 0) {
                bool isWhitespaceOnly = false;
                if (use8BitPath) {
                    Latin1Character firstChar = fullView.span8()[pos];
                    if (firstChar == ' ' || firstChar == '\t') {
                        isWhitespaceOnly = true;
                        for (size_t i = pos; i < pos + lineLen; i++) {
                            Latin1Character c = fullView.span8()[i];
                            if (c != ' ' && c != '\t') {
                                isWhitespaceOnly = false;
                                break;
                            }
                        }
                    }
                } else {
                    UChar firstChar = fullView[pos];
                    if (firstChar == ' ' || firstChar == '\t') {
                        isWhitespaceOnly = true;
                        for (size_t i = pos; i < pos + lineLen; i++) {
                            UChar c = fullView[i];
                            if (c != ' ' && c != '\t') {
                                isWhitespaceOnly = false;
                                break;
                            }
                        }
                    }
                }

                if (!isWhitespaceOnly) {
                    JSValue parsed;
                    if (use8BitPath) {
                        // Use LiteralParser directly with 8-bit data
                        std::span<const Latin1Character> lineSpan(fullView.span8().data() + pos, lineLen);
                        LiteralParser<Latin1Character, JSONReviverMode::Disabled> parser(globalObject, lineSpan, StrictJSON);
                        parsed = parser.tryLiteralParse();
                    } else {
                        // Use LiteralParser with 16-bit data
                        std::span<const char16_t> lineSpan(fullView.span16().data() + pos, lineLen);
                        LiteralParser<char16_t, JSONReviverMode::Disabled> parser(globalObject, lineSpan, StrictJSON);
                        parsed = parser.tryLiteralParse();
                    }

                    if (scope.exception()) {
                        scope.clearException();
                    } else if (parsed) {
                        args.append(parsed);
                    }
                }
            }

            pos = (newlinePos == notFound) ? length : newlinePos + 1;
        }
    }

    if (args.hasOverflowed()) [[unlikely]] {
        throwOutOfMemoryError(globalObject, scope);
        return {};
    }

    RELEASE_AND_RETURN(scope, JSValue::encode(constructArray(globalObject, static_cast<ArrayAllocationProfile*>(nullptr), args)));
}

// We used to just throw "Out of memory" as a regular Error with that string.
//
// But JSC has some different handling for out of memory errors. So we should
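For readers skipping the C++: the observable behavior of Bun__parseJSONLFromBlob above is (1) strip a UTF-8 BOM, (2) split on \n, (3) trim one trailing \r per line, (4) skip empty and whitespace-only lines, (5) parse each remaining line as strict JSON and silently drop lines that fail, (6) return everything as one array. A small TypeScript reference model of those semantics (a sketch for clarity, not the actual code path):

// Reference model only - mirrors the native parser's observable semantics
function jsonlSemantics(text: string): unknown[] {
  if (text.startsWith("\ufeff")) text = text.slice(1); // UTF-8 BOM
  const out: unknown[] = [];
  for (let line of text.split("\n")) {
    if (line.endsWith("\r")) line = line.slice(0, -1); // CRLF
    if (!line.trim()) continue; // empty / whitespace-only lines are skipped
    try {
      out.push(JSON.parse(line)); // invalid JSON lines are silently dropped
    } catch {}
  }
  return out;
}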
1  src/bun.js/bindings/headers.h  (generated)
@@ -61,6 +61,7 @@ CPP_DECL JSC::EncodedJSValue ZigString__toRangeErrorInstance(const ZigString* ar
CPP_DECL JSC::EncodedJSValue ZigString__toSyntaxErrorInstance(const ZigString* arg0, JSC::JSGlobalObject* arg1);
CPP_DECL JSC::EncodedJSValue ZigString__toTypeErrorInstance(const ZigString* arg0, JSC::JSGlobalObject* arg1);
CPP_DECL JSC::EncodedJSValue ZigString__toValueGC(const ZigString* arg0, JSC::JSGlobalObject* arg1);
CPP_DECL JSC::EncodedJSValue Bun__parseJSONLFromBlob(JSC::JSGlobalObject* arg0, const uint8_t* arg1, size_t arg2);
CPP_DECL WebCore::DOMURL* WebCore__DOMURL__cast_(JSC::EncodedJSValue JSValue0, JSC::VM* arg1);
CPP_DECL BunString WebCore__DOMURL__fileSystemPath(WebCore::DOMURL* arg0, int* errorCode);
CPP_DECL void WebCore__DOMURL__href_(WebCore::DOMURL* arg0, ZigString* arg1);
@@ -3602,6 +3602,62 @@ pub fn toJSONWithBytes(this: *Blob, global: *JSGlobalObject, raw_bytes: []const
    return ZigString.init(buf).toJSONObject(global);
}

// ===== JSONL Support =====

pub fn getJSONL(
    this: *Blob,
    globalThis: *jsc.JSGlobalObject,
    _: *jsc.CallFrame,
) bun.JSError!jsc.JSValue {
    return this.getJSONLShare(globalThis);
}

pub fn getJSONLShare(
    this: *Blob,
    globalObject: *jsc.JSGlobalObject,
) bun.JSTerminated!jsc.JSValue {
    const store = this.store;
    if (store) |st| st.ref();
    defer if (store) |st| st.deref();
    return jsc.JSPromise.wrap(globalObject, lifetimeWrap(toJSONL, .share), .{ this, globalObject });
}

pub fn toJSONL(this: *Blob, global: *JSGlobalObject, comptime lifetime: Lifetime) bun.JSError!JSValue {
    if (this.needsToReadFile()) {
        return this.doReadFile(toJSONLWithBytes, global);
    }
    if (this.isS3()) {
        return this.doReadFromS3(toJSONLWithBytes, global);
    }
    const view_ = this.sharedView();
    return toJSONLWithBytes(this, global, view_, lifetime);
}

// Pure C++ JSONL parsing - all processing happens in C++ for efficiency
extern fn Bun__parseJSONLFromBlob(
    globalObject: *JSGlobalObject,
    data: [*]const u8,
    size: usize,
) JSValue;

pub fn toJSONLWithBytes(_: *Blob, global: *JSGlobalObject, raw_bytes: []const u8, comptime lifetime: Lifetime) bun.JSError!JSValue {
    defer if (comptime lifetime == .temporary) bun.default_allocator.free(@constCast(raw_bytes));

    if (raw_bytes.len == 0) {
        return jsc.JSArray.createEmpty(global, 0);
    }

    // All processing (BOM handling, line scanning, JSON parsing) happens in C++
    const result = Bun__parseJSONLFromBlob(global, raw_bytes.ptr, raw_bytes.len);

    // C++ returns .zero on exception
    if (result == .zero) {
        return error.JSError;
    }

    return result;
}

pub fn toFormDataWithBytes(this: *Blob, global: *JSGlobalObject, buf: []u8, comptime _: Lifetime) JSValue {
    var encoder = this.getFormDataEncoding() orelse return {
        return ZigString.init("Invalid encoding").toErrorInstance(global);
@@ -152,6 +152,7 @@ export default [
    proto: {
      text: { fn: "getText", async: true },
      json: { fn: "getJSON", async: true },
      jsonl: { fn: "getJSONL", async: true },
      arrayBuffer: { fn: "getArrayBuffer", async: true },
      slice: { fn: "getSlice", length: 2 },
      stream: { fn: "getStream", length: 1 },
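With the binding registered in the class generator, jsonl() surfaces on Blob and on the BunFile returned by Bun.file(). Minimal usage, mirroring the tests below (the file path is illustrative):

const rows = await Bun.file("events.jsonl").jsonl(); // array of parsed JSON values
const inline = await new Blob(['{"a":1}\n{"b":2}\n']).jsonl(); // [{ a: 1 }, { b: 2 }]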
133  test/js/bun/io/file-jsonl.test.ts  (new file)
@@ -0,0 +1,133 @@
import { describe, expect, test } from "bun:test";
import { tempDir } from "harness";

describe("Bun.file().jsonl()", () => {
  test("parses basic JSONL file", async () => {
    using dir = tempDir("jsonl-basic", {
      "data.jsonl": '{"a":1}\n{"b":2}\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });

  test("returns empty array for empty file", async () => {
    using dir = tempDir("jsonl-empty-file", {
      "data.jsonl": "",
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([]);
  });

  test("handles CRLF line endings", async () => {
    using dir = tempDir("jsonl-crlf", {
      "data.jsonl": '{"a":1}\r\n{"b":2}\r\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });

  test("handles last line without newline", async () => {
    using dir = tempDir("jsonl-no-trailing", {
      "data.jsonl": '{"a":1}\n{"b":2}',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });

  test("skips empty lines", async () => {
    using dir = tempDir("jsonl-empty-lines", {
      "data.jsonl": '{"a":1}\n\n{"b":2}\n\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });

  test("skips whitespace-only lines", async () => {
    using dir = tempDir("jsonl-whitespace-lines", {
      "data.jsonl": '{"a":1}\n \n{"b":2}\n\t\t\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });

  test("skips invalid JSON lines", async () => {
    using dir = tempDir("jsonl-invalid", {
      "data.jsonl": '{"a":1}\ninvalid json\n{"b":2}\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });

  test("handles BOM", async () => {
    using dir = tempDir("jsonl-bom", {
      "data.jsonl": '\ufeff{"a":1}\n{"b":2}\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });

  test("handles arrays as JSON values", async () => {
    using dir = tempDir("jsonl-arrays", {
      "data.jsonl": '[1,2,3]\n["a","b"]\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([
      [1, 2, 3],
      ["a", "b"],
    ]);
  });

  test("handles strings as JSON values", async () => {
    using dir = tempDir("jsonl-strings", {
      "data.jsonl": '"hello"\n"world"\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual(["hello", "world"]);
  });

  test("handles numbers as JSON values", async () => {
    using dir = tempDir("jsonl-numbers", {
      "data.jsonl": "42\n3.14\n-100\n",
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([42, 3.14, -100]);
  });

  test("handles null and boolean values", async () => {
    using dir = tempDir("jsonl-primitives", {
      "data.jsonl": "null\ntrue\nfalse\n",
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([null, true, false]);
  });

  test("handles nested objects", async () => {
    using dir = tempDir("jsonl-nested", {
      "data.jsonl": '{"user":{"name":"John","age":30}}\n{"data":[1,2,3]}\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ user: { name: "John", age: 30 } }, { data: [1, 2, 3] }]);
  });

  test("handles unicode content", async () => {
    using dir = tempDir("jsonl-unicode", {
      "data.jsonl": '{"emoji":"\\ud83d\\ude00"}\n{"japanese":"\\u3053\\u3093\\u306b\\u3061\\u306f"}\n',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ emoji: "\ud83d\ude00" }, { japanese: "\u3053\u3093\u306b\u3061\u306f" }]);
  });

  test("works with Blob directly", async () => {
    const blob = new Blob(['{"a":1}\n{"b":2}\n']);
    const result = await blob.jsonl();
    expect(result).toEqual([{ a: 1 }, { b: 2 }]);
  });

  test("handles single line without newline", async () => {
    using dir = tempDir("jsonl-single", {
      "data.jsonl": '{"only":"one"}',
    });
    const result = await Bun.file(`${dir}/data.jsonl`).jsonl();
    expect(result).toEqual([{ only: "one" }]);
  });
});