Created
March 6, 2026 02:12
-
-
Save brandonhimpfen/d3e3aeb84dc51b0bebb8800bde85d529 to your computer and use it in GitHub Desktop.
Split a large JSONL/NDJSON file in Node.js into smaller chunks by line count or approximate size, using streams and backpressure (no deps).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env node | |
| /** | |
| * Split a large JSONL (NDJSON) file into chunks by line count or size. | |
| * | |
| * Features: | |
| * - Streams input line-by-line | |
| * - Writes chunk files incrementally | |
| * - Supports splitting by: | |
| * --max-lines=N | |
| * --max-bytes=N | |
| * - Handles write-stream backpressure on chunk file writes | |
| * - No dependencies | |
| * | |
| * Usage: | |
| * node node-split-jsonl-file.js input.jsonl --max-lines=100000 | |
| * node node-split-jsonl-file.js input.jsonl --max-bytes=10485760 | |
| * | |
| * Output: | |
| * input.part-0001.jsonl | |
| * input.part-0002.jsonl | |
| * ... | |
| */ | |
| const fs = require("fs"); | |
| const path = require("path"); | |
| const readline = require("readline"); | |
| const { once } = require("events"); | |
/**
 * Print the command-line help text (synopsis, options, examples) to stderr.
 */
function usage() {
  // The trailing empty entry keeps the final newline of the help text.
  const helpLines = [
    "Usage: node node-split-jsonl-file.js <input.jsonl> [options]",
    "Options:",
    "--max-lines=N Split after N lines per chunk",
    "--max-bytes=N Split after about N bytes per chunk",
    "(size is approximate, based on UTF-8 encoded output)",
    "Examples:",
    "node node-split-jsonl-file.js input.jsonl --max-lines=100000",
    "node node-split-jsonl-file.js input.jsonl --max-bytes=10485760",
    "",
  ];
  console.error(helpLines.join("\n"));
}
/**
 * Parse the process argument vector into an options object.
 *
 * Recognised arguments (after the first two argv entries, which are skipped):
 *   - `-h` / `--help`   sets `help` to true
 *   - `--max-lines=N`   numeric line limit per chunk
 *   - `--max-bytes=N`   numeric byte limit per chunk
 *   - first argument not starting with `--` is taken as the input path
 *
 * @param {string[]} argv - Full argument vector (e.g. `process.argv`).
 * @returns {{input: (string|null), maxLines: (number|null), maxBytes: (number|null), help: boolean}}
 * @throws {Error} On an unknown argument, a second positional argument, or a
 *   `--max-*` option whose value is empty or not numeric.
 */
function parseArgs(argv) {
  const LINES_PREFIX = "--max-lines=";
  const BYTES_PREFIX = "--max-bytes=";

  // Take everything after the prefix so values such as "1=2" are rejected
  // instead of being silently truncated to "1".
  const readNumber = (arg, prefix) => {
    const raw = arg.slice(prefix.length);
    const value = Number(raw);
    if (raw.trim() === "" || Number.isNaN(value)) {
      throw new Error(`Invalid numeric value for ${prefix.slice(0, -1)}: "${raw}"`);
    }
    return value;
  };

  const args = {
    input: null,
    maxLines: null,
    maxBytes: null,
    help: false,
  };

  for (const arg of argv.slice(2)) {
    // Help must be checked before the positional rule: "-h" does not start
    // with "--" and would otherwise be captured as the input path.
    if (arg === "-h" || arg === "--help") {
      args.help = true;
      continue;
    }
    if (arg.startsWith(LINES_PREFIX)) {
      args.maxLines = readNumber(arg, LINES_PREFIX);
      continue;
    }
    if (arg.startsWith(BYTES_PREFIX)) {
      args.maxBytes = readNumber(arg, BYTES_PREFIX);
      continue;
    }
    if (!args.input && !arg.startsWith("--")) {
      args.input = arg;
      continue;
    }
    throw new Error(`Unknown argument: ${arg}`);
  }

  return args;
}
/**
 * Build the output path for a numbered chunk, placed next to the input file.
 *
 * The chunk number is zero-padded to at least four digits and inserted as
 * `.part-NNNN` between the input's base name and its extension. When the
 * input has no extension, `.jsonl` is used.
 *
 * @param {string} inputPath - Path to the input file (resolved to absolute).
 * @param {number} chunkNum - Chunk sequence number.
 * @returns {string} Absolute path of the chunk file.
 */
function makeChunkPath(inputPath, chunkNum) {
  const { dir, name, ext } = path.parse(path.resolve(inputPath));
  const paddedNum = `${chunkNum}`.padStart(4, "0");
  const extension = ext || ".jsonl";
  return path.join(dir, `${name}.part-${paddedNum}${extension}`);
}
/**
 * Write `data` to `stream`, pausing until "drain" when the stream signals
 * backpressure (i.e. `write()` returns false).
 *
 * @param {import("stream").Writable} stream - Destination stream.
 * @param {string|Buffer} data - Payload passed straight to `stream.write`.
 * @returns {Promise<void>} Settles once the stream can accept more data.
 */
async function safeWrite(stream, data) {
  const canContinue = stream.write(data);
  if (canContinue) {
    return;
  }
  await once(stream, "drain");
}
/**
 * End a writable stream and wait until it has finished flushing.
 *
 * @param {import("stream").Writable} stream - Stream to end.
 * @returns {Promise<void>} Resolves when the `end()` callback reports success.
 * @throws {Error} Rejects with the stream's error if the `end()` callback is
 *   invoked with one, or if the stream emits "error" first.
 */
async function closeStream(stream) {
  await new Promise((resolve, reject) => {
    // Listen before calling end() so no error can slip past; `once` avoids
    // leaving a stale listener attached after the promise settles.
    stream.once("error", reject);
    stream.end((err) => {
      // The end() callback may be invoked with an error; treating that as
      // success would hide a failed flush.
      if (err) {
        reject(err);
        return;
      }
      resolve();
    });
  });
}
/**
 * CLI entry point: read the input file line by line and write it out as a
 * sequence of numbered chunk files, rotating to a new chunk when the line
 * limit and/or byte limit would be exceeded.
 *
 * Prints usage and exits (code 0 for help, 2 otherwise) when help is
 * requested, no input is given, or neither limit is supplied.
 *
 * @returns {Promise<void>}
 * @throws {Error} On invalid limit values, or when opening/reading the input
 *   or opening/writing/closing a chunk file fails.
 */
async function main() {
  const args = parseArgs(process.argv);

  // Compare against null rather than relying on truthiness, so values such
  // as 0 or NaN reach validation instead of being treated as "not supplied".
  const hasMaxLines = args.maxLines != null;
  const hasMaxBytes = args.maxBytes != null;

  if (args.help || !args.input || (!hasMaxLines && !hasMaxBytes)) {
    usage();
    process.exit(args.help ? 0 : 2);
  }
  if (hasMaxLines && (!Number.isInteger(args.maxLines) || args.maxLines <= 0)) {
    throw new Error("--max-lines must be a positive integer");
  }
  if (hasMaxBytes && (!Number.isInteger(args.maxBytes) || args.maxBytes <= 0)) {
    throw new Error("--max-bytes must be a positive integer");
  }

  let chunkNum = 0;
  let chunkStream = null;
  let chunkLines = 0;
  let chunkBytes = 0;
  let totalLines = 0;
  let filesCreated = 0;
  let writeError = null;
  let rl = null;

  const inputPath = path.resolve(args.input);

  // Open the next numbered chunk file and reset the per-chunk counters.
  async function openNextChunk() {
    chunkNum += 1;
    const chunkPath = makeChunkPath(inputPath, chunkNum);
    const stream = fs.createWriteStream(chunkPath, { encoding: "utf8", flags: "wx" });
    // Wait for the file to open so that a failure (e.g. the "wx" flag
    // refusing an existing file) rejects here instead of surfacing later as
    // an unhandled "error" event.
    await once(stream, "open");
    // Remember later write failures so the main loop can rethrow them.
    stream.on("error", (err) => {
      writeError = err;
    });
    chunkStream = stream;
    chunkLines = 0;
    chunkBytes = 0;
    filesCreated += 1;
    console.error(`Opened ${chunkPath}`);
  }

  // No await may occur between creating the stream and waiting for "open",
  // otherwise the event could be missed.
  const inputStream = fs.createReadStream(inputPath, { encoding: "utf8" });

  try {
    // Surfaces a missing or unreadable input file as a rejection of main().
    await once(inputStream, "open");

    rl = readline.createInterface({
      input: inputStream,
      crlfDelay: Infinity,
    });

    for await (const line of rl) {
      if (writeError) {
        throw writeError;
      }
      if (chunkStream === null) {
        await openNextChunk();
      }

      const outLine = `${line}\n`;
      const outBytes = Buffer.byteLength(outLine, "utf8");

      const wouldExceedLines = hasMaxLines && chunkLines >= args.maxLines;
      // `chunkBytes > 0` guarantees every chunk receives at least one line,
      // even when a single line is larger than the byte limit.
      const wouldExceedBytes =
        hasMaxBytes && chunkBytes > 0 && chunkBytes + outBytes > args.maxBytes;

      if (wouldExceedLines || wouldExceedBytes) {
        await closeStream(chunkStream);
        chunkStream = null;
        await openNextChunk();
      }

      await safeWrite(chunkStream, outLine);
      chunkLines += 1;
      chunkBytes += outBytes;
      totalLines += 1;
    }

    if (writeError) {
      throw writeError;
    }
    if (chunkStream) {
      await closeStream(chunkStream);
      chunkStream = null;
    }
  } finally {
    if (rl) {
      rl.close();
    }
    // Release the streams on the failure path; on success the chunk stream
    // has already been closed and set to null above.
    inputStream.destroy();
    if (chunkStream) {
      chunkStream.destroy();
    }
  }

  console.error(`Done. Total lines: ${totalLines}. Files created: ${filesCreated}.`);
}
// Run the CLI; report any failure on stderr (stack trace when available)
// and exit with status 1.
main().catch((err) => {
  const detail = err && err.stack ? err.stack : err;
  console.error("ERROR:", detail);
  process.exit(1);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment