Skip to content

Instantly share code, notes, and snippets.

@brandonhimpfen
Created March 6, 2026 02:12
Show Gist options
  • Select an option

  • Save brandonhimpfen/d3e3aeb84dc51b0bebb8800bde85d529 to your computer and use it in GitHub Desktop.

Select an option

Save brandonhimpfen/d3e3aeb84dc51b0bebb8800bde85d529 to your computer and use it in GitHub Desktop.
Split a large JSONL/NDJSON file in Node.js into smaller chunks by line count or approximate size, using streams and backpressure (no deps).
#!/usr/bin/env node
/**
* Split a large JSONL (NDJSON) file into chunks by line count or size.
*
* Features:
* - Streams input line-by-line
* - Writes chunk files incrementally
* - Supports splitting by:
* --max-lines=N
* --max-bytes=N
* - Handles write-stream backpressure on chunk file writes
* - No dependencies
*
* Usage:
* node node-split-jsonl-file.js input.jsonl --max-lines=100000
* node node-split-jsonl-file.js input.jsonl --max-bytes=10485760
*
* Output:
* input.part-0001.jsonl
* input.part-0002.jsonl
* ...
*/
const fs = require("fs");
const path = require("path");
const readline = require("readline");
const { once } = require("events");
/** Print CLI usage/help text to stderr. */
function usage() {
  const helpText = `Usage: node node-split-jsonl-file.js <input.jsonl> [options]
Options:
--max-lines=N Split after N lines per chunk
--max-bytes=N Split after about N bytes per chunk
(size is approximate, based on UTF-8 encoded output)
Examples:
node node-split-jsonl-file.js input.jsonl --max-lines=100000
node node-split-jsonl-file.js input.jsonl --max-bytes=10485760
`;
  console.error(helpText);
}
/**
 * Parse CLI arguments (everything after `node <script>`).
 *
 * @param {string[]} argv - A process.argv-style array.
 * @returns {{input: ?string, maxLines: ?number, maxBytes: ?number, help: boolean}}
 * @throws {Error} On unknown flags or non-numeric option values.
 */
function parseArgs(argv) {
  const args = {
    input: null,
    maxLines: null,
    maxBytes: null,
    // Fix: previously left undefined unless -h/--help was seen.
    help: false,
  };
  // Extract and validate the numeric value of a --flag=value argument.
  const numericValue = (arg) => {
    // Fix: slice at the first "=" instead of split("=", 2), which silently
    // truncated values containing another "=".
    const raw = arg.slice(arg.indexOf("=") + 1);
    const value = Number(raw);
    // Fix: Number("") === 0 and Number("abc") is NaN — both are falsy and
    // previously slipped through as "option not given". Reject them loudly.
    if (raw === "" || !Number.isFinite(value)) {
      throw new Error(`Expected a number in: ${arg}`);
    }
    return value;
  };
  for (const arg of argv.slice(2)) {
    // Fix: check help flags before the positional branch; "-h" does not start
    // with "--" and used to be swallowed as the input filename.
    if (arg === "-h" || arg === "--help") {
      args.help = true;
      continue;
    }
    if (!args.input && !arg.startsWith("--")) {
      args.input = arg;
      continue;
    }
    if (arg.startsWith("--max-lines=")) {
      args.maxLines = numericValue(arg);
      continue;
    }
    if (arg.startsWith("--max-bytes=")) {
      args.maxBytes = numericValue(arg);
      continue;
    }
    throw new Error(`Unknown argument: ${arg}`);
  }
  return args;
}
/**
 * Build the absolute path for chunk number `chunkNum`, placed next to the
 * input file as "<base>.part-NNNN<ext>" (falling back to ".jsonl" when the
 * input has no extension).
 */
function makeChunkPath(inputPath, chunkNum) {
  const resolved = path.resolve(inputPath);
  const extension = path.extname(resolved);
  const stem = path.basename(resolved, extension);
  const padded = String(chunkNum).padStart(4, "0");
  const fileName = `${stem}.part-${padded}${extension || ".jsonl"}`;
  return path.join(path.dirname(resolved), fileName);
}
/**
 * Write `data` to a writable stream, honoring backpressure: when the internal
 * buffer is full (write() returns false), wait for "drain" before returning.
 * events.once() rejects if the stream emits "error" while waiting.
 */
async function safeWrite(stream, data) {
  const accepted = stream.write(data);
  if (accepted) {
    return;
  }
  await once(stream, "drain");
}
/**
 * End a writable stream and resolve once its end-callback fires, or reject if
 * the stream emits "error" first.
 *
 * Fix: the error listener is now attached with once() BEFORE end() is invoked.
 * Previously it was added with on() after end(), so a synchronously-emitted
 * error was missed, and the persistent listener could fire a reject on an
 * already-settled promise.
 */
async function closeStream(stream) {
  await new Promise((resolve, reject) => {
    stream.once("error", reject);
    stream.end(() => resolve());
  });
}
/**
 * Entry point: validate CLI arguments, then stream the input file line by
 * line, rotating sequentially-numbered chunk files whenever the configured
 * --max-lines / --max-bytes limit would be exceeded.
 *
 * Exits 0 on --help, 2 on bad usage; throws on invalid limit values.
 */
async function main() {
  const args = parseArgs(process.argv);
  if (args.help || !args.input || (!args.maxLines && !args.maxBytes)) {
    usage();
    process.exit(args.help ? 0 : 2); // 0 for explicit help, 2 for bad usage
  }
  if (args.maxLines && (!Number.isInteger(args.maxLines) || args.maxLines <= 0)) {
    throw new Error("--max-lines must be a positive integer");
  }
  if (args.maxBytes && (!Number.isInteger(args.maxBytes) || args.maxBytes <= 0)) {
    throw new Error("--max-bytes must be a positive integer");
  }
  const inputPath = path.resolve(args.input);
  const inputStream = fs.createReadStream(inputPath, { encoding: "utf8" });
  // crlfDelay: Infinity makes \r\n count as a single line break.
  const rl = readline.createInterface({
    input: inputStream,
    crlfDelay: Infinity,
  });
  let chunkNum = 0;
  let chunkStream = null; // currently open chunk stream, or null between chunks
  let chunkLines = 0;
  let chunkBytes = 0;
  let totalLines = 0;
  let filesCreated = 0;
  // Open the next sequentially-numbered chunk file; "wx" refuses to clobber
  // an existing file. (No longer async: it awaits nothing.)
  function openNextChunk() {
    chunkNum += 1;
    const chunkPath = makeChunkPath(inputPath, chunkNum);
    chunkStream = fs.createWriteStream(chunkPath, { encoding: "utf8", flags: "wx" });
    chunkLines = 0;
    chunkBytes = 0;
    filesCreated += 1;
    console.error(`Opened ${chunkPath}`);
  }
  try {
    for await (const line of rl) {
      if (chunkStream === null) {
        openNextChunk();
      }
      const outLine = line + "\n";
      const outBytes = Buffer.byteLength(outLine, "utf8");
      // Rotate BEFORE writing so a chunk never exceeds its limit. The byte
      // limit is approximate: a single line larger than --max-bytes still
      // goes out whole (chunkBytes > 0 guard avoids an empty chunk).
      const overLineLimit = args.maxLines && chunkLines >= args.maxLines;
      const overByteLimit =
        args.maxBytes && chunkBytes > 0 && chunkBytes + outBytes > args.maxBytes;
      if (overLineLimit || overByteLimit) {
        await closeStream(chunkStream);
        chunkStream = null;
        openNextChunk();
      }
      await safeWrite(chunkStream, outLine);
      chunkLines += 1;
      chunkBytes += outBytes;
      totalLines += 1;
    }
    if (chunkStream) {
      await closeStream(chunkStream);
      chunkStream = null;
    }
  } finally {
    rl.close();
    // Bug fix: if the loop above throws, the open chunk's write stream (and
    // its file descriptor) used to leak. Tear it down on the error path.
    if (chunkStream) {
      chunkStream.destroy();
      chunkStream = null;
    }
  }
  console.error(`Done. Total lines: ${totalLines}. Files created: ${filesCreated}.`);
}
// Run the CLI; report any unhandled failure on stderr and exit non-zero.
main().catch((err) => {
  const detail = err && err.stack ? err.stack : err;
  console.error("ERROR:", detail);
  process.exit(1);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment