@gvergnaud
Last active October 7, 2025 16:57
A partial JSON parser that supports incremental parsing and accessing the work-in-progress JSON structure.
/**
 * Why?
 * - With LLMs we often need to parse partial JSON strings incrementally,
 *   as they are being generated.
 * - Open-source partial JSON parsers all have problems:
 *   - They are inefficient: they re-parse the full JSON on each update.
 *   - They don't stream updates of string values.
 *   - They don't guarantee that a given position in the JSON keeps the same type.
 *
 * What is this?
 * - This is a JSON stream parser that:
 *   - is efficient: incremental parsing, never doing the same work twice.
 *   - streams updates of string values.
 *   - provides strong typing guarantees: types never change.
 *
 * What is missing?
 * - Error handling when the JSON is invalid.
 */
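// Illustrative usage sketch (chunk boundaries are arbitrary; the return values
// shown assume the implementation below):
//
//   const parser = new JSONStreamParser();
//   parser.write('{"name": "Ga');
//   parser.getCurrentValue(); // => { name: "Ga" }
//   parser.write('briel", "ok": tr');
//   parser.getCurrentValue(); // => { name: "Gabriel" } (incomplete "true" literal not surfaced yet)
//   parser.write('ue}');
//   parser.end(); // => { name: "Gabriel", ok: true }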
export type JSONValue = object | unknown[] | null | boolean | number | string;
type ParserScope =
| { type: "global"; current?: null | boolean | number | string }
| { type: "array"; current: unknown[] }
| { type: "object"; key: string; current: Record<string, unknown> };
export class JSONStreamParser {
private tokenizer = new JSONStreamTokenizer();
private stack: ParserScope[] = [];
private currentScope: ParserScope = { type: "global" };
write(jsonStr: string) {
const events = this.tokenizer.write(jsonStr);
this.processEvents(events);
}
end(): JSONValue {
const events = this.tokenizer.end();
this.processEvents(events);
return this.getCurrentValue();
}
getCurrentValue(): JSONValue {
const currentValue =
this.stack.length > 0 ? this.stack[0].current : this.currentScope.current;
return structuredClone(currentValue ?? null);
}
private processEvents(events: JSONParseToken[]) {
for (const event of events) {
switch (event.type) {
case "startObject": {
const newObj = {};
if (this.currentScope.type === "array") {
this.currentScope.current.push(newObj);
this.stack.push(this.currentScope);
} else if (this.currentScope.type === "object") {
this.currentScope.current[this.currentScope.key] = newObj;
this.stack.push(this.currentScope);
}
this.currentScope = { type: "object", key: "", current: newObj };
break;
}
case "startArray": {
const newArr: unknown[] = [];
if (this.currentScope.type === "array") {
this.currentScope.current.push(newArr);
this.stack.push(this.currentScope);
} else if (this.currentScope.type === "object") {
this.currentScope.current[this.currentScope.key] = newArr;
this.stack.push(this.currentScope);
}
this.currentScope = { type: "array", current: newArr };
break;
}
case "endObject":
case "endArray": {
this.currentScope = this.stack.pop() || this.currentScope;
break;
}
case "key": {
if (this.currentScope.type === "object") {
this.currentScope.key = event.value;
}
break;
}
case "value_start": {
if (this.currentScope.type === "array") {
this.currentScope.current.push(event.value);
} else if (this.currentScope.type === "object") {
this.currentScope.current[this.currentScope.key] = event.value;
} else if (this.currentScope.type === "global") {
this.currentScope.current = event.value;
}
break;
}
case "value_update": {
if (this.currentScope.type === "array") {
setLast(this.currentScope.current, event.value);
} else if (this.currentScope.type === "object") {
this.currentScope.current[this.currentScope.key] = event.value;
} else if (this.currentScope.type === "global") {
this.currentScope.current = event.value;
}
break;
}
case "value_complete": {
if (this.currentScope.type === "array") {
setLast(this.currentScope.current, event.value);
} else if (this.currentScope.type === "object") {
this.currentScope.current[this.currentScope.key] = event.value;
this.currentScope.key = "";
} else if (this.currentScope.type === "global") {
this.currentScope.current = event.value;
}
break;
}
}
}
}
}
const setLast = <T>(array: T[], value: T) => {
array[array.length ? array.length - 1 : 0] = value;
};
type JSONParseToken =
| { type: "startObject" }
| { type: "endObject" }
| { type: "startArray" }
| { type: "endArray" }
| { type: "key"; value: string }
| { type: "value_start"; value: any }
| { type: "value_update"; value: any }
| { type: "value_complete"; value: any };
const tokenStartObject = { type: "startObject" } satisfies JSONParseToken;
const tokenEndObject = { type: "endObject" } satisfies JSONParseToken;
const tokenStartArray = { type: "startArray" } satisfies JSONParseToken;
const tokenEndArray = { type: "endArray" } satisfies JSONParseToken;
type TokenizerScope =
| { type: "global" | "array" }
| { type: "object"; inValue: boolean };
type ValueContext =
| {
type: "string";
buffer: string;
parser: UnicodeStreamParser;
isComplete: boolean;
}
| { type: "literal"; buffer: string };
class JSONStreamTokenizer {
private valueContext: ValueContext | null = null;
private lastEmittedStringValueUpdate: string | null = null;
private stack: TokenizerScope[] = [];
private currentScope: TokenizerScope = { type: "global" };
get isInStringValue(): boolean {
return (
this.valueContext?.type === "string" &&
((this.currentScope.type === "object" && this.currentScope.inValue) ||
this.currentScope.type === "array" ||
this.currentScope.type === "global")
);
}
write(jsonStr: string): JSONParseToken[] {
const events: JSONParseToken[] = [];
for (const char of jsonStr) {
if (
this.valueContext?.type === "string" &&
!this.valueContext.isComplete
) {
const isEndOfString =
!this.valueContext.parser.isEscaping && char === '"';
if (isEndOfString) {
this.valueContext.buffer = this.valueContext.parser.end();
this.valueContext.isComplete = true;
} else {
this.valueContext.buffer = this.valueContext.parser.write(char);
}
const maybeEvent = this.maybeEmitValueUpdate(this.valueContext.buffer);
if (maybeEvent) appendEvent(events, maybeEvent);
continue;
}
switch (char) {
case "{": {
appendEvent(events, tokenStartObject);
this.stack.push(this.currentScope);
this.currentScope = { type: "object", inValue: false };
break;
}
case "}": {
if (this.valueContext) {
appendEvent(events, ...this.emitValueComplete());
}
appendEvent(events, tokenEndObject);
this.currentScope = this.stack.pop() || { type: "global" };
break;
}
case "[": {
appendEvent(events, tokenStartArray);
this.stack.push(this.currentScope);
this.currentScope = { type: "array" };
break;
}
case "]": {
if (this.valueContext) {
appendEvent(events, ...this.emitValueComplete());
}
appendEvent(events, tokenEndArray);
this.currentScope = this.stack.pop() || { type: "global" };
break;
}
case ":": {
if (this.valueContext?.type !== "string") {
throw new Error(
`Invalid JSON: expected a key before ":", but got ${
this.valueContext
? `the literal ${this.valueContext.buffer}`
: "nothing"
}`
);
}
appendEvent(events, {
type: "key",
value: this.valueContext.buffer,
});
this.currentScope = {
type: "object",
inValue: true,
};
this.valueContext = null;
break;
}
case ",": {
if (this.valueContext) {
appendEvent(events, ...this.emitValueComplete());
}
if (this.currentScope.type === "object") {
this.currentScope = {
type: "object",
inValue: false,
};
}
break;
}
case '"': {
this.valueContext = {
type: "string",
buffer: "",
parser: new UnicodeStreamParser(),
isComplete: false,
};
break;
}
default: {
if (char !== " " && char !== "\n" && char !== "\t") {
if (this.valueContext) {
this.valueContext.buffer += char;
} else {
this.valueContext = { type: "literal", buffer: char };
}
}
break;
}
}
}
return events;
}
end(): JSONParseToken[] {
return this.emitValueComplete();
}
private maybeEmitValueUpdate(value: string): JSONParseToken | null {
if (!this.isInStringValue) return null;
if (this.lastEmittedStringValueUpdate === null) {
this.lastEmittedStringValueUpdate = value;
return {
type: "value_start",
value: value,
};
}
if (this.lastEmittedStringValueUpdate === value) return null;
this.lastEmittedStringValueUpdate = value;
return {
type: "value_update",
value,
};
}
private emitValueComplete(): JSONParseToken[] {
const events: JSONParseToken[] = [];
// Close the string early in case the JSON is unfinished.
if (this.valueContext?.type === "string" && !this.valueContext.isComplete) {
this.valueContext.buffer = this.valueContext.parser.end();
this.valueContext.isComplete = true;
}
if (this.valueContext === null) return events;
if (this.lastEmittedStringValueUpdate === null) {
appendEvent(events, {
type: "value_start",
value: this.parseValue(this.valueContext),
});
appendEvent(events, {
type: "value_complete",
value: this.parseValue(this.valueContext),
});
} else {
appendEvent(events, {
type: "value_complete",
value: this.parseValue(this.valueContext),
});
}
this.lastEmittedStringValueUpdate = null;
this.valueContext = null;
return events;
}
private parseValue(
valueContext: ValueContext
): null | boolean | number | string {
switch (valueContext.type) {
case "string": {
return valueContext.buffer;
}
case "literal": {
if (valueContext.buffer === "true") return true;
if (valueContext.buffer === "false") return false;
if (valueContext.buffer === "null") return null;
if (!Number.isNaN(Number(valueContext.buffer)))
return Number(valueContext.buffer);
throw new Error(
`Invalid JSON: Unexpected literal: ${valueContext.buffer}`
);
}
}
}
}
const appendEvent = (
events: JSONParseToken[],
...eventsToAppend: JSONParseToken[]
) => {
for (const event of eventsToAppend) {
switch (event.type) {
case "value_update": {
const lastEvent = events[events.length - 1];
if (
lastEvent &&
(lastEvent.type === "value_start" ||
lastEvent.type === "value_update")
) {
lastEvent.value = event.value;
} else {
events.push(event);
}
break;
}
case "value_complete": {
const lastEvent = events[events.length - 1];
if (lastEvent && lastEvent.type === "value_update") {
events[events.length - 1] = event;
} else {
events.push(event);
}
break;
}
default: {
events.push(event);
break;
}
}
}
};
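// Tokenizer output sketch: a single write('{"msg": "hi') call yields, after
// appendEvent coalesces consecutive string updates:
//   { type: "startObject" }
//   { type: "key", value: "msg" }
//   { type: "value_start", value: "hi" }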
type UnicodeEscapeState = { type: "unicode_escape"; digits: string[] };
type UnicodeStreamState =
| { type: "unescaped" }
| { type: "escape" }
| UnicodeEscapeState;
class UnicodeStreamParser {
private state: UnicodeStreamState = { type: "unescaped" };
private lowSurrogateState?: { high: number };
private stringOutput: string = "";
get isEscaping(): boolean {
return this.state.type === "escape";
}
constructor(
private onError: (error: Error) => void = (err) => {
throw err;
}
) {}
write(chunk: string): string {
this.processBuffer(chunk);
return this.stringOutput;
}
end(): string {
if (this.state.type === "unicode_escape" && this.lowSurrogateState) {
this.onError(new Error("Unterminated surrogate pair"));
}
this.state = { type: "unescaped" };
return this.stringOutput;
}
private processBuffer(chunk: string): void {
for (const char of chunk) {
switch (this.state.type) {
case "unescaped":
if (char === "\\") {
this.state = { type: "escape" };
} else {
this.stringOutput += char;
}
break;
case "escape":
if (char === "u") {
this.state = { type: "unicode_escape", digits: [] };
} else {
this.handleEscapeChar(char);
this.state = { type: "unescaped" };
}
break;
case "unicode_escape":
this.state.digits.push(char);
if (this.state.digits.length === 4) {
this.handleUnicodeEscape(this.state);
this.state = { type: "unescaped" };
}
break;
}
}
}
private handleEscapeChar(char: string): void {
const escapeMap: Record<string, string> = {
'"': '"',
"\\": "\\",
"/": "/",
b: "\b",
f: "\f",
n: "\n",
r: "\r",
t: "\t",
};
this.stringOutput += escapeMap[char] ?? char;
}
private handleUnicodeEscape(state: UnicodeEscapeState): void {
const hexStr = state.digits.join("");
const codeUnit = parseInt(hexStr, 16);
if (Number.isNaN(codeUnit)) {
this.onError(new Error(`Invalid Unicode escape: \\u${hexStr}`));
return;
}
if (this.lowSurrogateState) {
// Handle low surrogate
if (codeUnit >= 0xdc00 && codeUnit <= 0xdfff) {
const high = this.lowSurrogateState.high;
const codePoint =
((high - 0xd800) << 10) + (codeUnit - 0xdc00) + 0x10000;
this.stringOutput += String.fromCodePoint(codePoint);
} else {
this.onError(new Error(`Invalid low surrogate: \\u${hexStr}`));
}
this.lowSurrogateState = undefined;
} else if (codeUnit >= 0xd800 && codeUnit <= 0xdbff) {
// High surrogate, expect low surrogate next
this.lowSurrogateState = { high: codeUnit };
} else if (codeUnit >= 0xdc00 && codeUnit <= 0xdfff) {
// Lone low surrogate
this.onError(new Error(`Lone low surrogate: \\u${hexStr}`));
} else {
// Regular Unicode character
this.stringOutput += String.fromCharCode(codeUnit);
this.lowSurrogateState = undefined;
}
}
}
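// Illustrative: UnicodeStreamParser decodes escapes even when they are split
// across chunks, e.g. a surrogate pair fed one character at a time:
//   const p = new UnicodeStreamParser();
//   for (const c of '\\uD83D\\uDE00') p.write(c);
//   p.end(); // => "😀"

// ---- Usage example (a separate gist file; it imports the parser defined above) ----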
import jsonpatch, { Operation } from "fast-json-patch";
import { JSONStreamParser, JSONValue } from "./json-stream-parser";
type ServerStreamChunk =
| { type: "initialization"; value: JSONValue }
| { type: "update"; patch: Operation[] }
| { type: "complete"; value: JSONValue };
/**
* Turns a stream of incomplete JSON into JSON Patch operations.
*/
const serverStream = function* (
jsonChunkStream: Generator<string>
): Generator<ServerStreamChunk> {
const parser = new JSONStreamParser();
let previousServerResult: JSONValue = null;
for (const chunk of jsonChunkStream) {
// Feed the chunk, then read back the work-in-progress value
// (write() itself returns void).
parser.write(chunk);
const serverResult = parser.getCurrentValue();
if (serverResult === null) {
continue;
}
if (previousServerResult === null) {
yield { type: "initialization", value: serverResult };
previousServerResult = serverResult;
continue;
}
const patch: Operation[] = jsonpatch.compare(
previousServerResult,
serverResult
);
previousServerResult = serverResult;
if (patch.length) {
yield { type: "update", patch };
}
}
const finalResult = parser.end();
yield { type: "complete", value: finalResult };
};
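// Illustrative output: streaming '{"a": 1}' as two chunks ('{"a"' then ': 1}')
// yields roughly (patch shape per fast-json-patch's compare):
//   { type: "initialization", value: {} }
//   { type: "update", patch: [{ op: "add", path: "/a", value: 1 }] }
//   { type: "complete", value: { a: 1 } }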
const createLlmJsonStream = function* (
jsonString: string
): Generator<string> {
yield* splitStringRandomly(jsonString);
}
const splitStringRandomly = (str: string) => {
const result: string[] = [];
while (str.length > 0) {
const randomIndex = numberBetween(1, Math.min(4, str.length));
result.push(str.slice(0, randomIndex));
str = str.slice(randomIndex);
}
return result;
};
const numberBetween = (min: number, max: number) => {
return Math.floor(Math.random() * (max - min + 1)) + min;
};
const jsonString = `{
"🧪 _test": "🚀 Stress-test JSON parser 🚀",
"nested": {
"level1": {
"level2": {
"level3": {
"level4": {
"level5": {
"level6": {
"level7": {
"level8": {
"level9": {
"level10": {
"deep": true,
"array": [
[
[
[
[1, 2, 3, {"a": "b"}]
]
]
]
],
"empty": {},
"nullValue": null,
"unicodeKey🌍": "unicodeValue🌍",
"escapes": "\\\\\\"\\\\\\\\\\\\/\\\\b\\\\f\\\\n\\\\r\\\\t\\u00A9\\uD83D\\uDE00\\uD83D\\uDC35",
"surrogatePair": "\\uD83D\\uDE0A",
"mixed": [
42,
-42,
3.14159265359,
-3.14159265359e+10,
1.7976931348623157e+308,
-1.7976931348623157e+308,
2.2250738585072014e-308,
true,
false,
null,
"",
" ",
"\\u0000",
"\\uFFFF",
"\\uD83D\\uDE00",
"\\uD83D\\uDC35",
{"": ""},
{"\\uD83D\\uDE00": "\\uD83D\\uDC35"},
[],
[null],
[1, "2", true, false, null, {}],
{"\\uD83D\\uDE00": ["\\uD83D\\uDC35", {"nested": {}}]}
],
"objectWithAllTypes": {
"string": "Hello\\\\nWorld\\\\u00A9",
"number": 1234567890.123456789,
"boolean": true,
"null": null,
"array": [1, "2", true, false, null, {}],
"emptyObject": {},
"emptyArray": [],
"scientificNotation": 1.23e-45,
"negativeZero": -0,
"infinityPlaceholder": "Infinity",
"naNPlaceholder": "NaN"
},
"trailingCommaObject": {
"valid": true
},
"trailingCommaArray": [1, 2, 3]
}
}
}
}
}
}
}
}
}
}
},
"specialNumbers": {
"maxInt": 9007199254740991,
"minInt": -9007199254740991,
"maxSafeInteger": 9007199254740991,
"minSafeInteger": -9007199254740991,
"maxFloat": 1.7976931348623157e+308,
"minFloat": -1.7976931348623157e+308,
"epsilon": 2.2250738585072014e-308
},
"edgeCases": {
"emptyString": "",
"whitespaceString": " \\t\\n\\r",
"controlChars": "\\u0000\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007\\b\\t\\n\\u000B\\f\\r\\u000E\\u000F\\u0010\\u0011\\u0012\\u0013\\u0014\\u0015\\u0016\\u0017\\u0018\\u0019\\u001A\\u001B\\u001C\\u001D\\u001E\\u001F",
"invalidUnicodeEscape": "\\\\uXYZ",
"unclosed": {
"object": "{",
"array": "[",
"string": "\\"unclosed"
},
"circularReferencePlaceholder": "[Circular]",
"commentsPlaceholder": "// This is not valid JSON, but some parsers might choke on it"
},
"largeArray": [
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100
],
"mixedWhitespace": {
"tabKey\\t": "tabValue\\t",
"newlineKey\\n": "newlineValue\\n",
"carriageReturnKey\\r": "carriageReturnValue\\r"
},
"unicodeKeys": {
"😊": "smile",
"❤️": "heart",
"🎉": "party",
"🐶": "dog",
"🍣": "sushi",
"🚀": "rocket"
},
"nestedArrays": [
[],
[[]],
[[[]]],
[[[[]]]],
[[[[[]]]]],
[[[[[{}]]]]],
[[[[[{"a": "b"}]]]]]
],
"finalTest": {
"validJSON": true,
"butDidItCrash?": false
}
}`
const llmJsonStream = createLlmJsonStream(jsonString)
let wipValue: unknown = {};
for (const chunk of serverStream(llmJsonStream)) {
switch (chunk.type) {
case "initialization":
wipValue = chunk.value;
break;
case "update":
jsonpatch.applyPatch(wipValue, chunk.patch);
break;
case "complete":
wipValue = chunk.value;
break;
}
}
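// `expect` is assumed to be provided by the surrounding test runner (e.g. Vitest or Jest).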
expect(wipValue).toEqual(JSON.parse(jsonString));