|
#!/usr/bin/env node |
|
/** |
|
* script.js |
|
* |
|
* - Removes base64 / base64url image blobs from JSON (PNG/JPEG/GIF/BMP/WEBP/TIFF/HEIF/AVIF/SVG) |
|
* - Handles strings with literal \n, \r, \t splits inside base64 |
|
* - Parses string fields that contain JSON (object/array), cleans them recursively |
|
* • default: re-stringifies (keeps original "string field" schema) |
|
* • with --unwrap-strings: replaces those string fields with real JSON objects/arrays |
|
* |
|
* Usage: |
|
* node script.js <input.json> <output.json> |
|
* [--min-bytes 5120] |
|
* [--keys image,photo,avatar,icon,thumbnail,wallpaper,bitmap] |
|
* [--remove-array-element] |
|
* [--unwrap-strings] |
|
*/ |
|
|
|
const fs = require('fs'); |
|
const path = require('path'); |
|
|
|
/**
 * Report a fatal error and terminate the process.
 *
 * @param {string} msg - Text written to stderr before exiting.
 * @returns {void} Does not return normally: the process exits with status 2.
 */
function die(msg) {
  console.error(msg);
  process.exit(2);
}
|
|
|
/**
 * Parse the command line into an options object.
 *
 * Recognised options:
 *   --min-bytes <n>          minimum decoded size for a blob to be removed (default 5120)
 *   --keys <a,b,c>           comma-separated, case-insensitive key fragments; an
 *                            explicitly empty list leaves the filter disabled
 *   --remove-array-element   flag, stored as `removeArrayElement`
 *   --unwrap-strings         flag, stored as `unwrapStrings`
 *
 * The first two positional arguments become `input` and `output`; further
 * positionals are ignored. Any problem is reported through `die`.
 *
 * @param {string[]} argv - Full argument vector (parsing starts at index 2).
 * @returns {{minBytes: number, keys: (string[]|null), removeArrayElement: boolean,
 *            unwrapStrings: boolean, input: string, output: string}}
 */
function parseArgs(argv) {
  const args = {
    minBytes: 5120,
    keys: null,
    removeArrayElement: false,
    unwrapStrings: false,
  };
  const pos = [];

  for (let i = 2; i < argv.length; i++) {
    const a = argv[i];
    if (a === '--min-bytes') {
      const raw = argv[++i];
      // Number('') and Number('   ') evaluate to 0, so a blank value would be
      // silently accepted as "remove everything"; treat it as invalid instead.
      const isBlank = raw === undefined || String(raw).trim() === '';
      args.minBytes = isBlank ? NaN : Number(raw);
      if (!Number.isFinite(args.minBytes) || args.minBytes < 0) die('Invalid --min-bytes value.');
    } else if (a === '--keys') {
      const raw = argv[++i];
      // A trailing `--keys` with nothing after it used to disable the filter
      // silently, which strips more data than the user asked for.
      if (raw === undefined) die('Missing value for --keys.');
      const fragments = String(raw ?? '')
        .split(',')
        .map((s) => s.trim().toLowerCase())
        .filter(Boolean);
      args.keys = fragments.length ? fragments : null;
    } else if (a === '--remove-array-element') {
      args.removeArrayElement = true;
    } else if (a === '--unwrap-strings') {
      args.unwrapStrings = true;
    } else if (a.startsWith('-')) {
      die(`Unknown option: ${a}`);
    } else {
      pos.push(a);
    }
  }

  if (pos.length < 2) {
    die('Usage: node script.js <input.json> <output.json> [--min-bytes 5120] [--keys image,photo,avatar] [--remove-array-element] [--unwrap-strings]');
  }

  args.input = pos[0];
  args.output = pos[1];
  return args;
}
|
|
|
// ---------- Base64 helpers ---------- |
|
|
|
/**
 * Matches a base64-encoded `data:image/...` URL; capture group 1 is the payload.
 *
 * Optional media-type parameters (`;charset=utf-8`, `;name=x.png`, ...) are
 * accepted between the subtype and `;base64`, as allowed by the data URL
 * syntax. The payload class admits standard and URL-safe base64 characters,
 * real whitespace, and a literal backslash so that textual escapes such as
 * "\n" embedded in the payload still match (the letters r/n/t/f are already
 * covered by A-Za-z).
 */
const dataUrlRx = /^data:image\/[a-zA-Z0-9.+-]+(?:;[a-zA-Z0-9_.-]+=[^;,]*)*;base64,([A-Za-z0-9+/=\s\\r\\n\\t\\f_-]+)$/;
|
|
|
/**
 * Canonicalise a base64 / base64url string so it can be handed to a decoder.
 *
 * Steps, in order: drop real whitespace, drop textual escape sequences
 * (a backslash followed by r, n, t or f), map the URL-safe alphabet back to
 * the standard one, then right-pad with '=' to a multiple of four characters.
 *
 * @param {string} s - Raw, possibly line-broken base64 text.
 * @returns {string} The compacted and padded string.
 */
function normalizeB64String(s) {
  const compact = s
    .replace(/\s+/g, '')       // real whitespace/newlines
    .replace(/\\[rntf]/g, '')  // escapes that survived as literal text
    .replace(/-/g, '+')        // base64url → base64
    .replace(/_/g, '/');

  const remainder = compact.length % 4;
  return remainder === 0 ? compact : compact + '='.repeat(4 - remainder);
}
|
|
|
/**
 * Cheap pre-filter: could this value plausibly be a base64 blob?
 *
 * Only strings of at least eight characters are considered. The value is
 * normalised first, and the result must have a length divisible by four and
 * consist solely of standard base64 alphabet characters and '='.
 *
 * @param {*} s - Candidate value.
 * @returns {boolean} True when the value passes every check.
 */
function isLikelyBase64(s) {
  const isCandidate = typeof s === 'string' && s.length >= 8;
  if (!isCandidate) return false;

  const canonical = normalizeB64String(s);
  const hasWholeQuads = canonical.length % 4 === 0;
  return hasWholeQuads && /^[A-Za-z0-9+/=]+$/.test(canonical);
}
|
|
|
/**
 * Decode base64 text into a Buffer, rejecting input that clearly is not base64.
 *
 * The decoded bytes are re-encoded; when the round trip is empty or shorter
 * than 60% of the normalised input, too many characters were discarded by the
 * decoder and the value is rejected.
 *
 * @param {string} str - Base64 / base64url text, possibly with line breaks.
 * @returns {Buffer|null} The decoded bytes, or null when decoding is
 *   implausible or any step throws.
 */
function tryDecodeBase64(str) {
  try {
    const canonical = normalizeB64String(str);
    const bytes = Buffer.from(canonical, 'base64');

    // Rough sanity check (avoid accepting random ASCII).
    const roundTrip = bytes.toString('base64');
    const minExpected = Math.floor(canonical.length * 0.6);
    const plausible = Boolean(roundTrip) && roundTrip.length >= minExpected;

    return plausible ? bytes : null;
  } catch {
    return null;
  }
}
|
|
|
/**
 * Decide from leading bytes whether a buffer holds an image.
 *
 * Recognises PNG, JPEG, GIF, BMP, WEBP (RIFF container), TIFF (both byte
 * orders), ISO-BMFF HEIF/AVIF brands, and SVG. An SVG is accepted when the
 * text starts with `<svg`, or starts with an XML declaration and an `<svg`
 * tag appears within the sniffed prefix; other XML documents are not images.
 *
 * @param {*} buf - Decoded bytes to inspect.
 * @returns {boolean} True when a known image signature is found; false for
 *   non-Buffer input or buffers shorter than four bytes.
 */
function detectImageFromBytes(buf) {
  if (!Buffer.isBuffer(buf) || buf.length < 4) return false;

  // How much of the payload is scanned for an <svg> tag after an XML prolog
  // (declaration, doctype and comments may precede the root element).
  const SVG_SNIFF_BYTES = 4096;

  // 'latin1' maps each byte to one character. 'ascii' would clear the high
  // bit first, letting bytes such as 0xD2 0xC9 0xC6 0xC6 pass for "RIFF".
  const tagAt = (start, end) => buf.toString('latin1', start, end);

  const [b0, b1, b2, b3] = buf;

  // PNG
  if (b0 === 0x89 && b1 === 0x50 && b2 === 0x4E && b3 === 0x47) return true;

  // JPEG
  if (b0 === 0xFF && b1 === 0xD8 && b2 === 0xFF) return true;

  // GIF
  if (b0 === 0x47 && b1 === 0x49 && b2 === 0x46 && b3 === 0x38) return true;

  // BMP
  if (b0 === 0x42 && b1 === 0x4D) return true;

  // WEBP ("RIFF....WEBP")
  if (buf.length >= 12 && tagAt(0, 4) === 'RIFF' && tagAt(8, 12) === 'WEBP') return true;

  // TIFF (little-endian "II*\0" or big-endian "MM\0*")
  if ((b0 === 0x49 && b1 === 0x49 && b2 === 0x2A && b3 === 0x00) ||
      (b0 === 0x4D && b1 === 0x4D && b2 === 0x00 && b3 === 0x2A)) return true;

  // ISO BMFF (HEIF/AVIF)
  if (buf.length >= 12 && tagAt(4, 8) === 'ftyp') {
    const brand = tagAt(8, 12);
    if (['heic', 'heix', 'mif1', 'hevc', 'hevx', 'avif', 'avis'].includes(brand)) return true;
  }

  // SVG (base64 of XML). subarray() replaces the deprecated Buffer#slice.
  const head = buf.subarray(0, SVG_SNIFF_BYTES).toString('utf8').trimStart();
  if (head.startsWith('<svg')) return true;
  // An XML declaration alone does not make the document an image.
  if (head.startsWith('<?xml') && head.includes('<svg')) return true;

  return false;
}
|
|
|
/**
 * Extract decoded image bytes from a string, if it carries any.
 *
 * Two encodings are tried in order: a `data:image/...;base64,` URL (its
 * payload is decoded), then the whole value as raw base64 / base64url text.
 * A decode only counts when the resulting bytes are recognised as an image.
 *
 * @param {*} value - Candidate value; anything but a string yields null.
 * @returns {Buffer|null} The decoded image bytes, or null when none are found.
 */
function getImagePayloadIfAny(value) {
  if (typeof value !== 'string') return null;

  // Decode `text` and keep the bytes only when they look like an image.
  const decodeAsImage = (text) => {
    const bytes = tryDecodeBase64(text);
    return bytes && detectImageFromBytes(bytes) ? bytes : null;
  };

  // data URL → extract payload
  const dataUrlMatch = dataUrlRx.exec(value);
  const fromDataUrl = dataUrlMatch ? decodeAsImage(dataUrlMatch[1]) : null;
  if (fromDataUrl) return fromDataUrl;

  // raw base64 / base64url (possibly with \n escapes)
  return isLikelyBase64(value) ? decodeAsImage(value) : null;
}
|
|
|
// ---------- Stringified-JSON handling ---------- |
|
|
|
/**
 * Heuristic check for text that might be a serialised JSON object or array.
 *
 * After trimming, the text must start with '{' and end with '}', or start
 * with '[' and end with ']'. No parsing is attempted here.
 *
 * @param {*} s - Candidate value.
 * @returns {boolean} True when the delimiters match; false for non-strings
 *   and for empty / whitespace-only strings.
 */
function looksLikeJSONText(s) {
  if (typeof s !== 'string') return false;

  const body = s.trim();
  if (body.length === 0) return false;

  const opener = body.charAt(0);
  const closer = body.charAt(body.length - 1);

  if (opener === '{') return closer === '}';
  if (opener === '[') return closer === ']';
  return false;
}
|
|
|
/**
 * Attempt to parse a string as a JSON object or array.
 *
 * The string must first pass the delimiter heuristic and must not exceed
 * `maxChars` characters (default 15,000,000), which bounds the cost of
 * parsing very large embedded documents.
 *
 * @param {*} s - Candidate text.
 * @param {number} [maxChars=15_000_000] - Longest string that will be parsed.
 * @returns {{ok: true, value: *}|{ok: false}} The parsed value on success;
 *   `{ ok: false }` when the text is rejected or does not parse.
 */
function maybeParseJSON(s, maxChars = 15_000_000) {
  if (!looksLikeJSONText(s) || s.length > maxChars) return { ok: false };

  try {
    const value = JSON.parse(s);
    return { ok: true, value };
  } catch {
    return { ok: false };
  }
}
|
|
|
// ---------- Recursive cleaner ---------- |
|
|
|
/**
 * Test a property name against the optional key filter.
 *
 * @param {string[]|null} keys - Lower-case fragments to look for; a falsy
 *   value means "no filter" and matches every name.
 * @param {*} keyName - Property name; stringified and lower-cased before the
 *   substring comparison.
 * @returns {boolean} True when no filter is set or any fragment occurs in
 *   the name.
 */
function keyMatches(keys, keyName) {
  const filterActive = Boolean(keys);
  if (!filterActive) return true;

  const haystack = String(keyName).toLowerCase();
  const occursInName = (fragment) => haystack.includes(fragment);
  return keys.some(occursInName);
}
|
|
|
/**
 * Recursively copy a JSON value with embedded image blobs removed.
 *
 * - Object properties whose string value decodes to an image of at least
 *   `minBytes` are dropped, provided the property name passes the key filter.
 * - Array elements that are such images become `null`, or are omitted when
 *   `removeArrayElement` is set. The key filter is applied to the name of the
 *   property that owns the array (`parentKey`).
 * - Strings containing a serialised JSON object/array are parsed and cleaned
 *   recursively. With `unwrapStrings` the parsed structure replaces the
 *   string; otherwise it is re-serialised, and the original text is kept
 *   byte-for-byte when cleaning changed nothing.
 *
 * The input is never mutated.
 *
 * @param {*} node - Value to clean.
 * @param {{minBytes: number, keys: (string[]|null), removeArrayElement: boolean,
 *          unwrapStrings: boolean}} opts - Cleaning options.
 * @param {string} [parentKey=''] - Name of the property that holds `node`.
 * @returns {*} The cleaned copy (primitives are returned as-is).
 */
function cleanValue(node, opts, parentKey = '') {
  const { minBytes, keys, removeArrayElement, unwrapStrings } = opts;

  // True when `text` decodes to an image large enough to be stripped.
  const isStrippableImage = (text) => {
    const payload = getImagePayloadIfAny(text);
    return Boolean(payload) && payload.length >= minBytes;
  };

  // Clean a non-image string: recurse into embedded JSON, otherwise keep it.
  const cleanStringField = (text, key) => {
    const parsed = maybeParseJSON(text);
    if (!parsed.ok) return text;

    const cleanedInner = cleanValue(parsed.value, opts, key);
    if (unwrapStrings) return cleanedInner;

    // Compare canonical serialisations so an untouched document keeps its
    // original formatting instead of being needlessly re-serialised.
    const originalStr = JSON.stringify(parsed.value);
    const cleanedStr = JSON.stringify(cleanedInner);
    return originalStr === cleanedStr ? text : cleanedStr;
  };

  if (Array.isArray(node)) {
    // Elements have no name of their own, so the filter is evaluated against
    // the property that owns the array.
    const filterAllows = keyMatches(keys, parentKey);
    const out = [];
    for (const el of node) {
      if (typeof el !== 'string') {
        out.push(cleanValue(el, opts, parentKey));
      } else if (filterAllows && isStrippableImage(el)) {
        if (!removeArrayElement) out.push(null);
      } else {
        out.push(cleanStringField(el, parentKey));
      }
    }
    return out;
  }

  if (node && typeof node === 'object') {
    const entries = [];
    for (const [k, v] of Object.entries(node)) {
      if (typeof v !== 'string') {
        entries.push([k, cleanValue(v, opts, k)]);
      } else if (keyMatches(keys, k) && isStrippableImage(v)) {
        // Matching image blob: the property is omitted from the copy.
      } else {
        entries.push([k, cleanStringField(v, k)]);
      }
    }
    // fromEntries defines own data properties, so a JSON key named
    // "__proto__" is preserved instead of replacing the copy's prototype.
    return Object.fromEntries(entries);
  }

  // primitives
  return node;
}
|
|
|
// ---------- Main ---------- |
|
|
|
/**
 * Entry point: read the input JSON, strip image blobs, write the result.
 *
 * Exits with status 1 when the input cannot be read or parsed, or when the
 * output cannot be written; prints a confirmation line on success.
 */
(function main() {
  const {
    input,
    output,
    minBytes,
    keys,
    removeArrayElement,
    unwrapStrings,
  } = parseArgs(process.argv);

  const inputPath = path.resolve(input);
  const outputPath = path.resolve(output);

  // Print an I/O failure and stop with exit status 1.
  const fail = (message) => {
    console.error(message);
    process.exit(1);
  };

  let data;
  try {
    const rawText = fs.readFileSync(inputPath, 'utf8');
    data = JSON.parse(rawText);
  } catch (e) {
    fail(`Failed to read/parse ${inputPath}: ${e.message}`);
  }

  const cleaned = cleanValue(data, { minBytes, keys, removeArrayElement, unwrapStrings });

  try {
    const serialized = JSON.stringify(cleaned, null, 2);
    fs.writeFileSync(outputPath, serialized, 'utf8');
  } catch (e) {
    fail(`Failed to write ${outputPath}: ${e.message}`);
  }

  console.log(`Wrote cleaned JSON to ${outputPath}`);
})();