Created
January 1, 2026 10:30
-
-
Save ifthenelse/38768bfe71ca5323b05ace17abbe1a59 to your computer and use it in GitHub Desktop.
ZSH script that scans a target path with ncdu, exports the full scan as JSON, then extracts the TOP N largest directories by *disk usage* (ncdu "dsize", not apparent size). Requires ncdu and jq
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env zsh
# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -euo pipefail

# -----------------------------------------------------------------------------
# ncdu-top-dirs.zsh
#
# Scans a target path with ncdu, exports the full scan as JSON, then extracts
# the TOP N largest directories by *disk usage* (ncdu "dsize", not apparent size).
#
# Requirements:
#   - ncdu (any version with -e -o export support)
#   - jq (1.5+)
#
# Notes on ncdu JSON:
#   - "dsize" = disk usage (allocated on disk) — this is what we sort by
#   - "asize" = apparent size (included for reference only)
#
# Supported ncdu export formats:
#   - Compact array nodes: ["dirname", {dsize, asize, ...}, [children...]]
#   - Object nodes:        {name: "...", dsize: ..., items: [...]}
#   - Root at .[3], .[3][0], or nested — discovered dynamically
# -----------------------------------------------------------------------------
# Print the usage text for this script to stdout.
# Arguments: none
# Outputs:   the help text (options, output files, TSV columns, examples)
print_help() {
  # Quoted delimiter: the text is emitted literally, so "$HOME" in the
  # examples is shown as-is rather than expanded.
  cat <<'EOF'
Usage:
  ncdu-top-dirs.zsh [-p <path>] [-n <count>] [-h]

Options:
  -p <path>    Start path to scan (default: "/")
  -n <count>   Number of top directories to export (default: 50)
  -h           Show this help and exit

Outputs (timestamped, in ~/Downloads):
  - Full ncdu export JSON:      ncdu-export-<timestamp>.json
  - Top directories JSON list:  ncdu-topdirs-<timestamp>.json
  - Top directories TSV:        ncdu-topdirs-<timestamp>.tsv

TSV columns:
  dsize_GB asize_GB dsize_bytes asize_bytes path

Examples:
  Scan root, top 50:
    ncdu-top-dirs.zsh
  Scan home, top 100:
    ncdu-top-dirs.zsh -p "$HOME" -n 100
EOF
}
# -----------------------------------------------------------------------------
# Dependency checks — fail fast if any required tool is missing
# -----------------------------------------------------------------------------

# Abort the script when a required command cannot be found.
# Arguments: $1 - name of the command to look up
# Outputs:   an error message on stderr when the command is missing
# Returns:   0 when the command exists; otherwise exits the shell with 127
require_cmd() {
  local cmd="$1"
  if ! command -v "$cmd" >/dev/null 2>&1; then
    printf "Error: required dependency '%s' not found in PATH.\n" "$cmd" >&2
    exit 127
  fi
}
require_cmd ncdu
require_cmd jq

# -----------------------------------------------------------------------------
# Defaults
# -----------------------------------------------------------------------------
START_PATH="/"
TOP_N=50

# -----------------------------------------------------------------------------
# Parse command-line options
# -----------------------------------------------------------------------------
# The leading ":" in the optstring selects silent error reporting, so unknown
# options arrive as "?" and missing arguments as ":" with the offending option
# letter in OPTARG.
while getopts ":p:n:h" opt; do
  case "$opt" in
    p) START_PATH="$OPTARG" ;;
    n) TOP_N="$OPTARG" ;;
    h)
      print_help
      exit 0
      ;;
    \?)
      echo "Error: unknown option -$OPTARG" >&2
      print_help >&2
      exit 2
      ;;
    :)
      echo "Error: option -$OPTARG requires an argument." >&2
      print_help >&2
      exit 2
      ;;
  esac
done

# Drop the parsed options. Anything left over is a stray positional argument
# (for example a path given without -p); reject it instead of silently
# scanning the default path.
shift $((OPTIND - 1))
if (( $# > 0 )); then
  echo "Error: unexpected argument: $1" >&2
  print_help >&2
  exit 2
fi
# -----------------------------------------------------------------------------
# Validate inputs
# -----------------------------------------------------------------------------
if [[ ! -d "$START_PATH" ]]; then
  echo "Error: start path is not a directory: $START_PATH" >&2
  exit 2
fi

# The pattern lives in a variable so that =~ treats it as a regular expression
# under both zsh and bash (bash matches a quoted right-hand side literally).
top_n_re='^[0-9]+$'

# The numeric test only runs once the value is known to be all digits. The
# 10# prefix forces base 10 so inputs such as "08" are never read as octal.
if ! [[ "$TOP_N" =~ $top_n_re ]] || (( 10#$TOP_N <= 0 )); then
  echo "Error: -n <count> must be a positive integer (got: $TOP_N)" >&2
  exit 2
fi

# Normalise away leading zeros so later consumers see a plain decimal number.
TOP_N=$(( 10#$TOP_N ))
# -----------------------------------------------------------------------------
# Output filenames (timestamped)
# -----------------------------------------------------------------------------
TS="$(date '+%Y%m%d-%H%M%S')"
OUT_DIR="$HOME/Downloads"
FULL_JSON="$OUT_DIR/ncdu-export-$TS.json"
TOP_JSON="$OUT_DIR/ncdu-topdirs-$TS.json"
TOP_TSV="$OUT_DIR/ncdu-topdirs-$TS.tsv"

# Make sure the output directory exists before ncdu tries to write into it;
# otherwise the export fails on systems without a ~/Downloads folder.
mkdir -p -- "$OUT_DIR"

echo "Scanning: $START_PATH"
echo "Exporting full ncdu JSON to: $FULL_JSON"

# -e: extended information mode (per the ncdu manual this adds ownership,
#     permissions and mtime; dsize/asize are exported either way)
# -o: write the export to the given file
ncdu -e -o "$FULL_JSON" "$START_PATH"

echo "Extracting TOP $TOP_N directories by disk usage (dsize)..."
echo "Writing JSON to: $TOP_JSON"
echo "Writing TSV to: $TOP_TSV"
# -----------------------------------------------------------------------------
# jq filter: extraction of top directories by disk usage
#
# DESIGN NOTES:
#
# 1. ROOT DISCOVERY (find_root):
#    The filesystem tree is expected at index 3 of the export array. Three
#    node layouts are recognised there:
#
#    FORMAT A (compact array):   ["name", {metadata}, [children]]
#    FORMAT B (object):          {name: "...", items: [...]}
#    FORMAT C (directory list):  [{dir_meta}, child, child, ...]
#      This is the layout described in the ncdu JSON export documentation.
#      Each child is either a {file_meta} object or another directory list
#      of the same shape. Children may appear in any order and a directory
#      may have none, so detection looks only at the first element.
#
#    The layout is detected dynamically; FORMAT C is normalized to a
#    {meta, children} tree before processing.
#
# 2. NODE TYPE DETECTION:
#    Three representations are handled by the accessors:
#      - Compact array:     ["name", {metadata}, [children]]
#      - Object with items: {name: "...", dsize: ..., items: [...]}
#      - Normalized node:   {meta: {name, dsize, asize, ...}, children: [...]}
#
#    Directories are identified by having at least one child. Empty
#    directories therefore produce no record of their own.
#
# 3. DISK USAGE SEMANTICS:
#    - Sorting is ALWAYS by dsize (disk usage = allocated blocks)
#    - asize (apparent size) is included for reference but never used for sorting
#    - A directory total is its own dsize plus the totals of all descendants
#
# 4. RECURSION (collect_dirs):
#    The tree is walked depth-first, accumulating path prefixes.
#    Each directory node yields one record; children are then visited.
# -----------------------------------------------------------------------------
JQ_FILTER='
# ============================================================================
# Format detection and normalization (ncdu directory list support)
# ============================================================================

# Convert an ncdu directory list into a {meta, children} tree.
# Input:  [{dir_meta}, child1, child2, ...]
#   where each child is either
#     - {file_meta}                      (a file: plain object)
#     - [{dir_meta}, subchild1, ...]     (a directory: nested list)
# Any other input is returned unchanged.
def normalize_flat_list:
  if type == "array" and length >= 1 and (.[0] | type) == "object" then
    # First element is the directory metadata
    .[0] as $dir_meta
    | {
        meta: $dir_meta,
        children: (
          # Everything after the first element is a child entry
          .[1:] | map(
            if type == "object" then
              # File: no children
              {meta: ., children: []}
            elif type == "array" and length >= 1 and (.[0] | type) == "object" then
              # Directory: normalize recursively
              normalize_flat_list
            else
              # Unexpected entry: drop it
              empty
            end
          )
        )
      }
  else
    # Not a directory list, return as-is
    .
  end;

# ============================================================================
# Node accessors: abstract over all supported formats
# ============================================================================

# Name of a node, or null when it cannot be determined
def node_name:
  if type == "array" and length >= 1 and (.[0] | type) == "string" then
    # Format A: compact array ["name", ...]
    .[0]
  elif type == "object" then
    if has("name") then
      # Format B: object with a name field
      .name
    elif has("meta") and (.meta | type) == "object" and (.meta | has("name")) then
      # Normalized directory-list node
      .meta.name
    else
      null
    end
  else
    null
  end;

# Metadata object of a node (holds dsize, asize, mtime, ...)
def node_meta:
  if type == "array" and length >= 2 and (.[1] | type) == "object" then
    # Format A: compact array [name, {metadata}, ...]
    .[1]
  elif type == "object" then
    if has("meta") then
      # Normalized directory-list node
      .meta
    else
      # Format B: the metadata lives on the node itself
      .
    end
  else
    {}
  end;

# Children of a node (empty for files, non-empty for directories)
def node_children:
  if type == "array" and length >= 3 and (.[2] | type) == "array" then
    # Format A: compact array [name, meta, [children]]
    .[2]
  elif type == "object" then
    if has("children") and (.children | type) == "array" then
      # Normalized directory-list node
      .children
    elif has("items") and (.items | type) == "array" then
      # Format B: object with items array
      .items
    else
      []
    end
  else
    []
  end;

# ============================================================================
# Validation predicates
# ============================================================================

# True when the input looks like a filesystem node in one of the formats
def is_valid_node:
  if type == "array" then
    length >= 2 and (.[0] | type) == "string" and (.[1] | type) == "object"
  elif type == "object" then
    has("name") or has("meta")
  else
    false
  end;

# True when the node is a directory (has at least one child)
def is_dir_node:
  is_valid_node and (node_children | length) > 0;

# ============================================================================
# Root discovery: find the filesystem tree root dynamically
# ============================================================================

# True for an ncdu directory list: an array whose first element is a named
# metadata object. Nothing is assumed about the remaining elements, because
# children can be files or directories in any order and may be absent.
# Objects carrying an "items" array are excluded so that a wrapper array
# around a Format B node is still unwrapped instead of being normalized.
def is_flat_list:
  type == "array"
  and length >= 1
  and (.[0] | type) == "object"
  and (.[0] | has("name"))
  and (.[0] | has("items") | not);

# Search for a usable root node starting from a candidate value
def find_root_recursive:
  if is_flat_list then
    # Directory list: normalize it into a tree
    normalize_flat_list
  elif is_valid_node then
    # Already a usable node
    .
  elif type == "array" and length > 0 then
    # Wrapper array: try its first element
    .[0] | find_root_recursive
  else
    null
  end;

# Entry point: locate the filesystem root in the ncdu export.
# The export is an array whose index 3 normally holds the tree.
def find_root:
  if type == "array" and length > 3 then
    .[3] | find_root_recursive
  elif type == "array" and length > 0 then
    # Fallback: search from the beginning
    . | find_root_recursive
  elif is_valid_node then
    .
  else
    null
  end;

# ============================================================================
# Directory collection: recursively gather all directories
# ============================================================================

# Collect every directory below (and including) the input node.
# Arguments: prefix (string) - the path of the parent, "" for the root
# Returns:   {dirs: [...], total_dsize: N, total_asize: N}
def collect_dirs(prefix):
  (node_name // "") as $name
  # Join without doubling the separator when the parent already ends in "/"
  # (for example when the scan root is "/").
  | (if prefix == "" then $name
     elif (prefix | endswith("/")) then prefix + $name
     else prefix + "/" + $name
     end) as $path
  | node_meta as $meta
  | node_children as $children
  # Size recorded on this node itself
  | ($meta.dsize // 0) as $own_dsize
  | ($meta.asize // 0) as $own_asize
  # Visit the children; anything that is not a recognisable node is skipped
  | ($children | map(
      if is_valid_node then
        collect_dirs($path)
      else
        empty
      end
    )) as $child_results
  # Sum of the totals reported by the children
  | ($child_results | map(.total_dsize) | add // 0) as $children_dsize
  | ($child_results | map(.total_asize) | add // 0) as $children_asize
  # Total for this node
  | ($own_dsize + $children_dsize) as $total_dsize
  | ($own_asize + $children_asize) as $total_asize
  # Directory records found below this node
  | ($child_results | map(.dirs) | add // []) as $child_dirs
  # Emit a record for this node only when it is a directory (has children)
  | if ($children | length) > 0 then
      {
        dirs: ([{
          path: $path,
          dsize: $total_dsize,
          asize: $total_asize,
          mtime: ($meta.mtime // null)
        }] + $child_dirs),
        total_dsize: $total_dsize,
        total_asize: $total_asize
      }
    else
      # Leaf: contributes its size but no directory record
      {
        dirs: $child_dirs,
        total_dsize: $total_dsize,
        total_asize: $total_asize
      }
    end;

# ============================================================================
# Main pipeline
# ============================================================================
find_root as $root
| if $root == null then
    # No usable root: emit an empty array (the shell reports this as an error)
    []
  else
    $root
    | collect_dirs("")
    | .dirs            # keep only the directory records
    | sort_by(.dsize)
    | reverse
    | .[:$N]
  end
'
# Run the extraction. The status is captured by hand so that a jq failure is
# reported with context (and jq's own exit status) rather than ending the
# script through `set -e` with no hint about which file was being processed.
jq_status=0
jq --argjson N "$TOP_N" "$JQ_FILTER" "$FULL_JSON" > "$TOP_JSON" || jq_status=$?
if (( jq_status != 0 )); then
  echo "Error: jq failed (exit $jq_status) while processing: $FULL_JSON" >&2
  exit "$jq_status"
fi

# -----------------------------------------------------------------------------
# Validate output — fail loudly if extraction yielded no directories
# -----------------------------------------------------------------------------
# Falls back to "0" when the result file cannot be read as JSON, so the
# diagnostics below are printed in that case too.
DIR_COUNT=$(jq 'length' "$TOP_JSON" 2>/dev/null || echo "0")

if [[ "$DIR_COUNT" -eq 0 ]]; then
  # Everything in this group goes to stderr.
  {
    echo ""
    echo "========================================================================"
    echo "ERROR: Extracted zero directories from ncdu export."
    echo "========================================================================"
    echo ""
    echo "This may indicate:"
    echo " 1. The scanned path contains only files (no subdirectories)"
    echo " 2. An unsupported ncdu export format"
    echo " 3. The ncdu export is corrupt or empty"
    echo ""
    echo "Diagnostic info:"
    echo " Export file: $FULL_JSON"
    # tr strips the padding some wc implementations put around the count
    echo " File size: $(wc -c < "$FULL_JSON" | tr -d ' ') bytes"
    echo ""
    echo "First 500 chars of export:"
    head -c 500 "$FULL_JSON"
    echo ""
    echo ""
    echo "To debug, run: jq '.[3]' \"$FULL_JSON\" | head -c 1000"
    echo "========================================================================"
  } >&2
  exit 1
fi
# -----------------------------------------------------------------------------
# Generate human-readable TSV
# Columns: dsize_GB asize_GB dsize_bytes asize_bytes path
# -----------------------------------------------------------------------------
{
  # printf expands \t itself, so the header does not depend on whether the
  # shell's echo interprets backslash escapes.
  printf 'dsize_GB\tasize_GB\tdsize_bytes\tasize_bytes\tpath\n'
  jq -r '
    .[]
    | [
        ((.dsize / 1073741824 * 100 | floor) / 100),  # dsize in GB, truncated to 2 decimals
        ((.asize / 1073741824 * 100 | floor) / 100),  # asize in GB, truncated to 2 decimals
        .dsize,
        .asize,
        .path
      ]
    | @tsv
  ' "$TOP_JSON"
} > "$TOP_TSV"

# -----------------------------------------------------------------------------
# Summary
# -----------------------------------------------------------------------------
echo ""
echo "Done. Extracted $DIR_COUNT directories."
echo ""
echo "Output files:"
echo " Full export: $FULL_JSON"
echo " Top JSON: $TOP_JSON"
echo " Top TSV: $TOP_TSV"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment