Skip to content

Instantly share code, notes, and snippets.

@ifthenelse
Created January 1, 2026 10:30
Show Gist options
  • Select an option

  • Save ifthenelse/38768bfe71ca5323b05ace17abbe1a59 to your computer and use it in GitHub Desktop.

Select an option

Save ifthenelse/38768bfe71ca5323b05ace17abbe1a59 to your computer and use it in GitHub Desktop.
ZSH script that scans a target path with ncdu, exports the full scan as JSON, then extracts the TOP N largest directories by *disk usage* (ncdu "dsize", not apparent size). Requires ncdu and jq
#!/usr/bin/env zsh
# Strict mode: abort on any command failure (-e), on expansion of unset
# variables (-u), and let a failure anywhere in a pipeline fail the pipeline.
set -euo pipefail
# -----------------------------------------------------------------------------
# ncdu-top-dirs.zsh
#
# Scans a target path with ncdu, exports the full scan as JSON, then extracts
# the TOP N largest directories by *disk usage* (ncdu "dsize", not apparent size).
#
# Requirements:
# - ncdu (any version with -e -o export support)
# - jq (1.5+)
#
# Notes on ncdu JSON:
# - "dsize" = disk usage (allocated on disk) — this is what we sort by
# - "asize" = apparent size (included for reference only)
#
# Supported ncdu export formats:
# - Compact array nodes: ["dirname", {dsize, asize, ...}, [children...]]
# - Object nodes: {name: "...", dsize: ..., items: [...]}
# - Root at .[3], .[3][0], or nested — discovered dynamically
# -----------------------------------------------------------------------------
# print_help — write the usage/description text for this script to stdout.
# Takes no arguments and does not exit; callers decide the exit code.
print_help() {
# Quoted 'EOF' delimiter: the help text is emitted literally (no $ expansion).
cat <<'EOF'
Usage:
ncdu-top-dirs.zsh [-p <path>] [-n <count>] [-h]
Options:
-p <path> Start path to scan (default: "/")
-n <count> Number of top directories to export (default: 50)
-h Show this help and exit
Outputs (timestamped, in ~/Downloads):
- Full ncdu export JSON: ncdu-export-<timestamp>.json
- Top directories JSON list: ncdu-topdirs-<timestamp>.json
- Top directories TSV: ncdu-topdirs-<timestamp>.tsv
TSV columns:
dsize_GB asize_GB dsize_bytes asize_bytes path
Examples:
Scan root, top 50:
ncdu-top-dirs.zsh
Scan home, top 100:
ncdu-top-dirs.zsh -p "$HOME" -n 100
EOF
}
# -----------------------------------------------------------------------------
# Dependency checks — fail fast if any required tool is missing
# -----------------------------------------------------------------------------
# require_cmd <name> — return 0 if <name> resolves in PATH; otherwise print a
# diagnostic on stderr and terminate the script with status 127 (the
# conventional "command not found" code).
require_cmd() {
  local cmd="$1"
  # Guard clause: nothing to do when the tool is available.
  command -v "$cmd" >/dev/null 2>&1 && return 0
  echo "Error: required dependency '$cmd' not found in PATH." >&2
  exit 127
}
# Verify external tools up front so we fail before any scanning work starts.
require_cmd ncdu
require_cmd jq
# -----------------------------------------------------------------------------
# Defaults
# -----------------------------------------------------------------------------
START_PATH="/" # scan root; override with -p
TOP_N=50 # number of top directories to report; override with -n
# -----------------------------------------------------------------------------
# Parse command-line options
#
# -p <path>   start path to scan
# -n <count>  number of top directories to export
# -h          print help and exit 0
#
# Fix: on option errors the usage text previously went to stdout while the
# error message went to stderr; when stdout is redirected (e.g. piping the
# TSV later), the pipe received help text and the user saw only the bare
# error. Usage now follows the error onto stderr.
# -----------------------------------------------------------------------------
while getopts ":p:n:h" opt; do
  case "$opt" in
    p) START_PATH="$OPTARG" ;;
    n) TOP_N="$OPTARG" ;;
    h) print_help; exit 0 ;;  # explicit -h: help belongs on stdout
    \?)
      echo "Error: unknown option -$OPTARG" >&2
      print_help >&2
      exit 2
      ;;
    :)
      echo "Error: option -$OPTARG requires an argument." >&2
      print_help >&2
      exit 2
      ;;
  esac
done
# -----------------------------------------------------------------------------
# Validate inputs
# -----------------------------------------------------------------------------
if [[ ! -d "$START_PATH" ]]; then
  echo "Error: start path is not a directory: $START_PATH" >&2
  exit 2
fi
# Fix: the regex was quoted ('^[0-9]+$'). zsh happens to treat a quoted =~
# pattern as a regex, but bash matches a quoted RHS literally, which would
# reject every numeric count if this script were ever run under bash.
# The unquoted form behaves identically in both shells.
if ! [[ "$TOP_N" =~ ^[0-9]+$ ]] || [[ "$TOP_N" -le 0 ]]; then
  echo "Error: -n <count> must be a positive integer (got: $TOP_N)" >&2
  exit 2
fi
# -----------------------------------------------------------------------------
# Output filenames (timestamped)
# -----------------------------------------------------------------------------
TS="$(date '+%Y%m%d-%H%M%S')"
OUT_DIR="$HOME/Downloads"
FULL_JSON="$OUT_DIR/ncdu-export-$TS.json"
TOP_JSON="$OUT_DIR/ncdu-topdirs-$TS.json"
TOP_TSV="$OUT_DIR/ncdu-topdirs-$TS.tsv"
echo "Scanning: $START_PATH"
echo "Exporting full ncdu JSON to: $FULL_JSON"
# -e: include extended info (dsize, asize, mtime, etc.)
# -o: write export to file
ncdu -e -o "$FULL_JSON" "$START_PATH"
echo "Extracting TOP $TOP_N directories by disk usage (dsize)..."
echo "Writing JSON to: $TOP_JSON"
echo "Writing TSV to: $TOP_TSV"
# -----------------------------------------------------------------------------
# jq filter: robust extraction of top directories by disk usage
#
# DESIGN NOTES:
#
# 1. ROOT DISCOVERY (find_root):
# ncdu exports vary across versions. Three known formats at .[3]:
#
# FORMAT A (ncdu 1.x compact): .[3] = ["name", {metadata}, [children]]
# Direct compact-array node with name, metadata, children
#
# FORMAT B (ncdu 1.x object): .[3] = {name: "...", items: [...]}
# Direct object node with name and items array
#
# FORMAT C (ncdu 2.x flat): .[3] = [{root_meta}, {entry1}, [children1], {entry2}, [children2], ...]
# Flat alternating list where:
# .[3][0] = root directory metadata object
# .[3][1], .[3][3], .[3][5], ... = entry metadata objects
# .[3][2], .[3][4], .[3][6], ... = corresponding children arrays
#
# We detect the format dynamically and normalize to a tree structure.
#
# 2. NODE TYPE DETECTION:
# Three representations must be supported:
# - Compact array: ["name", {metadata}, [children]]
# - Object with items: {name: "...", dsize: ..., items: [...]}
# - Object metadata: {name: "...", dsize: ..., asize: ...} (ncdu 2.x)
#
# Directories are identified by having children (non-empty items/children array).
#
# 3. DISK USAGE SEMANTICS:
# - Sorting is ALWAYS by dsize (disk usage = allocated blocks)
# - asize (apparent size) is included for reference but never used for sorting
# - Only directories (nodes with children) are included in output
#
# 4. RECURSION (collect_dirs):
# We walk the tree depth-first, accumulating path prefixes.
# Each directory node yields one record; we then recurse into children.
# -----------------------------------------------------------------------------
# NOTE(review): everything between the single quotes below is ONE jq program
# held in a shell variable; the '#' lines inside it are jq comments (part of
# the string's value, ignored by jq). The '\'' sequences close/escape/reopen
# the shell quote so apostrophes survive. $N is injected by the caller via
# jq --argjson N and drives the final .[:$N] truncation.
JQ_FILTER='
# ============================================================================
# Format detection and normalization (ncdu 2.x flat list support)
# ============================================================================
# Convert ncdu 2.x flat depth-first list into tree structure
# Input at root level: [{root_meta}, child1, child2, ...]
# where each child is either:
# - {file_meta} (a standalone object for files)
# - [{dir_meta}, subchild1, subchild2, ...] (a flat list for directories)
def normalize_flat_list:
if type == "array" and length >= 1 and (.[0] | type) == "object" then
# First element is directory metadata
.[0] as $dir_meta
| {
meta: $dir_meta,
children: (
# Process children (everything after first element)
.[1:] | map(
if type == "object" then
# This is a file (no children)
{meta: ., children: []}
elif type == "array" and length >= 1 and (.[0] | type) == "object" then
# This is a directory (flat list) - recursively normalize it
normalize_flat_list
else
# Unexpected - skip or return empty
empty
end
)
)
}
else
# Not a flat list, return as-is
.
end;
# ============================================================================
# Node accessors: abstract over all three formats
# ============================================================================
# Get the name of a node
def node_name:
if type == "array" and length >= 1 and (.[0] | type) == "string" then
# Format A: compact array ["name", ...]
.[0]
elif type == "object" then
if has("name") then
# Format B/C: object with name field
.name
elif has("meta") and (.meta | type) == "object" and (.meta | has("name")) then
# Normalized flat list node
.meta.name
else
null
end
else
null
end;
# Get metadata object (contains dsize, asize, mtime, etc.)
def node_meta:
if type == "array" and length >= 2 and (.[1] | type) == "object" then
# Format A: compact array [name, {metadata}, ...]
.[1]
elif type == "object" then
if has("meta") then
# Normalized flat list node
.meta
else
# Format B/C: object node, metadata is on the node itself
.
end
else
{}
end;
# Get children array (empty for files, non-empty for directories)
def node_children:
if type == "array" and length >= 3 and (.[2] | type) == "array" then
# Format A: compact array [name, meta, [children]]
.[2]
elif type == "object" then
if has("children") and (.children | type) == "array" then
# Normalized flat list node
.children
elif has("items") and (.items | type) == "array" then
# Format B: object with items array
.items
else
[]
end
else
[]
end;
# ============================================================================
# Validation predicates
# ============================================================================
# Check if this looks like a valid ncdu filesystem node
def is_valid_node:
if type == "array" then
length >= 2 and (.[0] | type) == "string" and (.[1] | type) == "object"
elif type == "object" then
has("name") or has("meta")
else
false
end;
# Check if this node is a directory (has children)
def is_dir_node:
is_valid_node and (node_children | length) > 0;
# ============================================================================
# Root discovery: find the filesystem tree root dynamically
# ============================================================================
# Detect if this is ncdu 2.x flat list format
def is_flat_list:
(type == "array")
and (length >= 3)
and ((.[0] | type) == "object")
and ((.[0] | type) == "object" and (.[0] | has("name")))
and ((.[1] | type) == "object")
and ((.[2] | type) == "array");
# Recursively search for a valid node starting from a candidate
def find_root_recursive:
if is_flat_list then
# ncdu 2.x flat list - normalize it
normalize_flat_list
elif is_valid_node then
# Already a valid node
.
elif type == "array" and length > 0 then
# Try first element if current is just a wrapper array
.[0] | find_root_recursive
else
null
end;
# Entry point: locate the filesystem root in the ncdu export
# ncdu exports are arrays where index 3 typically contains the tree
def find_root:
if type == "array" and length > 3 then
.[3] | find_root_recursive
elif type == "array" and length > 0 then
# Fallback: search from beginning
. | find_root_recursive
elif is_valid_node then
.
else
null
end;
# ============================================================================
# Directory collection: recursively gather all directories
# ============================================================================
# Recursively collect all directories with their full paths and metadata
# Arguments: prefix (string) — the path prefix built so far
# Returns: {dirs: [...], total_dsize: N, total_asize: N}
def collect_dirs(prefix):
(node_name // "") as $name
| (if prefix == "" then $name else prefix + "/" + $name end) as $path
| node_meta as $meta
| node_children as $children
# Start with this node'\''s own size (0 for directories, non-zero for files)
| ($meta.dsize // 0) as $own_dsize
| ($meta.asize // 0) as $own_asize
# Recursively collect from children and sum their sizes
| ($children | map(
if is_valid_node then
collect_dirs($path)
else
empty
end
)) as $child_results
# Sum up all children'\''s total sizes
| ($child_results | map(.total_dsize) | add // 0) as $children_dsize
| ($child_results | map(.total_asize) | add // 0) as $children_asize
# Total for this node
| ($own_dsize + $children_dsize) as $total_dsize
| ($own_asize + $children_asize) as $total_asize
# Collect all directory records from children
| ($child_results | map(.dirs) | add // []) as $child_dirs
# Only include this node if it is a directory (has children)
| if ($children | length) > 0 then
{
dirs: ([{
path: $path,
dsize: $total_dsize,
asize: $total_asize,
mtime: ($meta.mtime // null)
}] + $child_dirs),
total_dsize: $total_dsize,
total_asize: $total_asize
}
else
# This is a file - contribute size but no dir record
{
dirs: $child_dirs,
total_dsize: $total_dsize,
total_asize: $total_asize
}
end;
# ============================================================================
# Main pipeline
# ============================================================================
find_root as $root
| if $root == null then
# Could not find valid root — output empty array (will trigger error in shell)
[]
else
$root
| collect_dirs("")
| .dirs # Extract the dirs array from the result
| sort_by(.dsize)
| reverse
| .[:$N]
end
'
# Run the extraction.
# --argjson N passes the (already validated) TOP_N count into the jq program
# as the variable $N, used by its final .[:$N] slice.
jq --argjson N "$TOP_N" "$JQ_FILTER" "$FULL_JSON" > "$TOP_JSON"
# -----------------------------------------------------------------------------
# Validate output — fail loudly if extraction yielded no directories
# -----------------------------------------------------------------------------
# An empty top-list means the jq filter found no directory nodes at all
# (or the export could not be parsed); treat that as a hard error.
DIR_COUNT=$(jq 'length' "$TOP_JSON" 2>/dev/null || echo "0")
if [[ "$DIR_COUNT" -eq 0 ]]; then
  # The whole diagnostic goes to stderr via one grouped redirection.
  # Unquoted here-doc delimiters let $FULL_JSON and the $(wc -c ...)
  # substitution expand inside the text.
  {
    cat <<DIAG_HEAD

========================================================================
ERROR: Extracted zero directories from ncdu export.
========================================================================

This may indicate:
 1. The scanned path contains only files (no subdirectories)
 2. An unsupported ncdu export format
 3. The ncdu export is corrupt or empty

Diagnostic info:
 Export file: $FULL_JSON
 File size: $(wc -c < "$FULL_JSON" | tr -d ' ') bytes

First 500 chars of export:
DIAG_HEAD
    head -c 500 "$FULL_JSON"
    cat <<DIAG_TAIL


To debug, run: jq '.[3]' "$FULL_JSON" | head -c 1000
========================================================================
DIAG_TAIL
  } >&2
  exit 1
fi
# -----------------------------------------------------------------------------
# Generate human-readable TSV
# Columns: dsize_GB asize_GB dsize_bytes asize_bytes path
#
# Fix: the header line used `echo "dsize_GB\t..."`, which relies on zsh's
# echo expanding backslash escapes — bash's echo would emit literal "\t".
# printf expands \t identically in every POSIX shell, so the header is now
# unambiguous regardless of which shell runs the script.
# -----------------------------------------------------------------------------
{
  printf 'dsize_GB\tasize_GB\tdsize_bytes\tasize_bytes\tpath\n'
  # GB columns are truncated (floor) to 2 decimal places; byte columns and
  # the path are emitted verbatim from the top-dirs JSON.
  jq -r '
    .[]
    | [
        ((.dsize / 1073741824 * 100 | floor) / 100), # dsize in GB (2 decimal places)
        ((.asize / 1073741824 * 100 | floor) / 100), # asize in GB (2 decimal places)
        .dsize,
        .asize,
        .path
      ]
    | @tsv
  ' "$TOP_JSON"
} > "$TOP_TSV"
# -----------------------------------------------------------------------------
# Summary
# -----------------------------------------------------------------------------
# Final report: one here-doc instead of a cascade of echo calls; the
# emitted bytes are identical.
cat <<SUMMARY

Done. Extracted $DIR_COUNT directories.

Output files:
 Full export: $FULL_JSON
 Top JSON: $TOP_JSON
 Top TSV: $TOP_TSV
SUMMARY
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment