Aqueduct Data Files Scripts

A collection of scripts for downloading file and directory listings from various sources, plus a tool to visualize the results.

Dependencies

All scripts declare their dependencies inline (PEP 723 script metadata) and can be run directly with uv run, which resolves the declared dependencies automatically.
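
For reference, each script begins with a metadata header of this shape (the exact blocks appear at the top of the scripts below); uv reads it to build a matching environment before executing the file:

# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "pandas>=2.0.0",
# ]
# ///

With that header in place, a script runs without any prior pip install, e.g. uv run analyze_storage_by_folder.py --help.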

Scripts to get file/folder lists

Azure Blob Inventory Download

Download Azure Blob Storage inventory files for a specific date.

Authentication

To access Azure storage, you need to authenticate:

  1. Install the Azure CLI from https://learn.microsoft.com/en-us/cli/azure/install-azure-cli
  2. Log in to Azure with az login (you can verify the session as shown below)
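
To confirm which account and subscription the CLI session is using (this is the login that DefaultAzureCredential in the download script will pick up), a standard Azure CLI check is:

az account show --output table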

Usage

# Download all inventory files for 2025-08-13
uv run download_azure_inventory.py \
    --date 2025-08-13 \
    --container wri-water-container-inventory \
    --account-name uuwridata

# For more options
uv run download_azure_inventory.py --help
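
If you have a storage account connection string instead of an Azure CLI login, the script also accepts it via its --connection-string flag; the value below is a placeholder:

# Download using a connection string instead of DefaultAzureCredential
uv run download_azure_inventory.py \
    --date 2025-08-13 \
    --container wri-water-container-inventory \
    --connection-string "<your-connection-string>"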

WebDAV Tree Listing

Recursively list files from a WebDAV directory.

# List all files in a WebDAV directory
uv run list_webdav_tree.py <folder_url>

uv run list_webdav_tree.py https://geo.public.data.uu.nl/vault-pcrglobwb-cmip6/research-pcrglobwb-cmip6%5B1690540205%5D/

# For more options
uv run list_webdav_tree.py --help
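
The script's argument parser also supports JSON output, basic-auth credentials, and a polite delay between requests, for example:

# Emit JSON instead of the default text listing, pausing 0.1 s between PROPFIND requests
uv run list_webdav_tree.py \
    https://geo.public.data.uu.nl/vault-pcrglobwb-cmip6/research-pcrglobwb-cmip6%5B1690540205%5D/ \
    --output json --delay 0.1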

7z Archive Listing

List or extract files from remote/local .7z archives.

# List archive contents
uv run list_7z_http.py <7z_url>

uv run list_7z_http.py https://geo.public.data.uu.nl/vault-pcrglobwb-cmip6/research-pcrglobwb-cmip6%5B1690540205%5D/original/hypflowsci6_v1.0/input_and_code/pcrglobwb_input_aqueduct_v2021-09-16_mx9.7z

# Extract a single file
uv run list_7z_http.py <7z_url> --get <file>

uv run list_7z_http.py "https://geo.public.data.uu.nl/vault-pcrglobwb-cmip6/research-pcrglobwb-cmip6%5B1690540205%5D/original/hypflowsci6_v1.0/input_and_code/pcrglobwb_input_aqueduct_v2021-09-16_mx9.7z" --get "version_2021-09-16/general/paddy_cropfactor_filled.nc"

# For more options
uv run list_7z_http.py --help

Combine Parquet files into one

See the duckdb.md file below.

Group by folder

uv run analyze_storage_by_folder.py --input <input_csv_or_parquet_path> --output <output_csv_path>

uv run analyze_storage_by_folder.py \
  --input data/azure_blob_inventory/2025-08-13/blob-inventory-all_1000000_joined.parquet \
  --output data/azure_blob_inventory/2025-08-13/blob-inventory-all-joined-folders.csv

Visualize

Open storage_treemap.html in a browser and load the folder-level CSV produced by the step above.
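
The treemap expects the columns written by analyze_storage_by_folder.py, i.e. Folder, File_Count, and Total_Storage (bytes). A minimal illustrative CSV, with placeholder values, looks like this:

Folder,File_Count,Total_Storage
Aqueduct/baseline,1234,56789012345
Aqueduct/baseline/monthly,456,1234567890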

.gitignore

/data
.vscode/settings.json
analyze_storage_by_folder.py

#!/usr/bin/env python3
# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "pandas>=2.0.0",
#     "pyarrow>=14.0.1",
# ]
# ///
"""
Analyze blob storage inventory data to calculate storage usage by folder.
"""
import pandas as pd


def analyze_storage_by_folder(input_df):
    # Create a copy of the input dataframe to avoid modifying the original
    df = input_df.copy()
    # Extract folder paths using regex
    # This will match everything up to the last forward slash
    df['Folder'] = df['Name'].str.extract(r'^(.+)/[^/]*$')
    # Convert Content-Length to numeric, invalid values become NaN
    df['Content-Length'] = pd.to_numeric(df['Content-Length'], errors='coerce')
    # Filter out folders
    df = df[df['hdi_isfolder'] != "True"]
    # Group by folder and calculate metrics
    result = df.groupby('Folder').agg(
        File_Count=('Name', 'count'),
        Total_Storage=('Content-Length', 'sum')
    ).reset_index()
    return result


if __name__ == "__main__":
    import argparse
    from pathlib import Path

    default_input = "data/azure_blob_inventory/2025-08-13/blob-inventory-all_1000000_joined.parquet"
    default_output = "data/azure_blob_inventory/2025-08-13/blob-inventory-all-joined-folders.csv"

    parser = argparse.ArgumentParser(description='Analyze storage usage by folder from blob inventory data')
    parser.add_argument('--input', default=default_input,
                        help=f'Path to input file (CSV or Parquet, default: {default_input})')
    parser.add_argument('--output', default=default_output,
                        help=f'Path to output CSV file (default: {default_output})')
    args = parser.parse_args()

    try:
        # Determine file type from extension
        input_path = Path(args.input)
        if input_path.suffix.lower() == '.parquet':
            df = pd.read_parquet(args.input)
        else:
            df = pd.read_csv(args.input)

        result_df = analyze_storage_by_folder(df)

        # Create output directory if it doesn't exist
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        result_df.to_csv(args.output, index=False)
    except Exception as e:
        print(f"Error processing data: {str(e)}")
        exit(1)
download_azure_inventory.py

#!/usr/bin/env python3
# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "azure-storage-blob>=12.19.0",
#     "azure-identity>=1.15.0",
# ]
# ///
"""
Download Azure Blob Storage inventory files for a specific date.
"""
import os
import sys
import argparse
from datetime import datetime
from pathlib import Path

from azure.storage.blob import BlobServiceClient
from azure.identity import DefaultAzureCredential
from azure.core.exceptions import ClientAuthenticationError, HttpResponseError


def parse_arguments():
    parser = argparse.ArgumentParser(description='Download Azure Blob inventory files for a specific date')
    parser.add_argument('--date', required=True, help='Date in YYYY-MM-DD format')
    parser.add_argument('--container', required=True, help='Azure storage container name')
    parser.add_argument('--account-name', help='Storage account name (if using DefaultAzureCredential)')
    parser.add_argument('--connection-string', help='Storage account connection string')
    parser.add_argument('--inventory-prefix', default='', help='Prefix path before date folders (e.g., "wri-water-container-inventory/")')
    parser.add_argument('--output-dir', default='./inventory_downloads', help='Local directory for downloads')
    parser.add_argument('--preserve-structure', action='store_true', help='Preserve the full directory structure when downloading')
    return parser.parse_args()


def print_auth_help():
    """Print authentication help message."""
    print("\n" + "=" * 70)
    print("AUTHENTICATION REQUIRED")
    print("=" * 70)
    print("\nYou need to authenticate to access this Azure storage account.")
    print("\nOption 1: Use a connection string")
    print("  --connection-string \"<your-connection-string>\"")
    print("\nOption 2: Use Azure CLI (with --account-name)")
    print("  1. Install Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli")
    print("  2. Login: az login")
    print("  3. Run this script with --account-name <storage-account>")
    print("\nOption 3: Set environment variables (with --account-name)")
    print("  export AZURE_CLIENT_ID=<your-client-id>")
    print("  export AZURE_TENANT_ID=<your-tenant-id>")
    print("  export AZURE_CLIENT_SECRET=<your-client-secret>")
    print("\nFor more info: https://aka.ms/azsdk/python/identity/defaultazurecredential/troubleshoot")
    print("=" * 70 + "\n")


def print_permission_help(account_name, container):
    """Print permission error help message."""
    print("\n" + "=" * 70)
    print("PERMISSION DENIED")
    print("=" * 70)
    print(f"\nYou are authenticated but don't have permission to access:")
    print(f"  Storage Account: {account_name}")
    print(f"  Container: {container}")
    print("\nPossible solutions:")
    print("1. Contact the storage account owner to grant you access")
    print("2. Use a connection string with appropriate permissions:")
    print("   --connection-string \"<connection-string>\"")
    print("3. If this is a public container, it may require anonymous access")
    print("=" * 70 + "\n")


def get_blob_service_client(args):
    """Initialize BlobServiceClient with either connection string or default credentials."""
    if args.connection_string:
        return BlobServiceClient.from_connection_string(args.connection_string)
    elif args.account_name:
        account_url = f"https://{args.account_name}.blob.core.windows.net"
        credential = DefaultAzureCredential()
        return BlobServiceClient(account_url=account_url, credential=credential)
    else:
        print("Error: Provide either --connection-string or --account-name")
        sys.exit(1)


def download_inventory_files(blob_service_client, container_name, date_str, inventory_prefix, output_dir, preserve_structure, account_name=None):
    """Download all inventory files for the specified date."""
    container_client = blob_service_client.get_container_client(container_name)

    # Parse date to get year/month/day components
    date_obj = datetime.strptime(date_str, '%Y-%m-%d')
    year = date_obj.strftime('%Y')
    month = date_obj.strftime('%m')
    day = date_obj.strftime('%d')

    # Build the date-based prefix (e.g., "2025/08/13/")
    # This will match all time folders for that date
    date_prefix = f"{inventory_prefix}{year}/{month}/{day}/"

    # Create base output directory
    output_path = Path(output_dir) / date_str
    output_path.mkdir(parents=True, exist_ok=True)

    downloaded_count = 0
    total_size = 0

    try:
        print(f"Searching for blobs with prefix: {date_prefix}")
        blobs = container_client.list_blobs(name_starts_with=date_prefix)
        for blob in blobs:
            # Skip if it's not a CSV file (inventory files are CSVs)
            if not blob.name.endswith('.csv'):
                continue
            blob_client = container_client.get_blob_client(blob.name)
            # Determine local file path
            if preserve_structure:
                # Keep the time folder and subdirectory structure
                # Remove the inventory prefix but keep date/time/subfolder structure
                relative_path = blob.name
                if inventory_prefix:
                    relative_path = blob.name[len(inventory_prefix):]
                local_file_path = output_path.parent / relative_path
                local_file_path.parent.mkdir(parents=True, exist_ok=True)
            else:
                # Flatten structure - just use the filename
                local_file_path = output_path / Path(blob.name).name
            print(f"Downloading: {blob.name} -> {local_file_path}")
            with open(local_file_path, 'wb') as file:
                download_stream = blob_client.download_blob()
                data = download_stream.readall()
                file.write(data)
                total_size += len(data)
            downloaded_count += 1
    except ClientAuthenticationError as e:
        print(f"\nError: Authentication failed!")
        print_auth_help()
        sys.exit(1)
    except HttpResponseError as e:
        if "AuthorizationPermissionMismatch" in str(e) or "AuthorizationFailure" in str(e):
            print(f"\nError: Permission denied!")
            print_permission_help(account_name, container_name)
        else:
            print(f"Error downloading files: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Error downloading files: {e}")
        sys.exit(1)

    return downloaded_count, total_size


def main():
    args = parse_arguments()

    # Validate date format
    try:
        datetime.strptime(args.date, '%Y-%m-%d')
    except ValueError:
        print(f"Error: Invalid date format '{args.date}'. Use YYYY-MM-DD")
        sys.exit(1)

    # Initialize Azure client
    blob_service_client = get_blob_service_client(args)

    # Download inventory files
    print(f"Downloading inventory files for date: {args.date}")
    print(f"Container: {args.container}")
    if args.inventory_prefix:
        print(f"Inventory prefix: {args.inventory_prefix}")

    count, size = download_inventory_files(
        blob_service_client,
        args.container,
        args.date,
        args.inventory_prefix,
        args.output_dir,
        args.preserve_structure,
        args.account_name
    )

    if count == 0:
        print(f"\nNo inventory files found for date {args.date}")
    else:
        size_mb = size / (1024 * 1024)
        print(f"\nSuccessfully downloaded {count} inventory files ({size_mb:.2f} MB) to {args.output_dir}/{args.date}")


if __name__ == '__main__':
    main()

duckdb.md

Are all schemas identical across Parquet files?

duckdb -c "
WITH s AS (
  SELECT file_name,
         md5(string_agg(name || ':' || CAST(\"type\" AS VARCHAR), ',' ORDER BY name)) AS sig
  FROM parquet_schema('*.parquet')
  GROUP BY file_name
)
SELECT CASE WHEN COUNT(DISTINCT sig)=1
     THEN printf('OK: all %d files share the same columns and types', COUNT(*))
     ELSE printf('%d different schemas found', COUNT(DISTINCT sig))
END AS result
FROM s;
"

Which files have different schemas?

duckdb -c "
WITH per_file AS (
  SELECT file_name,
         string_agg(name || ':' || CAST(\"type\" AS VARCHAR), ', ' ORDER BY name) AS schema_str
  FROM parquet_schema('*.parquet')
  GROUP BY file_name
)
SELECT schema_str, string_agg(file_name, '\n  - ' ORDER BY file_name) AS files
FROM per_file
GROUP BY schema_str;
"

Show both per-file and total

duckdb -c "
SELECT COALESCE(filename, '=== TOTAL ===') AS file, COUNT(*) AS rows
FROM read_parquet('*.parquet', filename=true)
GROUP BY ROLLUP(filename)
ORDER BY filename NULLS LAST;
"

Count columns

duckdb -c "
SELECT file_name, COUNT(*) AS num_columns
FROM parquet_schema('*.parquet')
GROUP BY file_name
ORDER BY file_name;
"

Join all Parquet files, keeping just the columns of interest

duckdb -c "
COPY (
  SELECT \"Name\", \"Creation-Time\", \"Last-Modified\", \"Etag\", \"Content-Length\", \"Content-Type\", \"Content-CRC64\", \"Content-MD5\", \"AccessTier\", \"AccessTierChangeTime\", \"AccessTierInferred\", \"hdi_isfolder\", \"Owner\", \"Group\", \"Permissions\", \"Acl\" 
  FROM read_parquet('*.parquet', union_by_name=true)
) TO 'blob-inventory-all_1000000_joined.parquet' (FORMAT PARQUET);
"
list_7z_http.py

# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "py7zr",
#     "fsspec",
#     "aiohttp",
#     "requests",
# ]
# ///
import sys
import argparse
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory
from datetime import datetime

import fsspec
import py7zr


def human(n):
    # simple humanize, base-1024
    for unit in ("", "K", "M", "G", "T"):
        if n < 1024:
            return f"{n:.0f}{unit}"
        n /= 1024
    return f"{n:.0f}P"


def iso(dt):
    if not dt:
        return ""
    # ensure consistent display
    if isinstance(dt, datetime):
        return dt.isoformat(sep=" ", timespec="seconds")
    return str(dt)


def file_mtime(fi):
    """Best-effort to get a modification time from a py7zr file info object.

    Different py7zr versions/archives may expose different fields.
    """
    for attr in (
        "modified_time",  # common
        "writetime",
        "write_time",
        "mtime",
        "datetime",
        "creationtime",
        "create_time",
    ):
        v = getattr(fi, attr, None)
        if v:
            # Convert numeric epoch to datetime if encountered
            if isinstance(v, (int, float)):
                try:
                    return datetime.fromtimestamp(v)
                except Exception:
                    pass
            return v
    return None


parser = argparse.ArgumentParser(
    description="List a .7z archive (remote/local) or extract a single file.",
    usage="%(prog)s <url> [--get <file>]",
)
parser.add_argument("url", metavar="<url>", help="URL or path to a .7z archive")
parser.add_argument("--get", dest="file", metavar="<file>", help="Extract a single file to the current directory (preserving its path)")
args = parser.parse_args()
url = args.url

# Single-file extraction path (avoid listing)
if args.file:
    target = args.file
    with fsspec.open(url, mode="rb") as fh:
        with py7zr.SevenZipFile(fh, mode="r") as z:
            # Validate presence (exact match preferred)
            entries = [fi for fi in z.list() if not getattr(fi, "is_directory", False)]
            names = [getattr(fi, "filename", "") for fi in entries]
            if target not in names:
                # Relax to unique basename match
                candidates = [n for n in names if Path(n).name == Path(target).name]
                if len(candidates) == 1:
                    target = candidates[0]
                elif len(candidates) > 1:
                    print("Multiple entries match by basename; please specify full path:")
                    for n in candidates:
                        print(f"  {n}")
                    sys.exit(2)
                else:
                    print(f"File not found in archive: {args.file}")
                    sys.exit(2)
            out_path = Path(target)
            out_path.parent.mkdir(parents=True, exist_ok=True)
            with TemporaryDirectory() as td:
                try:
                    z.extract(path=td, targets=[target])
                except TypeError:
                    # Older py7zr signature may require positional path
                    z.extract(td, targets=[target])
                tmp_src = Path(td) / target
                if not tmp_src.exists():
                    # Fallback: search by basename under temp tree
                    matches = list(Path(td).rglob(Path(target).name))
                    if not matches:
                        print(f"Extraction failed: {target} not found in temp dir")
                        sys.exit(2)
                    tmp_src = matches[0]
                # Stream copy to output
                with open(tmp_src, "rb") as rf, open(out_path, "wb") as wf:
                    shutil.copyfileobj(rf, wf, length=1024 * 1024)
    print(f"Saved: {out_path} ({human(out_path.stat().st_size)})")
    sys.exit(0)

# Header
print(f'Listing archive: {url}\n')
print(f'{"Modified":19} {"Size":>12} {"Packed":>12} Name')
print('-' * 64)

total_size = 0
total_packed = 0
count = 0

with fsspec.open(url, mode="rb") as fh:
    with py7zr.SevenZipFile(fh, mode="r") as z:
        # py7zr.list() yields file info objects with attributes:
        # filename, uncompressed, compressed, modified_time, is_directory, etc.
        for fi in z.list():
            if getattr(fi, "is_directory", False):
                continue
            name = getattr(fi, "filename", "")
            size = int(getattr(fi, "uncompressed", 0) or 0)
            packed = int(getattr(fi, "compressed", 0) or 0)
            mtime = file_mtime(fi)
            print(f'{iso(mtime):19} {human(size):>12} {human(packed):>12} {name}')
            total_size += size
            total_packed += packed
            count += 1

print('-' * 64)
print(f'Files: {count}  Total size: {round(total_size / 1024**3, 3)} GB  Total packed: {round(total_packed / 1024**3, 3)} GB')
list_webdav_tree.py

# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "requests",
# ]
# ///
from __future__ import annotations  # allow "X | None" annotations on Python 3.9

import sys
import time
import argparse
import urllib.parse as urlparse
import xml.etree.ElementTree as ET
from collections import deque

import requests

DAV_NS = "{DAV:}"
PROPFIND_BODY = (
    "<?xml version=\"1.0\" encoding=\"utf-8\"?>"
    "<d:propfind xmlns:d=\"DAV:\">"
    "<d:prop><d:resourcetype/><d:getcontentlength/><d:getlastmodified/><d:etag/></d:prop>"
    "</d:propfind>"
)


def normalize_dir(url: str) -> str:
    return url if url.endswith("/") else url + "/"


def same_tree(url: str, base: str) -> bool:
    u = urlparse.urlsplit(url)
    b = urlparse.urlsplit(base)
    if (u.scheme, u.netloc) != (b.scheme, b.netloc):
        return False
    # Compare using unquoted (decoded) paths to ignore % encoding differences
    u_path = normalize_dir(urlparse.unquote(u.path))
    b_path = normalize_dir(urlparse.unquote(b.path))
    return u_path.startswith(b_path)


def human_bytes(b: float) -> str:
    b = float(b or 0)
    units = ["B", "KB", "MB", "GB", "TB", "PB"]
    i = 0
    while b >= 1024 and i < len(units) - 1:
        b /= 1024
        i += 1
    return f"{b:.1f}{units[i]}" if i else f"{int(b)}{units[i]}"


def propfind(url: str, auth: tuple[str, str] | None = None) -> ET.Element:
    headers = {
        "Depth": "1",
        "Accept": "application/xml",
        "Content-Type": 'application/xml; charset="utf-8"',
    }
    r = requests.request("PROPFIND", url, headers=headers, data=PROPFIND_BODY, auth=auth)
    r.raise_for_status()
    # Some servers return bytes; ensure text for ET
    text = r.text if isinstance(r.text, str) else r.content.decode("utf-8", "replace")
    return ET.fromstring(text)


def crawl(base_url: str, auth: tuple[str, str] | None, delay: float = 0.0, max_pages: int | None = None):
    base_url = normalize_dir(base_url)
    q = deque([base_url])
    seen = set()
    pages = 0
    while q:
        cur = q.popleft()
        if cur in seen:
            continue
        seen.add(cur)
        try:
            root = propfind(cur, auth=auth)
        except Exception as e:
            print(f"WARN: PROPFIND failed for {cur}: {e}", file=sys.stderr)
            continue
        for resp in root.findall(f".//{DAV_NS}response"):
            href_el = resp.find(f"{DAV_NS}href")
            if href_el is None or not href_el.text:
                continue
            abs_url = urlparse.urljoin(cur, href_el.text)
            # stay in tree
            if not same_tree(abs_url, base_url):
                continue
            rt = resp.find(f".//{DAV_NS}resourcetype")
            is_dir = rt is not None and rt.find(f"{DAV_NS}collection") is not None
            # many servers echo the current directory as an entry; skip it
            # Compare on decoded paths to avoid % encoding mismatches
            cur_path = normalize_dir(urlparse.unquote(urlparse.urlsplit(cur).path)).rstrip("/")
            abs_path = normalize_dir(urlparse.unquote(urlparse.urlsplit(abs_url).path)).rstrip("/")
            if abs_path == cur_path:
                continue
            if is_dir:
                q.append(normalize_dir(abs_url))
            else:
                size_el = resp.find(f".//{DAV_NS}getcontentlength")
                mod_el = resp.find(f".//{DAV_NS}getlastmodified")
                etag_el = resp.find(f".//{DAV_NS}etag")
                yield {
                    "url": abs_url,
                    "size": int(size_el.text) if size_el is not None and size_el.text and size_el.text.isdigit() else size_el.text if size_el is not None else None,
                    "modified": mod_el.text if mod_el is not None else None,
                    "etag": etag_el.text if etag_el is not None else None,
                }
        pages += 1
        if max_pages is not None and pages >= max_pages:
            break
        if delay:
            time.sleep(delay)


def main(argv: list[str]) -> int:
    ap = argparse.ArgumentParser(description="Recursively list files from a WebDAV directory using PROPFIND Depth:1.")
    ap.add_argument("url", help="Base WebDAV directory URL to start from")
    ap.add_argument("--user", default=None, help="Username (if required)")
    ap.add_argument("--password", default=None, help="Password (if required)")
    ap.add_argument("--output", choices=["text", "json"], default="text", help="Output format")
    ap.add_argument("--delay", type=float, default=0.0, help="Polite delay between requests")
    ap.add_argument("--max-pages", type=int, default=None, help="Limit number of directory pages crawled")
    args = ap.parse_args(argv)

    base = args.url
    if not base.startswith(("http://", "https://")):
        print("ERROR: url must start with http:// or https://", file=sys.stderr)
        return 2

    auth = None
    if args.user is not None or args.password is not None:
        auth = (args.user or "", args.password or "")

    items = list(crawl(base, auth=auth, delay=args.delay, max_pages=args.max_pages))

    if args.output == "json":
        import json
        print(json.dumps({
            "base": normalize_dir(base),
            "count": len(items),
            "files": items,
        }, indent=2))
    else:
        for it in items:
            size = it.get("size")
            human = human_bytes(size) if isinstance(size, (int, float)) or (isinstance(size, str) and size.isdigit()) else str(size)
            print(f"{human:>8}\t{it.get('url')}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "pandas>=2.0.0",
#     "pyarrow>=14.0.1",
# ]
# ///
"""
Read inventory data from a CSV.GZ file using schema information from manifest.json
"""
import pandas as pd
import json
import os
from pathlib import Path
import argparse


def read_inventory_with_schema(csv_file_path, manifest_file_path=None):
    """
    Read inventory data from a CSV file using schema from manifest.json

    Args:
        csv_file_path (str): Path to the CSV or CSV.GZ file
        manifest_file_path (str, optional): Path to the manifest.json file.
            If None, will look in the same directory as csv_file_path

    Returns:
        pd.DataFrame: DataFrame with the inventory data
    """
    csv_path = Path(csv_file_path)

    # If manifest_file_path is not provided, look in the same directory as csv_file_path
    if manifest_file_path is None:
        manifest_file_path = csv_path.parent / "manifest.json"

    # Check if manifest file exists
    if not Path(manifest_file_path).exists():
        print(f"Warning: Manifest file not found at {manifest_file_path}")
        print("Reading CSV without schema information...")
        return pd.read_csv(csv_file_path)

    # Read the manifest file
    try:
        with open(manifest_file_path, 'r') as f:
            manifest_data = json.load(f)
        if 'fileSchema' not in manifest_data:
            print("Warning: No 'fileSchema' found in manifest file")
            print("Reading CSV without schema information...")
            return pd.read_csv(csv_file_path)
        # Parse column headers from fileSchema
        columns = [col.strip() for col in manifest_data['fileSchema'].split(',')]
        print(f"Found schema with {len(columns)} columns: {', '.join(columns[:5])}...")
        # Read the CSV file with the schema
        df = pd.read_csv(csv_file_path, names=columns, header=None)
        return df
    except Exception as e:
        print(f"Error reading manifest or applying schema: {str(e)}")
        print("Falling back to reading CSV without schema...")
        return pd.read_csv(csv_file_path)


def main():
    parser = argparse.ArgumentParser(description='Read inventory data using schema from manifest.json')
    parser.add_argument('--input', default='data/wri-projects/csv/9a8e0fee-c83e-4269-8368-dbdc2d92b042.csv.gz',
                        help='Path to input CSV or CSV.GZ file')
    parser.add_argument('--manifest', default=None,
                        help='Path to manifest.json file (default: same directory as input file)')
    parser.add_argument('--output', default='data/wri-projects/csv/output.csv',
                        help='Path to output CSV file (optional)')
    parser.add_argument('--show-columns', action='store_true',
                        help='Show column names and exit')
    parser.add_argument('--show-sample', action='store_true',
                        help='Show sample data (first 5 rows)')
    parser.add_argument('--analyze-storage', action='store_true',
                        help='Analyze storage by folder (like analyze_storage_by_folder.py)')
    parser.add_argument('--filter-prefix', default="Aqueduct",
                        help='Only include folders starting with this prefix (e.g., "data/raw/")')
    args = parser.parse_args()

    try:
        # Read the data
        df = read_inventory_with_schema(args.input, args.manifest)

        # Show columns if requested
        if args.show_columns:
            print(f"\nColumns in {args.input}:")
            for i, column in enumerate(df.columns):
                print(f"  {i}: {column}")

        # Show sample data if requested
        if args.show_sample:
            print(f"\nSample data (first 5 rows):")
            print(df.head(5))

        # Analyze storage by folder if requested
        if args.analyze_storage:
            # Try to detect appropriate column names for storage analysis
            name_column = None
            size_column = None
            # Look for common column names
            for col in df.columns:
                if col.lower() in ['key', 'name', 'path']:
                    name_column = col
                if col.lower() in ['size', 'content-length', 'contentlength']:
                    size_column = col
            if name_column is None or size_column is None:
                print("\nCouldn't automatically detect name/size columns.")
                if not args.show_columns:
                    print("Use --show-columns to see available columns.")
                print("Please specify columns manually for analysis:")
                print("Example: --name-column 'Key' --size-column 'Size'")
            else:
                # Calculate storage by folder
                df['Folder'] = df[name_column].astype(str).str.extract(r'^(.+)/[^/]*$')
                # Apply folder prefix filter if specified
                if args.filter_prefix:
                    print(f"\nFiltering folders by prefix: '{args.filter_prefix}'")
                    # Create a filtered dataframe with only matching entries
                    filtered_df = df[df[name_column].astype(str).str.startswith(args.filter_prefix)]
                    if len(filtered_df) == 0:
                        print(f"Warning: No entries found with prefix '{args.filter_prefix}'")
                        print(f"Total entries before filtering: {len(df)}")
                    else:
                        print(f"Filtered {len(filtered_df)} entries out of {len(df)} total")
                        df = filtered_df
                result = df.groupby('Folder', dropna=False).agg(
                    File_Count=(name_column, 'count'),
                    Total_Storage=(size_column, 'sum')
                ).reset_index()
                # Sort by storage size (descending)
                result = result.sort_values('Total_Storage', ascending=False)
                print(f"\nStorage analysis by folder (using {name_column} and {size_column}):")
                print(result.head(10))
                if args.output:
                    result.to_csv(args.output, index=False)
                    print(f"\nFull results saved to {args.output}")
        # Save to output if specified
        elif args.output:
            df.to_csv(args.output, index=False)
            print(f"Data saved to {args.output}")
    except Exception as e:
        print(f"Error: {str(e)}")
        return 1
    return 0


if __name__ == "__main__":
    exit(main())
storage_treemap.html

<!DOCTYPE html>
<html>
<head>
<title>Storage Treemap Visualization</title>
<script src="https://d3js.org/d3.v7.min.js"></script>
<style>
html, body {
height: 100%;
margin: 0;
padding: 0;
font-family: Arial, sans-serif;
overflow: hidden;
}
body {
display: flex;
flex-direction: column;
}
#header {
padding: 10px 20px;
background: #f5f5f5;
border-bottom: 1px solid #ddd;
}
#title-row {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 10px;
}
h1 {
margin: 0;
font-size: 24px;
}
#file-controls {
display: flex;
align-items: center;
gap: 10px;
}
#controls {
margin: 0;
}
#treemap {
flex: 1;
position: relative;
width: 100%;
}
.node {
position: absolute;
overflow: hidden;
cursor: pointer;
line-height: 1.2;
font-size: 10px;
}
.node-label {
padding: 4px;
color: white;
text-shadow: 0 0 3px rgba(0,0,0,0.8);
overflow: hidden;
text-overflow: ellipsis;
}
#tooltip {
position: absolute;
padding: 10px;
background: rgba(0,0,0,0.9);
color: white;
border-radius: 5px;
pointer-events: none;
display: none;
z-index: 1000;
}
input[type="file"] { margin-right: 10px; }
#cached-info {
display: inline-block;
margin-left: 10px;
color: #666;
font-size: 12px;
}
#clear-cache {
margin-left: 5px;
font-size: 11px;
cursor: pointer;
color: #0066cc;
text-decoration: underline;
}
#depth-control {
display: inline-flex;
align-items: center;
margin-left: 10px;
}
#maxDepth {
width: 150px;
margin: 0 8px;
}
#depth-value {
min-width: 20px;
font-weight: bold;
}
#breadcrumb {
padding: 5px 20px;
background: #fff;
border-bottom: 1px solid #ddd;
font-size: 14px;
min-height: 20px;
}
#breadcrumb span {
color: #0066cc;
cursor: pointer;
text-decoration: underline;
}
#breadcrumb span:hover {
color: #0044aa;
}
#breadcrumb .separator {
color: #666;
margin: 0 5px;
text-decoration: none;
cursor: default;
}
#breadcrumb .current {
color: #333;
text-decoration: none;
cursor: default;
font-weight: bold;
}
.node.folder {
cursor: pointer;
}
.node.folder:hover {
opacity: 0.8;
}
#welcome {
position: absolute;
top: 50%;
left: 50%;
transform: translate(-50%, -50%);
text-align: center;
padding: 40px;
background: #f9f9f9;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
#welcome h2 {
margin: 0 0 20px 0;
color: #333;
}
#welcome p {
margin: 0 0 30px 0;
color: #666;
}
#welcome input[type="file"] {
padding: 10px 20px;
font-size: 16px;
cursor: pointer;
}
.has-data #welcome {
display: none;
}
.has-data #header,
.has-data #breadcrumb,
.has-data #treemap {
display: block;
}
body:not(.has-data) #header,
body:not(.has-data) #breadcrumb,
body:not(.has-data) #treemap {
display: none;
}
</style>
</head>
<body>
<div id="welcome">
<h2>Storage Treemap Visualization</h2>
<p>Select a CSV file to visualize storage data</p>
<input type="file" id="welcomeFileInput" accept=".csv">
</div>
<div id="header">
<div id="title-row">
<h1>Storage Treemap Visualization</h1>
<div id="file-controls">
<input type="file" id="csvFile" accept=".csv">
<span id="cached-info" style="display: none;">
Using cached: <span id="cached-filename"></span>
<span id="clear-cache">[clear]</span>
</span>
</div>
</div>
<div id="controls">
<label>Size by: <select id="sizeBy">
<option value="storage" selected>Total Storage</option>
<option value="count">File Count</option>
</select></label>
<label style="margin-left: 10px;">Color by: <select id="colorBy">
<option value="size">Total Storage</option>
<option value="count" selected>File Count</option>
</select></label>
<span id="depth-control">
<label>Max Depth:</label>
<input type="range" id="maxDepth" min="1" max="10" value="10">
<span id="depth-value">10</span>
</span>
</div>
</div>
<div id="breadcrumb"></div>
<div id="treemap"></div>
<div id="tooltip"></div>
<script>
let currentData = null; // Store current data for resize events
let originalData = null; // Store original unfiltered data
let maxDataDepth = 10; // Store the maximum depth found in the data
let navigationPath = []; // Store navigation path for breadcrumb
// Check for cached data on page load
window.addEventListener('DOMContentLoaded', function() {
const cachedCSV = localStorage.getItem('treemap_csv_data');
const cachedFilename = localStorage.getItem('treemap_csv_filename');
if (cachedCSV && cachedFilename) {
// Show cached info
document.body.classList.add('has-data');
document.getElementById('cached-info').style.display = 'inline-block';
document.getElementById('cached-filename').textContent = cachedFilename;
// Process the cached data
const csv = d3.csvParse(cachedCSV);
processData(csv);
}
});
// Clear cache handler
document.getElementById('clear-cache').addEventListener('click', function() {
localStorage.removeItem('treemap_csv_data');
localStorage.removeItem('treemap_csv_filename');
document.getElementById('cached-info').style.display = 'none';
// Clear the visualization
document.getElementById('treemap').innerHTML = '';
document.getElementById('breadcrumb').innerHTML = '';
currentData = null;
originalData = null;
navigationPath = [];
// Show welcome screen again
document.body.classList.remove('has-data');
});
// Handle file input from welcome screen
document.getElementById('welcomeFileInput').addEventListener('change', handleFileSelect);
document.getElementById('csvFile').addEventListener('change', handleFileSelect);
function handleFileSelect(e) {
const file = e.target.files[0];
if (file) {
const reader = new FileReader();
reader.onload = function(e) {
const csvContent = e.target.result;
// Store in localStorage
try {
localStorage.setItem('treemap_csv_data', csvContent);
localStorage.setItem('treemap_csv_filename', file.name);
// Update UI
document.body.classList.add('has-data');
document.getElementById('cached-info').style.display = 'inline-block';
document.getElementById('cached-filename').textContent = file.name;
} catch (e) {
console.warn('Could not cache file (localStorage full or disabled):', e);
document.body.classList.add('has-data');
}
const csv = d3.csvParse(csvContent);
processData(csv);
// Sync file inputs
if (e.target.id === 'welcomeFileInput') {
document.getElementById('csvFile').files = e.target.files;
} else {
document.getElementById('welcomeFileInput').files = e.target.files;
}
};
reader.readAsText(file);
}
}
function processData(csv) {
// Build hierarchy from CSV data
const root = {name: "root", children: []};
const nodeMap = {};
maxDataDepth = 0; // Reset max depth
// Process each row
csv.forEach(row => {
const path = row.Folder || "";
if (!path) return;
const parts = path.split('/').filter(p => p);
const fileCount = +row.File_Count || 0;
const totalStorage = +row.Total_Storage || 0;
// Track maximum depth
maxDataDepth = Math.max(maxDataDepth, parts.length);
// Build the tree structure
let currentPath = "";
let parent = root;
parts.forEach((part, index) => {
currentPath = currentPath ? `${currentPath}/${part}` : part;
if (!nodeMap[currentPath]) {
const node = {
name: part,
path: currentPath,
depth: index + 1 // Add depth tracking
};
// If this is a leaf node (last part), add the values
if (index === parts.length - 1) {
node.fileCount = fileCount;
node.totalStorage = totalStorage;
// Store both metrics, we'll decide which to use for value later
node.storageValue = totalStorage;
node.countValue = fileCount;
} else {
node.children = [];
}
nodeMap[currentPath] = node;
if (!parent.children) {
parent.children = [];
}
parent.children.push(node);
}
parent = nodeMap[currentPath];
});
});
// Update slider max value and reset to max
const slider = document.getElementById('maxDepth');
slider.max = maxDataDepth;
slider.value = maxDataDepth;
document.getElementById('depth-value').textContent = maxDataDepth;
// Remove empty branches and calculate parent values
function cleanAndSum(node) {
if (node.children) {
node.children = node.children.filter(child => {
const hasValue = cleanAndSum(child);
return hasValue;
});
if (node.children.length === 0) {
delete node.children;
return false;
}
// Sum up children values for parents
let totalStorage = 0;
let fileCount = 0;
node.children.forEach(child => {
totalStorage += child.totalStorage || 0;
fileCount += child.fileCount || 0;
});
node.totalStorage = totalStorage;
node.fileCount = fileCount;
node.storageValue = totalStorage;
node.countValue = fileCount;
return totalStorage > 0 || fileCount > 0;
}
return (node.storageValue || 0) > 0 || (node.countValue || 0) > 0;
}
cleanAndSum(root);
originalData = root; // Store original data
applyDepthFilter();
}
function applyDepthFilter() {
if (!originalData) return;
const maxDepth = parseInt(document.getElementById('maxDepth').value);
// Start from the current navigation point
let startNode = originalData;
for (const pathItem of navigationPath) {
startNode = pathItem.node;
}
// Deep clone the original data
function cloneWithDepth(node, currentDepth = 0) {
const clone = {
name: node.name,
path: node.path,
depth: node.depth,
fileCount: node.fileCount || 0,
totalStorage: node.totalStorage || 0,
storageValue: node.storageValue || 0,
countValue: node.countValue || 0
};
// If we're at max depth, collapse children into this node
if (currentDepth >= maxDepth) {
// Sum all descendant values
function sumDescendants(n) {
let storage = n.storageValue || n.totalStorage || 0;
let files = n.countValue || n.fileCount || 0;
if (n.children) {
n.children.forEach(child => {
const childSums = sumDescendants(child);
storage += childSums.storage;
files += childSums.files;
});
}
return { storage: storage, files: files };
}
const sums = sumDescendants(node);
clone.storageValue = sums.storage;
clone.totalStorage = sums.storage;
clone.countValue = sums.files;
clone.fileCount = sums.files;
// Don't include children
} else if (node.children) {
clone.children = node.children.map(child => cloneWithDepth(child, currentDepth + 1));
}
return clone;
}
currentData = cloneWithDepth(startNode);
drawTreemap(currentData);
updateBreadcrumb();
}
function navigateToFolder(node) {
if (!node.children || node.children.length === 0) return;
// Add to navigation path
navigationPath.push({
name: node.name,
path: node.path,
node: node
});
// Redraw from this node
applyDepthFilter();
}
function navigateToBreadcrumb(index) {
// Trim navigation path to the selected index
navigationPath = navigationPath.slice(0, index);
applyDepthFilter();
}
function updateBreadcrumb() {
const breadcrumb = document.getElementById('breadcrumb');
if (navigationPath.length === 0) {
breadcrumb.innerHTML = '';
return;
}
let html = '<span onclick="navigateToBreadcrumb(0)">Root</span>';
navigationPath.forEach((item, index) => {
html += '<span class="separator">›</span>';
if (index === navigationPath.length - 1) {
html += `<span class="current">${item.name}</span>`;
} else {
html += `<span onclick="navigateToBreadcrumb(${index + 1})">${item.name}</span>`;
}
});
breadcrumb.innerHTML = html;
}
function drawTreemap(data) {
const container = document.getElementById('treemap');
const width = container.offsetWidth;
const height = container.offsetHeight;
// Clear previous content
container.innerHTML = '';
// Don't draw if container has no size
if (width === 0 || height === 0) return;
// Get the size metric
const sizeBy = document.getElementById('sizeBy').value;
// Create hierarchy with selected size metric
const hierarchy = d3.hierarchy(data)
.sum(d => {
if (sizeBy === 'storage') {
return d.storageValue || 0;
} else {
return d.countValue || 0;
}
})
.sort((a, b) => b.value - a.value);
// Create treemap layout
const treemap = d3.treemap()
.size([width, height])
.paddingOuter(3)
.paddingTop(19)
.paddingInner(1)
.round(true);
// Generate the treemap
const root = treemap(hierarchy);
// Color scales
const colorBy = document.getElementById('colorBy').value;
const leaves = root.leaves();
const colorScale = d3.scaleSequential()
.domain([0, d3.max(leaves, d => colorBy === 'size' ? d.data.totalStorage : d.data.fileCount)])
.interpolator(d3.interpolateCool);
// Create the treemap cells
const cell = d3.select(container)
.selectAll('.node')
.data(root.descendants())
.enter()
.append('div')
.attr('class', d => d.children ? 'node folder' : 'node')
.style('left', d => `${d.x0}px`)
.style('top', d => `${d.y0}px`)
.style('width', d => `${d.x1 - d.x0}px`)
.style('height', d => `${d.y1 - d.y0}px`)
.style('background', d => {
if (d.children) {
return '#ddd'; // Parent nodes are light gray
} else {
const value = colorBy === 'size' ? d.data.totalStorage : d.data.fileCount;
return colorScale(value);
}
})
.style('border', d => d.children ? '2px solid #999' : '1px solid white')
.on('click', function(event, d) {
if (d.children) {
event.stopPropagation();
navigateToFolder(d.data);
}
})
.on('mouseover', function(event, d) {
const tooltip = document.getElementById('tooltip');
tooltip.style.display = 'block';
let tooltipText = `<strong>${d.data.path || d.data.name}</strong><br>
Files: ${(d.data.fileCount || 0).toLocaleString()}<br>
Size: ${formatBytes(d.data.totalStorage || d.value || 0)}`;
if (d.children) {
tooltipText += '<br><em>Click to explore this folder</em>';
}
tooltip.innerHTML = tooltipText;
tooltip.style.left = `${event.pageX + 10}px`;
tooltip.style.top = `${event.pageY - 30}px`;
})
.on('mouseout', function() {
document.getElementById('tooltip').style.display = 'none';
});
// Add labels
cell.append('div')
.attr('class', 'node-label')
.text(d => {
const width = d.x1 - d.x0;
const height = d.y1 - d.y0;
// Only show label if there's enough space
if (width > 40 && height > 20) {
return d.data.name;
}
return '';
})
.style('font-size', d => {
const width = d.x1 - d.x0;
if (width > 100) return '12px';
if (width > 50) return '10px';
return '9px';
});
}
function formatBytes(bytes) {
if (bytes === 0) return '0 Bytes';
const k = 1024;
const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB'];
const i = Math.floor(Math.log(bytes) / Math.log(k));
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}
// Handle window resize
let resizeTimer;
window.addEventListener('resize', function() {
clearTimeout(resizeTimer);
resizeTimer = setTimeout(function() {
if (currentData) {
drawTreemap(currentData);
}
}, 250); // Debounce resize events
});
// Handle color change
document.getElementById('colorBy').addEventListener('change', function() {
if (currentData) {
drawTreemap(currentData);
}
});
// Handle size change
document.getElementById('sizeBy').addEventListener('change', function() {
if (currentData) {
drawTreemap(currentData);
}
});
// Handle depth change with slider
document.getElementById('maxDepth').addEventListener('input', function(e) {
document.getElementById('depth-value').textContent = e.target.value;
applyDepthFilter();
});
</script>
</body>
</html>