Aqueduct Data Files Scripts

A collection of scripts for downloading file and directory listings from various sources, plus a tool to visualize the results.

Dependencies

All scripts declare their dependencies inline (PEP 723 script metadata) and can be run directly with uv run, which resolves the declared dependencies automatically.
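
For reference, each script begins with a metadata header of this shape (the exact blocks appear at the top of the scripts below); uv reads it to build a matching environment before executing the file:

# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "pandas>=2.0.0",
# ]
# ///

With that header in place, a script runs without any prior pip install, e.g. uv run analyze_storage_by_folder.py --help.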

Scripts to get file/folder lists

Azure Blob Inventory Download

Download Azure Blob Storage inventory files for a specific date.

Authentication

To access Azure storage, you need to authenticate:

  1. Install the Azure CLI from https://learn.microsoft.com/en-us/cli/azure/install-azure-cli
  2. Log in to Azure with az login (you can verify the session as shown below)
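
To confirm which account and subscription the CLI session is using (this is the login that DefaultAzureCredential in the download script will pick up), a standard Azure CLI check is:

az account show --output table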

Usage

# Download all inventory files for 2025-08-13
uv run download_azure_inventory.py \
    --date 2025-08-13 \
    --container wri-water-container-inventory \
    --account-name uuwridata

# For more options
uv run download_azure_inventory.py --help
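
If you have a storage account connection string instead of an Azure CLI login, the script also accepts it via its --connection-string flag; the value below is a placeholder:

# Download using a connection string instead of DefaultAzureCredential
uv run download_azure_inventory.py \
    --date 2025-08-13 \
    --container wri-water-container-inventory \
    --connection-string "<your-connection-string>"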

WebDAV Tree Listing

Recursively list files from a WebDAV directory.

# List all files in a WebDAV directory
uv run list_webdav_tree.py <folder_url>

uv run list_webdav_tree.py https://geo.public.data.uu.nl/vault-pcrglobwb-cmip6/research-pcrglobwb-cmip6%5B1690540205%5D/

# For more options
uv run list_webdav_tree.py --help
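
The script's argument parser also supports JSON output, basic-auth credentials, and a polite delay between requests, for example:

# Emit JSON instead of the default text listing, pausing 0.1 s between PROPFIND requests
uv run list_webdav_tree.py \
    https://geo.public.data.uu.nl/vault-pcrglobwb-cmip6/research-pcrglobwb-cmip6%5B1690540205%5D/ \
    --output json --delay 0.1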

7z Archive Listing

List or extract files from remote/local .7z archives.

# List archive contents
uv run list_7z_http.py <7z_url>

uv run list_7z_http.py https://geo.public.data.uu.nl/vault-pcrglobwb-cmip6/research-pcrglobwb-cmip6%5B1690540205%5D/original/hypflowsci6_v1.0/input_and_code/pcrglobwb_input_aqueduct_v2021-09-16_mx9.7z

# Extract a single file
uv run list_7z_http.py <7z_url> --get <file>

uv run list_7z_http.py "https://geo.public.data.uu.nl/vault-pcrglobwb-cmip6/research-pcrglobwb-cmip6%5B1690540205%5D/original/hypflowsci6_v1.0/input_and_code/pcrglobwb_input_aqueduct_v2021-09-16_mx9.7z" --get "version_2021-09-16/general/paddy_cropfactor_filled.nc"

# For more options
uv run list_7z_http.py --help

Combine Parquet files into one

See the duckdb.md file below.

Group by folder

uv run analyze_storage_by_folder.py --input <input_csv_or_parquet_path> --output <output_csv_path>

uv run analyze_storage_by_folder.py \
  --input data/azure_blob_inventory/2025-08-13/blob-inventory-all_1000000_joined.parquet \
  --output data/azure_blob_inventory/2025-08-13/blob-inventory-all-joined-folders.csv

Visualize

Open storage_treemap.html in a browser and load the folder-level CSV produced by the step above.
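
The treemap expects the columns written by analyze_storage_by_folder.py, i.e. Folder, File_Count, and Total_Storage (bytes). A minimal illustrative CSV, with placeholder values, looks like this:

Folder,File_Count,Total_Storage
Aqueduct/baseline,1234,56789012345
Aqueduct/baseline/monthly,456,1234567890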

.gitignore

/data
.vscode/settings.json
analyze_storage_by_folder.py

#!/usr/bin/env python3
# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "pandas>=2.0.0",
#     "pyarrow>=14.0.1",
# ]
# ///
"""
Analyze blob storage inventory data to calculate storage usage by folder.
"""
import pandas as pd


def analyze_storage_by_folder(input_df):
    # Create a copy of the input dataframe to avoid modifying the original
    df = input_df.copy()
    # Extract folder paths using regex
    # This will match everything up to the last forward slash
    df['Folder'] = df['Name'].str.extract(r'^(.+)/[^/]*$')
    # Convert Content-Length to numeric, invalid values become NaN
    df['Content-Length'] = pd.to_numeric(df['Content-Length'], errors='coerce')
    # Filter out folders
    df = df[df['hdi_isfolder'] != "True"]
    # Group by folder and calculate metrics
    result = df.groupby('Folder').agg(
        File_Count=('Name', 'count'),
        Total_Storage=('Content-Length', 'sum')
    ).reset_index()
    return result


if __name__ == "__main__":
    import argparse
    from pathlib import Path

    default_input = "data/azure_blob_inventory/2025-08-13/blob-inventory-all_1000000_joined.parquet"
    default_output = "data/azure_blob_inventory/2025-08-13/blob-inventory-all-joined-folders.csv"

    parser = argparse.ArgumentParser(description='Analyze storage usage by folder from blob inventory data')
    parser.add_argument('--input', default=default_input,
                        help=f'Path to input file (CSV or Parquet, default: {default_input})')
    parser.add_argument('--output', default=default_output,
                        help=f'Path to output CSV file (default: {default_output})')
    args = parser.parse_args()

    try:
        # Determine file type from extension
        input_path = Path(args.input)
        if input_path.suffix.lower() == '.parquet':
            df = pd.read_parquet(args.input)
        else:
            df = pd.read_csv(args.input)

        result_df = analyze_storage_by_folder(df)

        # Create output directory if it doesn't exist
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        result_df.to_csv(args.output, index=False)
    except Exception as e:
        print(f"Error processing data: {str(e)}")
        exit(1)
download_azure_inventory.py

#!/usr/bin/env python3
# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "azure-storage-blob>=12.19.0",
#     "azure-identity>=1.15.0",
# ]
# ///
"""
Download Azure Blob Storage inventory files for a specific date.
"""
import os
import sys
import argparse
from datetime import datetime
from pathlib import Path

from azure.storage.blob import BlobServiceClient
from azure.identity import DefaultAzureCredential
from azure.core.exceptions import ClientAuthenticationError, HttpResponseError


def parse_arguments():
    parser = argparse.ArgumentParser(description='Download Azure Blob inventory files for a specific date')
    parser.add_argument('--date', required=True, help='Date in YYYY-MM-DD format')
    parser.add_argument('--container', required=True, help='Azure storage container name')
    parser.add_argument('--account-name', help='Storage account name (if using DefaultAzureCredential)')
    parser.add_argument('--connection-string', help='Storage account connection string')
    parser.add_argument('--inventory-prefix', default='', help='Prefix path before date folders (e.g., "wri-water-container-inventory/")')
    parser.add_argument('--output-dir', default='./inventory_downloads', help='Local directory for downloads')
    parser.add_argument('--preserve-structure', action='store_true', help='Preserve the full directory structure when downloading')
    return parser.parse_args()


def print_auth_help():
    """Print authentication help message."""
    print("\n" + "=" * 70)
    print("AUTHENTICATION REQUIRED")
    print("=" * 70)
    print("\nYou need to authenticate to access this Azure storage account.")
    print("\nOption 1: Use a connection string")
    print("  --connection-string \"<your-connection-string>\"")
    print("\nOption 2: Use Azure CLI (with --account-name)")
    print("  1. Install Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli")
    print("  2. Login: az login")
    print("  3. Run this script with --account-name <storage-account>")
    print("\nOption 3: Set environment variables (with --account-name)")
    print("  export AZURE_CLIENT_ID=<your-client-id>")
    print("  export AZURE_TENANT_ID=<your-tenant-id>")
    print("  export AZURE_CLIENT_SECRET=<your-client-secret>")
    print("\nFor more info: https://aka.ms/azsdk/python/identity/defaultazurecredential/troubleshoot")
    print("=" * 70 + "\n")


def print_permission_help(account_name, container):
    """Print permission error help message."""
    print("\n" + "=" * 70)
    print("PERMISSION DENIED")
    print("=" * 70)
    print(f"\nYou are authenticated but don't have permission to access:")
    print(f"  Storage Account: {account_name}")
    print(f"  Container: {container}")
    print("\nPossible solutions:")
    print("1. Contact the storage account owner to grant you access")
    print("2. Use a connection string with appropriate permissions:")
    print("   --connection-string \"<connection-string>\"")
    print("3. If this is a public container, it may require anonymous access")
    print("=" * 70 + "\n")


def get_blob_service_client(args):
    """Initialize BlobServiceClient with either connection string or default credentials."""
    if args.connection_string:
        return BlobServiceClient.from_connection_string(args.connection_string)
    elif args.account_name:
        account_url = f"https://{args.account_name}.blob.core.windows.net"
        credential = DefaultAzureCredential()
        return BlobServiceClient(account_url=account_url, credential=credential)
    else:
        print("Error: Provide either --connection-string or --account-name")
        sys.exit(1)


def download_inventory_files(blob_service_client, container_name, date_str, inventory_prefix, output_dir, preserve_structure, account_name=None):
    """Download all inventory files for the specified date."""
    container_client = blob_service_client.get_container_client(container_name)

    # Parse date to get year/month/day components
    date_obj = datetime.strptime(date_str, '%Y-%m-%d')
    year = date_obj.strftime('%Y')
    month = date_obj.strftime('%m')
    day = date_obj.strftime('%d')

    # Build the date-based prefix (e.g., "2025/08/13/")
    # This will match all time folders for that date
    date_prefix = f"{inventory_prefix}{year}/{month}/{day}/"

    # Create base output directory
    output_path = Path(output_dir) / date_str
    output_path.mkdir(parents=True, exist_ok=True)

    downloaded_count = 0
    total_size = 0

    try:
        print(f"Searching for blobs with prefix: {date_prefix}")
        blobs = container_client.list_blobs(name_starts_with=date_prefix)
        for blob in blobs:
            # Skip if it's not a CSV file (inventory files are CSVs)
            if not blob.name.endswith('.csv'):
                continue
            blob_client = container_client.get_blob_client(blob.name)
            # Determine local file path
            if preserve_structure:
                # Keep the time folder and subdirectory structure
                # Remove the inventory prefix but keep date/time/subfolder structure
                relative_path = blob.name
                if inventory_prefix:
                    relative_path = blob.name[len(inventory_prefix):]
                local_file_path = output_path.parent / relative_path
                local_file_path.parent.mkdir(parents=True, exist_ok=True)
            else:
                # Flatten structure - just use the filename
                local_file_path = output_path / Path(blob.name).name
            print(f"Downloading: {blob.name} -> {local_file_path}")
            with open(local_file_path, 'wb') as file:
                download_stream = blob_client.download_blob()
                data = download_stream.readall()
                file.write(data)
                total_size += len(data)
            downloaded_count += 1
    except ClientAuthenticationError as e:
        print(f"\nError: Authentication failed!")
        print_auth_help()
        sys.exit(1)
    except HttpResponseError as e:
        if "AuthorizationPermissionMismatch" in str(e) or "AuthorizationFailure" in str(e):
            print(f"\nError: Permission denied!")
            print_permission_help(account_name, container_name)
        else:
            print(f"Error downloading files: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Error downloading files: {e}")
        sys.exit(1)

    return downloaded_count, total_size


def main():
    args = parse_arguments()

    # Validate date format
    try:
        datetime.strptime(args.date, '%Y-%m-%d')
    except ValueError:
        print(f"Error: Invalid date format '{args.date}'. Use YYYY-MM-DD")
        sys.exit(1)

    # Initialize Azure client
    blob_service_client = get_blob_service_client(args)

    # Download inventory files
    print(f"Downloading inventory files for date: {args.date}")
    print(f"Container: {args.container}")
    if args.inventory_prefix:
        print(f"Inventory prefix: {args.inventory_prefix}")

    count, size = download_inventory_files(
        blob_service_client,
        args.container,
        args.date,
        args.inventory_prefix,
        args.output_dir,
        args.preserve_structure,
        args.account_name
    )

    if count == 0:
        print(f"\nNo inventory files found for date {args.date}")
    else:
        size_mb = size / (1024 * 1024)
        print(f"\nSuccessfully downloaded {count} inventory files ({size_mb:.2f} MB) to {args.output_dir}/{args.date}")


if __name__ == '__main__':
    main()

duckdb.md

Are all schemas identical across Parquet files?

duckdb -c "
WITH s AS (
  SELECT file_name,
         md5(string_agg(name || ':' || CAST(\"type\" AS VARCHAR), ',' ORDER BY name)) AS sig
  FROM parquet_schema('*.parquet')
  GROUP BY file_name
)
SELECT CASE WHEN COUNT(DISTINCT sig)=1
     THEN printf('OK: all %d files share the same columns and types', COUNT(*))
     ELSE printf('%d different schemas found', COUNT(DISTINCT sig))
END AS result
FROM s;
"

Which files have different schemas?

duckdb -c "
WITH per_file AS (
  SELECT file_name,
         string_agg(name || ':' || CAST(\"type\" AS VARCHAR), ', ' ORDER BY name) AS schema_str
  FROM parquet_schema('*.parquet')
  GROUP BY file_name
)
SELECT schema_str, string_agg(file_name, '\n  - ' ORDER BY file_name) AS files
FROM per_file
GROUP BY schema_str;
"

Show both per-file and total

duckdb -c "
SELECT COALESCE(filename, '=== TOTAL ===') AS file, COUNT(*) AS rows
FROM read_parquet('*.parquet', filename=true)
GROUP BY ROLLUP(filename)
ORDER BY filename NULLS LAST;
"

Count columns

duckdb -c "
SELECT file_name, COUNT(*) AS num_columns
FROM parquet_schema('*.parquet')
GROUP BY file_name
ORDER BY file_name;
"

Join all Parquet files, keeping just the columns of interest

duckdb -c "
COPY (
  SELECT \"Name\", \"Creation-Time\", \"Last-Modified\", \"Etag\", \"Content-Length\", \"Content-Type\", \"Content-CRC64\", \"Content-MD5\", \"AccessTier\", \"AccessTierChangeTime\", \"AccessTierInferred\", \"hdi_isfolder\", \"Owner\", \"Group\", \"Permissions\", \"Acl\" 
  FROM read_parquet('*.parquet', union_by_name=true)
) TO 'blob-inventory-all_1000000_joined.parquet' (FORMAT PARQUET);
"
list_7z_http.py

# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "py7zr",
#     "fsspec",
#     "aiohttp",
#     "requests",
# ]
# ///
import sys
import argparse
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory
from datetime import datetime

import fsspec
import py7zr


def human(n):
    # simple humanize, base-1024
    for unit in ("", "K", "M", "G", "T"):
        if n < 1024:
            return f"{n:.0f}{unit}"
        n /= 1024
    return f"{n:.0f}P"


def iso(dt):
    if not dt:
        return ""
    # ensure consistent display
    if isinstance(dt, datetime):
        return dt.isoformat(sep=" ", timespec="seconds")
    return str(dt)


def file_mtime(fi):
    """Best-effort to get a modification time from a py7zr file info object.

    Different py7zr versions/archives may expose different fields.
    """
    for attr in (
        "modified_time",  # common
        "writetime",
        "write_time",
        "mtime",
        "datetime",
        "creationtime",
        "create_time",
    ):
        v = getattr(fi, attr, None)
        if v:
            # Convert numeric epoch to datetime if encountered
            if isinstance(v, (int, float)):
                try:
                    return datetime.fromtimestamp(v)
                except Exception:
                    pass
            return v
    return None


parser = argparse.ArgumentParser(
    description="List a .7z archive (remote/local) or extract a single file.",
    usage="%(prog)s <url> [--get <file>]",
)
parser.add_argument("url", metavar="<url>", help="URL or path to a .7z archive")
parser.add_argument("--get", dest="file", metavar="<file>", help="Extract a single file to the current directory (preserving its path)")
args = parser.parse_args()
url = args.url

# Single-file extraction path (avoid listing)
if args.file:
    target = args.file
    with fsspec.open(url, mode="rb") as fh:
        with py7zr.SevenZipFile(fh, mode="r") as z:
            # Validate presence (exact match preferred)
            entries = [fi for fi in z.list() if not getattr(fi, "is_directory", False)]
            names = [getattr(fi, "filename", "") for fi in entries]
            if target not in names:
                # Relax to unique basename match
                candidates = [n for n in names if Path(n).name == Path(target).name]
                if len(candidates) == 1:
                    target = candidates[0]
                elif len(candidates) > 1:
                    print("Multiple entries match by basename; please specify full path:")
                    for n in candidates:
                        print(f"  {n}")
                    sys.exit(2)
                else:
                    print(f"File not found in archive: {args.file}")
                    sys.exit(2)
            out_path = Path(target)
            out_path.parent.mkdir(parents=True, exist_ok=True)
            with TemporaryDirectory() as td:
                try:
                    z.extract(path=td, targets=[target])
                except TypeError:
                    # Older py7zr signature may require positional path
                    z.extract(td, targets=[target])
                tmp_src = Path(td) / target
                if not tmp_src.exists():
                    # Fallback: search by basename under temp tree
                    matches = list(Path(td).rglob(Path(target).name))
                    if not matches:
                        print(f"Extraction failed: {target} not found in temp dir")
                        sys.exit(2)
                    tmp_src = matches[0]
                # Stream copy to output
                with open(tmp_src, "rb") as rf, open(out_path, "wb") as wf:
                    shutil.copyfileobj(rf, wf, length=1024 * 1024)
    print(f"Saved: {out_path} ({human(out_path.stat().st_size)})")
    sys.exit(0)

# Header
print(f'Listing archive: {url}\n')
print(f'{"Modified":19} {"Size":>12} {"Packed":>12} Name')
print('-' * 64)

total_size = 0
total_packed = 0
count = 0

with fsspec.open(url, mode="rb") as fh:
    with py7zr.SevenZipFile(fh, mode="r") as z:
        # py7zr.list() yields file info objects with attributes:
        # filename, uncompressed, compressed, modified_time, is_directory, etc.
        for fi in z.list():
            if getattr(fi, "is_directory", False):
                continue
            name = getattr(fi, "filename", "")
            size = int(getattr(fi, "uncompressed", 0) or 0)
            packed = int(getattr(fi, "compressed", 0) or 0)
            mtime = file_mtime(fi)
            print(f'{iso(mtime):19} {human(size):>12} {human(packed):>12} {name}')
            total_size += size
            total_packed += packed
            count += 1

print('-' * 64)
print(f'Files: {count}  Total size: {round(total_size / 1024**3, 3)} GB  Total packed: {round(total_packed / 1024**3, 3)} GB')
list_webdav_tree.py

# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "requests",
# ]
# ///
from __future__ import annotations  # allow "X | None" annotations on Python 3.9

import sys
import time
import argparse
import urllib.parse as urlparse
import xml.etree.ElementTree as ET
from collections import deque

import requests

DAV_NS = "{DAV:}"
PROPFIND_BODY = (
    "<?xml version=\"1.0\" encoding=\"utf-8\"?>"
    "<d:propfind xmlns:d=\"DAV:\">"
    "<d:prop><d:resourcetype/><d:getcontentlength/><d:getlastmodified/><d:etag/></d:prop>"
    "</d:propfind>"
)


def normalize_dir(url: str) -> str:
    return url if url.endswith("/") else url + "/"


def same_tree(url: str, base: str) -> bool:
    u = urlparse.urlsplit(url)
    b = urlparse.urlsplit(base)
    if (u.scheme, u.netloc) != (b.scheme, b.netloc):
        return False
    # Compare using unquoted (decoded) paths to ignore % encoding differences
    u_path = normalize_dir(urlparse.unquote(u.path))
    b_path = normalize_dir(urlparse.unquote(b.path))
    return u_path.startswith(b_path)


def human_bytes(b: float) -> str:
    b = float(b or 0)
    units = ["B", "KB", "MB", "GB", "TB", "PB"]
    i = 0
    while b >= 1024 and i < len(units) - 1:
        b /= 1024
        i += 1
    return f"{b:.1f}{units[i]}" if i else f"{int(b)}{units[i]}"


def propfind(url: str, auth: tuple[str, str] | None = None) -> ET.Element:
    headers = {
        "Depth": "1",
        "Accept": "application/xml",
        "Content-Type": 'application/xml; charset="utf-8"',
    }
    r = requests.request("PROPFIND", url, headers=headers, data=PROPFIND_BODY, auth=auth)
    r.raise_for_status()
    # Some servers return bytes; ensure text for ET
    text = r.text if isinstance(r.text, str) else r.content.decode("utf-8", "replace")
    return ET.fromstring(text)


def crawl(base_url: str, auth: tuple[str, str] | None, delay: float = 0.0, max_pages: int | None = None):
    base_url = normalize_dir(base_url)
    q = deque([base_url])
    seen = set()
    pages = 0
    while q:
        cur = q.popleft()
        if cur in seen:
            continue
        seen.add(cur)
        try:
            root = propfind(cur, auth=auth)
        except Exception as e:
            print(f"WARN: PROPFIND failed for {cur}: {e}", file=sys.stderr)
            continue
        for resp in root.findall(f".//{DAV_NS}response"):
            href_el = resp.find(f"{DAV_NS}href")
            if href_el is None or not href_el.text:
                continue
            abs_url = urlparse.urljoin(cur, href_el.text)
            # stay in tree
            if not same_tree(abs_url, base_url):
                continue
            rt = resp.find(f".//{DAV_NS}resourcetype")
            is_dir = rt is not None and rt.find(f"{DAV_NS}collection") is not None
            # many servers echo the current directory as an entry; skip it
            # Compare on decoded paths to avoid % encoding mismatches
            cur_path = normalize_dir(urlparse.unquote(urlparse.urlsplit(cur).path)).rstrip("/")
            abs_path = normalize_dir(urlparse.unquote(urlparse.urlsplit(abs_url).path)).rstrip("/")
            if abs_path == cur_path:
                continue
            if is_dir:
                q.append(normalize_dir(abs_url))
            else:
                size_el = resp.find(f".//{DAV_NS}getcontentlength")
                mod_el = resp.find(f".//{DAV_NS}getlastmodified")
                etag_el = resp.find(f".//{DAV_NS}etag")
                yield {
                    "url": abs_url,
                    "size": int(size_el.text) if size_el is not None and size_el.text and size_el.text.isdigit() else size_el.text if size_el is not None else None,
                    "modified": mod_el.text if mod_el is not None else None,
                    "etag": etag_el.text if etag_el is not None else None,
                }
        pages += 1
        if max_pages is not None and pages >= max_pages:
            break
        if delay:
            time.sleep(delay)


def main(argv: list[str]) -> int:
    ap = argparse.ArgumentParser(description="Recursively list files from a WebDAV directory using PROPFIND Depth:1.")
    ap.add_argument("url", help="Base WebDAV directory URL to start from")
    ap.add_argument("--user", default=None, help="Username (if required)")
    ap.add_argument("--password", default=None, help="Password (if required)")
    ap.add_argument("--output", choices=["text", "json"], default="text", help="Output format")
    ap.add_argument("--delay", type=float, default=0.0, help="Polite delay between requests")
    ap.add_argument("--max-pages", type=int, default=None, help="Limit number of directory pages crawled")
    args = ap.parse_args(argv)

    base = args.url
    if not base.startswith(("http://", "https://")):
        print("ERROR: url must start with http:// or https://", file=sys.stderr)
        return 2

    auth = None
    if args.user is not None or args.password is not None:
        auth = (args.user or "", args.password or "")

    items = list(crawl(base, auth=auth, delay=args.delay, max_pages=args.max_pages))

    if args.output == "json":
        import json
        print(json.dumps({
            "base": normalize_dir(base),
            "count": len(items),
            "files": items,
        }, indent=2))
    else:
        for it in items:
            size = it.get("size")
            human = human_bytes(size) if isinstance(size, (int, float)) or (isinstance(size, str) and size.isdigit()) else str(size)
            print(f"{human:>8}\t{it.get('url')}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.9"
# dependencies = [
#     "pandas>=2.0.0",
#     "pyarrow>=14.0.1",
# ]
# ///
"""
Read inventory data from a CSV.GZ file using schema information from manifest.json
"""
import pandas as pd
import json
import os
from pathlib import Path
import argparse


def read_inventory_with_schema(csv_file_path, manifest_file_path=None):
    """
    Read inventory data from a CSV file using schema from manifest.json

    Args:
        csv_file_path (str): Path to the CSV or CSV.GZ file
        manifest_file_path (str, optional): Path to the manifest.json file.
            If None, will look in the same directory as csv_file_path

    Returns:
        pd.DataFrame: DataFrame with the inventory data
    """
    csv_path = Path(csv_file_path)

    # If manifest_file_path is not provided, look in the same directory as csv_file_path
    if manifest_file_path is None:
        manifest_file_path = csv_path.parent / "manifest.json"

    # Check if manifest file exists
    if not Path(manifest_file_path).exists():
        print(f"Warning: Manifest file not found at {manifest_file_path}")
        print("Reading CSV without schema information...")
        return pd.read_csv(csv_file_path)

    # Read the manifest file
    try:
        with open(manifest_file_path, 'r') as f:
            manifest_data = json.load(f)
        if 'fileSchema' not in manifest_data:
            print("Warning: No 'fileSchema' found in manifest file")
            print("Reading CSV without schema information...")
            return pd.read_csv(csv_file_path)
        # Parse column headers from fileSchema
        columns = [col.strip() for col in manifest_data['fileSchema'].split(',')]
        print(f"Found schema with {len(columns)} columns: {', '.join(columns[:5])}...")
        # Read the CSV file with the schema
        df = pd.read_csv(csv_file_path, names=columns, header=None)
        return df
    except Exception as e:
        print(f"Error reading manifest or applying schema: {str(e)}")
        print("Falling back to reading CSV without schema...")
        return pd.read_csv(csv_file_path)


def main():
    parser = argparse.ArgumentParser(description='Read inventory data using schema from manifest.json')
    parser.add_argument('--input', default='data/wri-projects/csv/9a8e0fee-c83e-4269-8368-dbdc2d92b042.csv.gz',
                        help='Path to input CSV or CSV.GZ file')
    parser.add_argument('--manifest', default=None,
                        help='Path to manifest.json file (default: same directory as input file)')
    parser.add_argument('--output', default='data/wri-projects/csv/output.csv',
                        help='Path to output CSV file (optional)')
    parser.add_argument('--show-columns', action='store_true',
                        help='Show column names and exit')
    parser.add_argument('--show-sample', action='store_true',
                        help='Show sample data (first 5 rows)')
    parser.add_argument('--analyze-storage', action='store_true',
                        help='Analyze storage by folder (like analyze_storage_by_folder.py)')
    parser.add_argument('--filter-prefix', default="Aqueduct",
                        help='Only include folders starting with this prefix (e.g., "data/raw/")')
    args = parser.parse_args()

    try:
        # Read the data
        df = read_inventory_with_schema(args.input, args.manifest)

        # Show columns if requested
        if args.show_columns:
            print(f"\nColumns in {args.input}:")
            for i, column in enumerate(df.columns):
                print(f"  {i}: {column}")

        # Show sample data if requested
        if args.show_sample:
            print(f"\nSample data (first 5 rows):")
            print(df.head(5))

        # Analyze storage by folder if requested
        if args.analyze_storage:
            # Try to detect appropriate column names for storage analysis
            name_column = None
            size_column = None
            # Look for common column names
            for col in df.columns:
                if col.lower() in ['key', 'name', 'path']:
                    name_column = col
                if col.lower() in ['size', 'content-length', 'contentlength']:
                    size_column = col
            if name_column is None or size_column is None:
                print("\nCouldn't automatically detect name/size columns.")
                if not args.show_columns:
                    print("Use --show-columns to see available columns.")
                print("Please specify columns manually for analysis:")
                print("Example: --name-column 'Key' --size-column 'Size'")
            else:
                # Calculate storage by folder
                df['Folder'] = df[name_column].astype(str).str.extract(r'^(.+)/[^/]*$')
                # Apply folder prefix filter if specified
                if args.filter_prefix:
                    print(f"\nFiltering folders by prefix: '{args.filter_prefix}'")
                    # Create a filtered dataframe with only matching entries
                    filtered_df = df[df[name_column].astype(str).str.startswith(args.filter_prefix)]
                    if len(filtered_df) == 0:
                        print(f"Warning: No entries found with prefix '{args.filter_prefix}'")
                        print(f"Total entries before filtering: {len(df)}")
                    else:
                        print(f"Filtered {len(filtered_df)} entries out of {len(df)} total")
                        df = filtered_df
                result = df.groupby('Folder', dropna=False).agg(
                    File_Count=(name_column, 'count'),
                    Total_Storage=(size_column, 'sum')
                ).reset_index()
                # Sort by storage size (descending)
                result = result.sort_values('Total_Storage', ascending=False)
                print(f"\nStorage analysis by folder (using {name_column} and {size_column}):")
                print(result.head(10))
                if args.output:
                    result.to_csv(args.output, index=False)
                    print(f"\nFull results saved to {args.output}")
        # Save to output if specified
        elif args.output:
            df.to_csv(args.output, index=False)
            print(f"Data saved to {args.output}")
    except Exception as e:
        print(f"Error: {str(e)}")
        return 1
    return 0


if __name__ == "__main__":
    exit(main())
storage_treemap.html

<!DOCTYPE html>
<html>
<head>
<title>Storage Treemap Visualization</title>
<script src="https://d3js.org/d3.v7.min.js"></script>
<style>
html, body {
height: 100%;
margin: 0;
padding: 0;
font-family: Arial, sans-serif;
overflow: hidden;
}
body {
display: flex;
flex-direction: column;
}
#header {
padding: 10px 20px;
background: #f5f5f5;
border-bottom: 1px solid #ddd;
}
#title-row {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 10px;
}
h1 {
margin: 0;
font-size: 24px;
}
#file-controls {
display: flex;
align-items: center;
gap: 10px;
}
#controls {
margin: 0;
}
#treemap {
flex: 1;
position: relative;
width: 100%;
}
.node {
position: absolute;
overflow: hidden;
cursor: pointer;
line-height: 1.2;
font-size: 10px;
}
.node-label {
padding: 4px;
color: white;
text-shadow: 0 0 3px rgba(0,0,0,0.8);
overflow: hidden;
text-overflow: ellipsis;
}
#tooltip {
position: absolute;
padding: 10px;
background: rgba(0,0,0,0.9);
color: white;
border-radius: 5px;
pointer-events: none;
display: none;
z-index: 1000;
}
input[type="file"] { margin-right: 10px; }
#cached-info {
display: inline-block;
margin-left: 10px;
color: #666;
font-size: 12px;
}
#clear-cache {
margin-left: 5px;
font-size: 11px;
cursor: pointer;
color: #0066cc;
text-decoration: underline;
}
#depth-control {
display: inline-flex;
align-items: center;
margin-left: 10px;
}
#maxDepth {
width: 150px;
margin: 0 8px;
}
#depth-value {
min-width: 20px;
font-weight: bold;
}
#breadcrumb {
padding: 5px 20px;
background: #fff;
border-bottom: 1px solid #ddd;
font-size: 14px;
min-height: 20px;
}
#breadcrumb span {
color: #0066cc;
cursor: pointer;
text-decoration: underline;
}
#breadcrumb span:hover {
color: #0044aa;
}
#breadcrumb .separator {
color: #666;
margin: 0 5px;
text-decoration: none;
cursor: default;
}
#breadcrumb .current {
color: #333;
text-decoration: none;
cursor: default;
font-weight: bold;
}
.node.folder {
cursor: pointer;
}
.node.folder:hover {
opacity: 0.8;
}
#welcome {
position: absolute;
top: 50%;
left: 50%;
transform: translate(-50%, -50%);
text-align: center;
padding: 40px;
background: #f9f9f9;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
#welcome h2 {
margin: 0 0 20px 0;
color: #333;
}
#welcome p {
margin: 0 0 30px 0;
color: #666;
}
#welcome input[type="file"] {
padding: 10px 20px;
font-size: 16px;
cursor: pointer;
}
.has-data #welcome {
display: none;
}
.has-data #header,
.has-data #breadcrumb,
.has-data #treemap {
display: block;
}
body:not(.has-data) #header,
body:not(.has-data) #breadcrumb,
body:not(.has-data) #treemap {
display: none;
}
</style>
</head>
<body>
<div id="welcome">
<h2>Storage Treemap Visualization</h2>
<p>Select a CSV file to visualize storage data</p>
<input type="file" id="welcomeFileInput" accept=".csv">
</div>
<div id="header">
<div id="title-row">
<h1>Storage Treemap Visualization</h1>
<div id="file-controls">
<input type="file" id="csvFile" accept=".csv">
<span id="cached-info" style="display: none;">
Using cached: <span id="cached-filename"></span>
<span id="clear-cache">[clear]</span>
</span>
</div>
</div>
<div id="controls">
<label>Size by: <select id="sizeBy">
<option value="storage" selected>Total Storage</option>
<option value="count">File Count</option>
</select></label>
<label style="margin-left: 10px;">Color by: <select id="colorBy">
<option value="size">Total Storage</option>
<option value="count" selected>File Count</option>
</select></label>
<span id="depth-control">
<label>Max Depth:</label>
<input type="range" id="maxDepth" min="1" max="10" value="10">
<span id="depth-value">10</span>
</span>
</div>
</div>
<div id="breadcrumb"></div>
<div id="treemap"></div>
<div id="tooltip"></div>
<script>
let currentData = null; // Store current data for resize events
let originalData = null; // Store original unfiltered data
let maxDataDepth = 10; // Store the maximum depth found in the data
let navigationPath = []; // Store navigation path for breadcrumb
// Check for cached data on page load
window.addEventListener('DOMContentLoaded', function() {
const cachedCSV = localStorage.getItem('treemap_csv_data');
const cachedFilename = localStorage.getItem('treemap_csv_filename');
if (cachedCSV && cachedFilename) {
// Show cached info
document.body.classList.add('has-data');
document.getElementById('cached-info').style.display = 'inline-block';
document.getElementById('cached-filename').textContent = cachedFilename;
// Process the cached data
const csv = d3.csvParse(cachedCSV);
processData(csv);
}
});
// Clear cache handler
document.getElementById('clear-cache').addEventListener('click', function() {
localStorage.removeItem('treemap_csv_data');
localStorage.removeItem('treemap_csv_filename');
document.getElementById('cached-info').style.display = 'none';
// Clear the visualization
document.getElementById('treemap').innerHTML = '';
document.getElementById('breadcrumb').innerHTML = '';
currentData = null;
originalData = null;
navigationPath = [];
// Show welcome screen again
document.body.classList.remove('has-data');
});
// Handle file input from welcome screen
document.getElementById('welcomeFileInput').addEventListener('change', handleFileSelect);
document.getElementById('csvFile').addEventListener('change', handleFileSelect);
function handleFileSelect(e) {
const file = e.target.files[0];
if (file) {
const reader = new FileReader();
reader.onload = function(e) {
const csvContent = e.target.result;
// Store in localStorage
try {
localStorage.setItem('treemap_csv_data', csvContent);
localStorage.setItem('treemap_csv_filename', file.name);
// Update UI
document.body.classList.add('has-data');
document.getElementById('cached-info').style.display = 'inline-block';
document.getElementById('cached-filename').textContent = file.name;
} catch (e) {
console.warn('Could not cache file (localStorage full or disabled):', e);
document.body.classList.add('has-data');
}
const csv = d3.csvParse(csvContent);
processData(csv);
// Sync file inputs
if (e.target.id === 'welcomeFileInput') {
document.getElementById('csvFile').files = e.target.files;
} else {
document.getElementById('welcomeFileInput').files = e.target.files;
}
};
reader.readAsText(file);
}
}
function processData(csv) {
// Build hierarchy from CSV data
const root = {name: "root", children: []};
const nodeMap = {};
maxDataDepth = 0; // Reset max depth
// Process each row
csv.forEach(row => {
const path = row.Folder || "";
if (!path) return;
const parts = path.split('/').filter(p => p);
const fileCount = +row.File_Count || 0;
const totalStorage = +row.Total_Storage || 0;
// Track maximum depth
maxDataDepth = Math.max(maxDataDepth, parts.length);
// Build the tree structure
let currentPath = "";
let parent = root;
parts.forEach((part, index) => {
currentPath = currentPath ? `${currentPath}/${part}` : part;
if (!nodeMap[currentPath]) {
const node = {
name: part,
path: currentPath,
depth: index + 1 // Add depth tracking
};
// If this is a leaf node (last part), add the values
if (index === parts.length - 1) {
node.fileCount = fileCount;
node.totalStorage = totalStorage;
// Store both metrics, we'll decide which to use for value later
node.storageValue = totalStorage;
node.countValue = fileCount;
} else {
node.children = [];
}
nodeMap[currentPath] = node;
if (!parent.children) {
parent.children = [];
}
parent.children.push(node);
}
parent = nodeMap[currentPath];
});
});
// Update slider max value and reset to max
const slider = document.getElementById('maxDepth');
slider.max = maxDataDepth;
slider.value = maxDataDepth;
document.getElementById('depth-value').textContent = maxDataDepth;
// Remove empty branches and calculate parent values
function cleanAndSum(node) {
if (node.children) {
node.children = node.children.filter(child => {
const hasValue = cleanAndSum(child);
return hasValue;
});
if (node.children.length === 0) {
delete node.children;
return false;
}
// Sum up children values for parents
let totalStorage = 0;
let fileCount = 0;
node.children.forEach(child => {
totalStorage += child.totalStorage || 0;
fileCount += child.fileCount || 0;
});
node.totalStorage = totalStorage;
node.fileCount = fileCount;
node.storageValue = totalStorage;
node.countValue = fileCount;
return totalStorage > 0 || fileCount > 0;
}
return (node.storageValue || 0) > 0 || (node.countValue || 0) > 0;
}
cleanAndSum(root);
originalData = root; // Store original data
applyDepthFilter();
}
function applyDepthFilter() {
if (!originalData) return;
const maxDepth = parseInt(document.getElementById('maxDepth').value);
// Start from the current navigation point
let startNode = originalData;
for (const pathItem of navigationPath) {
startNode = pathItem.node;
}
// Deep clone the original data
function cloneWithDepth(node, currentDepth = 0) {
const clone = {
name: node.name,
path: node.path,
depth: node.depth,
fileCount: node.fileCount || 0,
totalStorage: node.totalStorage || 0,
storageValue: node.storageValue || 0,
countValue: node.countValue || 0
};
// If we're at max depth, collapse children into this node
if (currentDepth >= maxDepth) {
// Sum all descendant values
function sumDescendants(n) {
let storage = n.storageValue || n.totalStorage || 0;
let files = n.countValue || n.fileCount || 0;
if (n.children) {
n.children.forEach(child => {
const childSums = sumDescendants(child);
storage += childSums.storage;
files += childSums.files;
});
}
return { storage: storage, files: files };
}
const sums = sumDescendants(node);
clone.storageValue = sums.storage;
clone.totalStorage = sums.storage;
clone.countValue = sums.files;
clone.fileCount = sums.files;
// Don't include children
} else if (node.children) {
clone.children = node.children.map(child => cloneWithDepth(child, currentDepth + 1));
}
return clone;
}
currentData = cloneWithDepth(startNode);
drawTreemap(currentData);
updateBreadcrumb();
}
function navigateToFolder(node) {
if (!node.children || node.children.length === 0) return;
// Add to navigation path
navigationPath.push({
name: node.name,
path: node.path,
node: node
});
// Redraw from this node
applyDepthFilter();
}
function navigateToBreadcrumb(index) {
// Trim navigation path to the selected index
navigationPath = navigationPath.slice(0, index);
applyDepthFilter();
}
function updateBreadcrumb() {
const breadcrumb = document.getElementById('breadcrumb');
if (navigationPath.length === 0) {
breadcrumb.innerHTML = '';
return;
}
let html = '<span onclick="navigateToBreadcrumb(0)">Root</span>';
navigationPath.forEach((item, index) => {
html += '<span class="separator">›</span>';
if (index === navigationPath.length - 1) {
html += `<span class="current">${item.name}</span>`;
} else {
html += `<span onclick="navigateToBreadcrumb(${index + 1})">${item.name}</span>`;
}
});
breadcrumb.innerHTML = html;
}
function drawTreemap(data) {
const container = document.getElementById('treemap');
const width = container.offsetWidth;
const height = container.offsetHeight;
// Clear previous content
container.innerHTML = '';
// Don't draw if container has no size
if (width === 0 || height === 0) return;
// Get the size metric
const sizeBy = document.getElementById('sizeBy').value;
// Create hierarchy with selected size metric
const hierarchy = d3.hierarchy(data)
.sum(d => {
if (sizeBy === 'storage') {
return d.storageValue || 0;
} else {
return d.countValue || 0;
}
})
.sort((a, b) => b.value - a.value);
// Create treemap layout
const treemap = d3.treemap()
.size([width, height])
.paddingOuter(3)
.paddingTop(19)
.paddingInner(1)
.round(true);
// Generate the treemap
const root = treemap(hierarchy);
// Color scales
const colorBy = document.getElementById('colorBy').value;
const leaves = root.leaves();
const colorScale = d3.scaleSequential()
.domain([0, d3.max(leaves, d => colorBy === 'size' ? d.data.totalStorage : d.data.fileCount)])
.interpolator(d3.interpolateCool);
// Create the treemap cells
const cell = d3.select(container)
.selectAll('.node')
.data(root.descendants())
.enter()
.append('div')
.attr('class', d => d.children ? 'node folder' : 'node')
.style('left', d => `${d.x0}px`)
.style('top', d => `${d.y0}px`)
.style('width', d => `${d.x1 - d.x0}px`)
.style('height', d => `${d.y1 - d.y0}px`)
.style('background', d => {
if (d.children) {
return '#ddd'; // Parent nodes are light gray
} else {
const value = colorBy === 'size' ? d.data.totalStorage : d.data.fileCount;
return colorScale(value);
}
})
.style('border', d => d.children ? '2px solid #999' : '1px solid white')
.on('click', function(event, d) {
if (d.children) {
event.stopPropagation();
navigateToFolder(d.data);
}
})
.on('mouseover', function(event, d) {
const tooltip = document.getElementById('tooltip');
tooltip.style.display = 'block';
let tooltipText = `<strong>${d.data.path || d.data.name}</strong><br>
Files: ${(d.data.fileCount || 0).toLocaleString()}<br>
Size: ${formatBytes(d.data.totalStorage || d.value || 0)}`;
if (d.children) {
tooltipText += '<br><em>Click to explore this folder</em>';
}
tooltip.innerHTML = tooltipText;
tooltip.style.left = `${event.pageX + 10}px`;
tooltip.style.top = `${event.pageY - 30}px`;
})
.on('mouseout', function() {
document.getElementById('tooltip').style.display = 'none';
});
// Add labels
cell.append('div')
.attr('class', 'node-label')
.text(d => {
const width = d.x1 - d.x0;
const height = d.y1 - d.y0;
// Only show label if there's enough space
if (width > 40 && height > 20) {
return d.data.name;
}
return '';
})
.style('font-size', d => {
const width = d.x1 - d.x0;
if (width > 100) return '12px';
if (width > 50) return '10px';
return '9px';
});
}
function formatBytes(bytes) {
if (bytes === 0) return '0 Bytes';
const k = 1024;
const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB'];
const i = Math.floor(Math.log(bytes) / Math.log(k));
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}
// Handle window resize
let resizeTimer;
window.addEventListener('resize', function() {
clearTimeout(resizeTimer);
resizeTimer = setTimeout(function() {
if (currentData) {
drawTreemap(currentData);
}
}, 250); // Debounce resize events
});
// Handle color change
document.getElementById('colorBy').addEventListener('change', function() {
if (currentData) {
drawTreemap(currentData);
}
});
// Handle size change
document.getElementById('sizeBy').addEventListener('change', function() {
if (currentData) {
drawTreemap(currentData);
}
});
// Handle depth change with slider
document.getElementById('maxDepth').addEventListener('input', function(e) {
document.getElementById('depth-value').textContent = e.target.value;
applyDepthFilter();
});
</script>
</body>
</html>