aaronwolen/download_visium.py

## download_visium.py
#!/usr/bin/env python3
"""
Download 10X Visium datasets using squidpy.

This tool wraps squidpy's internal visium() function to provide an easy way
to download and extract 10X Genomics Visium spatial transcriptomics datasets.
"""
from __future__ import annotations

import argparse
import os
import sys
import typing
import warnings
from pathlib import Path

# Silence every warning for the rest of the process. The filter is installed
# before squidpy is imported so that import-time warnings are hidden as well;
# note that it is global and therefore also hides warnings raised later on.
warnings.filterwarnings("ignore")

# VisiumDatasets lives in a private squidpy module, so guard the import and
# keep the try body limited to the one statement that can actually fail.
try:
    from squidpy.datasets._10x_datasets import VisiumDatasets  # type: ignore
except ImportError:
    # Diagnostics go to stderr, like the error messages printed by main(), so
    # they never mix with output a caller may be capturing from stdout.
    print("Error: squidpy must be installed to use this tool.", file=sys.stderr)
    sys.exit(1)

# Dataset IDs accepted on the command line: the type arguments of the
# VisiumDatasets alias.
VALID_DATASETS = list(typing.get_args(VisiumDatasets))

def download_visium_dataset(
    sample_id: VisiumDatasets,
    *,
    include_hires_tiff: bool = False,
    base_dir: os.PathLike[str],
) -> None:
    """
    Download Visium `datasets <https://support.10xgenomics.com/spatial-gene-expression/datasets>`_ from *10x Genomics*.

    Modified version of squidpy's visium() function that only downloads the 10X
    data without reading it into an AnnData object. Everything is written to
    ``base_dir / sample_id``: the extracted members of the spatial tarball,
    ``filtered_feature_bc_matrix.h5`` and, on request, ``image.tif``.

    Parameters
    ----------
    sample_id
        Name of the Visium dataset.
    include_hires_tiff
        Whether to download the high-resolution tissue image.
    base_dir
        Directory where to download the data.

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        If the spatial tarball cannot be opened or extracted.
    """
    import tarfile

    from squidpy._constants._constants import TenxVersions  # type: ignore
    from squidpy.datasets._10x_datasets import VisiumFiles
    from squidpy.datasets._utils import check_presence_download  # type: ignore

    # The version is one path component of the download URL and is chosen
    # from the dataset name prefix.
    if sample_id.startswith("V1_"):
        spaceranger_version = TenxVersions.V1
    elif sample_id.startswith(("Targeted_", "Parent_")):
        spaceranger_version = TenxVersions.V2
    else:
        spaceranger_version = TenxVersions.V3

    sample_dir = Path(base_dir) / sample_id
    sample_dir.mkdir(exist_ok=True, parents=True)

    url_prefix = f"https://cf.10xgenomics.com/samples/spatial-exp/{spaceranger_version}/{sample_id}/"
    visium_files = VisiumFiles(
        f"{sample_id}_filtered_feature_bc_matrix.h5",
        f"{sample_id}_spatial.tar.gz",
        f"{sample_id}_image.tif",
    )

    # download spatial data
    tar_pth = sample_dir / visium_files.spatial_attrs
    check_presence_download(filename=tar_pth, backup_url=url_prefix + visium_files.spatial_attrs)

    # The "data" extraction filter rejects members that would be written
    # outside the destination (absolute paths, "..", escaping links). It is
    # only present on Python 3.12+ and patched older releases, hence the
    # feature test; without it extraction behaves as before.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    try:
        with tarfile.open(tar_pth) as f:
            for el in f:
                # Keep files from an earlier run instead of overwriting them.
                if not (sample_dir / el.name).exists():
                    f.extract(el, sample_dir, **extract_kwargs)
    except tarfile.TarError as e:
        # Drop the unusable archive. check_presence_download presumably skips
        # files that already exist (TODO confirm), in which case a corrupt
        # tarball left on disk would make every rerun fail the same way.
        tar_pth.unlink(missing_ok=True)
        raise RuntimeError(f"Failed to extract spatial data tarball: {e}") from e
    except OSError as e:
        # Filesystem problems (permissions, disk full) are not the archive's
        # fault, so keep it and avoid a needless re-download.
        raise RuntimeError(f"Failed to extract spatial data tarball: {e}") from e
    else:
        tar_pth.unlink()  # remove tarball after extraction

    # download counts
    check_presence_download(
        filename=sample_dir / "filtered_feature_bc_matrix.h5",
        backup_url=url_prefix + visium_files.feature_matrix,
    )

    if include_hires_tiff:  # download image
        check_presence_download(
            filename=sample_dir / "image.tif",
            backup_url=url_prefix + visium_files.tif_image,
        )


def main() -> None:
    """Main entry point for the CLI.

    Parses the command line, then either lists the known dataset IDs or
    downloads the requested dataset into ``<output-dir>/<DATASET>``. Unless
    ``--no-h5ad`` is given, the dataset is then read with squidpy and written
    to ``<output-dir>/<DATASET>.h5ad``.

    Exits with status 1, after a message on stderr, when the output directory
    cannot be created, the download fails, or the h5ad export fails. A missing
    or invalid dataset name is reported through ``parser.error``.
    """
    parser = argparse.ArgumentParser(
        description="Download 10X Visium spatial datasets",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Download a dataset to the current directory
  %(prog)s V1_Human_Heart

  # Download to a specific directory
  %(prog)s V1_Human_Heart --output-dir /path/to/datasets

  # Include high-resolution TIFF image
  %(prog)s V1_Human_Heart --include-hires-tiff

    # Also creates V1_Human_Heart.h5ad next to the folder by default
    # Disable h5ad writing if you only want raw 10x outputs
    %(prog)s V1_Human_Heart --no-h5ad

  # List all available datasets
  %(prog)s --list-datasets
        """,
    )

    # The positional is optional at the parser level so that --list-datasets
    # can be used on its own; its presence is enforced manually below.
    parser.add_argument(
        "dataset",
        nargs="?",
        help="Name of Visium dataset to download. Use --list-datasets to see valid options.",
        choices=VALID_DATASETS,
        metavar="DATASET",
    )

    parser.add_argument(
        "-o",
        "--output-dir",
        type=Path,
        default=Path.cwd(),
        help="Directory where the dataset will be downloaded (default: current directory)",
    )

    parser.add_argument(
        "--include-hires-tiff",
        action="store_true",
        help="Download the high-resolution tissue section image (larger file size)",
    )

    parser.add_argument(
        "--no-h5ad",
        action="store_true",
        help="Do not read with squidpy nor write the consolidated {DATASET}.h5ad file (default: write)",
    )

    parser.add_argument(
        "-l",
        "--list-datasets",
        action="store_true",
        help="List all available dataset IDs and exit",
    )

    args = parser.parse_args()

    # Handle --list-datasets
    if args.list_datasets:
        print("Available Visium datasets:\n")
        for ds in VALID_DATASETS:
            print(f"  {ds}")
        return

    # Require dataset_id if not listing datasets
    if not args.dataset:
        parser.error("the following arguments are required: DATASET")

    # Create output directory if it doesn't exist. A failure here (permission
    # denied, a regular file in the way) is reported like every other error
    # instead of surfacing as a traceback.
    output_dir = args.output_dir.expanduser().absolute()
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        print(f"\nError creating output directory {output_dir}: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"Downloading dataset: {args.dataset}")
    print(f"Output directory: {output_dir}")
    if args.include_hires_tiff:
        print("Including high-resolution TIFF image")

    # Stage 1: download. Only the call that can fail sits inside the try.
    try:
        download_visium_dataset(
            sample_id=args.dataset,
            include_hires_tiff=args.include_hires_tiff,
            base_dir=output_dir,
        )
    except (RuntimeError, OSError) as e:
        print(f"\nError downloading dataset: {e}", file=sys.stderr)
        sys.exit(1)

    sample_path = output_dir / args.dataset
    print("\nDownload complete!")
    print(f"Dataset location: {sample_path}")

    if args.no_h5ad:
        return

    # Stage 2: load into AnnData via squidpy and write an .h5ad next to the folder
    try:
        from squidpy.read._read import visium as read_visium  # type: ignore

        # Pass the image only if it was requested and is actually on disk;
        # otherwise call the reader without the argument at all.
        read_kwargs = {}
        hires_img = sample_path / "image.tif"
        if args.include_hires_tiff and hires_img.exists():
            read_kwargs["source_image_path"] = hires_img

        print("Reading dataset into AnnData …")
        adata = read_visium(sample_path, **read_kwargs)

        h5ad_path = output_dir / f"{args.dataset}.h5ad"
        print(f"Writing {h5ad_path} …")
        # Use write_h5ad for clarity; overwrites if exists
        adata.write_h5ad(h5ad_path)
        print("h5ad save complete!")
    except (ImportError, OSError, RuntimeError, ValueError) as e:
        # FileNotFoundError is a subclass of OSError, so it is covered here.
        print(
            f"\nError while creating h5ad (use --no-h5ad to skip): {e}",
            file=sys.stderr,
        )
        sys.exit(1)


if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	Download 10X Visium datasets using squidpy.

	This tool wraps squidpy's internal visium() function to provide an easy way
	to download and extract 10X Genomics Visium spatial transcriptomics datasets.
	"""
	from __future__ import annotations

	import argparse
	import os
	import sys
	import typing
	import warnings
	from pathlib import Path

	# Silence every warning for the rest of the process. The filter is installed
	# before squidpy is imported so that import-time warnings are hidden as well;
	# note that it is global and therefore also hides warnings raised later on.
	warnings.filterwarnings("ignore")

	# VisiumDatasets lives in a private squidpy module, so guard the import and
	# keep the try body limited to the one statement that can actually fail.
	try:
	    from squidpy.datasets._10x_datasets import VisiumDatasets  # type: ignore
	except ImportError:
	    # Diagnostics go to stderr so they never mix with output a caller may
	    # be capturing from stdout.
	    print("Error: squidpy must be installed to use this tool.", file=sys.stderr)
	    sys.exit(1)

	# Dataset IDs accepted on the command line: the type arguments of the
	# VisiumDatasets alias.
	VALID_DATASETS = list(typing.get_args(VisiumDatasets))

	def download_visium_dataset(
	    sample_id: VisiumDatasets,
	    *,
	    include_hires_tiff: bool = False,
	    base_dir: os.PathLike[str],
	) -> None:
	    """
	    Download Visium `datasets <https://support.10xgenomics.com/spatial-gene-expression/datasets>`_ from *10x Genomics*.

	    Modified version of squidpy's visium() function that only downloads the 10X
	    data without reading it into an AnnData object. Everything is written to
	    ``base_dir / sample_id``: the extracted members of the spatial tarball,
	    ``filtered_feature_bc_matrix.h5`` and, on request, ``image.tif``.

	    Parameters
	    ----------
	    sample_id
	        Name of the Visium dataset.
	    include_hires_tiff
	        Whether to download the high-resolution tissue image.
	    base_dir
	        Directory where to download the data.

	    Returns
	    -------
	    None

	    Raises
	    ------
	    RuntimeError
	        If the spatial tarball cannot be opened or extracted.
	    """
	    import tarfile

	    from squidpy._constants._constants import TenxVersions  # type: ignore
	    from squidpy.datasets._10x_datasets import VisiumFiles
	    from squidpy.datasets._utils import check_presence_download  # type: ignore

	    # The version is one path component of the download URL and is chosen
	    # from the dataset name prefix.
	    if sample_id.startswith("V1_"):
	        spaceranger_version = TenxVersions.V1
	    elif sample_id.startswith(("Targeted_", "Parent_")):
	        spaceranger_version = TenxVersions.V2
	    else:
	        spaceranger_version = TenxVersions.V3

	    sample_dir = Path(base_dir) / sample_id
	    sample_dir.mkdir(exist_ok=True, parents=True)

	    url_prefix = f"https://cf.10xgenomics.com/samples/spatial-exp/{spaceranger_version}/{sample_id}/"
	    visium_files = VisiumFiles(
	        f"{sample_id}_filtered_feature_bc_matrix.h5",
	        f"{sample_id}_spatial.tar.gz",
	        f"{sample_id}_image.tif",
	    )

	    # download spatial data
	    tar_pth = sample_dir / visium_files.spatial_attrs
	    check_presence_download(filename=tar_pth, backup_url=url_prefix + visium_files.spatial_attrs)

	    # The "data" extraction filter rejects members that would be written
	    # outside the destination (absolute paths, "..", escaping links). It is
	    # only present on Python 3.12+ and patched older releases, hence the
	    # feature test; without it extraction behaves as before.
	    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
	    try:
	        with tarfile.open(tar_pth) as f:
	            for el in f:
	                # Keep files from an earlier run instead of overwriting them.
	                if not (sample_dir / el.name).exists():
	                    f.extract(el, sample_dir, **extract_kwargs)
	    except tarfile.TarError as e:
	        # Drop the unusable archive. check_presence_download presumably skips
	        # files that already exist (TODO confirm), in which case a corrupt
	        # tarball left on disk would make every rerun fail the same way.
	        tar_pth.unlink(missing_ok=True)
	        raise RuntimeError(f"Failed to extract spatial data tarball: {e}") from e
	    except OSError as e:
	        # Filesystem problems (permissions, disk full) are not the archive's
	        # fault, so keep it and avoid a needless re-download.
	        raise RuntimeError(f"Failed to extract spatial data tarball: {e}") from e
	    else:
	        tar_pth.unlink()  # remove tarball after extraction

	    # download counts
	    check_presence_download(
	        filename=sample_dir / "filtered_feature_bc_matrix.h5",
	        backup_url=url_prefix + visium_files.feature_matrix,
	    )

	    if include_hires_tiff:  # download image
	        check_presence_download(
	            filename=sample_dir / "image.tif",
	            backup_url=url_prefix + visium_files.tif_image,
	        )


	def main() -> None:
	    """Main entry point for the CLI.

	    Parses the command line, then either lists the known dataset IDs or
	    downloads the requested dataset into ``<output-dir>/<DATASET>``. Unless
	    ``--no-h5ad`` is given, the dataset is then read with squidpy and written
	    to ``<output-dir>/<DATASET>.h5ad``.

	    Exits with status 1, after a message on stderr, when the output directory
	    cannot be created, the download fails, or the h5ad export fails. A missing
	    or invalid dataset name is reported through ``parser.error``.
	    """
	    parser = argparse.ArgumentParser(
	        description="Download 10X Visium spatial datasets",
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	        epilog="""
	Examples:
	  # Download a dataset to the current directory
	  %(prog)s V1_Human_Heart

	  # Download to a specific directory
	  %(prog)s V1_Human_Heart --output-dir /path/to/datasets

	  # Include high-resolution TIFF image
	  %(prog)s V1_Human_Heart --include-hires-tiff

	    # Also creates V1_Human_Heart.h5ad next to the folder by default
	    # Disable h5ad writing if you only want raw 10x outputs
	    %(prog)s V1_Human_Heart --no-h5ad

	  # List all available datasets
	  %(prog)s --list-datasets
	        """,
	    )

	    # The positional is optional at the parser level so that --list-datasets
	    # can be used on its own; its presence is enforced manually below.
	    parser.add_argument(
	        "dataset",
	        nargs="?",
	        help="Name of Visium dataset to download. Use --list-datasets to see valid options.",
	        choices=VALID_DATASETS,
	        metavar="DATASET",
	    )

	    parser.add_argument(
	        "-o",
	        "--output-dir",
	        type=Path,
	        default=Path.cwd(),
	        help="Directory where the dataset will be downloaded (default: current directory)",
	    )

	    parser.add_argument(
	        "--include-hires-tiff",
	        action="store_true",
	        help="Download the high-resolution tissue section image (larger file size)",
	    )

	    parser.add_argument(
	        "--no-h5ad",
	        action="store_true",
	        help="Do not read with squidpy nor write the consolidated {DATASET}.h5ad file (default: write)",
	    )

	    parser.add_argument(
	        "-l",
	        "--list-datasets",
	        action="store_true",
	        help="List all available dataset IDs and exit",
	    )

	    args = parser.parse_args()

	    # Handle --list-datasets
	    if args.list_datasets:
	        print("Available Visium datasets:\n")
	        for ds in VALID_DATASETS:
	            print(f"  {ds}")
	        return

	    # Require dataset_id if not listing datasets
	    if not args.dataset:
	        parser.error("the following arguments are required: DATASET")

	    # Create output directory if it doesn't exist. A failure here (permission
	    # denied, a regular file in the way) is reported like every other error
	    # instead of surfacing as a traceback.
	    output_dir = args.output_dir.expanduser().absolute()
	    try:
	        output_dir.mkdir(parents=True, exist_ok=True)
	    except OSError as e:
	        print(f"\nError creating output directory {output_dir}: {e}", file=sys.stderr)
	        sys.exit(1)

	    print(f"Downloading dataset: {args.dataset}")
	    print(f"Output directory: {output_dir}")
	    if args.include_hires_tiff:
	        print("Including high-resolution TIFF image")

	    # Stage 1: download. Only the call that can fail sits inside the try.
	    try:
	        download_visium_dataset(
	            sample_id=args.dataset,
	            include_hires_tiff=args.include_hires_tiff,
	            base_dir=output_dir,
	        )
	    except (RuntimeError, OSError) as e:
	        print(f"\nError downloading dataset: {e}", file=sys.stderr)
	        sys.exit(1)

	    sample_path = output_dir / args.dataset
	    print("\nDownload complete!")
	    print(f"Dataset location: {sample_path}")

	    if args.no_h5ad:
	        return

	    # Stage 2: load into AnnData via squidpy and write an .h5ad next to the folder
	    try:
	        from squidpy.read._read import visium as read_visium  # type: ignore

	        # Pass the image only if it was requested and is actually on disk;
	        # otherwise call the reader without the argument at all.
	        read_kwargs = {}
	        hires_img = sample_path / "image.tif"
	        if args.include_hires_tiff and hires_img.exists():
	            read_kwargs["source_image_path"] = hires_img

	        print("Reading dataset into AnnData …")
	        adata = read_visium(sample_path, **read_kwargs)

	        h5ad_path = output_dir / f"{args.dataset}.h5ad"
	        print(f"Writing {h5ad_path} …")
	        # Use write_h5ad for clarity; overwrites if exists
	        adata.write_h5ad(h5ad_path)
	        print("h5ad save complete!")
	    except (ImportError, OSError, RuntimeError, ValueError) as e:
	        # FileNotFoundError is a subclass of OSError, so it is covered here.
	        print(
	            f"\nError while creating h5ad (use --no-h5ad to skip): {e}",
	            file=sys.stderr,
	        )
	        sys.exit(1)


	if __name__ == "__main__":
	    main()
No results found