Last active
January 22, 2026 04:41
-
-
Save jsstevenson/2d82bab16f6ae537fbafe3d7ab50e833 to your computer and use it in GitHub Desktop.
Simple script that grabs a point-in-time snapshot of basic usage stats (GitHub stars/forks/participants, PyPI downloads, Docker Hub/GHCR pulls) for specific libraries of interest.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # dependencies = [ | |
| # "requests", | |
| # ] | |
| # /// | |
| import os | |
| from pathlib import Path | |
| from datetime import datetime | |
| import csv | |
| import requests | |
| projects = [ | |
| # specs | |
| { | |
| "name": "VRS", | |
| "github_org": "ga4gh", | |
| "github_repo": "vrs", | |
| }, | |
| { | |
| "name": "VA-Spec", | |
| "github_org": "ga4gh", | |
| "github_repo": "va-spec", | |
| }, | |
| { | |
| "name": "Cat-VRS", | |
| "github_org": "ga4gh", | |
| "github_repo": "cat-vrs", | |
| }, | |
| { | |
| "name": "Metaschema Processor", | |
| "github_org": "ga4gh", | |
| "github_repo": "gks-metaschema", | |
| "pypi_name": "ga4gh.gks.metaschema", | |
| }, | |
| # spec implementations | |
| { | |
| "name": "vrs-python", | |
| "github_org": "ga4gh", | |
| "github_repo": "vrs-python", | |
| "ghcr": False, | |
| "dockerhub_org": "ga4gh", | |
| "dockerhub_name": "vrs-python", | |
| "pypi_name": "ga4gh.vrs", | |
| }, | |
| { | |
| "name": "cat-vrs-python", | |
| "github_org": "ga4gh", | |
| "github_repo": "cat-vrs-python", | |
| "ghcr": False, | |
| "dockerhub_org": None, | |
| "dockerhub_name": None, | |
| "pypi_name": "ga4gh.cat-vrs", | |
| }, | |
| { | |
| "name": "va-spec-python", | |
| "github_org": "ga4gh", | |
| "github_repo": "va-spec-python", | |
| "ghcr": False, | |
| "dockerhub_org": None, | |
| "dockerhub_name": None, | |
| "pypi_name": "ga4gh.va-spec", | |
| }, | |
| # biocommons | |
| { | |
| "name": "seqrepo", | |
| "github_org": "biocommons", | |
| "github_repo": "biocommons.seqrepo", | |
| "ghcr": False, | |
| "dockerhub_org": "biocommons", | |
| "dockerhub_name": "seqrepo", | |
| "pypi_name": "biocommons.seqrepo", | |
| }, | |
| { | |
| "name": "hgvs", | |
| "github_org": "biocommons", | |
| "github_repo": "hgvs", | |
| "ghcr": False, | |
| "dockerhub_org": "biocommons", | |
| "dockerhub_name": "hgvs", | |
| "pypi_name": "hgvs", | |
| }, | |
| { | |
| "name": "bioutils", | |
| "github_org": "biocommons", | |
| "github_repo": "bioutils", | |
| "ghcr": False, | |
| "dockerhub_org": None, | |
| "dockerhub_name": None, | |
| "pypi_name": "bioutils", | |
| }, | |
| { | |
| "name": "anyvar", | |
| "github_org": "biocommons", | |
| "github_repo": "anyvar", | |
| "ghcr": True, | |
| "dockerhub_org": None, | |
| "dockerhub_name": None, | |
| "pypi_name": "biocommons.anyvar", | |
| }, | |
| { | |
| "name": "uta", | |
| "github_org": "biocommons", | |
| "github_repo": "uta", | |
| "ghcr": False, | |
| "dockerhub_org": "biocommons", | |
| "dockerhub_name": "uta", | |
| "pypi_name": None, | |
| }, | |
| ] | |
| GITHUB_API = "https://api.github.com" | |
| GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") | |
| def gh_get(url: str, params: dict | None = None) -> requests.Response: | |
| headers = {"Accept": "application/vnd.github+json"} | |
| if GITHUB_TOKEN: | |
| headers["Authorization"] = f"Bearer {GITHUB_TOKEN}" | |
| r = requests.get(url, params=params, headers=headers) | |
| r.raise_for_status() | |
| return r | |
| def get_gh_repo_stats(org: str, repo: str) -> dict: | |
| """Fetch stars and forks for a GitHub repo.""" | |
| url = f"{GITHUB_API}/repos/{org}/{repo}" | |
| r = gh_get(url) | |
| data = r.json() | |
| return { | |
| "stars": data["stargazers_count"], | |
| "forks": data["forks_count"], | |
| "html_url": data["html_url"], | |
| } | |
| # def get_gh_contributor_count(org: str, repo: str) -> int: | |
| # url = f"{GITHUB_API}/repos/{org}/{repo}/contributors" | |
| # params = {"per_page": 100} | |
| # count = 0 | |
| # | |
| # while url: | |
| # r = gh_get(url, params) | |
| # contributors = r.json() | |
| # count += len(contributors) | |
| # | |
| # link = r.headers.get("Link") | |
| # if link and 'rel="next"' in link: | |
| # url = link.split(";")[0].strip("<>") | |
| # params = None | |
| # else: | |
| # url = None | |
| # | |
| # return count | |
| def _paginate(url: str, params: dict | None = None): | |
| seen = set() | |
| while url: | |
| if url in seen: | |
| raise RuntimeError(f"Pagination loop detected at {url}") | |
| seen.add(url) | |
| r = gh_get(url, params) | |
| data = r.json() | |
| if not isinstance(data, list): | |
| raise TypeError(f"Expected list response from {url}, got {type(data)}") | |
| yield from data | |
| url = r.links.get("next", {}).get("url") | |
| params = None | |
| def get_pr_creators_and_issue_participants(org: str, repo: str, state: str = "all"): | |
| pr_creators: set[str] = set() | |
| issue_creators: set[str] = set() | |
| issue_commenters: set[str] = set() | |
| # 1) Creators of issues and PRs (same endpoint, differentiate via pull_request key) | |
| issues_url = f"{GITHUB_API}/repos/{org}/{repo}/issues" | |
| for item in _paginate(issues_url, params={"state": state, "per_page": 100}): | |
| user = (item.get("user") or {}).get("login") | |
| if not user: | |
| continue | |
| if "pull_request" in item: | |
| pr_creators.add(user) | |
| else: | |
| issue_creators.add(user) | |
| # 2) Commenters (repo-wide). Includes comments on both issues and PRs. | |
| comments_url = f"{GITHUB_API}/repos/{org}/{repo}/issues/comments" | |
| for c in _paginate(comments_url, params={"per_page": 100}): | |
| user = (c.get("user") or {}).get("login") | |
| if user: | |
| issue_commenters.add(user) | |
| issue_participants = issue_creators | issue_commenters | |
| union_all = pr_creators | issue_participants | |
| return ( | |
| pr_creators, | |
| issue_participants, | |
| len(pr_creators), # unique PR creators | |
| len(issue_participants), # unique issue creators + discussants | |
| len(union_all), # unique PR creators + issue participants | |
| ) | |
| def get_pypi_month_download_count(package_name: str) -> int: | |
| url = f"https://pypistats.org/api/packages/{package_name}/recent" | |
| r = requests.get(url) | |
| r.raise_for_status() | |
| return r.json()["data"]["last_month"] | |
| def get_dh_pull_count(org: str, name: str) -> int: | |
| url = f"https://hub.docker.com/v2/repositories/{org}/{name}/" | |
| r = requests.get(url) | |
| r.raise_for_status() | |
| return r.json()["pull_count"] | |
| def get_ghcr_total_downloads(org: str, name: str) -> int: | |
| # not sure how long this will work, it relies on a 3rd party service | |
| url = f"https://ghcr-badge.elias.eu.org/api/{org}/{name}/{name}" | |
| r = requests.get(url) | |
| r.raise_for_status() | |
| return r.json()["downloadCountRaw"] | |
| def main(): | |
| results = [] | |
| for project in projects: | |
| org = project["github_org"] | |
| name = project["github_repo"] | |
| gh_stats = get_gh_repo_stats(org, name) | |
| # gh_contributors = get_gh_contributor_count(org, name) | |
| _, _, _, _, gh_participants = get_pr_creators_and_issue_participants(org, name) | |
| if project.get("pypi_name"): | |
| pypi_downloads = get_pypi_month_download_count(project["pypi_name"]) | |
| else: | |
| pypi_downloads = None | |
| if project.get("ghcr"): | |
| ghcr_downloads = get_ghcr_total_downloads(org, name) | |
| else: | |
| ghcr_downloads = None | |
| if project.get("dockerhub_org") and project.get("dockerhub_name"): | |
| dh_pull_count = get_dh_pull_count( | |
| project["dockerhub_org"], project["dockerhub_name"] | |
| ) | |
| else: | |
| dh_pull_count = None | |
| row = { | |
| "name": project["name"], | |
| "github_url": gh_stats["html_url"], | |
| "gh_stars": gh_stats["stars"], | |
| "gh_forks": gh_stats["forks"], | |
| # "gh_contributors": gh_contributors, | |
| "gh_participants": gh_participants, | |
| "pypi_name": project.get("pypi_name"), | |
| "last_month_pypi_downloads": pypi_downloads, | |
| "dh_pull_count": dh_pull_count, | |
| "total_ghcr_downloads": ghcr_downloads, | |
| } | |
| results.append(row) | |
| with Path(f"lib_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv").open( | |
| "w" | |
| ) as f: | |
| writer = csv.DictWriter(f, fieldnames=results[0].keys()) | |
| writer.writeheader() | |
| writer.writerows(results) | |
| if __name__ == "__main__": | |
| raise SystemExit(main()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment