Skip to content

Instantly share code, notes, and snippets.

@thorwhalen
Created November 28, 2024 10:48
Show Gist options
  • Select an option

  • Save thorwhalen/92fa45e8f36f8caf155ea3012ad61629 to your computer and use it in GitHub Desktop.

Select an option

Save thorwhalen/92fa45e8f36f8caf155ea3012ad61629 to your computer and use it in GitHub Desktop.
GitHub utilities for fetching raw sources
from urllib.parse import urlparse
import os
from functools import lru_cache
def ensure_raw_github_url(url):
    """
    Return the raw-content version of a GitHub file ("blob") URL.

    Only URLs shaped like ``[http(s)://][www.]github.com/USER/REPO/blob/REST``
    are rewritten. The host must be exactly github.com (optionally with a
    ``www.`` prefix), and only the ``blob`` segment that directly follows the
    repository name is removed, so a later ``/blob/`` inside the file path is
    left intact.

    Args:
        url (str): The URL of the GitHub file.

    Returns:
        str: The corresponding raw.githubusercontent.com URL, or ``url``
        unchanged if it is not a GitHub file URL.

    >>> ensure_raw_github_url("https://github.com/USER/REPO/blob/folder/path/to/resource.txt")
    'https://raw.githubusercontent.com/USER/REPO/folder/path/to/resource.txt'
    >>> ensure_raw_github_url("https://github.com/USER/REPO/blob/main/blob/notes.txt")
    'https://raw.githubusercontent.com/USER/REPO/main/blob/notes.txt'
    >>> ensure_raw_github_url("https://example.com/some/file.txt")
    'https://example.com/some/file.txt'
    """
    import re  # local import so this function does not depend on file-level imports

    # Groups: optional scheme, user, repo, everything after the blob segment.
    # DOTALL lets the last group capture the whole remainder of the string.
    match = re.match(
        r'((?:https?://)?)(?:www\.)?github\.com/([^/]+)/([^/]+)/blob/(.+)',
        url,
        re.DOTALL,
    )
    if match is None:
        return url  # Not a GitHub file URL: hand it back untouched

    scheme, user, repo, rest = match.groups()
    return scheme + 'raw.githubusercontent.com/' + '/'.join((user, repo, rest))
@lru_cache(maxsize=10)
def get_url_text(url, timeout=30):
    """
    Download a URL and return its body decoded as text.

    Results are memoized by ``lru_cache`` (up to 10 distinct argument
    combinations), so repeated calls with the same arguments return the
    cached text instead of issuing a new request.

    Args:
        url (str): The URL of the file.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely on a
            stalled connection.

    Returns:
        str: The text content of the response.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status (raised by ``raise_for_status``).
        requests.exceptions.RequestException: For other request failures,
            including timeouts.

    >>> get_url_text("https://raw.githubusercontent.com/USER/REPO/folder/path/to/resource.txt")  # doctest: +SKIP
    '...contents of resource.txt...'
    """
    # Imported here so that importing this module does not itself require requests.
    import requests

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text
def might_be_url(obj):
    """
    Tell whether a string looks like an HTTP(S) URL.

    The check requires the full ``http://`` or ``https://`` scheme prefix
    (case-insensitively), so ordinary text that merely begins with the
    letters "http" is not mistaken for a URL.

    Args:
        obj (str): The string to inspect.

    Returns:
        bool: True if ``obj`` starts with ``http://`` or ``https://``.

    >>> might_be_url("https://example.com/file.txt")
    True
    >>> might_be_url("https is a protocol")
    False
    """
    # Only the first 8 characters (len('https://')) can matter; slicing avoids
    # lower-casing a potentially large text.
    return obj[:8].lower().startswith(('http://', 'https://'))
def ensure_text(obj: str) -> str:
    """
    Resolve a string to text content.

    The input is interpreted in this order:

    1. If it is the path of an existing file, the file's contents are returned.
    2. If it looks like a URL, the URL is downloaded (GitHub file URLs are
       first converted to their raw-content form) and the text is returned.
    3. Otherwise the string itself is returned unchanged.

    Args:
        obj (str): A file path, a URL, or literal text.

    Returns:
        str: The text content.

    Raises:
        ValueError: If ``obj`` is not a string.

    Examples:
        >>> ensure_text("https://github.com/user/repo/blob/branch/path/to/file.txt")  # doctest: +SKIP
        '...contents of file.txt...'
        >>> ensure_text("Just a regular string.")
        'Just a regular string.'
    """
    if not isinstance(obj, str):
        raise ValueError("Input must be a string.")

    # An existing file path takes precedence over the other interpretations.
    if os.path.isfile(obj):
        with open(obj, 'r') as source_file:
            contents = source_file.read()
        return contents

    # Anything that is neither a file nor a URL is already the text itself.
    if not might_be_url(obj):
        return obj

    # GitHub "blob" page URLs are mapped to raw-content URLs before fetching.
    return get_url_text(ensure_raw_github_url(obj))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment