Created
November 28, 2024 10:48
-
-
Save thorwhalen/92fa45e8f36f8caf155ea3012ad61629 to your computer and use it in GitHub Desktop.
github utils for raw sources
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from urllib.parse import urlparse | |
| import os | |
| from functools import lru_cache | |
def ensure_raw_github_url(url):
    """
    Return the raw-content form of a GitHub file URL.

    Only URLs whose host is ``github.com`` (or ``www.github.com``) and whose
    path has the shape ``/USER/REPO/blob/REF/...`` are rewritten. Any other
    input, including scheme-less or unparsable strings, is returned unchanged.

    Args:
        url (str): The URL of the GitHub file.

    Returns:
        str: The ``raw.githubusercontent.com`` version of the URL, or ``url``
        itself if it is not a GitHub file ("blob") URL.

    >>> ensure_raw_github_url("https://github.com/USER/REPO/blob/folder/path/to/resource.txt")
    'https://raw.githubusercontent.com/USER/REPO/folder/path/to/resource.txt'
    >>> ensure_raw_github_url("https://github.com/USER/REPO/blob/main/blob/resource.txt")
    'https://raw.githubusercontent.com/USER/REPO/main/blob/resource.txt'
    >>> ensure_raw_github_url("https://example.com/some/blob/page")
    'https://example.com/some/blob/page'
    """
    try:
        parsed = urlparse(url)
        host = parsed.hostname
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): nothing to convert.
        return url

    # Compare the actual host rather than searching for a substring, so that
    # look-alike hosts ("notgithub.com", "gist.github.com") are left alone.
    if host not in ('github.com', 'www.github.com'):
        return url

    # An absolute path splits into ['', USER, REPO, 'blob', REF, ...].
    segments = parsed.path.split('/')
    if len(segments) < 5 or segments[3] != 'blob':
        return url

    # Drop only the 'blob' marker; later path components that happen to be
    # named "blob" belong to the repository and must be kept.
    raw_path = '/'.join(segments[:3] + segments[4:])
    return parsed._replace(
        netloc='raw.githubusercontent.com', path=raw_path
    ).geturl()
@lru_cache(maxsize=10)
def get_url_text(url, *, timeout=30):
    """
    Download ``url`` and return the response body as text.

    Results are memoized with ``lru_cache`` (the 10 most recently used
    argument combinations), so repeating a call with the same arguments does
    not issue a new request while the entry is cached.

    Args:
        url (str): The URL of the file.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        str: The text content of the response.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status (raised by ``raise_for_status``).
        requests.exceptions.RequestException: For connection problems,
            timeouts, or an invalid URL.

    >>> get_url_text("https://raw.githubusercontent.com/USER/REPO/folder/path/to/resource.txt")  # doctest: +SKIP
    '...contents of resource.txt...'
    """
    # Local import: requests is only loaded when a download is requested.
    import requests

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text
def might_be_url(obj):
    """
    Tell whether a string looks like an HTTP(S) URL.

    Args:
        obj (str): The string to inspect.

    Returns:
        bool: True if ``obj`` starts with ``http://`` or ``https://``.

    >>> might_be_url("https://github.com/user/repo")
    True
    >>> might_be_url("http is a protocol")
    False
    """
    # Require the full scheme separator so ordinary text that merely begins
    # with the letters "http" is not mistaken for a URL.
    return obj.startswith(('http://', 'https://'))
def ensure_text(obj: str) -> str:
    """
    Resolve the input to text.

    The input is interpreted in this order:

    1. the path of an existing file: the file is read and its contents returned;
    2. something that ``might_be_url`` accepts: the content is downloaded,
       after GitHub URLs are converted to their raw-content form;
    3. anything else: the string itself is returned.

    Args:
        obj (str): A file path, a URL, or literal text.

    Returns:
        str: The text content.

    Raises:
        ValueError: If ``obj`` is not a string.

    Examples:
        >>> ensure_text("https://github.com/user/repo/blob/branch/path/to/file.txt")  # doctest: +SKIP
        '...contents of file.txt...'
        >>> ensure_text("Just a regular string.")
        'Just a regular string.'
    """
    if not isinstance(obj, str):
        raise ValueError("Input must be a string.")

    # An existing file path takes precedence over every other interpretation.
    if os.path.isfile(obj):
        with open(obj, 'r') as file_handle:
            contents = file_handle.read()
        return contents

    # Plain text: hand it back untouched.
    if not might_be_url(obj):
        return obj

    # URL: normalise GitHub "blob" links to raw links, then download.
    return get_url_text(ensure_raw_github_url(obj))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment