# Mlawrence95/goodreads_rss_to_yaml.py

## goodreads_rss_to_yaml.py
import feedparser  # via conda install anaconda::feedparser
import yaml
from bs4 import BeautifulSoup

# Goodreads "review list" RSS feed URL that the script downloads. The
# <XXXX...> segments are placeholders and must be replaced before running:
# the path segment after list_rss/, the `key` query parameter and the `shelf`
# query parameter (presumably the user id, the account's RSS key and the
# shelf name -- TODO confirm against your own Goodreads RSS link).
_GOODREADS_RSS_STREAM_URL = "https://www.goodreads.com/review/list_rss/<XXXXXXXXXX>?key=<XXXXXXXXXXXXXX>&shelf=<XXXX>"
# Old yaml lives here. We'll use it to ensure our new dump has unique values.
_EXISTING_YAML_PATH = "docs/_data/books.yml"
# Where the filtered (new-only) book list is written; overwritten on each run.
_NEW_YAML_PATH = "books.yaml"


def parse_entry(entry: dict) -> dict:
    """Build one book record from a single Goodreads RSS feed entry.

    The title, author name, large cover URL and page count are copied from
    ``entry`` unchanged. The book description is reduced to its plain text,
    and four placeholder fields are appended for later manual editing.

    Raises:
        KeyError: If ``entry`` lacks one of the fields read below.
    """
    record = {
        "title": entry["title"],
        "author": entry["author_name"],
        "cover": entry["book_large_image_url"],
        "pages": entry["num_pages"],
        # The description may contain HTML markup; keep only the text.
        "description": BeautifulSoup(entry["book_description"], "html.parser").text,
    }

    # Placeholders: these values are meant to be filled in by hand afterwards.
    record.update(rating=0, genre="unspecified", status="Read", review="Pending")
    return record


def filter_existing_books(all_parsed: list[dict], existing_path: str | None) -> list[dict]:
    """Loads previous results, then removes them from the new list of results if the book titles match."""
    if not existing_path:
        return all_parsed
    with open(existing_path, "r", encoding="utf-8") as f:
        existing_books = yaml.safe_load(f)
    existing_book_keys = set(b["title"].lower() for b in existing_books)
    return [e for e in all_parsed if e["title"].lower() not in existing_book_keys]


def main() -> None:
    """Fetch the Goodreads feed and write the books not yet in the existing YAML.

    Downloads and parses ``_GOODREADS_RSS_STREAM_URL``, converts every entry
    with ``parse_entry``, removes the titles already listed in
    ``_EXISTING_YAML_PATH`` and dumps the remainder to ``_NEW_YAML_PATH``
    (overwriting it).

    Raises:
        SystemExit: If the feed could not be read and yielded no entries.
    """
    feed = feedparser.parse(_GOODREADS_RSS_STREAM_URL)

    # feedparser reports fetch/parse problems through the `bozo` flag and
    # returns whatever it managed to parse. Without this check a failed
    # download would overwrite the output file with an empty list.
    if feed.get("bozo") and not feed.entries:
        raise SystemExit(
            f"Could not read the Goodreads feed: {feed.get('bozo_exception')!r}"
        )

    parsed_entries = [parse_entry(entry) for entry in feed.entries]
    filtered_values = filter_existing_books(parsed_entries, _EXISTING_YAML_PATH)

    with open(_NEW_YAML_PATH, "w", encoding="utf-8") as file:
        yaml.dump(
            filtered_values,
            file,
            sort_keys=False,  # keep the field order produced by parse_entry
            default_flow_style=False,
            allow_unicode=True,
        )


if __name__ == "__main__":
    main()
	import feedparser # via conda install anaconda::feedparser
	import yaml
	from bs4 import BeautifulSoup

	# Goodreads "review list" RSS feed URL that the script downloads. The
	# <XXXX...> segments are placeholders and must be replaced before running:
	# the path segment after list_rss/, the `key` query parameter and the `shelf`
	# query parameter (presumably the user id, the account's RSS key and the
	# shelf name -- TODO confirm against your own Goodreads RSS link).
	_GOODREADS_RSS_STREAM_URL = "https://www.goodreads.com/review/list_rss/<XXXXXXXXXX>?key=<XXXXXXXXXXXXXX>&shelf=<XXXX>"
	# Old yaml lives here. We'll use it to ensure our new dump has unique values.
	_EXISTING_YAML_PATH = "docs/_data/books.yml"
	# Where the filtered (new-only) book list is written; overwritten on each run.
	_NEW_YAML_PATH = "books.yaml"


	def parse_entry(entry: dict) -> dict:
	    """Build one book record from a single Goodreads RSS feed entry.

	    The title, author name, large cover URL and page count are copied from
	    ``entry`` unchanged. The book description is reduced to its plain text,
	    and four placeholder fields are appended for later manual editing.

	    Raises:
	        KeyError: If ``entry`` lacks one of the fields read below.
	    """
	    vals = {}
	    vals["title"] = entry["title"]
	    vals["author"] = entry["author_name"]
	    vals["cover"] = entry["book_large_image_url"]
	    vals["pages"] = entry["num_pages"]

	    # Description sometimes has HTML in it; keep only the text.
	    soup = BeautifulSoup(entry["book_description"], "html.parser")
	    vals["description"] = soup.text

	    # Placeholders: fill these in by hand later.
	    vals["rating"] = 0
	    vals["genre"] = "unspecified"
	    vals["status"] = "Read"
	    vals["review"] = "Pending"

	    return vals


	def filter_existing_books(all_parsed: list[dict], existing_path: str \| None) -> list[dict]:
	"""Loads previous results, then removes them from the new list of results if the book titles match."""
	if not existing_path:
	return all_parsed
	with open(existing_path, "r", encoding="utf-8") as f:
	existing_books = yaml.safe_load(f)
	existing_book_keys = set(b["title"].lower() for b in existing_books)
	return [e for e in all_parsed if e["title"].lower() not in existing_book_keys]


	def main() -> None:
	    """Fetch the Goodreads feed and write the books not yet in the existing YAML.

	    Downloads and parses ``_GOODREADS_RSS_STREAM_URL``, converts every entry
	    with ``parse_entry``, removes the titles already listed in
	    ``_EXISTING_YAML_PATH`` and dumps the remainder to ``_NEW_YAML_PATH``
	    (overwriting it).

	    Raises:
	        SystemExit: If the feed could not be read and yielded no entries.
	    """
	    feed = feedparser.parse(_GOODREADS_RSS_STREAM_URL)

	    # feedparser reports fetch/parse problems through the `bozo` flag and
	    # returns whatever it managed to parse. Without this check a failed
	    # download would overwrite the output file with an empty list.
	    if feed.get("bozo") and not feed.entries:
	        raise SystemExit(
	            f"Could not read the Goodreads feed: {feed.get('bozo_exception')!r}"
	        )

	    parsed_entries = [parse_entry(entry) for entry in feed.entries]
	    filtered_values = filter_existing_books(parsed_entries, _EXISTING_YAML_PATH)

	    with open(_NEW_YAML_PATH, "w", encoding="utf-8") as file:
	        yaml.dump(
	            filtered_values,
	            file,
	            sort_keys=False,  # keep the field order produced by parse_entry
	            default_flow_style=False,
	            allow_unicode=True,
	        )


	if __name__ == "__main__":
	    main()
# No results found