Created
August 31, 2020 22:28
-
-
Save mr-rigden/0ed520a8cd943159cf0640ccafb299bc to your computer and use it in GitHub Desktop.
podcast_to_dict.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from dateutil.parser import * | |
| from nested_lookup import nested_lookup | |
| import requests | |
| import requests_cache | |
| from slugify import slugify | |
| import xmltodict | |
| requests_cache.install_cache('demo_cache') | |
| def parse_pubDate(pubDate): | |
| if pubDate is None: | |
| return None | |
| return parse(pubDate).isoformat() | |
| def get_categories(channel): | |
| raw_categories = channel.get('itunes:category', None) | |
| categories = nested_lookup('@text', raw_categories) | |
| return categories | |
| def get_enclosure(item): | |
| raw_enclosure = item.get('enclosure', None) | |
| if raw_enclosure is None: | |
| return None | |
| enclosure = {} | |
| enclosure['length'] = raw_enclosure.get('@length', None) | |
| enclosure['type'] = raw_enclosure.get('@type', None) | |
| enclosure['url'] = raw_enclosure.get('@url', None) | |
| return enclosure | |
| def get_episode(item): | |
| episode = {} | |
| episode['category'] = item.get('category', None) | |
| episode['description'] = item.get('description', None) | |
| episode['enclosure'] = get_enclosure(item) | |
| episode['guid'] = item.get('guid', None).get("#text", None) | |
| episode['itunes:block'] = item.get('itunes:block', None) | |
| episode['itunes:duration'] = item.get('itunes:duration', None) | |
| try: | |
| episode['itunes:episode'] = int(item.get('itunes:episode', None)) | |
| except TypeError: | |
| episode['itunes:episode'] = None | |
| episode['itunes:episodeType'] = item.get('itunes:episodeType', None) | |
| episode['itunes:explicit'] = item.get('itunes:explicit', None) | |
| try: | |
| episode['itunes:image'] = item['itunes:image']['@href'] | |
| except KeyError: | |
| episode['itunes:image'] = None | |
| try: | |
| episode['itunes:season'] = int(item.get('itunes:season', None)) | |
| except TypeError: | |
| episode['itunes:season'] = None | |
| episode['itunes:title'] = item.get('itunes:title', None) | |
| episode['link'] = item.get('link', None) | |
| episode['pubDate'] = item.get('pubDate', None) | |
| episode['pubDate_dt'] = parse(episode['pubDate']) | |
| episode['pubDate_iso'] = parse_pubDate(episode['pubDate']) | |
| episode['pubDate_string'] = episode['pubDate_dt'].strftime("%B %-m, %Y ") | |
| episode['title'] = item.get('title', None) | |
| episode['slug'] = slugify(episode['title']) | |
| return episode | |
| def get_episodes(channel): | |
| episodes = [] | |
| items = channel.get('item', None) | |
| for item in items: | |
| episode = get_episode(item) | |
| episodes.append(episode) | |
| return episodes | |
| def podcast_to_dict(url): | |
| podcast_dict = {} | |
| podcast_dict['url'] = url | |
| r = requests.get(url) | |
| parsed_xml = xmltodict.parse(r.text) | |
| channel = parsed_xml['rss']['channel'] | |
| podcast_dict['copyright'] = channel.get('copyright', None) | |
| podcast_dict['creativeCommons:license'] = channel.get('creativeCommons:license', None) | |
| podcast_dict['description'] = channel.get('description', None) | |
| podcast_dict['episodes'] = get_episodes(channel) | |
| podcast_dict['generator'] = channel.get('generator', None) | |
| podcast_dict['image'] = channel.get('image', None).get('url', None) | |
| podcast_dict['itunes:author'] = channel.get('itunes:author', None) | |
| podcast_dict['itunes:block'] = channel.get('itunes:block', None) | |
| podcast_dict['itunes:category'] = get_categories(channel) | |
| podcast_dict['itunes:complete'] = channel.get('itunes:complete', None) | |
| podcast_dict['itunes:explicit'] = channel.get('itunes:explicit', None) | |
| podcast_dict['itunes:image'] = channel.get('itunes:image', None).get('@href', None) | |
| podcast_dict['itunes:keywords'] = channel.get('itunes:keywords', None) | |
| podcast_dict['itunes:owner'] = channel.get('itunes:owner', None) | |
| podcast_dict['itunes:new-feed-url'] = channel.get('itunes:new-feed-url', None) | |
| podcast_dict['itunes:subtitle'] = channel.get('itunes:subtitle', None) | |
| podcast_dict['itunes:title'] = channel.get('itunes:title', None) | |
| podcast_dict['itunes:type'] = channel.get('itunes:type', None) | |
| podcast_dict['language'] = channel.get('language', None) | |
| podcast_dict['lastBuildDate'] = channel.get('lastBuildDate', None) | |
| podcast_dict['lastBuildDate_iso'] = parse_pubDate(podcast_dict['lastBuildDate']) | |
| podcast_dict['link'] = channel.get('link', None) | |
| podcast_dict['managingEditor'] = channel.get('managingEditor', None) | |
| podcast_dict['pubDate'] = channel.get('pubDate', None) | |
| podcast_dict['pubDate_iso'] = parse_pubDate(podcast_dict['pubDate']) | |
| podcast_dict['webMaster'] = channel.get('webMaster', None) | |
| podcast_dict['title'] = channel.get('title', None) | |
| return podcast_dict |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment