sspaeti/extract_subscriptions.py

## extract_subscriptions.py
#!/usr/bin/env python3
"""
Extract YouTube subscription data and create RSS feed URLs
"""

import json
import re
import html
from pathlib import Path

def extract_json_data(html_file):
    """Extract the ``ytInitialData`` JSON object embedded in a saved HTML page.

    The page is searched for the first ``var ytInitialData = {`` assignment
    and the object that follows is decoded by the JSON parser itself, so a
    ``};`` sequence inside a JSON string cannot truncate the payload.

    Args:
        html_file: Path of the saved HTML file; it is read as UTF-8.

    Returns:
        The decoded JSON value, or ``None`` when the page contains no
        ``ytInitialData`` assignment.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the text after the assignment is not valid
            JSON.
    """
    with open(html_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Only locate where the object starts. Finding its end is left to the
    # JSON decoder, which understands nesting and string escapes; a regex
    # that stops at the first "};" does not.
    match = re.search(r'var ytInitialData\s*=\s*(?=\{)', content)
    if match is None:
        return None

    data, _ = json.JSONDecoder().raw_decode(content, match.end())
    return data

def extract_subscriptions(data):
    """Extract channel information from a decoded ``ytInitialData`` object.

    Walks the first tab's section list and collects every ``channelRenderer``
    found inside the shelves of each ``itemSectionRenderer``.

    Args:
        data: The decoded ``ytInitialData`` value (nested dicts and lists).

    Returns:
        A list of dicts with the keys ``channel_id``, ``title`` and
        ``handle``. Channels without a ``channelId`` are left out. If the
        section list cannot be reached, an error is printed and the list is
        empty; if a single shelf has an unexpected layout, it is reported and
        skipped while the remaining shelves are still processed.
    """
    subscriptions = []

    try:
        # Navigate through the JSON structure
        contents = data['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']

        for section in contents:
            if 'itemSectionRenderer' not in section:
                continue
            for item in section['itemSectionRenderer']['contents']:
                try:
                    _collect_shelf_channels(item, subscriptions)
                except (KeyError, TypeError, AttributeError) as e:
                    # One malformed shelf must not discard the channels
                    # found in the other shelves.
                    print(f"Skipping malformed shelf: {e}")
    except (KeyError, IndexError, TypeError) as e:
        # IndexError covers an empty ``tabs`` list.
        print(f"Error parsing data: {e}")

    return subscriptions


def _collect_shelf_channels(item, subscriptions):
    """Append every channel listed in one shelf *item* to *subscriptions*."""
    if 'shelfRenderer' not in item:
        return

    shelf_items = item['shelfRenderer']['content']['expandedShelfContentsRenderer']['items']
    for channel_item in shelf_items:
        if 'channelRenderer' not in channel_item:
            continue
        channel = channel_item['channelRenderer']

        channel_id = channel.get('channelId', '')
        if not channel_id:
            continue

        subscriptions.append({
            'channel_id': channel_id,
            'title': channel.get('title', {}).get('simpleText', ''),
            # NOTE(review): the handle is read from subscriberCountText;
            # presumably YouTube puts the "@handle" text there - confirm.
            'handle': channel.get('subscriberCountText', {}).get('simpleText', ''),
        })

def format_rss_feed(channel_id, title, handle):
    """Format one subscription as an RSS feed line.

    The line has the form ``<feed url> ! youtube "~<title> @<handle>"``.

    Args:
        channel_id: YouTube channel id placed in the feed URL.
        title: Channel title shown inside the quoted tag.
        handle: Channel handle, with or without a leading ``@``. May be empty
            or ``None``, in which case the tag holds the title alone instead
            of ending in a dangling ``@``.

    Returns:
        The formatted line, without a trailing newline.
    """
    handle = (handle or '').strip()
    # Add the "@" prefix only when there is a handle to prefix.
    if handle and not handle.startswith('@'):
        handle = f"@{handle}"

    rss_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"
    label = f"{title} {handle}" if handle else f"{title}"
    tag = f'youtube "~{label}"'

    return f"{rss_url} ! {tag}"

def main(html_file=None, output_file=None):
    """Convert a saved YouTube subscriptions page into a file of feed lines.

    Reads the page, extracts the subscribed channels, writes one formatted
    feed line per channel to the output file and prints a short summary with
    the first five lines.

    Args:
        html_file: Path of the saved HTML page. Defaults to
            "All subscriptions - YouTube.html" next to this script.
        output_file: Path of the text file to write. Defaults to
            "youtube_subscriptions_rss.txt" next to this script.

    Returns:
        None. Problems with the input are reported on stdout; in that case
        no output file is written.
    """
    script_dir = Path(__file__).parent
    if html_file is None:
        html_file = script_dir / "All subscriptions - YouTube.html"
    if output_file is None:
        output_file = script_dir / "youtube_subscriptions_rss.txt"

    print(f"Reading: {html_file}")
    try:
        data = extract_json_data(html_file)
    except (OSError, ValueError) as e:
        # OSError: missing or unreadable file. ValueError: undecodable text
        # or invalid JSON (json.JSONDecodeError is a ValueError).
        print(f"Error: Could not extract JSON data from HTML file: {e}")
        return

    if not data:
        print("Error: Could not extract JSON data from HTML file")
        return

    print("Extracting subscriptions...")
    subscriptions = extract_subscriptions(data)

    print(f"Found {len(subscriptions)} subscriptions")

    # Create RSS feed URLs
    rss_feeds = [
        format_rss_feed(sub['channel_id'], sub['title'], sub['handle'])
        for sub in subscriptions
    ]

    # Save to file, one feed per line; every line is newline-terminated so
    # the file can be appended to or concatenated safely.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{feed}\n" for feed in rss_feeds)

    print(f"\nSaved {len(rss_feeds)} RSS feeds to: {output_file}")

    # Show first 5 examples
    print("\nFirst 5 examples:")
    for feed in rss_feeds[:5]:
        print(feed)

# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python3
	"""
	Extract YouTube subscription data and create RSS feed URLs
	"""

	import json
	import re
	import html
	from pathlib import Path

	def extract_json_data(html_file):
	    """Extract the ``ytInitialData`` JSON object embedded in a saved HTML page.

	    The page is searched for the first ``var ytInitialData = {`` assignment
	    and the object that follows is decoded by the JSON parser itself, so a
	    ``};`` sequence inside a JSON string cannot truncate the payload.

	    Args:
	        html_file: Path of the saved HTML file; it is read as UTF-8.

	    Returns:
	        The decoded JSON value, or ``None`` when the page contains no
	        ``ytInitialData`` assignment.

	    Raises:
	        OSError: If the file cannot be opened or read.
	        json.JSONDecodeError: If the text after the assignment is not valid
	            JSON.
	    """
	    with open(html_file, 'r', encoding='utf-8') as f:
	        content = f.read()

	    # Only locate where the object starts. Finding its end is left to the
	    # JSON decoder, which understands nesting and string escapes; a regex
	    # that stops at the first "};" does not.
	    match = re.search(r'var ytInitialData\s*=\s*(?=\{)', content)
	    if match is None:
	        return None

	    data, _ = json.JSONDecoder().raw_decode(content, match.end())
	    return data

	def extract_subscriptions(data):
	    """Extract channel information from a decoded ``ytInitialData`` object.

	    Walks the first tab's section list and collects every ``channelRenderer``
	    found inside the shelves of each ``itemSectionRenderer``.

	    Args:
	        data: The decoded ``ytInitialData`` value (nested dicts and lists).

	    Returns:
	        A list of dicts with the keys ``channel_id``, ``title`` and
	        ``handle``. Channels without a ``channelId`` are left out. If the
	        section list cannot be reached, an error is printed and the list is
	        empty; if a single shelf has an unexpected layout, it is reported and
	        skipped while the remaining shelves are still processed.
	    """
	    subscriptions = []

	    try:
	        # Navigate through the JSON structure
	        contents = data['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']

	        for section in contents:
	            if 'itemSectionRenderer' not in section:
	                continue
	            for item in section['itemSectionRenderer']['contents']:
	                try:
	                    _collect_shelf_channels(item, subscriptions)
	                except (KeyError, TypeError, AttributeError) as e:
	                    # One malformed shelf must not discard the channels
	                    # found in the other shelves.
	                    print(f"Skipping malformed shelf: {e}")
	    except (KeyError, IndexError, TypeError) as e:
	        # IndexError covers an empty ``tabs`` list.
	        print(f"Error parsing data: {e}")

	    return subscriptions


	def _collect_shelf_channels(item, subscriptions):
	    """Append every channel listed in one shelf *item* to *subscriptions*."""
	    if 'shelfRenderer' not in item:
	        return

	    shelf_items = item['shelfRenderer']['content']['expandedShelfContentsRenderer']['items']
	    for channel_item in shelf_items:
	        if 'channelRenderer' not in channel_item:
	            continue
	        channel = channel_item['channelRenderer']

	        channel_id = channel.get('channelId', '')
	        if not channel_id:
	            continue

	        subscriptions.append({
	            'channel_id': channel_id,
	            'title': channel.get('title', {}).get('simpleText', ''),
	            # NOTE(review): the handle is read from subscriberCountText;
	            # presumably YouTube puts the "@handle" text there - confirm.
	            'handle': channel.get('subscriberCountText', {}).get('simpleText', ''),
	        })

	def format_rss_feed(channel_id, title, handle):
	    """Format one subscription as an RSS feed line.

	    The line has the form ``<feed url> ! youtube "~<title> @<handle>"``.

	    Args:
	        channel_id: YouTube channel id placed in the feed URL.
	        title: Channel title shown inside the quoted tag.
	        handle: Channel handle, with or without a leading ``@``. May be empty
	            or ``None``, in which case the tag holds the title alone instead
	            of ending in a dangling ``@``.

	    Returns:
	        The formatted line, without a trailing newline.
	    """
	    handle = (handle or '').strip()
	    # Add the "@" prefix only when there is a handle to prefix.
	    if handle and not handle.startswith('@'):
	        handle = f"@{handle}"

	    rss_url = f"https://www.youtube.com/feeds/videos.xml?channel_id={channel_id}"
	    label = f"{title} {handle}" if handle else f"{title}"
	    tag = f'youtube "~{label}"'

	    return f"{rss_url} ! {tag}"

	def main(html_file=None, output_file=None):
	    """Convert a saved YouTube subscriptions page into a file of feed lines.

	    Reads the page, extracts the subscribed channels, writes one formatted
	    feed line per channel to the output file and prints a short summary with
	    the first five lines.

	    Args:
	        html_file: Path of the saved HTML page. Defaults to
	            "All subscriptions - YouTube.html" next to this script.
	        output_file: Path of the text file to write. Defaults to
	            "youtube_subscriptions_rss.txt" next to this script.

	    Returns:
	        None. Problems with the input are reported on stdout; in that case
	        no output file is written.
	    """
	    script_dir = Path(__file__).parent
	    if html_file is None:
	        html_file = script_dir / "All subscriptions - YouTube.html"
	    if output_file is None:
	        output_file = script_dir / "youtube_subscriptions_rss.txt"

	    print(f"Reading: {html_file}")
	    try:
	        data = extract_json_data(html_file)
	    except (OSError, ValueError) as e:
	        # OSError: missing or unreadable file. ValueError: undecodable text
	        # or invalid JSON (json.JSONDecodeError is a ValueError).
	        print(f"Error: Could not extract JSON data from HTML file: {e}")
	        return

	    if not data:
	        print("Error: Could not extract JSON data from HTML file")
	        return

	    print("Extracting subscriptions...")
	    subscriptions = extract_subscriptions(data)

	    print(f"Found {len(subscriptions)} subscriptions")

	    # Create RSS feed URLs
	    rss_feeds = [
	        format_rss_feed(sub['channel_id'], sub['title'], sub['handle'])
	        for sub in subscriptions
	    ]

	    # Save to file, one feed per line; every line is newline-terminated so
	    # the file can be appended to or concatenated safely.
	    with open(output_file, 'w', encoding='utf-8') as f:
	        f.writelines(f"{feed}\n" for feed in rss_feeds)

	    print(f"\nSaved {len(rss_feeds)} RSS feeds to: {output_file}")

	    # Show first 5 examples
	    print("\nFirst 5 examples:")
	    for feed in rss_feeds[:5]:
	        print(feed)

	# Run the extraction only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == '__main__':
	    main()
No results found