Skip to content

Instantly share code, notes, and snippets.

@Crocmagnon
Created November 9, 2018 07:41
Show Gist options
  • Select an option

  • Save Crocmagnon/e908234b5d7e0a1f97a989470008d56c to your computer and use it in GitHub Desktop.

Select an option

Save Crocmagnon/e908234b5d7e0a1f97a989470008d56c to your computer and use it in GitHub Desktop.
Extract videos from YT
import sys
import time
from http.client import RemoteDisconnected

import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
from urllib3.exceptions import ProtocolError
BASE_URL = "https://www.youtube.com"
# Documents are PUT to this endpoint, keyed by video id. The port and the
# "/_doc/" path look like a local Elasticsearch index -- TODO confirm.
INDEX_URL = "http://localhost:9200/videos/_doc/"
WATCH_PREFIX = "/watch?v="
TITLE_SUFFIX = " - YouTube"
# Upper bound on the number of videos visited in one run.
MAX_VIDEOS = 1000000000
# Seconds before an HTTP request is abandoned instead of hanging forever.
REQUEST_TIMEOUT = 30
# Seconds to wait between attempts when the index cannot be reached.
RETRY_DELAY = 1


def clean_title(raw_title):
    """Return *raw_title* without a trailing " - YouTube" marker.

    Only a suffix is removed, so a title that merely contains the marker
    in the middle is left intact.
    """
    if raw_title.endswith(TITLE_SUFFIX):
        return raw_title[:-len(TITLE_SUFFIX)]
    return raw_title


def video_id_from_suffix(suffix):
    """Return the video id contained in a "/watch?v=<id>" path."""
    return suffix.replace(WATCH_PREFIX, '')


def fetch_page(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Raises whatever ``requests.get`` raises (connection errors, timeouts).
    """
    html = requests.get(url, timeout=REQUEST_TIMEOUT).text
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    return BeautifulSoup(html, "html.parser")


def extract_video_data(url, soup):
    """Build the document to index for the page *soup* fetched from *url*.

    Returns a dict with the keys ``url``, ``title``, ``views`` and ``date``,
    or ``None`` when the page lacks a title, a numeric view count or a
    publication date.
    """
    views_tags = soup.select('meta[itemprop="interactionCount"]')
    date_tags = soup.select('meta[itemprop="datePublished"]')
    title_tag = soup.find('title')
    if not views_tags or not date_tags or title_tag is None:
        return None
    try:
        views = int(views_tags[0]['content'])
        date = date_tags[0]['content']
    except (KeyError, ValueError):
        return None
    return {
        'url': url,
        'title': clean_title(title_tag.getText()),
        'views': views,
        'date': date,
    }


def is_available(soup):
    """Tell whether the page *soup* describes a video that can be crawled.

    A page is rejected when it has no title, when its title is exactly
    'YouTube', or when its first ``regionsAllowed`` meta tag has an empty
    ``content`` attribute.
    """
    title_tag = soup.find('title')
    if title_tag is None or title_tag.getText() == 'YouTube':
        return False
    regions = soup.select('meta[itemprop="regionsAllowed"]')
    if regions and regions[0].get('content') == "":
        return False
    return True


def index_video(video_id, data):
    """PUT *data* to the index under *video_id*, retrying until it is sent.

    Connection-level failures are retried indefinitely, pausing
    ``RETRY_DELAY`` seconds between attempts so an unreachable index does
    not turn into a busy loop. The response status code and JSON body are
    printed once the request goes through.
    """
    while True:
        try:
            response = requests.put(
                INDEX_URL + video_id, json=data, timeout=REQUEST_TIMEOUT
            )
        except (ConnectionError, ProtocolError, RemoteDisconnected,
                requests.exceptions.Timeout):
            time.sleep(RETRY_DELAY)
        else:
            break
    print(response.status_code)
    print(response.json())


def find_next_video(soup, seen):
    """Return the first unseen, usable video linked from the page *soup*.

    Links are examined in document order. The result is a tuple
    ``(suffix, data)`` where *suffix* is the "/watch?v=<id>" path and
    *data* is the document built by ``extract_video_data``; it is followed
    by the parsed page as a third element. ``None`` is returned when no
    link qualifies.
    """
    rejected = set()  # links already tried on this page; avoids re-downloading
    for anchor in soup.select('a'):
        href = anchor.get('href')  # anchors without href are skipped
        if not href:
            continue
        href = href.split('&')[0]
        if not href.startswith(WATCH_PREFIX):
            continue
        if href in seen or href in rejected:
            continue
        new_url = BASE_URL + href
        try:
            new_soup = fetch_page(new_url)
        except requests.exceptions.RequestException:
            rejected.add(href)
            continue
        data = extract_video_data(new_url, new_soup) if is_available(new_soup) else None
        if data is None:
            rejected.add(href)
            continue
        return href, data, new_soup
    return None


def main(argv=None):
    """Crawl from one video to the next, indexing each one on the way.

    *argv* holds the command-line arguments without the program name and
    defaults to ``sys.argv[1:]``; its first item is the id of the starting
    video. The crawl stops after ``MAX_VIDEOS`` videos or as soon as the
    current page links to no unseen, usable video.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit("usage: {} VIDEO_ID".format(sys.argv[0]))

    suffix = WATCH_PREFIX + args[0]
    url = BASE_URL + suffix
    soup = fetch_page(url)
    data = extract_video_data(url, soup)
    if data is None:
        sys.exit("could not read video metadata from " + url)

    seen = set()
    for _ in range(MAX_VIDEOS):
        seen.add(suffix)
        index_video(video_id_from_suffix(suffix), data)
        found = find_next_video(soup, seen)
        if found is None:
            # Dead end: nothing new to visit, so stop instead of indexing
            # the same video over and over.
            print("no further available video found; stopping", file=sys.stderr)
            break
        suffix, data, soup = found


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment