Created
November 9, 2018 07:41
-
-
Save Crocmagnon/e908234b5d7e0a1f97a989470008d56c to your computer and use it in GitHub Desktop.
Extract videos from YT
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Crawl YouTube from a seed video and index every visited video.

Usage: pass the ID of the first video as the only command-line argument.

For each visited video the script reads the view count and publication date
from the page's ``<meta itemprop=...>`` tags and PUTs a small JSON document to
``http://localhost:9200/videos/_doc/<video id>`` (presumably a local
Elasticsearch node -- confirm).  It then follows the first not-yet-visited
``/watch`` link on the page whose target looks available, and repeats.
"""
import sys
import time
from http.client import RemoteDisconnected

import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
from urllib3.exceptions import ProtocolError

BASE_URL = "https://www.youtube.com"
INDEX_URL = "http://localhost:9200/videos/_doc/"
WATCH_PREFIX = "/watch?v="
TITLE_SUFFIX = " - YouTube"
# Upper bound on the number of videos visited in one run.
MAX_VIDEOS = 1000000000
# Pause between attempts while the index endpoint is unreachable.
RETRY_DELAY_SECONDS = 1


def _fetch_soup(page_url):
    """Download *page_url* and return its parsed HTML."""
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so results vary by machine.
    return BeautifulSoup(requests.get(page_url).text, "html.parser")


def _clean_title(raw_title):
    """Return *raw_title* without the ' - YouTube' site suffix."""
    return raw_title.replace(TITLE_SUFFIX, "")


def _is_available(page, raw_title):
    """Return True unless *page* looks like an unavailable video.

    A page is rejected when its first ``regionsAllowed`` meta tag has an empty
    ``content``, or when its raw title is exactly 'YouTube'.
    """
    regions = page.select('meta[itemprop="regionsAllowed"]')
    if regions and regions[0]["content"] == "":
        return False
    return raw_title != "YouTube"


def _index_video(video_id, document):
    """PUT *document* to the index under *video_id* and return the response.

    Connection-level failures are retried indefinitely, pausing between
    attempts so an unreachable index does not turn into a busy loop.
    """
    while True:
        try:
            return requests.put(INDEX_URL + video_id, json=document)
        except (ConnectionError, ProtocolError, RemoteDisconnected):
            time.sleep(RETRY_DELAY_SECONDS)


if len(sys.argv) < 2:
    sys.exit("usage: " + sys.argv[0] + " FIRST_VIDEO_ID")

first_id = sys.argv[1]
suffix = WATCH_PREFIX + first_id
url = BASE_URL + suffix
soup = _fetch_soup(url)
title = _clean_title(soup.find("title").getText())
seen = set()

for _ in range(MAX_VIDEOS):
    seen.add(suffix)

    views = int(soup.select('meta[itemprop="interactionCount"]')[0]["content"])
    date = soup.select('meta[itemprop="datePublished"]')[0]["content"]
    data = {
        "url": url,
        "title": title,
        "views": views,
        "date": date,
    }
    video_id = suffix.replace(WATCH_PREFIX, "")
    req = _index_video(video_id, data)
    print(req.status_code)
    print(req.json())

    # Only anchors that actually carry an href can be followed; selecting on
    # the attribute avoids a KeyError on bare <a> tags.
    for el in soup.select("a[href]"):
        # Drop extra query parameters so the link compares equal to the
        # '/watch?v=<id>' entries stored in `seen`.
        href = el["href"].split("&")[0]
        if not href.startswith("/watch?") or href in seen:
            continue

        new_url = BASE_URL + href
        new_soup = _fetch_soup(new_url)
        new_title = new_soup.find("title").getText()
        if _is_available(new_soup, new_title):
            suffix = href
            soup = new_soup
            # Strip the site suffix here too, so every indexed title has the
            # same form as the seed video's title.
            title = _clean_title(new_title)
            url = new_url
            break
    else:
        # No unseen, available video is linked from this page.  Stop instead
        # of re-indexing the same video on every remaining iteration.
        break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment