Created
June 18, 2023 17:59
-
-
Save kitsamho/3db820ae977056c38a5c62efc9458da9 to your computer and use it in GitHub Desktop.
Class to query data from the TMDB API using tmdbsimple
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TMDBMovieScraper:
    """Collect movie data from the TMDB API (via ``tmdbsimple``) into one DataFrame.

    The scraper works in two phases:

    1. *Discover*: page through the discover endpoint for every requested year
       and collect the raw result pages.
    2. *Details*: for every discovered movie ID, query each data stream declared
       in the movie payload (the streams handled here are ``info``, ``reviews``
       and ``keywords``) and keep only the configured columns.

    The two result sets are then joined on the TMDB movie ID.
    """

    # Filters applied to every discover request. Kept in one place so that the
    # page count and the pages themselves always come from the same query.
    _DISCOVER_FILTERS = {
        'with_original_language': 'en',
        'include_adult': False,
        'vote_count_gte': 100,
    }

    # Columns retained from the raw discover results.
    _DISCOVER_COLUMNS = ['id', 'title', 'overview', 'popularity', 'release_date',
                         'vote_average', 'poster_path']

    def __init__(self, years_check: list):
        """
        Initializes the scraper with the years to query.

        Args:
            years_check (list): List of years to iterate over for querying movie data.
        """
        self.discover_api = tmdb.Discover()  # Shared discover endpoint wrapper
        self.years_check = years_check  # Years to iterate over
        self.discover_results = []  # One entry (list of movie dicts) per fetched page
        # Payload is read as {stream_name: {'cols': [...], 'results_parsed': [...]}};
        # it is produced by a helper defined outside this class.
        self.movie_payload = get_movie_payload()

    @staticmethod
    def _log_failure(message, *args):
        """
        Logs a warning, including the active traceback, for a skipped unit of work.

        Args:
            message (str): ``logging``-style format string.
            *args: Values interpolated into ``message``.
        """
        # Function-scope import: the module's import block is not part of this class.
        import logging
        logging.getLogger(__name__).warning(message, *args, exc_info=True)

    def _check_page_counts(self, discover_api, year: int):
        """
        Returns the number of discover pages reported for a year.

        Note that this query filters on ``year`` only. ``_request_discover_data``
        does not use it, because its requests carry extra filters and therefore
        have a different page count.

        Args:
            discover_api (tmdb.Discover): Instance of tmdb.Discover module.
            year (int): Year to query.
        Returns:
            total_pages_to_loop (int): Number of response pages reported by the API.
        """
        return discover_api.movie(year=year, page=1)['total_pages']

    def _fetch_discover_page(self, year, page):
        """
        Fetches a single filtered discover page.

        Args:
            year (int): Primary release year to query.
            page (int): 1-based page number.
        Returns:
            dict or None: The raw API response, or None when the request failed.
        """
        try:
            return self.discover_api.movie(primary_release_year=year,
                                           page=page,
                                           **self._DISCOVER_FILTERS)
        except Exception:
            # Best effort: one failed page must not abort the whole scrape.
            self._log_failure('Discover request failed for year %s, page %s', year, page)
            return None

    def _request_discover_data(self, years_check: list):
        """
        Queries the discover API for every page of every requested year.

        Each page's ``results`` list is appended to ``self.discover_results``.
        A year whose first page cannot be fetched is skipped; any later page
        that fails is skipped individually.

        Args:
            years_check (list): List of years to iterate over for querying movie data.
        """
        for year in years_check:
            first_page = self._fetch_discover_page(year, 1)
            if first_page is None:
                continue  # Page count unknown, nothing sensible to iterate over
            self.discover_results.append(first_page.get('results', []))

            # The page count is taken from the filtered query itself, and the
            # range is inclusive so the final page is not dropped.
            total_pages = first_page.get('total_pages', 1)
            for page in range(2, total_pages + 1):
                response = self._fetch_discover_page(year, page)
                if response is not None:
                    self.discover_results.append(response.get('results', []))

    def _request_movie_data(self, movie_ids: list):
        """
        Queries every payload data stream for each movie ID.

        Parsed rows are appended to the matching ``results_parsed`` list of the
        payload. A movie whose request or parsing fails is logged and skipped.

        Args:
            movie_ids (list): List of movie IDs.
        """
        for movie_id in movie_ids:
            try:
                for stream, entry in self.movie_payload.items():
                    # Parse from a local variable: the payload is shared between
                    # worker threads, so reading the response back from the
                    # payload could pick up another movie's data.
                    response = self._get_film_responses(movie_id, stream)
                    entry['movie_response'] = response  # Retained for existing readers
                    entry['results_parsed'].append(
                        self._parse_movie_response(response, entry['cols']))
            except Exception:
                self._log_failure('Skipping movie %s: request or parsing failed', movie_id)

    def _transform_discover_results(self, discover_results):
        """
        Transforms raw discover pages into a DataFrame indexed by movie ID.

        Empty pages are ignored, and a movie that appears on more than one page
        is kept once (first occurrence).

        Args:
            discover_results: Iterable of pages, each a list of movie dicts.
        Returns:
            discover_df (pd.DataFrame): Discover columns plus ``release_year``,
                indexed by ``id``. Empty (but with the same columns) when no
                results were collected.
        """
        frames = [pd.DataFrame(page) for page in discover_results if page]
        if not frames:
            # pd.concat raises on an empty sequence; return the expected shape instead.
            empty = pd.DataFrame(columns=self._DISCOVER_COLUMNS + ['release_year'])
            return empty.set_index('id')

        discover_df = pd.concat(frames, ignore_index=True)
        discover_df = discover_df[self._DISCOVER_COLUMNS].drop_duplicates(subset='id').copy()
        # Release dates are split on '-' and the first part kept as the year;
        # non-string values (e.g. missing dates) are passed through unchanged.
        discover_df['release_year'] = discover_df['release_date'].apply(
            lambda date: date.split("-")[0] if isinstance(date, str) else date)
        return discover_df.set_index('id')

    def _transform_movie_results(self):
        """
        Joins the parsed rows of every payload data stream on movie ID.

        Streams without any parsed ``id`` column are left out. The first
        remaining stream is the left side of the joins.

        Returns:
            movie_df (pd.DataFrame): Joined DataFrame indexed by ``id``; empty
                when no stream produced any rows.
        """
        frames = []
        for entry in self.movie_payload.values():
            parsed = pd.DataFrame(entry['results_parsed'])
            if 'id' in parsed.columns:
                frames.append(parsed.set_index('id'))

        if not frames:
            return pd.DataFrame(index=pd.Index([], name='id'))

        movie_df = frames[0]
        for frame in frames[1:]:
            movie_df = movie_df.join(frame)
        return movie_df

    def _get_movie_ids(self, discover_dataframe):
        """
        Retrieves the unique movie IDs from the discover DataFrame index.

        Args:
            discover_dataframe (pd.DataFrame): DataFrame of discover results.
        Returns:
            movie_ids (list): De-duplicated list of movie IDs (order not guaranteed).
        """
        return list(set(discover_dataframe.index))

    def _get_film_response(self, movie_id):
        """
        Builds the TMDB movie endpoint wrapper for a movie.

        Args:
            movie_id: ID of the movie.
        Returns:
            film_response (tmdb.Movies): Endpoint wrapper for the movie.
        """
        return tmdb.Movies(movie_id)

    def _get_film_responses(self, movie_id, data_stream='info'):
        """
        Fetches one data stream for a movie.

        Args:
            movie_id: ID of the movie.
            data_stream (str): One of ``'reviews'``, ``'info'`` or ``'keywords'``.
        Returns:
            film_response: API response for the requested data stream.
        Raises:
            ValueError: If ``data_stream`` is not a supported stream name.
        """
        film = self._get_film_response(movie_id)
        fetchers = {
            'reviews': film.reviews,
            'info': film.info,
            'keywords': film.keywords,
        }
        if data_stream not in fetchers:
            # Previously this fell through and returned None, which only failed
            # later with an unrelated error.
            raise ValueError(f'Unsupported data stream: {data_stream!r}')
        return fetchers[data_stream]()

    def _parse_movie_response(self, response_dic, cols_needed):
        """
        Keeps only the required keys of a movie response.

        Args:
            response_dic (dict): Movie response dictionary.
            cols_needed (list): Keys to keep.
        Returns:
            parsed_response (dict): Subset of ``response_dic`` restricted to ``cols_needed``.
        """
        return {key: value for key, value in response_dic.items() if key in cols_needed}

    def _dict_to_list(self, x, key_name):
        """
        Extracts one key from every dictionary in a sequence.

        Args:
            x: Sequence of dictionaries (any other value is returned as is).
            key_name (str): Key to extract from each dictionary.
        Returns:
            converted_list: List of extracted values, or ``x`` unchanged when it
                is not an iterable of mappings containing ``key_name``.
        """
        try:
            return [item[key_name] for item in x]
        except (TypeError, KeyError, IndexError):
            # Non-iterable cells (e.g. missing values) or unexpected shapes are
            # deliberately passed through untouched.
            return x

    def _merge_clean_and_filter(self):
        """
        Joins discover and movie data and flattens the nested columns.

        Nested columns (lists of dicts) are reduced to lists of a single field;
        columns that are absent from the joined frame are skipped.

        Returns:
            df (pd.DataFrame): Merged and cleaned DataFrame (also stored on ``self.df``).
        """
        self.df = self.discover_df.join(self.movie_df)
        nested_fields = {'results': 'content', 'genres': 'name',
                         'production_countries': 'name', 'keywords': 'name'}
        for column, field in nested_fields.items():
            if column in self.df.columns:
                self.df[column] = self.df[column].apply(self._dict_to_list, key_name=field)
        # NOTE(review): 'id' is the index at this point, so only 'title' is
        # actually renamed by this column rename - confirm whether the index
        # should be renamed to 'tmdb_id' as well.
        self.df = self.df.rename(columns={'id': 'tmdb_id', 'title': 'movie'})
        return self.df

    def get_movies(self):
        """
        Runs the full retrieval pipeline and returns the final DataFrame.

        Both request phases are dispatched through ``MultiThreading`` (defined
        outside this class).

        Returns:
            df_final (pd.DataFrame): Joined discover and movie data.
        """
        print(f'Getting data for {self.years_check}')
        mt = MultiThreading(10, self.years_check, None)
        mt.Run(self._request_discover_data)
        self.discover_df = self._transform_discover_results(self.discover_results)
        self.movie_ids = self._get_movie_ids(self.discover_df)
        print(f'Getting movie data for {len(self.movie_ids)} movies..')
        mt = MultiThreading(10, self.movie_ids, None)
        mt.Run(self._request_movie_data)
        self.movie_df = self._transform_movie_results()
        self.df_final = self._merge_clean_and_filter()
        print('Done')
        return self.df_final
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment