-
-
Save jramiresbrito/e95c8366b0c9f7100fc967345a5205b3 to your computer and use it in GitHub Desktop.
Scraper class that extracts lyrics for any given artist from Musixmatch.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class MusixmatchScraper():
    """Scrape the lyrics of an artist who has a presence on Musixmatch.

    An instance needs to be created with an artist URL, e.g.
    https://www.musixmatch.com/artist/Bob-Dylan, and a genre label that is
    attached to every scraped song.

    The default number of songs to scrape is 50.
    """

    # Prefix used to turn the relative song links of a page into full URLs.
    BASE_URL = 'https://www.musixmatch.com'
    # Number of songs listed on one artist page; used to size the pagination.
    SONGS_PER_PAGE = 15
    # Browser-like user agent sent with every request.
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

    def __init__(self, artist_url, genre_label):
        """Store the artist URL, the artist name derived from it and the genre.

        Args:
            artist_url: URL of the artist page.
            genre_label: Value written to the ``genre`` column by ``Run``.
        """
        self.artist_url = artist_url  # artist URL exactly as given
        # Artist slug is whatever follows 'artist/'; a trailing slash is
        # dropped so it does not end up in the artist name.
        self.artist = artist_url.rstrip('/').split('artist/')[-1]
        self.genre_label = genre_label
        self.song_l = []  # one single-row DataFrame per scraped song

    def _get_html(self, url):
        """Download ``url`` and return it parsed as a BeautifulSoup object."""
        req = Request(url, headers={'user-agent': self.USER_AGENT})
        # The context manager closes the HTTP response once it has been read.
        with urlopen(req) as response:
            return BeautifulSoup(response.read(), 'html.parser')

    def _multithreadCompile(self, thread_count, iteration_list, func):
        """Split ``iteration_list`` into batches and wrap each in a thread.

        Args:
            thread_count: Number of batches to split the list into.
            iteration_list: Items to distribute over the threads.
            func: Callable invoked by each thread with its batch (a list).

        Returns:
            A list of unstarted ``threading.Thread`` objects, one per
            non-empty batch.
        """
        batches = [
            batch.tolist()
            for batch in np.array_split(iteration_list, thread_count)
        ]
        # Batches are empty when there are fewer items than threads; a thread
        # with nothing to do is not worth starting.
        return [
            threading.Thread(target=func, args=[batch])
            for batch in batches
            if batch
        ]

    def _multithreadExecute(self, jobs):
        """Start every thread in ``jobs`` and wait until all have finished."""
        for job in jobs:
            job.start()
        for job in jobs:
            job.join()

    def _getpageUrls(self, url):
        """Return the song URLs linked from one artist page.

        Links whose URL contains 'album' are left out. Title elements that
        carry no link are skipped.
        """
        html = self._get_html(url)
        song_urls = []
        for title in html.find_all('h2', {'class': 'media-card-title'}):
            link = title.find('a')
            href = link.get('href') if link is not None else None
            if not href:
                continue
            song_url = self.BASE_URL + href
            if 'album' not in song_url:
                song_urls.append(song_url)
        return song_urls

    def _getLyrics(self, song_url):
        """Return the lyrics found at ``song_url`` as a single string.

        Repeated lines (e.g. chorus lines) are kept only once, in order of
        first appearance, and empty lines are dropped. The remaining lines
        are joined with single spaces.
        """
        html = self._get_html(song_url)
        elements = html.find_all('span', {'class': 'lyrics__content__ok'})
        lines = [
            line
            for element in elements
            for line in element.text.split('\n')
        ]
        # dict.fromkeys removes duplicates while preserving line order.
        unique_lines = dict.fromkeys(lines)
        return ' '.join(line for line in unique_lines if line)

    def _getAllpageUrls(self, target=50):
        """Collect up to ``target`` song URLs from the artist's pages.

        Enough pages are requested to cover ``target`` songs (the page count
        is rounded up), and the combined list is trimmed to ``target``
        entries.
        """
        target = max(int(target), 0)
        pages, remainder = divmod(target, self.SONGS_PER_PAGE)
        if remainder:
            pages += 1  # a partially needed page still has to be fetched
        base = self.artist_url.rstrip('/')
        all_song_urls = []
        for page in range(1, pages + 1):
            all_song_urls.extend(self._getpageUrls(base + '/' + str(page)))
        return all_song_urls[:target]

    def _extractData(self, all_song_urls):
        """Scrape every URL in ``all_song_urls`` and store the results.

        For each song a one-row DataFrame with the columns ``artist``,
        ``song`` and ``lyrics`` is appended to ``self.song_l``. A song whose
        scrape raises is logged and skipped so one bad page does not stop
        the rest of the batch.
        """
        import logging

        logger = logging.getLogger(__name__)
        for song_url in tqdm_notebook(all_song_urls):
            try:
                song_lyrics = self._getLyrics(song_url)
            except Exception:
                logger.warning('Skipping %s: lyrics could not be scraped',
                               song_url, exc_info=True)
                continue
            # The song title is the last path segment of the song URL.
            song_title = song_url.split('/')[-1]
            self.song_l.append(
                pd.DataFrame(
                    [(self.artist, song_title, song_lyrics)],
                    columns=['artist', 'song', 'lyrics'],
                )
            )

    def Run(self, target=50):
        """Scrape up to ``target`` songs and return them as one DataFrame.

        Song URLs are gathered first and then scraped by five threads.

        Returns:
            A DataFrame with the columns ``artist``, ``song``, ``lyrics``
            and ``genre`` (also stored as ``self.df``), holding only songs
            with non-empty lyrics, or ``None`` when no song was scraped.
        """
        self.all_song_urls = self._getAllpageUrls(target)
        jobs = self._multithreadCompile(5, self.all_song_urls,
                                        self._extractData)
        self._multithreadExecute(jobs)
        if not self.song_l:
            return None  # nothing scraped, so there is nothing to concatenate
        df_final = pd.concat(self.song_l).reset_index(drop=True)
        # Drop songs without lyrics; copy so that adding the genre column
        # does not write into a view of df_final.
        self.df = df_final[df_final.lyrics.str.len() > 0].copy()
        self.df['genre'] = self.genre_label
        return self.df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment