Python script to scrape company details from a public company page on LinkedIn.com. Written as part of the How to Scrape educational post - https://www.scrapehero.com/tutorial-scraping-linkedin-for-public-company-data/
from lxml import html
import json
import requests
from time import sleep


def linkedin_companies_parser(url):
    # Retry up to five times; LinkedIn often serves a login page or captcha instead of the content.
    for _ in range(5):
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
            response = requests.get(url, headers=headers)
            # The company data is embedded inside an HTML comment; strip the comment markers so lxml can parse it.
            formatted_response = response.text.replace('<!--', '').replace('-->', '')
            doc = html.fromstring(formatted_response)
            datafrom_xpath = doc.xpath('//code[@id="stream-promo-top-bar-embed-id-content"]//text()')
            if datafrom_xpath:
                try:
                    json_formatted_data = json.loads(datafrom_xpath[0])
                    company_name = json_formatted_data.get('companyName')
                    size = json_formatted_data.get('size')
                    industry = json_formatted_data.get('industry')
                    description = json_formatted_data.get('description')
                    follower_count = json_formatted_data.get('followerCount')
                    year_founded = json_formatted_data.get('yearFounded')
                    website = json_formatted_data.get('website')
                    company_type = json_formatted_data.get('companyType')
                    specialities = json_formatted_data.get('specialties')

                    headquarters = json_formatted_data.get('headquarters', {})
                    city = headquarters.get('city')
                    country = headquarters.get('country')
                    state = headquarters.get('state')
                    street1 = headquarters.get('street1')
                    street2 = headquarters.get('street2')
                    zip_code = headquarters.get('zip')
                    # Join only the address parts that are present.
                    street = ', '.join(part for part in (street1, street2) if part) or None

                    data = {
                        'company_name': company_name,
                        'size': size,
                        'industry': industry,
                        'description': description,
                        'follower_count': follower_count,
                        'founded': year_founded,
                        'website': website,
                        'type': company_type,
                        'specialities': specialities,
                        'city': city,
                        'country': country,
                        'state': state,
                        'street': street,
                        'zip': zip_code,
                        'url': url
                    }
                    return data
                except (ValueError, KeyError, TypeError):
                    print("cant parse page", url)
            # Retry in case of captcha or login page redirection
            if len(response.content) < 2000 or "trk=login_reg_redirect" in response.url:
                if response.status_code == 404:
                    print("linkedin page not found")
                else:
                    raise ValueError('redirecting to login page or captcha found')
        except (requests.RequestException, ValueError):
            print("retrying :", url)


def readurls():
    companyurls = ['https://www.linkedin.com/company/microsoft']
    extracted_data = []
    for url in companyurls:
        extracted_data.append(linkedin_companies_parser(url))
        sleep(5)
    # Write all scraped company records to data.json.
    with open('data.json', 'w') as f:
        json.dump(extracted_data, f, indent=4)


if __name__ == "__main__":
    readurls()
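To scrape more than one company, the parser can also be imported and driven from a separate script instead of editing readurls(). The sketch below is a minimal usage example, assuming the gist is saved as linkedin_company_scraper.py in the working directory; that module name, the second URL, and the companies.json output file are illustrative, not part of the original gist.

# usage sketch: assumes the gist above is saved as linkedin_company_scraper.py
import json
from time import sleep

from linkedin_company_scraper import linkedin_companies_parser  # assumed module name

urls = [
    'https://www.linkedin.com/company/microsoft',
    'https://www.linkedin.com/company/scrapehero',  # illustrative second URL
]

records = []
for url in urls:
    record = linkedin_companies_parser(url)
    if record:
        records.append(record)
    sleep(5)  # pause between requests, as in the original script

with open('companies.json', 'w') as out:
    json.dump(records, out, indent=4)

Each entry in companies.json follows the dictionary returned by linkedin_companies_parser (company_name, size, industry, headquarters fields, and so on); companies that could not be parsed are simply skipped.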