Created
August 7, 2013 21:01
-
-
Save Zanfa/6178639 to your computer and use it in GitHub Desktop.
AngelList scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrapes the AngelList listing of claimed startups for the "estonia" slug and
# mirrors each startup into the local Startup collection through Mongoid.
require 'open-uri'
require 'angellist_api'
require 'mongoid'

# Load the Startup model
require File.expand_path('app/models/startup', File.dirname(__FILE__))

Mongoid.load!('config/mongoid.yml', :development)

# Captures the numeric startup ids embedded in a listing response. The pattern
# expects each quote around the id to be preceded by a literal backslash.
STARTUP_ID_PATTERN = /data-id=\\"(\d+)\\"/

# Builds the listing URL for the given page number.
# Using 25 per page, can't seem to get more.
def listing_url(page)
  "https://angel.co/new_tags/load_more?page=#{page}&per_page=25&skip_loading=true&include_ids=&claimed=true&slug=estonia"
end

# Walks the paginated listing and returns every startup id found (as strings),
# without duplicates, in order of first appearance.
def scrape_startup_ids
  startup_ids = []
  page = 1

  loop do
    # URI#read is provided by open-uri. Unlike the Kernel#open override it
    # still accepts URLs on Ruby 3+, and it closes the stream once read.
    response = URI.parse(listing_url(page)).read

    # For development purposes
    #temp = File.open('test.txt', 'w+')
    #temp.write response
    #response = File.open('test.txt', 'r').read

    new_startup_ids = response.scan(STARTUP_ID_PATTERN).flatten.uniq - startup_ids

    # No more new startups, must have hit the end. Comparing against the ids
    # already collected also stops the loop if the server keeps repeating a page.
    break if new_startup_ids.empty?

    startup_ids.concat(new_startup_ids)
    page += 1
  end

  startup_ids
end

# Fetches one startup from the AngelList API and creates or updates the
# matching local Startup document, reporting the outcome on the console.
def sync_startup(startup_id)
  puts "Startup with id: #{startup_id}"
  angellist_startup = AngellistApi.get_startup(startup_id)
  startup = Startup.find_or_initialize_by(angellist_id: angellist_startup.id)

  # Remap AngelList fields to ours
  startup.angellist_id = angellist_startup.id
  startup.name = angellist_startup.name
  startup.url = angellist_startup.company_url
  startup.angellist_url = angellist_startup.angellist_url
  startup.logo_url = angellist_startup.logo_url
  startup.twitter_url = angellist_startup.twitter_url
  startup.description = angellist_startup.product_desc

  # Parse AngelList "markets" aka tags. A payload without markets yields no
  # tags instead of raising NoMethodError on nil.
  startup.tags = (angellist_startup.markets || []).map { |market| market.display_name }

  # Only claim success when the document was actually persisted.
  if startup.save
    puts "#{angellist_startup.name} updated"
  else
    warn "#{angellist_startup.name} could not be saved: #{startup.errors.full_messages.join(', ')}"
  end
end

startup_ids = scrape_startup_ids
puts "Number of startups found: #{startup_ids.length}"

# Parsing the server response
startup_ids.each { |startup_id| sync_startup(startup_id) }

puts "All listed startups:"
Startup.all.each { |startup| puts startup.name }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment