Skip to content

Instantly share code, notes, and snippets.

@renorzr
Created October 29, 2013 05:37
Show Gist options
  • Select an option

  • Save renorzr/7209602 to your computer and use it in GitHub Desktop.

Select an option

Save renorzr/7209602 to your computer and use it in GitHub Desktop.
compose random addresses from www.youbianku.com
# encoding: utf-8
require 'nokogiri'
require 'open-uri'
require 'cgi'
require 'yaml'
# Region names whose pages are crawled with Crawler#crawl_province.
# Each name is URL-escaped and used as a site path by the driver loop below.
# Frozen so the list cannot be mutated accidentally at runtime.
PROVINCES = %w[
  河北 内蒙古 山西
  安徽 江苏 浙江 山东 江西 福建
  广东 广西 海南
  河南 湖北 湖南
  黑龙江 吉林 辽宁
  陕西 甘肃 宁夏 青海 新疆
  四川 云南 贵州 西藏
].freeze

# Municipality names; the driver loop crawls these with
# Crawler#crawl_city(path, true) because their pages use a different layout.
MUNICIPALITIES = %w[北京 天津 上海 重庆].freeze
# Scrapes province, city, district and street listings from www.youbianku.com.
#
# Every fetch goes through #visit, which stores the parsed page in @doc; the
# parse_* helpers always read whichever page was visited most recently.
class Crawler
  BASE_URL = 'http://www.youbianku.com'

  # Fetches BASE_URL + path, parses it as HTML and keeps it as the current
  # document. The full URL is printed to stdout as a progress trace.
  #
  # @param path [String] path appended verbatim to BASE_URL
  # @return [Object] the document produced by Nokogiri::HTML
  def visit(path)
    url = BASE_URL + path
    puts url
    # Kernel#open no longer opens URLs on Ruby >= 3.0, so go through the
    # open-uri method on the URI object. The block form closes the download
    # handle as soon as parsing is done.
    @doc = URI.parse(url).open { |io| Nokogiri::HTML(io) }
  end

  # Crawls every city linked from a province page.
  #
  # @param province [String] path of the province page
  # @return [Hash] city name => result of #crawl_city for that city
  def crawl_province(province)
    visit(province)
    city_names = @doc.xpath('//tr/td[1]/p/a[1]').map(&:content)
    # Collect the hrefs up front: crawl_city replaces @doc on every visit.
    city_paths = @doc.xpath('//td/p[contains(text(),"进入:")]').map do |element|
      element.css('a').first['href']
    end
    # Names and links are paired purely by position on the page.
    city_paths.zip(city_names).each_with_object({}) do |(path, name), result|
      result[name] = crawl_city(path)
    end
  end

  # Crawls every district linked from a city (or municipality) page.
  # The unescaped path is printed to stdout as a progress trace.
  #
  # @param city [String] path of the city page
  # @param muni [Boolean] true to parse the page with #parse_muni_page
  #   instead of #parse_city_page
  # @return [Hash] district name => { postcode: String, streets: Array<String> }
  def crawl_city(city, muni=false)
    puts CGI.unescape(city)
    visit(city)
    postcodes, district_names, districts = muni ? parse_muni_page : parse_city_page
    # The three lists are parallel; missing entries pair up as nil.
    districts.zip(district_names, postcodes).each_with_object({}) do |(path, name, postcode), result|
      visit(path)
      result[name] = { postcode: postcode, streets: parse_district_page }
    end
  end

  # @return [Array<String>] text of every link matched by //table//p/a on the
  #   current page (used as the street list of a district)
  def parse_district_page
    @doc.xpath('//table//p/a').map(&:content)
  end

  # Extracts the parallel district columns from a regular city page.
  #
  # @return [Array(Array<String>, Array<String>, Array<String>)]
  #   postcodes, district names, district page hrefs
  def parse_city_page
    postcodes = @doc.xpath('//tr/td[2]/dl/dd/a').map(&:content)
    district_names = @doc.xpath('//tr/td[1]/dl/dd/a[1]').map(&:content)
    districts = @doc.xpath('//tr/td[3]/dl/dd/a').map { |el| el['href'] }
    [postcodes, district_names, districts]
  end

  # Extracts the parallel district columns from a municipality page.
  #
  # @return [Array(Array<String>, Array<String>, Array<String>)]
  #   postcodes, district names, district page hrefs
  def parse_muni_page
    postcodes = @doc.xpath('//table[2]//tr/td[1]/p/a[2]').map(&:content)
    # The first matched name is discarded -- presumably a non-district entry
    # such as a heading link; confirm against the live page layout.
    district_names = @doc.xpath('//table[2]//tr/td[1]/p/a[1]').map(&:content).drop(1)
    districts = @doc.xpath('//table[2]//tr/td[2]/p/a[1]').map { |el| el['href'] }
    [postcodes, district_names, districts]
  end
end
#p Crawler.new.crawl_province('/' + CGI.escape('河北'))
#p Crawler.new.crawl_city('/' + CGI.escape('北京'), true)
#__END__
# Driver: crawl every municipality and province and dump each result to
# "<name>.yml" in the current working directory.

# Municipalities are crawled directly as cities, using the municipality layout.
MUNICIPALITIES.each do |city|
  puts "Crawling #{city}"
  districts = Crawler.new.crawl_city('/' + CGI.escape(city), true)
  File.write("#{city}.yml", YAML.dump(districts))
end

# Provinces are crawled one level higher: province page -> cities -> districts.
PROVINCES.each do |province|
  puts "Crawling #{province}"
  cities = Crawler.new.crawl_province('/' + CGI.escape(province))
  File.write("#{province}.yml", YAML.dump(cities))
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment