# renorzr/crawl_addresses.rb

## crawl_addresses.rb
# encoding: utf-8
require 'nokogiri'
require 'open-uri'
require 'cgi'
require 'yaml'

# Province-level region names to crawl. Each name is CGI-escaped and used as
# a site path by the crawl loops at the bottom of this file.
# Frozen so the shared list cannot be mutated by accident.
PROVINCES = %w[
  河北 内蒙古 山西
  安徽 江苏 浙江 山东 江西 福建
  广东 广西 海南
  河南 湖北 湖南
  黑龙江 吉林 辽宁
  陕西 甘肃 宁夏 青海 新疆
  四川 云南 贵州 西藏
].freeze

# Municipalities; these are crawled as cities with the municipality page
# layout (crawl_city called with muni = true).
MUNICIPALITIES = %w[北京 天津 上海 重庆].freeze

# Scrapes the province -> city -> district -> street hierarchy from
# youbianku.com and returns it as nested hashes.
#
# The most recently fetched page is kept in @doc; the parse_* methods read
# whatever page was visited last.
class Crawler
  BASE_URL = 'http://www.youbianku.com'

  # Fetches BASE_URL + path, prints the URL to stdout and stores the parsed
  # HTML document in @doc.
  #
  # @param path [String] site-relative path (e.g. '/' + CGI.escape(name))
  # @return [Object] the parsed Nokogiri document
  def visit(path)
    url = BASE_URL + path
    puts url
    # open-uri no longer patches Kernel#open for URLs on Ruby 3.0+;
    # URI.open is the supported entry point.
    @doc = Nokogiri::HTML(URI.open(url))
  end

  # Crawls every city linked from a province page.
  #
  # @param province [String] site-relative path of the province page
  # @return [Hash] city name => result of crawl_city for that city
  def crawl_province(province)
    visit(province)
    city_names = @doc.xpath('//tr/td[1]/p/a[1]').map(&:content)
    # Collect the hrefs before crawling: crawl_city replaces @doc.
    city_paths = @doc.xpath('//td/p[contains(text(),"进入：")]').map do |entry|
      entry.css('a').first['href']
    end
    # Names are paired with links by position; a missing name becomes nil.
    city_paths.zip(city_names).each_with_object({}) do |(path, name), result|
      result[name] = crawl_city(path)
    end
  end

  # Crawls every district linked from a city (or municipality) page.
  #
  # @param city [String] site-relative, URL-escaped path of the city page
  # @param muni [Boolean] when true the page is parsed with parse_muni_page
  #   instead of parse_city_page
  # @return [Hash] district name => { postcode: String, streets: Array<String> }
  def crawl_city(city, muni = false)
    puts CGI.unescape(city)
    visit(city)
    postcodes, district_names, districts = muni ? parse_muni_page : parse_city_page
    # Iterate over the district links; names and postcodes are matched by
    # position and are nil when the page yields fewer of them.
    districts.zip(district_names, postcodes).each_with_object({}) do |(path, name, postcode), result|
      visit(path)
      result[name] = { postcode: postcode, streets: parse_district_page }
    end
  end

  # @return [Array<String>] text of every link found under a table paragraph
  #   on the current page (used as the street list of a district)
  def parse_district_page
    @doc.xpath('//table//p/a').map(&:content)
  end

  # Extracts the district listing from a regular city page.
  #
  # @return [Array(Array<String>, Array<String>, Array<String>)]
  #   postcodes, district names and district hrefs, in page order
  def parse_city_page
    postcodes = @doc.xpath('//tr/td[2]/dl/dd/a').map(&:content)
    district_names = @doc.xpath('//tr/td[1]/dl/dd/a[1]').map(&:content)
    districts = @doc.xpath('//tr/td[3]/dl/dd/a').map { |link| link['href'] }
    [postcodes, district_names, districts]
  end

  # Extracts the district listing from a municipality page.
  #
  # @return [Array(Array<String>, Array<String>, Array<String>)]
  #   postcodes, district names and district hrefs, in page order
  def parse_muni_page
    postcodes = @doc.xpath('//table[2]//tr/td[1]/p/a[2]').map(&:content)
    # The first matched name is discarded; presumably it is not a district
    # link -- confirm against the live page layout.
    district_names = @doc.xpath('//table[2]//tr/td[1]/p/a[1]').map(&:content).drop(1)
    districts = @doc.xpath('//table[2]//tr/td[2]/p/a[1]').map { |link| link['href'] }
    [postcodes, district_names, districts]
  end
end

#p Crawler.new.crawl_province('/' + CGI.escape('河北'))
#p Crawler.new.crawl_city('/' + CGI.escape('北京'), true)
#__END__
# Crawl every municipality, then every province, writing each result as YAML
# to "<name>.yml" in the current working directory.
MUNICIPALITIES.each do |city|
  puts "Crawling #{city}"
  districts = Crawler.new.crawl_city('/' + CGI.escape(city), true)
  File.write("#{city}.yml", YAML.dump(districts))
end
PROVINCES.each do |province|
  puts "Crawling #{province}"
  cities = Crawler.new.crawl_province('/' + CGI.escape(province))
  File.write("#{province}.yml", YAML.dump(cities))
end
	# encoding: utf-8
	require 'nokogiri'
	require 'open-uri'
	require 'cgi'
	require 'yaml'

	# Province-level region names to crawl. Each name is CGI-escaped and used
	# as a site path by the crawl loops at the bottom of this listing.
	# Frozen so the shared list cannot be mutated by accident.
	PROVINCES = %w[
	  河北 内蒙古 山西
	  安徽 江苏 浙江 山东 江西 福建
	  广东 广西 海南
	  河南 湖北 湖南
	  黑龙江 吉林 辽宁
	  陕西 甘肃 宁夏 青海 新疆
	  四川 云南 贵州 西藏
	].freeze

	# Municipalities; these are crawled as cities with the municipality page
	# layout (crawl_city called with muni = true).
	MUNICIPALITIES = %w[北京 天津 上海 重庆].freeze

	# Scrapes the province -> city -> district -> street hierarchy from
	# youbianku.com and returns it as nested hashes.
	#
	# The most recently fetched page is kept in @doc; the parse_* methods read
	# whatever page was visited last.
	class Crawler
	  BASE_URL = 'http://www.youbianku.com'

	  # Fetches BASE_URL + path, prints the URL to stdout and stores the parsed
	  # HTML document in @doc.
	  #
	  # @param path [String] site-relative path (e.g. '/' + CGI.escape(name))
	  # @return [Object] the parsed Nokogiri document
	  def visit(path)
	    url = BASE_URL + path
	    puts url
	    # open-uri no longer patches Kernel#open for URLs on Ruby 3.0+;
	    # URI.open is the supported entry point.
	    @doc = Nokogiri::HTML(URI.open(url))
	  end

	  # Crawls every city linked from a province page.
	  #
	  # @param province [String] site-relative path of the province page
	  # @return [Hash] city name => result of crawl_city for that city
	  def crawl_province(province)
	    visit(province)
	    city_names = @doc.xpath('//tr/td[1]/p/a[1]').map(&:content)
	    # Collect the hrefs before crawling: crawl_city replaces @doc.
	    city_paths = @doc.xpath('//td/p[contains(text(),"进入：")]').map do |entry|
	      entry.css('a').first['href']
	    end
	    # Names are paired with links by position; a missing name becomes nil.
	    city_paths.zip(city_names).each_with_object({}) do |(path, name), result|
	      result[name] = crawl_city(path)
	    end
	  end

	  # Crawls every district linked from a city (or municipality) page.
	  #
	  # @param city [String] site-relative, URL-escaped path of the city page
	  # @param muni [Boolean] when true the page is parsed with parse_muni_page
	  #   instead of parse_city_page
	  # @return [Hash] district name => { postcode: String, streets: Array<String> }
	  def crawl_city(city, muni = false)
	    puts CGI.unescape(city)
	    visit(city)
	    postcodes, district_names, districts = muni ? parse_muni_page : parse_city_page
	    # Iterate over the district links; names and postcodes are matched by
	    # position and are nil when the page yields fewer of them.
	    districts.zip(district_names, postcodes).each_with_object({}) do |(path, name, postcode), result|
	      visit(path)
	      result[name] = { postcode: postcode, streets: parse_district_page }
	    end
	  end

	  # @return [Array<String>] text of every link found under a table paragraph
	  #   on the current page (used as the street list of a district)
	  def parse_district_page
	    @doc.xpath('//table//p/a').map(&:content)
	  end

	  # Extracts the district listing from a regular city page.
	  #
	  # @return [Array(Array<String>, Array<String>, Array<String>)]
	  #   postcodes, district names and district hrefs, in page order
	  def parse_city_page
	    postcodes = @doc.xpath('//tr/td[2]/dl/dd/a').map(&:content)
	    district_names = @doc.xpath('//tr/td[1]/dl/dd/a[1]').map(&:content)
	    districts = @doc.xpath('//tr/td[3]/dl/dd/a').map { |link| link['href'] }
	    [postcodes, district_names, districts]
	  end

	  # Extracts the district listing from a municipality page.
	  #
	  # @return [Array(Array<String>, Array<String>, Array<String>)]
	  #   postcodes, district names and district hrefs, in page order
	  def parse_muni_page
	    postcodes = @doc.xpath('//table[2]//tr/td[1]/p/a[2]').map(&:content)
	    # The first matched name is discarded; presumably it is not a district
	    # link -- confirm against the live page layout.
	    district_names = @doc.xpath('//table[2]//tr/td[1]/p/a[1]').map(&:content).drop(1)
	    districts = @doc.xpath('//table[2]//tr/td[2]/p/a[1]').map { |link| link['href'] }
	    [postcodes, district_names, districts]
	  end
	end

	#p Crawler.new.crawl_province('/' + CGI.escape('河北'))
	#p Crawler.new.crawl_city('/' + CGI.escape('北京'), true)
	#__END__
	# Crawl every municipality, then every province, writing each result as
	# YAML to "<name>.yml" in the current working directory.
	MUNICIPALITIES.each do |city|
	  puts "Crawling #{city}"
	  districts = Crawler.new.crawl_city('/' + CGI.escape(city), true)
	  File.write("#{city}.yml", YAML.dump(districts))
	end
	PROVINCES.each do |province|
	  puts "Crawling #{province}"
	  cities = Crawler.new.crawl_province('/' + CGI.escape(province))
	  File.write("#{province}.yml", YAML.dump(cities))
	end
# No results found