Created
September 21, 2019 23:31
-
-
Save dirceu-jr/03a5fdddf4b3108217e70ccc25c9194b to your computer and use it in GitHub Desktop.
This is a very old (2009) Gist of mine. It is a Ruby-based web server (using Sinatra) that parsed rental listings from real estate agencies in Londrina ("Little London"), Brazil. It used the awesome Yahoo Query Language (YQL) to "pre-parse" the agencies' HTML pages.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require 'rubygems' | |
| require 'sinatra' | |
| configure do | |
| require 'memcache' | |
| require 'typhoeus' | |
| require 'iconv' | |
| require 'json' | |
| require 'uri' | |
| require 'libxml' | |
| require 'erb' | |
| require 'geokit' | |
| # address to lat/lng | |
| include Geokit::Geocoders | |
# Percent-encodes +u+ for substitution into the YQL query path.
#
# Every character outside the RFC 2396 "unreserved" set (ASCII letters,
# digits and -_.!~*'()) is replaced by one %XX escape per byte.
#
# Implemented with String#gsub instead of URI.escape: URI.escape is obsolete
# and no longer exists in Ruby 3.0+, where calling it raises NoMethodError.
#
# u       - String to encode.
# Returns - the encoded String.
def url_encode(u)
  u.gsub(/[^\-_.!~*'()a-zA-Z\d]/) do |unsafe|
    # A matched character may span several bytes; escape each byte separately.
    unsafe.unpack('C*').map { |byte| format('%%%02X', byte) }.join
  end
end
# Thin client for Yahoo's public YQL endpoint, built on Typhoeus
# (http://github.com/pauldix/typhoeus/tree/master), which runs HTTP requests
# in parallel while encapsulating the response-handling logic.
class YQL
  include Typhoeus

  # Default success handler: parse the response body as JSON with the
  # nesting-depth limit disabled.
  parse_json = lambda do |response|
    JSON.parse(response.body, :max_nesting => false)
  end

  # Default failure handler: print the HTTP status code.
  report_failure = lambda do |response|
    puts "error code: #{response.code}"
  end

  remote_defaults(
    :on_success => parse_json,
    :on_failure => report_failure,
    :base_uri => "http://query.yahooapis.com/v1/public"
  )

  # Rails-like route: YQL.search(:url => ..., :xpath => ..., :format => ...)
  # fills the :url, :xpath and :format placeholders of the path below.
  define_remote_method(
    :search,
    :path => "/yql?q=select%20*%20from%20html%20where%20url%3D%22:url%22%20and%20xpath%3D':xpath'&format=:format"
  )
end
| end | |
# Remote HTTP requests are expensive; memcached stores each scraped listing so
# the route handlers can skip them on later requests.
CACHE = MemCache.new('127.0.0.1:11211')
# http://www.imobiliariainglaterra.com.br
#
# GET /imobiliariainglaterra
#
# Responds with the agency's rental listings as a JSON array, or as JSONP when
# a +callback+ query parameter is given. Each entry has :geo ([lat, lng] from
# YahooGeocoder) and :link (the listing's detail page). The scraped list is
# stored in memcached under "imobiliariainglaterra" and reused afterwards.
get '/imobiliariainglaterra' do
  # The callback name is echoed into the response body, so accept only
  # identifier-like values instead of arbitrary request input.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback.empty? || callback =~ /\A[\w.$]+\z/

  imoveis = CACHE["imobiliariainglaterra"]
  if imoveis.nil?
    search = YQL.search(
      :url => url_encode('http://www.imobiliariainglaterra.com.br/aluguel.asp'),
      :xpath => url_encode('/html/body/table/tr'),
      :format => 'json'
    )
    results = search && search['query'] && search['query']['results']
    rows = results && results['tr']
    # A failed or empty scrape is reported to the client and never cached.
    halt 502, 'listing source unavailable' if rows.nil?

    imoveis = []
    rows.each do |i|
      next unless i.is_a?(Hash)
      cells = i['td']
      next unless cells.is_a?(Array)

      # Cell 0 holds the reference code, cell 1 the street and cell 8 the
      # detail link. (Cells 2 and 7 carry bedrooms and price; not exported.)
      ref = cells[0]
      rua = cells[1]
      link = cells[8]

      code = ref && ref['font'] && ref['font']['content']
      # Only rows whose reference code contains "L-CA" are listed.
      next unless code.is_a?(String) && code =~ /L-CA/

      street = rua && rua['font'] && rua['font']['content']
      anchor = link && link['div'] && link['div']['a']
      # A listing needs both a street to geocode and a page to link to.
      next if street.nil? || anchor.nil?

      # Drop any parenthesised suffix before geocoding.
      geo = YahooGeocoder.geocode("#{street.split('(')[0]} - Londrina")
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => "http://www.imobiliariainglaterra.com.br/#{anchor['href']}"
      }
    end
    CACHE["imobiliariainglaterra"] = imoveis
  end

  if callback.empty?
    content_type 'application/json'
    imoveis.to_json
  else
    content_type 'application/javascript'
    "#{callback}(#{imoveis.to_json})"
  end
end
# http://www.imobiliariaatual.com.br
#
# GET /imobiliariaatual
#
# Responds with the agency's rental listings as a JSON array, or as JSONP when
# a +callback+ query parameter is given. Each entry has :geo ([lat, lng] from
# YahooGeocoder) and :link (the listing's detail page). The scraped list is
# stored in memcached under "imobiliariaatual" and reused afterwards.
get '/imobiliariaatual' do
  # The callback name is echoed into the response body, so accept only
  # identifier-like values instead of arbitrary request input.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback.empty? || callback =~ /\A[\w.$]+\z/

  imoveis = CACHE["imobiliariaatual"]
  if imoveis.nil?
    search = YQL.search(
      :url => url_encode('http://www.imobiliariaatual.com.br/aluguel.asp'),
      :xpath => url_encode('/html/body/table/tr'),
      :format => 'json'
    )
    results = search && search['query'] && search['query']['results']
    rows = results && results['tr']
    # A failed or empty scrape is reported to the client and never cached.
    halt 502, 'listing source unavailable' if rows.nil?

    imoveis = []
    rows.each do |i|
      next unless i.is_a?(Hash)
      cells = i['td']
      next unless cells.is_a?(Array)

      # Cell 0 holds the reference code, cell 1 the street and cell 8 the
      # detail link. (Cells 2 and 7 carry bedrooms and price; not exported.)
      ref = cells[0]
      rua = cells[1]
      link = cells[8]

      code = ref && ref['font'] && ref['font']['content']
      # Only rows whose reference code contains "L-CA" are listed.
      next unless code.is_a?(String) && code =~ /L-CA/

      street = rua && rua['font'] && rua['font']['content']
      anchor = link && link['div'] && link['div']['a']
      # A listing needs both a street to geocode and a page to link to.
      next if street.nil? || anchor.nil?

      # Drop any parenthesised suffix before geocoding.
      geo = YahooGeocoder.geocode("#{street.split('(')[0]} - Londrina")
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => "http://www.imobiliariaatual.com.br/#{anchor['href']}"
      }
    end
    CACHE["imobiliariaatual"] = imoveis
  end

  if callback.empty?
    content_type 'application/json'
    imoveis.to_json
  else
    content_type 'application/javascript'
    "#{callback}(#{imoveis.to_json})"
  end
end
# http://www.imobiliariadelta.com
#
# GET /imobiliariadelta
#
# Responds with the agency's rental listings as a JSON array, or as JSONP when
# a +callback+ query parameter is given. Each entry has :geo ([lat, lng] from
# YahooGeocoder) and :link (the listing's detail page). The result page is
# fetched through YQL, then every listing's detail page is fetched to read its
# street. The list is stored in memcached under "imobiliariadelta".
get '/imobiliariadelta' do
  # The callback name is echoed into the response body, so accept only
  # identifier-like values instead of arbitrary request input.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback.empty? || callback =~ /\A[\w.$]+\z/

  imoveis = CACHE["imobiliariadelta"]
  if imoveis.nil?
    listing = YQL.search(
      :url => url_encode('http://www.imobiliariadelta.com/engine.php?id=1444&page=resultado&cd_negocio=1&cd_tipo=11611'),
      :xpath => url_encode('//table[@width="400"]'),
      :format => 'json'
    )
    results = listing && listing['query'] && listing['query']['results']
    tables = results && results['table']
    # A failed or empty scrape is reported to the client and never cached.
    halt 502, 'listing source unavailable' if tables.nil?

    imoveis = []
    tables.each do |i|
      next unless i.is_a?(Hash)

      # Walk down the nested layout tables to the listing's detail href.
      lvl1 = i['tr'][1]['td'][1]
      lvl2 = lvl1['table'] unless lvl1.nil?
      lvl3 = lvl2[1]['tr']['td'][0]['table']['tr']['td'][1]['a']['href'] unless lvl2.nil?
      next if lvl3.nil?

      cid = /cd_imovel=(.*)/.match(lvl3)
      next if cid.nil?

      url = "http://www.imobiliariadelta.com/detalhes_geral.php?id=1444&cd_imovel=#{cid[1]}"
      # The detail page is requested as XML and parsed with LibXML so it can
      # be queried with XPath.
      detail = YQL.search(
        :url => url_encode(url),
        :xpath => url_encode('/html/body'),
        :format => 'xml',
        :on_success => lambda do |response|
          LibXML::XML::Parser.string(response.body).parse
        end
      )
      next if detail.nil?

      street_node = detail.find('//body/div/div/div/div/div[4]/p')[0]
      next if street_node.nil?

      rua = street_node.first.to_s.strip
      geo = YahooGeocoder.geocode("#{rua} - Londrina")
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => url
      }
    end
    CACHE["imobiliariadelta"] = imoveis
  end

  if callback.empty?
    content_type 'application/json'
    imoveis.to_json
  else
    content_type 'application/javascript'
    "#{callback}(#{imoveis.to_json})"
  end
end
# http://www.imobiliariaavenida.com.br
#
# GET /imobiliariaavenida
#
# Responds with the agency's rental listings as a JSON array, or as JSONP when
# a +callback+ query parameter is given. Each entry has :geo ([lat, lng] from
# YahooGeocoder) and :link (the listing's pop-up detail page). The scraped
# list is stored in memcached under "imobiliariaavenida" and reused afterwards.
get '/imobiliariaavenida' do
  # The callback name is echoed into the response body, so accept only
  # identifier-like values instead of arbitrary request input.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback.empty? || callback =~ /\A[\w.$]+\z/

  imoveis = CACHE["imobiliariaavenida"]
  if imoveis.nil?
    search = YQL.search(
      :url => url_encode('http://www.imobiliariaavenida.com.br/corpo_todos.php?txcomercializacao=loca%E7%E3o&txtipo=casa'),
      :xpath => url_encode('//table[@width="95%"]/tr'),
      :format => 'json'
    )
    results = search && search['query'] && search['query']['results']
    # A failed or empty scrape is reported to the client and never cached.
    halt 502, 'listing source unavailable' unless results.is_a?(Hash)

    imoveis = []
    results.each_value do |rows|
      next unless rows.is_a?(Array)

      rows.each do |k|
        next unless k.is_a?(Hash)
        cells = k['td']
        next unless cells.is_a?(Array)

        # Cell 0 links the street text; cell 2 links the price and carries the
        # listing id inside its href. (Cell 1 holds bedrooms; not exported.)
        street_anchor = cells[0] && cells[0]['div'] && cells[0]['div']['a']
        price_anchor = cells[2] && cells[2]['div'] && cells[2]['div']['a']
        # Decide whether the row is kept BEFORE geocoding, so discarded rows
        # do not cost a geocoder request.
        next if street_anchor.nil? || price_anchor.nil?

        street = street_anchor['font'] && street_anchor['font']['content']
        id = /\((.*)\)/.match(price_anchor['href'].to_s)
        next if street.nil? || id.nil?

        # NOTE(review): the first alternative below is presumably a
        # non-breaking space in the original file; confirm the byte survived.
        rua = street.gsub(/ |\n/, ' ')
        geo = YahooGeocoder.geocode("#{rua} - Londrina")
        imoveis << {
          :geo => [geo.lat, geo.lng],
          :link => "http://www.imobiliariaavenida.com.br/pop_detalhes.php?cdimovel=#{id[1]}"
        }
      end
    end
    CACHE["imobiliariaavenida"] = imoveis
  end

  if callback.empty?
    content_type 'application/json'
    imoveis.to_json
  else
    content_type 'application/javascript'
    "#{callback}(#{imoveis.to_json})"
  end
end
# http://www.ihimoveis.com.br/
#
# GET /ihimoveis
#
# Responds with the agency's rental listings as a JSON array, or as JSONP when
# a +callback+ query parameter is given. Each entry has :geo ([lat, lng] from
# YahooGeocoder) and :link (the listing's detail page). The page is fetched
# through YQL as XML and queried with LibXML XPath. The list is stored in
# memcached under "ihimoveis" and reused afterwards.
get '/ihimoveis' do
  # The callback name is echoed into the response body, so accept only
  # identifier-like values instead of arbitrary request input.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback.empty? || callback =~ /\A[\w.$]+\z/

  imoveis = CACHE["ihimoveis"]
  if imoveis.nil?
    doc = YQL.search(
      :url => url_encode('http://www.ihimoveis.com.br/imoveis.php?situacao=locacao&tipo=3'),
      :xpath => url_encode('//div[@id="destaque_fundo"]'),
      :format => 'xml',
      :on_success => lambda do |response|
        # <br/> tags are stripped before parsing.
        LibXML::XML::Parser.string(response.body.gsub(/<br\/>/, '')).parse
      end
    )
    # A failed scrape is reported to the client and never cached.
    halt 502, 'listing source unavailable' if doc.nil?

    imoveis = []
    doc.find('//div[@id="destaque_fundo"]').each do |o|
      address_anchor = o.find('div/a')[0]
      price_anchor = o.find('div[@id="destaques_valor"]/a')[0]
      # Skip cards that lack either the address link or the price link.
      next if address_anchor.nil? || price_anchor.nil?

      # The street is the text after the last <strong> element, up to the
      # city name.
      tail = address_anchor.inner_xml.split('strong>')[-1].to_s
      rua = tail.split('Londrina')[0].to_s.strip
      geo = YahooGeocoder.geocode("#{rua} - Londrina")
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => "http://www.ihimoveis.com.br/#{price_anchor['href']}"
      }
    end
    CACHE["ihimoveis"] = imoveis
  end

  if callback.empty?
    content_type 'application/json'
    imoveis.to_json
  else
    content_type 'application/javascript'
    "#{callback}(#{imoveis.to_json})"
  end
end
# http://www.imobiliariasenador.com.br
#
# GET /imobiliariasenador
#
# Responds with the agency's rental listings as a JSON array, or as JSONP when
# a +callback+ query parameter is given. Each entry has :geo ([lat, lng] from
# YahooGeocoder) and :link (the listing's pop-up detail page). The scraped
# list is stored in memcached under "imobiliariasenador" and reused afterwards.
get '/imobiliariasenador' do
  # The callback name is echoed into the response body, so accept only
  # identifier-like values instead of arbitrary request input.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback.empty? || callback =~ /\A[\w.$]+\z/

  imoveis = CACHE["imobiliariasenador"]
  if imoveis.nil?
    search = YQL.search(
      :url => url_encode('http://www.imobiliariasenador.com.br/corpo_todos.php?txcomercializacao=loca%E7%E3o&txtipo=casa'),
      :xpath => url_encode('/html/body/table/tr/td/div/form/table/tr/td/table/tr[2]/td/table/tr'),
      :format => 'json'
    )
    results = search && search['query'] && search['query']['results']
    rows = results && results['tr']
    # A failed or empty scrape is reported to the client and never cached.
    halt 502, 'listing source unavailable' if rows.nil?

    imoveis = []
    rows.each do |i|
      next unless i.is_a?(Hash) && i['td'].is_a?(Hash)
      card = i['td']['table']
      next unless card.is_a?(Hash)

      # The listing id is the argument of the table's onclick handler.
      id = /\((.*)\)/.match(card['onclick'].to_s)
      next if id.nil?

      # Street text sits in the first cell of the card's second row.
      # NOTE(review): the character removed by the first gsub is presumably a
      # non-breaking space in the original file; confirm the byte survived.
      rua = card['tr'][1]['td'][0]['div']['font']['font']['font']['content'].gsub(/ /, '').gsub(/\n/, ' ')
      geo = YahooGeocoder.geocode("#{rua} - Londrina")
      imoveis << {
        :geo => [geo.lat, geo.lng],
        :link => "http://www.imobiliariasenador.com.br/pop_detalhes.php?cdimovel=#{id[1]}"
      }
    end
    CACHE["imobiliariasenador"] = imoveis
  end

  if callback.empty?
    content_type 'application/json'
    imoveis.to_json
  else
    content_type 'application/javascript'
    "#{callback}(#{imoveis.to_json})"
  end
end
# http://www.sub100.com.br
#
# GET /imobiliariaperez
#
# Responds with the agency's rental listings as a JSON array, or as JSONP when
# a +callback+ query parameter is given. Each entry has :geo ([lat, lng] from
# YahooGeocoder) and :link (the listing's detail page). Result pages are
# fetched one by one through YQL, and each listing's detail page is fetched to
# read its street. The list is stored in memcached under "imobiliariaperez".
get '/imobiliariaperez' do
  # The callback name is echoed into the response body, so accept only
  # identifier-like values instead of arbitrary request input.
  callback = params[:callback].to_s
  halt 400, 'invalid callback' unless callback.empty? || callback =~ /\A[\w.$]+\z/

  imoveis = CACHE["imobiliariaperez"]
  if imoveis.nil?
    imoveis = []
    pagina = 1
    loop do
      listing = YQL.search(
        :url => url_encode("http://www.sub100.com.br/empresas/imob/imobiliariaperez/mostra_resultado_rapido.php?b_negocio=Locacao&b_tipo=CASAS&b_cidade=57&b_bairro=TODOS&b_dormitorios=TODOS&b_valores=TODOS&PV=#{pagina}"),
        :xpath => url_encode('//div[@class="AT_lista"]'),
        :format => 'json'
      )
      results = listing && listing['query'] && listing['query']['results']
      divs = results && results['div']
      if divs.nil?
        # Nothing on the very first page means the scrape failed: report it
        # and cache nothing. On later pages it just means the previous page
        # was the last one.
        halt 502, 'listing source unavailable' if pagina == 1
        break
      end
      # NOTE(review): YQL presumably collapses a single match into a Hash
      # instead of a one-element Array; confirm.
      divs = [divs] if divs.is_a?(Hash)

      found = 0
      divs.each do |i|
        next unless i.is_a?(Hash) && i['div'].is_a?(Array)
        span = i['div'][1] && i['div'][1]['span']
        url = span && span['a'] && span['a']['href']
        next if url.nil?

        # Count every listing seen on this page, so an unreadable detail page
        # does not end pagination early.
        found += 1

        detail = YQL.search(
          :url => url_encode("http://www.sub100.com.br/empresas/imob/imobiliariaperez/#{url}"),
          :xpath => url_encode('//body'),
          :format => 'xml',
          :on_success => lambda do |response|
            # <br/> tags are stripped before parsing.
            LibXML::XML::Parser.string(response.body.gsub(/<br\/>/, '')).parse
          end
        )
        next if detail.nil?

        street_node = detail.find('//td[@class="clCell2"]/p')[0]
        next if street_node.nil?

        # NOTE(review): the character removed by gsub is presumably a
        # non-breaking space in the original file; confirm the byte survived.
        rua = street_node.first.to_s.gsub(/ /, '').strip
        geo = YahooGeocoder.geocode("#{rua} - Londrina")
        imoveis << {
          :geo => [geo.lat, geo.lng],
          :link => "http://www.sub100.com.br/empresas/imob/imobiliariaperez/#{url}"
        }
      end

      # Fewer than 15 listings on a page means there are no more pages.
      break if found < 15
      pagina += 1
    end
    CACHE["imobiliariaperez"] = imoveis
  end

  if callback.empty?
    content_type 'application/json'
    imoveis.to_json
  else
    content_type 'application/javascript'
    "#{callback}(#{imoveis.to_json})"
  end
end
# GET /
# Renders the :index ERB template.
get('/') { erb :index }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment