@utsavsabharwal
Created September 4, 2012 08:57
Fetch top 10 URLs on Google for a given list of UPCs
#!/usr/bin/env python
"""crawler.py: Fetch top 10 URLs on Google for a given list of UPCs"""

__author__ = "Utsav Sabharwal"

import sys
import zlib
import time

import pycurl
from BeautifulSoup import BeautifulSoup

# Pair each UPC with its Google search URL, ":::"-delimited, so both
# survive the trip through the work queue together.
urls = []
for upc in open("upc").readlines():
    upc = upc.strip()
    urls.append(upc + ":::https://www.google.co.in/search?num=100&hl=en"
                "&authuser=0&site=webhp&source=hp&q=" + upc + "&oq=" + upc)


class Test:
    """Accumulates the response body that pycurl delivers in chunks."""

    def __init__(self):
        self.contents = ''

    def body_callback(self, buf):
        self.contents = self.contents + buf


print >>sys.stderr, 'Testing', pycurl.version

output = open("output", "a+")
while len(urls) > 0:
    t = Test()
    c = pycurl.Curl()
    upc, uri = urls.pop().split(":::")
    print "Crawling ", uri
    c.setopt(c.URL, uri)
    # Browser-like headers so Google serves a regular results page.
    headers = ['User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0 FirePHP/0.7.1',
               'Host: www.google.co.in',
               'Connection: keep-alive',
               'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Accept-Encoding: gzip, deflate',
               'Accept-Language: en-us,en;q=0.5',
               'Cookie: PREF=ID=a0490047eebbd9f3:U=56c91a5aeeaef764:FF=0:TM=1346742582:LM=1346742602:S=vShc5R0ehiRpOCyL; NID=63=0dgPknGem3gJCQMkYcRwnd-jSPN3Tf2Y4E3ZBqccaSY7s2A2wWbUii1vC_M3EMluLu9gif8yBFq0FtdvWg9zQY780enWl9mEgFmnaIY4rWSOIFYtiQGISmgzyh_mUUyM',
               'x-insight: activate']
    c.setopt(pycurl.HTTPHEADER, headers)
    c.setopt(c.WRITEFUNCTION, t.body_callback)
    c.perform()
    c.close()
    # The Accept-Encoding header above requests gzip; 16 + MAX_WBITS
    # tells zlib to expect a gzip wrapper around the deflate stream.
    t.contents = zlib.decompress(t.contents, 16 + zlib.MAX_WBITS)
    print "--> New URLs Found:"
    soup = BeautifulSoup(t.contents)
    count = 0
    # Organic results live in <h3 class="r"> elements on the 2012-era
    # results page; the first <a> inside each carries the result URL.
    for data in soup.findAll('h3', attrs={'class': 'r'}):
        print "Count", count
        count += 1
        links = data.findAll('a', attrs={})
        if len(links) > 0:
            output_url = links[0]['href']
            print output_url
            output.write(upc + ":::" + output_url + chr(10))
            output.flush()
    time.sleep(2)  # throttle so consecutive queries do not trip rate limits
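For reference, a minimal usage sketch, assuming Python 2 with pycurl and BeautifulSoup 3 installed: the script reads one UPC per line from a file named "upc" in the working directory and appends UPC:::URL pairs to a file named "output". The UPC values and the result URL below are made-up placeholders, not real output.

$ cat upc
036000291452
012345678905

$ python crawler.py
Crawling  https://www.google.co.in/search?num=100&hl=en&authuser=0&site=webhp&source=hp&q=036000291452&oq=036000291452
--> New URLs Found:
...

$ head -1 output
036000291452:::http://example.com/product/036000291452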