-
-
Save tecknoh19/10767105 to your computer and use it in GitHub Desktop.
| # URL Harvester written by Andy Bricker | |
| # http://andybricker.com | |
| # andy at andybricker.com | |
| # Requirements | |
# Python 2.7 (Has not been tested on later versions)
| # Beautiful Soup library for Python (http://www.crummy.com/software/BeautifulSoup/) | |
| # Usage: | |
| # python urlHarvest.py books stores -n 50 -l myLogFile.txt | |
| # Google Dorks are supported | |
| # python urlHarvest.py inurl:.com.eu/foobar.php intext:I like computers -n 50 -l /home/me/logs/myLogFile.txt | |
# Script will crawl Google, collecting the specified number of results for a given search. The script will then
# build a URL array while preventing duplicate entries. Finally, a line-by-line logfile is generated containing
# the results.
| # Like the script? Donate | |
| # LiteCoin: LcFU5upJyS7FsEeB5sb25vFTS69dH6fugr | |
| # DogeCoin: D7SPH1LYJn9Co4GCZePH3JvzR5RkZEPi5M | |
# Command-line interface for the harvester: one positional search term
# plus result-count and logfile options, parsed later by main().
from optparse import OptionParser

options = OptionParser(
    usage='%prog search [options]',
    description='Python URL Harvester by Andy Bricker. http://AndyBricker.Com',
)
options.add_option(
    '-n', '--number',
    type='int',
    default=5,
    help='Number of search results to parse (default: 5)',
)
options.add_option(
    '-l', '--log_file',
    type='string',
    default='urlHarvest.txt',
    help='Name of the output logfile. Paths accepted. (default: urlHarvest.txt)',
)
def addLog(target, opts):
    """Append *target* as one line to the logfile.

    target -- string to record (one harvested domain per call).
    opts   -- parsed option object; only opts.log_file (a path) is read.
    """
    # 'with' guarantees the handle is closed even if write() raises,
    # fixing the leak in the original open()/close() pair.
    with open(opts.log_file, "a") as log_file:
        log_file.write(target + '\n')
def main():
    """Entry point: parse arguments, search Google, log unique result domains.

    Expects at least one positional search term (Google dorks accepted);
    prints usage help and returns otherwise.  Writes one domain per line
    to opts.log_file via addLog().
    """
    # Single-argument print() calls work identically on Python 2 and 3,
    # unlike the original's Python-2-only print statements.
    print("")
    print("=======================================================")
    print("Checking arguments.")
    opts, args = options.parse_args()
    if len(args) < 1:
        options.print_help()
        return  # nothing to search for

    print("Beginning Google Search of " + str(opts.number) + " records. Please be patient.")

    # Third-party 'google' package (pip install google) provides search().
    from google import search
    # Hoisted out of the loop: the original re-executed this import on
    # every iteration.  Try the Python 2 module name first, then the
    # Python 3 location.
    try:
        from urlparse import urlparse
    except ImportError:
        from urllib.parse import urlparse

    # A set deduplicates domains as they are collected, replacing the
    # original list-then-set() filtering pass.
    domains = set()
    for url in search(args[0], stop=opts.number):
        parsed_uri = urlparse(url)
        domains.add('{uri.netloc}'.format(uri=parsed_uri))

    print("Search Complete, filtering results.")
    print("Building log file.")
    # sorted() makes the logfile order deterministic.
    for target in sorted(domains):
        addLog(target, opts)
    print("Harvest complete. Log data written to " + opts.log_file)
    print("")
    print("=======================================================")


if __name__ == '__main__':
    main()
@demogorgonz
pip install google
Checking arguments.
Beginning Google Search of 500 records. Please be patient.
Traceback (most recent call last):
File "urlHarvest.py", line 69, in
main()
File "urlHarvest.py", line 51, in main
for url in search(args[0], stop=opts.number):
File "/usr/local/lib/python2.7/dist-packages/google/__init__.py", line 269, in search
html = get_page(url)
File "/usr/local/lib/python2.7/dist-packages/google/__init__.py", line 89, in get_page
response = urlopen(request)
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 442, in error
result = self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 629, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 503: Service Unavailable
Checking arguments.
Beginning Google Search of 50 records. Please be patient.
Traceback (most recent call last):
File "urlHarvest.py", line 69, in
main()
File "urlHarvest.py", line 50, in main
from google import search
ImportError: No module named google