import re
import csv
# select file to parse
list_directory = '/Users/ali/Documents/Roadtrippers/Hackday/May2021/'
csv_name = 'hackday_websites_to_parse.csv'
csv_path = list_directory + csv_name
#create list for imported data
l = []
# import the file into a nested list
with open(csv_path, 'rb') as csvfile:
    file_reader = csv.reader(csvfile)
    for row in file_reader:
        l.append(row)
#flatten nested list
websites = [item[2] for item in l]
#create list for found urls
found_urls = []
#parse urls with some regex found on the internet
for i in websites:
    # replace the %20s with spaces to reduce false positives
    found_urls.append(re.findall('(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+', re.sub('%20', ' ', re.sub('%0A', ' ', i.lower()))))
#flatten nested list
all_urls = [item for sublist in found_urls for item in sublist]
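# a concrete (made-up) example of what the pattern above pulls out of one cell:
# re.findall('(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+',
#            'visit www.examplecampground.com or http://rec.gov/camping today')
# returns ['www.examplecampground.com', 'http://rec.gov/camping']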
#get rid of some low-hanging fruit
# too short
def is_too_short(url):
    #7 because don't want to miss out on stuff like rec.gov
    #if the url is truly shorter than that, it's probably a large organization (i.e. no hidden gems) or non-US
    return len(url) < 7
#test cases
# is_too_short(all_urls[100])
# is_too_short(all_urls[0])
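# a concrete (made-up) pair for reference: is_too_short('rec.gov') is False (length 7 passes),
# while is_too_short('koa.us') is True and would be filtered out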
# no letters
#they might be real websites, but we don't want to waste human time trying to figure out whether 123.123 has any camping relevance
def has_no_letters(url):
    return len(re.findall('[a-zA-Z]', url)) == 0
#test cases
# has_no_letters(all_urls[100])
# has_no_letters(all_urls[0])
# has_useless_domain
# facebook is a terrible source, and we don't need to check out stuff that's already on RT, etc.
useless_domains = ['facebook\.com', 'paypal\.com', 'gmail\.com', 'youtube\.com', 'google\.com', 'roadtrippers\.com',
                   'paypalobjects\.com', 'constantcontact\.com', 'shutterstock\.com', 'twitter\.com', 'vrbo\.com',
                   'myspace\.com', 'linkedin\.com', 'goodreads\.com', 'etsy\.com', 'amazon\.com', 'ebay\.com', 'ihg\.com',
                   'instagram\.com', 'express\.com', 'pinterest\.com', 'manta\.com', 'goarm\.com', 'army\.mil',
                   'flickr\.com', 'fs\.usda\.gov', 'localedge\.com', 'local\.mysanantonio\.com']
def has_useless_domain(url):
    useless_domain_detected = False
    for i in useless_domains:
        if len(re.findall(i, url)) > 0:
            useless_domain_detected = True
    return useless_domain_detected
#test cases
# has_useless_domain(all_urls[175])
# has_useless_domain(all_urls[0])
# other filters to add later
#def has_bad_starting_character(url):
#starts with a negative
#starts with a number
#
#def has_erroneous_punctuation(url):
#2 dots in a row
#def is_probably_an_image(url):
#.gif, .jpeg
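# a rough sketch of those three filters, under my own assumptions about what counts as bad
# (leading '-' or digit, doubled dots, a handful of image extensions) -- not wired into
# bad_url() below, just illustrative:
def has_bad_starting_character(url):
    return bool(re.match('[-0-9]', url))
def has_erroneous_punctuation(url):
    return len(re.findall('\.\.', url)) > 0
def is_probably_an_image(url):
    return bool(re.search('\.(gif|jpe?g|png)$', url))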
def bad_url(url):
    return is_too_short(url) or has_no_letters(url) or has_useless_domain(url)
#test cases
# bad_url(all_urls[0])
# bad_url(all_urls[1])
# bad_url(all_urls[175])
# bad_url(all_urls[199])
#create list for filtered urls
filtered_urls = []
for i in all_urls:
    if not bad_url(i):
        filtered_urls.append(i)
#dedupe
urls_to_test = list(dict.fromkeys(filtered_urls))
#this narrowed it down by ~43% --> actually more after domain pruning
#see if these places are in RT
# theoretically, many of them should be in RT if the website was at all accurate... but the whole point of this is that we do not think they're all accurate
# additionally, many of these urls were just one of many urls listed in a single place's website attribute
# if the url/slug has a match in Places, we should then check whether the id returned matches the id or ids from the list of place ids
# if yes, offer it up as a potential new url
# if no, this is a potential duplicated listing
# if the url/slug does not have a match in Places, it could be a hidden gem
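# a sketch of that decision, assuming we had also kept each row's place id(s) in a
# hypothetical known_place_ids dict (url -> list of ids) and the canonical_place_id
# returned by the autocomplete call further down -- neither exists in this script yet:
def classify_match(url, canonical_place_id, known_place_ids):
    if canonical_place_id is None:
        return 'possible hidden gem'      # no match in Places
    if canonical_place_id in known_place_ids.get(url, []):
        return 'potential new url'        # matches an id we already track for this place
    return 'potential duplicate listing'  # matched a different Places record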
#generate slugs
#clean up urls
def remove_leading_http(url):
    return re.sub('^https*', '', url)
def remove_leading_colon(url):
    return re.sub('^:', '', url)
def remove_leading_slashes(url):
    return re.sub('^/+', '', url)
def remove_leading_www(url):
    return re.sub('^www\.', '', url)
def remove_prefixes(url):
    return remove_leading_www(remove_leading_slashes(remove_leading_colon(remove_leading_http(url))))
def find_domain_and_tld(url):
    return re.sub('/.*', '', url)
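# for a concrete (made-up) url: remove_prefixes('https://www.examplecampground.com/rates')
# returns 'examplecampground.com/rates', and find_domain_and_tld() on that returns
# 'examplecampground.com'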
#find most common domains
domains = []
for i in urls_to_test:
    domains.append(find_domain_and_tld(remove_prefixes(i)))
import collections
counter = collections.Counter(domains)
print(counter.most_common())
#these are common enough that they probably have a specific url formula that's worth processing
slug_domains = ['foursquare.com', 'yelp.com', 'dps-siteplatform.com', 'urbanspoon.com', 'tripadvisor.com', 'local.yahoo.com']
# collect the urls of these domains to figure out which ones may need special treatment
domain_urls = []
for i in urls_to_test:
    if find_domain_and_tld(remove_prefixes(i)) in slug_domains:
        domain_urls.append(remove_prefixes(i))
# come up with the special treatment
def foursquare_slug(url_noprefix):
    if re.search('foursquare\.com/v/', url_noprefix):
        return re.sub('/.*', '', re.sub('foursquare\.com/v/', '', url_noprefix))
def yelp_slug(url_noprefix):
    if re.search('yelp\.com/biz/', url_noprefix):
        return re.sub('yelp\.com/biz/', '', url_noprefix)
def dps_slug(url_noprefix):
    if re.search('dps-siteplatform\.com/', url_noprefix):
        return re.sub('-[0-9]+$', '', re.sub('dps-siteplatform\.com/', '', url_noprefix))
def urbanspoon_slug(url_noprefix):
    if re.search('urbanspoon\.com/r/.+/restaurant/', url_noprefix):
        return re.sub('/', '-', re.sub('urbanspoon\.com/r/.+/restaurant/', '', url_noprefix))
def tripadvisor_slug(url_noprefix):
    if re.search('tripadvisor\.com/.+-reviews-', url_noprefix):
        return re.sub('\.html.*$', '', re.sub('tripadvisor\.com/.+-reviews-', '', url_noprefix))
def yahoo_slug(url_noprefix):
    if re.search('local\.yahoo\.com/info-[0-9]+-', url_noprefix):
        return re.sub('local\.yahoo\.com/info-[0-9]+-', '', url_noprefix)
def generic_slug(url_noprefix):
    return re.sub('\.[a-zA-Z]+$', '', url_noprefix)
def url_to_slug(url):
    domain = find_domain_and_tld(remove_prefixes(url))
    slug = None
    if domain in slug_domains:
        if domain == 'foursquare.com':
            slug = foursquare_slug(remove_prefixes(url))
        elif domain == 'yelp.com':
            slug = yelp_slug(remove_prefixes(url))
        elif domain == 'dps-siteplatform.com':
            slug = dps_slug(remove_prefixes(url))
        elif domain == 'urbanspoon.com':
            slug = urbanspoon_slug(remove_prefixes(url))
        elif domain == 'tripadvisor.com':
            slug = tripadvisor_slug(remove_prefixes(url))
        elif domain == 'local.yahoo.com':
            slug = yahoo_slug(remove_prefixes(url))
    else:
        slug = generic_slug(domain)
    if slug:
        slug = re.sub('/', '-', re.sub('\.', '-', slug))
    return slug
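# two concrete (made-up) urls to show both branches:
# url_to_slug('http://www.yelp.com/biz/shady-pines-campground-anytown') -> 'shady-pines-campground-anytown'
# url_to_slug('www.examplecampground.com') -> 'examplecampground'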
#select directory for autocomplete results
autocomplete_directory = list_directory + 'autocomplete/'
import requests
import json
import hmac
import hashlib
import httplib
# initialize a list for api keys
s = []
# grab those keys
with open('/Users/ali/.pythonsecrets', 'rb') as secrets:
    secretreader = csv.reader(secrets)
    for row in secrets:
        s.append(row.replace('\n', ''))
# create parameters to be used to connect
# keys
clientId = s[0]
secretKey = s[1]
def autocomplete(slug):
    # connection
    conn = httplib.HTTPSConnection('api.boone.ai')
    #
    # endpoint
    endpoint_start = '/api/v2/autocomplete?input='
    endpoint_end = '&locations=country:US|country:CA'
    endpoint = endpoint_start + slug + endpoint_end
    #
    signature = hmac.new(secretKey, format(endpoint), hashlib.sha256).hexdigest()
    headers = { "Content-Type" : "application/json",
                "RT-ORG-APP-CLIENT-ID" : clientId,
                "RT-ORG-APP-HMAC": signature
              }
    conn.request("GET", endpoint, None, headers)
    response = conn.getresponse()
    return response
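# requests is already imported above, so the same call could be made without httplib --
# a sketch only, assuming api.boone.ai accepts identical headers on a plain GET:
def autocomplete_via_requests(slug):
    endpoint = '/api/v2/autocomplete?input=' + slug + '&locations=country:US|country:CA'
    signature = hmac.new(secretKey, endpoint, hashlib.sha256).hexdigest()
    headers = {"Content-Type": "application/json",
               "RT-ORG-APP-CLIENT-ID": clientId,
               "RT-ORG-APP-HMAC": signature}
    # note: a requests Response exposes .status_code and .json(), not .status/.read() like the httplib version
    return requests.get('https://api.boone.ai' + endpoint, headers=headers)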
summary_file = list_directory + 'summary.csv'
outfile = open(summary_file, 'w')
csvwriter = csv.writer(outfile)
header = ['url', 'slug', 'canonical_place_id']
csvwriter.writerow(header)
n=1
autocomplete_successes = []
autocomplete_failures = []
for i in urls_to_test:
    print(i)
    slug = url_to_slug(i)
    line = []
    if slug:
        response = autocomplete(slug)
        if response.status == 200:
            response = json.loads(response.read())
            if response["data"] != []:
                line.append(i)
                line.append(slug)
                line.append(response["data"][0]["properties"]["id"]) #canonical_place_id
                autocomplete_successes.append(i)
            else:
                autocomplete_failures.append(i)
    csvwriter.writerow(line)
outfile.close()
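# the success/failure lists above are collected but never reported; a small summary at the
# end (my own addition, not part of the original output) makes reruns easier to compare
print('autocomplete matches: ' + str(len(autocomplete_successes)))
print('autocomplete misses: ' + str(len(autocomplete_failures)))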