import re
import csv
# select file to parse
list_directory = '/Users/ali/Documents/Roadtrippers/Hackday/May2021/'
csv_name = 'hackday_websites_to_parse.csv'
csv_path = list_directory + csv_name
#create list for imported data
l = []
# import the file into a nested list
with open(csv_path, 'rb') as csvfile:
    file_reader = csv.reader(csvfile)
    for row in file_reader:
        l.append(row)
#flatten nested list
websites = [item[2] for item in l]
#create list for found urls
found_urls = []
#parse urls with some regex found on the internet
for i in websites:
    # replace the %20s with spaces to reduce false positives
    found_urls.append(re.findall('(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+', re.sub('%20', ' ', re.sub('%0A', ' ', i.lower()))))
#flatten nested list
all_urls = [item for sublist in found_urls for item in sublist]
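# a concrete (made-up) example of what the pattern above pulls out of one cell:
# re.findall('(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-&?=%.]+',
#            'visit www.examplecampground.com or http://rec.gov/camping today')
# returns ['www.examplecampground.com', 'http://rec.gov/camping']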
#get rid of some low-hanging fruit
# too short
def is_too_short(url):
    #7 because don't want to miss out on stuff like rec.gov
    #if the url is truly shorter than that, it's probably a large organization (i.e. no hidden gems) or non-US
    return len(url) < 7
#test cases
# is_too_short(all_urls[100])
# is_too_short(all_urls[0])
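# a concrete (made-up) pair for reference: is_too_short('rec.gov') is False (length 7 passes),
# while is_too_short('koa.us') is True and would be filtered out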
# no letters
#they might be real websites, but we don't want to waste human time trying to figure out whether 123.123 has any camping relevance
def has_no_letters(url):
    return len(re.findall('[a-zA-Z]', url)) == 0
#test cases
# has_no_letters(all_urls[100])
# has_no_letters(all_urls[0])
# has_useless_domain
# facebook is a terrible source, and we don't need to check out stuff that's already on RT, etc.
useless_domains = ['facebook\.com', 'paypal\.com', 'gmail\.com', 'youtube\.com', 'google\.com', 'roadtrippers\.com',
                   'paypalobjects\.com', 'constantcontact\.com', 'shutterstock\.com', 'twitter\.com', 'vrbo\.com',
                   'myspace\.com', 'linkedin\.com', 'goodreads\.com', 'etsy\.com', 'amazon\.com', 'ebay\.com', 'ihg\.com',
                   'instagram\.com', 'express\.com', 'pinterest\.com', 'manta\.com', 'goarm\.com', 'army\.mil',
                   'flickr\.com', 'fs\.usda\.gov', 'localedge\.com', 'local\.mysanantonio\.com']
def has_useless_domain(url):
    useless_domain_detected = False
    for i in useless_domains:
        if len(re.findall(i, url)) > 0:
            useless_domain_detected = True
    return useless_domain_detected
#test cases
# has_useless_domain(all_urls[175])
# has_useless_domain(all_urls[0])
# other filters to add later
#def has_bad_starting_character(url):
#starts with a negative
#starts with a number
#
#def has_erroneous_punctuation(url):
#2 dots in a row
#def is_probably_an_image(url):
#.gif, .jpeg
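# a rough sketch of those three filters, under my own assumptions about what counts as bad
# (leading '-' or digit, doubled dots, a handful of image extensions) -- not wired into
# bad_url() below, just illustrative:
def has_bad_starting_character(url):
    return bool(re.match('[-0-9]', url))
def has_erroneous_punctuation(url):
    return len(re.findall('\.\.', url)) > 0
def is_probably_an_image(url):
    return bool(re.search('\.(gif|jpe?g|png)$', url))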
def bad_url(url):
    return is_too_short(url) or has_no_letters(url) or has_useless_domain(url)
#test cases
# bad_url(all_urls[0])
# bad_url(all_urls[1])
# bad_url(all_urls[175])
# bad_url(all_urls[199])
#create list for filtered urls
filtered_urls = []
for i in all_urls:
    if not bad_url(i):
        filtered_urls.append(i)
#dedupe
urls_to_test = list(dict.fromkeys(filtered_urls))
#this narrowed it down by ~43% --> actually more after domain pruning
#see if these places are in RT
# theoretically, many of them should be in RT if the website was at all accurate... but the whole point of this is that we do not think they're all accurate
# additionally, many of these urls were just one of many urls listed in a single place's website attribute
# if the url/slug has a match in Places, we should then check whether the id returned matches the id or ids from the list of place ids
# if yes, offer it up as a potential new url
# if no, this is a potential duplicated listing
# if the url/slug does not have a match in Places, it could be a hidden gem
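# a sketch of that decision, assuming we had also kept each row's place id(s) in a
# hypothetical known_place_ids dict (url -> list of ids) and the canonical_place_id
# returned by the autocomplete call further down -- neither exists in this script yet:
def classify_match(url, canonical_place_id, known_place_ids):
    if canonical_place_id is None:
        return 'possible hidden gem'      # no match in Places
    if canonical_place_id in known_place_ids.get(url, []):
        return 'potential new url'        # matches an id we already track for this place
    return 'potential duplicate listing'  # matched a different Places record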
#generate slugs
#clean up urls
def remove_leading_http(url):
    return re.sub('^https*', '', url)
def remove_leading_colon(url):
    return re.sub('^:', '', url)
def remove_leading_slashes(url):
    return re.sub('^/+', '', url)
def remove_leading_www(url):
    return re.sub('^www\.', '', url)
def remove_prefixes(url):
    return remove_leading_www(remove_leading_slashes(remove_leading_colon(remove_leading_http(url))))
def find_domain_and_tld(url):
    return re.sub('/.*', '', url)
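# for a concrete (made-up) url: remove_prefixes('https://www.examplecampground.com/rates')
# returns 'examplecampground.com/rates', and find_domain_and_tld() on that returns
# 'examplecampground.com'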
#find most common domains
domains = []
for i in urls_to_test:
    domains.append(find_domain_and_tld(remove_prefixes(i)))
import collections
counter = collections.Counter(domains)
print(counter.most_common())
#these are common enough that they probably have a specific url formula that's worth processing
slug_domains = ['foursquare.com', 'yelp.com', 'dps-siteplatform.com', 'urbanspoon.com', 'tripadvisor.com', 'local.yahoo.com']
# collect the urls of these domains to figure out which ones may need special treatment
domain_urls = []
for i in urls_to_test:
    if find_domain_and_tld(remove_prefixes(i)) in slug_domains:
        domain_urls.append(remove_prefixes(i))
# come up with the special treatment
def foursquare_slug(url_noprefix):
    if re.search('foursquare\.com/v/', url_noprefix):
        return re.sub('/.*', '', re.sub('foursquare\.com/v/', '', url_noprefix))
def yelp_slug(url_noprefix):
    if re.search('yelp\.com/biz/', url_noprefix):
        return re.sub('yelp\.com/biz/', '', url_noprefix)
def dps_slug(url_noprefix):
    if re.search('dps-siteplatform\.com/', url_noprefix):
        return re.sub('-[0-9]+$', '', re.sub('dps-siteplatform\.com/', '', url_noprefix))
def urbanspoon_slug(url_noprefix):
    if re.search('urbanspoon\.com/r/.+/restaurant/', url_noprefix):
        return re.sub('/', '-', re.sub('urbanspoon\.com/r/.+/restaurant/', '', url_noprefix))
def tripadvisor_slug(url_noprefix):
    if re.search('tripadvisor\.com/.+-reviews-', url_noprefix):
        return re.sub('\.html.*$', '', re.sub('tripadvisor\.com/.+-reviews-', '', url_noprefix))
def yahoo_slug(url_noprefix):
    if re.search('local\.yahoo\.com/info-[0-9]+-', url_noprefix):
        return re.sub('local\.yahoo\.com/info-[0-9]+-', '', url_noprefix)
def generic_slug(url_noprefix):
    return re.sub('\.[a-zA-Z]+$', '', url_noprefix)
def url_to_slug(url):
    domain = find_domain_and_tld(remove_prefixes(url))
    slug = None
    if domain in slug_domains:
        if domain == 'foursquare.com':
            slug = foursquare_slug(remove_prefixes(url))
        elif domain == 'yelp.com':
            slug = yelp_slug(remove_prefixes(url))
        elif domain == 'dps-siteplatform.com':
            slug = dps_slug(remove_prefixes(url))
        elif domain == 'urbanspoon.com':
            slug = urbanspoon_slug(remove_prefixes(url))
        elif domain == 'tripadvisor.com':
            slug = tripadvisor_slug(remove_prefixes(url))
        elif domain == 'local.yahoo.com':
            slug = yahoo_slug(remove_prefixes(url))
    else:
        slug = generic_slug(domain)
    if slug:
        slug = re.sub('/', '-', re.sub('\.', '-', slug))
    return slug
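# two concrete (made-up) urls to show both branches:
# url_to_slug('http://www.yelp.com/biz/shady-pines-campground-anytown') -> 'shady-pines-campground-anytown'
# url_to_slug('www.examplecampground.com') -> 'examplecampground'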
#select directory for autocomplete results
autocomplete_directory = list_directory + 'autocomplete/'
import requests
import json
import hmac
import hashlib
import httplib
# initialize a list for api keys
s = []
# grab those keys
with open('/Users/ali/.pythonsecrets', 'rb') as secrets:
    secretreader = csv.reader(secrets)
    for row in secrets:
        s.append(row.replace('\n', ''))
# create parameters to be used to connect
# keys
clientId = s[0]
secretKey = s[1]
def autocomplete(slug):
    # connection
    conn = httplib.HTTPSConnection('api.boone.ai')
    #
    # endpoint
    endpoint_start = '/api/v2/autocomplete?input='
    endpoint_end = '&locations=country:US|country:CA'
    endpoint = endpoint_start + slug + endpoint_end
    #
    signature = hmac.new(secretKey, format(endpoint), hashlib.sha256).hexdigest()
    headers = { "Content-Type" : "application/json",
                "RT-ORG-APP-CLIENT-ID" : clientId,
                "RT-ORG-APP-HMAC": signature
              }
    conn.request("GET", endpoint, None, headers)
    response = conn.getresponse()
    return response
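# requests is already imported above, so the same call could be made without httplib --
# a sketch only, assuming api.boone.ai accepts identical headers on a plain GET:
def autocomplete_via_requests(slug):
    endpoint = '/api/v2/autocomplete?input=' + slug + '&locations=country:US|country:CA'
    signature = hmac.new(secretKey, endpoint, hashlib.sha256).hexdigest()
    headers = {"Content-Type": "application/json",
               "RT-ORG-APP-CLIENT-ID": clientId,
               "RT-ORG-APP-HMAC": signature}
    # note: a requests Response exposes .status_code and .json(), not .status/.read() like the httplib version
    return requests.get('https://api.boone.ai' + endpoint, headers=headers)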
summary_file = list_directory + 'summary.csv'
outfile = open(summary_file, 'w')
csvwriter = csv.writer(outfile)
header = ['url', 'slug', 'canonical_place_id']
csvwriter.writerow(header)
n=1
autocomplete_successes = []
autocomplete_failures = []
for i in urls_to_test:
    print(i)
    slug = url_to_slug(i)
    line = []
    if slug:
        response = autocomplete(slug)
        if response.status == 200:
            response = json.loads(response.read())
            if response["data"] != []:
                line.append(i)
                line.append(slug)
                line.append(response["data"][0]["properties"]["id"]) #canonical_place_id
                autocomplete_successes.append(i)
            else:
                autocomplete_failures.append(i)
    csvwriter.writerow(line)
outfile.close()
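# the success/failure lists above are collected but never reported; a small summary at the
# end (my own addition, not part of the original output) makes reruns easier to compare
print('autocomplete matches: ' + str(len(autocomplete_successes)))
print('autocomplete misses: ' + str(len(autocomplete_failures)))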