Created
November 20, 2014 18:26
-
-
Save spennyf/5ac05df5ca302cd76853 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # coding: utf-8 | |
| import pprint | |
| from lxml import etree | |
| import cgi | |
| from bs4 import BeautifulSoup | |
| import datetime | |
| import urllib2 | |
| import cookielib | |
| import re | |
| import string | |
# Destination directory for the generated XML file (trailing slash expected).
# Empty string means "write into the current working directory"; the commented
# value shows the production path used on the original host.
output_path = ""#"/home/spencerf/public_html/uvm/"
def Get_website_text(url):
    """Fetch *url* and return the raw response body as a string.

    The request goes through a cookie-aware urllib2 opener that follows
    redirects and presents a desktop-Chrome User-Agent, so the site serves
    the same markup it would serve a normal browser.
    """
    # Cookies are persisted via a Mozilla-format jar backed by this file.
    jar = cookielib.MozillaCookieJar('mfp.cookies')

    # Opener handling redirects, plain/SSL HTTP and cookie round-tripping.
    opener = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
        urllib2.HTTPSHandler(debuglevel=0),
        urllib2.HTTPCookieProcessor(jar),
    )

    # Masquerade as a browser rather than a Python script.
    opener.addheaders = [('User-agent',
                          ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) '
                           'AppleWebKit/535.1 (KHTML, like Gecko) '
                           'Chrome/13.0.782.13 Safari/535.1'))]

    # Opening the page both sets the initial cookies and yields the body.
    response = opener.open(url)
    body = response.read()
    response.close()
    return body
| #get union menus | |
def getUnionMenuUrls(soup):
    """Return ``[name, absolute_url]`` pairs for menu links that are current.

    Looks at links 1 and 2 inside the ``accordion_11669`` div; each link's
    text is expected to be a date range like ``m/d/Y - m/d/Y``.  A link is
    kept only when today's date falls inside that range.
    """
    base = "https://uvmdining.sodexomyway.com"
    now = datetime.datetime.today()
    # Only the second and third anchors in the accordion are dated menus.
    candidates = soup.findAll('div', {'id': 'accordion_11669'})[0]('a', href=True)[1:3]

    current = []
    for anchor in candidates:
        if ".htm" not in anchor['href']:
            continue
        # Normalise the link text down to a bare "m/d/Y-m/d/Y" string.
        label = str(anchor.text).replace('\n', '').replace("'", '').replace(' ', '')
        # Parse both endpoints of the advertised date range.
        bounds = [datetime.datetime.strptime(part, '%m/%d/%Y')
                  for part in label.split('-')]
        if bounds[0] <= now <= bounds[1]:
            current.append([label, base + anchor['href']])
    return current
def get_xml(url):
    """Download the menu page at *url* and render it as an XML string.

    The page is a table of days (``td.dayouter``), each containing meal
    headers (``td.mealname``), stations (``td.station``) and dishes
    (``td.menuitem``); the output nests the corresponding
    ``<menu>/<day>/<meal>/<counter>/<dish>`` elements, each dish holding
    an escaped ``<name>`` child.
    """
    tag_stack = []
    output_lines = []
    # NOTE(review): the original gist shows replace(' ', "") but the scrape
    # decoded HTML entities; this is reconstructed as stripping literal
    # '&nbsp;' sequences — confirm against the live page.
    html = urllib2.urlopen(url).read().replace('&nbsp;', "")
    # BUG FIX: the original called filter(...) and discarded its result,
    # so non-printable characters were never actually removed.  Assign the
    # filtered text back so the cleanup takes effect.
    html = ''.join(ch for ch in html if ch in string.printable)
    xml = etree.HTML(html)

    open_tag(tag_stack, output_lines, "menu", "")
    days = xml.xpath('//td[@class="dayouter"]')
    # Build the xml for each day in document order.
    for day in days:
        day_name = day.xpath('./a/@name')[0]
        safe_open_tag(tag_stack, output_lines, "day", "menu", day_name)
        dayinner_trs = day.xpath('.//table[@class="dayinner"]//tr')
        for dayinner_tr in dayinner_trs:
            # A mealname cell starts a new <meal> section.
            if (dayinner_tr.xpath('./td[@class="mealname"]')):
                meal_name = dayinner_tr.xpath('./td[@class="mealname"]/text()')[0]
                safe_open_tag(tag_stack, output_lines, "meal", "day", meal_name)
            # A station cell starts a new <counter> section.
            if (dayinner_tr.xpath('./td[@class="station"]/text()')):
                counter_name = dayinner_tr.xpath('./td[@class="station"]/text()')[0]
                safe_open_tag(tag_stack, output_lines, "counter", "meal", counter_name)
            # A menuitem cell adds one <dish> with its escaped name.
            if (dayinner_tr.xpath('./td[@class="menuitem"]')):
                item_name = "".join(dayinner_tr.xpath('./td[@class="menuitem"]/div//text()')).strip()
                safe_open_tag(tag_stack, output_lines, "dish", "counter", "")
                output_lines.append("<name>%s</name>" % cgi.escape(item_name))
    # Close every element still open, back down to the root.
    close_tags(tag_stack, output_lines, "")
    output_string = '\n'.join([line.encode('utf-8') for line in output_lines])
    return output_string
| # close the tags up to the parent of last tag in tag_stack | |
def close_tags(tag_stack, output_lines, parent_tag):
    """Pop the stack, emitting an indented closing tag for each element,
    until *parent_tag* is on top (or the stack is exhausted)."""
    while tag_stack:
        if tag_stack[-1] == parent_tag:
            break
        closed = tag_stack.pop()
        # Indentation mirrors the remaining nesting depth.
        output_lines.append('%s</%s>' % (' ' * len(tag_stack), closed))
| # open the new_tag using the suitable style based on name_property | |
def open_tag(tag_stack, output_lines, new_tag, name_property):
    """Emit an indented opening tag for *new_tag* and push it on the stack.

    When *name_property* is truthy the tag carries a ``name="..."``
    attribute; otherwise a bare ``<tag>`` is written.
    """
    indent = ' ' * len(tag_stack)
    attr = ' name="%s"' % name_property if name_property else ''
    output_lines.append('%s<%s%s>' % (indent, new_tag, attr))
    tag_stack.append(new_tag)
| # check if the new_tag parent is in the stack, if not it'll add the parent | |
def safe_open_tag(tag_stack, output_lines, new_tag, parent_tag, name_property):
    """Open *new_tag* directly underneath *parent_tag*.

    If *parent_tag* is already somewhere on the stack, everything nested
    deeper than it is closed first; otherwise a bare ``<parent_tag>`` is
    opened implicitly.  Finally *new_tag* is opened via ``open_tag``.
    """
    if parent_tag in tag_stack:
        # Parent already open — unwind back to it.
        close_tags(tag_stack, output_lines, parent_tag)
    else:
        # Parent missing — open it without a name attribute.
        output_lines.append('%s<%s>' % (' ' * len(tag_stack), parent_tag))
        tag_stack.append(parent_tag)
    open_tag(tag_stack, output_lines, new_tag, name_property)
| # sample use of get_xml function | |
| # In[17]: | |
if __name__ == "__main__":
    # Landing page that links to the dated weekly/monthly menu pages.
    base_url_u = "https://uvmdining.sodexomyway.com/dining-choices/resident/index.html"
    htmltext_u = Get_website_text(base_url_u)
    soup_u = BeautifulSoup(htmltext_u)
    menu_url_list = getUnionMenuUrls(soup_u)

    # Build the output filename from the first menu's date-range label,
    # then strip digits, slashes and dashes so only static text remains.
    # (Replaces the original 13-call replace() chain, which also applied
    # .replace('7','') twice — one regex substitution is equivalent.)
    ofname = str(menu_url_list[0][0]) + "uvm_cook_commons_menu" + ".xml"
    ofname = re.sub(r'[0-9/-]', '', ofname)
    output_file = output_path + ofname

    # BUG FIX: the original open(...).write(...) never closed the handle;
    # the context manager guarantees the file is flushed and closed.
    with open(output_file, "w") as out:
        out.write(get_xml(menu_url_list[0][1]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment