Created
November 20, 2014 18:26
-
-
Save spennyf/5ac05df5ca302cd76853 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # coding: utf-8 | |
| import pprint | |
| from lxml import etree | |
| import cgi | |
| from bs4 import BeautifulSoup | |
| import datetime | |
| import urllib2 | |
| import cookielib | |
| import re | |
| import string | |
# Destination directory for the generated XML file (trailing slash expected).
# Empty string means "write into the current working directory"; the commented
# value shows the production path used on the original host.
output_path = ""#"/home/spencerf/public_html/uvm/"
def Get_website_text(url):
    """Fetch *url* and return the raw response body as a string.

    The request goes through a cookie-aware urllib2 opener that follows
    redirects and presents a desktop-Chrome User-Agent, so the site serves
    the same markup it would serve a normal browser.
    """
    # Cookies are persisted via a Mozilla-format jar backed by this file.
    jar = cookielib.MozillaCookieJar('mfp.cookies')

    # Opener handling redirects, plain/SSL HTTP and cookie round-tripping.
    opener = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
        urllib2.HTTPSHandler(debuglevel=0),
        urllib2.HTTPCookieProcessor(jar),
    )

    # Masquerade as a browser rather than a Python script.
    opener.addheaders = [('User-agent',
                          ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) '
                           'AppleWebKit/535.1 (KHTML, like Gecko) '
                           'Chrome/13.0.782.13 Safari/535.1'))]

    # Opening the page both sets the initial cookies and yields the body.
    response = opener.open(url)
    body = response.read()
    response.close()
    return body
| #get union menus | |
def getUnionMenuUrls(soup):
    """Return ``[name, absolute_url]`` pairs for menu links that are current.

    Looks at links 1 and 2 inside the ``accordion_11669`` div; each link's
    text is expected to be a date range like ``m/d/Y - m/d/Y``.  A link is
    kept only when today's date falls inside that range.
    """
    base = "https://uvmdining.sodexomyway.com"
    now = datetime.datetime.today()
    # Only the second and third anchors in the accordion are dated menus.
    candidates = soup.findAll('div', {'id': 'accordion_11669'})[0]('a', href=True)[1:3]

    current = []
    for anchor in candidates:
        if ".htm" not in anchor['href']:
            continue
        # Normalise the link text down to a bare "m/d/Y-m/d/Y" string.
        label = str(anchor.text).replace('\n', '').replace("'", '').replace(' ', '')
        # Parse both endpoints of the advertised date range.
        bounds = [datetime.datetime.strptime(part, '%m/%d/%Y')
                  for part in label.split('-')]
        if bounds[0] <= now <= bounds[1]:
            current.append([label, base + anchor['href']])
    return current
def get_xml(url):
    """Download the menu page at *url* and render it as an XML string.

    The page is a table of days (``td.dayouter``), each containing meal
    headers (``td.mealname``), stations (``td.station``) and dishes
    (``td.menuitem``); the output nests the corresponding
    ``<menu>/<day>/<meal>/<counter>/<dish>`` elements, each dish holding
    an escaped ``<name>`` child.
    """
    tag_stack = []
    output_lines = []
    # NOTE(review): the original gist shows replace(' ', "") but the scrape
    # decoded HTML entities; this is reconstructed as stripping literal
    # '&nbsp;' sequences — confirm against the live page.
    html = urllib2.urlopen(url).read().replace('&nbsp;', "")
    # BUG FIX: the original called filter(...) and discarded its result,
    # so non-printable characters were never actually removed.  Assign the
    # filtered text back so the cleanup takes effect.
    html = ''.join(ch for ch in html if ch in string.printable)
    xml = etree.HTML(html)

    open_tag(tag_stack, output_lines, "menu", "")
    days = xml.xpath('//td[@class="dayouter"]')
    # Build the xml for each day in document order.
    for day in days:
        day_name = day.xpath('./a/@name')[0]
        safe_open_tag(tag_stack, output_lines, "day", "menu", day_name)
        dayinner_trs = day.xpath('.//table[@class="dayinner"]//tr')
        for dayinner_tr in dayinner_trs:
            # A mealname cell starts a new <meal> section.
            if (dayinner_tr.xpath('./td[@class="mealname"]')):
                meal_name = dayinner_tr.xpath('./td[@class="mealname"]/text()')[0]
                safe_open_tag(tag_stack, output_lines, "meal", "day", meal_name)
            # A station cell starts a new <counter> section.
            if (dayinner_tr.xpath('./td[@class="station"]/text()')):
                counter_name = dayinner_tr.xpath('./td[@class="station"]/text()')[0]
                safe_open_tag(tag_stack, output_lines, "counter", "meal", counter_name)
            # A menuitem cell adds one <dish> with its escaped name.
            if (dayinner_tr.xpath('./td[@class="menuitem"]')):
                item_name = "".join(dayinner_tr.xpath('./td[@class="menuitem"]/div//text()')).strip()
                safe_open_tag(tag_stack, output_lines, "dish", "counter", "")
                output_lines.append("<name>%s</name>" % cgi.escape(item_name))
    # Close every element still open, back down to the root.
    close_tags(tag_stack, output_lines, "")
    output_string = '\n'.join([line.encode('utf-8') for line in output_lines])
    return output_string
| # close the tags up to the parent of last tag in tag_stack | |
def close_tags(tag_stack, output_lines, parent_tag):
    """Pop the stack, emitting an indented closing tag for each element,
    until *parent_tag* is on top (or the stack is exhausted)."""
    while tag_stack:
        if tag_stack[-1] == parent_tag:
            break
        closed = tag_stack.pop()
        # Indentation mirrors the remaining nesting depth.
        output_lines.append('%s</%s>' % (' ' * len(tag_stack), closed))
| # open the new_tag using the suitable style based on name_property | |
def open_tag(tag_stack, output_lines, new_tag, name_property):
    """Emit an indented opening tag for *new_tag* and push it on the stack.

    When *name_property* is truthy the tag carries a ``name="..."``
    attribute; otherwise a bare ``<tag>`` is written.
    """
    indent = ' ' * len(tag_stack)
    attr = ' name="%s"' % name_property if name_property else ''
    output_lines.append('%s<%s%s>' % (indent, new_tag, attr))
    tag_stack.append(new_tag)
| # check if the new_tag parent is in the stack, if not it'll add the parent | |
def safe_open_tag(tag_stack, output_lines, new_tag, parent_tag, name_property):
    """Open *new_tag* directly underneath *parent_tag*.

    If *parent_tag* is already somewhere on the stack, everything nested
    deeper than it is closed first; otherwise a bare ``<parent_tag>`` is
    opened implicitly.  Finally *new_tag* is opened via ``open_tag``.
    """
    if parent_tag in tag_stack:
        # Parent already open — unwind back to it.
        close_tags(tag_stack, output_lines, parent_tag)
    else:
        # Parent missing — open it without a name attribute.
        output_lines.append('%s<%s>' % (' ' * len(tag_stack), parent_tag))
        tag_stack.append(parent_tag)
    open_tag(tag_stack, output_lines, new_tag, name_property)
| # sample use of get_xml function | |
| # In[17]: | |
if __name__ == "__main__":
    # Landing page that links to the dated weekly/monthly menu pages.
    base_url_u = "https://uvmdining.sodexomyway.com/dining-choices/resident/index.html"
    htmltext_u = Get_website_text(base_url_u)
    soup_u = BeautifulSoup(htmltext_u)
    menu_url_list = getUnionMenuUrls(soup_u)

    # Build the output filename from the first menu's date-range label,
    # then strip digits, slashes and dashes so only static text remains.
    # (Replaces the original 13-call replace() chain, which also applied
    # .replace('7','') twice — one regex substitution is equivalent.)
    ofname = str(menu_url_list[0][0]) + "uvm_cook_commons_menu" + ".xml"
    ofname = re.sub(r'[0-9/-]', '', ofname)
    output_file = output_path + ofname

    # BUG FIX: the original open(...).write(...) never closed the handle;
    # the context manager guarantees the file is flushed and closed.
    with open(output_file, "w") as out:
        out.write(get_xml(menu_url_list[0][1]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment