Created
February 10, 2014 02:00
-
-
Save sbirch/8909094 to your computer and use it in GitHub Desktop.
A quick script to convert from RecipeXML to text. YMMV.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import sys
import textwrap
import xml.etree.ElementTree as ET

try:
    import htmlentitydefs  # Python 2
except ImportError:
    import html.entities as htmlentitydefs  # Python 3 home of the same tables
def wrap(text, width=80):
    """Re-flow *text* paragraph by paragraph.

    Paragraphs are separated by two or more consecutive newlines. Inside each
    paragraph, runs of two or more whitespace characters are collapsed to a
    single space and the paragraph is stripped before being wrapped with
    ``textwrap.wrap``.

    @param text  The text to re-flow.
    @param width Maximum line length passed to ``textwrap.wrap`` (default 80).
    @return The wrapped paragraphs joined by a single blank line.
    """
    wrapped = []
    for para in re.split(r'\n{2,}', text):
        para = re.sub(r'\s{2,}', ' ', para).strip()
        # textwrap.wrap('') yields [], so an empty paragraph becomes ''.
        wrapped.append('\n'.join(textwrap.wrap(para, width)))
    return '\n\n'.join(wrapped)
# Due to Fredrik Lundh <http://effbot.org/zone/re-sub.htm#unescape-html>
# Considered public domain per <http://effbot.org/zone/copyright.htm>
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
def entity_unescape(text):
    """Replace character references and named entities in *text*.

    Handles decimal (``&#65;``), hexadecimal (``&#x41;`` / ``&#X41;``) and
    named (``&amp;``) forms. Anything that cannot be decoded -- an unknown
    name, non-numeric digits, or a code point the interpreter rejects -- is
    left in the text unchanged.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3: chr() already returns a Unicode string

    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[:3].lower() == "&#x":
                    return to_char(int(ref[3:-1], 16))
                return to_char(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: bad digits or out-of-range code point;
                # OverflowError: number too large for the C-level conversion.
                pass
        else:
            # named entity
            try:
                return to_char(htmlentitydefs.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
def unescape(text):
    """Strip <p>/</p> tags from *text*, then decode any remaining entities."""
    without_paragraph_tags = re.compile('</?p>').sub('', text)
    # ElementTree has already decoded one level of escaping in node text;
    # whatever entities are still present were double-escaped in the source.
    return entity_unescape(without_paragraph_tags)
# (fractional part, glyph) pairs. Thirds and sixths use exact values so that
# both truncated ("0.666") and rounded ("0.667") spellings fall within the
# matching tolerance. Written with float literals so the division is not
# integer division on Python 2.
_VULGAR_FRACTIONS = (
    (0.25, u'\xbc'),
    (0.5, u'\xbd'),
    (0.75, u'\xbe'),
    (1.0 / 3.0, u'\u2153'),
    (2.0 / 3.0, u'\u2154'),
    (0.2, u'\u2155'),
    (0.4, u'\u2156'),
    (0.6, u'\u2157'),
    (0.8, u'\u2158'),
    (1.0 / 6.0, u'\u2159'),
    (5.0 / 6.0, u'\u215A'),
    (0.125, u'\u215B'),
    (0.375, u'\u215C'),
    (0.625, u'\u215D'),
    (0.875, u'\u215E'),
)


def convert_quantity(q):
    """Render a numeric quantity string with a vulgar-fraction glyph.

    @param q The quantity as text, e.g. ``"2"``, ``"1.5"`` or ``"0.25"``.
    @return Always a string:
            - the stripped input when it is a plain integer, is not a
              finite number, or has a fractional part with no matching glyph;
            - otherwise the whole part (omitted when zero) followed by the
              glyph, e.g. ``"1.5"`` -> ``u"1\\xbd"``.

    Negative values have a negative fractional part and therefore never
    match a glyph; they are returned unchanged.
    """
    text = q.strip()
    try:
        value = float(text)
        whole = int(value)  # ValueError for nan, OverflowError for inf
    except (ValueError, OverflowError):
        return text

    if str(whole) == text:
        return text

    remainder = value - whole
    for fraction, glyph in _VULGAR_FRACTIONS:
        if abs(remainder - fraction) < 0.001:
            break
    else:
        # No glyph for this fraction: hand back the original text rather
        # than the parsed float, so callers can keep treating it as a string.
        return text

    if whole != 0:
        return u'%d%s' % (whole, glyph)
    return glyph
def convert(recipe):
    """Render one ``<recipe>`` element as plain text.

    Reads the ``head``, ``description``, ``ingredients``, ``directions`` and
    ``note`` children of *recipe*. Any of them (and any of their
    sub-elements) may be missing or empty; the corresponding part of the
    output is then skipped or left blank instead of raising.

    @param recipe An ElementTree element for a single recipe.
    @return A ``(title, text)`` tuple: the recipe title and the full
            plain-text rendering, lines joined with newlines.
    """
    def nt(parent, node, fmt=None, default=''):
        """Return the unescaped text of ``parent/node``.

        Yields *default* when *parent* is None, the child is absent, or its
        text is missing or blank. When *fmt* is given the text is
        interpolated into it with ``%``.
        """
        if parent is None:
            return default
        found = parent.find(node)
        if found is None or found.text is None:
            return default
        text = unescape(found.text)
        if not text.strip():
            return default
        if fmt is not None:
            return fmt % text
        return text

    head = recipe.find('head')
    description = recipe.find('description')
    ingredients = recipe.find('ingredients')
    directions = recipe.find('directions')
    notes = recipe.find('note')

    converted = []

    title = nt(head, 'title')
    converted.append(title)

    if description is not None and description.text is not None:
        summary = unescape(description.text)
        summary = summary.replace('imported from Recipe Import', '').strip()
        # Skip a description that merely repeats the title.
        if summary != title:
            converted.append(summary)

    attribution = nt(head, 'version') + nt(head, 'source', u' from %s')
    converted.append(attribution.strip())

    yields = head.find('yield') if head is not None else None
    if yields is not None:
        yields = nt(yields, 'qty') + nt(yields, 'unit', ' %s')
        # '1 Servings' is treated as a placeholder and not printed.
        if yields != '1 Servings':
            converted.append(u'Yields: %s' % yields)

    preptime = head.find('preptime') if head is not None else None
    if preptime is not None:
        duration = preptime.find('time')
        if duration is not None:
            converted.append(u'Preparation time: ' + nt(duration, 'qty') + ' ' + nt(duration, 'timeunit'))

    converted.append('\nIngredients:')
    max_width = 0
    contents = []
    ing_nodes = ingredients.iter('ing') if ingredients is not None else ()
    for ing in ing_nodes:
        amt = ing.find('amt')  # may be None; nt() then falls back to ''
        qty = convert_quantity(nt(amt, 'qty'))
        unit = nt(amt, 'unit', ' %s')
        if qty.strip() in ('0', '') and unit.strip() == '':
            amount = ''
        else:
            amount = qty + unit + ' '
        max_width = max(len(amount), max_width)
        contents.append((amount, nt(ing, 'item') + nt(ing, 'prep', ' (%s)')))
    # Second pass: right-align every amount to the widest one.
    for amount, rest in contents:
        converted.append(' ' + amount.rjust(max_width) + rest)

    if directions is not None and directions.text is not None:
        converted.append('\nDirections:')
        converted.append(wrap(unescape(directions.text)))

    if notes is not None and notes.text is not None:
        converted.append(u'\nNote:')
        note = unescape(notes.text)
        # Drop the "Nutr. Assoc. : 0 0 ..." and "Exchanges: ... ." text
        # from the note before wrapping it.
        note = re.sub(r'Nutr\. Assoc\. :( 0)+', '', note)
        note = re.sub(r'Exchanges: (.|\n)+\.', '', note)
        converted.append(wrap(note))

    return title, '\n'.join(converted)
if __name__ == '__main__':
    # Usage: <script> RECIPES.xml
    # Converts every <recipe> in the given XML file and writes them all,
    # UTF-8 encoded and separated by a dashed rule, to output.txt.
    if len(sys.argv) < 2:
        sys.exit('usage: %s RECIPES.xml' % sys.argv[0])

    root = ET.parse(sys.argv[1]).getroot()
    # Bytes literal: the file is opened in binary mode, so the separator must
    # be bytes as well (a str here fails on Python 3).
    separator = b'\n\n\n' + b'-' * 80 + b'\n\n\n'

    # The with-block closes output.txt even if a recipe fails to convert.
    with open('output.txt', 'wb') as f:
        for recipe in root.iter('recipe'):
            title, r = convert(recipe)
            # Alternative, disabled: write one file per recipe.
            #slug = title.replace('/', ' or ')
            #result = open('recipes/%s.txt' % slug, 'wb')
            #result.write(r.encode('utf8'))
            #result.close()
            f.write(r.encode('utf8'))
            f.write(separator)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment