Skip to content

Instantly share code, notes, and snippets.

@sbirch
Created February 10, 2014 02:00
Show Gist options
  • Select an option

  • Save sbirch/8909094 to your computer and use it in GitHub Desktop.

Select an option

Save sbirch/8909094 to your computer and use it in GitHub Desktop.
A quick script to convert from RecipeXML to text. YMMV.
import re
import sys
import textwrap
import xml.etree.ElementTree as ET

try:
    import htmlentitydefs  # Python 2
except ImportError:
    import html.entities as htmlentitydefs  # Python 3 home of the same tables
def wrap(text, width=80):
    """Re-flow *text* into paragraphs wrapped at *width* columns.

    Paragraphs are separated by two or more consecutive newlines.  Inside a
    paragraph, runs of whitespace are collapsed to a single space before the
    paragraph is wrapped.  Paragraphs that are empty after stripping are
    dropped, so leading/trailing blank lines in the input do not leak into
    the output as stray blank lines.

    Returns the wrapped paragraphs joined by one blank line.
    """
    wrapped = []
    for para in re.split(r'\n{2,}', text):
        para = re.sub(r'\s{2,}', ' ', para).strip()
        if not para:
            # Nothing to wrap; skipping avoids emitting an empty paragraph.
            continue
        wrapped.append('\n'.join(textwrap.wrap(para, width)))
    return '\n\n'.join(wrapped)
# Due to Fredrik Lundh <http://effbot.org/zone/re-sub.htm#unescape-html>
# Considered public domain per <http://effbot.org/zone/copyright.htm>
# Removes HTML or XML character references and entities from a text string.

# ``unichr`` only exists on Python 2; on Python 3 ``chr`` does the same job.
try:
    _unichr = unichr
except NameError:
    _unichr = chr


def entity_unescape(text):
    """Replace HTML/XML character references and named entities in *text*.

    Handles decimal (``&#65;``) and hexadecimal (``&#x41;`` / ``&#X41;``)
    character references as well as named entities listed in
    ``htmlentitydefs.name2codepoint`` (``&amp;``, ``&eacute;``, ...).
    Anything that cannot be decoded is left in the text unchanged.

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[2] in "xX":
                    return _unichr(int(ref[3:-1], 16))
                return _unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or a code point the interpreter cannot
                # represent (OverflowError for very large values).
                pass
        else:
            # named entity
            try:
                return _unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
            except (KeyError, ValueError):
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
_PARAGRAPH_TAG = re.compile('</?p>')


def unescape(text):
    """Drop ``<p>`` / ``</p>`` tags from *text* and decode its entities.

    Node text coming out of ElementTree has already been unescaped once, so
    the entity pass here takes care of the ones that were double-escaped in
    the source document.
    """
    without_tags = _PARAGRAPH_TAG.sub('', text)
    return entity_unescape(without_tags)
# Fractional parts that have a dedicated Unicode "vulgar fraction" glyph.
# The values are matched with a tolerance of 0.001 in convert_quantity().
_FRACTION_GLYPHS = (
    (0.25, u'\xbc'),
    (0.5, u'\xbd'),
    (0.75, u'\xbe'),
    (0.333, u'\u2153'),
    (0.666, u'\u2154'),
    (0.2, u'\u2155'),
    (0.4, u'\u2156'),
    (0.6, u'\u2157'),
    (0.8, u'\u2158'),
    (0.1666, u'\u2159'),
    (0.8333, u'\u215A'),
    (0.125, u'\u215B'),
    (0.375, u'\u215C'),
    (0.625, u'\u215D'),
    (0.875, u'\u215E'),
)


def convert_quantity(q):
    """Format the quantity string *q* using fraction glyphs where possible.

    The input is stripped first.  The result is always a string:

    * an integer literal such as ``'2'`` is returned as is;
    * a number with a zero fractional part (``'2.0'``) becomes ``'2'``;
    * a number whose fractional part is within 0.001 of an entry in
      ``_FRACTION_GLYPHS`` becomes the integer part (omitted when it is 0)
      followed by the glyph, e.g. ``'1.5'`` -> ``u'1\\xbd'``;
    * any other number, and anything that is not a finite number, is
      returned as the stripped input text.
    """
    q = q.strip()
    try:
        number = float(q)
        whole = int(number)
    except (ValueError, OverflowError):
        # Not a number at all, or 'nan' / 'inf': hand the text back untouched.
        return q

    if str(whole) == q:
        return q

    remainder = number - whole
    if remainder == 0:
        return '%d' % whole

    for value, glyph in _FRACTION_GLYPHS:
        if abs(remainder - value) < 0.001:
            if whole != 0:
                return u'%d%s' % (whole, glyph)
            return glyph

    # No glyph for this fraction.  Return text rather than the float so
    # callers can keep treating the result as a string.
    return q
def convert(recipe):
    """Render one ``<recipe>`` element as plain text.

    Reads ``head`` (``title``, ``version``, ``source``, ``yield``,
    ``preptime``), ``description``, ``ingredients`` (each ``ing`` with
    ``amt/qty``, ``amt/unit``, ``item``, ``prep``), ``directions`` and
    ``note`` from *recipe*.  Missing elements are skipped instead of raising.

    Returns a ``(title, text)`` tuple where *text* is the whole rendered
    recipe, lines joined with newlines.
    """
    def nt(parent, node, fmt=None, default=''):
        """Return the unescaped text of child *node* of *parent*.

        *default* is returned when *parent* is None, the child is missing,
        or its text is empty/whitespace.  When *fmt* is given, the text is
        interpolated into it with ``%``.
        """
        if parent is None:
            return default
        child = parent.find(node)
        if child is None or child.text is None:
            return default
        text = unescape(child.text)
        if not text.strip():
            return default
        if fmt is not None:
            return fmt % text
        return text

    head = recipe.find('head')
    description = recipe.find('description')
    ingredients = recipe.find('ingredients')
    directions = recipe.find('directions')
    notes = recipe.find('note')

    converted = []

    title = nt(head, 'title')
    converted.append(title)

    # An empty <description/> has text None, which unescape() cannot take.
    if description is not None and description.text is not None:
        description = unescape(description.text)
        description = description.replace('imported from Recipe Import', '').strip()
        if description != title:
            converted.append(description)

    attribution = nt(head, 'version') + nt(head, 'source', u' from %s')
    converted.append(attribution.strip())

    yields = head.find('yield') if head is not None else None
    if yields is not None:
        yields = nt(yields, 'qty') + nt(yields, 'unit', ' %s')
        if yields != '1 Servings':
            converted.append(u'Yields: %s' % yields)

    preptime = head.find('preptime') if head is not None else None
    if preptime is not None:
        prep = preptime.find('time')
        if prep is not None:
            converted.append(u'Preparation time: ' + nt(prep, 'qty') + ' ' + nt(prep, 'timeunit'))

    converted.append('\nIngredients:')
    max_width = 0
    contents = []
    if ingredients is not None:
        for ing in ingredients.iter('ing'):
            # nt() tolerates a missing <amt>, yielding empty qty and unit.
            amt = ing.find('amt')
            qty = convert_quantity(nt(amt, 'qty'))
            unit = nt(amt, 'unit', ' %s')
            if qty.strip() in ['0', ''] and unit.strip() == '':
                amount = ''
            else:
                amount = qty + unit + ' '
            max_width = max(len(amount), max_width)
            contents.append((amount, nt(ing, 'item') + nt(ing, 'prep', ' (%s)')))

    # Right-justify the amounts so the item names line up in one column.
    for amount, rest in contents:
        converted.append(' ' + amount.rjust(max_width) + rest)

    if directions is not None and directions.text is not None:
        converted.append('\nDirections:')
        converted.append(wrap(unescape(directions.text)))

    if notes is not None and notes.text is not None:
        converted.append(u'\nNote:')
        note = unescape(notes.text)
        # Strip the "Nutr. Assoc." and "Exchanges:" passages from the note.
        note = re.sub(r'Nutr\. Assoc\. :( 0)+', '', note)
        note = re.sub(r'Exchanges: (.|\n)+\.', '', note)
        converted.append(wrap(note))

    return title, '\n'.join(converted)
def main(argv):
    """Convert every ``<recipe>`` in a RecipeXML file to one text file.

    *argv* is ``[program, input_xml, optional_output_path]``.  The output
    path defaults to ``output.txt``.  Recipes are written UTF-8 encoded,
    each followed by a rule of 80 dashes.

    Returns a process exit status: 0 on success, 2 on a usage error.
    """
    if len(argv) < 2:
        sys.stderr.write('usage: %s RECIPES.xml [OUTPUT.txt]\n' % argv[0])
        return 2

    output_path = argv[2] if len(argv) > 2 else 'output.txt'
    root = ET.parse(argv[1]).getroot()

    # The file is opened in binary mode, so the separator must be bytes too.
    separator = b'\n\n\n' + b'-' * 80 + b'\n\n\n'

    # NOTE: to write one file per recipe instead, the title returned by
    # convert() can be used to build a file name (replace '/' in it first).
    with open(output_path, 'wb') as f:
        for recipe in root.iter('recipe'):
            _title, text = convert(recipe)
            f.write(text.encode('utf8'))
            f.write(separator)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment