sbirch/convert.py

## convert.py
import sys
import xml.etree.ElementTree as ET
import re, htmlentitydefs, textwrap

def wrap(text, width=80):
	"""Re-flow *text* into paragraphs wrapped at *width* columns.

	Paragraphs are separated by two or more consecutive newlines. Inside
	each paragraph every run of two or more whitespace characters is
	collapsed to a single space and the paragraph is stripped before
	``textwrap.wrap`` re-fills it.

	@param text The text to re-flow.
	@param width Maximum length of a wrapped line (defaults to 80).
	@return The wrapped paragraphs, joined by one blank line.
	"""
	wrapped = []
	# Raw-string patterns: '\s' inside a plain literal is an invalid escape
	# sequence that newer interpreters warn about.
	for para in re.split(r'\n{2,}', text):
		para = re.sub(r'\s{2,}', ' ', para).strip()
		wrapped.append('\n'.join(textwrap.wrap(para, width)))
	return '\n\n'.join(wrapped)

# Due to Fredrik Lundh <http://effbot.org/zone/re-sub.htm#unescape-html>
# Considered public domain per <http://effbot.org/zone/copyright.htm>
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
def entity_unescape(text):
    """Replace HTML/XML character references and named entities in *text*.

    Decimal (``&#65;``) and hexadecimal (``&#x41;`` / ``&#X41;``) character
    references and named entities known to the standard entity table
    (``&amp;``, ``&eacute;``, ...) are replaced by the character they
    denote. Anything that cannot be resolved -- an unknown name, malformed
    digits, a code point outside the supported range -- is left exactly as
    it appears in the input.

    @param text The HTML (or XML) source text.
    @return The text with every resolvable reference replaced.
    """
    # Function-scope lookups so the same body works on Python 2
    # (htmlentitydefs / unichr) and Python 3 (html.entities / chr).
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def fixup(m):
        ref = m.group(0)
        try:
            if ref[:2] == "&#":
                # character reference; the hex marker may be 'x' or 'X'
                if ref[2:3] in ("x", "X"):
                    return to_char(int(ref[3:-1], 16))
                return to_char(int(ref[2:-1]))
            # named entity
            return to_char(name2codepoint[ref[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name. ValueError: bad digits or a
            # code point out of range. OverflowError: absurdly large number.
            return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)

def unescape(text):
	"""Drop paragraph tags from *text* and decode any remaining entities.

	@param text Node text as delivered by ElementTree.
	@return The text without ``<p>`` / ``</p>`` tags, passed through
	        ``entity_unescape``.
	"""
	# ElementTree has already undone one level of escaping in node text;
	# the entity pass below handles content that was escaped twice.
	without_paragraph_tags = re.sub('</?p>', '', text)
	return entity_unescape(without_paragraph_tags)

def convert_quantity(q):
	"""Format a quantity string, using Unicode fraction glyphs where possible.

	The result is always text:

	* a plain integer literal, or anything that does not parse as a finite
	  number, is returned as the stripped input;
	* a whole number written with a decimal point ('2.0') becomes '2';
	* a value whose fractional part is within 0.001 of a table entry is
	  rendered as the integer part (omitted when zero) followed by the
	  fraction glyph;
	* any other value is returned as the stripped input.

	@param q The quantity as text; surrounding whitespace is ignored.
	@return The formatted quantity as a string.
	"""
	q = q.strip()
	try:
		number = float(q)
		whole = int(number)
	except (ValueError, OverflowError):
		# ValueError: not numeric (int() also raises it for NaN).
		# OverflowError: int() of an infinity.
		return q

	if str(whole) == q:
		return q

	fraction = number - whole
	if fraction == 0:
		# A whole number spelled with a decimal point, e.g. '2.0'.
		return str(whole)

	table = {
		0.25: u'\xbc',
		0.5: u'\xbd',
		0.75: u'\xbe',
		0.333: u'\u2153',
		0.666: u'\u2154',
		0.2: u'\u2155',
		0.4: u'\u2156',
		0.6: u'\u2157',
		0.8: u'\u2158',
		0.1666: u'\u2159',
		0.8333: u'\u215A',
		0.125: u'\u215B',
		0.375: u'\u215C',
		0.625: u'\u215D',
		0.875: u'\u215E',
	}
	for value, glyph in table.items():
		if abs(fraction - value) < 0.001:
			break
	else:
		# No glyph for this fraction: keep the text as written rather than
		# handing back the parsed float, since the result is used as a string.
		return q

	if whole != 0:
		return u'%d%s' % (whole, glyph)
	return glyph

def convert(recipe):
	"""Render one ``recipe`` element as plain text.

	The output lines are, in order: the title; the description (unless it
	equals the title once the 'imported from Recipe Import' marker is
	removed); the attribution built from ``version`` and ``source``; the
	yield (unless it is exactly '1 Servings'); the preparation time; an
	'Ingredients:' list with right-justified amounts; the wrapped
	directions; and the wrapped note with the 'Nutr. Assoc.' and
	'Exchanges:' boilerplate stripped. Elements that are absent or have no
	text are skipped instead of raising.

	@param recipe An ElementTree element for a single recipe.
	@return A ``(title, text)`` tuple.
	"""
	def nt(parent, node, fmt=None, default=''):
		# Node text: the unescaped text of child `node` of `parent`,
		# optionally passed through the %-format `fmt`. `default` is
		# returned when the parent or child is missing, or the text is
		# empty or whitespace only.
		if parent is None:
			return default
		found = parent.find(node)
		if found is None or found.text is None:
			return default
		value = unescape(found.text)
		if not value.strip():
			return default
		if fmt is not None:
			return fmt % value
		return value

	head = recipe.find('head')
	description = recipe.find('description')
	ingredients = recipe.find('ingredients')
	directions = recipe.find('directions')
	notes = recipe.find('note')

	converted = []

	title = nt(head, 'title')
	converted.append(title)

	# An empty <description/> has text None, which unescape cannot handle.
	if description is not None and description.text is not None:
		description = unescape(description.text)
		description = description.replace('imported from Recipe Import', '').strip()
		if description != title:
			converted.append(description)

	attribution = nt(head, 'version') + nt(head, 'source', u' from %s')
	converted.append(attribution.strip())

	yields = None if head is None else head.find('yield')
	if yields is not None:
		yields = nt(yields, 'qty') + nt(yields, 'unit', ' %s')
		if yields != '1 Servings':
			converted.append(u'Yields: %s' % yields)

	preptime = None if head is None else head.find('preptime')
	if preptime is not None:
		preptime = preptime.find('time')
		if preptime is not None:
			converted.append(u'Preparation time: ' + nt(preptime, 'qty') + ' ' + nt(preptime, 'timeunit'))

	converted.append('\nIngredients:')
	max_width = 0
	contents = []
	if ingredients is not None:
		for ing in ingredients.iter('ing'):
			amount = ing.find('amt')
			qty = convert_quantity(nt(amount, 'qty'))
			unit = nt(amount, 'unit', ' %s')
			if qty.strip() in ['0', ''] and unit.strip() == '':
				# Nothing meaningful to show in the amount column.
				amount = ''
			else:
				amount = qty + unit + ' '
			max_width = max(len(amount), max_width)
			contents.append((amount, nt(ing, 'item') + nt(ing, 'prep', ' (%s)')))

	# Second pass: the widest amount is only known once all are collected.
	for amount, rest in contents:
		converted.append('    ' + amount.rjust(max_width) + rest)

	if directions is not None and directions.text is not None:
		converted.append('\nDirections:')
		converted.append(wrap(unescape(directions.text)))

	if notes is not None and notes.text is not None:
		converted.append(u'\nNote:')
		note = unescape(notes.text)
		# Raw strings keep the regex backslashes out of Python's own
		# escape processing.
		note = re.sub(r'Nutr\. Assoc\. :( 0)+', '', note)
		note = re.sub(r'Exchanges: (.|\n)+\.', '', note)
		converted.append(wrap(note))

	return title, '\n'.join(converted)

if __name__ == '__main__':
	# Usage: convert.py RECIPES.xml
	# Converts every <recipe> element of the given file and writes the
	# results, separated by a dashed rule, to output.txt as UTF-8.
	if len(sys.argv) < 2:
		sys.exit('usage: %s RECIPES.xml' % sys.argv[0])
	root = ET.parse(sys.argv[1]).getroot()

	# Encoded once up front; the file is opened in binary mode.
	separator = (u'\n\n\n' + u'-' * 80 + u'\n\n\n').encode('utf8')

	# The with-block closes output.txt even when a conversion raises.
	with open('output.txt', 'wb') as f:
		for recipe in root.iter('recipe'):
			title, r = convert(recipe)

			# Alternative: one file per recipe, named after its title.
			#slug = title.replace('/', ' or ')
			#result = open('recipes/%s.txt' % slug, 'wb')
			#result.write(r.encode('utf8'))
			#result.close()

			f.write(r.encode('utf8'))
			f.write(separator)
	import sys
	import xml.etree.ElementTree as ET
	import re, htmlentitydefs, textwrap

	def wrap(text, width=80):
		"""Re-flow *text* into paragraphs wrapped at *width* columns.

		Paragraphs are separated by two or more consecutive newlines. Inside
		each paragraph every run of two or more whitespace characters is
		collapsed to a single space and the paragraph is stripped before
		``textwrap.wrap`` re-fills it.

		@param text The text to re-flow.
		@param width Maximum length of a wrapped line (defaults to 80).
		@return The wrapped paragraphs, joined by one blank line.
		"""
		wrapped = []
		# Raw-string patterns: '\s' inside a plain literal is an invalid
		# escape sequence that newer interpreters warn about.
		for para in re.split(r'\n{2,}', text):
			para = re.sub(r'\s{2,}', ' ', para).strip()
			wrapped.append('\n'.join(textwrap.wrap(para, width)))
		return '\n\n'.join(wrapped)

	# Due to Fredrik Lundh <http://effbot.org/zone/re-sub.htm#unescape-html>
	# Considered public domain per <http://effbot.org/zone/copyright.htm>
	# Removes HTML or XML character references and entities from a text string.
	#
	# @param text The HTML (or XML) source text.
	# @return The plain text, as a Unicode string, if necessary.
	def entity_unescape(text):
		"""Replace HTML/XML character references and named entities in *text*.

		Decimal (``&#65;``) and hexadecimal (``&#x41;`` / ``&#X41;``)
		character references and named entities known to the standard entity
		table are replaced by the character they denote. Anything that cannot
		be resolved -- an unknown name, malformed digits, a code point outside
		the supported range -- is left exactly as it appears in the input.

		@param text The HTML (or XML) source text.
		@return The text with every resolvable reference replaced.
		"""
		# Function-scope lookups so the same body works on Python 2
		# (htmlentitydefs / unichr) and Python 3 (html.entities / chr).
		try:
			from htmlentitydefs import name2codepoint
		except ImportError:
			from html.entities import name2codepoint
		try:
			to_char = unichr
		except NameError:
			to_char = chr

		def fixup(m):
			ref = m.group(0)
			try:
				if ref[:2] == "&#":
					# character reference; the hex marker may be 'x' or 'X'
					if ref[2:3] in ("x", "X"):
						return to_char(int(ref[3:-1], 16))
					return to_char(int(ref[2:-1]))
				# named entity
				return to_char(name2codepoint[ref[1:-1]])
			except (KeyError, ValueError, OverflowError):
				# KeyError: unknown entity name. ValueError: bad digits or a
				# code point out of range. OverflowError: absurdly large number.
				return ref  # leave as is

		return re.sub(r"&#?\w+;", fixup, text)

	def unescape(text):
		"""Drop paragraph tags from *text* and decode any remaining entities.

		@param text Node text as delivered by ElementTree.
		@return The text without ``<p>`` / ``</p>`` tags, passed through
		        ``entity_unescape``.
		"""
		text = re.sub('</?p>', '', text)
		# note that node text from ET is unescaped -- this
		# unescapes the double-escaped ones.
		return entity_unescape(text)

	def convert_quantity(q):
		"""Format a quantity string, using Unicode fraction glyphs where possible.

		The result is always text:

		* a plain integer literal, or anything that does not parse as a finite
		  number, is returned as the stripped input;
		* a whole number written with a decimal point ('2.0') becomes '2';
		* a value whose fractional part is within 0.001 of a table entry is
		  rendered as the integer part (omitted when zero) followed by the
		  fraction glyph;
		* any other value is returned as the stripped input.

		@param q The quantity as text; surrounding whitespace is ignored.
		@return The formatted quantity as a string.
		"""
		q = q.strip()
		try:
			number = float(q)
			whole = int(number)
		except (ValueError, OverflowError):
			# ValueError: not numeric (int() also raises it for NaN).
			# OverflowError: int() of an infinity.
			return q

		if str(whole) == q:
			return q

		fraction = number - whole
		if fraction == 0:
			# A whole number spelled with a decimal point, e.g. '2.0'.
			return str(whole)

		table = {
			0.25: u'\xbc',
			0.5: u'\xbd',
			0.75: u'\xbe',
			0.333: u'\u2153',
			0.666: u'\u2154',
			0.2: u'\u2155',
			0.4: u'\u2156',
			0.6: u'\u2157',
			0.8: u'\u2158',
			0.1666: u'\u2159',
			0.8333: u'\u215A',
			0.125: u'\u215B',
			0.375: u'\u215C',
			0.625: u'\u215D',
			0.875: u'\u215E',
		}
		for value, glyph in table.items():
			if abs(fraction - value) < 0.001:
				break
		else:
			# No glyph for this fraction: keep the text as written rather
			# than handing back the parsed float.
			return q

		if whole != 0:
			return u'%d%s' % (whole, glyph)
		return glyph

	def convert(recipe):
		"""Render one ``recipe`` element as plain text.

		The output lines are, in order: the title; the description (unless it
		equals the title once the 'imported from Recipe Import' marker is
		removed); the attribution built from ``version`` and ``source``; the
		yield (unless it is exactly '1 Servings'); the preparation time; an
		'Ingredients:' list with right-justified amounts; the wrapped
		directions; and the wrapped note with the 'Nutr. Assoc.' and
		'Exchanges:' boilerplate stripped. Elements that are absent or have
		no text are skipped instead of raising.

		@param recipe An ElementTree element for a single recipe.
		@return A ``(title, text)`` tuple.
		"""
		def nt(parent, node, fmt=None, default=''):
			# Node text: the unescaped text of child `node` of `parent`,
			# optionally passed through the %-format `fmt`. `default` is
			# returned when the parent or child is missing, or the text is
			# empty or whitespace only.
			if parent is None:
				return default
			found = parent.find(node)
			if found is None or found.text is None:
				return default
			value = unescape(found.text)
			if not value.strip():
				return default
			if fmt is not None:
				return fmt % value
			return value

		head = recipe.find('head')
		description = recipe.find('description')
		ingredients = recipe.find('ingredients')
		directions = recipe.find('directions')
		notes = recipe.find('note')

		converted = []

		title = nt(head, 'title')
		converted.append(title)

		# An empty <description/> has text None, which unescape cannot handle.
		if description is not None and description.text is not None:
			description = unescape(description.text)
			description = description.replace('imported from Recipe Import', '').strip()
			if description != title:
				converted.append(description)

		attribution = nt(head, 'version') + nt(head, 'source', u' from %s')
		converted.append(attribution.strip())

		yields = None if head is None else head.find('yield')
		if yields is not None:
			yields = nt(yields, 'qty') + nt(yields, 'unit', ' %s')
			if yields != '1 Servings':
				converted.append(u'Yields: %s' % yields)

		preptime = None if head is None else head.find('preptime')
		if preptime is not None:
			preptime = preptime.find('time')
			if preptime is not None:
				converted.append(u'Preparation time: ' + nt(preptime, 'qty') + ' ' + nt(preptime, 'timeunit'))

		converted.append('\nIngredients:')
		max_width = 0
		contents = []
		if ingredients is not None:
			for ing in ingredients.iter('ing'):
				amount = ing.find('amt')
				qty = convert_quantity(nt(amount, 'qty'))
				unit = nt(amount, 'unit', ' %s')
				if qty.strip() in ['0', ''] and unit.strip() == '':
					# Nothing meaningful to show in the amount column.
					amount = ''
				else:
					amount = qty + unit + ' '
				max_width = max(len(amount), max_width)
				contents.append((amount, nt(ing, 'item') + nt(ing, 'prep', ' (%s)')))

		# Second pass: the widest amount is only known once all are collected.
		for amount, rest in contents:
			converted.append('    ' + amount.rjust(max_width) + rest)

		if directions is not None and directions.text is not None:
			converted.append('\nDirections:')
			converted.append(wrap(unescape(directions.text)))

		if notes is not None and notes.text is not None:
			converted.append(u'\nNote:')
			note = unescape(notes.text)
			# Raw strings keep the regex backslashes out of Python's own
			# escape processing; '(.|\n)+' spans line breaks.
			note = re.sub(r'Nutr\. Assoc\. :( 0)+', '', note)
			note = re.sub(r'Exchanges: (.|\n)+\.', '', note)
			converted.append(wrap(note))

		return title, '\n'.join(converted)

	if __name__ == '__main__':
		# Usage: convert.py RECIPES.xml
		# Converts every <recipe> element of the given file and writes the
		# results, separated by a dashed rule, to output.txt as UTF-8.
		if len(sys.argv) < 2:
			sys.exit('usage: %s RECIPES.xml' % sys.argv[0])
		root = ET.parse(sys.argv[1]).getroot()

		# Encoded once up front; the file is opened in binary mode.
		separator = (u'\n\n\n' + u'-' * 80 + u'\n\n\n').encode('utf8')

		# The with-block closes output.txt even when a conversion raises.
		with open('output.txt', 'wb') as f:
			for recipe in root.iter('recipe'):
				title, r = convert(recipe)

				# Alternative: one file per recipe, named after its title.
				#slug = title.replace('/', ' or ')
				#result = open('recipes/%s.txt' % slug, 'wb')
				#result.write(r.encode('utf8'))
				#result.close()

				f.write(r.encode('utf8'))
				f.write(separator)
No results found