Skip to content

Instantly share code, notes, and snippets.

@vadimkantorov
Last active February 25, 2026 17:12
Show Gist options
  • Select an option

  • Save vadimkantorov/bfd138a41877d5ab318db79abdf713f9 to your computer and use it in GitHub Desktop.

Select an option

Save vadimkantorov/bfd138a41877d5ab318db79abdf713f9 to your computer and use it in GitHub Desktop.
Example of an HTML-to-XML converter in Python using the html.parser and xml.dom.minidom modules
# python html2htxml.py test.html test.xml
import html.parser
import xml.dom.minidom
class HTMLToMinidom(html.parser.HTMLParser):
    """Build an ``xml.dom.minidom`` document from HTML fed to the parser.

    Every start tag becomes an element appended under the currently open
    element; non-void elements then become the new insertion point until
    their end tag is seen.  Text and comments are stripped of surrounding
    whitespace and whitespace-only ones are dropped.

    Known limitation: nodes are appended directly to the minidom
    ``Document`` while no element is open, so input with more than one
    top-level element, or with non-whitespace text outside the root
    element, is rejected by minidom (``xml.dom.HierarchyRequestErr``).
    """

    # https://developer.mozilla.org/en-US/docs/Glossary/Void_element
    def __init__(self, void_elements = ('area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr')):
        """Create an empty document.

        void_elements: lower-case tag names that never have content or an
        end tag; they are appended but never become the insertion point.
        """
        super().__init__()
        self.doc = xml.dom.minidom.Document()
        # Node that receives the next child (the innermost open element).
        self.current_node = self.doc
        # node_stack[i] is the parent of the i-th open element, outermost
        # first; it is empty while no element is open.
        self.node_stack = []
        # Not used anywhere in this class; kept so that external code that
        # reads these attributes keeps working.
        self.shortcode = ''
        self.shortcode_arg = {}
        self.void_elements = void_elements

    def handle_starttag(self, tag, attrs):
        """Append a new element for *tag* and open it unless it is void."""
        element = self.doc.createElement(tag)
        for name, value in attrs:
            # html.parser reports valueless attributes (<input disabled>)
            # with value None; store an empty string so the DOM only ever
            # holds string attribute values.
            element.setAttribute(name, '' if value is None else value)
        self.current_node.appendChild(element)
        if tag.lower() not in self.void_elements:
            self.node_stack.append(self.current_node)
            self.current_node = element

    def handle_endtag(self, tag):
        """Close the nearest open element named *tag*.

        Elements opened inside it that were never closed explicitly are
        closed with it.  An end tag that matches no open element (or that
        names a void element) is ignored instead of closing an unrelated
        element.
        """
        if tag.lower() in self.void_elements:
            return
        # Open elements, outermost first.  open_elements[i] has parent
        # node_stack[i]; when nothing is open the loop below does not run.
        open_elements = self.node_stack[1:] + [self.current_node]
        for depth in range(len(self.node_stack) - 1, -1, -1):
            if open_elements[depth].tagName == tag:
                self.current_node = self.node_stack[depth]
                del self.node_stack[depth:]
                return

    def handle_data(self, data):
        """Append *data* as a text node, stripped; skip whitespace-only text."""
        text = data.strip()
        if text:
            self.current_node.appendChild(self.doc.createTextNode(text))

    def handle_comment(self, data):
        """Append *data* as a comment node, stripped; skip empty comments."""
        text = data.strip()
        if not text:
            return
        # XML does not allow "--" inside a comment (minidom raises
        # ValueError when serializing one) nor a "-" right before the
        # closing "-->", so break such runs up with spaces.
        while '--' in text:
            text = text.replace('--', '- -')
        if text.endswith('-'):
            text += ' '
        self.current_node.appendChild(self.doc.createComment(text))

    def get_minidom_document(self):
        """Return the ``xml.dom.minidom.Document`` built so far."""
        return self.doc
if __name__ == '__main__':
    # Command-line entry point: convert the HTML file named by the first
    # argument into an XML file named by the second argument.
    import sys

    if len(sys.argv) < 3:
        sys.exit('usage: python {} input.html output.xml'.format(sys.argv[0]))
    input_path_html = sys.argv[1]
    output_path_xml = sys.argv[2]
    print(input_path_html)
    parser = HTMLToMinidom()
    # Read as UTF-8 regardless of the platform default; "utf-8-sig" also
    # drops a leading byte-order mark, which would otherwise reach the
    # parser as text in front of the root element.
    with open(input_path_html, encoding = 'utf-8-sig') as f:
        parser.feed(f.read())
    xmldoc = parser.get_minidom_document()
    print(output_path_xml)
    # writexml() only puts encoding="utf-8" into the XML declaration; the
    # bytes on disk are produced by the file object, so it must be opened
    # with that same encoding.
    with open(output_path_xml, 'w', encoding = 'utf-8') as f:
        xmldoc.writexml(f, addindent = ' ', newl = '\n', encoding = 'utf-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment