Skip to content

Instantly share code, notes, and snippets.

@rafalkrupinski
Created July 14, 2025 08:45
Show Gist options
  • Select an option

  • Save rafalkrupinski/11fd533eede83d60ee0bd9278819de03 to your computer and use it in GitHub Desktop.

Select an option

Save rafalkrupinski/11fd533eede83d60ee0bd9278819de03 to your computer and use it in GitHub Desktop.
Convert ODT (Open Document Text) to Markdown. Only a small subset of markup is supported.
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "isodate",
# "odfdo",
# ]
# ///
import dataclasses
import datetime as dt
import sys
from collections.abc import Iterable
from pathlib import Path
import isodate
from odfdo import Body, Document, Element, ListItem, PageBreak, Paragraph, Span
@dataclasses.dataclass
class Entry:
    """One chunk of converted text together with the date it is filed under."""

    # Date the entry is filed under.
    date: dt.date
    # Markdown text accumulated for this entry.
    text: str
class Parser:
    """Walk an ODF element tree and split its text into dated Markdown entries.

    Text is accumulated in ``_current_entry``. Each ``text:date`` field closes
    the entry collected so far and starts a new one under the field's date.
    All visitor methods are generators, so a finished entry can be yielded from
    any depth of the tree.
    """

    # Indentation added per list nesting level. Two spaces equals the width of
    # the "- " marker, which CommonMark requires for a bullet to nest under its
    # parent instead of becoming a sibling.
    _LIST_INDENT = '  '

    def __init__(self):
        self._current_entry = ''
        # Fallback date: the first entry has no date field preceding it.
        self._current_date = dt.date(2009, 1, 1)
        # Depth of the ``text:list`` currently being processed (0 = not in a list).
        self._current_nest = 0

    def finish_entry(self):
        """Return an ``Entry`` snapshot of the current date and text.

        The parser state is not reset here; ``visit_date`` does that.
        """
        return Entry(
            date=self._current_date,
            text=self._current_entry,
        )

    def process_children(self, children: Iterable[Element], tail=False):
        """Visit each child and yield any entries completed along the way.

        Args:
            children: Elements to visit, in document order.
            tail: When true, the text following each child (its ``tail``) is
                appended to the current entry after the child was visited.
        """
        for child in children:
            yield from self.visit_element(child)
            # This is the only place a tail is appended, so it is emitted
            # exactly once regardless of which visitor handled the child.
            if tail and child.tail:
                self._current_entry += child.tail

    def visit_body(self, body: Body):
        """Convert the whole document body, yielding every entry in order.

        The entry still open when the body ends is always yielded last.
        """
        yield from self.process_children(body.children)
        yield self.finish_entry()

    def visit_soft_page_break(self, elem: Element):
        """Treat a soft page break like a line break."""
        yield from self.visit_line_break(elem)

    def visit_p(self, elem: Paragraph):
        """Append a paragraph (or heading) and its inline content."""
        self._current_entry += elem.text or ''
        yield from self.process_children(elem.children, True)
        if self._current_entry:
            # Top-level paragraphs are separated by a blank line; inside a
            # list a single newline keeps the items together.
            self._current_entry += '\n\n' if self._current_nest == 0 else '\n'

    # Headings are emitted as plain paragraphs.
    visit_h = visit_p

    def visit_span(self, elem: Span):
        """Append the text of an inline span; its styling is not converted."""
        self._current_entry += elem.text or ''
        yield from self.process_children(elem.children, True)

    def visit_list_item(self, elem: ListItem, ordered: bool = False):
        """Start a bullet indented according to the current list depth.

        ``ordered`` is accepted but currently unused: every item is written
        as a ``-`` bullet.
        """
        self._current_entry += self._LIST_INDENT * (self._current_nest - 1) + '- '
        yield from self.process_children(elem.children, True)

    def visit_line_break(self, elem: Element):
        """Append a newline for a line break (also used for soft page breaks).

        The element's tail is left to ``process_children``; appending it here
        as well would duplicate the text that follows the break.
        """
        self._current_entry += '\n'
        yield from ()

    def visit_s(self, elem: Element):
        """Accept a ``text:s`` element without emitting anything for it.

        Having this visitor keeps the tag from being reported as unsupported.
        The text following the element is appended by ``process_children``.
        NOTE(review): the spaces the element stands for are dropped, as in the
        original implementation -- presumably intentional, confirm.
        """
        yield from ()

    def visit_date(self, elem: Element):
        """Close the current entry and start a new one at the field's date.

        The date is read from ``text:date-value`` and shifted by the ISO 8601
        duration in ``text:date-adjust`` when that attribute is present. An
        entry is only yielded if some text has been collected so far.

        Raises:
            KeyError: If the element has no ``text:date-value`` attribute.
        """
        if self._current_entry:
            yield self.finish_entry()
        moment = dt.datetime.fromisoformat(elem.attributes['text:date-value'])
        if adjust_text := elem.attributes.get('text:date-adjust'):
            moment += isodate.parse_duration(adjust_text)
        self._current_date = moment.date()
        self._current_entry = ''

    def visit_element(self, elem: Element):
        """Dispatch ``elem`` to the ``visit_<name>`` method matching its tag.

        The visitor name is the tag without its namespace prefix and with
        dashes replaced by underscores (``text:line-break`` ->
        ``visit_line_break``). Lists are handled inline so the nesting depth
        can be tracked. Tags without a visitor are printed to stderr and their
        content is skipped.
        """
        if elem.tag == 'text:list':
            self._current_nest += 1
            yield from self.process_children(elem.children)
            self._current_nest -= 1
            return
        # [-1] rather than [1]: a tag without a namespace prefix must not
        # raise IndexError.
        local_name = elem.tag.split(':')[-1].replace('-', '_')
        visitor = getattr(self, f'visit_{local_name}', None)
        if visitor is None:
            print(elem.tag, file=sys.stderr)
            return
        yield from visitor(elem)
def main():
    """Convert the ODT file named on the command line into Markdown files.

    Usage: ``script INPUT.odt [OUTPUT_DIR]``. One ``YYYY_MM_DD.md`` file is
    written per entry date into ``OUTPUT_DIR`` (the current directory when
    omitted). Files are opened in append mode, so entries that share a date
    end up in the same file.
    """
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} INPUT.odt [OUTPUT_DIR]')
    doc = Document(Path(sys.argv[1]).expanduser())
    # The output directory is the *second* argument, so it is only present
    # when argv has more than two items.
    out_dir = Path(sys.argv[2] if len(sys.argv) > 2 else '').expanduser()
    for entry in Parser().visit_body(doc.body):
        target = out_dir / f'{entry.date.strftime("%Y_%m_%d")}.md'
        # Explicit encoding so the output does not depend on the locale.
        with open(target, 'a', encoding='utf-8') as f:
            f.write(entry.text)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment