Skip to content

Instantly share code, notes, and snippets.

@sjgallagher2
Created July 17, 2025 15:40
Show Gist options
  • Select an option

  • Save sjgallagher2/d9d82685f0c724385993c4182eb04f2c to your computer and use it in GitHub Desktop.

Select an option

Save sjgallagher2/d9d82685f0c724385993c4182eb04f2c to your computer and use it in GitHub Desktop.
Latin OCR cleaning with PyWORDS
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 17
@author: Sam.Gallagher
"""
import pywords.lookup as lookup
import pywords.utils as pwutils
from itertools import product
# %%
def letter_swap_variants(word: str, target: str = 'f', replacement: str = 's'):
    """
    Generate every variant of ``word`` obtained by swapping ``target`` letters.

    Each occurrence of ``target`` is independently either kept or replaced by
    ``replacement``, so a word with k occurrences produces 2**k variants.
    The first variant yielded is always ``word`` itself (nothing swapped) and
    the last one has every occurrence swapped.

    (Original implementation generated with ChatGPT.)

    Parameters
    ----------
    word : str
        The word to generate variants of.
    target : str, optional
        Single character to look for. It is compared against ``word`` one
        character at a time, so a longer string never matches.
        The default is 'f'.
    replacement : str, optional
        Text substituted for a swapped occurrence. The default is 's'.

    Yields
    ------
    str
        One variant of ``word`` per keep/swap combination.
    """
    # Indices of the target letter
    idxs = [i for i, ch in enumerate(word) if ch == target]
    # Nothing to replace, or swapping would not change anything: the only
    # variant is the word itself (avoids yielding 2**k identical strings).
    if not idxs or target == replacement:
        yield word
        return
    # For each combination of keeping target or swapping to replacement;
    # product yields tuples like ('f', 's', 'f', ...) aligned with idxs
    for combo in product((target, replacement), repeat=len(idxs)):
        chars = list(word)
        for pos, val in zip(idxs, combo):
            chars[pos] = val
        yield ''.join(chars)
# Common operations:
# Remove non-ASCII (opt. replace with ?), remove *, _
# æ -> ae
# œ -> oe
# ã, ẽ, õ, ũ -> am/an, em/en, om/on, um/un
# ę -> ae
# ſ -> s
# & -> et (also cover &c. -> etc.)
# wor- d (i.e. a line break) -> word
# Input: OCR'd Latin text to clean, read in full as UTF-8.
fname = 'cotes1.txt'
with open(fname, 'r', encoding='utf-8') as f:
    txt = f.read()
# %%
# 1. Replacements
# Expand ligatures and scribal characters in a single pass over the text.
# Every key is a single character and no replacement contains a key, so this
# gives the same result as applying the substitutions one after another.
_REPLACEMENTS = str.maketrans({
    'æ': 'ae',
    'Æ': 'Ae',
    'œ': 'oe',
    'Œ': 'Oe',
    'ę': 'ae',
    'ſ': 's',
    '&': 'et',  # also turns "&c." into "etc."
})
txt = txt.translate(_REPLACEMENTS)
# NOTE: Need to update this so it doesn't replace accented characters
# non-ascii characters (https://stackoverflow.com/a/20078869/8565545)
#txt = ''.join([i if ord(i) < 128 else '?' for i in txt])
# Split on any run of whitespace (spaces, tabs, line breaks). Splitting on a
# literal ' ' left words on either side of a line break glued into one token
# and produced empty tokens for consecutive spaces.
words_in = txt.split()
words_out = []
# 2. Combine words with -
# A token ending in '-' is the first half of a word broken across a line;
# glue it to the token that follows and drop the hyphen.
i = 0
while i < len(words_in):
    word = words_in[i]
    # Only a trailing hyphen marks a break (a hyphen elsewhere in the token
    # must not cost the token its last character), and there has to be a
    # following token to join with.
    if word.endswith('-') and i + 1 < len(words_in):
        words_out.append(word[:-1] + words_in[i + 1])
        i += 2  # Skip next word, it has been consumed
    else:
        words_out.append(word)
        i += 1
# 3. Check if words exist; words with no match get an f -> s repair attempt
words_in = words_out # refresh
words_out = []
# Punctuation removed from both ends of a token before the dictionary lookup.
# Stripping the whole set at once handles mixed runs such as "word,." that a
# chain of single-character strips leaves partly intact.
PUNCT = ',;:?.'
for w in words_in:
    wm = w.lower().strip(PUNCT)
    matches = lookup.match_word(wm)
    if len(matches) != 0:
        words_out.append(w)
        continue
    # Try replacing f with s, in every combination of occurrences; keep
    # punctuation if we find something that works
    for var_w in letter_swap_variants(w):
        if var_w == w:
            # The unmodified word was already looked up above
            continue
        var_wm = var_w.lower().strip(PUNCT)
        matches = lookup.match_word(var_wm)
        if len(matches) != 0:
            words_out.append(var_w)
            break
    else:
        # No variant matched: put the word back in as-is
        words_out.append(w)
print(' '.join(words_out))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment