Created
July 17, 2025 15:40
-
-
Save sjgallagher2/d9d82685f0c724385993c4182eb04f2c to your computer and use it in GitHub Desktop.
Latin OCR cleaning with PyWORDS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Thu Jul 17 | |
| @author: Sam.Gallagher | |
| """ | |
| import pywords.lookup as lookup | |
| import pywords.utils as pwutils | |
| from itertools import product | |
| # %% | |
def letter_swap_variants(word: str, target: str = 'f', replacement: str = 's'):
    """
    Generate every spelling of ``word`` obtained by independently keeping or
    swapping each occurrence of ``target`` for ``replacement``.

    Parameters
    ----------
    word : str
        The word to generate variants of.
    target : str, optional
        Character to look for. It is compared against each individual
        character of ``word``. The default is 'f'.
    replacement : str, optional
        Text substituted for a swapped occurrence. The default is 's'.

    Yields
    ------
    str
        One variant per keep/swap combination, i.e. 2**k strings for k
        occurrences of ``target``. The first variant is ``word`` unchanged
        and the last has every occurrence swapped; the right-most
        occurrence toggles fastest. If ``target`` does not occur, ``word``
        is yielded once.
    """
    # One tuple of alternatives per character: a matching character may stay
    # as-is or become the replacement, every other character has one option.
    both = (target, replacement)
    options = [both if letter == target else (letter,) for letter in word]
    # The Cartesian product over the per-character options enumerates every
    # keep/swap combination, with the right-most choice changing fastest.
    for pieces in product(*options):
        yield ''.join(pieces)
| # Common operations: | |
| # Remove non-ASCII (opt. replace with ?), remove *, _ | |
| # æ -> ae | |
| # œ -> oe | |
| # ã, ẽ, õ, ũ -> am/an, em/en, om/on, um/un | |
| # ę -> ae | |
| # ſ -> s | |
| # & -> et (also cover &c. -> etc.) | |
| # wor- d (i.e. a line break) -> word | |
# Read the whole OCR text file into memory.
fname = 'cotes1.txt'
txt = ''
with open(fname, mode='r', encoding='utf-8') as f:
    txt = f.read()
# %%
# 1. Replacements
# Ligatures, e-caudata, long s and the ampersand are expanded to plain
# letters. Pairs are applied in the order listed.
for old, new in (
    ('æ', 'ae'),
    ('Æ', 'Ae'),
    ('œ', 'oe'),
    ('Œ', 'Oe'),
    ('ę', 'ae'),
    ('ſ', 's'),
    ('&', 'et'),
):
    txt = txt.replace(old, new)
# NOTE: Stripping non-ASCII characters (replacing them with '?', see
# https://stackoverflow.com/a/20078869/8565545) is not enabled yet; it needs
# to be updated so it doesn't replace accented characters.
# Tokenise on single spaces only, then trim surrounding whitespace from
# each token.
words_in = [token.strip() for token in txt.split(' ')]
words_out = []
# 2. Combine words with -
# A token that ENDS in '-' is taken to be the first half of a word that was
# broken across a line ("wor-" "d" -> "word"): the hyphen is dropped and the
# following token is glued on. Tokens with a hyphen elsewhere are left alone,
# since removing their last character would corrupt them. A trailing-hyphen
# token with nothing after it is kept unchanged rather than indexing past the
# end of the list.
i = 0
while i < len(words_in):
    word = words_in[i]
    if word.endswith('-') and i + 1 < len(words_in):
        words_out.append(word[:-1] + words_in[i + 1])
        i += 2  # the next token was consumed as the second half
    else:
        words_out.append(word)
        i += 1
# 3. Check if words exist, track words that don't
def _lookup_form(token):
    """Return ``token`` lowercased with any leading/trailing , ; : ? . removed."""
    # A single strip() with all the characters removes mixed runs such as
    # ",." in one go; chaining one strip per character depends on the order
    # the punctuation happens to appear in.
    return token.lower().strip(',;:?.')


words_in = words_out  # refresh
words_out = []
for w in words_in:
    matches = lookup.match_word(_lookup_form(w))
    if len(matches) != 0:
        # Known word: keep it exactly as it appeared.
        words_out.append(w)
        continue
    # Unknown word: try replacing f with s in every combination and keep the
    # first variant the dictionary recognises. Variants are built from the
    # original token, so punctuation is kept if we find something that works.
    corrected = w  # fall back to putting the word back in as-is
    for var_w in letter_swap_variants(w):
        if var_w == w:
            # The unmodified spelling was already looked up above.
            continue
        matches = lookup.match_word(_lookup_form(var_w))
        if len(matches) != 0:
            corrected = var_w
            break
    words_out.append(corrected)
print(' '.join(words_out))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment