# sjgallagher2/latinocr.py

## latinocr.py
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 17

@author: Sam.Gallagher
"""

import pywords.lookup as lookup
import pywords.utils as pwutils
from itertools import product
# %%
def letter_swap_variants(word: str, target: str = 'f', replacement: str = 's'):
    """
    Yield every spelling of ``word`` obtained by independently keeping or
    swapping each occurrence of ``target``.

    Parameters
    ----------
    word : str
        The word to generate variants of.
    target : str, optional
        Single character whose occurrences may be swapped. The default is 'f'.
    replacement : str, optional
        Text substituted for a swapped occurrence. The default is 's'.

    Yields
    ------
    str
        One variant per keep/swap combination, i.e. 2**k variants for k
        occurrences of ``target``. The first variant is ``word`` unchanged
        and the last has every occurrence swapped. A word that does not
        contain ``target`` is yielded exactly once, as-is.

    Notes
    -----
    The original implementation was credited to ChatGPT.

    """
    # One tuple of candidates per position: two options where the target
    # letter sits, a single fixed option everywhere else.
    choices = [(target, replacement) if ch == target else (ch,) for ch in word]

    # product() varies the right-most position fastest, so the variants come
    # out in the order "keep everything" ... "swap everything".
    for picked in product(*choices):
        yield ''.join(picked)

# Common operations:
#  Remove non-ASCII (opt. replace with ?), remove *, _
#  æ -> ae
#  œ -> oe
#  ã, ẽ, õ, ũ -> am/an, em/en, om/on, um/un
#  ę -> ae
#  ſ -> s
#  & -> et  (also cover &c. -> etc.)
#  wor- d (i.e. a line break) -> word

# Read the whole input text into memory.
fname = 'cotes1.txt'
txt = ''
with open(fname, mode='r', encoding='utf-8') as f:
    txt = f.read()

# %%

# 1. Replacements
# Expand ligatures and archaic glyphs into plain letters, in one chained
# expression (same replacements, same order).
txt = (
    txt.replace('æ', 'ae')
    .replace('Æ', 'Ae')
    .replace('œ', 'oe')
    .replace('Œ', 'Oe')
    .replace('ę', 'ae')
    .replace('ſ', 's')
    .replace('&', 'et')
)

# NOTE: Need to update this so it doesn't replace accented characters
# non-ascii characters (https://stackoverflow.com/a/20078869/8565545)
#txt = ''.join([i if ord(i) < 128 else '?' for i in txt])

# Tokenise on single spaces only; any other whitespace is trimmed from the
# ends of each token.
words_in = [w.strip() for w in txt.split(' ')]
words_out = []

# 2. Combine words with -
# A token that ENDS in '-' is the first half of a word broken across a line
# ("wor- d" -> "word"): drop the hyphen and glue on the following token.
i = 0
while i < len(words_in):
    if (
        len(words_in[i]) > 1
        and words_in[i].endswith('-')
        and i + 1 < len(words_in)
    ):
        words_out.append(words_in[i][:-1] + words_in[i + 1])
        i += 2  # The following token was consumed by the merge
    else:
        # Tokens with an internal hyphen, a lone "-", and a trailing hyphen
        # on the very last token are kept as-is (nothing to merge with, and
        # chopping their last character would corrupt them).
        words_out.append(words_in[i])
        i += 1


# 3. Check if words exist; for words that don't, try f -> s repairs

words_in = words_out  # refresh
words_out = []

for w in words_in:
    # Look the word up lower-cased and without surrounding punctuation.
    # A single strip() with the full character set removes any mix of these
    # marks (e.g. "word,." or "word?."), regardless of their order.
    wm = w.lower().strip(',;:?.')
    matches = lookup.match_word(wm)
    if len(matches) != 0:
        words_out.append(w)
        continue

    # Unknown word: try every combination of f -> s swaps (presumably to
    # undo a long s misread as f -- confirm) and keep the first variant the
    # dictionary recognises. The variant keeps its punctuation and case.
    for var_w in letter_swap_variants(w):
        if var_w == w:
            continue  # The unchanged word was already looked up above
        var_wm = var_w.lower().strip(',;:?.')
        matches = lookup.match_word(var_wm)
        if len(matches) != 0:
            words_out.append(var_w)
            break
    else:
        # No variant matched: put the word back in as-is
        words_out.append(w)
print(' '.join(words_out))
	# -- coding: utf-8 --
	"""
	Created on Thu Jul 17

	@author: Sam.Gallagher
	"""

	import pywords.lookup as lookup
	import pywords.utils as pwutils
	from itertools import product
	# %%
	def letter_swap_variants(word: str, target: str = 'f', replacement: str = 's'):
	    """
	    Yield every spelling of ``word`` obtained by independently keeping or
	    swapping each occurrence of ``target``.

	    Parameters
	    ----------
	    word : str
	        The word to generate variants of.
	    target : str, optional
	        Single character whose occurrences may be swapped. The default is 'f'.
	    replacement : str, optional
	        Text substituted for a swapped occurrence. The default is 's'.

	    Yields
	    ------
	    str
	        One variant per keep/swap combination, i.e. 2**k variants for k
	        occurrences of ``target``. The first variant is ``word`` unchanged
	        and the last has every occurrence swapped. A word that does not
	        contain ``target`` is yielded exactly once, as-is.

	    """
	    # Indices of the target letter
	    idxs = [i for i, ch in enumerate(word) if ch == target]
	    if not idxs:
	        yield word  # nothing to replace
	        return

	    # For each combination of keeping target or swapping to replacement
	    # product yields tuples like ('f','s','f',...) aligned with idxs
	    for combo in product((target, replacement), repeat=len(idxs)):
	        chars = list(word)
	        for pos, val in zip(idxs, combo):
	            chars[pos] = val
	        yield ''.join(chars)

	# Common operations:
	# Remove non-ASCII (opt. replace with ?), remove *, _
	# æ -> ae
	# œ -> oe
	# ã, ẽ, õ, ũ -> am/an, em/en, om/on, um/un
	# ę -> ae
	# ſ -> s
	# & -> et (also cover &c. -> etc.)
	# wor- d (i.e. a line break) -> word

	fname = 'cotes1.txt'
	txt = ''
	with open(fname,'r',encoding='utf-8') as f:
	    txt = f.read()

	# %%

	# 1. Replacements
	txt = txt.replace('æ','ae')
	txt = txt.replace('Æ','Ae')
	txt = txt.replace('œ','oe')
	txt = txt.replace('Œ','Oe')
	txt = txt.replace('ę','ae')
	txt = txt.replace('ſ','s')
	txt = txt.replace('&','et')

	# NOTE: Need to update this so it doesn't replace accented characters
	# non-ascii characters (https://stackoverflow.com/a/20078869/8565545)
	#txt = ''.join([i if ord(i) < 128 else '?' for i in txt])

	words_in = txt.split(' ')
	words_in = [w.strip() for w in words_in]
	words_out = []

	# 2. Combine words with -
	# Only a token that ENDS in '-' is a word broken across a line; it is
	# merged with the following token when there is one.
	i = 0
	while i < len(words_in):
	    if (
	        len(words_in[i]) > 1
	        and words_in[i].endswith('-')
	        and i + 1 < len(words_in)
	    ):
	        words_out.append(words_in[i][:-1] + words_in[i+1])
	        i += 1  # Skip next word
	    else:
	        words_out.append(words_in[i])
	    i += 1


	# 3. Check if words exist; for words that don't, try f -> s repairs

	words_in = words_out  # refresh
	words_out = []

	for w in words_in:
	    # One strip() with the full set removes any mix of these marks
	    wm = w.lower().strip(',;:?.')
	    matches = lookup.match_word(wm)
	    if len(matches) != 0:
	        words_out.append(w)
	    else:
	        # Try every combination of f -> s swaps; keep punctuation if we
	        # find something that works
	        found_match = False
	        for var_w in letter_swap_variants(w):
	            if var_w == w:
	                continue  # The unchanged word was already looked up
	            var_wm = var_w.lower().strip(',;:?.')
	            matches = lookup.match_word(var_wm)
	            if len(matches) != 0:
	                words_out.append(var_w)
	                found_match = True
	                break
	        if not found_match:
	            # Put the word back in as-is
	            words_out.append(w)
	print(' '.join(words_out))
# No results found