Skip to content

Instantly share code, notes, and snippets.

@JarbasAl
Created July 24, 2025 14:06
Show Gist options
  • Select an option

  • Save JarbasAl/80edd6dbd630f9da81def93772262d3e to your computer and use it in GitHub Desktop.

Select an option

Save JarbasAl/80edd6dbd630f9da81def93772262d3e to your computer and use it in GitHub Desktop.
Rule-based phonemizer for Mirandese
import re
import string
# Mirandese grapheme -> list of candidate phonemes.
#
# Keys are lowercase graphemes (single letters and digraphs). Each value lists
# the possible realisations; phonemize_word() uses index 0 as the default and
# picks other indices by position for the graphemes that have a context rule.
# The notes above each entry record the pronunciation rule the entry encodes.
mwl_alphabet_map = {
    "a": ["a", "ä", "ɐ"],
    "á": ["a", "ɐ̃"],
    "ai": ["aj"],
    "an": ["ɐ̃ŋ"],
    # b is [β] between vowels and after voiced consonants.
    "b": ["b", "β"],
    # c is [s̻] before e or i, and [k] elsewhere.
    "c": ["s̻", "k", "s", "z"],
    # ç is [z̻] before words starting with voiced consonants.
    "ç": ["z̻"],
    "ch": ["tʃ"],
    # d is [ð] between vowels and after r.
    "d": ["d", "ð"],
    # e is [ɨ/ɨ̃] before stressed syllables.
    "e": ["ɛ", "e", "ɨ", "ɨ̃"],
    "en": ["ẽŋ", "ɨ̃"],
    "é": ["ɛ"],
    "ei": ["ej"],
    "eu": ["ew"],
    "éu": ["ɛw"],
    "f": ["f"],
    # g is [ɣ] between vowels and after r, and [ʒ] before e and i.
    # g is [ɡu] in certain words (guira, guiron, guirica) and [gu̯] before a.
    "g": ["g", "ɣ", "ʒ", "ɡu", "gu̯"],
    # gu is [ɣ] between vowels and after r.
    "gu": ["ɣ", "g", "gu"],
    # h is silent.
    "h": [""],
    "i": ["i", "j"],
    # The second variant, ɨ̃j̃, belongs to the Sendinese dialect.
    "in": ["ĩŋ", "ɨ̃j̃"],
    "í": ["i"],
    "ia": ["ja"],
    "iê": ["je", "jê"],
    "iu": ["iw"],
    "j": ["ʒ"],
    # k only appears in loanwords from other languages.
    "k": ["k"],
    # l is [ʎ] at the beginning of words, and [l] elsewhere.
    "l": ["ɫ", "l", "ʎ"],
    "lh": ["ʎ"],
    # m is silent before nasalized front vowels, e.g. amportante.
    "m": ["m", "~"],
    # n is silent before consonants and at the end of words before nasalized
    # front vowels, e.g. lhéngua, sons, quien.
    "n": ["n", "ŋ", "~"],
    "nh": ["ɲ"],
    # o is [u] when unstressed.
    "o": ["ɔ", "o", "u", "ʊ"],
    "on": ["õŋ"],
    "ó": ["ɔ"],
    "oi": ["oj"],
    "ói": ["ɔj"],
    "ou": ["ow"],
    "p": ["p"],
    "q": ["k"],
    # qu is [k] before e and i, and [kṷ] before a and en.
    "qu": ["k", "kṷ"],
    # r is [rr] at the beginning of words and after n.
    "r": ["ɾ", "r", "rr"],
    "rr": ["r"],
    # s is [s̺] in initial position and before silent consonants; between
    # vowels and before voiced consonants it is [z̺].
    "s": ["s̺", "z̺"],
    "ss": ["s̺"],
    "t": ["t"],
    "u": ["u", "w", "ũ"],
    "un": ["ũŋ", "ʊ̃ŋ"],
    "ú": ["u"],
    "ũ": ["ũ"],
    "ua": ["wa"],
    "ui": ["uj"],
    "uo": ["wo", "u"],
    # v only appears in loanwords from other languages.
    "v": ["b", "v"],
    # w only appears in loanwords from other languages.
    "w": ["w", "b", "β"],
    "x": ["ʃ"],
    "y": ["j"],
    "z": ["z"],
}
# Characters consulted when a pronunciation rule depends on the neighbours of a
# grapheme. Both are plain strings used as character sets.
vowels = "aeiouáéíóúäɐɛɨɪɔʊ"  # Extended set of vowels for context checking
voiced_consonants = "bdgjlmnrvz"  # Approximated list of voiced consonants


def is_vowel(char):
    """Return True if ``char`` is exactly one character listed in ``vowels``.

    The comparison is case-insensitive. A plain ``char in vowels`` is a
    substring test on a ``str``, so it would also accept the empty string and
    multi-character runs such as ``"ei"``; the length check rules those out.
    """
    lowered = char.lower()
    return len(lowered) == 1 and lowered in vowels


def is_voiced_consonant(char):
    """Return True if ``char`` is exactly one character in ``voiced_consonants``.

    Case-insensitive, and rejects the empty string and multi-character input
    for the same reason as ``is_vowel``.
    """
    lowered = char.lower()
    return len(lowered) == 1 and lowered in voiced_consonants
def _select_phoneme(word, start, grapheme):
    """Pick the phoneme for ``grapheme`` located at ``word[start]``.

    ``word`` must already be lowercased and ``grapheme`` must be a key of
    ``mwl_alphabet_map``. Graphemes without a positional rule return the first
    entry of their phoneme list.
    """
    options = mwl_alphabet_map[grapheme]
    end = start + len(grapheme)
    prev = word[start - 1] if start > 0 else ""
    nxt = word[end] if end < len(word) else ""
    # An absent neighbour is "", and `"" in some_string` is always True, so the
    # helpers are only consulted when a neighbour actually exists.
    prev_is_vowel = bool(prev) and is_vowel(prev)
    next_is_vowel = bool(nxt) and is_vowel(nxt)
    between_vowels = prev_is_vowel and next_is_vowel
    before_e_or_i = nxt in ("e", "i")

    if grapheme == "b":
        # [β] between vowels and after voiced consonants, [b] otherwise.
        if between_vowels or (bool(prev) and is_voiced_consonant(prev)):
            return options[1]
        return options[0]

    if grapheme == "c":
        # [s̻] before e or i, [k] elsewhere.
        return options[0] if before_e_or_i else options[1]

    if grapheme == "d":
        # [ð] between vowels and after r, [d] otherwise.
        return options[1] if between_vowels or prev == "r" else options[0]

    if grapheme == "g":
        # [ɣ] between vowels and after r takes precedence over [ʒ] before e/i.
        # The lexical [ɡu] / [gu̯] variants need a dictionary and are not handled.
        if between_vowels or prev == "r":
            return options[1]
        if before_e_or_i:
            return options[2]
        return options[0]

    if grapheme == "gu":
        # [ɣ] between vowels (the vowel after the "u") and after r, else [g].
        return options[0] if between_vowels or prev == "r" else options[1]

    if grapheme == "l":
        # [ʎ] word-initially, [l] elsewhere.
        return options[2] if start == 0 else options[1]

    if grapheme == "m":
        # Approximation of "silent before nasalized vowels": a non-initial m
        # followed by p or b is rendered as the nasalization marker.
        if start > 0 and nxt in ("p", "b"):
            return options[1]
        return options[0]

    if grapheme == "n":
        # Pronounced only before a vowel; before anything else and at the end
        # of the word it is rendered as the nasalization marker.
        return options[0] if next_is_vowel else options[2]

    if grapheme == "qu":
        # [kṷ] before "en" and before "a"; [k] before e/i and by default.
        # "en" is tested first because a bare e otherwise selects [k].
        if word.startswith("en", end) or nxt == "a":
            return options[1]
        return options[0]

    if grapheme == "r":
        # [rr] word-initially and after n, [ɾ] elsewhere.
        return options[2] if start == 0 or prev == "n" else options[0]

    if grapheme == "s":
        # [s̺] word-initially. After a vowel, [z̺] when followed by a vowel or a
        # voiced consonant. [s̺] in every other position.
        if start == 0:
            return options[0]
        if prev_is_vowel and (next_is_vowel or (bool(nxt) and is_voiced_consonant(nxt))):
            return options[1]
        return options[0]

    # No positional rule (includes ç, e, o, u, v, w, whose stress- or
    # word-boundary-dependent variants are not modelled): use the default.
    return options[0]


def phonemize_word(word):
    """Convert one Mirandese word to a phoneme string using ``mwl_alphabet_map``.

    The word is lowercased once so that grapheme lookup and every neighbour
    check are case-insensitive. At each position the longest grapheme present
    in the map is consumed and its phoneme chosen by ``_select_phoneme``.

    Characters that match no grapheme are copied through when they are ASCII
    punctuation or whitespace and are dropped otherwise.

    Args:
        word: The text to phonemize; normally a single alphabetic word.

    Returns:
        The concatenated phonemes as a string (empty for empty input).
    """
    text = word.lower()
    # Distinct grapheme lengths, longest first, computed once per call.
    lengths = sorted({len(g) for g in mwl_alphabet_map}, reverse=True)
    passthrough = string.punctuation + string.whitespace

    phonemes = []
    i = 0
    while i < len(text):
        for length in lengths:
            grapheme = text[i:i + length]
            # The length check rejects slices truncated by the end of the word.
            if len(grapheme) == length and grapheme in mwl_alphabet_map:
                phonemes.append(_select_phoneme(text, i, grapheme))
                i += length
                break
        else:
            # No grapheme starts here.
            if text[i] in passthrough:
                phonemes.append(text[i])
            i += 1
    return "".join(phonemes)
def mirandese_phonemizer(text):
    """Phonemize a whole text, leaving everything that is not a word untouched.

    The text is cut into alternating tokens: runs of word characters, and runs
    of non-word characters or underscores. Tokens made only of letters go
    through ``phonemize_word``; every other token (spaces, punctuation, tokens
    containing digits or underscores) is copied verbatim.

    Args:
        text: The input string.

    Returns:
        The tokens re-joined in their original order.
    """
    tokens = re.findall(r"\b\w+\b|[\W_]+", text)
    return "".join(
        phonemize_word(token) if token.isalpha() else token
        for token in tokens
    )
if __name__ == "__main__":
    # Demo run: print each sample next to its phonemized form.
    demo_inputs = (
        "Muitas lhénguas ténen proua de ls sous pergaminos antigos, de la lhiteratura screbida hai cientos d'anhos i de scritores hai muito afamados, hoije bandeiras dessas lhénguas. Mas outras hai que nun puoden tener proua de nada desso, cumo ye l causo de la lhéngua mirandesa.",
        "Todos ls seres houmanos nácen lhibres i eiguales an honra i an dreitos. Dotados de rezon i de cuncéncia, dében de se dar bien uns culs outros i cumo armano",
        """Quien dirie qu'antre ls matos eiriçados
Las ourriêtas i ls rius d'esta tiêrra,
Bibie, cumo l chaugarço de la siêrra,
Ua lhéngua de sons tan bariados?
Mostre-se i fale-s' essa lhéngua filha
D'un pobo que ten neilha l choro i l canto!
Nada por ciêrto mos cautiba tanto
Cumo la form' an que l'eideia brilha.
Zgraçiado d'aquel, qu'abandonando
La patri' an que naciu, la casa i l huôrto.
Tamien se squeçe de la fala! Quando
L furdes ber, talbéç que stéia muôrto!""",
    )
    for sample in demo_inputs:
        print(f"Original: {sample}")
        # The trailing newline leaves a blank line between samples.
        print(f"Phonemized: {mirandese_phonemizer(sample)}\n")
# Original: Muitas lhénguas ténen proua de ls sous pergaminos antigos, de la lhiteratura screbida hai cientos d'anhos i de scritores hai muito afamados, hoije bandeiras dessas lhénguas. Mas outras hai que nun puoden tener proua de nada desso, cumo ye l causo de la lhéngua mirandesa.
# Phonemized: mujtas̺ ʎɛ~gas̺ tɛnẽŋ pɾowa dɛ ʎs̺ s̺ows̺ pɛɾɣamĩŋɔs̺ ɐ̃ŋtiɣɔs̺, dɛ ʎa ʎitɛɾatuɾa s̺kɾɛβiða aj s̻iẽŋtɔs̺ d'ɐ̃ŋɔs̺ i dɛ s̺kɾitɔɾɛs̺ aj mujtɔ afamaðɔs̺, ojʒɛ bɐ̃ŋdejɾas̺ dɛs̺as̺ ʎɛ~gas̺. mas̺ owtɾas̺ aj kɛ nũŋ pwoðẽŋ tẽŋɛɾ pɾowa dɛ naða dɛs̺ɔ, kumɔ jɛ ʎ kauz̺ɔ dɛ ʎa ʎɛ~ga miɾɐ̃ŋdɛz̺a.
#
# Original: Todos ls seres houmanos nácen lhibres i eiguales an honra i an dreitos. Dotados de rezon i de cuncéncia, dében de se dar bien uns culs outros i cumo armano
# Phonemized: tɔðɔs̺ ʎs̺ s̺ɛɾɛs̺ owmɐ̃ŋɔs̺ nas̻ẽŋ ʎibɾɛs̺ i ejɣalɛs̺ ɐ̃ŋ õŋrra i ɐ̃ŋ dɾejtɔs̺. dɔtaðɔs̺ dɛ rrɛzõŋ i dɛ kũŋkɛ~s̻ja, dɛβẽŋ dɛ s̺ɛ daɾ biẽŋ ũŋs̺ kuls̺ owtɾɔs̺ i kumɔ aɾmɐ̃ŋɔ
#
# Original: Quien dirie qu'antre ls matos eiriçados
# Las ourriêtas i ls rius d'esta tiêrra,
# Bibie, cumo l chaugarço de la siêrra,
# Ua lhéngua de sons tan bariados?
#
# Mostre-se i fale-s' essa lhéngua filha
# D'un pobo que ten neilha l choro i l canto!
# Nada por ciêrto mos cautiba tanto
# Cumo la form' an que l'eideia brilha.
#
# Zgraçiado d'aquel, qu'abandonando
# La patri' an que naciu, la casa i l huôrto.
# Tamien se squeçe de la fala! Quando
# L furdes ber, talbéç que stéia muôrto!
# Phonemized: kiẽŋ diɾiɛ k'ɐ̃ŋtɾɛ ʎs̺ matɔs̺ ejɾiz̻aðɔs̺
# ʎas̺ owrjetas̺ i ʎs̺ rriws̺ d'ɛs̺ta tjera,
# biβiɛ, kumɔ ʎ tʃauɣaɾz̻ɔ dɛ ʎa s̺jera,
# wa ʎɛ~ga dɛ s̺õŋs̺ tɐ̃ŋ baɾjaðɔs̺?
#
# mɔs̺tɾɛ-s̺ɛ i falɛ-s̺' ɛs̺a ʎɛ~ga fiʎa
# d'ũŋ pɔβɔ kɛ tẽŋ nejʎa ʎ tʃɔɾɔ i ʎ kɐ̃ŋtɔ!
# naða pɔɾ s̻jeɾtɔ mɔs̺ kautiβa tɐ̃ŋtɔ
# kumɔ ʎa fɔɾm' ɐ̃ŋ kɛ ʎ'ejðeja bɾiʎa.
#
# zgɾaz̻jaðɔ d'akɛl, k'aβɐ̃ŋdõŋɐ̃ŋdɔ
# ʎa patɾi' ɐ̃ŋ kɛ nas̻iw, ʎa kaz̺a i ʎ uɾtɔ.
# tamiẽŋ s̺ɛ s̺kɛz̻ɛ dɛ ʎa fala! kṷɐ̃ŋdɔ
# ʎ fuɾðɛs̺ bɛɾ, talβɛz̻ kɛ s̺tɛja muɾtɔ!
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment