Created
July 24, 2025 14:06
-
-
Save JarbasAl/80edd6dbd630f9da81def93772262d3e to your computer and use it in GitHub Desktop.
Rule-based phonemizer for Mirandese
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| import string | |
# Grapheme -> candidate phonemes for Mirandese (ISO 639-3 code "mwl").
#
# Keys are lowercase graphemes of one or more characters; values list the
# possible realisations.  Index 0 is the default; the other indexes are picked
# by the context rules quoted in the comments next to each entry.  "~" stands
# in for a silent nasal consonant, and "" marks a silent letter.
mwl_alphabet_map = {
    "a": ["a", "ä", "ɐ"],
    "á": ["a", "ɐ̃"],
    "ai": ["aj"],
    "an": ["ɐ̃ŋ"],
    # b = [β] between vowels and after voiced consonants
    "b": ["b", "β"],
    # c = [s̻] before e or i, [k] elsewhere
    "c": ["s̻", "k", "s", "z"],
    # ç = [z̻] before words starting with voiced consonants
    "ç": ["z̻"],
    "ch": ["tʃ"],
    # d = [ð] between vowels and after r
    "d": ["d", "ð"],
    # e = [ɨ/ɨ̃] before stressed syllables
    "e": ["ɛ", "e", "ɨ", "ɨ̃"],
    "en": ["ẽŋ", "ɨ̃"],
    "é": ["ɛ"],
    "ei": ["ej"],
    "eu": ["ew"],
    "éu": ["ɛw"],
    "f": ["f"],
    # g = [ɣ] between vowels and after r.  Before e and i, g = [ʒ].
    # g = [ɡu] in certain words, such as guira, guiron and guirica.
    # g = [gu̯] before a
    "g": ["g", "ɣ", "ʒ", "ɡu", "gu̯"],
    # gu = [ɣ] between vowels and after r
    "gu": ["ɣ", "g", "gu"],
    # silent
    "h": [""],
    "i": ["i", "j"],
    # ɨ̃j̃ (Sendinese dialect)
    "in": ["ĩŋ", "ɨ̃j̃"],
    "í": ["i"],
    "ia": ["ja"],
    "iê": ["je", "jê"],
    "iu": ["iw"],
    "j": ["ʒ"],
    # k is only used in loanwords from other languages
    "k": ["k"],
    # l = [ʎ] at the beginning of words, and [l] elsewhere
    "l": ["ɫ", "l", "ʎ"],
    "lh": ["ʎ"],
    # m is silent before nasalized front vowels, e.g. amportante
    "m": ["m", "~"],
    # n is silent before consonants and at the end of words before nasalized
    # front vowels, e.g. lhéngua, sons, quien
    "n": ["n", "ŋ", "~"],
    "nh": ["ɲ"],
    # o = [u] when unstressed
    "o": ["ɔ", "o", "u", "ʊ"],
    "on": ["õŋ"],
    "ó": ["ɔ"],
    "oi": ["oj"],
    "ói": ["ɔj"],
    "ou": ["ow"],
    "p": ["p"],
    "q": ["k"],
    # qu = [k] before e and i, and [kṷ] before a and en
    "qu": ["k", "kṷ"],
    # r = [rr] at the beginning of words and after n
    "r": ["ɾ", "r", "rr"],
    "rr": ["r"],
    # s = [s̺] when in initial position and before silent consonants.
    # Between vowels and before voiced consonants, s = [z̺]
    "s": ["s̺", "z̺"],
    "ss": ["s̺"],
    "t": ["t"],
    "u": ["u", "w", "ũ"],
    "un": ["ũŋ", "ʊ̃ŋ"],
    "ú": ["u"],
    "ũ": ["ũ"],
    "ua": ["wa"],
    "ui": ["uj"],
    "uo": ["wo", "u"],
    # v is only used in loanwords from other languages
    "v": ["b", "v"],
    # w is only used in loanwords from other languages
    "w": ["w", "b", "β"],
    "x": ["ʃ"],
    "y": ["j"],
    "z": ["z"],
}
vowels = "aeiouáéíóúäɐɛɨɪɔʊ"  # Extended set of vowels for context checking
voiced_consonants = "bdgjlmnrvz"  # Approximated list of voiced consonants


def is_vowel(char):
    """Return True when ``char`` is exactly one character listed in ``vowels``.

    The comparison is case-insensitive.  Empty and multi-character strings
    are rejected explicitly: a plain ``in`` test against a string is a
    substring search, so ``""`` and ``"ae"`` would otherwise count as vowels.
    """
    folded = char.lower()
    return len(folded) == 1 and folded in vowels


def is_voiced_consonant(char):
    """Return True when ``char`` is exactly one character of ``voiced_consonants``.

    Case-insensitive; empty and multi-character strings return False for the
    same substring-search reason as in ``is_vowel``.
    """
    folded = char.lower()
    return len(folded) == 1 and folded in voiced_consonants
def _select_phoneme(word, start, grapheme):
    """Pick the realisation of ``grapheme`` found at ``word[start]``.

    Args:
        word: The whole word, already lowercased.
        start: Index in ``word`` where ``grapheme`` begins.
        grapheme: A key of ``mwl_alphabet_map`` equal to
            ``word[start:start + len(grapheme)]``.

    Returns:
        One entry of ``mwl_alphabet_map[grapheme]``, chosen from the
        characters immediately before and after the grapheme.  Graphemes
        without a context rule (including "e", "o" and "u", whose rules need
        stress information that is not available here) return the first entry.
    """
    options = mwl_alphabet_map[grapheme]
    size = len(word)
    before = start - 1
    after = start + len(grapheme)

    def char_at(index):
        # Out-of-range positions yield "".  Callers compare the result against
        # tuples, never strings, because `"" in "ei"` is True in Python.
        return word[index] if 0 <= index < size else ""

    def vowel_at(index):
        return 0 <= index < size and is_vowel(word[index])

    def voiced_at(index):
        return 0 <= index < size and is_voiced_consonant(word[index])

    between_vowels = vowel_at(before) and vowel_at(after)
    after_r = char_at(before) == "r"
    before_e_or_i = char_at(after) in ("e", "i")

    if grapheme == "b":
        # [β] between vowels and after voiced consonants, [b] otherwise.
        lenited = between_vowels or voiced_at(before)
        return options[1] if lenited else options[0]
    if grapheme == "c":
        # [s̻] before e or i, [k] elsewhere.
        return options[0] if before_e_or_i else options[1]
    if grapheme == "d":
        # [ð] between vowels and after r, [d] otherwise.
        return options[1] if between_vowels or after_r else options[0]
    if grapheme == "g":
        # [ɣ] between vowels and after r; [ʒ] before e and i; [g] otherwise.
        # The lexical [ɡu] / [gu̯] variants would need a dictionary.
        if between_vowels or after_r:
            return options[1]
        if before_e_or_i:
            return options[2]
        return options[0]
    if grapheme == "gu":
        # [ɣ] between vowels (looking past the "u") and after r, else [g].
        return options[0] if between_vowels or after_r else options[1]
    if grapheme == "l":
        # [ʎ] word-initially, [l] elsewhere.
        return options[2] if start == 0 else options[1]
    if grapheme == "m":
        # Approximation of "silent before nasalized front vowels": a
        # non-initial m directly followed by p or b becomes the "~" marker.
        silent = start > 0 and char_at(after) in ("p", "b")
        return options[1] if silent else options[0]
    if grapheme == "n":
        # Silent ("~") before any non-vowel and at the end of the word.
        return options[0] if vowel_at(after) else options[2]
    if grapheme == "qu":
        # [k] before e and i; [kṷ] when the next character is "a" or "n".
        # NOTE(review): the quoted rule says "before a and en", but an "e"
        # always takes the [k] reading here, and a bare "n" triggers [kṷ].
        return options[1] if char_at(after) in ("a", "n") else options[0]
    if grapheme == "r":
        # [rr] word-initially and after n, [ɾ] elsewhere.
        trilled = start == 0 or char_at(before) == "n"
        return options[2] if trilled else options[0]
    if grapheme == "s":
        # [z̺] after a vowel when followed by a vowel or a voiced consonant;
        # [s̺] word-initially and everywhere else.  The voiced-consonant case
        # must be tested here rather than after a generic "before any
        # consonant" check, which would make it unreachable.
        voiced = vowel_at(before) and (vowel_at(after) or voiced_at(after))
        return options[1] if voiced else options[0]
    return options[0]


def phonemize_word(word):
    """Convert one Mirandese word to a phoneme string using spelling rules.

    The word is lowercased once, so capitalisation never affects the result.
    It is then scanned left to right; at each position the longest grapheme
    present in ``mwl_alphabet_map`` is consumed and turned into a phoneme by
    ``_select_phoneme``.

    Characters that match no grapheme are copied through when they are ASCII
    punctuation or whitespace and are dropped otherwise.  The output can
    contain "~", the map's marker for a silent nasal consonant.

    Args:
        word: The word to phonemize.

    Returns:
        The concatenated phonemes; an empty string for empty input.
    """
    word = word.lower()
    size = len(word)
    # Distinct grapheme lengths, longest first, computed once per call
    # instead of once per character.
    lengths = sorted({len(key) for key in mwl_alphabet_map}, reverse=True)
    passthrough = string.punctuation + string.whitespace

    phonemes = []
    i = 0
    while i < size:
        grapheme = next(
            (
                word[i:i + length]
                for length in lengths
                if i + length <= size and word[i:i + length] in mwl_alphabet_map
            ),
            None,
        )
        if grapheme is None:
            if word[i] in passthrough:
                phonemes.append(word[i])  # Keep punctuation as is
            i += 1
            continue
        phonemes.append(_select_phoneme(word, i, grapheme))
        i += len(grapheme)
    return "".join(phonemes)
def mirandese_phonemizer(text):
    """Phonemize running Mirandese text word by word.

    The text is split into alternating runs of word characters and of
    non-word characters (plus underscores), so every input character lands in
    exactly one token.  Purely alphabetic tokens go through
    ``phonemize_word``; all other tokens (spaces, punctuation, and words
    containing digits or underscores) are copied unchanged.

    Each word is phonemized in isolation, so rules that depend on the
    following word are not applied.

    Args:
        text: The text to convert.

    Returns:
        The text with each alphabetic word replaced by its phoneme string.
    """
    # Split by words and keep punctuation/spaces
    tokens = re.findall(r"\b\w+\b|[\W_]+", text)
    return "".join(
        phonemize_word(token) if token.isalpha() else token
        for token in tokens
    )
if __name__ == "__main__":
    # Demo: print each sample next to its phonemization.  The poem keeps its
    # line breaks and the blank lines between stanzas, because the phonemizer
    # copies whitespace through unchanged.
    sample_texts = [
        "Muitas lhénguas ténen proua de ls sous pergaminos antigos, de la lhiteratura screbida hai cientos d'anhos i de scritores hai muito afamados, hoije bandeiras dessas lhénguas. Mas outras hai que nun puoden tener proua de nada desso, cumo ye l causo de la lhéngua mirandesa.",
        "Todos ls seres houmanos nácen lhibres i eiguales an honra i an dreitos. Dotados de rezon i de cuncéncia, dében de se dar bien uns culs outros i cumo armano",
        """Quien dirie qu'antre ls matos eiriçados
Las ourriêtas i ls rius d'esta tiêrra,
Bibie, cumo l chaugarço de la siêrra,
Ua lhéngua de sons tan bariados?

Mostre-se i fale-s' essa lhéngua filha
D'un pobo que ten neilha l choro i l canto!
Nada por ciêrto mos cautiba tanto
Cumo la form' an que l'eideia brilha.

Zgraçiado d'aquel, qu'abandonando
La patri' an que naciu, la casa i l huôrto.
Tamien se squeçe de la fala! Quando
L furdes ber, talbéç que stéia muôrto!""",
    ]
    for text in sample_texts:
        print(f"Original: {text}")
        print(f"Phonemized: {mirandese_phonemizer(text)}\n")
| # Original: Muitas lhénguas ténen proua de ls sous pergaminos antigos, de la lhiteratura screbida hai cientos d'anhos i de scritores hai muito afamados, hoije bandeiras dessas lhénguas. Mas outras hai que nun puoden tener proua de nada desso, cumo ye l causo de la lhéngua mirandesa. | |
| # Phonemized: mujtas̺ ʎɛ~gas̺ tɛnẽŋ pɾowa dɛ ʎs̺ s̺ows̺ pɛɾɣamĩŋɔs̺ ɐ̃ŋtiɣɔs̺, dɛ ʎa ʎitɛɾatuɾa s̺kɾɛβiða aj s̻iẽŋtɔs̺ d'ɐ̃ŋɔs̺ i dɛ s̺kɾitɔɾɛs̺ aj mujtɔ afamaðɔs̺, ojʒɛ bɐ̃ŋdejɾas̺ dɛs̺as̺ ʎɛ~gas̺. mas̺ owtɾas̺ aj kɛ nũŋ pwoðẽŋ tẽŋɛɾ pɾowa dɛ naða dɛs̺ɔ, kumɔ jɛ ʎ kauz̺ɔ dɛ ʎa ʎɛ~ga miɾɐ̃ŋdɛz̺a. | |
| # | |
| # Original: Todos ls seres houmanos nácen lhibres i eiguales an honra i an dreitos. Dotados de rezon i de cuncéncia, dében de se dar bien uns culs outros i cumo armano | |
| # Phonemized: tɔðɔs̺ ʎs̺ s̺ɛɾɛs̺ owmɐ̃ŋɔs̺ nas̻ẽŋ ʎibɾɛs̺ i ejɣalɛs̺ ɐ̃ŋ õŋrra i ɐ̃ŋ dɾejtɔs̺. dɔtaðɔs̺ dɛ rrɛzõŋ i dɛ kũŋkɛ~s̻ja, dɛβẽŋ dɛ s̺ɛ daɾ biẽŋ ũŋs̺ kuls̺ owtɾɔs̺ i kumɔ aɾmɐ̃ŋɔ | |
| # | |
| # Original: Quien dirie qu'antre ls matos eiriçados | |
| # Las ourriêtas i ls rius d'esta tiêrra, | |
| # Bibie, cumo l chaugarço de la siêrra, | |
| # Ua lhéngua de sons tan bariados? | |
| # | |
| # Mostre-se i fale-s' essa lhéngua filha | |
| # D'un pobo que ten neilha l choro i l canto! | |
| # Nada por ciêrto mos cautiba tanto | |
| # Cumo la form' an que l'eideia brilha. | |
| # | |
| # Zgraçiado d'aquel, qu'abandonando | |
| # La patri' an que naciu, la casa i l huôrto. | |
| # Tamien se squeçe de la fala! Quando | |
| # L furdes ber, talbéç que stéia muôrto! | |
| # Phonemized: kiẽŋ diɾiɛ k'ɐ̃ŋtɾɛ ʎs̺ matɔs̺ ejɾiz̻aðɔs̺ | |
| # ʎas̺ owrjetas̺ i ʎs̺ rriws̺ d'ɛs̺ta tjera, | |
| # biβiɛ, kumɔ ʎ tʃauɣaɾz̻ɔ dɛ ʎa s̺jera, | |
| # wa ʎɛ~ga dɛ s̺õŋs̺ tɐ̃ŋ baɾjaðɔs̺? | |
| # | |
| # mɔs̺tɾɛ-s̺ɛ i falɛ-s̺' ɛs̺a ʎɛ~ga fiʎa | |
| # d'ũŋ pɔβɔ kɛ tẽŋ nejʎa ʎ tʃɔɾɔ i ʎ kɐ̃ŋtɔ! | |
| # naða pɔɾ s̻jeɾtɔ mɔs̺ kautiβa tɐ̃ŋtɔ | |
| # kumɔ ʎa fɔɾm' ɐ̃ŋ kɛ ʎ'ejðeja bɾiʎa. | |
| # | |
| # zgɾaz̻jaðɔ d'akɛl, k'aβɐ̃ŋdõŋɐ̃ŋdɔ | |
| # ʎa patɾi' ɐ̃ŋ kɛ nas̻iw, ʎa kaz̺a i ʎ uɾtɔ. | |
| # tamiẽŋ s̺ɛ s̺kɛz̻ɛ dɛ ʎa fala! kṷɐ̃ŋdɔ | |
| # ʎ fuɾðɛs̺ bɛɾ, talβɛz̻ kɛ s̺tɛja muɾtɔ! |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment