251 lines
8.5 KiB
Python
251 lines
8.5 KiB
Python
from collections import defaultdict
|
||
import json
|
||
import unicodedata
|
||
import regex as re
|
||
|
||
GRAVE = chr(0x300)
|
||
ACUTE = chr(0x301)
|
||
BREVE = chr(0x306)
|
||
TIE = chr(0x361)
|
||
PRIMARY = chr(0x2C8)
|
||
SECONDARY = chr(0x2CC)
|
||
TIE = chr(0x361)
|
||
FRONTED = chr(0x31F)
|
||
DOTUNDER = chr(0x323)
|
||
HYPH = chr(0x2027)
|
||
|
||
vowels = "aɤɔuɛiɐo"
|
||
vowels_c = f"[{vowels}]"
|
||
vowels_g = "[аъоуеияѝюАЪОУЕИЯЍЮ]"
|
||
cons = f"bvɡdʒzjklwmnprstfxʃɣʲ{TIE}"
|
||
cons_c = f"[{cons}]"
|
||
voiced_cons = f"bvɡdʒzɣ{TIE}"
|
||
voiced_cons_c = f"[{voiced_cons}]"
|
||
accents = PRIMARY + SECONDARY
|
||
accents_c = f"[{accents}]"
|
||
|
||
phonetic_chars_map = {
|
||
"а": "a",
|
||
"б": "b",
|
||
"в": "v",
|
||
"г": "ɡ",
|
||
"д": "d",
|
||
"е": "ɛ",
|
||
"ж": "ʒ",
|
||
"з": "z",
|
||
"и": "i",
|
||
"й": "j",
|
||
"к": "k",
|
||
"л": "l",
|
||
"м": "m",
|
||
"н": "n",
|
||
"о": "ɔ",
|
||
"п": "p",
|
||
"р": "r",
|
||
"с": "s",
|
||
"т": "t",
|
||
"у": "u",
|
||
"ў": "w",
|
||
"ф": "f",
|
||
"х": "x",
|
||
"ц": f"t{TIE}s",
|
||
"ч": f"t{TIE}ʃ",
|
||
"ш": "ʃ",
|
||
"щ": "ʃt",
|
||
"ъ": "ɤ",
|
||
"ь": "ʲ",
|
||
"ю": "ʲu",
|
||
"я": "ʲa",
|
||
GRAVE: SECONDARY,
|
||
ACUTE: PRIMARY
|
||
}
|
||
|
||
devoicing = {
|
||
"b": "p", "d": "t", "ɡ": "k",
|
||
"z": "s", "ʒ": "ʃ",
|
||
"v": "f"
|
||
}
|
||
|
||
voicing = {
|
||
"p": "b", "t": "d", "k": "ɡ",
|
||
"s": "z", "ʃ": "ʒ", "x": "ɣ",
|
||
"f": "v"
|
||
}
|
||
|
||
def count_vowels(word):
|
||
vowel_count = len(re.findall(vowels_g, word))
|
||
return vowel_count
|
||
|
||
|
||
IPA_prefixes = ["bɛz", "vɤz", "vɤzproiz", "iz", "naiz", "poiz", "prɛvɤz", "proiz", "raz"]
|
||
|
||
|
||
def rsub(word: str, pattern: str, repl) -> str:
|
||
if isinstance(repl, dict):
|
||
transl = lambda m: m.group(0).translate(str.maketrans(repl))
|
||
return re.sub(pattern, transl, word)
|
||
else:
|
||
return re.sub(pattern, repl, word)
|
||
|
||
|
||
def rsub_repeatedly(word: str, pattern: str, repl) -> str:
|
||
old = ""
|
||
while old != word:
|
||
old = word
|
||
word = rsub(word, pattern, repl)
|
||
return word
|
||
|
||
|
||
def rmatch(word: str, pattern: str) -> list[str]:
|
||
comp = re.compile(pattern)
|
||
m = comp.match(word)
|
||
if m:
|
||
return m.groups()
|
||
else:
|
||
return [None for _ in range(comp.groups)]
|
||
|
||
|
||
def toIPA(term, endschwa=False):
|
||
origterm = term
|
||
term = unicodedata.normalize("NFD", term.lower())
|
||
term = rsub(term, "у" + BREVE, "ў") # recompose ў
|
||
term = rsub(term, "и" + BREVE, "й") # recompose й
|
||
|
||
if term.find(GRAVE) != -1 and not term.find(ACUTE):
|
||
raise ValueError(f"Use acute accent, not grave accent, for primary stress: {origterm}")
|
||
|
||
# allow DOTUNDER to signal same as endschwa=1
|
||
term = rsub(term, f"а({accents_c}?){DOTUNDER}", "ъ\\1")
|
||
term = rsub(term, f"я({accents_c}?){DOTUNDER}", "ʲɤ\\1")
|
||
term = rsub(term, ".", phonetic_chars_map)
|
||
|
||
# Mark word boundaries
|
||
term = rsub(term, r"(\s+)", "#\\1#")
|
||
term = f"#{term}#"
|
||
|
||
# Convert verbal and definite endings
|
||
if endschwa:
|
||
term = rsub(term, "a(" + PRIMARY + "t?#)", "ɤ\\1")
|
||
|
||
# Change ʲ to j after vowels or word-initially
|
||
term = rsub(term, "([" + vowels + "#]" + accents_c + "?)ʲ", "\\1j")
|
||
|
||
########## Move stress #######-
|
||
|
||
# First, move leftwards over the vowel.
|
||
term = rsub(term, "(" + vowels_c + ")(" + accents_c + ")", "\\2\\1")
|
||
# Then, move leftwards over j or soft sign.
|
||
term = rsub(term, "([jʲ])(" + accents_c + ")", "\\2\\1")
|
||
# Then, move leftwards over a single consonant.
|
||
term = rsub(term, "(" + cons_c + ")(" + accents_c + ")", "\\2\\1")
|
||
# Then, move leftwards over Cl/Cr combinations where C is an obstruent (NOTE: IPA ɡ).
|
||
term = rsub(term, "([bdɡptkxfv]" + ")(" + accents_c + ")([rl])", "\\2\\1\\3")
|
||
# Then, move leftwards over kv/gv (NOTE: IPA ɡ).
|
||
term = rsub(term, "([kɡ]" + ")(" + accents_c + ")(v)", "\\2\\1\\3")
|
||
# Then, move leftwards over sC combinations, where C is a stop or resonant (NOTE: IPA ɡ).
|
||
term = rsub(term, "([sz]" + ")(" + accents_c + ")([bdɡptkvlrmn])", "\\2\\1\\3")
|
||
# Then, move leftwards over affricates not followed by a consonant.
|
||
term = rsub(term, "([td]" + TIE + "?)(" + accents_c + ")([szʃʒ][" + vowels + "ʲ])", "\\2\\1\\3")
|
||
# If we ended up in the middle of a tied affricate, move to its right.
|
||
term = rsub(term, "(" + TIE + ")(" + accents_c + ")(" + cons_c + ")", "\\1\\3\\2")
|
||
# Then, move leftwards over any remaining consonants at the beginning of a word.
|
||
term = rsub(term, "#(" + cons_c + "*)(" + accents_c + ")", "#\\2\\1")
|
||
# Then correct for known prefixes.
|
||
for prefix in IPA_prefixes:
|
||
prefix_prefix, prefix_final_cons = rmatch(prefix, "^(.*?)(" + cons_c + "*)$")
|
||
if prefix_final_cons:
|
||
# Check for accent moved too far to the left into a prefix, e.g. безбрачие accented as беˈзбрачие instead
|
||
# of безˈбрачие
|
||
term = rsub(term, "#(" + prefix_prefix + ")(" + accents_c + ")(" + prefix_final_cons + ")", "#\\1\\3\\2")
|
||
|
||
|
||
# Finally, if there is an explicit syllable boundary in the cluster of consonants where the stress is, put it there.
|
||
# First check for accent to the right of the explicit syllable boundary.
|
||
term = rsub(term, "(" + cons_c + "*)\\.(" + cons_c + "*)(" + accents_c + ")(" + cons_c + "*)", "\\1\\3\\2\\4")
|
||
# Then check for accent to the left of the explicit syllable boundary.
|
||
term = rsub(term, "(" + cons_c + "*)(" + accents_c + ")(" + cons_c + "*)\\.(" + cons_c + "*)", "\\1\\3\\2\\4")
|
||
# Finally, remove any remaining syllable boundaries.
|
||
term = rsub(term, "\\.", "")
|
||
|
||
########## Vowel reduction (in unstressed syllables) #######-
|
||
def reduce_vowel(vowel):
|
||
return rsub(vowel, "[aɔɤu]", { "a": "ɐ", "ɔ": "o", "ɤ": "ɐ", "u": "o" })
|
||
|
||
# Reduce all vowels before the stress, except if the word has no accent at all. (FIXME: This is presumably
|
||
# intended for single-syllable words without accents, but if the word is multisyllabic without accents,
|
||
# presumably all vowels should be reduced.)
|
||
def reduce_overall(m):
|
||
a, b = m.groups()
|
||
if count_vowels(origterm) <= 1:
|
||
return a + b
|
||
else:
|
||
return reduce_vowel(a) + b
|
||
|
||
term = rsub(term, "(#[^#" + accents + "]*)(.*?#)", reduce_overall)
|
||
|
||
# Reduce all vowels after the accent except the first vowel after the accent mark (which is stressed).
|
||
term = rsub(term, "(" + accents_c + "[^aɛiɔuɤ#]*[aɛiɔuɤ])([^#" + accents + "]*)", lambda m: m.group(1) + reduce_vowel(m.group(2)))
|
||
|
||
########## Vowel assimilation to adjacent consonants (fronting/raising) #######-
|
||
term = rsub(term, "([ʃʒʲj])([aouɤ])", "\\1\\2" + FRONTED)
|
||
|
||
# Hard l
|
||
term = rsub_repeatedly(term, "l([^ʲɛi])", "ɫ\\1")
|
||
|
||
|
||
# Voicing assimilation
|
||
term = rsub(term, "([bdɡzʒv" + TIE + "]*)(" + accents_c + "?[ptksʃfx#])", lambda m: rsub(m.group(1), ".", devoicing) + m.group(2))
|
||
term = rsub(term, "([ptksʃfx" + TIE + "]*)(" + accents_c + "?[bdɡzʒ])", lambda m: rsub(m.group(1), ".", voicing) + m.group(2))
|
||
term = rsub(term, "n(" + accents_c + "?[ɡk]+)", "ŋ\\1")
|
||
term = rsub(term, "m(" + accents_c + "?[fv]+)", "ɱ\\1")
|
||
|
||
# Sibilant assimilation
|
||
term = rsub(term, "[sz](" + accents_c + "?[td]?" + TIE + "?)([ʃʒ])", "\\2\\1\\2")
|
||
|
||
# Reduce consonant clusters
|
||
term = rsub(term, "([szʃʒ])[td](" + accents_c + "?)([tdknml])", "\\2\\1\\3")
|
||
|
||
# Strip hashes
|
||
term = rsub(term, "#", "")
|
||
|
||
return term
|
||
|
||
|
||
def get_rhyme(term: str) -> str:
|
||
def get_rhyme_ipa(ipa: str):
|
||
stress_index = ipa.rindex(PRIMARY)
|
||
rhyme_start_index = stress_index
|
||
while rhyme_start_index < len(ipa) and not re.match(vowels_c, ipa[rhyme_start_index]):
|
||
rhyme_start_index += 1
|
||
|
||
return f"{ipa[rhyme_start_index:]}"
|
||
|
||
return get_rhyme_ipa(toIPA(term))
|
||
|
||
|
||
# Each possible rhyme (suffix) will have a list of member words
|
||
rhymes: dict[str, list[str]] = defaultdict(list)
|
||
|
||
# Issues to consider:
|
||
# - Terms that end in a stressed vowel
|
||
# - Terms that have more than one word (space or hyphen)
|
||
# - Whether to include fronting in the IPA transcription
|
||
with open("out/words.txt") as f:
|
||
for line in f:
|
||
line = line.strip().replace("`", ACUTE)
|
||
if ACUTE not in line:
|
||
if count_vowels(line) == 1:
|
||
line = re.sub(f"({vowels_g})", "\\1" + ACUTE, line)
|
||
else:
|
||
continue
|
||
|
||
if re.search(f"{vowels_g}{ACUTE}$"):
|
||
continue
|
||
|
||
rhyme = get_rhyme(line)
|
||
rhymes[rhyme].append(line)
|
||
|
||
rhymes = {key: value for (key, value) in rhymes.items() if len(value) >= 3}
|
||
with open("rhymes.json", "w") as f:
|
||
json.dump(rhymes, f, ensure_ascii=False)
|