kovachev-bot/rhymes/rhyme_generator.py
2023-12-16 20:36:09 +00:00

251 lines
8.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from collections import defaultdict
import json
import unicodedata
import regex as re
# Combining diacritics and IPA marks, written as code points so that visually
# confusable characters never have to appear literally in the source.
GRAVE = chr(0x300)      # combining grave accent
ACUTE = chr(0x301)      # combining acute accent
BREVE = chr(0x306)      # combining breve
TIE = chr(0x361)        # combining tie bar used inside affricates
PRIMARY = chr(0x2C8)    # IPA primary stress mark
SECONDARY = chr(0x2CC)  # IPA secondary stress mark
FRONTED = chr(0x31F)    # combining plus sign below
DOTUNDER = chr(0x323)   # combining dot below
HYPH = chr(0x2027)      # hyphenation point

# IPA vowels, and the same set as a regex character class.
vowels = "aɤɔuɛiɐo"
vowels_c = f"[{vowels}]"
# Cyrillic vowel letters (both cases) as a regex character class.
vowels_g = "[аъоуеияѝюАЪОУЕИЯЍЮ]"
# IPA consonant symbols; the tie bar is included so affricates count as consonantal.
cons = f"bvɡdʒzjklwmnprstfxʃɣʲ{TIE}"
cons_c = f"[{cons}]"
voiced_cons = f"bvɡdʒzɣ{TIE}"
voiced_cons_c = f"[{voiced_cons}]"
# Both stress marks, and the same set as a regex character class.
accents = PRIMARY + SECONDARY
accents_c = f"[{accents}]"
# Per-character translation table: Cyrillic letter (or combining accent) on the
# left, its IPA replacement on the right.
phonetic_chars_map = {
    "а": "a", "б": "b", "в": "v", "г": "ɡ", "д": "d", "е": "ɛ",
    "ж": "ʒ", "з": "z", "и": "i", "й": "j", "к": "k", "л": "l",
    "м": "m", "н": "n", "о": "ɔ", "п": "p", "р": "r", "с": "s",
    "т": "t", "у": "u", "ў": "w", "ф": "f", "х": "x",
    # Affricates: two symbols joined by the tie bar.
    "ц": "t" + TIE + "s",
    "ч": "t" + TIE + "ʃ",
    "ш": "ʃ", "щ": "ʃt", "ъ": "ɤ",
    # Soft sign and the letters that carry a palatalization mark.
    "ь": "ʲ", "ю": "ʲu", "я": "ʲa",
    # Combining accents turn into IPA stress marks.
    GRAVE: SECONDARY,
    ACUTE: PRIMARY,
}
# Voiced obstruent -> voiceless counterpart.
devoicing = dict(zip("bdɡzʒv", "ptksʃf"))
# Voiceless obstruent -> voiced counterpart (x additionally maps to ɣ).
voicing = dict(zip("ptksʃxf", "bdɡzʒɣv"))
def count_vowels(word):
    """Return the number of characters in ``word`` that match ``vowels_g``."""
    return sum(1 for _ in re.finditer(vowels_g, word))
# Known prefixes, in IPA form, that toIPA consults when correcting the
# position of a stress mark.
IPA_prefixes = [
    "bɛz",
    "vɤz",
    "vɤzproiz",
    "iz",
    "naiz",
    "poiz",
    "prɛvɤz",
    "proiz",
    "raz",
]
def rsub(word: str, pattern: str, repl) -> str:
    """Substitute every match of ``pattern`` in ``word``.

    Args:
        word: The string to operate on.
        pattern: Regular expression to search for.
        repl: Either anything ``re.sub`` accepts as a replacement (a template
            string or a callable taking the match object), or a dict. A dict
            is treated as a per-character translation table: each matched
            substring is passed through ``str.translate`` with it, so
            characters that are not keys are left untouched.

    Returns:
        The string with all substitutions applied.
    """
    if isinstance(repl, dict):
        # Build the translation table once per call instead of once per match.
        # As a consequence, a dict that str.maketrans rejects now raises even
        # when nothing matches.
        table = str.maketrans(repl)

        def translate_match(m):
            return m.group(0).translate(table)

        return re.sub(pattern, translate_match, word)
    return re.sub(pattern, repl, word)
def rsub_repeatedly(word: str, pattern: str, repl) -> str:
    """Apply ``rsub`` over and over until the string stops changing.

    The arguments are the same as for ``rsub``. An empty ``word`` is returned
    as-is without any substitution being attempted, because the loop's
    initial "previous" value is the empty string.
    """
    previous = ""
    while word != previous:
        # Remember the current value, then replace it with the next pass.
        previous, word = word, rsub(word, pattern, repl)
    return word
def rmatch(word: str, pattern: str) -> list[str]:
    """Match ``pattern`` at the start of ``word`` and return its capture groups.

    Args:
        word: The string to match against.
        pattern: Regular expression containing zero or more capture groups.

    Returns:
        A list with one entry per capture group in ``pattern``. On a match
        the entries are the captured substrings (``None`` for a group that
        did not participate); when there is no match every entry is ``None``.
        The result is a list on both paths, so callers can unpack it the same
        way whether or not the pattern matched.
    """
    comp = re.compile(pattern)
    m = comp.match(word)
    if m is None:
        return [None] * comp.groups
    # m.groups() is a tuple; convert so both paths return the annotated type.
    return list(m.groups())
def toIPA(term, endschwa=False):
    """Convert an accented Cyrillic term into an IPA transcription.

    The term is lower-cased and NFD-normalized, mapped character by character
    through ``phonetic_chars_map``, and then passed through a fixed sequence
    of regex rewrites: stress-mark placement, reduction of unstressed vowels,
    vowel fronting, hard l, voicing assimilation, nasal and sibilant
    assimilation, and consonant-cluster reduction. The order of the rewrites
    matters; each one operates on the output of the previous one.
    (Presumably the input is Bulgarian, judging by the letter inventory and
    the example in the comments; confirm against the caller.)

    Args:
        term: The word or phrase, with stress marked by combining accents
            (acute for primary stress, grave for secondary stress).
        endschwa: When true, a stressed "a" at the very end of a word
            (optionally followed by "t") is transcribed as ɤ instead.

    Returns:
        The IPA string, with the internal ``#`` word-boundary markers removed.

    Raises:
        ValueError: If the term contains a grave accent but no acute accent.
    """
    origterm = term
    term = unicodedata.normalize("NFD", term.lower())
    term = rsub(term, "у" + BREVE, "ў")  # recompose ў
    term = rsub(term, "и" + BREVE, "й")  # recompose й
    # Membership tests rather than str.find(): find() returns -1 (truthy) on
    # a miss, so "not term.find(ACUTE)" was only true for ACUTE at index 0.
    if GRAVE in term and ACUTE not in term:
        raise ValueError(f"Use acute accent, not grave accent, for primary stress: {origterm}")
    # allow DOTUNDER to signal same as endschwa=1
    term = rsub(term, f"а({accents_c}?){DOTUNDER}", "ъ\\1")
    term = rsub(term, f"я({accents_c}?){DOTUNDER}", "ʲɤ\\1")
    term = rsub(term, ".", phonetic_chars_map)
    # Mark word boundaries
    term = rsub(term, r"(\s+)", "#\\1#")
    term = f"#{term}#"
    # Convert verbal and definite endings
    if endschwa:
        term = rsub(term, "a(" + PRIMARY + "t?#)", "ɤ\\1")
    # Change ʲ to j after vowels or word-initially
    term = rsub(term, "([" + vowels + "#]" + accents_c + "?)ʲ", "\\1j")

    # ---------- Move stress ----------
    # First, move leftwards over the vowel.
    term = rsub(term, "(" + vowels_c + ")(" + accents_c + ")", "\\2\\1")
    # Then, move leftwards over j or soft sign.
    term = rsub(term, "([jʲ])(" + accents_c + ")", "\\2\\1")
    # Then, move leftwards over a single consonant.
    term = rsub(term, "(" + cons_c + ")(" + accents_c + ")", "\\2\\1")
    # Then, move leftwards over Cl/Cr combinations where C is an obstruent (NOTE: IPA ɡ).
    term = rsub(term, "([bdɡptkxfv]" + ")(" + accents_c + ")([rl])", "\\2\\1\\3")
    # Then, move leftwards over kv/gv (NOTE: IPA ɡ).
    term = rsub(term, "([kɡ]" + ")(" + accents_c + ")(v)", "\\2\\1\\3")
    # Then, move leftwards over sC combinations, where C is a stop or resonant (NOTE: IPA ɡ).
    term = rsub(term, "([sz]" + ")(" + accents_c + ")([bdɡptkvlrmn])", "\\2\\1\\3")
    # Then, move leftwards over affricates not followed by a consonant.
    term = rsub(term, "([td]" + TIE + "?)(" + accents_c + ")([szʃʒ][" + vowels + "ʲ])", "\\2\\1\\3")
    # If we ended up in the middle of a tied affricate, move to its right.
    term = rsub(term, "(" + TIE + ")(" + accents_c + ")(" + cons_c + ")", "\\1\\3\\2")
    # Then, move leftwards over any remaining consonants at the beginning of a word.
    term = rsub(term, "#(" + cons_c + "*)(" + accents_c + ")", "#\\2\\1")
    # Then correct for known prefixes.
    for prefix in IPA_prefixes:
        prefix_prefix, prefix_final_cons = rmatch(prefix, "^(.*?)(" + cons_c + "*)$")
        if prefix_final_cons:
            # Check for accent moved too far to the left into a prefix, e.g. безбрачие accented as беˈзбрачие instead
            # of безˈбрачие
            term = rsub(term, "#(" + prefix_prefix + ")(" + accents_c + ")(" + prefix_final_cons + ")", "#\\1\\3\\2")
    # Finally, if there is an explicit syllable boundary in the cluster of consonants where the stress is, put it there.
    # First check for accent to the right of the explicit syllable boundary.
    term = rsub(term, "(" + cons_c + "*)\\.(" + cons_c + "*)(" + accents_c + ")(" + cons_c + "*)", "\\1\\3\\2\\4")
    # Then check for accent to the left of the explicit syllable boundary.
    term = rsub(term, "(" + cons_c + "*)(" + accents_c + ")(" + cons_c + "*)\\.(" + cons_c + "*)", "\\1\\3\\2\\4")
    # Finally, remove any remaining syllable boundaries.
    term = rsub(term, "\\.", "")

    # ---------- Vowel reduction (in unstressed syllables) ----------
    def reduce_vowel(vowel):
        return rsub(vowel, "[aɔɤu]", {"a": "ɐ", "ɔ": "o", "ɤ": "ɐ", "u": "o"})

    # Reduce all vowels before the stress, except if the word has no accent at all. (FIXME: This is presumably
    # intended for single-syllable words without accents, but if the word is multisyllabic without accents,
    # presumably all vowels should be reduced.)
    def reduce_overall(m):
        a, b = m.groups()
        if count_vowels(origterm) <= 1:
            return a + b
        else:
            return reduce_vowel(a) + b

    term = rsub(term, "(#[^#" + accents + "]*)(.*?#)", reduce_overall)
    # Reduce all vowels after the accent except the first vowel after the accent mark (which is stressed).
    term = rsub(term, "(" + accents_c + "[^aɛiɔuɤ#]*[aɛiɔuɤ])([^#" + accents + "]*)", lambda m: m.group(1) + reduce_vowel(m.group(2)))

    # ---------- Vowel assimilation to adjacent consonants (fronting/raising) ----------
    term = rsub(term, "([ʃʒʲj])([aouɤ])", "\\1\\2" + FRONTED)
    # Hard l
    term = rsub_repeatedly(term, "l([^ʲɛi])", "ɫ\\1")
    # Voicing assimilation
    term = rsub(term, "([bdɡzʒv" + TIE + "]*)(" + accents_c + "?[ptksʃfx#])", lambda m: rsub(m.group(1), ".", devoicing) + m.group(2))
    term = rsub(term, "([ptksʃfx" + TIE + "]*)(" + accents_c + "?[bdɡzʒ])", lambda m: rsub(m.group(1), ".", voicing) + m.group(2))
    term = rsub(term, "n(" + accents_c + "?[ɡk]+)", "ŋ\\1")
    term = rsub(term, "m(" + accents_c + "?[fv]+)", "ɱ\\1")
    # Sibilant assimilation
    term = rsub(term, "[sz](" + accents_c + "?[td]?" + TIE + "?)([ʃʒ])", "\\2\\1\\2")
    # Reduce consonant clusters
    term = rsub(term, "([szʃʒ])[td](" + accents_c + "?)([tdknml])", "\\2\\1\\3")
    # Strip hashes
    term = rsub(term, "#", "")
    return term
def get_rhyme(term: str) -> str:
    """Return the rhyming tail of ``term`` in IPA.

    The term is transcribed with ``toIPA``; the result is the part of that
    transcription starting at the first vowel found at or after the last
    primary stress mark. If no vowel follows the mark, the result is the
    empty string.

    Raises:
        ValueError: If the transcription has no primary stress mark
            (raised by ``str.rindex``).
    """
    ipa = toIPA(term)
    tail = ipa[ipa.rindex(PRIMARY):]
    for offset, symbol in enumerate(tail):
        if re.match(vowels_c, symbol):
            return tail[offset:]
    return ""
# Each possible rhyme (suffix) will have a list of member words
rhymes: dict[str, list[str]] = defaultdict(list)
# Issues to consider:
# - Terms that end in a stressed vowel
# - Terms that have more than one word (space or hyphen)
# - Whether to include fronting in the IPA transcription
# The word list holds Cyrillic text, so read it as UTF-8 explicitly instead of
# relying on the platform's default encoding.
with open("out/words.txt", encoding="utf-8") as f:
    for line in f:
        # A backtick in the word list stands for the acute (stress) accent.
        line = line.strip().replace("`", ACUTE)
        if ACUTE not in line:
            # Unaccented words are only usable when they have exactly one
            # vowel, in which case that vowel is given the stress.
            if count_vowels(line) != 1:
                continue
            line = re.sub(f"({vowels_g})", "\\1" + ACUTE, line)
        # Skip terms that end in a stressed vowel. The search must be given
        # the line itself; calling re.search with the pattern alone raises
        # TypeError.
        if re.search(f"{vowels_g}{ACUTE}$", line):
            continue
        rhyme = get_rhyme(line)
        rhymes[rhyme].append(line)
# Keep only rhymes shared by at least three words.
rhymes = {key: value for (key, value) in rhymes.items() if len(value) >= 3}
# ensure_ascii=False emits raw non-ASCII characters, so the output file must be
# opened with an encoding that can represent them.
with open("rhymes.json", "w", encoding="utf-8") as f:
    json.dump(rhymes, f, ensure_ascii=False)