kovachev-bot/rhymes/rhyme_generator.py

from collections import defaultdict
import json
import unicodedata
import regex as re

# Combining marks expected in the (NFD-normalized) Cyrillic input.
GRAVE = chr(0x300)     # COMBINING GRAVE ACCENT: mapped to SECONDARY stress
ACUTE = chr(0x301)     # COMBINING ACUTE ACCENT: mapped to PRIMARY stress
BREVE = chr(0x306)     # COMBINING BREVE: recomposed with у / и in toIPA
DOTUNDER = chr(0x323)  # COMBINING DOT BELOW

# Symbols used in the IPA output.
PRIMARY = chr(0x2C8)    # MODIFIER LETTER VERTICAL LINE (primary stress)
SECONDARY = chr(0x2CC)  # MODIFIER LETTER LOW VERTICAL LINE (secondary stress)
TIE = chr(0x361)        # COMBINING DOUBLE INVERTED BREVE (affricate tie bar)
FRONTED = chr(0x31F)    # COMBINING PLUS SIGN BELOW (fronted vowel)

HYPH = chr(0x2027)      # HYPHENATION POINT

# Character inventories. Each "*_c" name is the matching regex character class.

# IPA vowels, including the reduced forms produced by vowel reduction.
vowels = "aɤɔuɛiɐo"
vowels_c = "[" + vowels + "]"

# Cyrillic vowel letters (both cases), used on the untranscribed text.
vowels_g = "[аъоуеияѝюАЪОУЕИЯЍЮ]"

# IPA consonants; the palatalization mark and the tie bar are included so that
# a class built from this string also matches them.
cons = "bvɡdʒzjklwmnprstfxʃɣʲ" + TIE
cons_c = "[" + cons + "]"

voiced_cons = "bvɡdʒzɣ" + TIE
voiced_cons_c = "[" + voiced_cons + "]"

# Both IPA stress marks.
accents = "".join((PRIMARY, SECONDARY))
accents_c = "[" + accents + "]"

# Letter-by-letter Cyrillic -> IPA table. It is applied through
# str.maketrans() in rsub(), so every key must be a single code point.
phonetic_chars_map = {
    "а": "a", "б": "b", "в": "v", "г": "ɡ", "д": "d", "е": "ɛ",
    "ж": "ʒ", "з": "z", "и": "i", "й": "j", "к": "k", "л": "l",
    "м": "m", "н": "n", "о": "ɔ", "п": "p", "р": "r", "с": "s",
    "т": "t", "у": "u", "ў": "w", "ф": "f", "х": "x",
    # Affricates: two letters joined by the tie bar.
    "ц": "t" + TIE + "s", "ч": "t" + TIE + "ʃ",
    "ш": "ʃ", "щ": "ʃt", "ъ": "ɤ",
    # Soft sign and iotated vowels begin with the palatalization mark.
    "ь": "ʲ", "ю": "ʲu", "я": "ʲa",
    # Combining accents become IPA stress marks.
    GRAVE: SECONDARY, ACUTE: PRIMARY,
}

# Voiced obstruent -> voiceless counterpart (paired position by position).
devoicing = dict(zip("bdɡzʒv", "ptksʃf"))

# Voiceless obstruent -> voiced counterpart (paired position by position).
voicing = dict(zip("ptksʃxf", "bdɡzʒɣv"))

def count_vowels(word):
    """Return how many characters of ``word`` match the Cyrillic vowel class ``vowels_g``."""
    return sum(1 for _ in re.finditer(vowels_g, word))


# Prefixes (in IPA) whose final consonant must stay in the prefix when toIPA()
# places a stress mark right after them.
# NOTE(review): toIPA() maps Cyrillic "о" to "ɔ" and only produces "o" in the
# later vowel-reduction step, so the entries spelled with "o" look like they can
# never match during the prefix correction -- confirm whether "ɔ" was intended.
IPA_prefixes = [
    "bɛz",
    "vɤz",
    "vɤzproiz",
    "iz",
    "naiz",
    "poiz",
    "prɛvɤz",
    "proiz",
    "raz",
]


def rsub(word: str, pattern: str, repl) -> str:
    """Replace every match of ``pattern`` in ``word``.

    Args:
        word: The string to operate on.
        pattern: Regular expression to search for.
        repl: Either a replacement accepted by ``re.sub`` (a template string or
            a callable taking the match object), or a dict mapping single
            characters to replacement strings. With a dict, each matched
            substring is translated character by character; characters that
            are not keys of the dict are left unchanged.

    Returns:
        The string with all substitutions applied.
    """
    if isinstance(repl, dict):
        # Build the translation table once per call, not once per match.
        table = str.maketrans(repl)
        return re.sub(pattern, lambda m: m.group(0).translate(table), word)
    return re.sub(pattern, repl, word)


def rsub_repeatedly(word: str, pattern: str, repl) -> str:
    """Apply ``rsub`` again and again until the result stops changing.

    The comparison starts against the empty string, so an empty ``word`` is
    returned as-is without any substitution pass.
    """
    previous, current = "", word
    while current != previous:
        previous, current = current, rsub(current, pattern, repl)
    return current


def rmatch(word: str, pattern: str) -> "list[str | None]":
    """Match ``pattern`` at the start of ``word`` and return its capture groups.

    Args:
        word: The string to match against.
        pattern: Regular expression; only its capture groups are returned.

    Returns:
        A list with one entry per capture group in ``pattern``. If the pattern
        does not match, every entry is ``None`` (groups that did not
        participate in a successful match are ``None`` as well), so callers
        can always unpack a fixed number of values.
    """
    comp = re.compile(pattern)
    m = comp.match(word)
    if m is None:
        return [None] * comp.groups
    # Return a list in both branches, as the annotation promises.
    return list(m.groups())


def toIPA(term, endschwa=False):
    """Transcribe a stress-marked Cyrillic term into IPA.

    The term is lower-cased and NFD-normalized, mapped letter by letter through
    ``phonetic_chars_map``, and then post-processed: stress marks are moved to
    the start of the stressed syllable, unstressed vowels are reduced, and a
    series of assimilation rules is applied.

    Args:
        term: Cyrillic text. A combining acute accent after a vowel marks
            primary stress and a combining grave accent marks secondary stress.
            Whitespace separates words, and "." may mark an explicit syllable
            boundary inside a consonant cluster.
        endschwa: If true, a word-final stressed "a" (optionally followed by
            "t") is transcribed as "ɤ".

    Returns:
        The IPA transcription, with the internal "#" word-boundary markers
        removed.

    Raises:
        ValueError: If the term contains a grave accent but no acute accent.
    """
    origterm = term
    term = unicodedata.normalize("NFD", term.lower())
    term = rsub(term, "у" + BREVE, "ў") # recompose ў
    term = rsub(term, "и" + BREVE, "й") # recompose й

    # Grave only ever maps to secondary stress, so a term with a grave but no acute has no primary stress.
    # (str.find() returns -1 when absent, so it must not be used as a truth value here.)
    if GRAVE in term and ACUTE not in term:
        raise ValueError(f"Use acute accent, not grave accent, for primary stress: {origterm}")

    # allow DOTUNDER to signal same as endschwa=1
    term = rsub(term, f"а({accents_c}?){DOTUNDER}", "ъ\\1")
    term = rsub(term, f"я({accents_c}?){DOTUNDER}", "ʲɤ\\1")
    term = rsub(term, ".", phonetic_chars_map)

    # Mark word boundaries
    term = rsub(term, r"(\s+)", "#\\1#")
    term = f"#{term}#"

    # Convert verbal and definite endings
    if endschwa:
        term = rsub(term, "a(" + PRIMARY + "t?#)", "ɤ\\1")

    # Change ʲ to j after vowels or word-initially
    term = rsub(term, "([" + vowels + "#]" + accents_c + "?)ʲ", "\\1j")

    ########## Move stress ##########

    # First, move leftwards over the vowel.
    term = rsub(term, "(" + vowels_c + ")(" + accents_c + ")", "\\2\\1")
    # Then, move leftwards over j or soft sign.
    term = rsub(term, "([jʲ])(" + accents_c + ")", "\\2\\1")
    # Then, move leftwards over a single consonant.
    term = rsub(term, "(" + cons_c + ")(" + accents_c + ")", "\\2\\1")
    # Then, move leftwards over Cl/Cr combinations where C is an obstruent (NOTE: IPA ɡ).
    term = rsub(term, "([bdɡptkxfv]" + ")(" + accents_c + ")([rl])", "\\2\\1\\3")
    # Then, move leftwards over kv/gv (NOTE: IPA ɡ).
    term = rsub(term, "([kɡ]" + ")(" + accents_c + ")(v)", "\\2\\1\\3")
    # Then, move leftwards over sC combinations, where C is a stop or resonant (NOTE: IPA ɡ).
    term = rsub(term, "([sz]" + ")(" + accents_c + ")([bdɡptkvlrmn])", "\\2\\1\\3")
    # Then, move leftwards over affricates not followed by a consonant.
    term = rsub(term, "([td]" + TIE + "?)(" + accents_c + ")([szʃʒ][" + vowels + "ʲ])", "\\2\\1\\3")
    # If we ended up in the middle of a tied affricate, move to its right.
    term = rsub(term, "(" + TIE + ")(" + accents_c + ")(" + cons_c + ")", "\\1\\3\\2")
    # Then, move leftwards over any remaining consonants at the beginning of a word.
    term = rsub(term, "#(" + cons_c + "*)(" + accents_c + ")", "#\\2\\1")
    # Then correct for known prefixes.
    for prefix in IPA_prefixes:
        # Split the prefix into everything before its trailing consonants and those trailing consonants.
        prefix_prefix, prefix_final_cons = rmatch(prefix, "^(.*?)(" + cons_c + "*)$")
        if prefix_final_cons:
            # Check for accent moved too far to the left into a prefix, e.g. безбрачие accented as беˈзбрачие instead
            # of безˈбрачие
            term = rsub(term, "#(" + prefix_prefix + ")(" + accents_c + ")(" + prefix_final_cons + ")", "#\\1\\3\\2")


    # Finally, if there is an explicit syllable boundary in the cluster of consonants where the stress is, put it there.
    # First check for accent to the right of the explicit syllable boundary.
    term = rsub(term, "(" + cons_c + "*)\\.(" + cons_c + "*)(" + accents_c + ")(" + cons_c + "*)", "\\1\\3\\2\\4")
    # Then check for accent to the left of the explicit syllable boundary.
    term = rsub(term, "(" + cons_c + "*)(" + accents_c + ")(" + cons_c + "*)\\.(" + cons_c + "*)", "\\1\\3\\2\\4")
    # Finally, remove any remaining syllable boundaries.
    term = rsub(term, "\\.", "")

    ########## Vowel reduction (in unstressed syllables) ##########
    def reduce_vowel(vowel):
        # Map each full vowel in the given stretch of text to its reduced counterpart.
        return rsub(vowel, "[aɔɤu]", { "a": "ɐ", "ɔ": "o", "ɤ": "ɐ", "u": "o" })

    # Reduce all vowels before the stress, except if the word has no accent at all. (FIXME: This is presumably
    # intended for single-syllable words without accents, but if the word is multisyllabic without accents,
    # presumably all vowels should be reduced.)
    def reduce_overall(m):
        a, b = m.groups()
        # The vowel count is taken from the original (Cyrillic) input, not from the current word.
        if count_vowels(origterm) <= 1:
            return a + b
        else:
            return reduce_vowel(a) + b

    term = rsub(term, "(#[^#" + accents + "]*)(.*?#)", reduce_overall)

    # Reduce all vowels after the accent except the first vowel after the accent mark (which is stressed).
    term = rsub(term, "(" + accents_c + "[^aɛiɔuɤ#]*[aɛiɔuɤ])([^#" + accents + "]*)", lambda m:  m.group(1) + reduce_vowel(m.group(2)))

    ########## Vowel assimilation to adjacent consonants (fronting/raising) ##########
    term = rsub(term, "([ʃʒʲj])([aouɤ])", "\\1\\2" + FRONTED)

    # Hard l
    term = rsub_repeatedly(term, "l([^ʲɛi])", "ɫ\\1")


    # Voicing assimilation
    term = rsub(term, "([bdɡzʒv" + TIE + "]*)(" + accents_c + "?[ptksʃfx#])", lambda m: rsub(m.group(1), ".", devoicing) + m.group(2))
    term = rsub(term, "([ptksʃfx" + TIE + "]*)(" + accents_c + "?[bdɡzʒ])", lambda m: rsub(m.group(1), ".", voicing) + m.group(2))
    term = rsub(term, "n(" + accents_c + "?[ɡk]+)", "ŋ\\1")
    term = rsub(term, "m(" + accents_c + "?[fv]+)", "ɱ\\1")

    # Sibilant assimilation
    term = rsub(term, "[sz](" + accents_c + "?[td]?" + TIE + "?)([ʃʒ])", "\\2\\1\\2")

    # Reduce consonant clusters
    term = rsub(term, "([szʃʒ])[td](" + accents_c + "?)([tdknml])", "\\2\\1\\3")

    # Strip hashes
    term = rsub(term, "#", "")

    return term


def get_rhyme(term: str) -> str:
    """Return the rhyming tail of ``term`` in IPA.

    The term is transcribed with ``toIPA``; the result is the transcription
    from the first vowel at or after the last primary-stress mark to the end.
    If no vowel follows that mark, the empty string is returned.

    Raises:
        ValueError: If the transcription contains no primary-stress mark
            (raised by ``str.rindex``).
    """
    ipa = toIPA(term)
    stress_pos = ipa.rindex(PRIMARY)
    # Search from the stress mark onwards for the vowel that opens the rhyme.
    first_vowel = re.compile(vowels_c).search(ipa, stress_pos)
    if first_vowel is None:
        return ""
    return ipa[first_vowel.start():]


# Each possible rhyme (suffix) will have a list of member words
rhymes: dict[str, list[str]] = defaultdict(list)

# Issues to consider:
#  - Terms that end in a stressed vowel
#  - Terms that have more than one word (space or hyphen)
#  - Whether to include fronting in the IPA transcription
# The word list holds non-ASCII text, so do not rely on the platform's default encoding.
# NOTE(review): assumes out/words.txt is UTF-8 encoded -- confirm.
with open("out/words.txt", encoding="utf-8") as f:
    for line in f:
        # A backtick in the word list stands for the stress mark.
        line = line.strip().replace("`", ACUTE)
        if ACUTE not in line:
            # An unmarked word is only usable if it has exactly one vowel, which is then the stressed one.
            if count_vowels(line) == 1:
                line = re.sub(f"({vowels_g})", "\\1" + ACUTE, line)
            else:
                continue

        # Skip terms that end in a stressed vowel.
        if re.search(f"{vowels_g}{ACUTE}$", line):
            continue

        rhyme = get_rhyme(line)
        rhymes[rhyme].append(line)

# Keep only rhymes shared by at least three words.
rhymes = {key: value for (key, value) in rhymes.items() if len(value) >= 3}
# ensure_ascii=False writes raw non-ASCII characters, so the output encoding must be explicit.
with open("rhymes.json", "w", encoding="utf-8") as f:
    json.dump(rhymes, f, ensure_ascii=False)