Add rhyme project

2023-12-16 20:36:09 +00:00 · 2023-12-16 20:36:09 +00:00 · e242e787f3
commit e242e787f3
parent 4d6a267fd7
2 changed files with 34537 additions and 0 deletions
--- a/rhymes/rhyme_generator.py
+++ b/rhymes/rhyme_generator.py
@ -0,0 +1,250 @@
+from collections import defaultdict
+import json
+import unicodedata
+import regex as re
+
+GRAVE = chr(0x300)
+ACUTE = chr(0x301)
+BREVE = chr(0x306)
+TIE   = chr(0x361)
+PRIMARY = chr(0x2C8)
+SECONDARY = chr(0x2CC)
+TIE = chr(0x361)
+FRONTED = chr(0x31F)
+DOTUNDER = chr(0x323)
+HYPH = chr(0x2027)
+
+vowels = "aɤɔuɛiɐo"
+vowels_c = f"[{vowels}]"
+vowels_g = "[аъоуеияѝюАЪОУЕИЯЍЮ]"
+cons = f"bvɡdʒzjklwmnprstfxʃɣʲ{TIE}"
+cons_c = f"[{cons}]"
+voiced_cons = f"bvɡdʒzɣ{TIE}"
+voiced_cons_c = f"[{voiced_cons}]"
+accents = PRIMARY + SECONDARY
+accents_c = f"[{accents}]"
+
+# Per-character translation table from (lower-case, NFD) Cyrillic spelling to
+# IPA symbols. toIPA applies it with str.translate, so every key is a single
+# character while a value may be several characters long. Characters without
+# an entry are left unchanged.
+phonetic_chars_map = {
+    "а": "a",
+    "б": "b",
+    "в": "v",
+    "г": "ɡ",
+    "д": "d",
+    "е": "ɛ",
+    "ж": "ʒ",
+    "з": "z",
+    "и": "i",
+    "й": "j",
+    "к": "k",
+    "л": "l",
+    "м": "m",
+    "н": "n",
+    "о": "ɔ",
+    "п": "p",
+    "р": "r",
+    "с": "s",
+    "т": "t",
+    "у": "u",
+    "ў": "w",
+    "ф": "f",
+    "х": "x",
+    # Affricates are written as two symbols joined by the tie bar.
+    "ц": f"t{TIE}s",
+    "ч": f"t{TIE}ʃ",
+    "ш": "ʃ",
+    "щ": "ʃt",
+    "ъ": "ɤ",
+    "ь": "ʲ",
+    "ю": "ʲu",
+    "я": "ʲa",
+    # Combining accents in the spelling become IPA stress marks.
+    GRAVE: SECONDARY,
+    ACUTE: PRIMARY
+}
+
+# Voiced -> voiceless counterparts, applied by toIPA's voicing assimilation
+# step to a run of voiced consonants followed by a voiceless consonant or a
+# word boundary.
+devoicing = {
+    "b": "p", "d": "t", "ɡ": "k",
+    "z": "s", "ʒ": "ʃ",
+    "v": "f"
+}
+
+# Voiceless -> voiced counterparts, applied by toIPA's voicing assimilation
+# step to a run of voiceless consonants followed by a voiced obstruent.
+# Note that "x" -> "ɣ" has no inverse entry in `devoicing`.
+voicing = {
+    "p": "b", "t": "d", "k": "ɡ",
+    "s": "z", "ʃ": "ʒ", "x": "ɣ",
+    "f": "v"
+}
+
+def count_vowels(word):
+    vowel_count = len(re.findall(vowels_g, word))
+    return vowel_count
+
+
+# Prefixes, already in IPA form, that toIPA consults after moving stress marks:
+# if a stress mark ended up in front of a prefix's final consonant(s), it is
+# moved back so that it follows the whole prefix.
+IPA_prefixes = ["bɛz", "vɤz", "vɤzproiz", "iz", "naiz", "poiz", "prɛvɤz", "proiz", "raz"]
+
+
+def rsub(word: str, pattern: str, repl) -> str:
+    if isinstance(repl, dict):
+        transl = lambda m: m.group(0).translate(str.maketrans(repl))
+        return re.sub(pattern, transl, word)
+    else:
+        return re.sub(pattern, repl, word)
+
+
+def rsub_repeatedly(word: str, pattern: str, repl) -> str:
+    old = ""
+    while old != word:
+        old = word
+        word = rsub(word, pattern, repl)
+    return word
+
+
+def rmatch(word: str, pattern: str) -> list[str]:
+    comp = re.compile(pattern)
+    m = comp.match(word)
+    if m:
+        return m.groups()
+    else:
+        return [None for _ in range(comp.groups)]
+
+
+def toIPA(term, endschwa=False):
+    origterm = term
+    term = unicodedata.normalize("NFD", term.lower())
+    term = rsub(term, "у" + BREVE, "ў") # recompose ў
+    term = rsub(term, "и" + BREVE, "й") # recompose й
+    
+    if term.find(GRAVE) != -1 and not term.find(ACUTE):
+        raise ValueError(f"Use acute accent, not grave accent, for primary stress: {origterm}")
+
+    # allow DOTUNDER to signal same as endschwa=1    
+    term = rsub(term, f"а({accents_c}?){DOTUNDER}", "ъ\\1")
+    term = rsub(term, f"я({accents_c}?){DOTUNDER}", "ʲɤ\\1")
+    term = rsub(term, ".", phonetic_chars_map)
+
+    # Mark word boundaries
+    term = rsub(term, r"(\s+)", "#\\1#")
+    term = f"#{term}#"
+
+    # Convert verbal and definite endings
+    if endschwa:
+        term = rsub(term, "a(" + PRIMARY + "t?#)", "ɤ\\1")
+
+    # Change ʲ to j after vowels or word-initially
+    term = rsub(term, "([" + vowels + "#]" + accents_c + "?)ʲ", "\\1j")
+
+    ########## Move stress #######-
+
+    # First, move leftwards over the vowel.
+    term = rsub(term, "(" + vowels_c + ")(" + accents_c + ")", "\\2\\1")
+    # Then, move leftwards over j or soft sign.
+    term = rsub(term, "([jʲ])(" + accents_c + ")", "\\2\\1")
+    # Then, move leftwards over a single consonant.
+    term = rsub(term, "(" + cons_c + ")(" + accents_c + ")", "\\2\\1")
+    # Then, move leftwards over Cl/Cr combinations where C is an obstruent (NOTE: IPA ɡ).
+    term = rsub(term, "([bdɡptkxfv]" + ")(" + accents_c + ")([rl])", "\\2\\1\\3")
+    # Then, move leftwards over kv/gv (NOTE: IPA ɡ).
+    term = rsub(term, "([kɡ]" + ")(" + accents_c + ")(v)", "\\2\\1\\3")
+    # Then, move leftwards over sC combinations, where C is a stop or resonant (NOTE: IPA ɡ).
+    term = rsub(term, "([sz]" + ")(" + accents_c + ")([bdɡptkvlrmn])", "\\2\\1\\3")
+    # Then, move leftwards over affricates not followed by a consonant.
+    term = rsub(term, "([td]" + TIE + "?)(" + accents_c + ")([szʃʒ][" + vowels + "ʲ])", "\\2\\1\\3")
+    # If we ended up in the middle of a tied affricate, move to its right.
+    term = rsub(term, "(" + TIE + ")(" + accents_c + ")(" + cons_c + ")", "\\1\\3\\2")
+    # Then, move leftwards over any remaining consonants at the beginning of a word.
+    term = rsub(term, "#(" + cons_c + "*)(" + accents_c + ")", "#\\2\\1")
+    # Then correct for known prefixes.
+    for prefix in IPA_prefixes:
+        prefix_prefix, prefix_final_cons = rmatch(prefix, "^(.*?)(" + cons_c + "*)$")
+        if prefix_final_cons:
+            # Check for accent moved too far to the left into a prefix, e.g. безбрачие accented as беˈзбрачие instead
+            # of безˈбрачие
+            term = rsub(term, "#(" + prefix_prefix + ")(" + accents_c + ")(" + prefix_final_cons + ")", "#\\1\\3\\2")
+
+
+    # Finally, if there is an explicit syllable boundary in the cluster of consonants where the stress is, put it there.
+    # First check for accent to the right of the explicit syllable boundary.
+    term = rsub(term, "(" + cons_c + "*)\\.(" + cons_c + "*)(" + accents_c + ")(" + cons_c + "*)", "\\1\\3\\2\\4")
+    # Then check for accent to the left of the explicit syllable boundary.
+    term = rsub(term, "(" + cons_c + "*)(" + accents_c + ")(" + cons_c + "*)\\.(" + cons_c + "*)", "\\1\\3\\2\\4")
+    # Finally, remove any remaining syllable boundaries.
+    term = rsub(term, "\\.", "")
+
+    ########## Vowel reduction (in unstressed syllables) #######-
+    def reduce_vowel(vowel):
+        return rsub(vowel, "[aɔɤu]", { "a": "ɐ", "ɔ": "o", "ɤ": "ɐ", "u": "o" })
+
+    # Reduce all vowels before the stress, except if the word has no accent at all. (FIXME: This is presumably
+    # intended for single-syllable words without accents, but if the word is multisyllabic without accents,
+    # presumably all vowels should be reduced.)
+    def reduce_overall(m):
+        a, b = m.groups()
+        if count_vowels(origterm) <= 1:
+            return a + b
+        else:
+            return reduce_vowel(a) + b
+
+    term = rsub(term, "(#[^#" + accents + "]*)(.*?#)", reduce_overall)
+
+    # Reduce all vowels after the accent except the first vowel after the accent mark (which is stressed).
+    term = rsub(term, "(" + accents_c + "[^aɛiɔuɤ#]*[aɛiɔuɤ])([^#" + accents + "]*)", lambda m:  m.group(1) + reduce_vowel(m.group(2)))
+
+    ########## Vowel assimilation to adjacent consonants (fronting/raising) #######-
+    term = rsub(term, "([ʃʒʲj])([aouɤ])", "\\1\\2" + FRONTED)
+
+    # Hard l
+    term = rsub_repeatedly(term, "l([^ʲɛi])", "ɫ\\1")
+
+
+    # Voicing assimilation
+    term = rsub(term, "([bdɡzʒv" + TIE + "]*)(" + accents_c + "?[ptksʃfx#])", lambda m: rsub(m.group(1), ".", devoicing) + m.group(2))
+    term = rsub(term, "([ptksʃfx" + TIE + "]*)(" + accents_c + "?[bdɡzʒ])", lambda m: rsub(m.group(1), ".", voicing) + m.group(2))
+    term = rsub(term, "n(" + accents_c + "?[ɡk]+)", "ŋ\\1")
+    term = rsub(term, "m(" + accents_c + "?[fv]+)", "ɱ\\1")
+
+    # Sibilant assimilation
+    term = rsub(term, "[sz](" + accents_c + "?[td]?" + TIE + "?)([ʃʒ])", "\\2\\1\\2")
+
+    # Reduce consonant clusters
+    term = rsub(term, "([szʃʒ])[td](" + accents_c + "?)([tdknml])", "\\2\\1\\3")
+
+    # Strip hashes
+    term = rsub(term, "#", "")
+    
+    return term
+
+
+def get_rhyme(term: str) -> str:
+    def get_rhyme_ipa(ipa: str):
+        stress_index = ipa.rindex(PRIMARY)
+        rhyme_start_index = stress_index
+        while rhyme_start_index < len(ipa) and not re.match(vowels_c, ipa[rhyme_start_index]):
+            rhyme_start_index += 1
+        
+        return f"{ipa[rhyme_start_index:]}"
+
+    return get_rhyme_ipa(toIPA(term))
+
+
+# Each possible rhyme (suffix) will have a list of member words
+rhymes: dict[str, list[str]] = defaultdict(list)
+
+# Issues to consider:
+#  - Terms that end in a stressed vowel
+#  - Terms that have more than one word (space or hyphen)
+#  - Whether to include fronting in the IPA transcription
+with open("out/words.txt") as f:
+    for line in f:
+        line = line.strip().replace("`", ACUTE)
+        if ACUTE not in line:
+            if count_vowels(line) == 1:
+                line = re.sub(f"({vowels_g})", "\\1" + ACUTE, line)
+            else:
+                continue
+        
+        if re.search(f"{vowels_g}{ACUTE}$"):
+            continue
+
+        rhyme = get_rhyme(line)
+        rhymes[rhyme].append(line)
+
+rhymes = {key: value for (key, value) in rhymes.items() if len(value) >= 3}
+with open("rhymes.json", "w") as f:
+    json.dump(rhymes, f, ensure_ascii=False)
--- a/rhymes/rhymes.json
+++ b/rhymes/rhymes.json