Add rhyme project
This commit is contained in:
parent
4d6a267fd7
commit
e242e787f3
250
rhymes/rhyme_generator.py
Normal file
250
rhymes/rhyme_generator.py
Normal file
@ -0,0 +1,250 @@
|
||||
from collections import defaultdict
|
||||
import json
|
||||
import unicodedata
|
||||
import regex as re
|
||||
|
||||
# Single-character constants, built from their Unicode code points so the
# combining marks stay visible in the source.
GRAVE = chr(0x300)  # U+0300 COMBINING GRAVE ACCENT
ACUTE = chr(0x301)  # U+0301 COMBINING ACUTE ACCENT
BREVE = chr(0x306)  # U+0306 COMBINING BREVE
TIE = chr(0x361)  # U+0361 COMBINING DOUBLE INVERTED BREVE (affricate tie bar)
PRIMARY = chr(0x2C8)  # U+02C8 MODIFIER LETTER VERTICAL LINE (IPA primary stress)
SECONDARY = chr(0x2CC)  # U+02CC MODIFIER LETTER LOW VERTICAL LINE (IPA secondary stress)
FRONTED = chr(0x31F)  # U+031F COMBINING PLUS SIGN BELOW
DOTUNDER = chr(0x323)  # U+0323 COMBINING DOT BELOW
HYPH = chr(0x2027)  # U+2027 HYPHENATION POINT

# Character inventories. The bare names are raw character strings meant to be
# spliced into larger regex character classes; the *_c names are ready-made
# single-character classes; vowels_g is a class over Cyrillic vowel letters.
vowels = "aɤɔuɛiɐo"
vowels_c = f"[{vowels}]"
vowels_g = "[аъоуеияѝюАЪОУЕИЯЍЮ]"
cons = f"bvɡdʒzjklwmnprstfxʃɣʲ{TIE}"
cons_c = f"[{cons}]"
voiced_cons = f"bvɡdʒzɣ{TIE}"
voiced_cons_c = f"[{voiced_cons}]"
accents = PRIMARY + SECONDARY
accents_c = f"[{accents}]"
|
||||
|
||||
# Per-character translation from (lower-case, NFD-decomposed) Cyrillic input to
# the working phonetic alphabet. Every key is exactly one character so the
# mapping can be fed to str.maketrans; values may be longer.
phonetic_chars_map = {
    "а": "a", "б": "b", "в": "v", "г": "ɡ", "д": "d",
    "е": "ɛ", "ж": "ʒ", "з": "z", "и": "i", "й": "j",
    "к": "k", "л": "l", "м": "m", "н": "n", "о": "ɔ",
    "п": "p", "р": "r", "с": "s", "т": "t", "у": "u",
    "ў": "w", "ф": "f", "х": "x",
    # Affricates are written as two letters joined by the tie bar.
    "ц": "t" + TIE + "s",
    "ч": "t" + TIE + "ʃ",
    "ш": "ʃ", "щ": "ʃt", "ъ": "ɤ",
    # Soft sign and iotated vowels introduce the palatalisation mark.
    "ь": "ʲ", "ю": "ʲu", "я": "ʲa",
    # Combining accents become IPA stress marks.
    GRAVE: SECONDARY,
    ACUTE: PRIMARY,
}
|
||||
|
||||
# Voiced obstruent -> voiceless counterpart.
devoicing = {
    "b": "p",
    "d": "t",
    "ɡ": "k",
    "z": "s",
    "ʒ": "ʃ",
    "v": "f",
}

# Voiceless obstruent -> voiced counterpart.
voicing = {
    "p": "b",
    "t": "d",
    "k": "ɡ",
    "s": "z",
    "ʃ": "ʒ",
    "x": "ɣ",
    "f": "v",
}
|
||||
|
||||
def count_vowels(word):
    """Return how many characters of *word* match the ``vowels_g`` class.

    ``vowels_g`` lists Cyrillic vowel letters in both cases, so the count is
    taken on orthographic text rather than on an IPA transcription.
    """
    return sum(1 for _ in re.finditer(vowels_g, word))
|
||||
|
||||
|
||||
# Prefixes, spelled in the working phonetic alphabet, that toIPA checks at the
# start of a word when correcting stress-mark placement.
IPA_prefixes = [
    "bɛz",
    "vɤz",
    "vɤzproiz",
    "iz",
    "naiz",
    "poiz",
    "prɛvɤz",
    "proiz",
    "raz",
]
|
||||
|
||||
|
||||
def rsub(word: str, pattern: str, repl) -> str:
    """Substitute every match of *pattern* in *word*.

    Args:
        word: Text to rewrite.
        pattern: Regular expression to search for.
        repl: Either anything ``re.sub`` accepts as a replacement (a template
            string or a callable taking the match object), or a dict mapping
            single characters to replacement strings. With a dict, each
            matched substring is translated character by character;
            characters without an entry are left unchanged.

    Returns:
        The rewritten string.
    """
    if isinstance(repl, dict):
        # Build the translation table once per call rather than once per match.
        table = str.maketrans(repl)

        def translate_match(m):
            return m.group(0).translate(table)

        return re.sub(pattern, translate_match, word)
    return re.sub(pattern, repl, word)
|
||||
|
||||
|
||||
def rsub_repeatedly(word: str, pattern: str, repl) -> str:
    """Apply ``rsub`` over and over until the string stops changing.

    An empty *word* is returned as-is without any substitution pass.
    """
    previous = ""
    current = word
    while current != previous:
        # Remember the input of this pass, then run one more substitution.
        previous, current = current, rsub(current, pattern, repl)
    return current
|
||||
|
||||
|
||||
def rmatch(word: str, pattern: str) -> list[str]:
    """Match *pattern* at the start of *word* and return its capture groups.

    On success the groups of the match are returned (as produced by
    ``Match.groups()``). On failure a list holding one ``None`` per capture
    group in *pattern* is returned, so callers can always unpack the result
    into as many names as the pattern has groups.
    """
    compiled = re.compile(pattern)
    found = compiled.match(word)
    if found is None:
        return [None] * compiled.groups
    return found.groups()
|
||||
|
||||
|
||||
def toIPA(term: str, endschwa: bool = False) -> str:
    """Transcribe a Cyrillic-script term into an IPA string.

    The input is expected to mark primary stress with a combining acute accent
    and, optionally, secondary stress with a combining grave accent. The
    transcription proceeds in stages: normalise and transliterate letter by
    letter, mark word boundaries with ``#``, move each stress mark leftwards
    to the start of its syllable, reduce unstressed vowels, then apply vowel
    fronting, hard ``l``, voicing/nasal/sibilant assimilation and cluster
    reduction, and finally strip the ``#`` markers.

    Args:
        term: The term to transcribe. May contain several whitespace-separated
            words and ``.`` as an explicit syllable boundary.
        endschwa: When true, a stressed word-final ``a`` (optionally followed
            by ``t``) is transcribed as ``ɤ``. A dot below on ``а``/``я`` in
            the input requests the same treatment for that letter.

    Returns:
        The IPA transcription, with stress marks placed before the stressed
        syllable.

    Raises:
        ValueError: If the term contains a grave accent but no acute accent.
    """
    origterm = term
    term = unicodedata.normalize("NFD", term.lower())
    term = rsub(term, "у" + BREVE, "ў")  # recompose ў
    term = rsub(term, "и" + BREVE, "й")  # recompose й

    # str.find returns -1 (truthy) on a miss, so test membership directly.
    if GRAVE in term and ACUTE not in term:
        raise ValueError(f"Use acute accent, not grave accent, for primary stress: {origterm}")

    # allow DOTUNDER to signal same as endschwa=1
    term = rsub(term, f"а({accents_c}?){DOTUNDER}", "ъ\\1")
    term = rsub(term, f"я({accents_c}?){DOTUNDER}", "ʲɤ\\1")
    term = rsub(term, ".", phonetic_chars_map)

    # Mark word boundaries
    term = rsub(term, r"(\s+)", "#\\1#")
    term = f"#{term}#"

    # Convert verbal and definite endings
    if endschwa:
        term = rsub(term, "a(" + PRIMARY + "t?#)", "ɤ\\1")

    # Change ʲ to j after vowels or word-initially
    term = rsub(term, "([" + vowels + "#]" + accents_c + "?)ʲ", "\\1j")

    ########## Move stress ##########

    # First, move leftwards over the vowel.
    term = rsub(term, "(" + vowels_c + ")(" + accents_c + ")", "\\2\\1")
    # Then, move leftwards over j or soft sign.
    term = rsub(term, "([jʲ])(" + accents_c + ")", "\\2\\1")
    # Then, move leftwards over a single consonant.
    term = rsub(term, "(" + cons_c + ")(" + accents_c + ")", "\\2\\1")
    # Then, move leftwards over Cl/Cr combinations where C is an obstruent (NOTE: IPA ɡ).
    term = rsub(term, "([bdɡptkxfv]" + ")(" + accents_c + ")([rl])", "\\2\\1\\3")
    # Then, move leftwards over kv/gv (NOTE: IPA ɡ).
    term = rsub(term, "([kɡ]" + ")(" + accents_c + ")(v)", "\\2\\1\\3")
    # Then, move leftwards over sC combinations, where C is a stop or resonant (NOTE: IPA ɡ).
    term = rsub(term, "([sz]" + ")(" + accents_c + ")([bdɡptkvlrmn])", "\\2\\1\\3")
    # Then, move leftwards over affricates not followed by a consonant.
    term = rsub(term, "([td]" + TIE + "?)(" + accents_c + ")([szʃʒ][" + vowels + "ʲ])", "\\2\\1\\3")
    # If we ended up in the middle of a tied affricate, move to its right.
    term = rsub(term, "(" + TIE + ")(" + accents_c + ")(" + cons_c + ")", "\\1\\3\\2")
    # Then, move leftwards over any remaining consonants at the beginning of a word.
    term = rsub(term, "#(" + cons_c + "*)(" + accents_c + ")", "#\\2\\1")
    # Then correct for known prefixes.
    for prefix in IPA_prefixes:
        prefix_prefix, prefix_final_cons = rmatch(prefix, "^(.*?)(" + cons_c + "*)$")
        if prefix_final_cons:
            # Check for accent moved too far to the left into a prefix, e.g. безбрачие accented as беˈзбрачие instead
            # of безˈбрачие
            term = rsub(term, "#(" + prefix_prefix + ")(" + accents_c + ")(" + prefix_final_cons + ")", "#\\1\\3\\2")

    # Finally, if there is an explicit syllable boundary in the cluster of consonants where the stress is, put it there.
    # First check for accent to the right of the explicit syllable boundary.
    term = rsub(term, "(" + cons_c + "*)\\.(" + cons_c + "*)(" + accents_c + ")(" + cons_c + "*)", "\\1\\3\\2\\4")
    # Then check for accent to the left of the explicit syllable boundary.
    term = rsub(term, "(" + cons_c + "*)(" + accents_c + ")(" + cons_c + "*)\\.(" + cons_c + "*)", "\\1\\3\\2\\4")
    # Finally, remove any remaining syllable boundaries.
    term = rsub(term, "\\.", "")

    ########## Vowel reduction (in unstressed syllables) ##########

    def reduce_vowel(vowel):
        return rsub(vowel, "[aɔɤu]", {"a": "ɐ", "ɔ": "o", "ɤ": "ɐ", "u": "o"})

    # The vowel count of the original term does not change between matches,
    # so compute it once instead of inside the substitution callback.
    has_at_most_one_vowel = count_vowels(origterm) <= 1

    # Reduce all vowels before the stress, except if the word has no accent at all. (FIXME: This is presumably
    # intended for single-syllable words without accents, but if the word is multisyllabic without accents,
    # presumably all vowels should be reduced.)
    def reduce_overall(m):
        a, b = m.groups()
        if has_at_most_one_vowel:
            return a + b
        return reduce_vowel(a) + b

    term = rsub(term, "(#[^#" + accents + "]*)(.*?#)", reduce_overall)

    # Reduce all vowels after the accent except the first vowel after the accent mark (which is stressed).
    term = rsub(
        term,
        "(" + accents_c + "[^aɛiɔuɤ#]*[aɛiɔuɤ])([^#" + accents + "]*)",
        lambda m: m.group(1) + reduce_vowel(m.group(2)),
    )

    ########## Vowel assimilation to adjacent consonants (fronting/raising) ##########
    term = rsub(term, "([ʃʒʲj])([aouɤ])", "\\1\\2" + FRONTED)

    # Hard l
    term = rsub_repeatedly(term, "l([^ʲɛi])", "ɫ\\1")

    # Voicing assimilation
    term = rsub(
        term,
        "([bdɡzʒv" + TIE + "]*)(" + accents_c + "?[ptksʃfx#])",
        lambda m: rsub(m.group(1), ".", devoicing) + m.group(2),
    )
    term = rsub(
        term,
        "([ptksʃfx" + TIE + "]*)(" + accents_c + "?[bdɡzʒ])",
        lambda m: rsub(m.group(1), ".", voicing) + m.group(2),
    )
    term = rsub(term, "n(" + accents_c + "?[ɡk]+)", "ŋ\\1")
    term = rsub(term, "m(" + accents_c + "?[fv]+)", "ɱ\\1")

    # Sibilant assimilation
    term = rsub(term, "[sz](" + accents_c + "?[td]?" + TIE + "?)([ʃʒ])", "\\2\\1\\2")

    # Reduce consonant clusters
    term = rsub(term, "([szʃʒ])[td](" + accents_c + "?)([tdknml])", "\\2\\1\\3")

    # Strip hashes
    term = rsub(term, "#", "")

    return term
|
||||
|
||||
|
||||
def get_rhyme(term: str) -> str:
    """Return the rhyming tail of *term*'s IPA transcription.

    The tail starts at the first vowel found at or after the last primary
    stress mark and runs to the end of the transcription. If no vowel follows
    the stress mark the result is the empty string.

    Raises:
        ValueError: If the transcription contains no primary stress mark.
    """
    ipa = toIPA(term)
    stress_at = ipa.rindex(PRIMARY)
    # Position of the stressed vowel, or the end of the string when none exists.
    tail_start = next(
        (pos for pos in range(stress_at, len(ipa)) if re.match(vowels_c, ipa[pos])),
        len(ipa),
    )
    return ipa[tail_start:]
|
||||
|
||||
|
||||
# Each possible rhyme (suffix) will have a list of member words
rhymes: dict[str, list[str]] = defaultdict(list)

# Patterns used for every input line; compiled once up front.
_any_vowel = re.compile(f"({vowels_g})")
_final_stressed_vowel = re.compile(f"{vowels_g}{ACUTE}$")

# Issues to consider:
# - Terms that end in a stressed vowel
# - Terms that have more than one word (space or hyphen)
# - Whether to include fronting in the IPA transcription
# The word list holds Cyrillic text, so read it as UTF-8 regardless of the
# platform's default encoding.
with open("out/words.txt", encoding="utf-8") as f:
    for line in f:
        # A backtick after a vowel is the input file's stress notation.
        line = line.strip().replace("`", ACUTE)
        if ACUTE not in line:
            if count_vowels(line) == 1:
                # A single-vowel word can only be stressed on that vowel.
                line = _any_vowel.sub("\\1" + ACUTE, line)
            else:
                continue

        # Skip terms ending in a stressed vowel. The original call omitted the
        # string to search, which raised TypeError.
        if _final_stressed_vowel.search(line):
            continue

        rhyme = get_rhyme(line)
        rhymes[rhyme].append(line)

# Keep only rhymes shared by at least three words.
rhymes = {key: value for (key, value) in rhymes.items() if len(value) >= 3}
# ensure_ascii=False writes raw non-ASCII characters, so the output encoding
# must be able to represent them.
with open("rhymes.json", "w", encoding="utf-8") as f:
    json.dump(rhymes, f, ensure_ascii=False)
|
34287
rhymes/rhymes.json
Normal file
34287
rhymes/rhymes.json
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user