kovachev-bot/ja-accents/daijirin.py

111 lines
4.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import regex as re
from os.path import expanduser
DJR_DATA_FILE = expanduser("~/Downloads/三省堂スーパー大辞林3.0/term_bank_1.json") # Replace with your path to the DJR JSON data
ACCENT_LIST_REGEX = re.compile(r"(?:\[\d{1,2}\])+")
class ItemNotFoundError(ValueError):
"""Error when looking up an item in Daijirin; the item was not found."""
class NoAccentError(ValueError):
"""Error when trying to find the accent of a term: the entry defines no accent. No accent exists in the data."""
# NOTICE: requires 3GB+ RAM at runtime.
# Be cautious if your system does not currently have sufficient available memory.
with open(DJR_DATA_FILE) as f:
DAIJIRIN = eval(f.read())
def is_kana(s: str) -> bool:
HIRAGANA_START = '\u3040'
HIRAGANA_END = '\u309f'
KATAKANA_START = '\u30a0'
KATAKANA_END = '\u30ff'
return all((HIRAGANA_START <= char <= HIRAGANA_END) or (KATAKANA_START <= char <= KATAKANA_END) for char in s)
def validate_headword_and_kana(main_headword: str = None, kana: str = None) -> tuple[str, str]:
"""
If the kana parameter is not specified for a term containing non-kana characters (i.e. kanji), raises an error;
if the `main_headword` parameter is not specified, but `kana` is, then the term is kana-only, and so `main_headword`
is updated to match the `kana` parameter's value. Returns the 2-tuple containing post-processed (`main_headword`, `kana`).
"""
if kana is not None and main_headword is None:
main_headword = kana
elif kana is None:
raise ValueError("Must specify kana parameter")
return main_headword, kana
def are_duplicate_kanas(list_of_kana_readings: list[str]) -> bool:
"""Illustrative input:
`headword`=人, `list_of_kana_readings`=["ひと", "にん", "じん"]
This will return `False` because there is no term where there are two identical kanas
"""
# Sets contain unique items only, so if there are duplicates, the set will have fewer elements than the list.
# If there're no duplicates, then, we expect their lengths to be the same.
return len(set(list_of_kana_readings)) < len(list_of_kana_readings)
def find_entry(*, main_headword: str = None, kana: str = None) -> list:
"""
Finds the record in the dictionary data file corresponding to the input `main_headword` (usually kanji)
and `kana` (if the term is kana-only, only `kana` needs to be specified; otherwise, both need to be specified.)
If nothing is found, raises an error.
"""
main_headword, kana = validate_headword_and_kana(main_headword, kana)
def entry_matches(entry: list) -> bool:
if is_kana(main_headword):
return entry[0] == main_headword
return entry[0] == main_headword and entry[1] == kana
for item in DAIJIRIN:
if entry_matches(item):
return item
# If nothing is found, return empty list
return []
def get_body(entry: list) -> str:
# Although the 5th element of an entry in our format is a list,
# every single entry in the dictionary only has 1 item in that list, which
# is the body of the entry (the definition, pitch accent information are both in there.).
return entry[5][0]
def get_accent_from_body(entry_body: str) -> tuple[bool, str]:
"""
From an entry body, returns both whether there is a pitch accent defined, and the string representing
all the possible pitch accents in a row (e.g. [1][0], [4][3], etc.)
"""
match = ACCENT_LIST_REGEX.search(entry_body)
return bool(match), match.group(0) if bool(match) else ""
def process_djr_accents(acc_str: str) -> list[str]:
"""Return list of accents from a string like [1][0]."""
accs = []
current = ""
for char in acc_str:
if char == "[":
pass
elif char == "]":
accs.append(current)
current = ""
else:
current += char
return accs
def get_accent(*, main_headword: str = None, kana: str = None) -> list[str]:
"""
Return a list of possible accents for a headword-kana combination. Must pass parameters as keywords to avoid confusion.
If there is no accent available, raises a `NoAccentError`.
"""
main_headword, kana = validate_headword_and_kana(main_headword, kana)
entry = find_entry(main_headword=main_headword, kana=kana)
if entry == []: return []
entry_body = get_body(entry)
has_accent, accents_raw = get_accent_from_body(entry_body)
if has_accent:
possible_accents = process_djr_accents(accents_raw)
return possible_accents
else:
raise NoAccentError(f"Term {main_headword}({kana}) has no accent in Daijirin.")