From 8081f406258f72739ba4e1c733b6db9680e6634a Mon Sep 17 00:00:00 2001
From: Spooghetti420
Date: Mon, 10 Jul 2023 21:12:10 +0100
Subject: [PATCH] Add accent scraping script

---
 ja-accents/.gitignore       |   1 +
 ja-accents/daijirin.py      | 110 +++++++++++++++++++++++++++++++++++
 ja-accents/ja-accent-add.py | 112 ++++++++++++++++++++++++++++++++++++
 3 files changed, 223 insertions(+)
 create mode 100644 ja-accents/.gitignore
 create mode 100644 ja-accents/daijirin.py
 create mode 100644 ja-accents/ja-accent-add.py

diff --git a/ja-accents/.gitignore b/ja-accents/.gitignore
new file mode 100644
index 0000000..2494723
--- /dev/null
+++ b/ja-accents/.gitignore
@@ -0,0 +1 @@
+term_bank_1.json
\ No newline at end of file
diff --git a/ja-accents/daijirin.py b/ja-accents/daijirin.py
new file mode 100644
index 0000000..ab94bb0
--- /dev/null
+++ b/ja-accents/daijirin.py
@@ -0,0 +1,110 @@
+import json
+import regex as re
+from os.path import expanduser
+
+DJR_DATA_FILE = expanduser("~/Downloads/(三省堂)スーパー大辞林[3.0]/term_bank_1.json")  # Replace with your path to the DJR JSON data
+ACCENT_LIST_REGEX = re.compile(r"(?:\[\d{1,2}\])+")
+
+class ItemNotFoundError(ValueError):
+    """Raised when a Daijirin lookup finds no matching item."""
+
+class NoAccentError(ValueError):
+    """Raised when an entry is found but defines no pitch accent: no accent exists in the data."""
+
+
+# NOTICE: loading the full term bank requires 3 GB+ of RAM at runtime.
+# Be cautious if your system does not currently have sufficient available memory.
+with open(DJR_DATA_FILE, encoding="utf-8") as f:
+    DAIJIRIN = json.load(f)  # json.load, rather than eval, handles true/false/null and is safe on untrusted data
+
+def is_kana(s: str) -> bool:
+    HIRAGANA_START = '\u3040'
+    HIRAGANA_END = '\u309f'
+    KATAKANA_START = '\u30a0'
+    KATAKANA_END = '\u30ff'
+    return all((HIRAGANA_START <= char <= HIRAGANA_END) or (KATAKANA_START <= char <= KATAKANA_END) for char in s)
+
+def validate_headword_and_kana(main_headword: str | None = None, kana: str | None = None) -> tuple[str, str]:
+    """
+    If `kana` is not specified, raises an error: a term containing non-kana characters
+    (i.e. kanji) cannot be processed without its reading. If `main_headword` is not
+    specified but `kana` is, the term is kana-only, so `main_headword` is set to `kana`.
+    Returns the post-processed 2-tuple (`main_headword`, `kana`).
+    """
+    if kana is not None and main_headword is None:
+        main_headword = kana
+    elif kana is None:
+        raise ValueError("Must specify kana parameter")
+
+    return main_headword, kana
+
+def are_duplicate_kanas(list_of_kana_readings: list[str]) -> bool:
+    """Illustrative input: for the headword 人, `list_of_kana_readings`=["ひと", "にん", "じん"].
+    This returns `False` because no two readings are identical.
+    """
+    # Sets contain unique items only, so duplicates make the set smaller than the list;
+    # if there are no duplicates, the two lengths are equal.
+    return len(set(list_of_kana_readings)) < len(list_of_kana_readings)
+
+def find_entry(*, main_headword: str | None = None, kana: str | None = None) -> list:
+    """
+    Finds the record in the dictionary data file corresponding to the input `main_headword`
+    (usually kanji) and `kana`. (If the term is kana-only, only `kana` needs to be specified;
+    otherwise, both do.) If nothing is found, returns an empty list.
+    """
+    main_headword, kana = validate_headword_and_kana(main_headword, kana)
+
+    def entry_matches(entry: list) -> bool:
+        if is_kana(main_headword):
+            return entry[0] == main_headword
+        return entry[0] == main_headword and entry[1] == kana
+
+    for item in DAIJIRIN:
+        if entry_matches(item):
+            return item
+
+    # If nothing is found, return empty list
+    return []
+
+def get_body(entry: list) -> str:
+    # Although the element at index 5 of an entry in our format is a list,
+    # every entry in the dictionary has exactly one item in that list:
+    # the body of the entry (both the definition and the pitch accent information live there).
+    return entry[5][0]
+
+def get_accent_from_body(entry_body: str) -> tuple[bool, str]:
+    """
+    From an entry body, returns whether a pitch accent is defined, together with the string
+    listing all the possible pitch accents in a row (e.g. [1][0], [4][3], etc.).
+    """
+    match = ACCENT_LIST_REGEX.search(entry_body)
+    return bool(match), match.group(0) if match else ""
+
+def process_djr_accents(acc_str: str) -> list[str]:
+    """Return a list of accents from a string like [1][0]."""
+    accs = []
+    current = ""
+    for char in acc_str:
+        if char == "[":
+            pass
+        elif char == "]":
+            accs.append(current)
+            current = ""
+        else:
+            current += char
+    return accs
+
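+# A quick sketch of how the two helpers above compose. The entry text and the
+# accent digits here are hypothetical, not taken from a real Daijirin record:
+#
+#   >>> get_accent_from_body("ひと【人】…[0][2]…")
+#   (True, '[0][2]')
+#   >>> process_djr_accents("[0][2]")
+#   ['0', '2']
+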
+ """ + main_headword, kana = validate_headword_and_kana(main_headword, kana) + + def entry_matches(entry: list) -> bool: + if is_kana(main_headword): + return entry[0] == main_headword + return entry[0] == main_headword and entry[1] == kana + + for item in DAIJIRIN: + if entry_matches(item): + return item + + # If nothing is found, return empty list + return [] + +def get_body(entry: list) -> str: + # Although the 5th element of an entry in our format is a list, + # every single entry in the dictionary only has 1 item in that list, which + # is the body of the entry (the definition, pitch accent information are both in there.). + return entry[5][0] + +def get_accent_from_body(entry_body: str) -> tuple[bool, str]: + """ + From an entry body, returns both whether there is a pitch accent defined, and the string representing + all the possible pitch accents in a row (e.g. [1][0], [4][3], etc.) + """ + match = ACCENT_LIST_REGEX.search(entry_body) + return bool(match), match.group(0) if bool(match) else "" + +def process_djr_accents(acc_str: str) -> list[str]: + """Return list of accents from a string like [1][0].""" + accs = [] + current = "" + for char in acc_str: + if char == "[": + pass + elif char == "]": + accs.append(current) + current = "" + else: + current += char + return accs + +def get_accent(*, main_headword: str = None, kana: str = None) -> list[str]: + """ + Return a list of possible accents for a headword-kana combination. Must pass parameters as keywords to avoid confusion. + If there is no accent available, raises a `NoAccentError`. + """ + main_headword, kana = validate_headword_and_kana(main_headword, kana) + entry = find_entry(main_headword=main_headword, kana=kana) + if entry == []: return [] + entry_body = get_body(entry) + has_accent, accents_raw = get_accent_from_body(entry_body) + if has_accent: + possible_accents = process_djr_accents(accents_raw) + return possible_accents + else: + raise NoAccentError(f"Term {main_headword}({kana}) has no accent in Daijirin.") diff --git a/ja-accents/ja-accent-add.py b/ja-accents/ja-accent-add.py new file mode 100644 index 0000000..ccf57b5 --- /dev/null +++ b/ja-accents/ja-accent-add.py @@ -0,0 +1,112 @@ +import pywikibot +import mwparserfromhell +from typing import Generator +from daijirin import are_duplicate_kanas, is_kana, get_accent + + +SITE = pywikibot.Site("en", "wiktionary") +NO_ACC_TRACKING_PAGE = "tracking/ja-pron/no accent" +BLACKLIST = "blacklist.txt" + +class JapaneseSectionNotFound(ValueError): + """The entry had no Japanese section.""" + +def get_japanese_section(parsed_text: mwparserfromhell.wikicode.Wikicode): + try: + return parsed_text.get_sections([2], "Japanese")[0] + except: + raise JapaneseSectionNotFound() + +def get_kana_from_pron(ja_pron: mwparserfromhell.wikicode.Template, page_title: str) -> str: + # If entry is all kana, no kana will be provided in the {{ja-pron}}, so infer from title + if ja_pron.has("1"): + kana = str(ja_pron.get("1")) + else: + if not is_kana(page_title): + raise ValueError(f"ERROR, improperly formatted template on page {page_title}: pron template did not have kana despite non-kana title.") + kana = page_title + return kana + +def there_are_duplicate_readings(ja_prons: list[mwparserfromhell.wikicode.Template], title: str) -> bool: + return are_duplicate_kanas([get_kana_from_pron(pron, page_title=title) for pron in ja_prons]) + +def update_page(title: str): + page = pywikibot.Page(SITE, title) + parsed = mwparserfromhell.parse(page.text) + japanese_section = 
+def get_accentless_pages() -> Generator[pywikibot.Page, None, None]:
+    TEMPLATE_NAMESPACE = SITE.namespaces.TEMPLATE
+    MAIN_NAMESPACE = SITE.namespaces.MAIN
+    return pywikibot.Page(SITE, NO_ACC_TRACKING_PAGE, ns=TEMPLATE_NAMESPACE).getReferences(only_template_inclusion=True, namespaces=[MAIN_NAMESPACE])
+
+def iterate_pages(blacklist: set[str]):
+    for page in get_accentless_pages():
+        title = page.title()
+        if title in blacklist:
+            continue
+        try:
+            update_page(title)
+        except Exception as e:
+            print(f"Unable to update {title} due to error: {e}")
+            blacklist.add(title)
+
+def main():
+    try:
+        with open(BLACKLIST) as f:
+            blacklist = set(map(str.strip, f.readlines()))
+    except FileNotFoundError:
+        blacklist = set()
+
+    # update_page("碧玉")
+    # update_page("パイプカット")
+
+    try:
+        iterate_pages(blacklist)
+    finally:
+        # Persist the blacklist, one title per line, even if iteration is interrupted
+        with open(BLACKLIST, mode="w") as f:
+            f.write("\n".join(sorted(blacklist)))
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file