kovachev-bot/ja-accents/ja-accent-add.py

import sys
import pywikibot
import mwparserfromhell
from typing import Generator
from daijirin import are_duplicate_kanas, is_kana, get_accent


SITE = pywikibot.Site("en", "wiktionary")
NO_ACC_TRACKING_PAGE = "tracking/ja-pron/no accent"
BLACKLIST = "blacklist.txt"

class JapaneseSectionNotFound(ValueError):
    """The entry had no Japanese section."""

def get_japanese_section(parsed_text: mwparserfromhell.wikicode.Wikicode):
    try:
        return parsed_text.get_sections([2], "Japanese")[0]
    except:
        raise JapaneseSectionNotFound()

def get_kana_from_pron(ja_pron: mwparserfromhell.wikicode.Template, page_title: str) -> str:
    # If entry is all kana, no kana will be provided in the {{ja-pron}}, so infer from title
    if ja_pron.has("1"):
        kana = str(ja_pron.get("1"))
    else:
        if not is_kana(page_title):
            raise ValueError(f"ERROR, improperly formatted template on page {page_title}: pron template did not have kana despite non-kana title.")
        kana = page_title
    return kana

def there_are_duplicate_readings(ja_prons: list[mwparserfromhell.wikicode.Template], title: str) -> bool:
    return are_duplicate_kanas([get_kana_from_pron(pron, page_title=title) for pron in ja_prons])

def update_page(title: str):
    page = pywikibot.Page(SITE, title)
    parsed = mwparserfromhell.parse(page.text)
    japanese_section = get_japanese_section(parsed)
    ja_prons = [template for template in japanese_section.filter(forcetype=mwparserfromhell.wikicode.Template) if template.name == "ja-pron"]

    if len(ja_prons) == 0:
        raise ValueError(f"ERROR, no ja-pron on the page {title} to begin with, doing nothing.")

    if there_are_duplicate_readings(ja_prons, title):
        raise ValueError(f"ERROR, there are multiple indistinguishable terms on this page {title} with the same reading")

    accent_added = False
    for template in ja_prons:
        template: mwparserfromhell.wikicode.Template

        kana = get_kana_from_pron(template, title)

        possible_pitches = get_accent(main_headword=title, kana=kana)

        for i, accent in enumerate(possible_pitches):
            acc_param = f"acc{i+1 if i > 0 else ''}"
            acc_ref_param = f"{acc_param}_ref"

            if template.has(acc_param) or template.has(acc_ref_param):
                print("Template already has accent information, continuing", file=sys.stderr)
                break

            template.add(acc_param, accent)
            template.add(acc_ref_param, "DJR")
            accent_added = True

    # Only add references if we have actually added any accents to the page
    if accent_added and "===References===" not in japanese_section:
        japanese_section.append("\n\n===References===\n<references />\n\n")

    previous_text = page.text
    page.text = str(parsed)
    while "\n\n\n" in page.text:
        page.text = page.text.replace("\n\n\n", "\n\n")

    if page.text == previous_text:
        print("Content was identical, exiting...")
        return

    print(str(mwparserfromhell.parse(page.text).get_sections([2], "Japanese")[0]), "Is this text acceptable? (y/n)", sep="\n")

    valid = False
    while not valid:
        answer = input()
        if answer == "y" or answer == "n":
            valid = True

    if answer == "y":
        page.save("Added pitch accents from Daijirin to Japanese", minor=False)

def get_accentless_pages() -> Generator[pywikibot.Page, None, None]:
    TEMPLATE_NAMESPACE = SITE.namespaces.TEMPLATE
    MAIN_NAMESPACE = SITE.namespaces.MAIN
    return pywikibot.Page(SITE, NO_ACC_TRACKING_PAGE, ns=TEMPLATE_NAMESPACE).getReferences(only_template_inclusion=True, namespaces=[MAIN_NAMESPACE])

def iterate_pages(blacklist: set):
    for page in get_accentless_pages():
        title = page.title()
        if title in blacklist:
            print(f"Skipping page {title}")
            continue

        try:
            print(f"Updating pitch accents for page {title}")
            update_page(title)
        except Exception as e:
            print(f"Unable to update {title} due to error: {e}", file=sys.stderr)
            print(f"Adding {title} to blacklist")
            blacklist.add(title)

def main():
    try:
        with open(BLACKLIST) as f:
            blacklist = set(map(str.strip, f.readlines()))
    except FileNotFoundError:
        blacklist = set()

    # update_page("碧玉")
    # update_page("パイプカット")
    # update_page("火手")
    # update_page("AA")

    try:
        iterate_pages(blacklist)
    finally:
        with open(BLACKLIST, mode="w") as f:
            f.write("\n".join(blacklist))

if __name__ == "__main__":
    main()