"""Attach audio recordings from Wikimedia Commons to Bulgarian Wiktionary entries.

The first command-line argument selects the mode (default ``auto``):

* ``auto``    -- walk the uploader's Commons contributions and add an
  ``{{audio}}`` line to the matching English Wiktionary entry where that can
  be done safely; entries that cannot be handled are written to
  ``NEED_ATTENTION``.
* ``manual``  -- step through the entries listed in ``NEED_ATTENTION``,
  opening each in the browser with a ready-made snippet on the clipboard.
* ``reorder`` -- report bot-edited entries whose pronunciation templates are
  not in the expected order.
"""

from enum import Enum
import re
import sys
from typing import Generator, Iterable, Optional
from urllib.parse import quote
import webbrowser

import mwparserfromhell
from mwparserfromhell.nodes import Heading, Template
from mwparserfromhell.wikicode import Wikicode
import pyperclip
import pywikibot

import kovachevbot

SITE = pywikibot.Site("en", "wiktionary")
COMMONS = pywikibot.Site("commons", "commons")
NEED_ATTENTION = "audio_needs_attention.txt"
SEEN_ENTRIES = "audio_seen_files.txt"
HEAD_TEMPLATES = {
    "bg-noun",
    "bg-verb",
    "bg-adj",
    "head",
    "bg-adv",
    "bg-verbal noun",
    "bg-verbal noun form",
    "bg-letter",
    "bg-part",
    "bg-part form",
    "bg-phrase",
    "bg-proper noun",
}

# Text that directly precedes the term in an uploaded file's name, e.g.
# "File:LL-Q7918 (bul)-Kiril kovachev-<term>.wav".
_UPLOADER_MARKER = "Kiril kovachev-"

# Lower-case Bulgarian vowels; the text being searched is lower-cased first.
_VOWELS = re.compile("[аъоуеияѝю]")

# Expected order of templates inside a Pronunciation section; templates that
# share a value may appear in either order relative to each other.
_TEMPLATE_PRECEDENCE = {
    "bg-IPA": 0,
    "IPA": 0,
    "audio": 1,
    "rhymes": 2,
    "bg-hyph": 3,
    "hyph": 3,
}


def get_audio_template_from_file_name(file_name: str) -> str:
    """Return the wikitext bullet that embeds *file_name* as Bulgarian audio.

    Every occurrence of ``File:`` is removed from the name before it is placed
    in the ``{{audio|bg|...|Audio}}`` template.
    """
    return f"* {{{{audio|bg|{file_name.replace('File:', '')}|Audio}}}}"


def section_name(section: Wikicode) -> str:
    """Return the title of *section*: its first node with ``=`` runs removed.

    Surrounding whitespace is stripped so ``== Bulgarian ==`` and
    ``==Bulgarian==`` yield the same name.
    """
    return re.sub("=+", "", str(section.nodes[0])).strip()


def get_section_candidate(sections) -> int:
    """Return the 1-based position of the Bulgarian section within *sections*.

    The whole Bulgarian section is selected (rather than a subsection) so the
    editor can decide where inside it the audio belongs.

    Raises:
        ValueError: if no section in *sections* is named "Bulgarian".
    """
    return [section_name(s) for s in sections].index("Bulgarian") + 1


def get_wikitonary_edit_url(page_name: str, sections) -> str:
    """Return the edit URL for the Bulgarian section of *page_name*.

    The title is percent-encoded so that characters such as ``&``, ``#`` or
    spaces cannot break the query string.

    Raises:
        ValueError: propagated from ``get_section_candidate`` when *sections*
            contains no Bulgarian section.
    """
    section_to_edit = get_section_candidate(sections)
    return (
        "https://en.wiktionary.org/w/index.php"
        f"?title={quote(page_name)}&action=edit&section={section_to_edit}"
    )


def add_pronunciation_section(bulgarian_section: Wikicode, term: str) -> None:
    """Insert a ``===Pronunciation===`` section with ``{{bg-IPA|term}}``.

    The new section is appended to the first level-3 Etymology section if one
    exists, otherwise to the first level-3 Alternative forms section, and
    otherwise inserted before the first heading that follows the section's
    opening node. *bulgarian_section* is modified in place.

    Raises:
        IndexError: if the fallback branch finds no heading after the first
            node of *bulgarian_section*.
    """
    term = kovachevbot.links_to_plaintext(term)  # Remove link syntax from term
    pronunciation_content = "\n\n===Pronunciation===\n* {{bg-IPA|" + term + "}}\n\n"

    if etymologies := bulgarian_section.get_sections([3], "Etymology"):
        etymologies[0].append(pronunciation_content)
    elif alternative_forms := bulgarian_section.get_sections([3], "Alternative forms"):
        alternative_forms[0].append(pronunciation_content)
    else:
        # Start at 1 to skip the section's opening node, then find the first
        # heading after it.
        i = 1
        while not isinstance(bulgarian_section.nodes[i], Heading):
            i += 1
        bulgarian_section.get_sections([2], "Bulgarian")[0].insert_before(
            bulgarian_section.nodes[i], pronunciation_content
        )

    # The padding around the inserted text can leave runs of blank lines;
    # collapse them until none remain.
    while "\n\n\n" in bulgarian_section:
        bulgarian_section.replace("\n\n\n", "\n\n")


def process_header(template: Template) -> set[str]:
    """Return the set of headword forms given by a headword *template*.

    For ``{{head}}`` the primary form is read from the ``head`` parameter; for
    every other template it is read from positional parameter 1. In both cases
    ``head2``, ``head3``, ... are collected until the first missing number.
    """
    heads: set[str] = set()

    primary = "head" if str(template.name) == "head" else "1"
    if template.has(primary):
        heads.add(str(template.get(primary).value))

    i = 2
    while template.has(param_name := f"head{i}"):
        heads.add(str(template.get(param_name).value))
        i += 1

    return heads


class PageEditStatus(Enum):
    """Outcome of trying to add an audio line to a page."""

    SUCCESS = 0  # the audio line may be / was added
    NOTHING_TO_ADD = 1  # the pronunciation section already has an audio template
    CANNOT_ADD = 2  # the page needs a human decision


def _should_edit(
    page_name: str,
    bulgarian_section: Wikicode,
    pronunciation_subsections: list,
    pronunciation_subsections_l3: list,
) -> PageEditStatus:
    """Decide whether the audio can be added to the first pronunciation section."""
    if len(pronunciation_subsections) != 1 and not pronunciation_subsections_l3:
        # Need exactly one pronunciation section to attach to, unless a
        # level-3 one exists.
        return PageEditStatus.CANNOT_ADD

    for template in pronunciation_subsections[0].filter_templates():
        if template.name == "audio":
            return PageEditStatus.NOTHING_TO_ADD

    # Lower-case first so that capitalised titles have all vowels counted.
    if len(_VOWELS.findall(page_name.lower())) == 1:
        # Monosyllabic, so add audio
        return PageEditStatus.SUCCESS

    prons = 0
    heads = 0
    for template in bulgarian_section.filter_templates():
        if template.name == "bg-IPA":
            prons += 1
        elif str(template.name) in HEAD_TEMPLATES:
            heads += 1

    if prons > 1:
        # More than one pronunciation template (which may have different stress?)
        return PageEditStatus.CANNOT_ADD

    if pronunciation_subsections_l3:
        # If there is an L3, that means an editor has identified there to be
        # only one common pronunciation
        return PageEditStatus.SUCCESS

    if heads > 1:
        return PageEditStatus.CANNOT_ADD

    if len(bulgarian_section.get_sections([3], "Etymology")) > 1:
        # If multiple etymologies (homographs), skip
        return PageEditStatus.CANNOT_ADD

    return PageEditStatus.SUCCESS


def visit_page(page_name: str, audio_file_name: str) -> Optional[PageEditStatus]:
    """Try to add *audio_file_name* to the Bulgarian entry on *page_name*.

    If the entry has fewer than two Etymology sections, no Pronunciation
    section, and at most one distinct headword form, a Pronunciation section
    is created first. The page is saved only when the outcome is ``SUCCESS``.

    Returns:
        The ``PageEditStatus`` of the attempt, or ``None`` when the page has
        no Bulgarian section (a message is printed to stderr in that case).
    """
    page = pywikibot.Page(SITE, page_name)
    parsed = mwparserfromhell.parse(page.text)

    bulgarian_sections = parsed.get_sections([2], "Bulgarian")
    if not bulgarian_sections:
        print("No Bulgarian entry for term", page_name, file=sys.stderr)
        return None
    bulgarian_section: Wikicode = bulgarian_sections[0]

    edit_summary = ""
    etymology_sections = bulgarian_section.get_sections([3], "Etymology")
    pronunciation_subsections = bulgarian_section.get_sections([3, 4], "Pronunciation")
    pronunciation_subsections_l3 = bulgarian_section.get_sections([3], "Pronunciation")

    if len(etymology_sections) < 2 and not pronunciation_subsections:
        all_headers: set[str] = set()
        for template in bulgarian_section.filter_templates():
            if str(template.name) in HEAD_TEMPLATES:
                all_headers.update(process_header(template))

        # With several distinct headword forms there is no single term to
        # hand to {{bg-IPA}}, so no section is created.
        if len(all_headers) <= 1:
            term = all_headers.pop() if all_headers else page_name
            add_pronunciation_section(bulgarian_section, term)
            pronunciation_subsections = bulgarian_section.get_sections([3, 4], "Pronunciation")
            pronunciation_subsections_l3 = bulgarian_section.get_sections([3], "Pronunciation")
            edit_summary += "Added pronunciation section; "

    edit_status = _should_edit(
        page_name,
        bulgarian_section,
        pronunciation_subsections,
        pronunciation_subsections_l3,
    )

    if edit_status is PageEditStatus.SUCCESS:
        target = pronunciation_subsections[0]
        # Turn double newlines in the last node into single ones so the audio
        # bullet is not separated from the preceding line by a blank line.
        target.nodes[-1] = target.nodes[-1].replace("\n\n", "\n")
        target.append(get_audio_template_from_file_name(audio_file_name) + "\n\n")
        page.text = str(parsed)
        edit_summary += ("A" if edit_summary == "" else "a") + "dd audio from User:Kiril kovachev"
        page.save(edit_summary, minor=False)

    return edit_status


def run(attention: list[str], contribs: Iterable[str]) -> None:
    """Visit the page for every file title in *contribs*.

    Each title is expected to look like ``File:...Kiril kovachev-<term>.wav``;
    ``<term>`` is used as the page name. Page names whose visit ends in
    ``CANNOT_ADD`` are appended to *attention*. Titles without the uploader
    marker are reported on stderr and skipped.
    """
    for namespaced_filename in contribs:
        marker_at = namespaced_filename.rfind(_UPLOADER_MARKER)
        if marker_at == -1:
            print("Cannot derive a page name from", namespaced_filename, file=sys.stderr)
            continue

        filename = namespaced_filename[5:]  # drop the "File:" prefix
        # The term sits between the marker and the 4-character extension.
        page_name = namespaced_filename[marker_at + len(_UPLOADER_MARKER):-4]

        status = visit_page(page_name, filename)
        if status is PageEditStatus.CANNOT_ADD:
            attention.append(page_name)
            print(f"Failed to update page {page_name}, requires manual attention", file=sys.stderr)


def get_lines(filename: str) -> list[str]:
    """Return the stripped, non-empty lines of *filename*.

    If the file does not exist it is created empty and ``[]`` is returned.
    Any other error (e.g. a decoding failure) propagates, so an existing
    file is never overwritten by this function.
    """
    try:
        with open(filename, encoding="utf-8") as f:
            return [stripped for line in f if (stripped := line.strip())]
    except FileNotFoundError:
        with open(filename, mode="w", encoding="utf-8"):
            pass
        return []


def _write_lines(filename: str, lines: Iterable[str]) -> None:
    """Write *lines* to *filename*, newline-separated, as UTF-8."""
    with open(filename, mode="w", encoding="utf-8") as f:
        f.write("\n".join(lines))


def contributions(
    user: pywikibot.User,
    seen: list[str] = None,
    quit_if_seen: bool = True,
) -> Generator[str, None, None]:
    """Yield titles of the user's contributed ``.wav`` files.

    A title is yielded when it starts with ``File``, ends with ``.wav`` and
    contains ``LL``; other titles are reported on stderr. Every title examined
    is appended to *seen* (the caller's list is mutated). When *quit_if_seen*
    is true, iteration stops at the first title already present in *seen*.
    """
    if seen is None:
        seen = []
    already_seen = set(seen)  # constant-time membership; `seen` keeps the order

    for record in user.contributions(total=-1):
        file = record[0].title()
        if quit_if_seen and file in already_seen:
            print("Caught up to latest changes, quitting")
            return

        if file.startswith("File") and file.endswith(".wav") and "LL" in file:
            yield file
        else:
            print("Ignoring contribution", file, file=sys.stderr)

        seen.append(file)
        already_seen.add(file)


def auto_add() -> None:
    """Process new Commons uploads and persist the attention and seen lists.

    Both state files are rewritten even when the run is interrupted or fails.
    """
    attention = get_lines(NEED_ATTENTION)
    seen = get_lines(SEEN_ENTRIES)
    me = pywikibot.User(COMMONS, "User:Kiril kovachev")
    try:
        run(attention, contributions(me, seen))
    except KeyboardInterrupt:
        print()
    finally:
        _write_lines(NEED_ATTENTION, attention)
        _write_lines(SEEN_ENTRIES, seen)


def manual() -> None:
    """Walk through the entries needing attention, one browser tab at a time.

    For each entry a pronunciation snippet is copied to the clipboard and the
    Bulgarian section's edit page is opened; pressing enter marks the entry as
    handled. Entries not yet handled are written back to ``NEED_ATTENTION``.
    """
    attention = get_lines(NEED_ATTENTION)
    try:
        # Iterate over a snapshot: handled entries are removed from
        # `attention` inside the loop.
        for line in list(attention):
            page = pywikibot.Page(SITE, line)
            parsed = mwparserfromhell.parse(page.text)
            sections = parsed.get_sections([2, 3, 4, 5, 6, 7])
            pyperclip.copy(
                f"\n===Pronunciation===\n* {{{{bg-IPA|{line}}}}}\n"
                + get_audio_template_from_file_name(f"File:LL-Q7918 (bul)-Kiril kovachev-{line}.wav")
                + "\n"
            )
            webbrowser.open_new_tab(get_wikitonary_edit_url(line, sections))
            input("Press enter for the next file: ")
            attention.remove(line)
    except KeyboardInterrupt:
        print()
    finally:
        _write_lines(NEED_ATTENTION, attention)


def reorder(limit: int = 2300) -> None:
    """Print the titles of bot-edited pages with misordered pronunciation templates.

    For each of the bot's last *limit* contributions, the templates of the
    first level-3 Pronunciation section are checked against
    ``_TEMPLATE_PRECEDENCE``. A page is flagged as soon as a template has a
    lower precedence than one seen before it; templates not in the table
    count as precedence -1. Pages without a Bulgarian or Pronunciation
    section are skipped.
    """
    me = pywikibot.User(SITE, "User:KovachevBot")
    disordered: list[str] = []

    for page, *_ in me.contributions(limit):
        title = page.title()
        print("Visiting", title)
        parsed = mwparserfromhell.parse(page.text)

        bulgarian_sections = parsed.get_sections([2], "Bulgarian")
        if not bulgarian_sections:
            continue
        pronunciation_sections = bulgarian_sections[0].get_sections([3], "Pronunciation")
        if not pronunciation_sections:
            continue

        highest_precedence = 0
        for template in pronunciation_sections[0].filter_templates():
            template_precedence = _TEMPLATE_PRECEDENCE.get(str(template.name), -1)
            if template_precedence > highest_precedence:
                highest_precedence = template_precedence
            elif template_precedence < highest_precedence:
                print("Entry is out of order:", title)
                disordered.append(title)
                break

    print(disordered)


def main() -> None:
    """Dispatch on the first command-line argument (default ``auto``)."""
    mode = sys.argv[1] if len(sys.argv) > 1 and sys.argv[1] else "auto"
    if mode == "auto":
        auto_add()
    elif mode == "manual":
        manual()
    elif mode == "reorder":
        reorder()
    else:
        print("Unrecognized mode", mode)


if __name__ == "__main__":
    main()