kovachev-bot/auto-audio/auto-audio.py
2023-09-08 19:31:56 +01:00

313 lines
12 KiB
Python

from enum import Enum
import sys
from typing import Generator, Iterable
import webbrowser
import re
import pyperclip
import pywikibot
import mwparserfromhell
import kovachevbot
# English Wiktionary: the wiki whose entries this script reads and edits.
SITE = pywikibot.Site("en", "wiktionary")
# Wikimedia Commons: used to list the uploader's contributions (audio files).
COMMONS = pywikibot.Site("commons", "commons")
# Text file (one page name per line) of entries that could not be updated
# automatically and are left for the manual mode.
NEED_ATTENTION = "audio_needs_attention.txt"
# Text file (one title per line) of contributions already visited by a run.
SEEN_ENTRIES = "audio_seen_files.txt"
# Template names treated as headword templates when scanning a Bulgarian entry.
HEAD_TEMPLATES = {"bg-noun", "bg-verb", "bg-adj", "head", "bg-adv", "bg-verbal noun", "bg-verbal noun form", "bg-letter", "bg-part", "bg-part form", "bg-phrase", "bg-proper noun"}
def get_audio_template_from_file_name(file_name: str) -> str:
    """Build the wikitext bullet that embeds *file_name* as Bulgarian audio.

    Every occurrence of ``File:`` is removed from the name before it is
    placed inside the ``{{audio|bg|...|Audio}}`` template.
    """
    bare_name = file_name.replace('File:', '')
    return f"* {{{{audio|bg|{bare_name}|Audio}}}}"
def section_name(section: mwparserfromhell.wikicode.Wikicode) -> str:
    """Return the text of the section's first node with all ``=`` removed.

    For a section that starts with a heading such as ``===Noun===`` the
    result is ``Noun``.
    """
    first_node_text = str(section.nodes[0])
    return re.sub("=+", "", first_node_text)
def get_section_candidate(sections) -> int:
    """Return the 1-based position of the Bulgarian section in *sections*.

    The result is used as the ``section=`` value of a wiki edit URL, so the
    editor opens on the whole Bulgarian language section and the right spot
    for the audio can be chosen by hand.

    Args:
        sections: Sequence of parsed sections, in page order.

    Returns:
        Index of the first section whose name is exactly ``Bulgarian``,
        plus one.

    Raises:
        ValueError: If no section is named ``Bulgarian``.
    """
    names = [section_name(s) for s in sections]
    return names.index("Bulgarian") + 1
def get_wikitonary_edit_url(page_name: str, sections) -> str:
    """Return the English Wiktionary edit URL for the Bulgarian section of a page.

    Args:
        page_name: Title of the page to edit.
        sections: Parsed sections of that page, passed on to
            ``get_section_candidate`` to pick the section number.

    Returns:
        An ``index.php`` URL with ``action=edit`` and the chosen ``section``.

    Raises:
        ValueError: Propagated from ``get_section_candidate`` when the page
            has no Bulgarian section.
    """
    # Imported locally so this function does not depend on a file-level import.
    from urllib.parse import quote

    section_to_edit = get_section_candidate(sections)
    # Percent-encode the title: characters such as "&", "#", "?" or spaces
    # would otherwise be read as URL syntax and truncate or corrupt the title.
    encoded_title = quote(page_name, safe="")
    return f"https://en.wiktionary.org/w/index.php?title={encoded_title}&action=edit&section={section_to_edit}"
def add_pronunciation_section(bulgarian_section: mwparserfromhell.wikicode.Wikicode, term: str) -> None:
    """Insert a new ``===Pronunciation===`` subsection into *bulgarian_section*.

    The new subsection holds a single ``{{bg-IPA|term}}`` bullet. Placement:

    1. appended to the first level-3 section matching "Etymology", if any;
    2. otherwise appended to the first level-3 section matching
       "Alternative forms", if any;
    3. otherwise inserted before the first heading node found after the
       section's opening node.

    Afterwards, runs of three newlines in the section are collapsed to two.
    The section is modified in place.
    """
    term = kovachevbot.links_to_plaintext(term)  # Remove link syntax from term
    new_section_text = "\n\n===Pronunciation===\n* {{bg-IPA|" + term + "}}\n\n"

    etymologies = bulgarian_section.get_sections([3], "Etymology")
    if etymologies:
        etymologies[0].append(new_section_text)
    else:
        alternative_forms = bulgarian_section.get_sections([3], "Alternative forms")
        if alternative_forms:
            alternative_forms[0].append(new_section_text)
        else:
            # Scan forward from index 1 (index 0 is skipped) to the next heading.
            position = 1
            while type(bulgarian_section.nodes[position]) is not mwparserfromhell.nodes.heading.Heading:
                position += 1
            language_section = bulgarian_section.get_sections([2], "Bulgarian")[0]
            language_section.insert_before(bulgarian_section.nodes[position], new_section_text)

    # The padding newlines in the inserted text may stack with existing ones.
    while "\n\n\n" in bulgarian_section:
        bulgarian_section.replace("\n\n\n", "\n\n")
def process_header(template: mwparserfromhell.wikicode.Template) -> set[str]:
    """Collect the headword strings declared by a headword template.

    For ``{{head}}`` the primary headword is read from the ``head`` parameter;
    for every other template it is read from parameter ``1``. In both cases
    additional headwords are read from ``head2``, ``head3``, ... until the
    first missing number.

    Returns:
        The set of parameter values found, as strings (possibly empty).
    """
    primary_param = "head" if str(template.name) == "head" else "1"

    headwords = set()
    if template.has(primary_param):
        headwords.add(str(template.get(primary_param).value))

    # Numbered extras start at head2 and must be contiguous.
    number = 2
    while template.has(f"head{number}"):
        headwords.add(str(template.get(f"head{number}").value))
        number += 1
    return headwords
class PageEditStatus(Enum):
    """Outcome of trying to add an audio template to a page."""

    # The audio template was inserted and the page saved.
    SUCCESS = 0
    # The pronunciation section already contains an audio template.
    NOTHING_TO_ADD = 1
    # The page is ambiguous; it is left for manual handling.
    CANNOT_ADD = 2
def is_bg_ipa(node) -> bool:
    """Tell whether *node* is a template whose name equals ``bg-IPA``."""
    if not isinstance(node, mwparserfromhell.wikicode.Template):
        return False
    return node.name == "bg-IPA"
def visit_page(page_name: str, audio_file_name: str) -> "PageEditStatus | None":
    """Try to add an audio template to the Bulgarian entry of *page_name*.

    If the Bulgarian section has no Pronunciation subsection, fewer than two
    Etymology sections and at most one distinct headword, a Pronunciation
    section is created first. The audio line is then inserted (after a
    leading ``bg-IPA`` template when there is one) and the page is saved,
    but only when ``should_edit`` judges the page unambiguous. Nothing is
    saved for any other status, even if a Pronunciation section was created
    in memory.

    Args:
        page_name: Title of the Wiktionary page.
        audio_file_name: Name of the audio file to reference.

    Returns:
        The ``PageEditStatus`` describing what happened, or ``None`` when the
        page has no Bulgarian section.
    """
    p = pywikibot.Page(SITE, page_name)
    parsed = mwparserfromhell.parse(p.text)

    try:
        bulgarian_section = parsed.get_sections([2], "Bulgarian")[0]
    except IndexError:
        print("No Bulgarian entry for term", page_name, file=sys.stderr)
        return None

    edit_summary = ""

    etymology_sections = bulgarian_section.get_sections([3], "Etymology")
    pronunciation_subsections = bulgarian_section.get_sections([3, 4], "Pronunciation")
    pronunciation_subsections_l3 = bulgarian_section.get_sections([3], "Pronunciation")

    if len(etymology_sections) < 2 and not pronunciation_subsections:
        all_headers = set()
        for template in bulgarian_section.filter(forcetype=mwparserfromhell.wikicode.Template):
            if str(template.name) in HEAD_TEMPLATES:
                all_headers.update(process_header(template))

        # With several distinct headwords we cannot tell which one the
        # recording belongs to, so no section is created.
        if len(all_headers) <= 1:
            term = all_headers.pop() if all_headers else page_name
            add_pronunciation_section(bulgarian_section, term)
            # Re-query so the newly created section is visible below.
            pronunciation_subsections = bulgarian_section.get_sections([3, 4], "Pronunciation")
            pronunciation_subsections_l3 = bulgarian_section.get_sections([3], "Pronunciation")
            edit_summary += "Added pronunciation section; "

    def should_edit() -> PageEditStatus:
        """Decide whether the audio can be attached without human review."""
        if len(pronunciation_subsections) != 1 and not pronunciation_subsections_l3:
            # Need one existing pronunciation section to attach to,
            # or if none exists then it could be created later
            return PageEditStatus.CANNOT_ADD

        for template in pronunciation_subsections[0].filter(forcetype=mwparserfromhell.wikicode.Template):
            if template.name == "audio":
                return PageEditStatus.NOTHING_TO_ADD

        vowels = "[аъоуеияѝю]"
        if len(re.findall(vowels, page_name)) == 1:
            # Monosyllabic so add audio
            return PageEditStatus.SUCCESS

        prons = 0
        heads = 0
        for template in bulgarian_section.filter(forcetype=mwparserfromhell.wikicode.Template):
            if template.name == "bg-IPA":
                prons += 1
            elif str(template.name) in HEAD_TEMPLATES:
                heads += 1

        if prons > 1:
            # More than one pronunciation template (which may have different stress?)
            return PageEditStatus.CANNOT_ADD
        if pronunciation_subsections_l3:
            # If there is an L3, that means an editor has identified there
            # to be only one common pronunciation
            return PageEditStatus.SUCCESS
        if heads > 1:
            return PageEditStatus.CANNOT_ADD
        if len(bulgarian_section.get_sections([3], "Etymology")) > 1:
            # If multiple etymologies (homographs), skip
            return PageEditStatus.CANNOT_ADD
        return PageEditStatus.SUCCESS

    edit_status = should_edit()
    if edit_status is PageEditStatus.SUCCESS:
        pron_section = pronunciation_subsections[0]
        templates = pron_section.filter_templates()
        to_insert = "\n" + get_audio_template_from_file_name(audio_file_name)
        # Guard against a pronunciation section without any template:
        # indexing templates[0] unconditionally would raise IndexError.
        if templates and is_bg_ipa(templates[0]):
            pron_section.insert_after(templates[0], to_insert)
        else:
            # Index 0 is the section's first node; place the audio right after it.
            pron_section.insert(1, to_insert)
        p.text = str(parsed)
        edit_summary += ("A" if edit_summary == "" else "a") + "dd audio from User:Kiril kovachev"
        p.save(edit_summary, minor=False)
    return edit_status
def run(attention: list[str], contribs: Iterable[str]):
    """Visit the page for each contributed file and record the failures.

    The page name is the part of the title after the last
    ``Kiril kovachev-`` with the final four characters (the extension)
    dropped; the first five characters of the title are dropped to get the
    file name handed to ``visit_page``. Page names whose visit ends in
    ``CANNOT_ADD`` are appended to *attention* (mutated in place).
    """
    for namespaced_filename in contribs:
        filename = namespaced_filename[5:]
        name_start = namespaced_filename.rfind("Kiril kovachev-") + 1 + len("Kiril kovachev")
        page_name = namespaced_filename[name_start:-4]

        if visit_page(page_name, filename) is not PageEditStatus.CANNOT_ADD:
            continue

        attention.append(page_name)
        print(f"Failed to update page {page_name}, requires manual attention", file=sys.stderr)
def get_lines(filename: str) -> list[str]:
    """Read *filename* and return its lines with surrounding whitespace stripped.

    If the file does not exist it is created empty and an empty list is
    returned. Any other error (permissions, decoding, ...) is propagated:
    treating it like a missing file would reopen the file for writing and
    wipe whatever it contained.
    """
    try:
        with open(filename) as f:
            return [line.strip() for line in f]
    except FileNotFoundError:
        # First use: create the file so it exists from now on.
        with open(filename, mode="w"):
            pass
        return []
def contributions(user: pywikibot.User, seen: list[str] = None, quit_if_seen: bool = True) -> Generator[str, None, None]:
    """Yield the titles of the user's audio-file contributions.

    Each record of ``user.contributions(total=-1)`` is examined in order. A
    title is yielded when it starts with ``File``, ends with ``.wav`` and
    contains ``LL``; other titles are reported on stderr and skipped.

    Args:
        user: Account whose contributions are listed.
        seen: Titles handled by earlier runs. Newly visited titles are
            appended to this list in place, after the consumer has finished
            with the yielded title.
        quit_if_seen: Stop as soon as a title from an earlier run is reached.

    Yields:
        Matching contribution titles.
    """
    if seen is None:
        seen = []
    # Snapshot of titles known before this run. Checking the live list would
    # mistake a title repeated within this run for "caught up" and stop
    # early, silently skipping every older contribution.
    previously_seen = set(seen)
    visited_this_run = set()

    for record in user.contributions(total=-1):
        title = record[0].title()
        if quit_if_seen and title in previously_seen:
            print("Caught up to latest changes, quitting")
            return
        if title in visited_this_run:
            # Same title already handled during this run; nothing new to do.
            continue
        visited_this_run.add(title)

        if title.startswith("File") and title.endswith(".wav") and "LL" in title:
            yield title
        else:
            print("Ignoring contribution", title, file=sys.stderr)

        # Recorded only after the consumer resumes the generator, so a title
        # whose processing was interrupted is retried on the next run.
        if title not in previously_seen:
            seen.append(title)
def auto_add():
    """Run the automatic mode: add audio for every not-yet-seen upload.

    Loads the attention and seen lists, processes the contributions of
    ``User:Kiril kovachev`` on Commons, and always writes both lists back,
    also when interrupted with Ctrl-C.
    """
    needs_attention = get_lines(NEED_ATTENTION)
    already_seen = get_lines(SEEN_ENTRIES)
    uploader = pywikibot.User(COMMONS, "User:Kiril kovachev")

    try:
        new_files = contributions(uploader, already_seen)
        run(needs_attention, new_files)
    except KeyboardInterrupt:
        print()
    finally:
        # Persist progress in the same order as the lists were loaded.
        for path, entries in ((NEED_ATTENTION, needs_attention), (SEEN_ENTRIES, already_seen)):
            with open(path, mode="w") as f:
                f.write("\n".join(entries))
def manual():
    """Run the manual mode: walk through the pages that need attention.

    For each page name in the attention file, a ready-made Pronunciation
    section (``bg-IPA`` plus audio line) is copied to the clipboard and the
    page's Bulgarian section is opened for editing in the browser. After
    Enter is pressed the page is removed from the list. The remaining
    entries are always written back, also when interrupted with Ctrl-C.
    """
    attention = get_lines(NEED_ATTENTION)
    try:
        # Iterate over a copy: removing from the list being iterated would
        # make the loop skip every other entry.
        for line in list(attention):
            p = pywikibot.Page(SITE, line)
            parsed = mwparserfromhell.parse(p.text)
            sections = parsed.get_sections([2, 3, 4, 5, 6, 7])
            pyperclip.copy(f"\n===Pronunciation===\n* {{{{bg-IPA|{line}}}}}\n" + get_audio_template_from_file_name(f"File:LL-Q7918 (bul)-Kiril kovachev-{line}.wav") + "\n")
            webbrowser.open_new_tab(get_wikitonary_edit_url(line, sections))
            input("Press enter for the next file: ")
            attention.remove(line)
    except KeyboardInterrupt:
        print()
    finally:
        with open(NEED_ATTENTION, mode="w") as f:
            f.write("\n".join(attention))
def reorder(limit: int = 2300):
    """Report pages whose Pronunciation templates are out of the expected order.

    Walks the contributions of ``User:KovachevBot`` on Wiktionary and, for
    each page with a level-3 Bulgarian Pronunciation section, checks that
    its templates appear in the order IPA, audio, rhymes, hyphenation. The
    titles of offending pages are printed as they are found and as a list
    at the end.

    Args:
        limit: Value passed to ``contributions`` to cap the records examined.
    """
    me = pywikibot.User(SITE, "User:KovachevBot")
    disordered: list[str] = []
    # Rank of each known template; templates sharing a rank are interchangeable.
    precedence = {
        "bg-IPA": 0,
        "IPA": 0,
        "audio": 1,
        "rhymes": 2,
        "bg-hyph": 3,
        "hyph": 3,
    }

    for page, *_ in me.contributions(limit):
        title = page.title()
        print("Visiting", title)
        parsed = mwparserfromhell.parse(page.text)

        bulgarian_sections = parsed.get_sections([2], "Bulgarian")
        if not bulgarian_sections:
            # Skip instead of crashing on pages without a Bulgarian entry.
            print("No Bulgarian entry for term", title, file=sys.stderr)
            continue

        pronunciation_sections = bulgarian_sections[0].get_sections([3], "Pronunciation")
        if not pronunciation_sections:
            continue
        pronunciation = pronunciation_sections[0]

        highest_precedence = 0
        for template in pronunciation.filter(forcetype=mwparserfromhell.wikicode.Template):
            # Unknown templates rank -1.
            # NOTE(review): -1 is always below the running maximum, so any
            # unrecognised template marks the page as out of order. Kept as
            # in the original; confirm whether they should be ignored instead.
            template_precedence = precedence.get(str(template.name), -1)
            if template_precedence > highest_precedence:
                highest_precedence = template_precedence
            elif template_precedence < highest_precedence:
                print("Entry is out of order:", title)
                disordered.append(title)
                break

    print(disordered)
def main():
    """Dispatch on the first command-line argument.

    Recognised modes are ``auto`` (the default when the argument is missing
    or empty), ``manual`` and ``reorder``; anything else is reported.
    """
    cli_args = sys.argv[1:]
    mode = cli_args[0] if cli_args and cli_args[0] else "auto"

    if mode == "auto":
        auto_add()
        return
    if mode == "manual":
        manual()
        return
    if mode == "reorder":
        reorder()
        return
    print("Unrecognized mode", mode)


if __name__ == "__main__":
    main()