313 lines
12 KiB
Python
313 lines
12 KiB
Python
from enum import Enum
|
|
import sys
|
|
from typing import Generator, Iterable
|
|
import webbrowser
|
|
import re
|
|
import pyperclip
|
|
import pywikibot
|
|
import mwparserfromhell
|
|
import kovachevbot
|
|
|
|
SITE = pywikibot.Site("en", "wiktionary")
|
|
COMMONS = pywikibot.Site("commons", "commons")
|
|
NEED_ATTENTION = "audio_needs_attention.txt"
|
|
SEEN_ENTRIES = "audio_seen_files.txt"
|
|
HEAD_TEMPLATES = {"bg-noun", "bg-verb", "bg-adj", "head", "bg-adv", "bg-verbal noun", "bg-verbal noun form", "bg-letter", "bg-part", "bg-part form", "bg-phrase", "bg-proper noun"}
|
|
|
|
|
|
def get_audio_template_from_file_name(file_name: str) -> str:
|
|
return f"* {{{{audio|bg|{file_name.replace('File:', '')}|Audio}}}}"
|
|
|
|
def section_name(section: mwparserfromhell.wikicode.Wikicode) -> str:
|
|
return re.sub("=+", "", str(section.nodes[0]))
|
|
|
|
def get_section_candidate(sections) -> str:
|
|
"""
|
|
Returns a code, which is the ordinal number of the section which is the most appropriate one
|
|
to edit on this page. For example, if the page has only a Bulgarian section with
|
|
1 Pronunciation and 1 Noun header, this will return 2, as 1 is the first section (Bulgarian),
|
|
and 2 is the second section in the page, which is the Pronunciation section.
|
|
If there are in fact multiple Bulgarian pronunciation headers, then that requires that we edit
|
|
the whole Bulgarian section so I can figure out which is the correct place to put the audio.
|
|
"""
|
|
return [section_name(s) for s in sections].index("Bulgarian") + 1
|
|
|
|
def get_wikitonary_edit_url(page_name: str, sections) -> str:
|
|
section_to_edit = get_section_candidate(sections)
|
|
return f"https://en.wiktionary.org/w/index.php?title={page_name}&action=edit§ion={section_to_edit}"
|
|
|
|
def add_pronunciation_section(bulgarian_section: mwparserfromhell.wikicode.Wikicode, term: str) -> None:
|
|
term = kovachevbot.links_to_plaintext(term) # Remove link syntax from term
|
|
PRONUNCIATION_CONTENT = "\n\n===Pronunciation===\n* {{bg-IPA|" + term + "}}\n\n"
|
|
|
|
if (e := bulgarian_section.get_sections([3], "Etymology")):
|
|
etymology: mwparserfromhell.wikicode.Wikicode = e[0]
|
|
etymology.append(PRONUNCIATION_CONTENT)
|
|
elif (a := bulgarian_section.get_sections([3], "Alternative forms")):
|
|
alternative_forms: mwparserfromhell.wikicode.Wikicode = a[0]
|
|
alternative_forms.append(PRONUNCIATION_CONTENT)
|
|
else:
|
|
i = 1
|
|
while not type(bulgarian_section.nodes[i]) is mwparserfromhell.nodes.heading.Heading:
|
|
i += 1
|
|
|
|
bulgarian_section.get_sections([2], "Bulgarian")[0].insert_before(bulgarian_section.nodes[i], PRONUNCIATION_CONTENT)
|
|
|
|
while "\n\n\n" in bulgarian_section:
|
|
bulgarian_section.replace("\n\n\n", "\n\n")
|
|
|
|
def process_header(template: mwparserfromhell.wikicode.Template) -> set[str]:
|
|
name = str(template.name)
|
|
|
|
def default_class() -> set[str]:
|
|
s = set()
|
|
if template.has("1"):
|
|
s.add(str(template.get(1).value))
|
|
|
|
i = 2
|
|
while template.has((param_name := f"head{i}")):
|
|
s.add(str(template.get(param_name).value))
|
|
i += 1
|
|
|
|
return s
|
|
|
|
if name == "head":
|
|
s = set()
|
|
if template.has("head"):
|
|
s.add(str(template.get("head").value))
|
|
i = 2
|
|
while template.has((param_name := f"head{i}")):
|
|
s.add(str(template.get(param_name).value))
|
|
i += 1
|
|
else:
|
|
s = default_class()
|
|
|
|
return s
|
|
|
|
|
|
class PageEditStatus(Enum):
|
|
SUCCESS = 0
|
|
NOTHING_TO_ADD = 1
|
|
CANNOT_ADD = 2
|
|
|
|
def is_bg_ipa(node) -> bool:
|
|
return isinstance(node, mwparserfromhell.wikicode.Template) and node.name == "bg-IPA"
|
|
|
|
def visit_page(page_name: str, audio_file_name: str) -> bool:
|
|
p = pywikibot.Page(SITE, page_name)
|
|
parsed = mwparserfromhell.parse(p.text)
|
|
|
|
# sections = parsed.get_sections([2, 3, 4, 5, 6, 7])
|
|
try:
|
|
bulgarian_section: mwparserfromhell.wikicode.Wikicode = parsed.get_sections([2], "Bulgarian")[0]
|
|
except IndexError:
|
|
print("No Bulgarian entry for term", page_name, file=sys.stderr)
|
|
return
|
|
|
|
edit_summary = ""
|
|
# bulgarian_subsections = bulgarian_section.get_sections([3, 4, 5, 6, 7])
|
|
etymology_sections = bulgarian_section.get_sections([3], "Etymology")
|
|
pronunciation_subsections = bulgarian_section.get_sections([3, 4], "Pronunciation")
|
|
pronunciation_subsections_l3 = bulgarian_section.get_sections([3], "Pronunciation")
|
|
|
|
if len(etymology_sections) < 2 and not pronunciation_subsections:
|
|
all_headers = set()
|
|
for template in bulgarian_section.filter(forcetype=mwparserfromhell.wikicode.Template):
|
|
if str(template.name) in HEAD_TEMPLATES:
|
|
all_headers.update(process_header(template))
|
|
|
|
if len(all_headers) > 1:
|
|
pass
|
|
else:
|
|
if len(all_headers) == 1:
|
|
term = all_headers.pop()
|
|
else:
|
|
term = page_name
|
|
|
|
add_pronunciation_section(bulgarian_section, term)
|
|
pronunciation_subsections = bulgarian_section.get_sections([3, 4], "Pronunciation")
|
|
pronunciation_subsections_l3 = bulgarian_section.get_sections([3], "Pronunciation")
|
|
edit_summary += "Added pronunciation section; "
|
|
|
|
def should_edit() -> PageEditStatus:
|
|
if len(pronunciation_subsections) != 1 and not pronunciation_subsections_l3:
|
|
# Need one existing pronunciation section to attach to,
|
|
# or if none exists then it could be created later
|
|
return PageEditStatus.CANNOT_ADD
|
|
|
|
for template in pronunciation_subsections[0].filter(forcetype=mwparserfromhell.wikicode.Template):
|
|
if template.name == "audio":
|
|
return PageEditStatus.NOTHING_TO_ADD
|
|
|
|
vowels = "[аъоуеияѝю]"
|
|
if len(re.findall(vowels, page_name)) == 1:
|
|
# Monosyllabic so add audio
|
|
return PageEditStatus.SUCCESS
|
|
|
|
prons = 0
|
|
heads = 0
|
|
for template in bulgarian_section.filter(forcetype=mwparserfromhell.wikicode.Template):
|
|
if template.name == "bg-IPA":
|
|
prons += 1
|
|
elif str(template.name) in HEAD_TEMPLATES:
|
|
heads += 1
|
|
|
|
if prons > 1:
|
|
# More than one pronunciation template or more than one part of speech (which may have different stress?)
|
|
return PageEditStatus.CANNOT_ADD
|
|
|
|
if pronunciation_subsections_l3:
|
|
# If there is an L3, that means an editor has identified there to be only one common pronunciatin
|
|
return PageEditStatus.SUCCESS
|
|
|
|
if heads > 1:
|
|
return PageEditStatus.CANNOT_ADD
|
|
|
|
if len(bulgarian_section.get_sections([3], "Etymology")) > 1:
|
|
# If multiple etymologies (homographs), skip
|
|
return PageEditStatus.CANNOT_ADD
|
|
|
|
|
|
return PageEditStatus.SUCCESS
|
|
|
|
# pyperclip.copy(get_audio_template_from_file_name(audio_file_name))
|
|
# webbrowser.open_new_tab(get_wikitonary_edit_url(page_name, sections))
|
|
edit_status = should_edit()
|
|
if edit_status is PageEditStatus.SUCCESS:
|
|
i = 0
|
|
pron_section: mwparserfromhell.wikicode.Wikicode = pronunciation_subsections[0]
|
|
|
|
templates: list[mwparserfromhell.wikicode.Template] = pron_section.filter_templates()
|
|
TO_INSERT = "\n" + get_audio_template_from_file_name(audio_file_name)
|
|
|
|
if is_bg_ipa(templates[0]):
|
|
pron_section.insert_after(templates[0], TO_INSERT)
|
|
else:
|
|
pron_section.insert(1, TO_INSERT)
|
|
|
|
p.text = str(parsed)
|
|
edit_summary += ("A" if edit_summary == "" else "a") + "dd audio from User:Kiril kovachev"
|
|
p.save(edit_summary, minor=False)
|
|
|
|
return edit_status
|
|
|
|
def run(attention: list[str], contribs: Iterable[str]):
|
|
for namespaced_filename in contribs:
|
|
filename = namespaced_filename[5:]
|
|
page_name = namespaced_filename[namespaced_filename.rfind("Kiril kovachev-")+1+len("Kiril kovachev"):-4]
|
|
status = visit_page(page_name, filename)
|
|
|
|
if status is PageEditStatus.CANNOT_ADD:
|
|
attention.append(page_name)
|
|
print(f"Failed to update page {page_name}, requires manual attention", file=sys.stderr)
|
|
|
|
def get_lines(filename: str) -> list[str]:
|
|
try:
|
|
with open(filename) as f:
|
|
lines = [line.strip() for line in f.readlines()]
|
|
except:
|
|
with open(filename, mode="w") as f:
|
|
pass
|
|
|
|
lines = []
|
|
|
|
return lines
|
|
|
|
def contributions(user: pywikibot.User, seen: list[str] = None, quit_if_seen: bool = True) -> Generator[str, None, None]:
|
|
if seen is None: seen = []
|
|
|
|
for record in user.contributions(total=-1):
|
|
file = record[0].title()
|
|
|
|
if quit_if_seen and file in seen:
|
|
print("Caught up to latest changes, quitting")
|
|
return
|
|
|
|
if file.startswith("File") and file.endswith(".wav") and "LL" in file:
|
|
yield file
|
|
else:
|
|
print("Ignoring contribution", file, file=sys.stderr)
|
|
|
|
seen.append(file)
|
|
|
|
def auto_add():
|
|
attention = get_lines(NEED_ATTENTION)
|
|
seen = get_lines(SEEN_ENTRIES)
|
|
|
|
me = pywikibot.User(COMMONS, "User:Kiril kovachev")
|
|
|
|
try:
|
|
run(attention, contributions(me, seen))
|
|
except KeyboardInterrupt:
|
|
print()
|
|
finally:
|
|
with open(NEED_ATTENTION, mode="w") as f:
|
|
f.write("\n".join(attention))
|
|
|
|
with open(SEEN_ENTRIES, mode="w") as f:
|
|
f.write("\n".join(seen))
|
|
|
|
def manual():
|
|
attention = get_lines(NEED_ATTENTION)
|
|
|
|
try:
|
|
for line in attention:
|
|
p = pywikibot.Page(SITE, line)
|
|
parsed = mwparserfromhell.parse(p.text)
|
|
sections = parsed.get_sections([2, 3, 4, 5, 6, 7])
|
|
|
|
pyperclip.copy(f"\n===Pronunciation===\n* {{{{bg-IPA|{line}}}}}\n" + get_audio_template_from_file_name(f"File:LL-Q7918 (bul)-Kiril kovachev-{line}.wav") + "\n")
|
|
webbrowser.open_new_tab(get_wikitonary_edit_url(line, sections))
|
|
input("Press enter for the next file: ")
|
|
attention.remove(line)
|
|
except KeyboardInterrupt:
|
|
print()
|
|
finally:
|
|
with open(NEED_ATTENTION, mode="w") as f:
|
|
f.write("\n".join(attention))
|
|
|
|
def reorder(limit: int = 2300):
|
|
me = pywikibot.User(SITE, "User:KovachevBot")
|
|
|
|
disordered: list[str] = []
|
|
PRECEDENCE = [["bg-IPA", "IPA"], "audio", "rhymes", ["bg-hyph", "hyph"]]
|
|
|
|
get_precedence = lambda x: PRECEDENCE.index([item for item in PRECEDENCE if (x in item if type(item) is list else x == item)][0])
|
|
|
|
for page, *_ in me.contributions(limit):
|
|
title = page.title()
|
|
print("Visiting", title)
|
|
content = page.text
|
|
parsed = mwparserfromhell.parse(content)
|
|
bulgarian_section: mwparserfromhell.wikicode.Wikicode = parsed.get_sections([2], "Bulgarian")[0]
|
|
pronunciation = bulgarian_section.get_sections([3], "Pronunciation")
|
|
if not pronunciation: continue
|
|
pronunciation: mwparserfromhell.wikicode.Wikicode = pronunciation[0]
|
|
highest_precedence = 0
|
|
for template in pronunciation.filter(forcetype=mwparserfromhell.wikicode.Template):
|
|
try:
|
|
template_precedence = get_precedence(str(template.name))
|
|
except:
|
|
template_precedence = -1
|
|
if template_precedence > highest_precedence:
|
|
highest_precedence = template_precedence
|
|
elif template_precedence < highest_precedence:
|
|
print("Entry is out of order:", title)
|
|
disordered.append(title)
|
|
break
|
|
print(disordered)
|
|
|
|
def main():
|
|
mode = len(sys.argv) > 1 and sys.argv[1] or "auto"
|
|
|
|
if mode == "auto":
|
|
auto_add()
|
|
elif mode == "manual":
|
|
manual()
|
|
elif mode == "reorder":
|
|
reorder()
|
|
else:
|
|
print("Unrecognized mode", mode)
|
|
|
|
if __name__ == "__main__":
|
|
main() |