Add audio porter
This commit is contained in:
parent
140aa5005a
commit
1849b5e9ea
303
auto-audio/auto-audio.py
Normal file
303
auto-audio/auto-audio.py
Normal file
@ -0,0 +1,303 @@
|
||||
import re
import sys
import webbrowser
from enum import Enum
from typing import Generator, Iterable
from urllib.parse import quote

import mwparserfromhell
import pyperclip
import pywikibot

import kovachevbot
|
||||
|
||||
# Wiki handles: entries are read and saved on English Wiktionary, while the
# uploader's contributions are listed from Wikimedia Commons.
SITE = pywikibot.Site("en", "wiktionary")
COMMONS = pywikibot.Site("commons", "commons")

# Plain-text state files, one item per line.
NEED_ATTENTION = "audio_needs_attention.txt"  # pages the bot could not edit
SEEN_ENTRIES = "audio_seen_files.txt"  # contribution titles already visited

# Template names that are treated as headword lines of an entry.
HEAD_TEMPLATES = {
    "bg-noun",
    "bg-verb",
    "bg-adj",
    "head",
    "bg-adv",
    "bg-verbal noun",
    "bg-verbal noun form",
    "bg-letter",
    "bg-part",
    "bg-part form",
    "bg-phrase",
    "bg-proper noun",
}
|
||||
|
||||
|
||||
def get_audio_template_from_file_name(file_name: str) -> str:
    """Return the wikitext bullet that embeds *file_name* as Bulgarian audio.

    A leading ``File:`` namespace prefix is dropped because the ``audio``
    template takes the bare file name. Only the prefix is removed, so a
    title that contains ``File:`` further in is left intact.
    """
    bare_name = file_name.removeprefix("File:")
    return f"* {{{{audio|bg|{bare_name}|Audio}}}}"
|
||||
|
||||
def section_name(section: mwparserfromhell.wikicode.Wikicode) -> str:
    """Return the heading text of *section* without its ``=`` markers.

    The first node of the section is taken to be its heading. Only the
    leading and trailing runs of ``=`` are removed, and surrounding
    whitespace is stripped, so ``== Bulgarian ==`` yields ``"Bulgarian"``
    and an ``=`` inside the title survives.
    """
    heading = str(section.nodes[0]).strip()
    return re.sub(r"^=+|=+$", "", heading).strip()
|
||||
|
||||
def get_section_candidate(sections) -> int:
    """Return the 1-based ordinal of the Bulgarian section within *sections*.

    The result is used as the ``section`` number of an edit URL. For a page
    whose first section is Bulgarian this returns 1.

    The original intent was to point at the Pronunciation subsection when
    exactly one exists and to fall back to the whole Bulgarian section
    otherwise; the current implementation always selects the whole
    Bulgarian section.

    Raises:
        ValueError: if none of *sections* is named "Bulgarian".
    """
    names = [section_name(section) for section in sections]
    return names.index("Bulgarian") + 1
|
||||
|
||||
def get_wikitonary_edit_url(page_name: str, sections) -> str:
    """Return the English Wiktionary edit URL for the Bulgarian section of *page_name*.

    *sections* is the list of parsed page sections; it is only used to work
    out which section number to open. The title is percent-encoded so that
    characters such as ``&``, ``#`` or spaces cannot break the query string.

    Raises:
        ValueError: if *sections* contains no Bulgarian section.
    """
    section_to_edit = get_section_candidate(sections)
    title = quote(page_name, safe="")
    return f"https://en.wiktionary.org/w/index.php?title={title}&action=edit&section={section_to_edit}"
|
||||
|
||||
def add_pronunciation_section(bulgarian_section: mwparserfromhell.wikicode.Wikicode, term: str) -> None:
    """Insert a level-3 Pronunciation section with a ``bg-IPA`` line for *term*.

    The section is placed, in order of preference:

    1. at the end of the first level-3 Etymology section;
    2. at the end of the first level-3 Alternative forms section;
    3. before the first heading that follows the language heading;
    4. at the end of *bulgarian_section* when it has no further heading.

    *bulgarian_section* is modified in place. Afterwards runs of three
    newlines are collapsed to two.
    """
    term = kovachevbot.links_to_plaintext(term)  # Remove link syntax from term
    pronunciation_content = "\n\n===Pronunciation===\n* {{bg-IPA|" + term + "}}\n\n"

    if etymology := bulgarian_section.get_sections([3], "Etymology"):
        etymology[0].append(pronunciation_content)
    elif alternative_forms := bulgarian_section.get_sections([3], "Alternative forms"):
        alternative_forms[0].append(pronunciation_content)
    else:
        # Node 0 is the language heading itself, so the search starts after it.
        first_subheading = next(
            (
                node
                for index, node in enumerate(bulgarian_section.nodes)
                if index > 0 and isinstance(node, mwparserfromhell.nodes.heading.Heading)
            ),
            None,
        )
        if first_subheading is None:
            # No sub-heading to anchor on: put the new section at the end
            # rather than running off the end of the node list.
            bulgarian_section.append(pronunciation_content)
        else:
            bulgarian_section.get_sections([2], "Bulgarian")[0].insert_before(first_subheading, pronunciation_content)

    while "\n\n\n" in bulgarian_section:
        bulgarian_section.replace("\n\n\n", "\n\n")
|
||||
|
||||
def process_header(template: mwparserfromhell.wikicode.Template) -> set[str]:
    """Collect the headword spellings declared by a headword-line template.

    For the generic ``head`` template the main spelling is read from the
    ``head`` parameter; for every other template it is read from the first
    positional parameter. Additional spellings are read from ``head2``,
    ``head3``, ... until the first missing number.

    Values are stripped of surrounding whitespace and empty values are
    dropped, so padding in the wikitext cannot create spurious or duplicate
    headwords.
    """
    primary_param = "head" if str(template.name).strip() == "head" else "1"

    def value_of(param_name: str) -> str:
        return str(template.get(param_name).value).strip()

    headwords: set[str] = set()
    if template.has(primary_param):
        headwords.add(value_of(primary_param))

    i = 2
    while template.has(param_name := f"head{i}"):
        headwords.add(value_of(param_name))
        i += 1

    headwords.discard("")
    return headwords
|
||||
|
||||
|
||||
class PageEditStatus(Enum):
    """Outcome of trying to add an audio line to one entry."""

    # The audio line was appended and the page was saved.
    SUCCESS = 0
    # The pronunciation section already carries an audio template.
    NOTHING_TO_ADD = 1
    # The entry is ambiguous; it is queued for manual attention.
    CANNOT_ADD = 2
|
||||
|
||||
def visit_page(page_name: str, audio_file_name: str) -> "PageEditStatus | None":
    """Try to add the recording *audio_file_name* to the Bulgarian entry *page_name*.

    If the entry has fewer than two Etymology sections, no Pronunciation
    section and at most one distinct headword, a Pronunciation section is
    created first. The audio bullet is then appended to the first
    Pronunciation section and the page is saved, but only when
    ``should_edit`` judges the entry unambiguous.

    Returns:
        The ``PageEditStatus`` describing what happened, or ``None`` when the
        page has no Bulgarian section at all (a message is printed to stderr).
    """
    p = pywikibot.Page(SITE, page_name)
    parsed = mwparserfromhell.parse(p.text)

    try:
        bulgarian_section = parsed.get_sections([2], "Bulgarian")[0]
    except IndexError:
        print("No Bulgarian entry for term", page_name, file=sys.stderr)
        return None

    edit_summary = ""
    etymology_sections = bulgarian_section.get_sections([3], "Etymology")
    pronunciation_subsections = bulgarian_section.get_sections([3, 4], "Pronunciation")
    pronunciation_subsections_l3 = bulgarian_section.get_sections([3], "Pronunciation")

    if len(etymology_sections) < 2 and not pronunciation_subsections:
        all_headers = set()
        for template in bulgarian_section.filter(forcetype=mwparserfromhell.wikicode.Template):
            if str(template.name) in HEAD_TEMPLATES:
                all_headers.update(process_header(template))

        # Several distinct headwords give no single spelling to hand to
        # bg-IPA, so in that case no section is created.
        if len(all_headers) <= 1:
            term = all_headers.pop() if all_headers else page_name

            add_pronunciation_section(bulgarian_section, term)
            pronunciation_subsections = bulgarian_section.get_sections([3, 4], "Pronunciation")
            pronunciation_subsections_l3 = bulgarian_section.get_sections([3], "Pronunciation")
            edit_summary += "Added pronunciation section; "

    def should_edit() -> PageEditStatus:
        """Decide whether the audio can be attached without human review."""
        if len(pronunciation_subsections) != 1 and not pronunciation_subsections_l3:
            # Need exactly one pronunciation section to attach to, or at
            # least one at level 3.
            return PageEditStatus.CANNOT_ADD

        for template in pronunciation_subsections[0].filter(forcetype=mwparserfromhell.wikicode.Template):
            if template.name == "audio":
                return PageEditStatus.NOTHING_TO_ADD

        vowels = "[аъоуеияѝю]"
        # Lower-case first: the pattern lists lower-case vowels only, so a
        # capitalised initial vowel would otherwise go uncounted and a
        # two-syllable title would pass as monosyllabic.
        if len(re.findall(vowels, page_name.lower())) == 1:
            # Monosyllabic so add audio
            return PageEditStatus.SUCCESS

        prons = 0
        heads = 0
        for template in bulgarian_section.filter(forcetype=mwparserfromhell.wikicode.Template):
            if template.name == "bg-IPA":
                prons += 1
            elif str(template.name) in HEAD_TEMPLATES:
                heads += 1

        if prons > 1:
            # More than one pronunciation template: cannot tell which one
            # the recording belongs to.
            return PageEditStatus.CANNOT_ADD

        if pronunciation_subsections_l3:
            # A level-3 section means an editor has identified a single
            # pronunciation shared by the whole entry.
            return PageEditStatus.SUCCESS

        if heads > 1:
            # Several headword lines may carry different stress.
            return PageEditStatus.CANNOT_ADD

        if len(bulgarian_section.get_sections([3], "Etymology")) > 1:
            # If multiple etymologies (homographs), skip
            return PageEditStatus.CANNOT_ADD

        return PageEditStatus.SUCCESS

    edit_status = should_edit()
    if edit_status is PageEditStatus.SUCCESS:
        target = pronunciation_subsections[0]

        # Collapse blank lines at the end of the section so the audio bullet
        # continues the existing list, and make sure it starts on its own
        # line even when the section ends without a newline.
        tail = str(target.nodes[-1]).replace("\n\n", "\n")
        if not tail.endswith("\n"):
            tail += "\n"
        target.nodes[-1] = tail
        target.append(get_audio_template_from_file_name(audio_file_name) + "\n\n")

        p.text = str(parsed)
        edit_summary += ("A" if edit_summary == "" else "a") + "dd audio from User:Kiril kovachev"
        p.save(edit_summary, minor=False)

    return edit_status
|
||||
|
||||
def run(attention: list[str], contribs: Iterable[str]):
    """Visit the entry for every uploaded recording named in *contribs*.

    Each item is a namespaced file title. The page name is the text between
    the last ``Kiril kovachev-`` marker and the four-character extension.
    Titles without that marker are reported on stderr and skipped, because
    no page name can be derived from them.

    Pages for which the edit is refused are appended to *attention* (once)
    so they can be handled manually later.
    """
    marker = "Kiril kovachev-"

    for namespaced_filename in contribs:
        marker_at = namespaced_filename.rfind(marker)
        if marker_at == -1:
            print("Cannot derive a page name from", namespaced_filename, file=sys.stderr)
            continue

        filename = namespaced_filename[5:]  # drop the "File:" prefix
        page_name = namespaced_filename[marker_at + len(marker):-4]
        status = visit_page(page_name, filename)

        if status is PageEditStatus.CANNOT_ADD:
            if page_name not in attention:
                attention.append(page_name)
            print(f"Failed to update page {page_name}, requires manual attention", file=sys.stderr)
|
||||
|
||||
def get_lines(filename: str) -> list[str]:
    """Return the non-blank, stripped lines of *filename*.

    If the file does not exist it is created empty and an empty list is
    returned. Any other error while reading is propagated: catching
    everything here would fall through to re-creating the file and wipe
    its existing contents.
    """
    try:
        with open(filename) as f:
            return [stripped for line in f if (stripped := line.strip())]
    except FileNotFoundError:
        # First run: leave an empty file behind so later runs find it.
        with open(filename, mode="w"):
            pass
        return []
|
||||
|
||||
def contributions(user: pywikibot.User, seen: list[str] = None, quit_if_seen: bool = True) -> Generator[str, None, None]:
    """Yield the titles of *user*'s contributions that look like recordings.

    A title is yielded when it starts with ``File``, ends with ``.wav`` and
    contains ``LL``; anything else is reported on stderr. Every title that
    has been dealt with is appended to *seen*. With *quit_if_seen* set,
    iteration stops at the first title that is already in *seen*.
    """
    seen = [] if seen is None else seen

    for page, *_ in user.contributions(total=-1):
        title = page.title()

        if quit_if_seen and title in seen:
            print("Caught up to latest changes, quitting")
            return

        is_recording = title.startswith("File") and title.endswith(".wav") and "LL" in title
        if not is_recording:
            print("Ignoring contribution", title, file=sys.stderr)
        else:
            yield title

        # Reached only once the consumer asks for the next item, so a title
        # whose processing was interrupted is not recorded as seen.
        seen.append(title)
|
||||
|
||||
def auto_add():
    """Process new uploads automatically and persist the bot's state files.

    Loads the attention and seen lists, walks the uploader's contributions
    and, whether the run finishes, is interrupted with Ctrl-C or fails,
    writes both lists back to disk.
    """
    attention = get_lines(NEED_ATTENTION)
    seen = get_lines(SEEN_ENTRIES)
    uploader = pywikibot.User(COMMONS, "User:Kiril kovachev")

    try:
        run(attention, contributions(uploader, seen))
    except KeyboardInterrupt:
        print()
    finally:
        for path, entries in ((NEED_ATTENTION, attention), (SEEN_ENTRIES, seen)):
            with open(path, mode="w") as f:
                f.write("\n".join(entries))
|
||||
|
||||
def manual():
    """Walk through the pages that need attention, one browser tab at a time.

    For each queued page a ready-made Pronunciation section (IPA plus audio
    line) is copied to the clipboard and the edit form of its Bulgarian
    section is opened. After Enter is pressed the page is removed from the
    queue. The remaining queue is written back on exit, including after
    Ctrl-C.
    """
    attention = get_lines(NEED_ATTENTION)

    try:
        # Iterate over a snapshot: removing from the list being iterated
        # would make the loop skip every other entry.
        for line in list(attention):
            p = pywikibot.Page(SITE, line)
            parsed = mwparserfromhell.parse(p.text)
            sections = parsed.get_sections([2, 3, 4, 5, 6, 7])

            pyperclip.copy(f"\n===Pronunciation===\n* {{{{bg-IPA|{line}}}}}\n" + get_audio_template_from_file_name(f"File:LL-Q7918 (bul)-Kiril kovachev-{line}.wav") + "\n")
            webbrowser.open_new_tab(get_wikitonary_edit_url(line, sections))
            input("Press enter for the next file: ")
            attention.remove(line)
    except KeyboardInterrupt:
        print()
    finally:
        with open(NEED_ATTENTION, mode="w") as f:
            f.write("\n".join(attention))
|
||||
|
||||
def reorder(limit: int = 2300):
    """Report bot-edited entries whose pronunciation templates are out of order.

    The expected order inside a level-3 Pronunciation section is IPA, audio,
    rhymes, hyphenation. Up to *limit* contributions of the bot account are
    inspected; pages without a Bulgarian or Pronunciation section are
    skipped. Titles found out of order are printed as they are found and
    again as a list at the end.

    Templates not listed in the precedence table rank below everything
    else, so they are reported as out of order wherever they appear.
    """
    me = pywikibot.User(SITE, "User:KovachevBot")

    disordered: list[str] = []
    precedence = (("bg-IPA", "IPA"), ("audio",), ("rhymes",), ("bg-hyph", "hyph"))
    rank_of = {name: rank for rank, group in enumerate(precedence) for name in group}

    for page, *_ in me.contributions(limit):
        title = page.title()
        print("Visiting", title)
        parsed = mwparserfromhell.parse(page.text)

        bulgarian_sections = parsed.get_sections([2], "Bulgarian")
        if not bulgarian_sections:
            continue
        pronunciation_sections = bulgarian_sections[0].get_sections([3], "Pronunciation")
        if not pronunciation_sections:
            continue

        highest_precedence = 0
        for template in pronunciation_sections[0].filter(forcetype=mwparserfromhell.wikicode.Template):
            template_precedence = rank_of.get(str(template.name).strip(), -1)
            if template_precedence > highest_precedence:
                highest_precedence = template_precedence
            elif template_precedence < highest_precedence:
                print("Entry is out of order:", title)
                disordered.append(title)
                break

    print(disordered)
|
||||
|
||||
def main():
    """Run the mode named by the first command-line argument.

    Recognised modes are ``auto`` (the default when the argument is missing
    or empty), ``manual`` and ``reorder``. Any other value prints a message
    and does nothing.
    """
    mode = sys.argv[1] if len(sys.argv) > 1 and sys.argv[1] else "auto"

    if mode == "auto":
        auto_add()
    elif mode == "manual":
        manual()
    elif mode == "reorder":
        reorder()
    else:
        print("Unrecognized mode", mode)
|
||||
|
||||
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user