import pywikibot
import mwparserfromhell
import wikitextparser
import os

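# Mappings from the row/column labels used in the expanded declension table to the abbreviated
# tags passed to the {{inflection of}} template (grammatical number and form, respectively).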
numbers = {
    "singular": "s",
    "plural": "p",
}

forms = {
    "indefinite": "indef",
    "definite": "def",
    "definite<br>(subject form)": "sbjv",
    "definite<br>(object form)": "objv",
    "vocative form": "voc",
    "count form": "count",
}


# Connect to Wiktionary (as configured in the user's pywikibot profile)
site = pywikibot.Site()


"""Returns a list of dictionaries; each dictionary looks like this:
|
|
{ "lemma": <term>
|
|
"forms": {
|
|
<form 1>: [("indef", "s"), ("voc", "s")
|
|
etc.
|
|
}
|
|
}
|
|
Each dictionary is able to be used to generate an entire set of noun forms.
|
|
The reason there are multiple dictionaries in the list is because there can be
|
|
multiple noun forms per page, each of which *can* have different declensions.
|
|
If there is only one noun header on a page, it is not necessary to include multiple etymologies on the derived form page."""
|
|
def get_forms(page):
|
|
# 'result' is the list of dictionaries to be returned
|
|
result = []
|
|
text = page.expand_text() # Provides fully-expanded wikitext of a page, i.e. shows the full table markup instead of {{bg-ndecl}}
|
|
|
|
    document = mwparserfromhell.parse(text)
    bulgarian = document.get_sections(matches=r"Bulgarian")[0]  # Find the section that contains the Bulgarian entry
    nouns = bulgarian.get_sections(matches=r"Noun( \d+)*")  # Locate any noun sections
    for n in nouns:
        # Each noun section has a list of forms in its declension, which we populate as a dictionary
        all_forms = {"lemma": "", "forms": {}}
        declension = n.get_sections(matches=r"Declension")  # Find the declension header
        if declension == []:
            continue  # If there is no declension for this noun section, skip it
        else:
            declension = declension[0]  # Otherwise use the first matching section
        nav_frame = mwparserfromhell.parse(declension).nodes[2]  # Selects the wikitable itself
        table = wikitextparser.parse(str(nav_frame)).tables[0].data()  # Converts it into a 2D list
        columns = len(table[0])  # Always equals 3: the blank cell, plus the singular and plural columns
        rows = len(table)  # Differs between masculine and non-masculine nouns; also depends on vocative presence

        # Remove wiki formatting, leaving only the text contents of each cell
        get_tags = lambda item: list(mwparserfromhell.parse(item).nodes)  # Lists all nodes in a given cell
        stripper = lambda tags: [
            mwparserfromhell.parse(t).strip_code()
            for t in tags
            if isinstance(t, mwparserfromhell.nodes.tag.Tag)
            and t.tag == "span"
            and ' class="tr Latn"' not in t.attributes  # Ignore the Latin-script transliteration spans
        ]
        lemma = stripper(get_tags(table[1][1]))[0]
        all_forms["lemma"] = lemma
        for i in range(1, columns):
            column = table[0][i]
            for j in range(1, rows):
                row = table[j][0]
                tags = get_tags(table[j][i])
                values = stripper(tags)

                for v in values:
                    # Convert the table's "indefinite", "vocative form", "singular", etc. labels into "indef", "voc", "s", etc.
                    form = forms[row]
                    number = numbers[column]
                    # The count form is a bit different: its parameters are not "<form>|<number>" but the literal "count|form"
                    if row == "count form":
                        form, number = "count", "form"
                    if v not in all_forms["forms"]:
                        all_forms["forms"][v] = [(form, number)]
                    else:
                        all_forms["forms"][v].append((form, number))
        # This check is useful for masculine terms: the code above would label certain noun forms as both
        # 'definite (subject form) plural' and 'definite (object form) plural', even though those forms are
        # identical in Bulgarian. This snippet merges the two into simply 'definite plural'.
        for key in all_forms["forms"]:
            if all_forms["forms"][key] == [("sbjv", "p"), ("objv", "p")]:
                all_forms["forms"][key] = [("def", "p")]
        result.append(all_forms)
    return result


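# A sketch (illustrative only) of the structure that get_forms() returns and generate_derivatives()
# consumes, using the 'кукла' example discussed in the comments below; the real forms and accents
# come from the expanded declension tables:
# [{"lemma": "ку́кла", "forms": {"ку́кли": [("indef", "p")], "ку́кло": [("voc", "s")], ...}}]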
"""Used to generate pages from a list of dictionaries corresponding to the derived forms from declension tables.
|
|
Iterates over all forms and generates new pages for them if no page exists with a Bulgarian entry.
|
|
"""
|
|
def generate_derivatives(form_list):
|
|
|
|
pages_to_create = dict()
|
|
    for dic in form_list:
        for key in dic["forms"]:
            stripped = key.replace("́", "")  # Page titles omit the combining acute accent that marks stress
            if stripped not in pages_to_create:
                pages_to_create[stripped] = {
                    "associations": [
                        {
                            "mapping": [
                                dic["lemma"],
                                key
                            ],
                            "forms": dic["forms"][key]
                        }
                    ]
                }
            else:
                pages_to_create[stripped]["associations"].append(
                    {
                        "mapping": [
                            dic["lemma"],
                            key
                        ],
                        "forms": dic["forms"][key]
                    }
                )

    for title in pages_to_create:
        print(f"Creating page {title}.")
        derivative_page = pywikibot.Page(site, title)
        page_content = mwparserfromhell.parse(derivative_page.text)
        entry = ""

        # Check whether the bot can feasibly/permissibly edit the page. If not, skip it.
        if bool(page_content.get_sections(matches=r"Bulgarian")):
            print(f"NOTE: page {title} already contains an existing Bulgarian entry, skipping")
            continue
        if not derivative_page.botMayEdit():
            print(f"ERROR: page {title} disallows bot editing, skipping")
            continue

        # Once these checks have passed, we are clear to edit. The absence of a Bulgarian header also means
        # we are safe to generate the entry now: no processing time is wasted, because the generated content
        # is certain to find a place on the page.
        # A few scenarios now exist:
        # 1. The title corresponds to only one of the original etymologies. Take 'кукла':
        #    the form 'кукло' is a vocative singular, but it only applies to the first etymology.
        #    → In this case, we need to create a page with only one Pronunciation, no Etymology header, and a
        #      reference to the given lemma.
        #
        # 2. The title corresponds to multiple etymologies. The same page 'кукла' also has multiple
        #    forms that are shared by all 3 declension tables in that entry, for example the definite singular 'куклата'.
        #    → In this case, we need to create a page containing separate Etymology headers for each form.
        #      We should also check all of the dictionaries for their spelling of the title: if all of the forms have the
        #      exact same spelling (including the acute accents marking stress), the Pronunciation header can be placed once
        #      at the top as an L3 header, rather than repeating the same IPA template underneath each etymology.

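        # Builds a single definition line with {{inflection of}}, joining multiple tag sets for the same form
        # with the "|;|" separator. For example (illustrative):
        #   generate_definition([("indef", "s"), ("voc", "s")], "кукла")
        #   -> '# {{inflection of|bg|кукла||indef|s|;|voc|s}}'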
        def generate_definition(formlist, lemma):
            definition = "# {{inflection of|bg|" + lemma + "||"
            for i, variant in enumerate(formlist):
                form, number = variant
                definition += form + "|" + number
                if i < len(formlist) - 1:
                    definition += "|;|"
            definition += "}}"
            return definition

        # Case 1: only one association
        if len(pages_to_create[title]["associations"]) == 1:
            lemma = pages_to_create[title]["associations"][0]["mapping"][0]
            declined_form = pages_to_create[title]["associations"][0]["mapping"][1]
            forms = pages_to_create[title]["associations"][0]["forms"]

            definition = generate_definition(forms, lemma)
            entry = "==Bulgarian==\n\n===Pronunciation===\n* {{bg-IPA|" + declined_form + "}}\n\n===Noun===\n{{head|bg|noun form|head=" + declined_form + "}}\n\n" + definition + "\n"

        # Case 2: multiple associations
        else:
            entry = "==Bulgarian==\n\n"
            uniform_pronunciation = True  # Assume each form has the same pronunciation
            previous = ""
            for a in pages_to_create[title]["associations"]:
                # If the spelling of the current term differs from the previous one, the pronunciations differ
                if a["mapping"][1] != previous and previous != "":
                    uniform_pronunciation = False
                    break
                # At each iteration, remember the current spelling so the next one can be compared against it
                previous = a["mapping"][1]
            if uniform_pronunciation:
                entry += "===Pronunciation===\n* {{bg-IPA|" + previous + "}}\n\n"

            for i, a in enumerate(pages_to_create[title]["associations"]):
                entry += "===Etymology " + str(i+1) + "===\n\n"
                lemma = a["mapping"][0]
                declined_form = a["mapping"][1]
                forms = a["forms"]
                definition = generate_definition(forms, lemma)
                if not uniform_pronunciation:
                    entry += "====Pronunciation====\n* {{bg-IPA|" + declined_form + "}}\n\n"
                entry += "====Noun===="
                entry += "\n{{head|bg|noun form|head=" + declined_form + "}}\n\n" + definition + "\n\n"
            # Trim the extra blank line left after the final etymology
            if entry.endswith("\n\n"):
                entry = entry[:-1]

        # The page we are trying to create either already exists, or it does not.
        # 3. The page already exists.
        #    This means that someone has already entered content into the page. So long as the page even vaguely conforms
        #    to the requirements of a page on this wiki, it will have some number of L2 language headers.
        #    → In this case, we need to find the first language header that comes after Bulgarian alphabetically, and then
        #      place the Bulgarian entry after the language header that precedes it (or at the beginning of the page, if no
        #      language precedes that existing header). There may also be no language header that comes after Bulgarian at
        #      all, e.g. when the only entry is for a language that sorts before Bulgarian, such as Belarusian.
        #      Should this occur, the Bulgarian entry can simply be placed at the end of the page.
        #
        # 4. The page does not exist yet.
        #    This is the far simpler case.
        #    → In this case, all we need to do is set the page's content equal to the generated entry and send the
        #      save request.

save_message = "" # Edit summary
|
|
|
|
# Case 3: the page already exists.
|
|
if derivative_page.exists():
|
|
found, index = "", -1
|
|
for i, section in enumerate(page_content.get_sections(levels=[2])):
|
|
t = section.nodes[0].title
|
|
if t > "Bulgarian":
|
|
found = section
|
|
index = i
|
|
break
|
|
|
|
# Some considerations here: if there is now a heading preceding the heading that was found,
|
|
# the Bulgarian entry must go after it. Else (i.e. i will be 0), the Bulgarian can be placed
|
|
# at the beginning of the page. We must be aware that sometimes, the top of the page contains the {{also}}
|
|
# template, and in fact, can contain any number of bizarre templates theoretically.
|
|
# However, this can be accounted for by placing the entry right before the first heading ("==").
|
|
# If i remains -1, that means there is no heading that should follow ==Bulgarian===; consequently,
|
|
# the Bulgarian can be placed at the end of the page straight away.
|
|
|
|
if i == -1:
|
|
derivative_page.text = derivative_page.text + "\n----\n\n" + entry
|
|
elif i == 0:
|
|
first_lang = derivative_page.text.find("==")
|
|
derivative_page.text = derivative_page.text[0:first_lang] + entry + "\n----\n\n" + derivative_page.text[first_lang:]
|
|
else:
|
|
# The Bulgarian entry lies somewhere between two other entries
|
|
preceding = page_content.get_sections(levels=[2])[i-1].nodes[0].title
|
|
stringed = str(page_content)
|
|
split_location = stringed.find("----", stringed.find(str(preceding))) + 5
|
|
derivative_page.text = stringed[:split_location] + "\n" + entry + "\n----\n\n" + stringed[split_location:]
|
|
save_message = f"Updated page {title} with content: {derivative_page.text}"
|
|
|
|
        # Case 4: the page does not exist.
        else:
            derivative_page.text = entry
            save_message = f"Created derived form of {lemma}"

        derivative_page.save(save_message)
        # Also write the generated page text to a local file for review, if the output directory exists
        if os.path.exists(os.path.expanduser("~/Documents/Programming/Wiktionary/Bot/output/")):
            with open(os.path.expanduser(f"~/Documents/Programming/Wiktionary/Bot/output/{title}.txt"), mode="w", encoding="utf-8") as output:
                output.write(derivative_page.text)


# Just a little alias
def analyze_and_generate(page):
    generate_derivatives(get_forms(page))


if __name__ == "__main__":
    bg_nouns = pywikibot.Category(site, "Category:Bulgarian_nouns")
    n_editing = 10  # Maximum number of pages to process in one run
    for p in bg_nouns.articles(total=n_editing):
        analyze_and_generate(p)