Final form of bot before submission
Expanded the bot to create derived forms for multiple etymologies at a time, and added a loop that iterates over all lemmas in the "Bulgarian nouns" category. Probably various other improvements that I cannot recall.
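For orientation, a sketch of the intermediate data the bot now works with (values are invented for illustration): get_forms() returns one dictionary per Noun section of the source page, which is what lets a single derived-form page carry several etymologies.

# Hypothetical example of what get_forms(page) might return for a page with two noun sections:
# [
#     {"lemma": "ку́кла", "forms": {"ку́кли": [("indef", "p")], "ку́клата": [("def", "s")]}},
#     {"lemma": "ку́кла", "forms": {"ку́клата": [("def", "s")]}},
# ]
# Forms shared by both dictionaries (here "ку́клата") end up on one page with two Etymology sections.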
This commit is contained in:
parent 8f96b945d5
commit 80d1735a6b
283 test.py
@@ -1,37 +1,57 @@
import pywikibot
import mwparserfromhell
import wikitextparser
import os

PAGENAME = "къща"
#PAGENAME = "кукла"
numbers = {
    "singular": "s",
    "plural": "p",
}

forms = {
    "indefinite": "indef",
    "definite": "def",
    "definite<br>(subject form)": "sbjv",
    "definite<br>(object form)": "objv",
    "vocative form": "voc",
    "count form": "count"
}


# Connect to Wiktionary (according to user profile)
site = pywikibot.Site()
page = pywikibot.Page(site, PAGENAME)
print(f"FETCHED: {PAGENAME}")

"""Returns a list of dictionaries; each dictionary looks like this:
|
||||
{ "lemma": <term>
|
||||
"forms": {
|
||||
<form 1>: [("indef", "s"), ("voc", "s")
|
||||
etc.
|
||||
}
|
||||
}
|
||||
Each dictionary is able to be used to generate an entire set of noun forms.
|
||||
The reason there are multiple dictionaries in the list is because there can be
|
||||
multiple noun forms per page, each of which *can* have different declensions.
|
||||
If there is only one noun header on a page, it is not necessary to include multiple etymologies on the derived form page."""
|
||||
def get_forms(page):
    # 'result' is the list of dictionaries to be returned
    result = []
    text = page.expand_text()
    text = page.expand_text() # Provides fully-expanded wikitext of a page, i.e. shows the full table markup instead of {{bg-ndecl}}

    document = mwparserfromhell.parse(text)
    bulgarian = document.get_sections(matches=r"Bulgarian")[0] # Find the section that contains the Bulgarian entry
    nouns = bulgarian.get_sections(matches=r"Noun( \d+)*") # Locate any noun sections
    # document.get_sections()
    for n in nouns:

        all_forms = {"lemma": "", "forms": {}} # Each 'noun' has a list of forms in its declension, which we will be populating as a dictionary
        declension = n.get_sections(matches=r"Declension")
        if declension == []: continue
        else: declension = declension[0]
        nav_frame = mwparserfromhell.parse(declension).nodes[2]
        table = wikitextparser.parse(str(nav_frame)).tables[0].data()
        declension = n.get_sections(matches=r"Declension") # Finds declension header
        if declension == []: continue
        else: declension = declension[0] # If there is no declension for a noun form, skip, else set 'declension' to the first element in the result
        nav_frame = mwparserfromhell.parse(declension).nodes[2] # Selects the wikitable itself
        table = wikitextparser.parse(str(nav_frame)).tables[0].data() # Converts into 2D list
        columns = len(table[0]) # Always equals 3: the blank tile, plus the singular and plural
        rows = len(table) # Differs between masculine and non-masculine nouns; also depends on vocative presence
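        # Illustration only (hypothetical cell values, not part of the commit): for a feminine noun
        # the parsed table is roughly
        #   table[0] -> ['', 'singular', 'plural']
        #   table[1] -> ['indefinite', '<span ...>къ́ща</span><span class="tr Latn">...</span>', ...]
        # and stripper(get_tags(table[1][1])) would reduce that body cell to ['къ́ща'],
        # dropping the Latin transliteration span.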

        # Removes wiki formatting, leaving only the text contents of the cell
        #cyrillic = lambda text: not bool(re.search('[A-ЌЎЏѐ-ќў-嶲]', text))
        #stripper = lambda item: [i.strip_code() for i in mwparserfromhell.parse(mwparserfromhell.parse(item)).nodes if cyrillic(i.strip_code())]
        get_tags = lambda item: [i for i in mwparserfromhell.parse(mwparserfromhell.parse(item)).nodes]
        get_tags = lambda item: [i for i in mwparserfromhell.parse(mwparserfromhell.parse(item)).nodes] # Lists all tags in a given cell
        stripper = lambda tags: [mwparserfromhell.parse(t).strip_code() for t in [t for t in tags if type(t) == mwparserfromhell.nodes.tag.Tag] if not ' class="tr Latn"' in t.attributes and t.tag == "span"]
        lemma = stripper(get_tags(table[1][1]))[0]
        all_forms["lemma"] = lemma
@@ -40,71 +60,204 @@ def get_forms(page):
            for j in range(1, rows):
                row = table[j][0]
                tags = get_tags(table[j][i])

                #print(tags)
                values = (stripper(tags))
                values = stripper(tags)

                numbers = {
                    "singular": "s",
                    "plural": "p",
                }

                forms = {
                    "indefinite": "indef",
                    "definite": "def",
                    "definite<br>(subject form)": "sbjv",
                    "definite<br>(object form)": "objv",
                    "vocative form": "voc"
                }
                for v in values:
                    # Converts the table's "singular", "plural", "vocative form", etc., labels into the short tags "s", "p", "voc", etc.
                    form = forms[row]
                    number = numbers[column]
                    # Count form is a bit different, as the parameters are not "form|number" but the literal "count|form".
                    if row == "count form":
                        form, number = "count", "form"
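                    # Illustrative (hypothetical lemma): for the count-form row this yields the
                    # parameters "count|form", so the later definition line would read e.g.
                    # "# {{inflection of|bg|стол||count|form}}".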
                    if v not in all_forms["forms"]:
                        all_forms["forms"][v] = [(form, number)]
                    else:
                        all_forms["forms"][v].append((form, number))

                    #print(f"{number}, {form}: {value}")
        # This check is useful for masculine terms, as the above code would label certain noun forms as being both
        # 'definite (subject form) plural' and 'definite (object form) plural', even though those forms are the same in Bulgarian.
        # Hence, this snippet is run to ensure the two get merged into simply 'definite plural'.
        for key in all_forms["forms"]:
            if all_forms["forms"][key] == [("sbjv", "p"), ("objv", "p")]:
                all_forms["forms"][key] = [("def", "p")]
                #print(all_forms[key])
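        # Hypothetical illustration: a masculine definite plural such as "столовете" would first collect
        # [("sbjv", "p"), ("objv", "p")] and is collapsed here to [("def", "p")].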


        result.append(all_forms)
    return result

"""Used to generate pages from a list of dictionaries corresponding to the derived forms from declension tables.
Iterates over all forms and generates new pages for them if no page exists with a Bulgarian entry.
"""
def generate_derivatives(form_list):
    forms = {dictionary["lemma"] for dictionary in form_list}
    if len(forms) < len(form_list):
        print("Multiple conflicting senses for this term exist with identical stresses, exiting")
        return

    pages_to_create = dict()
    for dic in form_list:
        strip_acute = lambda text: text.replace("́", "")
        lemma = dic["lemma"]
        for key in dic["forms"]:
            title = strip_acute(key)
            derivative_page = pywikibot.Page(site, title)
            content = mwparserfromhell.parse(derivative_page.text)
            if bool(content.get_sections(matches=r"Bulgarian")):
                print(f"ERROR: page {title} already contains existing Bulgarian entry, exiting")
                continue
            if not derivative_page.botMayEdit():
                print(f"ERROR: page {title} disallows bot editing, exiting")
            assert "==Bulgarian==" not in content
            if derivative_page.exists():
                pass
            stripped = key.replace("́", "")
            if stripped not in pages_to_create:
                pages_to_create[stripped] = {
                    "associations": [
                        {
                            "mapping": [
                                dic["lemma"],
                                key
                            ],
                            "forms": dic["forms"][key]

                        }
                    ]
                }
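                # Shape sketch (hypothetical values): pages_to_create might end up as
                # {"куклата": {"associations": [{"mapping": ["ку́кла", "ку́клата"], "forms": [("def", "s")]}]}}
                # i.e. one entry per accent-stripped title, carrying every lemma/form pairing that maps onto it.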
            else:
                print(f"Page {title} does not exist, creating derived form...")
                definition = "# {{inflection of|bg|" + lemma + "||"
                for i, variant in enumerate(dic["forms"][key]):
                    form, number = variant
                    definition += form + "|" + number
                    if i < len(dic["forms"][key])-1:
                        definition += "|;|"
                definition += "}}"
                entry = "==Bulgarian==\n\n===Pronunciation===\n* {{bg-IPA|" + key + "}}\n\n===Noun===\n{{head|bg|noun form|head=" + key + "}}\n\n" + definition + "\n"
                derivative_page.text = entry
                derivative_page.save(f"Created derived form of {lemma}")
                print(f"Created page {title} with content: {entry}")
generate_derivatives(get_forms(page))
                pages_to_create[stripped]["associations"].append(
                    {
                        "mapping": [
                            dic["lemma"],
                            key
                        ],
                        "forms": dic["forms"][key]

                    }
                )

    for title in pages_to_create:
        print(f"Creating page {title}.")
        derivative_page = pywikibot.Page(site, title)
        page_content = mwparserfromhell.parse(derivative_page.text)
        entry = ""

        # Check whether the bot can feasibly/permissibly edit the page. If not, quit.
        # if bool(page_content.get_sections(matches=r"Bulgarian")):
        #     print(f"NOTE: page {title} already contains existing Bulgarian entry, exiting")
        #     continue
        if not derivative_page.botMayEdit():
            print(f"ERROR: page {title} disallows bot editing, exiting")
            continue

        # We are clear to edit once these checks have passed. That there is no Bulgarian header means
        # we are also safe to generate the entries now: we are guaranteed not to waste any processing time,
        # as it is certain that the content to be generated will find a place on the page (so long as there is no
        # existing Bulgarian entry, as we have just verified).
        # A few scenarios now exist:
        # 1. For a given title, it corresponds to only one of the original etymologies. Check out 'кукла':
        #    The form 'кукло' is a vocative singular, but only applies to the first type.
        #    → In this case, we need to create a page with only one Pronunciation, no Etymology header, and a
        #    reference to the given lemma.
        #
        # 2. For a given title, it corresponds to multiple etymologies. The same page 'кукла' also has multiple
        #    forms that are shared by all 3 declension tables in that entry, for example the definite singular 'куклата'.
        #    → In this case, we need to create a page containing separate Etymology headers for each form.
        #    We should also check all of the dictionaries for their spelling of the title: if all of the forms have the
        #    exact same spelling (including acute symbols for accentuation), the pronunciation header can be relegated to the
        #    top as an L3 header, rather than repeating the same IPA template underneath each etymology (which would be the same).

        def generate_definition(formlist, lemma):
            definition = "# {{inflection of|bg|" + lemma + "||"
            for i, variant in enumerate(formlist):
                form, number = variant
                definition += form + "|" + number
                if i < len(formlist)-1:
                    definition += "|;|"
            definition += "}}"
            return definition
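        # Illustration (hypothetical arguments): generate_definition([("indef", "s"), ("voc", "s")], "къща")
        # would return '# {{inflection of|bg|къща||indef|s|;|voc|s}}'.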

        # Case 1: only one association
        if len(pages_to_create[title]["associations"]) == 1:
            lemma = pages_to_create[title]["associations"][0]["mapping"][0]
            declined_form = pages_to_create[title]["associations"][0]["mapping"][1]
            forms = pages_to_create[title]["associations"][0]["forms"]

            definition = generate_definition(forms, lemma)
            entry = "==Bulgarian==\n\n===Pronunciation===\n* {{bg-IPA|" + declined_form + "}}\n\n===Noun===\n{{head|bg|noun form|head=" + declined_form + "}}\n\n" + definition + "\n"

        # Case 2: multiple associations
        else:
            entry = "==Bulgarian==\n\n"
            uniform_pronunciation = True # Assume each form has the same pronunciation
            previous = ""
            for a in pages_to_create[title]["associations"]:
                # If the spelling of the current term differs from the previous, we have different pronunciations
                if a["mapping"][1] != previous and previous != "":
                    uniform_pronunciation = False
                    break
                # At each iteration, set the 'previous' variable equal to the current form's spelling
                previous = a["mapping"][1]
            if uniform_pronunciation:
                entry += "===Pronunciation===\n* {{bg-IPA|" + previous + "}}\n\n"

            for i, a in enumerate(pages_to_create[title]["associations"]):
                entry += "===Etymology " + str(i+1) + "===\n\n"
                lemma = a["mapping"][0]
                declined_form = a["mapping"][1]
                forms = a["forms"]
                definition = generate_definition(forms, lemma)
                if not uniform_pronunciation:
                    entry += "====Pronunciation====\n* {{bg-IPA|" + declined_form + "}}\n\n"
                entry += "====Noun===="
                entry += "\n{{head|bg|noun form|head=" + declined_form + "}}\n\n" + definition + "\n\n"
            if entry.endswith("\n\n"):
                entry = entry[:-1]
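        # Sketch of the Case 2 layout this builds (hypothetical page, two etymologies sharing one spelling; blank lines omitted):
        #   ==Bulgarian==
        #   ===Pronunciation===
        #   * {{bg-IPA|ку́клата}}
        #   ===Etymology 1===
        #   ====Noun====
        #   {{head|bg|noun form|head=ку́клата}}
        #   # {{inflection of|bg|ку́кла||def|s}}
        #   ===Etymology 2===
        #   ...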

        # The page we are trying to create either exists, or does not.
        # 3. The page exists already.
        #    This means that someone has already entered content into the page. So long as the page even vaguely conforms
        #    to the requirements of a page on this wiki, it will have some number of L2 language headers.
        #    → In this case, we need to find the first language header that comes after Bulgarian, and then place the Bulgarian
        #    entry after the language header that precedes it (or at the beginning of the page, if no language precedes the
        #    existing language header). Furthermore, there may not be any language header that comes after Bulgarian, e.g.
        #    the only entry is for some other language that comes before Bulgarian, e.g. Belarusian.
        #    Should this occur, the Bulgarian can simply be placed at the end of the page.
        #
        # 4. The page does not exist yet.
        #    This is by far the simpler case, as it means I can simply paste the desired contents into the page and save it.
        #    → In this case, all that we need to do is set the page's content equal to the generated entry and smash that
        #    save request. Not too bad in this case.

        save_message = "" # Edit summary

        # Case 3: the page already exists.
        if derivative_page.exists():
            found, index = "", -1
            for i, section in enumerate(page_content.get_sections(levels=[2])):
                t = section.nodes[0].title
                if t > "Bulgarian":
                    found = section
                    index = i
                    break

            # Some considerations here: if there is a heading preceding the heading that was found,
            # the Bulgarian entry must go after it. Else (i.e. index will be 0), the Bulgarian can be placed
            # at the beginning of the page. We must be aware that sometimes, the top of the page contains the {{also}}
            # template, and in fact, can contain any number of bizarre templates theoretically.
            # However, this can be accounted for by placing the entry right before the first heading ("==").
            # If index remains -1, that means there is no heading that should follow ==Bulgarian==; consequently,
            # the Bulgarian can be placed at the end of the page straight away.

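            # Worked example (hypothetical existing page): with L2 sections ==Belarusian== and ==Russian==,
            # the loop stops at ==Russian== (index 1), so the new ==Bulgarian== entry is spliced in after the
            # "----" that closes the preceding ==Belarusian== section.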
            if index == -1:
                derivative_page.text = derivative_page.text + "\n----\n\n" + entry
            elif index == 0:
                first_lang = derivative_page.text.find("==")
                derivative_page.text = derivative_page.text[0:first_lang] + entry + "\n----\n\n" + derivative_page.text[first_lang:]
            else:
                # The Bulgarian entry lies somewhere between two other entries
                preceding = page_content.get_sections(levels=[2])[index-1].nodes[0].title
                stringed = str(page_content)
                split_location = stringed.find("----", stringed.find(str(preceding))) + 5
                derivative_page.text = stringed[:split_location] + "\n" + entry + "\n----\n\n" + stringed[split_location:]
            save_message = f"Updated page {title} with content: {derivative_page.text}"

        # Case 4: the page does not exist.
        else:
            derivative_page.text = entry
            save_message = f"Created derived form of {lemma}"

        derivative_page.save(save_message)
        if os.path.exists(os.path.expanduser(f"~/Documents/Programming/Wiktionary/Bot/output/")):
            with open(os.path.expanduser(f"~/Documents/Programming/Wiktionary/Bot/output/{title}.txt"), mode="w", encoding="utf-8") as output:
                output.write(derivative_page.text)

# Just a little alias
def analyze_and_generate(page):
    generate_derivatives(get_forms(page))

if __name__ == "__main__":
    bg_nouns = pywikibot.Category(site, "Category:Bulgarian_nouns")
    n_editing = 10
    for p in bg_nouns.articles(total=n_editing):
        analyze_and_generate(p)