# Reconstructed from a whitespace-mangled `git format-patch` file.
# Original commit: 80d1735a6bd7f79ed338840c237a47adcc6006ca
# Author: KovachevBot <86124720+KovachevBot@users.noreply.github.com>
# Date:   Sun, 20 Jun 2021 16:06:26 +0100
# Subject: [PATCH] Final form of bot before submission
#
# Commit message: "Expanded functionality of the bot to create derived forms
# for multiple etymologies at a time; created a loop to iterate over all
# lemmas in the category for 'Bulgarian nouns'. Probably various other
# improvements that I cannot recall, however."

import pywikibot
import mwparserfromhell
import wikitextparser
import os

# Declension-table column labels -> {{inflection of}} number codes.
numbers = {
    "singular": "s",
    "plural": "p",
}

# Declension-table row labels -> {{inflection of}} form codes.
# NOTE(review): the two-part keys were split across physical lines in the
# mangled source; a literal "\n" separator is assumed here — confirm against
# the row labels produced by the expanded {{bg-ndecl}} table.
forms = {
    "indefinite": "indef",
    "definite": "def",
    "definite\n(subject form)": "sbjv",
    "definite\n(object form)": "objv",
    "vocative form": "voc",
    "count form": "count"
}

# Connect to Wiktionary (according to user profile).
# Module-level side effect kept as in the original: importing this module
# opens the pywikibot session.
site = pywikibot.Site()


def get_forms(page):
    """Return a list of dictionaries describing every declined noun form on *page*.

    Each dictionary looks like this:

        { "lemma": <accented headword>,
          "forms": { <declined form>: [("indef", "s"), ("voc", "s"), ...] } }

    and can be used to generate an entire set of noun-form entries.
    There is one dictionary per noun header on the page, because a page can
    carry multiple noun sections, each of which *can* decline differently.
    If there is only one noun header on a page, it is not necessary to include
    multiple etymologies on the derived form page.
    """
    # 'result' is the list of dictionaries to be returned.
    result = []
    # Fully-expanded wikitext of the page, i.e. the rendered table markup
    # instead of the raw {{bg-ndecl}} template call.
    text = page.expand_text()
    document = mwparserfromhell.parse(text)
    # The section that contains the Bulgarian entry.
    bulgarian = document.get_sections(matches=r"Bulgarian")[0]
    # Locate any noun sections ("Noun", "Noun 1", "Noun 2", ...).
    nouns = bulgarian.get_sections(matches=r"Noun( \d+)*")
    for n in nouns:
        # Each noun has a list of forms in its declension, populated below.
        all_forms = {"lemma": "", "forms": {}}
        declension = n.get_sections(matches=r"Declension")  # Finds declension header
        if not declension:
            # No declension table for this noun section: nothing to derive.
            continue
        declension = declension[0]
        # nodes[2] selects the NavFrame wikitable itself.
        nav_frame = mwparserfromhell.parse(declension).nodes[2]
        # Convert the table into a 2D list of raw cell strings.
        table = wikitextparser.parse(str(nav_frame)).tables[0].data()
        columns = len(table[0])  # Always 3: the blank tile, plus singular and plural
        rows = len(table)        # Varies with gender and vocative presence

        def get_tags(item):
            # Lists all parse nodes found in a given cell.
            return [i for i in mwparserfromhell.parse(mwparserfromhell.parse(item)).nodes]

        def stripper(tags):
            # Removes wiki formatting, keeping only the Cyrillic <span> contents
            # (the ' class="tr Latn"' spans hold the Latin transliteration).
            return [mwparserfromhell.parse(t).strip_code()
                    for t in [t for t in tags if type(t) == mwparserfromhell.nodes.tag.Tag]
                    if not ' class="tr Latn"' in t.attributes and t.tag == "span"]

        lemma = stripper(get_tags(table[1][1]))[0]
        all_forms["lemma"] = lemma

        # NOTE(review): the next two lines fall in a gap between the two diff
        # hunks of the mangled patch; reconstructed from the uses of `i` and
        # `column` below — confirm against the original file.
        for i in range(1, columns):
            column = table[0][i]
            for j in range(1, rows):
                row = table[j][0]
                tags = get_tags(table[j][i])
                values = stripper(tags)
                for v in values:
                    # Convert the table's "singular", "plural", "vocative form",
                    # etc., labels into "s", "p", "voc", etc.
                    form = forms[row]
                    number = numbers[column]
                    # Count form is a bit different: the parameters are not
                    # "form|number" but the literal "count|form".
                    if row == "count form":
                        form, number = "count", "form"
                    if v not in all_forms["forms"]:
                        all_forms["forms"][v] = [(form, number)]
                    else:
                        all_forms["forms"][v].append((form, number))
        # Masculine terms would otherwise be profiled as both 'definite
        # (subject form) plural' and 'definite (object form) plural', even
        # though those coincide in Bulgarian; merge them into 'definite plural'.
        for key in all_forms["forms"]:
            if all_forms["forms"][key] == [("sbjv", "p"), ("objv", "p")]:
                all_forms["forms"][key] = [("def", "p")]
        result.append(all_forms)
    return result
def generate_definition(formlist, lemma):
    """Build the '# {{inflection of|bg|...}}' definition line for one form.

    formlist: list of (form, number) code pairs, e.g. [("indef", "s"), ("voc", "s")].
    lemma: the accented headword the declined form derives from.
    Multiple tag sets are joined with the "|;|" separator {{inflection of}} expects.
    """
    definition = "# {{inflection of|bg|" + lemma + "||"
    for i, variant in enumerate(formlist):
        form, number = variant
        definition += form + "|" + number
        if i < len(formlist) - 1:
            definition += "|;|"
    definition += "}}"
    return definition


def generate_derivatives(form_list):
    """Generate Wiktionary pages for the derived forms in *form_list*.

    *form_list* is the output of get_forms(): one dict per noun section, each
    mapping declined forms to their (form, number) tag pairs. Iterates over
    all forms and creates or extends their pages on the wiki.
    """
    # Group the declined forms by accent-stripped page title. Each title maps
    # to a list of "associations": one (lemma, accented form, tags) record per
    # etymology that produces this spelling.
    pages_to_create = dict()
    for dic in form_list:
        for key in dic["forms"]:
            stripped = key.replace("\u0301", "")  # drop combining acute accents
            association = {
                "mapping": [dic["lemma"], key],
                "forms": dic["forms"][key],
            }
            if stripped not in pages_to_create:
                pages_to_create[stripped] = {"associations": [association]}
            else:
                pages_to_create[stripped]["associations"].append(association)

    for title in pages_to_create:
        print(f"Creating page {title}.")
        derivative_page = pywikibot.Page(site, title)
        page_content = mwparserfromhell.parse(derivative_page.text)
        entry = ""

        # Check whether the bot may permissibly edit the page; if not, skip.
        # BUG FIX: botMayEdit is a method — the original tested the bound
        # method object itself, which is always truthy, so the {{bots}}
        # opt-out was never honoured. It must be called.
        if not derivative_page.botMayEdit():
            print(f"ERROR: page {title} disallows bot editing, exiting")
            continue

        associations = pages_to_create[title]["associations"]

        # Case 1: the title corresponds to only one of the original
        # etymologies (e.g. 'кукло' is the vocative singular of only the first
        # noun section of 'кукла'): one Pronunciation, no Etymology headers.
        if len(associations) == 1:
            lemma = associations[0]["mapping"][0]
            declined_form = associations[0]["mapping"][1]
            forms = associations[0]["forms"]
            definition = generate_definition(forms, lemma)
            entry = ("==Bulgarian==\n\n===Pronunciation===\n* {{bg-IPA|" + declined_form
                     + "}}\n\n===Noun===\n{{head|bg|noun form|head=" + declined_form
                     + "}}\n\n" + definition + "\n")

        # Case 2: the title is shared by multiple etymologies (e.g. 'куклата'
        # is the definite singular of all 3 declensions of 'кукла'): separate
        # Etymology headers. If every association spells the form identically
        # (including acute accents), hoist a single L3 Pronunciation header
        # instead of repeating the same IPA template under each etymology.
        else:
            entry = "==Bulgarian==\n\n"
            uniform_pronunciation = True  # assume identical accentuation until disproved
            previous = ""
            for a in associations:
                # If the current spelling differs from the previous one, the
                # pronunciations differ too.
                if a["mapping"][1] != previous and previous != "":
                    uniform_pronunciation = False
                    break
                # Remember this form's spelling for the next comparison.
                previous = a["mapping"][1]
            if uniform_pronunciation:
                entry += "===Pronunciation===\n* {{bg-IPA|" + previous + "}}\n\n"

            for i, a in enumerate(associations):
                entry += "===Etymology " + str(i + 1) + "===\n\n"
                lemma = a["mapping"][0]
                declined_form = a["mapping"][1]
                forms = a["forms"]
                definition = generate_definition(forms, lemma)
                if not uniform_pronunciation:
                    entry += "====Pronunciation====\n* {{bg-IPA|" + declined_form + "}}\n\n"
                entry += "====Noun===="
                entry += "\n{{head|bg|noun form|head=" + declined_form + "}}\n\n" + definition + "\n\n"
            if entry.endswith("\n\n"):
                entry = entry[:-1]  # trim to a single trailing newline

        save_message = ""  # Edit summary

        # Case 3: the page already exists. Find the first L2 language header
        # that sorts after "Bulgarian" and insert the entry before it, so the
        # languages stay alphabetical.
        if derivative_page.exists():
            index = -1
            sections = page_content.get_sections(levels=[2])
            for i, section in enumerate(sections):
                t = section.nodes[0].title
                # NOTE(review): relies on mwparserfromhell's Wikicode/StringMixIn
                # comparing lexicographically against str — confirm.
                if t > "Bulgarian":
                    index = i
                    break

            # BUG FIX: the original tested the loop variable `i` here instead
            # of `index`; after a completed loop `i` holds the last index (not
            # -1), and it is unbound when the page has no L2 sections at all.
            if index == -1:
                # No language sorts after Bulgarian (e.g. only Belarusian is
                # present): append at the end of the page.
                derivative_page.text = derivative_page.text + "\n----\n\n" + entry
            elif index == 0:
                # Bulgarian sorts first. The top of the page may hold {{also}}
                # or other templates, so insert right before the first heading
                # ("==") rather than at byte 0.
                first_lang = derivative_page.text.find("==")
                derivative_page.text = (derivative_page.text[0:first_lang] + entry
                                        + "\n----\n\n" + derivative_page.text[first_lang:])
            else:
                # The Bulgarian entry lies between two existing entries: split
                # just after the "----" divider that follows the preceding
                # language's section.
                preceding = sections[index - 1].nodes[0].title
                stringed = str(page_content)
                split_location = stringed.find("----", stringed.find(str(preceding))) + 5
                derivative_page.text = (stringed[:split_location] + "\n" + entry
                                        + "\n----\n\n" + stringed[split_location:])
            save_message = f"Updated page {title} with content: {derivative_page.text}"

        # Case 4: the page does not exist — simply set its content.
        else:
            derivative_page.text = entry
            save_message = f"Created derived form of {lemma}"

        derivative_page.save(save_message)
        # Keep a local copy of everything written, if the output folder exists.
        if os.path.exists(os.path.expanduser("~/Documents/Programming/Wiktionary/Bot/output/")):
            with open(os.path.expanduser(f"~/Documents/Programming/Wiktionary/Bot/output/{title}.txt"),
                      mode="w", encoding="utf-8") as output:
                output.write(derivative_page.text)


def analyze_and_generate(page):
    """Just a little alias: scrape *page*'s declensions and create its form pages."""
    generate_derivatives(get_forms(page))


if __name__ == "__main__":
    # Walk the first n_editing lemmas in Category:Bulgarian nouns.
    bg_nouns = pywikibot.Category(site, "Category:Bulgarian_nouns")
    n_editing = 10
    for p in bg_nouns.articles(total=n_editing):
        analyze_and_generate(p)