kovachev-bot/bulgarian-derived-forms/bulgarian_derivatives.py

import pywikibot
import mwparserfromhell
import wikitextparser
import os

# Number tags, keyed by the column headings of a declension table
# (looked up in get_forms as numbers[column]).
numbers = dict(singular="s", plural="p")

# Form tags, keyed by the raw row headings of a declension table
# (looked up in get_forms as forms[row]); the <br> markup is part of the key.
forms = dict(
    [
        ("indefinite", "indef"),
        ("definite", "def"),
        ("definite<br>(subject form)", "sbjv"),
        ("definite<br>(object form)", "objv"),
        ("vocative form", "voc"),
        ("count form", "count"),
    ]
)


# Connect to Wiktionary. Site() is called without arguments, so the wiki and
# account are taken from the local pywikibot user configuration. This
# module-level handle is used by generate_derivatives() and by the script
# entry point at the bottom of the file.
site = pywikibot.Site()

"""Returns a list of dictionaries; each dictionary looks like this:
{ "lemma": <term>
  "forms": {
      <form 1>: [("indef", "s"), ("voc", "s")
      etc.
  }
}
Each dictionary is able to be used to generate an entire set of noun forms.
The reason there are multiple dictionaries in the list is because there can be
multiple noun forms per page, each of which *can* have different declensions.
If there is only one noun header on a page, it is not necessary to include multiple etymologies on the derived form page."""
def get_forms(page):
    """Collect the declined forms from a page's Bulgarian noun declension tables.

    The page is expanded (so that the declension template becomes table
    markup), its Bulgarian section is located, and every noun section that has
    a "Declension" subsection contributes one dictionary of the shape::

        {
            "lemma": <term>,
            "forms": {
                <form 1>: [("indef", "s"), ("voc", "s")],
                ...
            },
        }

    Several dictionaries can be returned because a page may hold several noun
    sections, each with its own declension.

    Args:
        page: object providing ``expand_text()`` that returns the expanded
            wikitext (a pywikibot page in this script).

    Returns:
        A list of dictionaries as described above. The list is empty when the
        page has no Bulgarian section or no usable declension table.

    Raises:
        KeyError: if a non-empty cell sits under a row/column label that is
            missing from the module-level ``forms`` / ``numbers`` tables.
        IndexError: if the cell expected to hold the lemma contains no
            matching <span> tag.
    """
    # 'result' is the list of dictionaries to be returned
    result = []
    # Fully-expanded wikitext shows the full table markup instead of {{bg-ndecl}}
    text = page.expand_text()

    document = mwparserfromhell.parse(text)
    bulgarian_sections = document.get_sections(matches=r"Bulgarian")
    if not bulgarian_sections:
        # No Bulgarian entry on this page: nothing to extract.
        return result
    nouns = bulgarian_sections[0].get_sections(matches=r"Noun( \d+)*")

    def get_tags(item):
        """List the top-level nodes of one table cell."""
        return list(mwparserfromhell.parse(item).nodes)

    def stripper(tags):
        """Return the plain text of every <span> that is not a transliteration."""
        return [
            mwparserfromhell.parse(t).strip_code()
            for t in tags
            if isinstance(t, mwparserfromhell.nodes.tag.Tag)
            and ' class="tr Latn"' not in t.attributes
            and t.tag == "span"
        ]

    for n in nouns:
        # Each noun has a set of forms in its declension, collected here
        all_forms = {"lemma": "", "forms": {}}
        declension = n.get_sections(matches=r"Declension")
        if not declension:
            continue  # This noun section has no declension header
        declension_nodes = mwparserfromhell.parse(declension[0]).nodes
        if len(declension_nodes) < 3:
            continue  # Heading without the expected table wrapper after it
        nav_frame = declension_nodes[2]  # The node holding the wikitable
        tables = wikitextparser.parse(str(nav_frame)).tables
        if not tables:
            continue  # Declension section without any table markup
        table = tables[0].data()  # 2D list of cell contents
        if len(table) < 2 or len(table[0]) < 2:
            continue  # Needs a header row/column plus at least one data cell
        # Expected to be 3 (blank corner, singular, plural) -- not enforced here
        columns = len(table[0])
        # Varies with the noun's gender and with vocative presence
        rows = len(table)

        all_forms["lemma"] = stripper(get_tags(table[1][1]))[0]
        for i in range(1, columns):
            column = table[0][i]
            for j in range(1, rows):
                row = table[j][0]
                values = stripper(get_tags(table[j][i]))

                for v in values:
                    # Converts the table's "singular", "vocative form", etc.
                    # labels into their short tags. Looked up per value so an
                    # unknown label only matters when the cell has content.
                    form = forms[row]
                    number = numbers[column]
                    # Count form is different: the parameters are not
                    # "form|number" but the literal "count|form".
                    if row == "count form":
                        form, number = "count", "form"
                    all_forms["forms"].setdefault(v, []).append((form, number))

        # Masculine tables list the plural under both 'definite (subject form)'
        # and 'definite (object form)' although the form is the same; merge
        # that pair into a plain 'definite plural'.
        for key, tag_pairs in all_forms["forms"].items():
            if tag_pairs == [("sbjv", "p"), ("objv", "p")]:
                all_forms["forms"][key] = [("def", "p")]
        result.append(all_forms)
    return result

"""Used to generate pages from a list of dictionaries corresponding to the derived forms from declension tables.
Iterates over all forms and generates new pages for them if no page exists with a Bulgarian entry.
"""
# Language sections that Wiktionary's entry layout places ahead of the
# alphabetically ordered languages; Bulgarian must never be inserted before them.
_LEADING_LANGUAGES = ("Translingual", "English")


def _group_by_title(form_list):
    """Group declined forms by target page title (the form without stress marks)."""
    pages = {}
    for dic in form_list:
        for key, tag_pairs in dic["forms"].items():
            # Page titles carry no combining acute accent (U+0301).
            title = key.replace("\u0301", "")
            pages.setdefault(title, {"associations": []})["associations"].append(
                {"mapping": [dic["lemma"], key], "forms": tag_pairs}
            )
    return pages


def _inflection_definition(tag_pairs, lemma):
    """Build the '# {{inflection of|bg|...}}' line for one lemma."""
    joined = "|;|".join(form + "|" + number for form, number in tag_pairs)
    return "# {{inflection of|bg|" + lemma + "||" + joined + "}}"


def _build_entry(associations):
    """Build the wikitext of the ==Bulgarian== section for one page title."""
    # Only one source declension: a flat entry without Etymology headers.
    if len(associations) == 1:
        lemma, declined_form = associations[0]["mapping"]
        definition = _inflection_definition(associations[0]["forms"], lemma)
        return (
            "==Bulgarian==\n\n===Pronunciation===\n* {{bg-IPA|" + declined_form
            + "}}\n\n===Noun===\n{{head|bg|noun form|head=" + declined_form
            + "}}\n\n" + definition + "\n"
        )

    # Several source declensions: one Etymology header per association. If all
    # accented spellings agree, a single L3 Pronunciation header is shared;
    # otherwise each etymology gets its own L4 Pronunciation header.
    spellings = [a["mapping"][1] for a in associations]
    uniform_pronunciation = len(set(spellings)) == 1
    parts = ["==Bulgarian==\n\n"]
    if uniform_pronunciation:
        parts.append("===Pronunciation===\n* {{bg-IPA|" + spellings[0] + "}}\n\n")
    for number, assoc in enumerate(associations, start=1):
        lemma, declined_form = assoc["mapping"]
        parts.append("===Etymology " + str(number) + "===\n\n")
        if not uniform_pronunciation:
            parts.append(
                "====Pronunciation====\n* {{bg-IPA|" + declined_form + "}}\n\n"
            )
        parts.append(
            "====Noun====\n{{head|bg|noun form|head=" + declined_form + "}}\n\n"
            + _inflection_definition(assoc["forms"], lemma) + "\n\n"
        )
    # Every etymology block ends in a blank line; drop one trailing newline.
    return "".join(parts)[:-1]


def _insert_entry(page_text, page_content, entry):
    """Return page_text with entry inserted at Bulgarian's alphabetical position.

    page_content is the parsed form of page_text. The entry goes directly in
    front of the first L2 language section that sorts after "Bulgarian"; when
    no such section exists it is appended to the end of the page.
    """
    if not page_text.strip():
        return entry

    successor = None
    for section in page_content.get_sections(levels=[2]):
        language = str(section.nodes[0].title).strip()
        if language in _LEADING_LANGUAGES:
            continue
        if language > "Bulgarian":
            successor = section
            break

    if successor is not None:
        # The section's text is a verbatim slice of the page, so its offset is
        # the exact insertion point; anything ahead of it (lead templates such
        # as {{also}}, earlier languages, separators) stays untouched.
        position = page_text.find(str(successor))
        if position >= 0:
            return page_text[:position] + entry + "\n----\n\n" + page_text[position:]
    return page_text + "\n----\n\n" + entry


def generate_derivatives(form_list):
    """Create or extend the pages for the derived forms in form_list.

    form_list is a list of dictionaries of the shape
    ``{"lemma": <term>, "forms": {<form>: [(<form tag>, <number tag>), ...]}}``.
    Forms are grouped by page title (the form with stress marks removed). A
    page is skipped when it already has a heading matching "Bulgarian" or when
    it disallows bot edits. Otherwise a Bulgarian section is generated and
    either saved as a new page or inserted into the existing page.

    If the hard-coded output directory exists, the saved page text is also
    written to ``<title>.txt`` inside it.

    Args:
        form_list: list of form dictionaries as described above.

    Returns:
        None.
    """
    pages_to_create = _group_by_title(form_list)

    for title, page_info in pages_to_create.items():
        associations = page_info["associations"]
        print(f"Creating page {title}.")
        derivative_page = pywikibot.Page(site, title)
        page_content = mwparserfromhell.parse(derivative_page.text)

        # Check whether the bot can feasibly/permissibly edit the page.
        if page_content.get_sections(matches=r"Bulgarian"):
            print(f"NOTE: page {title} already contains existing Bulgarian entry, exiting")
            continue
        # botMayEdit is a method; testing the bare attribute is always truthy
        # and would silently disable this check.
        if not derivative_page.botMayEdit():
            print(f"ERROR: page {title} disallows bot editing, exiting")
            continue

        entry = _build_entry(associations)
        lemmas = ", ".join(dict.fromkeys(a["mapping"][0] for a in associations))

        if derivative_page.exists():
            derivative_page.text = _insert_entry(
                derivative_page.text, page_content, entry
            )
            # Keep the edit summary short rather than echoing the page text.
            save_message = f"Added Bulgarian entry for derived form of {lemmas}"
        else:
            derivative_page.text = entry
            save_message = f"Created derived form of {lemmas}"

        derivative_page.save(save_message)
        if os.path.exists(os.path.expanduser("~/Documents/Programming/Wiktionary/Bot/output/")):
            with open(os.path.expanduser(f"~/Documents/Programming/Wiktionary/Bot/output/{title}.txt"), mode="w", encoding="utf-8") as output:
                output.write(derivative_page.text)

# Convenience wrapper chaining the two steps for a single page
def analyze_and_generate(page):
    """Extract the noun forms of `page` and generate the derived-form pages."""
    extracted_forms = get_forms(page)
    generate_derivatives(extracted_forms)

if __name__ == "__main__":
    # Upper bound on how many category members are processed in one run
    n_editing = 10
    bg_nouns = pywikibot.Category(site, "Category:Bulgarian_nouns")
    for noun_page in bg_nouns.articles(total=n_editing):
        analyze_and_generate(noun_page)