# kovachev-bot/bulgarian-anagrams/bg-anagrams.py

import os
import subprocess
import sys
import pywikibot
import mwparserfromhell
import regex as re
from collections import defaultdict

# From User:JeffDoozan's bot AutoDooz
# Names of the templates that are treated as category markers. The entries are joined
# unescaped into RE_CAT_TEMPLATES, so each one is a regex fragment (see "eo [1-9]OA").
CAT_TEMPLATES = [ "c", "C", "cat", "top", "topic", "topics", "categorize", "catlangname", "catlangcode", "cln", "zh-cat",
        "eo F", "eo [1-9]OA", "eo-categoryTOC", "eo BRO", "eo GCSE", "Universala Vortaro" ]
# Matches "{{", one of the names above, then "|" or "}", a run of non-brace characters
# and any number of closing braces.
RE_CAT_TEMPLATES = r"\{\{\s*(" + "|".join(CAT_TEMPLATES) + r")\s*[|}][^{}]*\}*"
# Matches category links written as [[cat:...]], [[Cat:...]], [[category:...]] or [[Category:...]].
RE_CATEGORIES = r"\[\[\s*[cC]at(egory)?\s*:[^\]]*\]\]"
# Either of the two patterns above; add_anagrams uses it to recognise category nodes.
RE_MATCH_CATEGORIES = re.compile(fr"({RE_CAT_TEMPLATES}|{RE_CATEGORIES})")
# Wiki that every page in this script is read from and saved to.
SITE = pywikibot.Site("en", "wiktionary")
# Directory in which create_diff stores one diff file per page.
BACKUP_PATH = "bg-anagrams-backup"
# The lowercase Cyrillic letters that normalise() keeps.
ALPHABET = "абвгдежзийклмнопрстуфхцчшщъьюя"
NUMERIC = "0123456789"
# Character class matching everything that is neither in ALPHABET nor in NUMERIC.
NON_ALPHANUMERIC = f"[^{ALPHABET}{NUMERIC}]"
# File name referenced only by the commented-out code in the __main__ block.
NOT_CREATED_LOG = "non_existent_anagrams.txt"

def create_diff(old_text: str, current_page: pywikibot.Page) -> None:
    """
    Store a unified diff that turns the page's current text back into ``old_text``.

    The diff is written to BACKUP_PATH in a file named after the page title, so that
    a change made by this script can be reverted with ``patch`` if a problem is found later.

    Args:
        old_text: The text of the page before it was modified.
        current_page: The page whose ``text`` attribute holds the new revision.

    Raises:
        RuntimeError: If the external ``diff`` program reports trouble (exit status > 1).
        FileNotFoundError: If the ``diff`` program is not installed.
    """
    import tempfile  # Local import: only this function needs it

    os.makedirs(BACKUP_PATH, exist_ok=True)

    # Work inside a throwaway directory so no temp1/temp2 files are left behind in the
    # working directory. The file names are kept so the diff headers stay the same.
    with tempfile.TemporaryDirectory() as workdir:
        with open(os.path.join(workdir, "temp1"), mode="w", encoding="utf-8") as f:
            f.write(old_text)

        with open(os.path.join(workdir, "temp2"), mode="w", encoding="utf-8") as f:
            f.write(current_page.text)

        # Get differences between new revision and previous. An argument list is used
        # instead of a shell string, and the output is decoded as UTF-8 like the inputs.
        result = subprocess.run(
            ["diff", "-u", "temp2", "temp1"],
            cwd=workdir,
            capture_output=True,
            encoding="utf-8",
        )

    # diff exits with 0 (identical) or 1 (different) on success; anything else is an error
    # and its output must not be stored as if it were a usable backup.
    if result.returncode > 1:
        raise RuntimeError(f"diff failed for page {current_page.title()}: {result.stderr.strip()}")

    diff = result.stdout
    if not diff.endswith("\n"):
        diff += "\n"  # patch will complain if we don't end the file with a newline

    # A "/" in the title would otherwise be read as a subdirectory of BACKUP_PATH
    file_name = current_page.title().replace("/", "_")
    with open(os.path.join(BACKUP_PATH, file_name), mode="w", encoding="utf-8") as f:
        f.write(diff)

def normalise(word: str) -> str:
    """
    Reduce a word to the characters that count for anagram purposes.

    The word is case-folded, "ѝ" is replaced by "и", and every character matched by
    NON_ALPHANUMERIC is removed.
    """
    folded = word.casefold()
    without_grave = folded.replace("ѝ", "и")
    return re.sub(NON_ALPHANUMERIC, "", without_grave)

def get_alphagram(word: str) -> str:
    """Return the normalised characters of ``word`` in sorted order, joined into one string."""
    characters = list(normalise(word))
    characters.sort()
    return "".join(characters)

def has_bulgarian(page: pywikibot.Page) -> bool:
    """Tell whether the page text has at least one level-2 section matched by "Bulgarian"."""
    wikicode = mwparserfromhell.parse(page.text)
    bulgarian_sections = wikicode.get_sections([2], "Bulgarian")
    return len(bulgarian_sections) > 0


# Calculate all anagrams from the file of words
# The word list is read as UTF-8, the encoding this script uses for the files it writes,
# rather than whatever the platform default happens to be.
with open("words.txt", encoding="utf-8") as f:
    wordlist: list[str] = f.readlines()

# Maps each alphagram to the set of words from the list that share it
anagrams = defaultdict(set)

for word in wordlist:
    word = word.strip()
    if not word:
        continue  # A blank line is not a word and must not become a page title
    anagrams[get_alphagram(word)].add(word)

# ---------------------------------------------

# Only keep words with multiple anagrams; from here on `anagrams` is a plain dict
anagrams = {alphagram: anas for alphagram, anas in anagrams.items() if len(anas) > 1}

def count_anagrams():
    """Return the total number of words across all groups in the module-level ``anagrams``."""
    total = 0
    for group in anagrams.values():
        total += len(group)
    return total

def generate_anagrams_section(anagrams: set[str]) -> str:
    """
    Build the wikitext of a level-3 "Anagrams" section listing the given words.

    The alphagram is computed from one member popped from a copy of the set, so the
    caller's set is left untouched.
    """
    sample_word = anagrams.copy().pop()
    template = generate_anagrams_template(anagrams, get_alphagram(sample_word))
    return f"\n\n===Anagrams===\n* {template}\n\n"

def generate_anagrams_template(anagrams: set[str], alphagram: str) -> str:
    """
    Build an ``{{anagrams|bg|a=<alphagram>|...}}`` template call.

    Args:
        anagrams: The words to list in the template.
        alphagram: The value for the template's ``a=`` parameter.

    Returns:
        The template wikitext. The words are listed in sorted order so that the output
        is the same on every run instead of following the set's arbitrary iteration order.
    """
    return "{{" + f"anagrams|bg|a={alphagram}|" + "|".join(sorted(anagrams)) + "}}"

def add_anagrams(contents: str, anagrams_to_add: set[str], alphagram: str) -> tuple[str, set[str]]:
    """
    Add the given anagrams to the Bulgarian section of a page's wikitext.

    If the Bulgarian section already has a level-3 "Anagrams" section, the first
    "anagrams" template in it is replaced by one listing both the existing and the new
    words. Otherwise a new Anagrams section is inserted after the last node of the
    Bulgarian section that is neither whitespace nor a category link/template.

    Args:
        contents: The wikitext of the whole page.
        anagrams_to_add: The words that should be listed as anagrams.
        alphagram: The alphagram used when an existing template is regenerated.

    Returns:
        A tuple of the resulting wikitext and the set of words that were newly added.
        ``(contents, set())`` is returned unchanged when an Anagrams section exists but
        holds no "anagrams" template, or when it already lists every requested word.

    Raises:
        IndexError: If ``contents`` has no level-2 section matched by "Bulgarian".
    """
    parsed = mwparserfromhell.parse(contents)

    anagrams_added = anagrams_to_add.copy()

    bulgarian_section: mwparserfromhell.wikicode.Wikicode = parsed.get_sections([2], "Bulgarian")[0]
    # NOTE: at this point the value is the list returned by get_sections; it is narrowed
    # to a single section a few lines below.
    anagrams_section: mwparserfromhell.wikicode.Wikicode = bulgarian_section.get_sections([3], "Anagrams")
    if anagrams_section:
        anagrams_section = anagrams_section[0]
        anagrams_templates = anagrams_section.filter(forcetype=mwparserfromhell.wikicode.Template)
        anagrams_templates = [t for t in anagrams_templates if t.name == "anagrams"]
        if len(anagrams_templates) == 0:
            # An Anagrams section without the template is left alone
            return contents, set()

        existing = set()
        anagrams_template = anagrams_templates[0]
        # Parameter 1 is the language code, so the listed words start at parameter 2
        i = 2
        while anagrams_template.has(i):
            existing.add(str(anagrams_template.get(i)))
            i += 1

        if existing.union(anagrams_to_add) == existing:  # If there are no new anagrams present
            return contents, set()

        anagrams_to_add = anagrams_to_add.union(existing)

        # The template is regenerated from scratch (language, a= and the words), so any
        # other parameter of the existing template is not carried over.
        anagrams_section.nodes[anagrams_section.index(anagrams_template)] = generate_anagrams_template(anagrams_to_add, alphagram)

        anagrams_added = anagrams_to_add.difference(existing)
    else:
        # Walk backwards from the end of the Bulgarian section, skipping whitespace and
        # category nodes, to find the last piece of real content. Index 0 is never examined.
        index = len(bulgarian_section.nodes)-1
        keep_going = True
        while index > 0 and keep_going:
            node_str_form = str(bulgarian_section.nodes[index])
            if not (node_str_form.isspace() or RE_MATCH_CATEGORIES.match(node_str_form)):
                keep_going = False
                index += 1  # Insert just after the content that isn't a whitespace/category
            else:
                index -= 1

        # Move past whitespace nodes that directly follow the content. The walrus binds
        # node_str_form to the boolean result of isspace(); it is not read afterwards.
        while index < len(bulgarian_section.nodes) and (node_str_form := str(bulgarian_section.nodes[index]).isspace()):
            index += 1

        bulgarian_section.insert(index, generate_anagrams_section(anagrams_to_add))

    return str(parsed), anagrams_added

def update_page(title: str, alphagram: str, uncreated: set[str]) -> bool:
    """
    Update a page with its anagrams. Returns whether changes were made.

    Args:
        title: Title of the page to update.
        alphagram: Key into the module-level ``anagrams`` table for this page's group.
        uncreated: Set that collects (newline-terminated) anagrams whose own page has no
            Bulgarian section; it is modified in place.

    Returns:
        True if the page was saved, False if it was skipped or left unchanged.
    """
    page = pywikibot.Page(SITE, title)

    if not has_bulgarian(page):
        print(f"Skipping page {title}, as it does not exist or has no Bulgarian content", file=sys.stderr)
        return False

    old_text = page.text
    anagrams_to_add = anagrams[alphagram] - {title}
    new_content, anagrams_added = add_anagrams(old_text, anagrams_to_add, alphagram)
    new_content = re.sub("\n{3,}", "\n\n", new_content)

    for anagram in anagrams_to_add:
        other_page = pywikibot.Page(SITE, anagram)
        if not has_bulgarian(other_page):
            uncreated.add(f"{anagram}\n")

    if new_content == old_text:
        print(f"Did nothing on page {title} as there are already anagrams present", file=sys.stderr)
        return False

    if not anagrams_added:
        print("Nothing was added, but the content was changed! (not saved)")
        return False

    page.text = new_content
    # The backup has to be taken once the new text is in place: diffing the page against
    # itself before the change would always store an empty diff.
    create_diff(old_text, page)

    plural_s = "s" if len(anagrams_added) > 1 else ""
    # Sorted so the edit summary does not depend on set iteration order
    added_list = ", ".join(sorted(anagrams_added))
    page.save(f"Added anagram{plural_s} ({added_list}) to Bulgarian section", minor=False)
    return True

def main(uncreated: set[str]):
    """
    Go through every anagram group and update the page of each word in it.

    The first command-line argument, if it is an integer, limits the number of pages
    edited; otherwise there is no limit. Every fifth group, the page
    "User:KovachevBot/halt" is checked and the run stops if its text contains "halt".

    Args:
        uncreated: Set passed on to update_page, which adds to it in place.
    """
    # sys.argv is read directly; only a missing or non-numeric argument means "no limit",
    # so unrelated errors are no longer swallowed by a bare except.
    try:
        LIMIT = int(sys.argv[1])
    except (IndexError, ValueError):
        LIMIT = -1  # Never equal to edit_count, i.e. unlimited

    print("Preparing to iterate over", len(anagrams), "alphagrams", f"({count_anagrams()} anagrams)")

    edit_count = 0  # Updated for every individual page
    iterations = 0  # Updated for every set of anagrams
    for alphagram, anas in anagrams.items():

        if iterations % 5 == 0: # Every fifth set of anagrams, consider whether to halt
            halt_page = pywikibot.Page(SITE, "User:KovachevBot/halt")
            if "halt" in halt_page.text.casefold():
                print(f"ERROR: BOT WAS MANUALLY HALTED BY {halt_page.userName()}", file=sys.stderr)
                return

        for anagram in anas:
            if edit_count == LIMIT:
                return

            if update_page(anagram, alphagram, uncreated):  # If a change was made, increase the edit count
                edit_count += 1

        iterations += 1

def there_are_erroneous_anagrams(original, anagrams: set[str]) -> bool:
    """
    Tell whether any listed anagram is really the original word in disguise.

    An entry counts as erroneous when it is not equal to ``original`` but normalises to
    the same string, i.e. it has the same letters in the same order.
    """
    return any(
        normalise(candidate) == normalise(original)
        for candidate in anagrams
        if not (candidate == original)
    )

def find_erroneous_anagrams():
    """
    Find pages whose Bulgarian anagrams template lists a dubious entry.

    Every word in the module-level ``anagrams`` table is looked up on the wiki. For
    existing pages with a Bulgarian section, each "anagrams" template whose first
    parameter is "bg" is checked with there_are_erroneous_anagrams. The titles of the
    affected pages are written to "dubious_anagrams.txt", one per line.
    """
    errors = []
    for anagram_list in anagrams.values():
        for anagram in anagram_list:
            page = pywikibot.Page(SITE, anagram)

            if not page.exists(): continue
            if not has_bulgarian(page): continue

            print("Traversing page", anagram + "...")

            for template in mwparserfromhell.parse(page.text).filter(forcetype=mwparserfromhell.wikicode.Template):
                template: mwparserfromhell.wikicode.Template
                if template.name != "anagrams": continue
                if not template.has(1): continue
                if str(template.get(1).value).strip() != "bg": continue

                # Collect the positional parameters 2, 3, ... the same way add_anagrams
                # does. Slicing template.params would depend on where a named parameter
                # such as a= sits: it could be taken for a word or push a real word out.
                listed = []
                i = 2
                while template.has(i):
                    listed.append(str(template.get(i).value).strip())
                    i += 1

                if there_are_erroneous_anagrams(anagram, listed):
                    print("Found erroneous anagrams: ", listed)
                    errors.append(anagram)
                    break  # One hit is enough to report the page

    # Titles are Cyrillic, so the encoding is fixed rather than left to the platform default
    with open("dubious_anagrams.txt", mode="w", encoding="utf-8") as f:
        f.write("\n".join(errors))

if __name__ == "__main__":
    # Disabled editing mode, kept for reference: it loads the entries of NOT_CREATED_LOG
    # into a set, runs main() with it and writes the set back to the file afterwards.
    # uncreated = set()
    # try:
    #     with open(NOT_CREATED_LOG) as f:
    #         uncreated = set(f.readlines())
    # except FileNotFoundError:
    #     with open(NOT_CREATED_LOG, "w") as f:
    #         pass
    # try:
    #     main(uncreated)
    # finally:
    #     with open(NOT_CREATED_LOG, "w") as f:
    #         f.writelines(uncreated)
    # Currently the script only runs the check of existing anagram templates.
    find_erroneous_anagrams()