kovachev-bot/bulgarian-anagrams/bg-anagrams.py

import os
import subprocess
import sys
import pywikibot
import mwparserfromhell
import regex as re
from collections import defaultdict

# Category-detection patterns, taken from User:JeffDoozan's bot AutoDooz.
# Template names treated as categorisation templates. The entries are joined
# unescaped into a regex alternation below, so an entry such as "eo [1-9]OA"
# acts as a regex fragment rather than a literal name.
CAT_TEMPLATES = [ "c", "C", "cat", "top", "topic", "topics", "categorize", "catlangname", "catlangcode", "cln", "zh-cat",
        "eo F", "eo [1-9]OA", "eo-categoryTOC", "eo BRO", "eo GCSE", "Universala Vortaro" ]
# Pattern for a "{{name|...}}" / "{{name}}" call of one of the templates above.
RE_CAT_TEMPLATES = r"\{\{\s*(" + "|".join(CAT_TEMPLATES) + r")\s*[|}][^{}]*\}*"
# Pattern for a "[[Category:...]]" / "[[cat:...]]" style link.
RE_CATEGORIES = r"\[\[\s*[cC]at(egory)?\s*:[^\]]*\]\]"
# Combined pattern; add_anagrams matches it against individual wikitext nodes
# to recognise trailing category markup.
RE_MATCH_CATEGORIES = re.compile(fr"({RE_CAT_TEMPLATES}|{RE_CATEGORIES})")
# Site object passed to every pywikibot.Page created in this script.
SITE = pywikibot.Site("en", "wiktionary")
# Directory in which create_diff stores one diff file per page title.
BACKUP_PATH = "bg-anagrams-backup"
# Lowercase letters kept by normalise ("ѝ" is replaced by "и" there before filtering).
ALPHABET = "абвгдежзийклмнопрстуфхцчшщъьюя"
# Digits kept by normalise.
NUMERIC = "0123456789"
# Character class matching anything that is in neither ALPHABET nor NUMERIC.
NON_ALPHANUMERIC = f"[^{ALPHABET}{NUMERIC}]"
# File name for the log of anagrams without a Bulgarian entry; in the visible
# code it is referenced only by the commented-out block in the __main__ guard.
NOT_CREATED_LOG = "non_existent_anagrams.txt"

def create_diff(old_text: str, current_page: pywikibot.Page) -> None:
    """
    Save a unified diff between a page's current text and ``old_text`` as a local backup.

    The diff is produced by the external ``diff -u`` tool in the direction
    "current text -> old text", so that applying it with ``patch`` to the
    current text is intended to restore ``old_text``. The result is written to
    ``BACKUP_PATH/<page title>``, overwriting any earlier backup for that page.

    Args:
        old_text: The text to diff towards (normally the pre-edit page text).
        current_page: Page whose ``text`` is the other side of the diff and
            whose ``title()`` names the backup file.

    Raises:
        FileNotFoundError: If the ``diff`` executable is not available.
        RuntimeError: If ``diff`` reports a failure (exit status above 1).
    """
    import tempfile  # local import: only this function needs it

    os.makedirs(BACKUP_PATH, exist_ok=True)

    # Work in a private scratch directory so no fixed-name temp files are left
    # behind in (or clobbered inside) the current working directory.
    with tempfile.TemporaryDirectory() as scratch_dir:
        with open(os.path.join(scratch_dir, "temp1"), mode="w", encoding="utf-8") as f:
            f.write(old_text)

        with open(os.path.join(scratch_dir, "temp2"), mode="w", encoding="utf-8") as f:
            f.write(current_page.text)

        # Argument list instead of a shell string; running with cwd=scratch_dir
        # keeps the short "temp2"/"temp1" names in the diff headers.
        result = subprocess.run(
            ["diff", "-u", "temp2", "temp1"],  # new revision -> previous revision
            cwd=scratch_dir,
            capture_output=True,
            text=True,
            encoding="utf-8",
        )

    # diff exits with 0 (identical) or 1 (different); anything else is an error
    # whose message must not be stored as if it were a valid backup.
    if result.returncode > 1:
        raise RuntimeError(f"diff failed for page {current_page.title()!r}: {result.stderr.strip()}")

    diff = result.stdout
    if not diff.endswith("\n"):
        diff += "\n"  # patch will complain if we don't end the file with a newline

    # Page titles may contain path separators, which would otherwise point
    # into a non-existent sub-directory of BACKUP_PATH.
    backup_name = current_page.title()
    for separator in {"/", os.sep}:
        backup_name = backup_name.replace(separator, "_")

    with open(os.path.join(BACKUP_PATH, backup_name), mode="w", encoding="utf-8") as f:
        f.write(diff)

def normalise(word: str) -> str:
    """
    Reduce a word to the characters that count when comparing anagrams.

    The word is case-folded, every "ѝ" is replaced by "и", and then every
    character matched by NON_ALPHANUMERIC is removed.
    """
    folded = word.casefold()
    with_plain_i = re.sub("ѝ", "и", folded)
    return re.sub(NON_ALPHANUMERIC, "", with_plain_i)

def get_alphagram(word: str) -> str:
    """Return the normalised characters of ``word``, sorted and joined into one string."""
    letters = list(normalise(word))
    letters.sort()
    return "".join(letters)

def has_bulgarian(page: pywikibot.Page) -> bool:
    """Report whether the page's text has at least one level-2 section matched by "Bulgarian"."""
    wikicode = mwparserfromhell.parse(page.text)
    bulgarian_sections = wikicode.get_sections([2], "Bulgarian")
    return bool(bulgarian_sections)


# Calculate all anagrams from the file of words.
# The word list is Cyrillic text, so it is read as UTF-8 explicitly rather than
# with whatever the platform's default encoding happens to be.
with open("words.txt", encoding="utf-8") as f:
    wordlist: list[str] = f.readlines()

# Maps each alphagram (sorted normalised letters) to the set of words sharing it.
anagrams = defaultdict(set)

for word in wordlist:
    alphagram = get_alphagram(word)
    if not alphagram:
        # Blank lines and entries with no Bulgarian letters or digits all
        # normalise to "", which would wrongly group them as mutual anagrams.
        continue
    anagrams[alphagram].add(word.strip())

# ---------------------------------------------

# Only keep alphagrams shared by more than one word, i.e. words that actually have anagrams.
anagrams = {alphagram: anas for alphagram, anas in anagrams.items() if len(anas) > 1}

def count_anagrams():
    """Return the total number of words across all groups in the module-level ``anagrams`` mapping."""
    total = 0
    for group in anagrams.values():
        total += len(group)
    return total

def generate_anagrams_section(anagrams: set[str]) -> str:
    """
    Build the wikitext of a level-3 "Anagrams" section listing ``anagrams``.

    The alphagram passed to the template is computed from one arbitrary member
    of the set. The set itself is not modified; an empty set raises KeyError.
    """
    sample_word = anagrams.copy().pop()  # pop from a copy so the caller's set stays intact
    template = generate_anagrams_template(anagrams, get_alphagram(sample_word))
    return "".join(["\n\n===Anagrams===\n* ", template, "\n\n"])

def generate_anagrams_template(anagrams: set[str], alphagram: str) -> str:
    """
    Build an ``{{anagrams|bg|a=<alphagram>|...}}`` template call.

    Args:
        anagrams: Words to list as positional parameters.
        alphagram: Value for the template's ``a=`` parameter.

    Returns:
        The template wikitext. The words are emitted in sorted order so the
        output is deterministic; iterating the set directly would make the
        order (and hence the resulting page text) vary from run to run.
    """
    return "{{" + f"anagrams|bg|a={alphagram}|" + "|".join(sorted(anagrams)) + "}}"

def add_anagrams(contents: str, anagrams_to_add: set[str], alphagram) -> tuple[str, set[str]]:
    """
    Add anagrams to the Bulgarian section of a page's wikitext.

    If the Bulgarian section already contains a level-3 "Anagrams" section,
    the first ``anagrams`` template found in it is regenerated so that it
    lists its current entries plus ``anagrams_to_add``. If that section has no
    such template, nothing is changed. If there is no Anagrams section, a new
    one is inserted after the last node of the Bulgarian section that is
    neither whitespace nor category markup, i.e. ahead of trailing categories.

    Args:
        contents: Full wikitext of the page.
        anagrams_to_add: Words that should be listed as anagrams.
        alphagram: Value for the ``a=`` parameter when an existing template is
            regenerated.

    Returns:
        A ``(wikitext, added)`` pair: the resulting wikitext and the set of
        words that were not listed before. When nothing needs changing, the
        original ``contents`` and an empty set are returned.

    Raises:
        IndexError: If ``contents`` has no level-2 Bulgarian section.
    """
    parsed = mwparserfromhell.parse(contents)

    bulgarian_section: mwparserfromhell.wikicode.Wikicode = parsed.get_sections([2], "Bulgarian")[0]
    anagrams_sections = bulgarian_section.get_sections([3], "Anagrams")

    if anagrams_sections:
        anagrams_section = anagrams_sections[0]
        # Compare the stripped name so "{{anagrams |bg|...}}" is recognised too.
        anagrams_templates = [
            t for t in anagrams_section.filter(forcetype=mwparserfromhell.wikicode.Template)
            if str(t.name).strip() == "anagrams"
        ]
        if not anagrams_templates:
            return contents, set()

        anagrams_template = anagrams_templates[0]

        # Positional parameter 1 is the language code; 2 onwards are the words.
        # Read each parameter's value (not the whole parameter, which would
        # include the key for explicitly numbered ones such as "2=word") and
        # strip padding so that already-listed words are recognised as such.
        existing = set()
        i = 2
        while anagrams_template.has(i):
            entry = str(anagrams_template.get(i).value).strip()
            if entry:
                existing.add(entry)
            i += 1

        if anagrams_to_add <= existing:  # If there are no new anagrams present
            return contents, set()

        combined = anagrams_to_add | existing

        # Swap the old template node for the regenerated wikitext in place.
        anagrams_section.nodes[anagrams_section.index(anagrams_template)] = generate_anagrams_template(combined, alphagram)

        anagrams_added = combined - existing
    else:
        if not anagrams_to_add:
            # Nothing to list; building a section from an empty set would fail.
            return contents, set()

        anagrams_added = anagrams_to_add.copy()

        # Walk backwards over trailing whitespace/category nodes to find the
        # last node of real content in the Bulgarian section.
        index = len(bulgarian_section.nodes) - 1
        keep_going = True
        while index > 0 and keep_going:
            node_str_form = str(bulgarian_section.nodes[index])
            if not (node_str_form.isspace() or RE_MATCH_CATEGORIES.match(node_str_form)):
                keep_going = False
                index += 1  # Insert just after the content that isn't a whitespace/category
            else:
                index -= 1

        # Then step forwards past whitespace so the new section lands right
        # before the first trailing category (or at the end of the section).
        while index < len(bulgarian_section.nodes) and str(bulgarian_section.nodes[index]).isspace():
            index += 1

        bulgarian_section.insert(index, generate_anagrams_section(anagrams_to_add))

    return str(parsed), anagrams_added

def update_page(title: str, alphagram: str, uncreated: set[str]) -> bool:
    """
    Update a page with its anagrams. Returns whether changes were made.

    Args:
        title: Title of the page to update.
        alphagram: Key into the module-level ``anagrams`` mapping for this page.
        uncreated: Set that is extended in place with ``"<word>\\n"`` for every
            anagram of this page whose own page has no Bulgarian section.

    Returns:
        True if the page was saved with new anagrams, False otherwise (no
        Bulgarian section, nothing new to add, or only whitespace changed).
    """
    page = pywikibot.Page(SITE, title)

    if not has_bulgarian(page):
        print(f"Skipping page {title}, as it does not exist or has no Bulgarian content", file=sys.stderr)
        return False

    old_text = page.text
    anagrams_to_add = anagrams[alphagram] - {title}
    new_content, anagrams_added = add_anagrams(old_text, anagrams_to_add, alphagram)
    new_content = re.sub("\n{3,}", "\n\n", new_content)

    for anagram in anagrams_to_add:
        other_page = pywikibot.Page(SITE, anagram)
        if not has_bulgarian(other_page):
            uncreated.add(f"{anagram}\n")

    if new_content == old_text:
        print(f"Did nothing on page {title} as there are already anagrams present", file=sys.stderr)
        return False

    if not anagrams_added:
        print("Nothing was added, but the content was changed! (not saved)")
        return False

    page.text = new_content
    # Back up only now, with the pre-edit text on one side and the edited text
    # on the other. Diffing the page against itself before any change, as was
    # done previously, always produced an empty (useless) backup.
    create_diff(old_text, page)

    plural_s = "s" if len(anagrams_added) > 1 else ""
    # Sorted so the edit summary is deterministic.
    page.save(f"Added anagram{plural_s} ({', '.join(sorted(anagrams_added))}) to Bulgarian section", minor=False)
    return True

def main(uncreated: set[str]):
    """
    Run the bot over every anagram group, updating each word's page.

    The first command-line argument (``pywikibot.argvu[1]``), if it is an
    integer, caps the number of saved edits; otherwise there is no cap. Before
    every fifth group the page "User:KovachevBot/halt" is checked, and the run
    stops if its text contains "halt".

    Args:
        uncreated: Set passed through to ``update_page``, which adds the
            anagrams whose pages lack a Bulgarian section.
    """
    try:
        limit = int(pywikibot.argvu[1])
    except (AttributeError, IndexError, ValueError):
        # No usable argument (missing, not an integer, or argvu unavailable):
        # run without an edit limit. Deliberately not a bare except, so that
        # KeyboardInterrupt/SystemExit are never swallowed here.
        limit = -1

    print("Preparing to iterate over", len(anagrams), "alphragrams", f"({count_anagrams()} anagrams)")

    edit_count = 0  # Updated for every individual page
    iterations = 0  # Updated for every set of anagrams
    for alphagram, anas in anagrams.items():

        if iterations % 5 == 0: # Every fifth set of anagrams, consider whether to halt
            halt_page = pywikibot.Page(SITE, "User:KovachevBot/halt")
            if "halt" in halt_page.text.casefold():
                print(f"ERROR: BOT WAS MANUALLY HALTED BY {halt_page.userName()}", file=sys.stderr)
                return

        for anagram in anas:
            if edit_count == limit:
                return

            edit_count += int(update_page(anagram, alphagram, uncreated))  # If a change was made, increase the edit count

        iterations += 1

def there_are_erroneous_anagrams(original, anagrams: set[str]) -> bool:
    """
    Report whether any listed anagram is just a variant spelling of ``original``.

    An entry counts as erroneous when it is not equal to ``original`` but
    normalises to exactly the same string.
    """
    return any(
        normalise(candidate) == normalise(original)
        for candidate in anagrams
        if not candidate == original
    )

def find_erroneous_anagrams():
    """
    Audit existing pages for dubious entries in their Bulgarian anagram templates.

    For every word in the module-level ``anagrams`` mapping whose page exists
    and has a Bulgarian section, each ``{{anagrams|bg|...}}`` template on the
    page is checked with ``there_are_erroneous_anagrams``. The titles of pages
    with at least one flagged template are written, one per line, to
    "dubious_anagrams.txt".
    """
    errors = []
    for anagram_list in anagrams.values():
        for anagram in anagram_list:
            page = pywikibot.Page(SITE, anagram)

            if not page.exists():
                continue
            if not has_bulgarian(page):
                continue

            print("Traversing page", anagram + "...")

            for template in mwparserfromhell.parse(page.text).filter(forcetype=mwparserfromhell.wikicode.Template):
                template: mwparserfromhell.wikicode.Template
                # Compare stripped text so padding inside the template call
                # does not hide it from the audit.
                if str(template.name).strip() != "anagrams":
                    continue
                if not template.has(1):
                    continue
                if str(template.get(1).value).strip() != "bg":
                    continue

                # Collect the listed words by positional number (2, 3, ...),
                # the same way add_anagrams reads them. Slicing the raw
                # parameter list would depend on where "a=" happens to sit.
                listed = []
                position = 2
                while template.has(position):
                    listed.append(str(template.get(position).value).strip())
                    position += 1

                if there_are_erroneous_anagrams(anagram, listed):
                    print("Found erroneous anagrams: ", listed)
                    errors.append(anagram)
                    break

    # Titles are Cyrillic; write UTF-8 explicitly instead of relying on the
    # platform default encoding.
    with open("dubious_anagrams.txt", mode="w", encoding="utf-8") as f:
        f.write("\n".join(errors))

if __name__ == "__main__":
    # The editing run via main() is currently disabled and kept below, commented
    # out, for reference: it loads the NOT_CREATED_LOG entries, runs main(), and
    # always writes the collected entries back. Only the audit of existing
    # anagram templates is executed.
    # uncreated = set()
    # try:
    #     with open(NOT_CREATED_LOG) as f:
    #         uncreated = set(f.readlines())
    # except FileNotFoundError:
    #     with open(NOT_CREATED_LOG, "w") as f:
    #         pass
    # try:
    #     main(uncreated)
    # finally:
    #     with open(NOT_CREATED_LOG, "w") as f:
    #         f.writelines(uncreated)
    find_erroneous_anagrams()
Add diff saving, normalsation 2023-08-01 08:47:34 +00:00			`import os`
			`import subprocess`
Upload anagram script 2023-07-31 20:57:49 +00:00			`import sys`
			`import pywikibot`
			`import mwparserfromhell`
			`import regex as re`
Add diff saving, normalsation 2023-08-01 08:47:34 +00:00			`from collections import defaultdict`
Upload anagram script 2023-07-31 20:57:49 +00:00
			`# From User:JeffDoozan's bot AutoDooz`
			`CAT_TEMPLATES = [ "c", "C", "cat", "top", "topic", "topics", "categorize", "catlangname", "catlangcode", "cln", "zh-cat",`
			`"eo F", "eo [1-9]OA", "eo-categoryTOC", "eo BRO", "eo GCSE", "Universala Vortaro" ]`
			`RE_CAT_TEMPLATES = r"\{\{\s(" + "\|".join(CAT_TEMPLATES) + r")\s[\|}][^{}]\}"`
			`RE_CATEGORIES = r"\[\[\s[cC]at(egory)?\s:[^\]]*\]\]"`
			`RE_MATCH_CATEGORIES = re.compile(fr"({RE_CAT_TEMPLATES}\|{RE_CATEGORIES})")`
			`SITE = pywikibot.Site("en", "wiktionary")`
Add diff saving, normalsation 2023-08-01 08:47:34 +00:00			`BACKUP_PATH = "bg-anagrams-backup"`
			`ALPHABET = "абвгдежзийклмнопрстуфхцчшщъьюя"`
Add ѝ/и distinction 2023-08-02 13:22:44 +00:00			`NUMERIC = "0123456789"`
			`NON_ALPHANUMERIC = f"[^{ALPHABET}{NUMERIC}]"`
Add logging capabilities 2023-08-03 20:14:09 +00:00			`NOT_CREATED_LOG = "non_existent_anagrams.txt"`
Add diff saving, normalsation 2023-08-01 08:47:34 +00:00
			`def create_diff(old_text: str, current_page: pywikibot.Page) -> None:`
			`"""`
			`Copy the contents of the page to local storage for backup in case there is a problem`
			`with the script later; this will allow the error to be automatically corrected at that time.`
			`"""`
			`os.makedirs(BACKUP_PATH, exist_ok=True)`
			`with open("temp1", mode="w", encoding="utf-8") as f:`
			`f.write(old_text)`

			`with open("temp2", mode="w", encoding="utf-8") as f:`
			`f.write(current_page.text)`

			`diff = subprocess.getoutput("diff -u temp2 temp1") # Get differences between new revision and previous`
			`diff = diff + "\n" # patch will complain if we don't end the file with a newline`

			`with open(os.path.join(BACKUP_PATH, current_page.title()), mode="w", encoding="utf-8") as f:`
			`f.write(diff)`

			`def normalise(word: str) -> str:`
Add ѝ/и distinction 2023-08-02 13:22:44 +00:00			`return re.sub(NON_ALPHANUMERIC, "", re.sub("ѝ", "и", word.casefold()))`
			`# return re.sub("[-.;:?!‒–—]", "", re.sub("\s", "", word.casefold()))`
Upload anagram script 2023-07-31 20:57:49 +00:00
			`def get_alphagram(word: str) -> str:`
Add diff saving, normalsation 2023-08-01 08:47:34 +00:00			`return "".join(sorted(normalise(word)))`
Upload anagram script 2023-07-31 20:57:49 +00:00
			`def has_bulgarian(page: pywikibot.Page) -> bool:`
			`return bool(mwparserfromhell.parse(page.text).get_sections([2], "Bulgarian"))`

Add logging capabilities 2023-08-03 20:14:09 +00:00
			`# Calculate all anagrams from the file of words`
Upload anagram script 2023-07-31 20:57:49 +00:00			`with open("words.txt") as f:`
			`wordlist: list[str] = f.readlines()`

			`anagrams = defaultdict(set)`

			`for word in wordlist:`
			`anagrams[get_alphagram(word)].add(word.strip())`

Add logging capabilities 2023-08-03 20:14:09 +00:00			`# ---------------------------------------------`

Upload anagram script 2023-07-31 20:57:49 +00:00			`anagrams = {letter_count: anas for letter_count, anas in anagrams.items() if len(anas) > 1} # Only keep words with multiple anagrams`

Add ѝ/и distinction 2023-08-02 13:22:44 +00:00			`def count_anagrams():`
			`return sum(len(anagram_list) for anagram_list in anagrams.values())`

Upload anagram script 2023-07-31 20:57:49 +00:00			`def generate_anagrams_section(anagrams: set[str]) -> str:`
			`return "\n\n===Anagrams===\n* " + generate_anagrams_template(anagrams, get_alphagram(anagrams.copy().pop())) + "\n\n"`

			`def generate_anagrams_template(anagrams: set[str], alphagram: str) -> str:`
			`return "{{" + f"anagrams\|bg\|a={alphagram}\|" + "\|".join(anagrams) + "}}"`

			`def add_anagrams(contents: str, anagrams_to_add: set[str], alphagram):`
			`parsed = mwparserfromhell.parse(contents)`
Add logging capabilities 2023-08-03 20:14:09 +00:00
			`anagrams_added = anagrams_to_add.copy()`

Upload anagram script 2023-07-31 20:57:49 +00:00			`bulgarian_section: mwparserfromhell.wikicode.Wikicode = parsed.get_sections([2], "Bulgarian")[0]`
			`anagrams_section: mwparserfromhell.wikicode.Wikicode = bulgarian_section.get_sections([3], "Anagrams")`
			`if anagrams_section:`
			`anagrams_section = anagrams_section[0]`
			`anagrams_templates = anagrams_section.filter(forcetype=mwparserfromhell.wikicode.Template)`
			`anagrams_templates = [t for t in anagrams_templates if t.name == "anagrams"]`
			`if len(anagrams_templates) == 0:`
Add logging capabilities 2023-08-03 20:14:09 +00:00			`return contents, set()`
Upload anagram script 2023-07-31 20:57:49 +00:00
			`existing = set()`
			`anagrams_template = anagrams_templates[0]`
			`i = 2`
			`while anagrams_template.has(i):`
			`existing.add(str(anagrams_template.get(i)))`
			`i += 1`

Fix non-idempotence bug 2023-08-01 11:01:40 +00:00			`if existing.union(anagrams_to_add) == existing: # If there are no new anagrams present`
Add logging capabilities 2023-08-03 20:14:09 +00:00			`return contents, set()`
Upload anagram script 2023-07-31 20:57:49 +00:00
			`anagrams_to_add = anagrams_to_add.union(existing)`

			`anagrams_section.nodes[anagrams_section.index(anagrams_template)] = generate_anagrams_template(anagrams_to_add, alphagram)`
Add logging capabilities 2023-08-03 20:14:09 +00:00
			`anagrams_added = anagrams_to_add.difference(existing)`
Upload anagram script 2023-07-31 20:57:49 +00:00			`else:`
			`index = len(bulgarian_section.nodes)-1`
			`keep_going = True`
			`while index > 0 and keep_going:`
			`node_str_form = str(bulgarian_section.nodes[index])`
			`if not (node_str_form.isspace() or RE_MATCH_CATEGORIES.match(node_str_form)):`
			`keep_going = False`
			`index += 1 # Insert just after the content that isn't a whitespace/category`
			`else:`
			`index -= 1`

			`while index < len(bulgarian_section.nodes) and (node_str_form := str(bulgarian_section.nodes[index]).isspace()):`
			`index += 1`

			`bulgarian_section.insert(index, generate_anagrams_section(anagrams_to_add))`

Add logging capabilities 2023-08-03 20:14:09 +00:00			`return str(parsed), anagrams_added`
Upload anagram script 2023-07-31 20:57:49 +00:00
Add logging capabilities 2023-08-03 20:14:09 +00:00			`def update_page(title: str, alphagram: str, uncreated: set[str]) -> bool:`
Upload anagram script 2023-07-31 20:57:49 +00:00			`"""Update a page with its anagrams. Returns whether changes were made."""`
			`page = pywikibot.Page(SITE, title)`
Add diff saving, normalsation 2023-08-01 08:47:34 +00:00
			`create_diff(page.text, page)`
Upload anagram script 2023-07-31 20:57:49 +00:00
			`if has_bulgarian(page):`
Add diff saving, normalsation 2023-08-01 08:47:34 +00:00			`anagrams_to_add = anagrams[alphagram] - {title}`
Add logging capabilities 2023-08-03 20:14:09 +00:00			`new_content, anagrams_added = add_anagrams(page.text, anagrams_to_add, alphagram)`
Upload anagram script 2023-07-31 20:57:49 +00:00			`new_content = re.sub("\n{3,}", "\n\n", new_content)`

Add logging capabilities 2023-08-03 20:14:09 +00:00			`for anagram in anagrams_to_add:`
			`other_page = pywikibot.Page(SITE, anagram)`
			`if not has_bulgarian(other_page):`
			`uncreated.add(f"{anagram}\n")`

Upload anagram script 2023-07-31 20:57:49 +00:00			`if new_content == page.text:`
			`print(f"Did nothing on page {title} as there are already anagrams present", file=sys.stderr)`
			`return False`
			`else:`
			`page.text = new_content`
Add logging capabilities 2023-08-03 20:14:09 +00:00			`plural_s = "s" if len(anagrams_added) > 1 else ""`
Update with library, scritps 2023-09-01 17:12:08 +00:00			`if len(anagrams_added) == 0:`
			`print("Nothing was added, but the content was changed! (not saved)")`
			`return False`

Add logging capabilities 2023-08-03 20:14:09 +00:00			`page.save(f"Added anagram{plural_s} ({', '.join(anagrams_added)}) to Bulgarian section", minor=False)`
Upload anagram script 2023-07-31 20:57:49 +00:00			`return True`
			`else:`
			`print(f"Skipping page {title}, as it does not exist or has no Bulgarian content", file=sys.stderr)`

			`return False`

Add logging capabilities 2023-08-03 20:14:09 +00:00			`def main(uncreated: set[str]):`
Upload anagram script 2023-07-31 20:57:49 +00:00			`try:`
			`LIMIT = int(pywikibot.argvu[1])`
			`except:`
			`LIMIT = -1`

Add ѝ/и distinction 2023-08-02 13:22:44 +00:00			`print("Preparing to iterate over", len(anagrams), "alphragrams", f"({count_anagrams()} anagrams)")`
Add diff saving, normalsation 2023-08-01 08:47:34 +00:00
Upload anagram script 2023-07-31 20:57:49 +00:00			`edit_count = 0 # Updated for every individual page`
			`iterations = 0 # Updated for every set of anagrams`
			`for alphagram, anas in anagrams.items():`

			`if iterations % 5 == 0: # Every fifth set of anagrams, consider whether to halt`
			`halt_page = pywikibot.Page(SITE, "User:KovachevBot/halt")`
			`if "halt" in halt_page.text.casefold():`
			`print(f"ERROR: BOT WAS MANUALLY HALTED BY {halt_page.userName()}", file=sys.stderr)`
			`return`

			`for anagram in anas:`
			`if edit_count == LIMIT:`
			`return`

Add logging capabilities 2023-08-03 20:14:09 +00:00			`edit_count += int(update_page(anagram, alphagram, uncreated)) # If a change was made, increase the edit count`
Upload anagram script 2023-07-31 20:57:49 +00:00
			`iterations += 1`

Update with library, scritps 2023-09-01 17:12:08 +00:00			`def there_are_erroneous_anagrams(original, anagrams: set[str]) -> bool:`
			`for anagram in anagrams:`
			`if anagram == original: continue`
			`if normalise(anagram) == normalise(original):`
			`return True`
			`return False`

			`def find_erroneous_anagrams():`
			`errors = []`
			`for anagram_list in anagrams.values():`
			`for anagram in anagram_list:`
			`page = pywikibot.Page(SITE, anagram)`

			`if not page.exists(): continue`
			`if not has_bulgarian(page): continue`

			`print("Traversing page", anagram + "...")`

			`for template in mwparserfromhell.parse(page.text).filter(forcetype=mwparserfromhell.wikicode.Template):`
			`template: mwparserfromhell.wikicode.Template`
			`if template.name != "anagrams": continue`
			`if not template.has_param(1): continue`
			`if template.get(1) != "bg": continue`

			`if there_are_erroneous_anagrams(anagram, template.params[2:]):`
			`print("Found erroneous anagrams: ", template.params[2:])`
			`errors.append(anagram)`
			`break`

			`with open("dubious_anagrams.txt", mode="w") as f:`
			`f.write("\n".join(errors))`

Upload anagram script 2023-07-31 20:57:49 +00:00			`if __name__ == "__main__":`
Update with library, scritps 2023-09-01 17:12:08 +00:00			`# uncreated = set()`
			`# try:`
			`# with open(NOT_CREATED_LOG) as f:`
			`# uncreated = set(f.readlines())`
			`# except FileNotFoundError:`
			`# with open(NOT_CREATED_LOG, "w") as f:`
			`# pass`
			`# try:`
			`# main(uncreated)`
			`# finally:`
			`# with open(NOT_CREATED_LOG, "w") as f:`
			`# f.writelines(uncreated)`
			`find_erroneous_anagrams()`