kovachev-bot/bulgarian-anagrams/bg-anagrams.py

224 lines
8.7 KiB
Python
Raw Permalink Normal View History

2023-08-01 08:47:34 +00:00
import os
import subprocess
2023-07-31 20:57:49 +00:00
import sys
import pywikibot
import mwparserfromhell
import regex as re
2023-08-01 08:47:34 +00:00
from collections import defaultdict
2023-07-31 20:57:49 +00:00
# Category-matching patterns, from User:JeffDoozan's bot AutoDooz.
# The names are spliced into a regex without escaping, so an entry such as
# "eo [1-9]OA" acts as a pattern rather than a literal name.
CAT_TEMPLATES = [
    "c",
    "C",
    "cat",
    "top",
    "topic",
    "topics",
    "categorize",
    "catlangname",
    "catlangcode",
    "cln",
    "zh-cat",
    "eo F",
    "eo [1-9]OA",
    "eo-categoryTOC",
    "eo BRO",
    "eo GCSE",
    "Universala Vortaro",
]
# Pattern for a call to one of the category templates above.
RE_CAT_TEMPLATES = "".join((r"\{\{\s*(", "|".join(CAT_TEMPLATES), r")\s*[|}][^{}]*\}*"))
# Pattern for an explicit [[Category:...]] / [[cat:...]] link.
RE_CATEGORIES = r"\[\[\s*[cC]at(egory)?\s*:[^\]]*\]\]"
# Compiled alternation of the two patterns above.
RE_MATCH_CATEGORIES = re.compile("(" + RE_CAT_TEMPLATES + "|" + RE_CATEGORIES + ")")

# Wiki that every page in this script is read from and saved to.
SITE = pywikibot.Site("en", "wiktionary")

# Directory that create_diff writes its per-page diff files into.
BACKUP_PATH = "bg-anagrams-backup"

# Characters that survive normalise(); everything else is removed from a word.
ALPHABET = "абвгдежзийклмнопрстуфхцчшщъьюя"
NUMERIC = "0123456789"
NON_ALPHANUMERIC = "[^" + ALPHABET + NUMERIC + "]"

# Log file name used by the (currently commented-out) entry-point code.
NOT_CREATED_LOG = "non_existent_anagrams.txt"
2023-08-01 08:47:34 +00:00
def create_diff(old_text: str, current_page: pywikibot.Page) -> None:
    """
    Store a unified diff between a page's current text and ``old_text``.

    The diff runs from ``current_page.text`` (the new revision) to
    ``old_text`` (the previous one), so that the earlier text can be
    recovered later if the script turns out to have made a mistake.
    The result is written to a file named after the page title inside
    ``BACKUP_PATH``.

    The scratch files ``temp1`` and ``temp2`` are written to the current
    working directory and are left in place afterwards. The external
    ``diff`` program is invoked through the shell.
    """
    os.makedirs(BACKUP_PATH, exist_ok=True)
    with open("temp1", mode="w", encoding="utf-8") as f:
        f.write(old_text)
    with open("temp2", mode="w", encoding="utf-8") as f:
        f.write(current_page.text)

    # Get differences between new revision and previous
    diff = subprocess.getoutput("diff -u temp2 temp1")
    diff = diff + "\n"  # patch will complain if we don't end the file with a newline

    # A title may contain path separators; used verbatim it would point into a
    # sub-directory that does not exist, or (with a leading separator) outside
    # BACKUP_PATH altogether. Flatten them so every backup lands in BACKUP_PATH.
    backup_name = current_page.title()
    for separator in {"/", os.sep}:
        backup_name = backup_name.replace(separator, "_")

    with open(os.path.join(BACKUP_PATH, backup_name), mode="w", encoding="utf-8") as f:
        f.write(diff)
def normalise(word: str) -> str:
    """
    Reduce a word to the characters that count when comparing anagrams.

    The word is case-folded, "ѝ" is replaced by "и", and every character
    matched by NON_ALPHANUMERIC is removed.
    """
    folded = word.casefold()
    plain_i = folded.replace("ѝ", "и")
    return re.sub(NON_ALPHANUMERIC, "", plain_i)
2023-07-31 20:57:49 +00:00
def get_alphagram(word: str) -> str:
    """Return the normalised characters of ``word`` sorted and joined into one string."""
    letters = list(normalise(word))
    letters.sort()
    return "".join(letters)
2023-07-31 20:57:49 +00:00
def has_bulgarian(page: pywikibot.Page) -> bool:
    """Report whether the page text contains a level-2 section matched by "Bulgarian"."""
    wikicode = mwparserfromhell.parse(page.text)
    bulgarian_sections = wikicode.get_sections([2], "Bulgarian")
    return True if bulgarian_sections else False
2023-08-03 20:14:09 +00:00
# Calculate all anagrams from the file of words
# Read the list as UTF-8 explicitly (like every other file this script
# touches) instead of relying on the platform's default encoding.
with open("words.txt", encoding="utf-8") as f:
    wordlist: list[str] = f.readlines()

# Group the words by alphagram: words sharing an alphagram are anagrams.
anagrams = defaultdict(set)
for word in wordlist:
    stripped = word.strip()
    if not stripped:
        continue  # a blank line is not a word
    anagrams[get_alphagram(stripped)].add(stripped)
# ---------------------------------------------

# Only keep words with multiple anagrams
anagrams = {alphagram: anas for alphagram, anas in anagrams.items() if len(anas) > 1}
2023-08-02 13:22:44 +00:00
def count_anagrams():
    """Return the total number of words across all the sets stored in ``anagrams``."""
    total = 0
    for group in anagrams.values():
        total += len(group)
    return total
2023-07-31 20:57:49 +00:00
def generate_anagrams_section(anagrams: set[str]) -> str:
    """
    Build the wikitext of an ``===Anagrams===`` section listing ``anagrams``.

    The alphagram passed to the template is computed from an arbitrary
    member of the set, which is therefore expected to be non-empty.
    """
    representative = anagrams.copy().pop()
    alphagram = get_alphagram(representative)
    template = generate_anagrams_template(anagrams, alphagram)
    return "".join(("\n\n===Anagrams===\n* ", template, "\n\n"))
def generate_anagrams_template(anagrams: set[str], alphagram: str) -> str:
    """
    Build an ``{{anagrams|bg|a=<alphagram>|<word>|...}}`` template call.

    The words are emitted in sorted order: joining the set directly would
    give an order that can change from one run to the next, producing
    different wikitext for the same input.
    """
    words = "|".join(sorted(anagrams))
    return "{{" + f"anagrams|bg|a={alphagram}|" + words + "}}"
def add_anagrams(contents: str, anagrams_to_add: set[str], alphagram) -> tuple[str, set[str]]:
    """
    Add anagrams to the Bulgarian section of a page's wikitext.

    If the Bulgarian section already has an "Anagrams" subsection, the first
    ``anagrams`` template in it is replaced by one listing both the words it
    already had and the new ones. Otherwise a new Anagrams section is
    inserted after the last node of the Bulgarian section that is neither
    whitespace nor a category.

    Returns a ``(text, added)`` pair: the resulting wikitext and the set of
    anagrams that were actually added. ``contents`` is returned unchanged
    together with an empty set when an Anagrams section exists but holds no
    ``anagrams`` template, or when every requested anagram is already listed.

    Raises IndexError if ``contents`` has no Bulgarian section.
    """
    parsed = mwparserfromhell.parse(contents)
    bulgarian_section = parsed.get_sections([2], "Bulgarian")[0]
    anagrams_sections = bulgarian_section.get_sections([3], "Anagrams")

    if anagrams_sections:
        anagrams_section = anagrams_sections[0]
        # Compare the stripped name so "{{anagrams \n|..." is recognised too.
        anagrams_templates = [
            t
            for t in anagrams_section.filter(forcetype=mwparserfromhell.wikicode.Template)
            if str(t.name).strip() == "anagrams"
        ]
        if not anagrams_templates:
            return contents, set()
        anagrams_template = anagrams_templates[0]

        # Positional parameter 1 is the language code; the words start at 2.
        existing = set()
        i = 2
        while anagrams_template.has(i):
            # Use the parameter's value (not the whole parameter) and strip it,
            # so surrounding whitespace or an explicit "2=" key does not make
            # an already-listed word look new.
            word = str(anagrams_template.get(i).value).strip()
            if word:
                existing.add(word)
            i += 1

        anagrams_added = anagrams_to_add - existing
        if not anagrams_added:  # If there are no new anagrams present
            return contents, set()

        template_position = anagrams_section.index(anagrams_template)
        anagrams_section.nodes[template_position] = generate_anagrams_template(
            existing | anagrams_to_add, alphagram
        )
        return str(parsed), anagrams_added

    nodes = bulgarian_section.nodes
    # Walk backwards over trailing whitespace and categories. ``index`` ends up
    # just after the last node with real content. Node 0 is inspected as well,
    # so a section made only of a heading plus categories gets the new section
    # after its heading rather than before it.
    index = len(nodes)
    while index > 0:
        node_text = str(nodes[index - 1])
        if node_text.isspace() or RE_MATCH_CATEGORIES.match(node_text):
            index -= 1
        else:
            break
    # Keep the whitespace that directly follows the content in front of the
    # inserted section.
    while index < len(nodes) and str(nodes[index]).isspace():
        index += 1
    bulgarian_section.insert(index, generate_anagrams_section(anagrams_to_add))
    return str(parsed), anagrams_to_add.copy()
2023-07-31 20:57:49 +00:00
2023-08-03 20:14:09 +00:00
def update_page(title: str, alphagram: str, uncreated: set[str]) -> bool:
    """
    Update a page with its anagrams. Returns whether changes were made.

    ``title`` is the page to edit and ``alphagram`` is the key of its
    anagram set in ``anagrams``. Every anagram whose own page has no
    Bulgarian section is added to ``uncreated`` (with a trailing newline).

    Nothing is saved, and False is returned, when the page has no Bulgarian
    content, when the text comes out unchanged, or when the text changed
    although no anagram was added.
    """
    page = pywikibot.Page(SITE, title)
    if not has_bulgarian(page):
        print(f"Skipping page {title}, as it does not exist or has no Bulgarian content", file=sys.stderr)
        return False

    old_text = page.text
    anagrams_to_add = anagrams[alphagram] - {title}
    new_content, anagrams_added = add_anagrams(old_text, anagrams_to_add, alphagram)
    new_content = re.sub("\n{3,}", "\n\n", new_content)

    for anagram in anagrams_to_add:
        other_page = pywikibot.Page(SITE, anagram)
        if not has_bulgarian(other_page):
            uncreated.add(f"{anagram}\n")

    if new_content == old_text:
        print(f"Did nothing on page {title} as there are already anagrams present", file=sys.stderr)
        return False
    if not anagrams_added:
        # Only the blank-line collapsing above touched the text.
        print("Nothing was added, but the content was changed! (not saved)")
        return False

    page.text = new_content
    # Record the backup only now, with the real old text: diffing before the
    # modification compares the page against itself and stores an empty diff.
    create_diff(old_text, page)
    plural_s = "s" if len(anagrams_added) > 1 else ""
    # Sort so the edit summary is the same from run to run.
    added_list = ", ".join(sorted(anagrams_added))
    page.save(f"Added anagram{plural_s} ({added_list}) to Bulgarian section", minor=False)
    return True
2023-08-03 20:14:09 +00:00
def main(uncreated: set[str]):
    """
    Go through every anagram set and update the page of each word in it.

    The first command-line argument, if it is an integer, caps the number of
    pages edited; without it there is no cap. Before every fifth anagram set
    the page "User:KovachevBot/halt" is read, and the run stops if its text
    contains "halt" (case-insensitively).

    ``uncreated`` is passed through to ``update_page``, which adds to it.
    """
    try:
        limit = int(pywikibot.argvu[1])
    except (IndexError, ValueError):
        # No argument, or not a number: -1 never equals the edit count,
        # so the run is unlimited.
        limit = -1

    print("Preparing to iterate over", len(anagrams), "alphagrams", f"({count_anagrams()} anagrams)")

    edit_count = 0  # Updated for every individual page
    iterations = 0  # Updated for every set of anagrams
    for alphagram, anas in anagrams.items():
        if iterations % 5 == 0:  # Every fifth set of anagrams, consider whether to halt
            halt_page = pywikibot.Page(SITE, "User:KovachevBot/halt")
            if "halt" in halt_page.text.casefold():
                print(f"ERROR: BOT WAS MANUALLY HALTED BY {halt_page.userName()}", file=sys.stderr)
                return

        for anagram in anas:
            if edit_count == limit:
                return
            # If a change was made, increase the edit count
            edit_count += int(update_page(anagram, alphagram, uncreated))
        iterations += 1
2023-09-01 17:12:08 +00:00
def there_are_erroneous_anagrams(original, anagrams: set[str]) -> bool:
    """
    Return True if any entry of ``anagrams`` is not equal to ``original``
    but normalises to the same string as ``original``; otherwise False.
    """
    return any(
        not (candidate == original) and normalise(candidate) == normalise(original)
        for candidate in anagrams
    )
def find_erroneous_anagrams():
    """
    Check the existing Bulgarian ``anagrams`` templates for dubious entries.

    Every word in ``anagrams`` whose page exists and has a Bulgarian section
    is inspected. If one of its ``{{anagrams|bg|...}}`` templates lists an
    entry flagged by ``there_are_erroneous_anagrams``, the word is recorded.
    The recorded words are written, one per line, to "dubious_anagrams.txt".
    """
    errors = []
    for anagram_list in anagrams.values():
        for anagram in anagram_list:
            page = pywikibot.Page(SITE, anagram)
            if not page.exists():
                continue
            if not has_bulgarian(page):
                continue
            print("Traversing page", anagram + "...")

            templates = mwparserfromhell.parse(page.text).filter(
                forcetype=mwparserfromhell.wikicode.Template
            )
            for template in templates:
                if str(template.name).strip() != "anagrams":
                    continue
                if not template.has(1):
                    continue
                if str(template.get(1).value).strip() != "bg":
                    continue

                # Collect the listed words by positional index (2, 3, ...)
                # rather than by slicing the raw parameter list, so the first
                # word is not skipped when the template has no "a=" parameter.
                listed = []
                i = 2
                while template.has(i):
                    listed.append(str(template.get(i).value).strip())
                    i += 1

                if there_are_erroneous_anagrams(anagram, listed):
                    print("Found erroneous anagrams: ", listed)
                    errors.append(anagram)
                    break  # one report per page is enough

    # The titles are Cyrillic; write UTF-8 explicitly instead of relying on
    # the platform's default encoding.
    with open("dubious_anagrams.txt", mode="w", encoding="utf-8") as f:
        f.write("\n".join(errors))
2023-07-31 20:57:49 +00:00
if __name__ == "__main__":
    # Only the check of existing anagram templates runs at present.
    # The anagram-adding run is disabled; it is kept here for reference:
    #
    #     uncreated = set()
    #     try:
    #         with open(NOT_CREATED_LOG) as f:
    #             uncreated = set(f.readlines())
    #     except FileNotFoundError:
    #         with open(NOT_CREATED_LOG, "w") as f:
    #             pass
    #     try:
    #         main(uncreated)
    #     finally:
    #         with open(NOT_CREATED_LOG, "w") as f:
    #             f.writelines(uncreated)
    find_erroneous_anagrams()