Add diff saving, normalisation

This commit is contained in:
Spooghetti420 2023-08-01 09:47:34 +01:00
parent f01b1ff9c4
commit 77756473e5

View File

@ -1,8 +1,10 @@
import os
import subprocess
import sys
import pywikibot
import mwparserfromhell
import regex as re
from collections import Counter, defaultdict
from collections import defaultdict
# From User:JeffDoozan's bot AutoDooz
CAT_TEMPLATES = [ "c", "C", "cat", "top", "topic", "topics", "categorize", "catlangname", "catlangcode", "cln", "zh-cat",
@ -11,9 +13,33 @@ RE_CAT_TEMPLATES = r"\{\{\s*(" + "|".join(CAT_TEMPLATES) + r")\s*[|}][^{}]*\}*"
# Matches wiki category links such as [[Category:...]] or [[cat:...]].
RE_CATEGORIES = r"\[\[\s*[cC]at(egory)?\s*:[^\]]*\]\]"
# Compiled pattern matching either a category template or a category link.
RE_MATCH_CATEGORIES = re.compile(fr"({RE_CAT_TEMPLATES}|{RE_CATEGORIES})")
# pywikibot site handle for the "en" Wiktionary.
SITE = pywikibot.Site("en", "wiktionary")
# Directory in which create_diff stores one backup diff per page.
BACKUP_PATH = "bg-anagrams-backup"
# Lowercase Cyrillic letters that count as alphabetic when normalising words.
ALPHABET = "абвгдежзийклмнопрстуфхцчшщъьюя"
# Regex character class matching any single character that is not in ALPHABET.
NON_ALPHABETIC = f"[^{ALPHABET}]"
def create_diff(old_text: str, current_page: pywikibot.Page) -> None:
    """
    Save a unified diff for the page to local storage as a backup, so that a
    problematic edit made by the script can be corrected automatically later.

    The diff goes from ``current_page.text`` to ``old_text``; applying it with
    ``patch`` to the current text therefore yields ``old_text``. It is written
    to ``BACKUP_PATH/<page title>``.

    :param old_text: the page text that the stored diff restores.
    :param current_page: page whose ``text`` is the other side of the diff and
        whose ``title()`` names the backup file.
    :raises RuntimeError: if the ``diff`` program reports an error.
    """
    import tempfile  # local import: only this function needs it

    backup_file = os.path.join(BACKUP_PATH, current_page.title())
    # Titles may contain "/", so create the backup file's own parent directory
    # (which is BACKUP_PATH itself for ordinary titles).
    os.makedirs(os.path.dirname(backup_file), exist_ok=True)

    # Use a private scratch directory so the temp files neither clobber files
    # in the working directory nor linger after the call.
    with tempfile.TemporaryDirectory() as workdir:
        for name, text in (("temp1", old_text), ("temp2", current_page.text)):
            with open(os.path.join(workdir, name), mode="w", encoding="utf-8") as f:
                f.write(text)
        # Running with cwd=workdir keeps the "temp2"/"temp1" names in the diff header.
        result = subprocess.run(
            ["diff", "-u", "temp2", "temp1"],  # new revision -> previous text
            cwd=workdir,
            capture_output=True,
            encoding="utf-8",
        )

    # diff exits with 0 (identical) or 1 (differences found); anything higher
    # means it failed, and its output must not be stored as a backup.
    if result.returncode > 1:
        raise RuntimeError(
            f"diff failed for {current_page.title()!r}: {result.stderr.strip()}"
        )

    diff = result.stdout
    if not diff.endswith("\n"):
        diff += "\n"  # patch will complain if we don't end the file with a newline
    with open(backup_file, mode="w", encoding="utf-8") as f:
        f.write(diff)
def normalise(word: str) -> str:
    """Reduce *word* to the letters used for anagram comparison.

    The word is case-folded and every character that is not in ``ALPHABET``
    (whitespace, punctuation, digits, other scripts) is removed.

    :param word: the word or phrase to normalise.
    :return: the case-folded word containing only characters from ``ALPHABET``.
    """
    # NON_ALPHABETIC already matches whitespace, so no separate strip or
    # whitespace-removal pass is needed.
    return re.sub(NON_ALPHABETIC, "", word.casefold())
def get_alphagram(word: str) -> str:
    """Return the alphagram of *word*: its normalised characters in sorted order.

    The word is passed through ``normalise`` first, so the key is built from the
    normalised form rather than the raw title.

    :param word: the word or phrase to compute the alphagram for.
    :return: the sorted characters of the normalised word, joined into a string.
    """
    return "".join(sorted(normalise(word)))
def has_bulgarian(page: pywikibot.Page) -> bool:
    """Report whether the page text has a level-2 section matching "Bulgarian"."""
    wikicode = mwparserfromhell.parse(page.text)
    bulgarian_sections = wikicode.get_sections([2], "Bulgarian")
    return len(bulgarian_sections) > 0
@ -76,13 +102,15 @@ def add_anagrams(contents: str, anagrams_to_add: set[str], alphagram):
return str(parsed)
def update_page(title: str, counted: list[tuple[str, int]]) -> bool:
def update_page(title: str, alphagram: str) -> bool:
"""Update a page with its anagrams. Returns whether changes were made."""
page = pywikibot.Page(SITE, title)
create_diff(page.text, page)
if has_bulgarian(page):
anagrams_to_add = anagrams[counted] - {title}
new_content = add_anagrams(page.text, anagrams_to_add, counted)
anagrams_to_add = anagrams[alphagram] - {title}
new_content = add_anagrams(page.text, anagrams_to_add, alphagram)
new_content = re.sub("\n{3,}", "\n\n", new_content)
if new_content == page.text:
@ -104,6 +132,8 @@ def main():
except:
LIMIT = -1
print("Preparing to iterate over", len(anagrams), "alphragrams")
edit_count = 0 # Updated for every individual page
iterations = 0 # Updated for every set of anagrams
for alphagram, anas in anagrams.items():