Update anagram classification method

2023-08-04 19:50:44 +01:00 · 2023-08-04 19:50:44 +01:00 · e22c88c737
commit e22c88c737
parent 69d581840b
1 changed files with 9 additions and 8 deletions
--- a/english-anagrams/en-anagrams.py
+++ b/english-anagrams/en-anagrams.py
@ -16,9 +16,9 @@ RE_CATEGORIES = r"\[\[\s*[cC]at(egory)?\s*:[^\]]*\]\]"
 RE_MATCH_CATEGORIES = re.compile(fr"({RE_CAT_TEMPLATES}|{RE_CATEGORIES})")
 SITE = pywikibot.Site("en", "wiktionary")
 BACKUP_PATH = "en-anagrams-backup"
-ALPHABET = "abcdefghijklmnopqrstuvwxyz"
-NUMERIC = "0123456789"
-MISCELLANEOUS = "βðπø" # These characters are lexically significant, i.e. there may be multiple anagrams that have them
+DIACRITICS = f"{chr(0x0300)}-{chr(0x036F)}"
+PUNCTUATION = r"’'\(\)\[\]\{\}<>:,‒–—―…!.«»-‐?‘’“”;/⁄␠·&@\*\•^¤¢$€£¥₩₪†‡°¡¿¬#№%‰‱¶′§~¨_|¦⁂☞∴‽※" + f"{chr(0x2000)}-{chr(0x206F)}"
+REDUNDANT_CHARS = f"[{DIACRITICS}{PUNCTUATION}]"

 CONVERSIONS = {
    "æ": "ae",
@ -26,8 +26,6 @@ CONVERSIONS = {
    "ı": "i",
 }

-NON_ALPHANUMERIC = f"[^{ALPHABET}{NUMERIC}{MISCELLANEOUS}]" # Use this pattern to remove all characters that don't distinguish an anagram
-
 def create_diff(old_text: str, current_page: pywikibot.Page) -> None:
    """
    Copy the contents of the page to local storage for backup in case there is a problem
@ -52,14 +50,14 @@ def normalise(word: str) -> str:
        - Remove all whitespace at the start and end.
        - Decompose all characters to their simplest, e.g. é becomes e + ACUTE
        - Convert to lowercase (casefold)
-        - Remove all irrelevant elements (non-alphanumeric characters).
+        - Remove all irrelevant elements (punctuation, diacritics).
    """
    word = word.strip().casefold()

    for source_char, replacement in CONVERSIONS.items():
        word = word.replace(source_char, replacement)

-    word = re.sub(NON_ALPHANUMERIC, "", unicodedata.normalize("NFKD", word.strip()).casefold())
+    word = re.sub(REDUNDANT_CHARS, "", unicodedata.normalize("NFKD", word.strip()).casefold())
    return word

 def get_alphagram(word: str) -> str:
@ -82,6 +80,9 @@ anagrams = {letter_count: anas for letter_count, anas in anagrams.items() if len
 def count_anagrams():
    return sum(len(anagram_list) for anagram_list in anagrams.values())

+def get_anagrams(word: str, alphagram: str) -> set[str]:
+    return anagrams[alphagram] - {word} - {ana for ana in anagrams[alphagram] if normalise(ana) == normalise(word)}
+
 def generate_anagrams_section(anagrams: set[str]) -> str:
    return "\n\n===Anagrams===\n* " + generate_anagrams_template(anagrams, get_alphagram(anagrams.copy().pop())) + "\n\n"

@ -156,7 +157,7 @@ def update_page(title: str, alphagram: str) -> bool:

    create_diff(page.text, page)
    
-    anagrams_to_add = anagrams[alphagram] - {title} - {ana for ana in anagrams[alphagram] if normalise(ana) == normalise(title)}
+    anagrams_to_add = get_anagrams(title, alphagram)
    new_content, added_anagrams = add_anagrams(page.text, anagrams_to_add, alphagram)
    new_content = re.sub("\n{3,}", "\n\n", new_content)