diff --git a/english-anagrams/en-anagrams.py b/english-anagrams/en-anagrams.py index 63d1874..e3cdd49 100644 --- a/english-anagrams/en-anagrams.py +++ b/english-anagrams/en-anagrams.py @@ -16,9 +16,9 @@ RE_CATEGORIES = r"\[\[\s*[cC]at(egory)?\s*:[^\]]*\]\]" RE_MATCH_CATEGORIES = re.compile(fr"({RE_CAT_TEMPLATES}|{RE_CATEGORIES})") SITE = pywikibot.Site("en", "wiktionary") BACKUP_PATH = "en-anagrams-backup" -ALPHABET = "abcdefghijklmnopqrstuvwxyz" -NUMERIC = "0123456789" -MISCELLANEOUS = "βðπø" # These characters are lexically significant, i.e. there may be multiple anagrams that have them +DIACRITICS = f"{chr(0x0300)}-{chr(0x036F)}" +PUNCTUATION = r"’'\(\)\[\]\{\}<>:,‒–—―…!.«»\-‐?‘’“”;/⁄␠·&@\*\•^¤¢$€£¥₩₪†‡°¡¿¬#№%‰‱¶′§~¨_|¦⁂☞∴‽※" + f"{chr(0x2000)}-{chr(0x206F)}" # The hyphen-minus is escaped so that it stays a literal inside the character class instead of forming a range that would also strip letters such as β, ð, π and ø +REDUNDANT_CHARS = f"[{DIACRITICS}{PUNCTUATION}]" CONVERSIONS = { "æ": "ae", @@ -26,8 +26,6 @@ CONVERSIONS = { "ı": "i", } -NON_ALPHANUMERIC = f"[^{ALPHABET}{NUMERIC}{MISCELLANEOUS}]" # Use this pattern to remove all characters that don't distinguish an anagram - def create_diff(old_text: str, current_page: pywikibot.Page) -> None: """ Copy the contents of the page to local storage for backup in case there is a problem @@ -52,14 +50,14 @@ def normalise(word: str) -> str: - Remove all whitespace at the start and end. - Decompose all characters to their simplest, e.g. é becomes e + ACUTE - Convert to lowercase (casefold) - - Remove all irrelevant elements (non-alphanumeric characters). + - Remove all irrelevant elements (punctuation, diacritics). 
""" word = word.strip().casefold() for source_char, replacement in CONVERSIONS.items(): word = word.replace(source_char, replacement) - word = re.sub(NON_ALPHANUMERIC, "", unicodedata.normalize("NFKD", word.strip()).casefold()) + word = re.sub(REDUNDANT_CHARS, "", unicodedata.normalize("NFKD", word.strip()).casefold()) return word def get_alphagram(word: str) -> str: @@ -82,6 +80,9 @@ anagrams = {letter_count: anas for letter_count, anas in anagrams.items() if len def count_anagrams(): return sum(len(anagram_list) for anagram_list in anagrams.values()) +def get_anagrams(word: str, alphagram: str) -> set[str]: + return anagrams[alphagram] - {word} - {ana for ana in anagrams[alphagram] if normalise(ana) == normalise(word)} + def generate_anagrams_section(anagrams: set[str]) -> str: return "\n\n===Anagrams===\n* " + generate_anagrams_template(anagrams, get_alphagram(anagrams.copy().pop())) + "\n\n" @@ -156,7 +157,7 @@ def update_page(title: str, alphagram: str) -> bool: create_diff(page.text, page) - anagrams_to_add = anagrams[alphagram] - {title} - {ana for ana in anagrams[alphagram] if normalise(ana) == normalise(title)} + anagrams_to_add = get_anagrams(title, alphagram) new_content, added_anagrams = add_anagrams(page.text, anagrams_to_add, alphagram) new_content = re.sub("\n{3,}", "\n\n", new_content)