Update anagram classification method
This commit is contained in:
parent
69d581840b
commit
e22c88c737
@ -16,9 +16,9 @@ RE_CATEGORIES = r"\[\[\s*[cC]at(egory)?\s*:[^\]]*\]\]"
|
||||
RE_MATCH_CATEGORIES = re.compile(fr"({RE_CAT_TEMPLATES}|{RE_CATEGORIES})")
|
||||
SITE = pywikibot.Site("en", "wiktionary")
|
||||
BACKUP_PATH = "en-anagrams-backup"
|
||||
ALPHABET = "abcdefghijklmnopqrstuvwxyz"
|
||||
NUMERIC = "0123456789"
|
||||
MISCELLANEOUS = "βðπø" # These characters are lexically significant, i.e. there may be multiple anagrams that have them
|
||||
DIACRITICS = f"{chr(0x0300)}-{chr(0x036F)}"
|
||||
# Characters to strip as punctuation. This string is spliced into a regex
# character class (see REDUNDANT_CHARS), so "-" is only a literal when escaped.
# The hyphen after "»" is escaped on purpose: unescaped, "»-‐" is read as the
# range U+00BB..U+2010, which silently swallows letters such as ð, ø, β and π.
# The trailing f-string adds an intentional range, U+2000..U+206F.
PUNCTUATION = r"’'\(\)\[\]\{\}<>:,‒–—―…!.«»\-‐?‘’“”;/⁄␠·&@\*\•^¤¢$€£¥₩₪†‡°¡¿¬#№%‰‱¶′§~¨_|¦⁂☞∴‽※" + f"{chr(0x2000)}-{chr(0x206F)}"
|
||||
REDUNDANT_CHARS = f"[{DIACRITICS}{PUNCTUATION}]"
|
||||
|
||||
CONVERSIONS = {
|
||||
"æ": "ae",
|
||||
@ -26,8 +26,6 @@ CONVERSIONS = {
|
||||
"ı": "i",
|
||||
}
|
||||
|
||||
NON_ALPHANUMERIC = f"[^{ALPHABET}{NUMERIC}{MISCELLANEOUS}]" # Use this pattern to remove all characters that don't distinguish an anagram
|
||||
|
||||
def create_diff(old_text: str, current_page: pywikibot.Page) -> None:
|
||||
"""
|
||||
Copy the contents of the page to local storage for backup in case there is a problem
|
||||
@ -52,14 +50,14 @@ def normalise(word: str) -> str:
|
||||
- Remove all whitespace at the start and end.
|
||||
- Decompose all characters to their simplest, e.g. é becomes e + ACUTE
|
||||
- Convert to lowercase (casefold)
|
||||
- Remove all irrelevant elements (non-alphanumeric characters).
|
||||
- Remove all irrelevant elements (punctuation, diacritics).
|
||||
"""
|
||||
word = word.strip().casefold()
|
||||
|
||||
for source_char, replacement in CONVERSIONS.items():
|
||||
word = word.replace(source_char, replacement)
|
||||
|
||||
word = re.sub(NON_ALPHANUMERIC, "", unicodedata.normalize("NFKD", word.strip()).casefold())
|
||||
word = re.sub(REDUNDANT_CHARS, "", unicodedata.normalize("NFKD", word.strip()).casefold())
|
||||
return word
|
||||
|
||||
def get_alphagram(word: str) -> str:
|
||||
@ -82,6 +80,9 @@ anagrams = {letter_count: anas for letter_count, anas in anagrams.items() if len
|
||||
def count_anagrams():
    """
    Return the combined size of every value held in the module-level `anagrams` mapping.
    """
    return sum(map(len, anagrams.values()))
|
||||
|
||||
def get_anagrams(word: str, alphagram: str) -> set[str]:
    """
    Return the entries stored under `alphagram` that count as anagrams of `word`.

    The result is the collection found at `anagrams[alphagram]` with two things removed:
    - `word` itself;
    - every entry whose normalised form (see `normalise`) equals that of `word`.

    Raises KeyError if `alphagram` is not a key of the module-level `anagrams` mapping.
    """
    candidates = anagrams[alphagram]

    # The normalised form of `word` is the same for every candidate, so compute it once.
    normalised_word = normalise(word)

    return candidates - {word} - {ana for ana in candidates if normalise(ana) == normalised_word}
|
||||
|
||||
def generate_anagrams_section(anagrams: set[str]) -> str:
    """
    Build the text of an "Anagrams" section for the given set of anagrams.

    The section is a level-3 heading followed by one bullet holding the output of
    `generate_anagrams_template`, padded with blank lines on both sides.
    """
    # Pop from a copy so the caller's set is left untouched; the popped member
    # is only used to derive the alphagram passed to the template generator.
    sample = anagrams.copy().pop()
    template = generate_anagrams_template(anagrams, get_alphagram(sample))
    return "\n\n===Anagrams===\n* " + template + "\n\n"
|
||||
|
||||
@ -156,7 +157,7 @@ def update_page(title: str, alphagram: str) -> bool:
|
||||
|
||||
create_diff(page.text, page)
|
||||
|
||||
anagrams_to_add = anagrams[alphagram] - {title} - {ana for ana in anagrams[alphagram] if normalise(ana) == normalise(title)}
|
||||
anagrams_to_add = get_anagrams(title, alphagram)
|
||||
new_content, added_anagrams = add_anagrams(page.text, anagrams_to_add, alphagram)
|
||||
new_content = re.sub("\n{3,}", "\n\n", new_content)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user