From ce72400fd83f0956abf3704cd2dc64f4cbe4d070 Mon Sep 17 00:00:00 2001 From: Spooghetti420 Date: Wed, 2 Aug 2023 14:22:44 +0100 Subject: [PATCH] =?UTF-8?q?Add=20=D1=9D/=D0=B8=20distinction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bulgarian-anagrams/bg-anagrams.py | 13 ++++++++++--- bulgarian-anagrams/words.txt | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/bulgarian-anagrams/bg-anagrams.py b/bulgarian-anagrams/bg-anagrams.py index 71dd71d..b896758 100644 --- a/bulgarian-anagrams/bg-anagrams.py +++ b/bulgarian-anagrams/bg-anagrams.py @@ -15,7 +15,8 @@ RE_MATCH_CATEGORIES = re.compile(fr"({RE_CAT_TEMPLATES}|{RE_CATEGORIES})") SITE = pywikibot.Site("en", "wiktionary") BACKUP_PATH = "bg-anagrams-backup" ALPHABET = "абвгдежзийклмнопрстуфхцчшщъьюя" -NON_ALPHABETIC = f"[^{ALPHABET}]" +NUMERIC = "0123456789" +NON_ALPHANUMERIC = f"[^{ALPHABET}{NUMERIC}]" def create_diff(old_text: str, current_page: pywikibot.Page) -> None: """ @@ -36,7 +37,8 @@ def create_diff(old_text: str, current_page: pywikibot.Page) -> None: f.write(diff) def normalise(word: str) -> str: - return re.sub(NON_ALPHABETIC, "", re.sub("\s", "", word.strip().casefold())) + return re.sub(NON_ALPHANUMERIC, "", re.sub("ѝ", "и", word.casefold())) + # return re.sub("[-.;:?!‒–—]", "", re.sub("\s", "", word.casefold())) def get_alphagram(word: str) -> str: return "".join(sorted(normalise(word))) @@ -54,6 +56,9 @@ for word in wordlist: anagrams = {letter_count: anas for letter_count, anas in anagrams.items() if len(anas) > 1} # Only keep words with multiple anagrams +def count_anagrams(): + return sum(len(anagram_list) for anagram_list in anagrams.values()) + def generate_anagrams_section(anagrams: set[str]) -> str: return "\n\n===Anagrams===\n* " + generate_anagrams_template(anagrams, get_alphagram(anagrams.copy().pop())) + "\n\n" @@ -132,7 +137,9 @@ def main(): except: LIMIT = -1 - print("Preparing to iterate over", len(anagrams), "alphragrams") + print(anagrams["и"]) + print("Preparing to iterate over", len(anagrams), "alphragrams", f"({count_anagrams()} anagrams)") + return edit_count = 0 # Updated for every individual page iterations = 0 # Updated for every set of anagrams diff --git a/bulgarian-anagrams/words.txt b/bulgarian-anagrams/words.txt index ace4784..16a4734 100644 --- a/bulgarian-anagrams/words.txt +++ b/bulgarian-anagrams/words.txt @@ -34356,7 +34356,7 @@ ей ето зер -и +ѝ именно леле ли