Update with library, scripts

This commit is contained in:
Kiril Kovachev 2023-09-01 18:12:08 +01:00
parent e22c88c737
commit 8dc7c70cd9
20 changed files with 120214 additions and 39 deletions

View File

@ -139,6 +139,10 @@ def update_page(title: str, alphagram: str, uncreated: set[str]) -> bool:
else:
page.text = new_content
plural_s = "s" if len(anagrams_added) > 1 else ""
if len(anagrams_added) == 0:
print("Nothing was added, but the content was changed! (not saved)")
return False
page.save(f"Added anagram{plural_s} ({', '.join(anagrams_added)}) to Bulgarian section", minor=False)
return True
else:
@ -172,16 +176,49 @@ def main(uncreated: set[str]):
iterations += 1
def there_are_erroneous_anagrams(original, anagrams: set[str]) -> bool:
    """Report whether `anagrams` holds a suspicious entry for `original`.

    An entry is suspicious when it is not equal to `original` itself and yet
    `normalise()` maps it to the same string as `original`.
    """
    return any(
        normalise(candidate) == normalise(original)
        for candidate in anagrams
        if not (candidate == original)
    )
def find_erroneous_anagrams():
    """Write the titles of pages with dubious Bulgarian anagram lists to a file.

    Every title found in `anagrams.values()` is loaded from SITE. Pages that do
    not exist, or for which `has_bulgarian()` is false, are skipped. On the
    remaining pages each `{{anagrams}}` template whose first positional
    parameter is "bg" is checked with `there_are_erroneous_anagrams()`; the
    page title is recorded the first time a template fails that check.

    The recorded titles are written, one per line, to "dubious_anagrams.txt".
    """
    errors = []
    for anagram_list in anagrams.values():
        for anagram in anagram_list:
            page = pywikibot.Page(SITE, anagram)
            if not page.exists():
                continue
            if not has_bulgarian(page):
                continue

            print("Traversing page", anagram + "...")
            templates = mwparserfromhell.parse(page.text).filter(
                forcetype=mwparserfromhell.wikicode.Template
            )
            for template in templates:
                template: mwparserfromhell.wikicode.Template
                if template.name != "anagrams":
                    continue
                # `has` is the same check the other scripts in this repository use.
                if not template.has(1):
                    continue
                if template.get(1) != "bg":
                    continue

                # The first two parameters are skipped; the rest are checked
                # as the listed anagrams.
                listed = template.params[2:]
                if there_are_erroneous_anagrams(anagram, listed):
                    print("Found erroneous anagrams: ", listed)
                    errors.append(anagram)
                    break  # One record per page is enough

    # The titles are Cyrillic, so do not depend on the platform's default
    # encoding when writing the report.
    with open("dubious_anagrams.txt", mode="w", encoding="utf-8") as f:
        f.write("\n".join(errors))
if __name__ == "__main__":
uncreated = set()
try:
with open(NOT_CREATED_LOG) as f:
uncreated = set(f.readlines())
except FileNotFoundError:
with open(NOT_CREATED_LOG, "w") as f:
pass
try:
main(uncreated)
finally:
with open(NOT_CREATED_LOG, "w") as f:
f.writelines(uncreated)
# uncreated = set()
# try:
# with open(NOT_CREATED_LOG) as f:
# uncreated = set(f.readlines())
# except FileNotFoundError:
# with open(NOT_CREATED_LOG, "w") as f:
# pass
# try:
# main(uncreated)
# finally:
# with open(NOT_CREATED_LOG, "w") as f:
# f.writelines(uncreated)
find_erroneous_anagrams()

View File

@ -0,0 +1,16 @@
зар
иск
пра-
пот
глоба
картон
арап
тема
треска
нокът
аз
то
А
а
ни
това

View File

@ -0,0 +1,11 @@
def process(line: str) -> str:
    """Return what follows the last space in `line`, with every hyphen removed.

    A line without any space is used in full.
    """
    _, _, last_field = line.rpartition(" ")
    return last_field.replace("-", "")
# Read every line of words.txt, reduce it with process(), and write the
# results to modified.txt in the same order.
with open("words.txt") as f:
    contents = [process(line) for line in f.readlines()]

with open("modified.txt", mode="w") as f:
    f.writelines(contents)

76110
bulgarian-anagrams/words2.txt Normal file

File diff suppressed because it is too large Load Diff

43646
bulgarian-anagrams/words3.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -6,6 +6,7 @@ import pywikibot
import mwparserfromhell
import unicodedata
import regex as re
import random
from collections import defaultdict
# From User:JeffDoozan's bot AutoDooz
@ -17,8 +18,8 @@ RE_MATCH_CATEGORIES = re.compile(fr"({RE_CAT_TEMPLATES}|{RE_CATEGORIES})")
SITE = pywikibot.Site("en", "wiktionary")
BACKUP_PATH = "en-anagrams-backup"
DIACRITICS = f"{chr(0x0300)}-{chr(0x036F)}"
PUNCTUATION = r"'\(\)\[\]\{\}<>:,‒–—―…!.«»-?‘’“”;/⁄␠·&@\*\•^¤¢$€£¥₩₪†‡°¡¿¬#№%‰‱¶′§~¨_|¦⁂☞∴‽※" + f"{chr(0x2000)}-{chr(0x206F)}"
REDUNDANT_CHARS = f"[{DIACRITICS}{PUNCTUATION}]"
PUNCTUATION = r"'()\[\]{}<>:,‒–—―…!.«»\-?‘’“”;/⁄␠·&@*\\•^¤¢$€£¥₩₪†‡°¡¿¬#№%‰‱¶′§~¨_|¦⁂☞∴‽※" + f"{chr(0x2000)}-{chr(0x206F)}"
REDUNDANT_CHARS = f"[{DIACRITICS}{PUNCTUATION} ]"
CONVERSIONS = {
"æ": "ae",
@ -52,7 +53,7 @@ def normalise(word: str) -> str:
- Convert to lowercase (casefold)
- Remove all irrelevant elements (punctuation, diacritics).
"""
word = word.strip().casefold()
word = word.casefold()
for source_char, replacement in CONVERSIONS.items():
word = word.replace(source_char, replacement)
@ -178,6 +179,9 @@ def main():
LIMIT = -1
print("Preparing to iterate over", len(anagrams), "alphragrams", f"({count_anagrams()} anagrams)")
for anagram_list in anagrams.values():
if random.randint(1, 1000) == 50:
print(anagram_list)
edit_count = 0 # Updated for every individual page
iterations = 0 # Updated for every set of anagrams

37
english-anagrams/temp1 Normal file
View File

@ -0,0 +1,37 @@
{{also|Unicef|U.N.I.C.E.F.}}
==English==
{{wikipedia}}
===Alternative forms===
* {{alter|en|Unicef||Britain, Australia, New Zealand}}
* {{alter|en|U.N.I.C.E.F.}}
===Pronunciation===
* English: {{a|US}} {{IPA|en|/ˈju.nəˌsɛf/}}
===Proper noun===
{{en-proper noun}}
# {{acronym of|en|[[United Nations]] [[international|International]] [[child|Children's]] [[emergency|Emergency]] [[fund|Fund]]}}
====Translations====
{{trans-top|United Nations International Children's Emergency Fund}}
* Japanese: {{t+|ja|UNICEF|tr=Yunisefu}}
* Macedonian: {{t|mk|УНИЦЕФ}}, {{t|mk|Уницеф}}
* Russian: {{t+|ru|ЮНИСЕ́Ф|m|sc=Cyrl}}
{{trans-bottom}}
{{cln|en|acronyms}}
{{C|en|United Nations}}
==Japanese==
===Etymology===
Borrowed from {{bor|ja|en|UNICEF|sort=ゆにせふ}}.
===Proper noun===
{{ja-pos|proper|ユニセフ}}
# {{syn of|ja|国際連合児童基金|tr=Kokusai Rengō Jidō Kikin||{{w|United Nations Children's Emergency Fund}}}}; the {{l|en|UNICEF}}
{{topics|ja|Organizations|sort=ゆにせふ}}

37
english-anagrams/temp2 Normal file
View File

@ -0,0 +1,37 @@
{{also|Unicef|U.N.I.C.E.F.}}
==English==
{{wikipedia}}
===Alternative forms===
* {{alter|en|Unicef||Britain, Australia, New Zealand}}
* {{alter|en|U.N.I.C.E.F.}}
===Pronunciation===
* English: {{a|US}} {{IPA|en|/ˈju.nəˌsɛf/}}
===Proper noun===
{{en-proper noun}}
# {{acronym of|en|[[United Nations]] [[international|International]] [[child|Children's]] [[emergency|Emergency]] [[fund|Fund]]}}
====Translations====
{{trans-top|United Nations International Children's Emergency Fund}}
* Japanese: {{t+|ja|UNICEF|tr=Yunisefu}}
* Macedonian: {{t|mk|УНИЦЕФ}}, {{t|mk|Уницеф}}
* Russian: {{t+|ru|ЮНИСЕ́Ф|m|sc=Cyrl}}
{{trans-bottom}}
{{cln|en|acronyms}}
{{C|en|United Nations}}
==Japanese==
===Etymology===
Borrowed from {{bor|ja|en|UNICEF|sort=ゆにせふ}}.
===Proper noun===
{{ja-pos|proper|ユニセフ}}
# {{syn of|ja|国際連合児童基金|tr=Kokusai Rengō Jidō Kikin||{{w|United Nations Children's Emergency Fund}}}}; the {{l|en|UNICEF}}
{{topics|ja|Organizations|sort=ゆにせふ}}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,65 @@
import traceback
from typing import Iterator
import kovachevbot
import mwparserfromhell
import sys
import pywikibot
import regex as re
ROMAJI_TRANSLITERATION_PATTERN = re.compile(r"\(\w+?\)")
def fix_reading_str(reading_str: str) -> str:
    """Clean up a comma-separated list of readings.

    Each comma-separated item is stripped, has every match of
    ROMAJI_TRANSLITERATION_PATTERN (a parenthesised run of word characters)
    removed, is passed through `kovachevbot.links_to_plaintext`, and is
    stripped again. The cleaned items are joined back with ", ".
    """
    cleaned = []
    for raw_reading in reading_str.split(","):
        without_romaji = ROMAJI_TRANSLITERATION_PATTERN.sub("", raw_reading.strip())
        cleaned.append(kovachevbot.links_to_plaintext(without_romaji).strip())
    return ", ".join(cleaned)
def fix_page(page: pywikibot.Page):
    """Tidy the ja-readings templates in the Japanese section of `page`.

    Within the first level-2 section matching "Japanese", every template
    matched by "ja-readings" is processed: parameters whose value is the empty
    string are removed, and every other parameter value is rewritten with
    `fix_reading_str()`. The result is assigned to `page.text`; the page is
    not saved here.

    If the page has no Japanese section, a message is printed to stderr and
    the page is left untouched.
    """
    kanji = page.title()
    parsed = mwparserfromhell.parse(page.text)
    japanese_section_search = parsed.get_sections([2], "Japanese")
    if not japanese_section_search:
        print("Skipping page", kanji, "as it has no Japanese section", file=sys.stderr)
        # Without this return the indexing below raises IndexError.
        return

    japanese_section: mwparserfromhell.wikicode.Wikicode = japanese_section_search[0]
    ja_readingses: list[mwparserfromhell.wikicode.Template] = japanese_section.filter(
        forcetype=mwparserfromhell.wikicode.Template, matches="ja-readings"
    )

    for ja_reading_template in ja_readingses:
        params_to_remove = []
        for param in ja_reading_template.params:
            param: mwparserfromhell.nodes.extras.Parameter
            # Can't delete params while iterating, so we need to store them to delete later
            if param.value == "":  # Delete parameters that are supplied but not populated, e.g. "|nanori="
                params_to_remove.append(param)
            else:
                param.value = fix_reading_str(str(param.value))

        for param in params_to_remove:
            ja_reading_template.remove(param)

    page.text = str(parsed)
def main():
    """Fix the page of every character listed in "ja-readings-to-fix.txt".

    Each character of the file is treated as one page title. Pages are
    processed through `kovachevbot.iterate_safe`, fixed with `fix_page()` and
    saved. Whatever happens (normal completion, an early stop by the iterator,
    an error, or Ctrl-C), the file is rewritten to contain only the characters
    whose pages were not saved, so the next run resumes at the right place.
    """
    # The list holds kanji, so read and write it as UTF-8 explicitly instead of
    # relying on the platform's default encoding.
    with open("ja-readings-to-fix.txt", encoding="utf-8") as f:
        kanji_to_fix = f.read()

    pages = (kovachevbot.wikt_page(kanji) for kanji in kanji_to_fix)
    checked_pages_iter: Iterator[pywikibot.Page] = kovachevbot.iterate_safe(pages)

    # Count only pages that were saved. Unlike a loop index this is defined
    # even if the loop never runs, and a page that fails (including the very
    # first one) is never counted as done.
    completed = 0
    try:
        for page in checked_pages_iter:
            print(page.title())
            fix_page(page)
            page.save("Remove redundant ja-readings markup (manual transliterations; manual links; empty params)")
            completed += 1
    except Exception:
        traceback.print_exc()
    finally:
        with open("ja-readings-to-fix.txt", mode="w", encoding="utf-8") as f:
            f.write(kanji_to_fix[completed:])


if __name__ == "__main__":
    main()

View File

@ -3,10 +3,10 @@ import pywikibot
import os
import subprocess
import mwparserfromhell
import kovachevbot
from mwparserfromhell.wikicode import Template
from restore_pages import BACKUP_PATH
JA_YOMI_TRACKING_PAGE = "tracking/ja-pron/yomi"
SITE = pywikibot.Site("en", "wiktionary")
@ -15,7 +15,6 @@ def get_yomi_pages() -> Generator[pywikibot.Page, None, None]:
MAIN_NAMESPACE = SITE.namespaces.MAIN
return pywikibot.Page(SITE, JA_YOMI_TRACKING_PAGE, ns=TEMPLATE_NAMESPACE).getReferences(only_template_inclusion=True, namespaces=[MAIN_NAMESPACE])
# Use mwparserfromhell to filter all the templates, select the ja-pron ones, and remove any "y" or "yomi"
# arguments they might have.
def remove_yomi_from_page(page: pywikibot.Page) -> None:
@ -27,7 +26,7 @@ def remove_yomi_from_page(page: pywikibot.Page) -> None:
parsed = mwparserfromhell.parse(text)
for template in parsed.ifilter(forcetype=Template, recursive=False):
template: Template
if template.name != "ja-pron":
if template.name != "ja-pron" and str(template.name).casefold() != "ja-ipa":
continue
if template.has("y"):
@ -72,37 +71,32 @@ def template_argument_counts_accord(previous_text: str, current_text: str) -> bo
if previous_pron.name != "ja-pron" or current_pron.name != "ja-pron":
continue
if not (previous_pron.has("y") or previous_pron.has("yomi")):
continue
if len(current_pron.params) != len(previous_pron.params) - 1:
return False
return True
def main():
# Get the maximum number of edits to make from the user (e.g. `pwb ja-yomi-remove 100`);
# if not found then set to unlimited (-1)
try:
LIMIT = int(pywikibot.argvu[1])
except:
LIMIT = -1
for edit_count, page in enumerate(get_yomi_pages()):
if edit_count == LIMIT:
return
if edit_count % 5 == 0:
halt_page = pywikibot.Page(SITE, "User:KovachevBot/halt")
if "halt" in halt_page.text.casefold():
print(f"ERROR: BOT WAS MANUALLY HALTED BY {halt_page.userName()}")
return
for page in kovachevbot.iterate_safe((get_yomi_pages())):
original_text = page.text
print(f"Removing yomi from {page.title()}...")
page.text = remove_yomi_from_page(page)
print(f"Backing up {page.title()}...")
create_diff(original_text, page)
kovachevbot.backup_page(original_text, page, BACKUP_PATH)
try:
assert template_argument_counts_accord(original_text, page.text)
page.save("Removed deprecated yomi/y parameters from {{ja-pron}} (automated task)", minor=True, botflag=True)
except AssertionError:
print("ERROR: page raised error, template argument-counting failsafe did not accord")
continue
if __name__ == "__main__":
main()

3
kovachevbot/README.md Normal file
View File

@ -0,0 +1,3 @@
# KovachevBot commons
This is a directory which acts as a Python module, containing numerous functions and patterns
that I typically reuse throughout my bot code.

View File

@ -0,0 +1,8 @@
Metadata-Version: 2.1
Name: kovachevbot
Version: 0.1
Summary: Library of generic functions and constants used in my bot scripts
Home-page: https://www.kovachev.xyz
Author: Kiril Kovachev
Author-email: kkovachev1947@gmail.com
License: MIT

View File

@ -0,0 +1,9 @@
README.md
setup.py
kovachevbot/__init__.py
kovachevbot/common.py
kovachevbot.egg-info/PKG-INFO
kovachevbot.egg-info/SOURCES.txt
kovachevbot.egg-info/dependency_links.txt
kovachevbot.egg-info/not-zip-safe
kovachevbot.egg-info/top_level.txt

View File

@ -0,0 +1 @@

View File

@ -0,0 +1 @@

View File

@ -0,0 +1 @@
kovachevbot

View File

@ -0,0 +1,3 @@
# When `import kovachevbot` is run, extract all of its data under the simple namespace `kovachevbot`
# (as opposed to resolving `kovachevbot.common` for all module data)
from kovachevbot.common import *

View File

@ -0,0 +1,180 @@
import os
import subprocess
import pywikibot
import tkinter
import sys
import itertools
import mwparserfromhell
import regex as re
from typing import Generator, Iterator
WIKTIONARY = pywikibot.Site("en", "wiktionary")
TEMPLATE_NAMESPACE = WIKTIONARY.namespaces.TEMPLATE
MAIN_NAMESPACE = WIKTIONARY.namespaces.MAIN
COMMONS = pywikibot.Site("commons", "commons")
LINK_PATTERN = re.compile(r"\[\[(.+?)(?:\|(.+?))?\]\]")
def wikt_page(title: str) -> pywikibot.Page:
    """Return a `pywikibot.Page` for `title` on the WIKTIONARY site object."""
    return pywikibot.Page(WIKTIONARY, title)
def save_gui(page: pywikibot.Page, default_edit_summary: str = "") -> bool:
    """Open a window for editing `page` by hand and saving it.

    The window shows the page text in an editable box and an edit-summary
    field pre-filled with `default_edit_summary`. Pressing "Save" copies the
    box contents into `page.text`, closes the window and calls `page.save`
    with the entered summary (as a non-minor edit).

    Returns whether the edit was successfully completed through the save
    button or not; closing the window any other way returns False.
    """
    window = tkinter.Tk()
    window.title(f"Editing page {page.title()}")
    window.geometry("800x600")
    window.config(bg="#000000")

    page_text_box = tkinter.Text(master=window)
    page_text_box.pack(fill="x", expand=False, padx=20, pady=0)
    page_text_box.insert("1.0", page.text)

    edit_summary_label = tkinter.Label(window, text="Edit summary")
    edit_summar_var = tkinter.StringVar()
    edit_summar_var.set(default_edit_summary)
    edit_summary_box = tkinter.Entry(window, textvariable=edit_summar_var, width=60)
    edit_summary_label.place(x=20, y=500)
    edit_summary_box.place(x=20, y=525)

    success = False

    def save_content():
        nonlocal success
        # A Tk Text widget always ends with one newline of its own; "end-1c"
        # stops just before it so the page text does not gain a trailing
        # newline on every save.
        page.text = page_text_box.get("1.0", "end-1c")
        edit_summary = edit_summar_var.get()
        window.destroy()
        page.save(edit_summary, minor=False)
        success = True

    button = tkinter.Button(window, text="Save", command=save_content)
    button.place(x=400, y=550)

    window.mainloop()
    return success
# save_gui(pywikibot.Page(WIKTIONARY, "User:Kiril kovachev/Sandbox"))
def convert_link_to_plaintext(link: mwparserfromhell.wikicode.Wikilink) -> str:
    """Return the visible part of a wikilink.

    That is the link's `text` when it is present and not empty, otherwise the
    link's `title`.
    """
    label = link.text
    if label is None or label == "":
        return link.title
    return label
def links_to_plaintext(text: str) -> str:
    """Return `text` with each wikilink replaced by its plain-text form.

    The text is parsed with mwparserfromhell, and every wikilink found is
    swapped for the result of `convert_link_to_plaintext`.
    """
    wikicode: mwparserfromhell.wikicode.Wikicode = mwparserfromhell.parse(text)
    # filter() returns a list, so replacing nodes while looping is safe here.
    for wikilink in wikicode.filter(forcetype=mwparserfromhell.wikicode.Wikilink):
        wikicode.replace(wikilink, convert_link_to_plaintext(wikilink))
    return str(wikicode)
ABORT_CHECK_INTERVAL = 5
HALT_PAGE = wikt_page("User:KovachevBot/halt") # Do not edit, please!
def iterate_with_abort_check(iterator: Iterator, interval: int = ABORT_CHECK_INTERVAL, halt_page = HALT_PAGE):
    """
    Run over an iterator, checking once every `interval` items (starting with
    the very first) whether the bot has been ordered to stop.

    The bot is considered halted when the text of `halt_page` contains the
    word "halt" in any letter case; iteration then ends without yielding the
    item that was pending. The failsafe page is User:KovachevBot/halt by
    default. A halt page that does not exist counts as "not halted".
    """
    for edit_count, value in enumerate(iterator):
        # Check halt page
        if edit_count % interval == 0:
            # `halt_page.text` is cached on the Page object after the first
            # read, and the default page is one shared module-level object, so
            # a halt request made while the bot runs would never be seen.
            # Force a reload for every check.
            try:
                halt_text = halt_page.get(force=True, get_redirect=True)
            except pywikibot.exceptions.NoPageError:
                halt_text = ""

            if "halt" in halt_text.casefold():
                print(f"ERROR: BOT WAS MANUALLY HALTED BY {halt_page.userName()}", file=sys.stderr)
                return

        yield value
def iterate_entries(iterator: Iterator, max_edits: int = None):
    """Iterate at most `max_edits` entries of an iterator (of pages), or unlimited.

    If no `max_edits` is provided as an arg, try to get the value from the
    command-line arguments (`pywikibot.argvu[1]`). If it is missing or not an
    integer, run without a limit.
    If `max_edits` is provided but cannot be converted to an integer (wrong
    text or wrong type), the limit is dropped as well.
    In the unlimited case this iterator runs until the original one is exhausted.

    No item beyond the limit is pulled from `iterator`.
    """
    if max_edits is None:
        try:
            edit_iter = range(int(pywikibot.argvu[1]))
        except (IndexError, ValueError):
            # No argument given, or it is not a number
            edit_iter = itertools.count()
    else:
        try:
            edit_iter = range(int(max_edits))
        except (TypeError, ValueError):
            # TypeError covers values int() cannot take at all (e.g. a list)
            edit_iter = itertools.count()

    # The limit comes first in zip() so that, once it runs out, zip stops
    # before taking another item from `iterator`.
    for _, value in zip(edit_iter, iterator):
        yield value
def iterate_safe(iterator: Iterator, max_entries: int = None, abort_check_interval: int = ABORT_CHECK_INTERVAL, halt_page: pywikibot.Page = HALT_PAGE):
    """Combine the halt check and the entry limit into one iterator.

    `iterator` is first wrapped by `iterate_with_abort_check` (using
    `abort_check_interval` and `halt_page`), and that result is wrapped by
    `iterate_entries` with `max_entries` as the limit.
    """
    halt_aware = iterate_with_abort_check(iterator, abort_check_interval, halt_page)
    limited = iterate_entries(halt_aware, max_entries)
    return limited
def iterate_tracking(tracking_page: str) -> Generator[pywikibot.Page, None, None]:
    """
    Iterate over main-namespace pages that transclude a tracking template.

    `tracking_page` is the part after `Template:tracking/`: to iterate over
    `Template:tracking/ja-pron/yomi`, pass `ja-pron/yomi`.
    Only references that are template inclusions are returned.
    """
    template_title = f"tracking/{tracking_page}"
    tracking_template = pywikibot.Page(WIKTIONARY, template_title, ns=TEMPLATE_NAMESPACE)
    return tracking_template.getReferences(
        only_template_inclusion=True,
        namespaces=[MAIN_NAMESPACE],
    )
def iterate_category(category_name: str) -> Generator[pywikibot.Page, None, None]:
    """Iterate the main-namespace pages of a Wiktionary category.

    Pass `category_name` without the "Category:" prefix, e.g.
    `category_name="Bulgarian lemmas"`.
    """
    category = pywikibot.Category(WIKTIONARY, category_name)
    return category.articles(namespaces=[MAIN_NAMESPACE])
def backup_page(old_text: str, new_page: pywikibot.Page, backup_path: str, file_name: str = None) -> None:
    """
    Store a diff that can turn the page's new text back into `old_text`.

    `old_text` is written to "temp1" and `new_page.text` to "temp2" in the
    current working directory, `diff -u temp2 temp1` is run, and its output is
    saved in `backup_path` (created if missing). The backup file is named
    `file_name`, or the page title when `file_name` is not given.
    The two temp files are left in place afterwards.
    """
    file_name = file_name or new_page.title()
    os.makedirs(backup_path, exist_ok=True)

    with open("temp1", mode="w", encoding="utf-8") as f:
        f.write(old_text)

    with open("temp2", mode="w", encoding="utf-8") as f:
        f.write(new_page.text)

    diff = subprocess.getoutput("diff -u temp2 temp1")  # Get differences between new revision and previous
    diff = diff + "\n"  # patch will complain if we don't end the file with a newline

    # Use the resolved `file_name`; previously the page title was always used
    # and an explicit `file_name` was silently ignored.
    with open(os.path.join(backup_path, file_name), mode="w", encoding="utf-8") as f:
        f.write(diff)
def add_l2(parsed: mwparserfromhell.wikicode.Wikicode, l2_section: mwparserfromhell.wikicode.Wikicode) -> None:
    """Insert a level-2 section into a page, keeping L2 sections sorted by title.

    `parsed` is modified in place (pass a Wikicode object; a plain string
    would be parsed into a new object that the caller never sees).
    `l2_section` must begin with its heading. If `parsed` already has a
    level-2 section with the same title, nothing is changed.

    Afterwards every level-2 section is followed by a blank line, and runs of
    three or more newlines anywhere in the page are reduced to two.
    """
    parsed = mwparserfromhell.parse(parsed)
    l2_section = mwparserfromhell.parse(l2_section)

    def heading_title(section) -> str:
        # The first node of a section is its heading.
        return str(section.nodes[0].title).strip()

    new_title = heading_title(l2_section)

    # include_lead=True also returns whatever precedes the first L2 heading
    # (always as the first item, possibly empty). Without it that text would
    # be dropped when the page is rebuilt below.
    lead, *existing_sections = parsed.get_sections([2], include_lead=True)
    if any(heading_title(section) == new_title for section in existing_sections):
        return

    ordered = sorted([*existing_sections, l2_section], key=heading_title)

    lead_text = str(lead)
    if lead_text and not lead_text.endswith("\n"):
        lead_text += "\n"  # Keep the first heading on a line of its own

    merged = lead_text + "".join(str(section) + "\n\n" for section in ordered)
    # Collapse blank-line runs on the text itself: Wikicode.replace with a
    # string only matches whole nodes, so a newline run spread over several
    # nodes would never be replaced and a `while ... in parsed` loop could
    # spin forever.
    merged = re.sub(r"\n{3,}", "\n\n", merged)

    parsed.nodes = mwparserfromhell.parse(merged).nodes

11
kovachevbot/setup.py Normal file
View File

@ -0,0 +1,11 @@
from setuptools import setup
# Packaging metadata for the `kovachevbot` helper library.
setup(
    name="kovachevbot",
    version="0.1",
    description="Library of generic functions and constants used in my bot scripts",
    url="https://www.kovachev.xyz",
    author="Kiril Kovachev",
    author_email="kkovachev1947@gmail.com",
    license="MIT",
    packages=["kovachevbot"],
    zip_safe=False,
)