Uploaded the bot script file
This commit is contained in:
parent
b18325878c
commit
8f96b945d5
110
test.py
Normal file
110
test.py
Normal file
@ -0,0 +1,110 @@
|
||||
import pywikibot
|
||||
import mwparserfromhell
|
||||
import wikitextparser
|
||||
|
||||
PAGENAME = "къща"
|
||||
#PAGENAME = "кукла"
|
||||
|
||||
|
||||
site = pywikibot.Site()
|
||||
page = pywikibot.Page(site, PAGENAME)
|
||||
print(f"FETCHED: {PAGENAME}")
|
||||
def get_forms(page):
|
||||
result = []
|
||||
text = page.expand_text()
|
||||
|
||||
document = mwparserfromhell.parse(text)
|
||||
bulgarian = document.get_sections(matches=r"Bulgarian")[0] # Find the section that contains the Bulgarian entry
|
||||
nouns = bulgarian.get_sections(matches=r"Noun( \d+)*") # Locate any noun sections
|
||||
# document.get_sections()
|
||||
for n in nouns:
|
||||
|
||||
all_forms = {"lemma": "", "forms": {}} # Each 'noun' has a list of forms in its declension, which we will be populating as a dictionary
|
||||
declension = n.get_sections(matches=r"Declension")
|
||||
if declension == []: continue
|
||||
else: declension = declension[0]
|
||||
nav_frame = mwparserfromhell.parse(declension).nodes[2]
|
||||
table = wikitextparser.parse(str(nav_frame)).tables[0].data()
|
||||
columns = len(table[0]) # Always equals 3: the blank tile, plus the singular and plural
|
||||
rows = len(table) # Differs between masculine and non-masculine nouns; also depends on vocative presence
|
||||
|
||||
# Removes wiki formatting, leaving only the text contents of the cell
|
||||
#cyrillic = lambda text: not bool(re.search('[A-ЌЎЏѐ-ќў-嶲]', text))
|
||||
#stripper = lambda item: [i.strip_code() for i in mwparserfromhell.parse(mwparserfromhell.parse(item)).nodes if cyrillic(i.strip_code())]
|
||||
get_tags = lambda item: [i for i in mwparserfromhell.parse(mwparserfromhell.parse(item)).nodes]
|
||||
stripper = lambda tags: [mwparserfromhell.parse(t).strip_code() for t in [t for t in tags if type(t) == mwparserfromhell.nodes.tag.Tag] if not ' class="tr Latn"' in t.attributes and t.tag == "span"]
|
||||
lemma = stripper(get_tags(table[1][1]))[0]
|
||||
all_forms["lemma"] = lemma
|
||||
for i in range(1, columns):
|
||||
column = table[0][i]
|
||||
for j in range(1, rows):
|
||||
row = table[j][0]
|
||||
tags = get_tags(table[j][i])
|
||||
|
||||
#print(tags)
|
||||
values = (stripper(tags))
|
||||
|
||||
numbers = {
|
||||
"singular": "s",
|
||||
"plural": "p",
|
||||
}
|
||||
|
||||
forms = {
|
||||
"indefinite": "indef",
|
||||
"definite": "def",
|
||||
"definite<br>(subject form)": "sbjv",
|
||||
"definite<br>(object form)": "objv",
|
||||
"vocative form": "voc"
|
||||
}
|
||||
for v in values:
|
||||
form = forms[row]
|
||||
number = numbers[column]
|
||||
if v not in all_forms["forms"]:
|
||||
all_forms["forms"][v] = [(form, number)]
|
||||
else:
|
||||
all_forms["forms"][v].append((form, number))
|
||||
|
||||
#print(f"{number}, {form}: {value}")
|
||||
for key in all_forms["forms"]:
|
||||
if all_forms["forms"][key] == [("sbjv", "p"), ("objv", "p")]:
|
||||
all_forms["forms"][key] = [("def", "p")]
|
||||
#print(all_forms[key])
|
||||
|
||||
|
||||
result.append(all_forms)
|
||||
return result
|
||||
|
||||
def generate_derivatives(form_list):
|
||||
forms = {dictionary["lemma"] for dictionary in form_list}
|
||||
if len(forms) < len(form_list):
|
||||
print("Multiple conflicting senses for this term exist with identical stresses, exiting")
|
||||
return
|
||||
for dic in form_list:
|
||||
strip_acute = lambda text: text.replace("́", "")
|
||||
lemma = dic["lemma"]
|
||||
for key in dic["forms"]:
|
||||
title = strip_acute(key)
|
||||
derivative_page = pywikibot.Page(site, title)
|
||||
content = mwparserfromhell.parse(derivative_page.text)
|
||||
if bool(content.get_sections(matches=r"Bulgarian")):
|
||||
print(f"ERROR: page {title} already contains existing Bulgarian entry, exiting")
|
||||
continue
|
||||
if not derivative_page.botMayEdit:
|
||||
print(f"ERROR: page {title} disallows bot editing, exiting")
|
||||
assert "==Bulgarian==" not in content
|
||||
if derivative_page.exists():
|
||||
pass
|
||||
else:
|
||||
print(f"Page {title} does not exist, creating derived form...")
|
||||
definition = "# {{inflection of|bg|" + lemma + "||"
|
||||
for i, variant in enumerate(dic["forms"][key]):
|
||||
form, number = variant
|
||||
definition += form + "|" + number
|
||||
if i < len(dic["forms"][key])-1:
|
||||
definition += "|;|"
|
||||
definition += "}}"
|
||||
entry = "==Bulgarian==\n\n===Pronunciation===\n* {{bg-IPA|" + key + "}}\n\n===Noun===\n{{head|bg|noun form|head=" + key + "}}\n\n" + definition + "\n"
|
||||
derivative_page.text = entry
|
||||
derivative_page.save(f"Created derived form of {lemma}")
|
||||
print(f"Created page {title} with content: {entry}")
|
||||
generate_derivatives(get_forms(page))
|
Loading…
Reference in New Issue
Block a user