Add accent scraping script
This commit is contained in:
parent 0820700811
commit 8081f40625
1  ja-accents/.gitignore  vendored  Normal file
@@ -0,0 +1 @@
term_bank_1.json
110  ja-accents/daijirin.py  Normal file
@@ -0,0 +1,110 @@
import json
import regex as re
from os.path import expanduser

DJR_DATA_FILE = expanduser("~/Downloads/(三省堂)スーパー大辞林[3.0]/term_bank_1.json")  # Replace with your path to the DJR JSON data
ACCENT_LIST_REGEX = re.compile(r"(?:\[\d{1,2}\])+")  # one or more bracketed accent numbers in a row, e.g. "[1][0]"


class ItemNotFoundError(ValueError):
    """Error when looking up an item in Daijirin; the item was not found."""


class NoAccentError(ValueError):
    """Error when looking up a term's accent: the entry defines no accent in the data."""


# NOTICE: requires 3GB+ RAM at runtime.
# Be cautious if your system does not currently have sufficient available memory.
with open(DJR_DATA_FILE) as f:
    DAIJIRIN = json.load(f)
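# Each entry appears to follow the Yomichan term-bank layout (an assumption based on how the
# fields are indexed below): entry[0] is the headword, entry[1] the kana reading, and
# entry[5] a one-element list holding the entry body.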


def is_kana(s: str) -> bool:
    """Return True if every character of `s` is hiragana or katakana."""
    HIRAGANA_START = '\u3040'
    HIRAGANA_END = '\u309f'
    KATAKANA_START = '\u30a0'
    KATAKANA_END = '\u30ff'
    return all((HIRAGANA_START <= char <= HIRAGANA_END) or (KATAKANA_START <= char <= KATAKANA_END) for char in s)


def validate_headword_and_kana(main_headword: str = None, kana: str = None) -> tuple[str, str]:
    """
    If the `kana` parameter is not specified, raises an error; if `main_headword` is not specified
    but `kana` is, the term is kana-only, so `main_headword` is updated to match the `kana`
    parameter's value. Returns the 2-tuple of post-processed (`main_headword`, `kana`).
    """
    if kana is not None and main_headword is None:
        main_headword = kana
    elif kana is None:
        raise ValueError("Must specify kana parameter")

    return main_headword, kana
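
# For instance, validate_headword_and_kana(kana="ひと") returns ("ひと", "ひと"), while
# validate_headword_and_kana(main_headword="人") raises ValueError because `kana` is missing.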


def are_duplicate_kanas(list_of_kana_readings: list[str]) -> bool:
    """Illustrative input:
    headword 人, `list_of_kana_readings`=["ひと", "にん", "じん"]
    This returns `False` because no reading appears twice.
    """
    # Sets contain unique items only, so if there are duplicates, the set will have fewer elements than the list.
    # If there are no duplicates, we expect their lengths to be the same.
    return len(set(list_of_kana_readings)) < len(list_of_kana_readings)


def find_entry(*, main_headword: str = None, kana: str = None) -> list:
    """
    Finds the record in the dictionary data file corresponding to the input `main_headword` (usually kanji)
    and `kana`. (If the term is kana-only, only `kana` needs to be specified; otherwise, both are required.)
    If nothing is found, returns an empty list.
    """
    main_headword, kana = validate_headword_and_kana(main_headword, kana)

    def entry_matches(entry: list) -> bool:
        if is_kana(main_headword):
            return entry[0] == main_headword
        return entry[0] == main_headword and entry[1] == kana

    for item in DAIJIRIN:
        if entry_matches(item):
            return item

    # If nothing is found, return an empty list
    return []


def get_body(entry: list) -> str:
    # Although the element at index 5 of an entry is a list, every entry in this dictionary
    # has exactly one item in that list: the body of the entry (the definition and the
    # pitch accent information are both in there).
    return entry[5][0]


def get_accent_from_body(entry_body: str) -> tuple[bool, str]:
    """
    From an entry body, returns both whether there is a pitch accent defined, and the string representing
    all the possible pitch accents in a row (e.g. [1][0], [4][3], etc.)
    """
    match = ACCENT_LIST_REGEX.search(entry_body)
    return bool(match), match.group(0) if match else ""


def process_djr_accents(acc_str: str) -> list[str]:
    """Return list of accents from a string like [1][0]."""
    accs = []
    current = ""
    for char in acc_str:
        if char == "[":
            pass  # opening bracket: nothing to collect yet
        elif char == "]":
            # closing bracket: the current accent number is complete
            accs.append(current)
            current = ""
        else:
            current += char
    return accs
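
# e.g. process_djr_accents("[1][0]") -> ["1", "0"]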


def get_accent(*, main_headword: str = None, kana: str = None) -> list[str]:
    """
    Return a list of possible accents for a headword-kana combination. Parameters must be passed as
    keywords to avoid confusion. If the term is not found at all, returns an empty list; if the entry
    exists but defines no accent, raises a `NoAccentError`.
    """
    main_headword, kana = validate_headword_and_kana(main_headword, kana)
    entry = find_entry(main_headword=main_headword, kana=kana)
    if entry == []:
        return []
    entry_body = get_body(entry)
    has_accent, accents_raw = get_accent_from_body(entry_body)
    if has_accent:
        possible_accents = process_djr_accents(accents_raw)
        return possible_accents
    else:
        raise NoAccentError(f"Term {main_headword}({kana}) has no accent in Daijirin.")
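
# A minimal usage sketch (assuming the term bank is present at DJR_DATA_FILE):
#     get_accent(main_headword="日本語", kana="にほんご")
#     # -> a list of accent strings parsed from the entry body, e.g. ["0"]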
112  ja-accents/ja-accent-add.py  Normal file
@@ -0,0 +1,112 @@
import pywikibot
import mwparserfromhell
from typing import Generator

from daijirin import are_duplicate_kanas, is_kana, get_accent


SITE = pywikibot.Site("en", "wiktionary")
NO_ACC_TRACKING_PAGE = "tracking/ja-pron/no accent"
BLACKLIST = "blacklist.txt"


class JapaneseSectionNotFound(ValueError):
    """The entry had no Japanese section."""


def get_japanese_section(parsed_text: mwparserfromhell.wikicode.Wikicode):
    try:
        return parsed_text.get_sections([2], "Japanese")[0]
    except IndexError:
        raise JapaneseSectionNotFound()


def get_kana_from_pron(ja_pron: mwparserfromhell.wikicode.Template, page_title: str) -> str:
    # If the entry is all kana, no kana is provided in the {{ja-pron}}, so infer it from the title
    if ja_pron.has("1"):
        kana = str(ja_pron.get("1").value)
    else:
        if not is_kana(page_title):
            raise ValueError(f"ERROR, improperly formatted template on page {page_title}: pron template did not have kana despite non-kana title.")
        kana = page_title
    return kana
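
# For instance, a page titled 日本語 is expected to carry {{ja-pron|にほんご}} (kana passed as the
# first positional parameter), while a kana-only page such as ひと may omit it and the kana is
# inferred from the page title.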


def there_are_duplicate_readings(ja_prons: list[mwparserfromhell.wikicode.Template], title: str) -> bool:
    return are_duplicate_kanas([get_kana_from_pron(pron, page_title=title) for pron in ja_prons])


def update_page(title: str):
    page = pywikibot.Page(SITE, title)
    parsed = mwparserfromhell.parse(page.text)
    japanese_section = get_japanese_section(parsed)
    ja_prons = [template for template in japanese_section.filter_templates() if template.name.matches("ja-pron")]

    if len(ja_prons) == 0:
        raise ValueError(f"ERROR, no ja-pron on the page {title} to begin with, doing nothing.")

    if there_are_duplicate_readings(ja_prons, title):
        raise ValueError(f"ERROR, there are multiple indistinguishable terms on this page {title} with the same reading")

    for template in ja_prons:
        template: mwparserfromhell.wikicode.Template

        kana = get_kana_from_pron(template, title)

        possible_pitches = get_accent(main_headword=title, kana=kana)

        for i, accent in enumerate(possible_pitches):
            # {{ja-pron}} numbers its accent parameters acc, acc2, acc3, ...
            acc_param = f"acc{i+1 if i > 0 else ''}"
            acc_ref_param = f"{acc_param}_ref"

            # Skip templates that already carry accent information
            if template.has(acc_param) or template.has(acc_ref_param):
                break
                # print("ERROR, template already has accent information")
                # return SuccessCode.FAILURE

            template.add(acc_param, accent)
            template.add(acc_ref_param, "DJR")
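
    # e.g. a bare {{ja-pron|にほんご}} would end up as {{ja-pron|にほんご|acc=0|acc_ref=DJR}}
    # (assuming Daijirin lists accent [0] for that reading)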

    # Make sure a References section exists for the references added via acc_ref
    if "===References===" not in japanese_section:
        japanese_section.append("\n\n===References===\n<references />\n\n")

    page.text = str(parsed)
    # Collapse any runs of blank lines introduced by the edits above
    while "\n\n\n" in page.text:
        page.text = page.text.replace("\n\n\n", "\n\n")

    print(str(mwparserfromhell.parse(page.text).get_sections([2], "Japanese")[0]), "Is this text acceptable? (y/n)", sep="\n")

    valid = False
    while not valid:
        answer = input()
        if answer == "y" or answer == "n":
            valid = True

    if answer == "y":
        page.save("Added accents to page", minor=False)


def get_accentless_pages() -> Generator[pywikibot.Page, None, None]:
    TEMPLATE_NAMESPACE = SITE.namespaces.TEMPLATE
    MAIN_NAMESPACE = SITE.namespaces.MAIN
    return pywikibot.Page(SITE, NO_ACC_TRACKING_PAGE, ns=TEMPLATE_NAMESPACE).getReferences(only_template_inclusion=True, namespaces=[MAIN_NAMESPACE])
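
# The tracking page is transcluded (apparently by {{ja-pron}} itself) on entries that lack accent
# data, so its main-namespace transclusions are exactly the pages that still need accents.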


def iterate_pages():
    for page in get_accentless_pages():
        try:
            update_page(page.title())
        except Exception as e:
            print(f"Unable to update {page.title()} due to error: {e}")


def main():
    try:
        with open(BLACKLIST) as f:
            blacklist = set(map(str.strip, f.readlines()))
    except FileNotFoundError:
        blacklist = set()

    # update_page("碧玉")
    # update_page("パイプカット")

    try:
        iterate_pages()
    except:
        # Persist the blacklist (one term per line) before exiting on any interruption
        with open(BLACKLIST, mode="w") as f:
            f.write("\n".join(blacklist))


if __name__ == "__main__":
    main()