Files
anki-hsk-creator/anki-hsk-creator/__main__.py
2026-05-26 21:33:43 +08:00

265 lines
7.7 KiB
Python

## Imports
from pathlib import Path
import random
import csv
## PIP
from cedict_utils.cedict import CedictParser
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
from genanki import Deck, Note, Model, Package
import argostranslate.package
import argostranslate.translate
## Constants
# Language codes passed to the translator (source and target).
CN = "zh"
EN = "en"
# Name suffixes that mark an input file as a phrases file or a word list.
PHRASES_TYPE = ".phrases"
DICT_TYPE = ".dictionary"
# The CC-CEDICT data file sits in the same directory as this module.
CCCEDICT = Path(__file__).parent.joinpath("cedict_ts.u8")
# Input and output trees live in a "data" directory one level above the package.
DATA = Path(__file__).parent.parent.joinpath("data")
INPUT = DATA.joinpath("input")
OUTPUT = DATA.joinpath("output")
# Stylesheet shared by every note model defined in this module.
CSS = """
.card {
font-family: arial;
font-size: 20px;
text-align: center;
color: black;
background-color: white;
}
.simple {
font-family: Arial;
font-size: 100px;
}
.trad {
font-family: Arial;
font-size: 75px;
}
"""
## Models
# Two-field note type (Question / Answer) producing a single card per note.
SIMPLE_MODEL = Model(
    model_id=2076166425,
    name="Simple Model",
    fields=[{"name": "Question"}, {"name": "Answer"}],
    templates=[
        {
            "name": "Card 1",
            # Front shows the question only.
            "qfmt": "{{Question}}",
            # Back repeats the front, then a rule and the answer.
            "afmt": '{{FrontSide}}<hr id="answer">{{Answer}}',
        },
    ],
    css=CSS,
)
# Question-side markup used by the first HSK card: the pinyin in bold followed
# by the English text, each wrapped in its own <tts> tag (service "android",
# voices zh-CN and en-US respectively).
HSK_FRONT_TEMPLATE = """
<tts service="android" voice="zh-CN">
<strong>{{Pinyin}}</strong>
</tts>
<br>
<tts service="android" voice="en-US">
{{English}}
</tts>
"""
# Four-field note type for dictionary words. Each note yields two cards:
#   Card 1 asks with pinyin + English and answers with the characters;
#   Card 2 asks with the characters and answers with pinyin + English.
HSK_MODEL = Model(
    1708536519,
    "HSK Model",
    fields=[
        {"name": "English"},
        {"name": "Pinyin"},
        {"name": "Simplified"},
        {"name": "Traditional"},
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": HSK_FRONT_TEMPLATE,
            # The <hr> id attribute must be closed by exactly one quote,
            # otherwise the generated HTML is malformed.
            "afmt": "{{FrontSide}}<hr id='answer'><div class='simple'>"
            "{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
        },
        {
            "name": "Card 2",
            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
            "{{Traditional}}</div>",
            "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}',
        },
    ],
    css=CSS,
)
## Functions
def create_cedict():
    """Parse the CC-CEDICT file into a lookup table.

    Returns a dict that maps each simplified headword to the list of parsed
    entries sharing that headword, in the order the parser yields them.
    """
    cedict_parser = CedictParser()
    cedict_parser.read_file(CCCEDICT)
    by_simplified = {}
    for item in cedict_parser.parse():
        # Several entries can share one simplified form, so group them.
        by_simplified.setdefault(item.simplified, []).append(item)
    return by_simplified
def create_translator():
    """Download and install the Argos Translate package for CN -> EN.

    Refreshes the Argos package index, picks the first available package
    whose source code is CN and target code is EN, downloads it and installs
    it from the downloaded path.

    Raises:
        RuntimeError: if the package index lists no CN -> EN package.
    """
    argostranslate.package.update_package_index()
    available_packages = argostranslate.package.get_available_packages()
    # Pass a default to next() so a missing package produces a clear error
    # instead of a bare StopIteration escaping from this function.
    package_to_install = next(
        (
            package
            for package in available_packages
            if package.from_code == CN and package.to_code == EN
        ),
        None,
    )
    if package_to_install is None:
        raise RuntimeError(
            f"No Argos Translate package available for {CN} -> {EN}"
        )
    argostranslate.package.install_from_path(package_to_install.download())
## Main
def process_files():
    """Interactively choose an input file and derive its output path.

    Lists the entries of INPUT and asks the user to pick one by number.
    Picking a directory descends into it and repeats the prompt; picking a
    regular file ends the selection.

    Returns:
        A tuple ``(in_file, out_file, file_type)``. ``out_file`` is the
        input's path relative to INPUT re-rooted under OUTPUT (its parent
        directory is created if needed). ``file_type`` is the first
        whitespace-delimited token of the file, or ``""`` if the file holds
        no tokens at all.

    Raises:
        FileNotFoundError: if the directory being listed has no entries,
            since there would be nothing valid to select.
    """
    print("Select data file:")
    in_file = None
    level = INPUT
    while in_file is None:
        # glob() order is arbitrary; sort so the numbering is stable.
        files = sorted(level.glob("*"))
        if not files:
            # Without this guard the prompt below could never be satisfied.
            raise FileNotFoundError(f"No files to select in {level}")
        for n, file in enumerate(files, start=1):
            print(f"{n} - {file.relative_to(INPUT)}")
        s = None
        # isdecimal() only accepts characters int() can parse; isnumeric()
        # also accepts e.g. superscripts and fractions, which make int() raise.
        while not s or not s.isdecimal() or not (1 <= int(s) <= len(files)):
            s = input(f"Please select the file [1-{len(files)}]: ").strip()
        selected = files[int(s) - 1]
        if selected.is_file():
            in_file = selected
        else:
            level = selected
    relative = in_file.relative_to(INPUT)
    out_file = OUTPUT / relative
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with in_file.open(encoding="utf8") as input_file:
        tokens = input_file.read().split()
    # An empty file has no first token; report an empty type instead of crashing.
    file_type = tokens[0] if tokens else ""
    return in_file, out_file, file_type
def _pick_entry(word, candidates):
    """Show *candidates* for *word* and return the one the user selects."""
    print(f"\nWARNING: {word} has multiple meanings:")
    for n, entry in enumerate(candidates, start=1):
        print(f"{n} - {entry}")
        for meaning in entry.meanings:
            print(f"\t{meaning}")
    s = None
    # isdecimal() only accepts characters int() can parse (isnumeric() does not).
    while not s or not s.isdecimal() or not (1 <= int(s) <= len(candidates)):
        s = input(f"Please select the correct word [1-{len(candidates)}]: ")
    return candidates[int(s) - 1]


def dictionary_process(dictionary, in_file):
    """Resolve every word listed in *in_file* against *dictionary*.

    Each non-blank line of the file holds a word, optionally followed by a
    pinyin reading (as written back by an earlier run of this function).
    Blank lines are ignored.

    For a word with several dictionary entries, a recorded pinyin (other than
    the ``ERROR`` marker) narrows the candidates to the entries with that
    pinyin; if it matches none, all entries stay candidates. When more than
    one candidate remains the user is asked to choose; a single remaining
    candidate is taken without prompting.

    After all lines are processed the file is rewritten as
    ``word<TAB>pinyin`` lines, with ``ERROR`` in place of the pinyin for
    words missing from the dictionary. Writing only happens at the end so an
    interruption while prompting does not leave the file truncated.

    Args:
        dictionary: mapping of word -> list of entries; each entry must
            expose ``pinyin`` and ``meanings`` attributes.
        in_file: ``pathlib.Path`` of the word list (read, then overwritten).

    Returns:
        The chosen entries, in file order, for the words that were found.
    """
    with in_file.open(encoding="utf8") as source:
        lines = source.read().split("\n")

    results = []
    rewritten = []
    for line in lines:
        fields = line.split()
        if not fields:
            # Blank line, e.g. the empty string after the trailing newline.
            continue
        word = fields[0]
        pinyin = " ".join(fields[1:]) if len(fields) > 1 else None
        candidates = dictionary.get(word)
        if not candidates:
            print("============================================")
            print(f"===================>ERROR: {word} not found")
            print("============================================")
            rewritten.append(f"{word}\tERROR\n")
            continue
        if len(candidates) > 1 and pinyin and pinyin != "ERROR":
            # Narrow by the recorded reading; fall back to every entry if the
            # recorded reading matches nothing.
            matching = [entry for entry in candidates if entry.pinyin == pinyin]
            candidates = matching or candidates
        if len(candidates) > 1:
            chosen = _pick_entry(word, candidates)
        else:
            chosen = candidates[0]
        rewritten.append(f"{word}\t{chosen.pinyin}\n")
        results.append(chosen)

    with in_file.open("w", encoding="utf8") as output:
        output.writelines(rewritten)
    return results
def translator_process(in_file):
    """Translate the phrases of a text file from CN to EN.

    The file is split on whitespace. The first token is skipped (presumably
    the file-type marker that heads the file -- confirm against the input
    format) and every remaining chunk is translated on its own.

    Args:
        in_file: ``pathlib.Path`` of the phrases file.

    Returns:
        A list of ``[translated_text, original_text]`` pairs, in file order.
    """
    with in_file.open(encoding="utf8") as source:
        phrases = source.read().split()[1:]
    results = []
    for phrase in phrases:
        # NOTE(review): this code used to sub-split each chunk with
        # str.split(""), which always raises ValueError (empty separator).
        # The intended separator cannot be recovered from the code, so each
        # whitespace-delimited chunk is translated whole -- confirm whether a
        # sentence delimiter was meant here.
        translated_text = argostranslate.translate.translate(phrase, CN, EN)
        results.append([translated_text, phrase])
    return results
def output_tsv(out_file, results):
    """Write *results* as a tab-separated file beside *out_file*.

    The target keeps the stem of *out_file* and takes a ``.tsv`` extension.
    Each entry becomes one row: its numbered meanings, its pinyin run through
    the tone converter, its simplified form and its traditional form.
    """
    tsv_path = out_file.parent.joinpath(f"{out_file.stem}.tsv")
    with tsv_path.open("w", encoding="utf8", newline="") as handle:
        tsv_writer = csv.writer(handle, delimiter="\t", quotechar='"')
        for item in results:
            numbered = [
                f"{position}. {meaning}"
                for position, meaning in enumerate(item.meanings, start=1)
            ]
            row = [
                "\n ".join(numbered),
                PinyinToneConverter().convert_text(item.pinyin),
                item.simplified,
                item.traditional,
            ]
            tsv_writer.writerow(row)
def output_anki_dictionary(out_file, results):
    """Write dictionary entries as an ``.apkg`` deck beside *out_file*.

    The deck is named after the path of *out_file* relative to OUTPUT: the
    parent folder names followed by the file stem, joined with ``::``. Every
    entry becomes one HSK_MODEL note holding its numbered meanings, its
    tone-converted pinyin, its simplified form and its traditional form.
    """
    apkg_path = out_file.parent.joinpath(f"{out_file.stem}.apkg")
    folder_parts = out_file.relative_to(OUTPUT).parts[:-1]
    deck_name = "::".join([*folder_parts, out_file.stem])
    # A fresh random id is drawn for the deck on every call.
    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
    for item in results:
        numbered = [
            f"{position}. {meaning}"
            for position, meaning in enumerate(item.meanings, start=1)
        ]
        note_fields = [
            "\n ".join(numbered),
            PinyinToneConverter().convert_text(item.pinyin),
            item.simplified,
            item.traditional,
        ]
        deck.add_note(Note(model=HSK_MODEL, fields=note_fields))
    Package(deck).write_to_file(apkg_path)
def output_anki_text(out_file, results):
    """Write translated phrases as an ``.apkg`` deck beside *out_file*.

    The deck is named after the path of *out_file* relative to OUTPUT: the
    parent folder names followed by the file stem, joined with ``::``. Each
    item of *results* is used directly as the field list of one SIMPLE_MODEL
    note.
    """
    apkg_path = out_file.parent.joinpath(f"{out_file.stem}.apkg")
    folder_parts = out_file.relative_to(OUTPUT).parts[:-1]
    deck_name = "::".join([*folder_parts, out_file.stem])
    # A fresh random id is drawn for the deck on every call.
    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
    for note_fields in results:
        deck.add_note(Note(model=SIMPLE_MODEL, fields=note_fields))
    Package(deck).write_to_file(apkg_path)
def main():
    """Run the interactive flow: pick a file, process it, write a deck.

    Files whose name carries the PHRASES_TYPE suffix are machine-translated
    and packaged with the simple model; files carrying the DICT_TYPE suffix
    are resolved against CC-CEDICT and packaged with the HSK model.

    Raises:
        TypeError: if the selected file name carries neither suffix.
    """
    # The file-type token returned by process_files() is not needed here:
    # the kind of processing is decided from the file name suffixes.
    in_file, out_file, _file_type = process_files()
    if PHRASES_TYPE in in_file.suffixes:
        create_translator()
        results = translator_process(in_file)
        output_anki_text(out_file, results)
    elif DICT_TYPE in in_file.suffixes:
        dictionary = create_cedict()
        results = dictionary_process(dictionary, in_file)
        output_anki_dictionary(out_file, results)
    else:
        raise TypeError(
            f"Error, filetype not specified: {in_file.name} needs a "
            f"{PHRASES_TYPE} or {DICT_TYPE} suffix!"
        )


if __name__ == "__main__":
    main()