anki-hsk-creator/anki-hsk-creator/__main__.py

## Imports
from pathlib import Path
import random
import csv

## PIP
from cedict_utils.cedict import CedictParser
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
from genanki import Deck, Note, Model, Package
import argostranslate.package
import argostranslate.translate

## Constants

# Dictionary file parsed by create_cedict(); expected next to this module.
CCCEDICT = Path(__file__).parent / "cedict_ts.u8"
# "data" directory located one level above this module's directory.
DATA = Path(__file__).parent.parent / "data"
# process_files() lists the files under INPUT and mirrors the chosen file's
# relative path under OUTPUT.
INPUT = DATA / "input"
OUTPUT = DATA / "output"
# Source and target language codes handed to Argos Translate.
CN = "zh"
EN = "en"
# File-name suffixes main() looks for to pick the processing mode.
PHRASES_TYPE = ".phrases"
DICT_TYPE = ".dictionary"
# Stylesheet passed to both note models below; the .simple and .trad classes
# are referenced by the <div> elements in the HSK card templates.
CSS = """
.card {
 font-family: arial;
 font-size: 20px;
 text-align: center;
 color: black;
 background-color: white;
}
.simple {
font-family: Arial;
font-size: 100px;
}
.trad {
font-family: Arial;
font-size: 75px;
}
"""

## Models

# Two-field question/answer note model; output_anki_text() builds its notes
# with it, passing each [translation, original] pair as the field values.
# NOTE(review): the first positional argument looks like genanki's hard-coded
# model ID -- confirm against the genanki documentation before changing it.
SIMPLE_MODEL = Model(
    2076166425,
    "Simple Model",
    fields=[
        {"name": "Question"},
        {"name": "Answer"},
    ],
    templates=[
        {
            # Single card: question on the front, answer revealed below it.
            "name": "Card 1",
            "qfmt": "{{Question}}",
            "afmt": '{{FrontSide}}<hr id="answer">{{Answer}}',
        },
    ],
    css=CSS,
)

# Front side of the "Card 1" template of HSK_MODEL: the Pinyin field (bold)
# followed by the English field, each wrapped in its own <tts> element with a
# zh-CN and an en-US voice respectively.
# NOTE(review): the <tts service="android"> markup looks like AnkiDroid's
# text-to-speech tag -- confirm which clients honour it.
HSK_FRONT_TEMPLATE = """
<tts service="android" voice="zh-CN">
<strong>{{Pinyin}}</strong>
</tts>
<br>
<tts service="android" voice="en-US">
{{English}}
</tts>
"""

# Note model for dictionary entries. output_anki_dictionary() supplies the
# field values in exactly this order: English, Pinyin, Simplified, Traditional.
# Two cards are generated per note:
#   Card 1 -- pinyin + English on the front, characters on the back.
#   Card 2 -- characters on the front, pinyin + English on the back.
HSK_MODEL = Model(
    1708536519,
    "HSK Model",
    fields=[
        {"name": "English"},
        {"name": "Pinyin"},
        {"name": "Simplified"},
        {"name": "Traditional"},
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": HSK_FRONT_TEMPLATE,
            # The <hr> previously carried a stray extra quote (id='answer''),
            # which produced malformed HTML on the answer side.
            "afmt": "{{FrontSide}}<hr id='answer'><div class='simple'>"
            "{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
        },
        {
            "name": "Card 2",
            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
            "{{Traditional}}</div>",
            "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}',
        },
    ],
    css=CSS,
)


## Functions


def create_cedict():
    """Parse the CEDICT file and index its entries by simplified form.

    Returns:
        A dict mapping each simplified-character string to the list of parsed
        entries sharing that simplified form, in the order the parser yields
        them.
    """
    cedict_parser = CedictParser()
    cedict_parser.read_file(CCCEDICT)
    parsed_entries = cedict_parser.parse()

    by_simplified = {}
    for item in parsed_entries:
        # Several entries can share one simplified form, so group them.
        by_simplified.setdefault(item.simplified, []).append(item)
    return by_simplified


def create_translator():
    """Fetch and install the Argos Translate package for CN -> EN.

    Refreshes the package index, picks the first available package whose
    source code is CN and target code is EN, downloads it and installs it.
    """
    argostranslate.package.update_package_index()
    candidates = argostranslate.package.get_available_packages()
    zh_to_en = next(
        pkg for pkg in candidates if pkg.from_code == CN and pkg.to_code == EN
    )
    downloaded_path = zh_to_en.download()
    argostranslate.package.install_from_path(downloaded_path)


## Main


def process_files():
    """Interactively pick an input file and derive its output location.

    Starting at INPUT, lists the entries of the current directory and asks the
    user to choose one by number. Choosing a directory descends into it;
    choosing a file ends the selection. The parent directory of the mirrored
    path under OUTPUT is created if needed.

    Returns:
        A tuple ``(in_file, out_file, file_type)`` where ``in_file`` is the
        selected path, ``out_file`` is the same relative path under OUTPUT and
        ``file_type`` is the first whitespace-separated token of the file
        (an empty string when the file has no content).

    Raises:
        FileNotFoundError: if the directory being browsed has no entries
            (previously this case prompted forever).
    """
    print("Select data file:")
    in_file = None
    level = INPUT
    while in_file is None:
        # glob() yields entries in arbitrary order; sort for a stable menu.
        files = sorted(level.glob("*"))
        if not files:
            raise FileNotFoundError(f"No files to select in {level}")
        for n, file in enumerate(files, start=1):
            print(f"{n} - {file.relative_to(INPUT)}")
        s = None
        # isdecimal() only accepts characters int() can parse, unlike
        # isnumeric(), which also accepts e.g. superscripts and fractions.
        while not s or not s.isdecimal() or not (1 <= int(s) <= len(files)):
            s = input(f"Please select the file [1-{len(files)}]: ").strip()
        selected = files[int(s) - 1]
        if selected.is_file():
            in_file = selected
        else:
            level = selected
    out_file = OUTPUT / in_file.relative_to(INPUT)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with in_file.open(encoding="utf8") as input_file:
        tokens = input_file.read().split()
    # An empty file has no leading token; report that as "" instead of crashing.
    file_type = tokens[0] if tokens else ""
    return in_file, out_file, file_type


def _choose_entry(word, entries, pinyin=None):
    """Return the entry to use for *word*, prompting only when ambiguous."""
    if len(entries) == 1:
        return entries[0]
    candidates = entries
    if pinyin and pinyin != "ERROR":
        matches = [entry for entry in entries if entry.pinyin == pinyin]
        if len(matches) == 1:
            # The pinyin recorded in the file already identifies the entry.
            return matches[0]
        if matches:
            candidates = matches
    print(f"\nWARNING: {word} has multiple meanings:")
    for n, entry in enumerate(candidates, start=1):
        print(f"{n} - {entry}")
        for meaning in entry.meanings:
            print(f"\t{meaning}")
    choice = None
    while (
        not choice
        or not choice.isdecimal()
        or not (1 <= int(choice) <= len(candidates))
    ):
        choice = input(
            f"Please select the correct word [1-{len(candidates)}]: "
        ).strip()
    # Index the same list that was displayed, not the unfiltered one.
    return candidates[int(choice) - 1]


def dictionary_process(dictionary, in_file):
    """Resolve every word of a dictionary file to a single dictionary entry.

    Each non-blank line of *in_file* holds a word optionally followed by its
    pinyin. Words with several entries are disambiguated by that pinyin when
    it matches exactly one entry, otherwise the user is asked to choose.
    Afterwards *in_file* is rewritten as ``word<TAB>pinyin`` lines (or
    ``word<TAB>ERROR`` for unknown words) so the choices are remembered on
    the next run.

    Args:
        dictionary: mapping of simplified word -> list of entries, as built
            by create_cedict(). Entries must expose ``pinyin`` and
            ``meanings``.
        in_file: ``pathlib.Path`` of the word list; it is overwritten.

    Returns:
        The list of selected entries, in file order. Unknown words are
        reported on stdout and omitted.
    """
    lines = in_file.read_text(encoding="utf8").splitlines()
    results = []
    rewritten = []
    for line in lines:
        fields = line.split()
        if not fields:
            # Blank lines (including the trailing newline this function
            # writes itself) carry no word.
            continue
        word = fields[0]
        pinyin = " ".join(fields[1:]) or None
        entries = dictionary.get(word)
        if not entries:
            print("============================================")
            print(f"===================>ERROR: {word} not found")
            print("============================================")
            rewritten.append(f"{word}\tERROR\n")
            continue
        entry = _choose_entry(word, entries, pinyin)
        rewritten.append(f"{word}\t{entry.pinyin}\n")
        results.append(entry)
    # Rewrite only after every word has been processed, so an interrupted
    # run does not leave a truncated word list behind.
    with in_file.open("w", encoding="utf8") as output:
        output.writelines(rewritten)
    return results


def translator_process(in_file):
    """Translate the Chinese sentences of a phrases file into English.

    The file content is split on whitespace and the first token is skipped
    (process_files() reads that token as the file type). Every remaining
    chunk is split on the full stop "。" and each non-empty sentence is
    translated from CN to EN with Argos Translate.

    Args:
        in_file: ``pathlib.Path`` of the phrases file.

    Returns:
        A list of ``[translated_text, original_sentence]`` pairs in file
        order.
    """
    # read_text() closes the file; the previous open().read() leaked the handle.
    chunks = in_file.read_text(encoding="utf8").split()[1:]
    results = []
    for chunk in chunks:
        # split() already removed surrounding whitespace from each chunk.
        for sentence in chunk.split("。"):
            if sentence:
                translated = argostranslate.translate.translate(sentence, CN, EN)
                results.append([translated, sentence])
    return results


def output_tsv(out_file, results):
    """Dump dictionary entries to a tab-separated file next to *out_file*.

    The target has the same stem as *out_file* with a ``.tsv`` extension.
    Each row holds the numbered meanings, the tone-converted pinyin, the
    simplified form and the traditional form of one entry.
    """
    tsv_path = out_file.parent / (out_file.stem + ".tsv")

    def as_row(item):
        # One numbered meaning per line inside the first column.
        numbered = (f"{idx+1}. {text}" for idx, text in enumerate(item.meanings))
        return [
            "\n ".join(numbered),
            PinyinToneConverter().convert_text(item.pinyin),
            item.simplified,
            item.traditional,
        ]

    with tsv_path.open("w", encoding="utf8", newline="") as handle:
        tsv_writer = csv.writer(handle, delimiter="\t", quotechar='"')
        tsv_writer.writerows(as_row(item) for item in results)


def output_anki_dictionary(out_file, results):
    """Write dictionary entries as an Anki package built on HSK_MODEL.

    The package is saved beside *out_file* with an ``.apkg`` extension. The
    deck name is the path of *out_file* relative to OUTPUT, with the folders
    and the file stem joined by ``::``. The deck ID is drawn at random on
    every call.
    """
    package_path = out_file.parent / (out_file.stem + ".apkg")
    folder_parts = out_file.relative_to(OUTPUT).parts[:-1]
    deck_name = "::".join((*folder_parts, out_file.stem))
    deck_id = random.randrange(1 << 30, 1 << 31)
    deck = Deck(deck_id, deck_name)

    for item in results:
        numbered = (f"{idx+1}. {text}" for idx, text in enumerate(item.meanings))
        # Field order follows HSK_MODEL: English, Pinyin, Simplified, Traditional.
        field_values = [
            "\n ".join(numbered),
            PinyinToneConverter().convert_text(item.pinyin),
            item.simplified,
            item.traditional,
        ]
        deck.add_note(Note(model=HSK_MODEL, fields=field_values))

    Package(deck).write_to_file(package_path)


def output_anki_text(out_file, results):
    """Write translated phrases as an Anki package built on SIMPLE_MODEL.

    Every item of *results* is used directly as the field list of one note.
    The package is saved beside *out_file* with an ``.apkg`` extension, and
    the deck name is the path of *out_file* relative to OUTPUT with folders
    and file stem joined by ``::``. The deck ID is drawn at random on every
    call.
    """
    package_path = out_file.parent / (out_file.stem + ".apkg")
    folder_parts = out_file.relative_to(OUTPUT).parts[:-1]
    deck_name = "::".join((*folder_parts, out_file.stem))
    deck_id = random.randrange(1 << 30, 1 << 31)
    deck = Deck(deck_id, deck_name)

    for field_values in results:
        deck.add_note(Note(model=SIMPLE_MODEL, fields=field_values))

    Package(deck).write_to_file(package_path)


def main():
    """Ask for an input file and build the matching Anki package.

    Files with the PHRASES_TYPE suffix are machine-translated and exported
    with the simple model; files with the DICT_TYPE suffix are resolved
    against the dictionary and exported with the HSK model.

    Raises:
        TypeError: if the selected file carries neither suffix.
    """
    # The third returned value (the file's first token) is not used here;
    # the processing mode is chosen from the file-name suffixes.
    in_file, out_file, _file_type = process_files()
    suffixes = in_file.suffixes

    if PHRASES_TYPE in suffixes:
        create_translator()
        phrases = translator_process(in_file)
        output_anki_text(out_file, phrases)
        return

    if DICT_TYPE in suffixes:
        cedict = create_cedict()
        entries = dictionary_process(cedict, in_file)
        output_anki_dictionary(out_file, entries)
        return

    raise TypeError("Error, filetype not especified!")


# Run the interactive workflow only when executed as a script/module,
# not when imported.
if __name__ == "__main__":
    main()