anki-hsk-creator/anki-hsk-creator/__main__.py

## Imports
from pathlib import Path
import csv

from cedict_utils.cedict import CedictParser
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter

# from genanki import Deck, Note

# import argostranslate.package
# import argostranslate.translate

## Constants

# Path to the dictionary file "cedict_ts.u8", located next to this module.
CCCEDICT = Path(__file__).parent / "cedict_ts.u8"
# "data" directory located one level above this module's directory.
DATA = Path(__file__).parent.parent / "data"
# Directory searched recursively for *.txt word lists by process_files().
INPUT = DATA / "input"
# Root under which process_files() mirrors the chosen input's relative path.
OUTPUT = DATA / "output"
# Language codes; in the code visible here they are referenced only by the
# commented-out Argos Translate setup below.
CN = "cn"
EN = "en"

## Classes

## Main

# Download and install Argos Translate package
# argostranslate.package.update_package_index()
# available_packages = argostranslate.package.get_available_packages()
# package_to_install = next(
#     filter(
#         lambda x: x.from_code == CN and x.to_code == EN, available_packages
#     )
# )
# argostranslate.package.install_from_path(package_to_install.download())

def process_files():
    """Interactively pick an input word list and derive its output path.

    Lists every ``*.txt`` file found recursively under ``INPUT`` (sorted, so
    the menu numbering is stable between runs), asks the user to choose one
    by number, and creates the matching parent directory under ``OUTPUT``.

    Returns:
        A ``(in_file, out_file)`` tuple of paths, where ``out_file`` is
        ``OUTPUT`` joined with the path of ``in_file`` relative to ``INPUT``.

    Raises:
        FileNotFoundError: If no ``*.txt`` file exists under ``INPUT``.
    """
    files = sorted(INPUT.glob("**/*.txt"))
    if not files:
        # With an empty menu no answer can ever satisfy the prompt below,
        # so fail loudly instead of looping forever.
        raise FileNotFoundError(f"No .txt files found under {INPUT}")

    print("Select data file:")
    for n, file in enumerate(files, start=1):
        print(f"{n} - {file.relative_to(INPUT)}")

    s = None
    # isdecimal() only accepts characters that int() can parse; isnumeric()
    # also accepts e.g. "²" or "½", which would make int() raise ValueError.
    while not s or not s.isdecimal() or not (1 <= int(s) <= len(files)):
        s = input(f"Please select the file [1-{len(files)}]: ").strip()

    in_file = files[int(s) - 1]
    out_file = OUTPUT / in_file.relative_to(INPUT)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    return in_file, out_file

def _select_entry(word, candidates):
    """Return the entry to use for *word*, prompting when it is ambiguous.

    A single candidate is returned directly. Otherwise every candidate and
    its meanings are printed and the user is asked to choose one by number.
    """
    if len(candidates) == 1:
        return candidates[0]

    print(f"\nWARNING: {word} has multiple meanings:")
    for n, candidate in enumerate(candidates, start=1):
        print(f"{n} - {candidate}")
        for meaning in candidate.meanings:
            print(f"\t{meaning}")

    s = None
    # isdecimal() only accepts characters that int() can parse; isnumeric()
    # also accepts e.g. "²" or "½", which would make int() raise ValueError.
    while not s or not s.isdecimal() or not (1 <= int(s) <= len(candidates)):
        s = input(
            f"Please select the correct word [1-{len(candidates)}]: "
        ).strip()
    return candidates[int(s) - 1]


def dictionary_process(in_file, out_file):
    """Look up every word of *in_file* in the dictionary and write a TSV.

    The dictionary at ``CCCEDICT`` is parsed and indexed by simplified form.
    Each whitespace-separated word of *in_file* is looked up; words with
    several entries trigger an interactive choice, and words without any
    entry are reported on stdout and skipped.

    Args:
        in_file: Path of the UTF-8 word list to read.
        out_file: Path the result is derived from; its suffix is replaced
            with ``.tsv``.

    The output has one row per resolved word with the columns: numbered
    meanings, converted pinyin, simplified form, traditional form.
    """
    parser = CedictParser()
    parser.read_file(CCCEDICT)

    # Group entries sharing the same simplified form.
    dictionary = {}
    for entry in parser.parse():
        dictionary.setdefault(entry.simplified, []).append(entry)

    # Honour the caller-supplied destination instead of discarding it.
    out_file = Path(out_file).with_suffix(".tsv")
    out_file.parent.mkdir(parents=True, exist_ok=True)

    # read_text() closes the file handle, unlike a bare open().read().
    words_list = in_file.read_text(encoding="utf8").split()

    results = []
    for word in words_list:
        candidates = dictionary.get(word)
        if not candidates:
            print("============================================")
            print(f"===================>ERROR: {word} not found")
            print("============================================")
            continue
        results.append(_select_entry(word, candidates))

    with out_file.open("w", encoding="utf8", newline="") as csvfile:
        writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
        for entry in results:
            writer.writerow(
                [
                    "\n ".join(
                        f"{n}. {m}"
                        for n, m in enumerate(entry.meanings, start=1)
                    ),
                    PinyinToneConverter().convert_text(entry.pinyin),
                    entry.simplified,
                    entry.traditional,
                ]
            )

def main():
    """Ask the user for an input file, then build its dictionary output."""
    # process_files() yields the (input, output) pair in the order
    # dictionary_process() expects its positional arguments.
    dictionary_process(*process_files())

# Start the interactive workflow only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()