From 21c6416cfd19028f358f09b095bc0bf06c5a9bd1 Mon Sep 17 00:00:00 2001 From: Wolfang Torres Date: Thu, 11 Jun 2026 21:23:34 +0800 Subject: [PATCH] version 0.1 --- pyproject.toml | 52 +++-- src/anki-hsk-creator/__init__.py | 9 - src/anki-hsk-creator/__main__.py | 184 ------------------ src/anki-hsk-creator/anki-models.py | 88 --------- src/anki-hsk-creator/constants.py | 19 -- src/anki-hsk-creator/untility.py | 53 ----- .../__about__.py | 3 +- src/anki_hsk_creator/__init__.py | 20 ++ src/anki_hsk_creator/__main__.py | 52 +++++ src/anki_hsk_creator/anki_generation.py | 159 +++++++++++++++ src/anki_hsk_creator/api.py | 72 +++++++ .../cedict_ts.u8 | 0 src/anki_hsk_creator/constants.py | 51 +++++ src/anki_hsk_creator/proccessor.py | 96 +++++++++ src/anki_hsk_creator/utility.py | 154 +++++++++++++++ 15 files changed, 645 insertions(+), 367 deletions(-) delete mode 100644 src/anki-hsk-creator/__init__.py delete mode 100644 src/anki-hsk-creator/__main__.py delete mode 100644 src/anki-hsk-creator/anki-models.py delete mode 100644 src/anki-hsk-creator/constants.py delete mode 100644 src/anki-hsk-creator/untility.py rename src/{anki-hsk-creator => anki_hsk_creator}/__about__.py (77%) create mode 100644 src/anki_hsk_creator/__init__.py create mode 100644 src/anki_hsk_creator/__main__.py create mode 100644 src/anki_hsk_creator/anki_generation.py create mode 100644 src/anki_hsk_creator/api.py rename src/{anki-hsk-creator => anki_hsk_creator}/cedict_ts.u8 (100%) create mode 100644 src/anki_hsk_creator/constants.py create mode 100644 src/anki_hsk_creator/proccessor.py create mode 100644 src/anki_hsk_creator/utility.py diff --git a/pyproject.toml b/pyproject.toml index 15caf19..b0ce95d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "torch", "torchaudio", "torchcodec", + "python-dotenv", ] [project.optional-dependencies] @@ -41,7 +42,9 @@ dev = [ "pytest", "black", "pylint", - "flakehell" + "flake8", + "flake8-pyproject", + # "flakeheaven", ] 
[project.urls] @@ -50,17 +53,18 @@ Issues = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator/issues" Source = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator" [tool.hatch.version] -path = "src/anki-hsk-creator/__about__.py" +path = "src/anki_hsk_creator/__about__.py" [tool.hatch.build.targets.sdist] +packages = ["src/anki_hsk_creator"] include = [ - "src/anki-hsk-creator/cedict_ts.u8", + "src/anki_hsk_creator/cedict_ts.u8", ] [tool.hatch.build.targets.wheel] -packages = ["src/anki-hsk-creator"] +packages = ["src/anki_hsk_creator"] include = [ - "src/anki-hsk-creator/cedict_ts.u8", + "src/anki_hsk_creator/cedict_ts.u8", ] [tool.hatch.envs.default] @@ -69,7 +73,8 @@ extra-dependencies = [ ] [tool.hatch.envs.default.scripts] -format = "black --target-version=py314 anki-hsk-creator tests && isort anki-hsk-creator tests" +format = "black --target-version=py314 src tests && isort src tests" +lint = "flake8 src" [tool.hatch.envs.types] extra-dependencies = [ @@ -80,16 +85,16 @@ extra-dependencies = [ check = "mypy --install-types --non-interactive {args:src/anki-hsk-creator tests}" [tool.coverage.run] -source_pkgs = ["anki-hsk-creator", "tests"] +source_pkgs = ["src", "tests"] branch = true parallel = true omit = [ - "src/anki-hsk-creator/__about__.py", + "src/anki_hsk_creator/__about__.py", ] [tool.coverage.paths] -anki-hsk-creator = ["src/anki-hsk-creator", "*/anki-hsk-creator/src/anki-hsk-creator"] -tests = ["tests", "*src/anki-hsk-creator/tests"] +anki-hsk-creator = ["src/anki_hsk_creator", "*/anki-hsk-creator/src/anki_hsk_creator"] +tests = ["tests", "*src/anki_hsk_creator/tests"] [tool.coverage.report] exclude_lines = [ @@ -111,6 +116,7 @@ exclude = ''' ) ''' + [tool.isort] src_paths = ["src", "test"] skip_glob = [".git", "__pycache__", ".vscode", "*venv", "build", "dist", "old", "*.egg-info"] @@ -145,9 +151,8 @@ msg-template="{path}:{module}:{line}: [{msg_id}({symbol}), {obj}] {msg}" logging-format-style="new" logging-modules="logging" 
-[tool.flakehell] +[tool.flake8] max_line_length = 88 -format = "grouped" show_source = false exclude = [ ".git", @@ -160,8 +165,29 @@ exclude = [ "*.egg-info", ] -[tool.flakehell.plugins] +[tool.flake8.plugins] mccabe = ["+C*"] pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"] pyflakes = ["+F*"] flake8-bugbear = ["+*", "+B950"] + +# [tool.flakeheaven] +# max_line_length = 88 +# format = "grouped" +# show_source = false +# exclude = [ +# ".git", +# "__pycache__", +# ".vscode", +# "*venv", +# "build", +# "dist", +# "old", +# "*.egg-info", +# ] + +# [tool.flakeheaven.plugins] +# mccabe = ["+C*"] +# pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"] +# pyflakes = ["+F*"] +# flake8-bugbear = ["+*", "+B950"] diff --git a/src/anki-hsk-creator/__init__.py b/src/anki-hsk-creator/__init__.py deleted file mode 100644 index 9bac128..0000000 --- a/src/anki-hsk-creator/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -"""anki-hsk-creator""" - -import os - -# Globals - -os.environ["HF_TOKEN"] = "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" -ARGOS_UPDATED = False -ARGOS_PACKAGES = None \ No newline at end of file diff --git a/src/anki-hsk-creator/__main__.py b/src/anki-hsk-creator/__main__.py deleted file mode 100644 index a8a763d..0000000 --- a/src/anki-hsk-creator/__main__.py +++ /dev/null @@ -1,184 +0,0 @@ -## Imports -from pathlib import Path -import random -import csv - -## PIP -from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter - -## Main - - -def process_files(): - print("Select data file:") - in_file = None - level = INPUT - while not in_file: - files = [] - for n, file in enumerate(level.glob("*")): - files.append(file) - print(f"{n+1} - {file.relative_to(INPUT)}") - s = None - while not s or not s.isnumeric() or not (1 <= int(s) <= len(files)): - s = input(f"Please select the file [1-{len(files)}]: ") - selected = files[int(s) - 1] - if selected.is_file(): - in_file = selected - else: - level = selected - relative = in_file.relative_to(INPUT) - 
out_file = OUTPUT / relative - resources = RESOURCES / relative - resources = resources.parent / resources.stem - resources.mkdir(parents=True, exist_ok=True) - out_file.parent.mkdir(parents=True, exist_ok=True) - with in_file.open(encoding="utf8") as input_file: - file_type = input_file.read().split()[0] - return in_file, out_file, resources, file_type - - -def dictionary_process(dictionary, tts, in_file, resources): - """Process dictionary files""" - words_list = in_file.open(encoding="utf8").read().strip().split("\n") - results = [] - try: - with in_file.open("w", encoding="utf8") as input_file: - for words in words_list: - word = words.split()[0] - pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None - if v := dictionary.get(word): - if len(v) > 1: - print(f"\nWARNING: {word} has multiple meanings:") - if pinyin and pinyin != "ERROR": - ml = list(filter(lambda x: x.pinyin == pinyin, v)) - else: - ml = v - if len(ml) > 1: - for n, w in enumerate(ml): - print(f"{n+1} - {w}") - for m in w.meanings: - print(f"\t{m}") - s = None - while ( - not s - or not s.isnumeric() - or not (1 <= int(s) <= len(v)) - ): - s = input( - f"Please select the correct word [1-{len(v)}]: " - ) - v = v[int(s) - 1] - else: - v = ml[0] - else: - v = v[0] - audio_path = resources / f"{word}.wav" - if not audio_path.exists(): - audio = tts.generate(f"{word}。", language_id="zh") - torchaudio.save(audio_path, audio, tts.sr) - input_file.write(f"{word}\t{v.pinyin}\n") - results.append((v, audio_path)) - else: - print("============================================") - print(f"===================>ERROR: {word} not found") - print("============================================") - input_file.write(f"{word}\tERROR\n") - except Exception: - with in_file.open("w", encoding="utf8") as input_file: - input_file.write("\n".join(words_list)) - return results - - -def translator_process(tts, resources, in_file): - """Process for phases trasnlation""" - text_list = 
in_file.open(encoding="utf8").read().strip().split() - results = [] - for n, phrase in enumerate(text_list): - phrase = phrase.strip() - audio_path = resources / f"N{n}.wav" - if not audio_path.exists(): - audio = tts.generate(f"{phrase}。", language_id="zh") - torchaudio.save(audio_path, audio, tts.sr) - translated = argostranslate.translate.translate(phrase, CN, EN) - results.append([translated, phrase, audio_path]) - return results - - -# def output_tsv(out_file, results): -# """writes the output as a tsv file""" -# final_file = out_file.parent / f"{out_file.stem}.tsv" -# with final_file.open("w", encoding="utf8", newline="") as csvfile: -# writer = csv.writer(csvfile, delimiter="\t", quotechar='"') -# for entry in results: -# writer.writerow( -# [ -# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)), -# PinyinToneConverter().convert_text(entry.pinyin), -# entry.simplified, -# entry.traditional, -# ] -# ) - - -def output_anki_dictionary(out_file, results): - final_file = out_file.parent / f"{out_file.stem}.apkg" - deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,)) - deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name) - package = Package(deck) - audios = [] - for entry, audio in results: - note = Note( - model=HSK_MODEL, - fields=[ - "\n ".join(f"{n+1}. 
{m}" for n, m in enumerate(entry.meanings)), - PinyinToneConverter().convert_text(entry.pinyin), - entry.simplified, - entry.traditional, - f"[sound:{audio.name}]", - ], - ) - audios.append(audio) - deck.add_note(note) - package.media_files = audios - package.write_to_file(final_file) - - -def output_anki_phrase(out_file, results): - final_file = out_file.parent / f"{out_file.stem}.apkg" - deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,)) - deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name) - package = Package(deck) - audios = [] - for translated, phrase, audio in results: - note = Note( - model=PHRASE_MODEL, - fields=[ - translated, - phrase, - f"[sound:{audio.name}]", - ], - ) - deck.add_note(note) - audios.append(audio) - package.media_files = audios - package.write_to_file(final_file) - - -def main(): - tts = create_tts() - dictionary = create_cedict() - create_translator() - while True: - in_file, out_file, resources, file_type = process_files() - if PHRASES_TYPE in in_file.suffixes: - results = translator_process(tts, resources, in_file) - output_anki_phrase(out_file, results) - elif DICT_TYPE in in_file.suffixes: - results = dictionary_process(dictionary, tts, in_file, resources) - output_anki_dictionary(out_file, results) - else: - raise TypeError("Error, filetype not especified!") - - -if __name__ == "__main__": - main() diff --git a/src/anki-hsk-creator/anki-models.py b/src/anki-hsk-creator/anki-models.py deleted file mode 100644 index 555d00a..0000000 --- a/src/anki-hsk-creator/anki-models.py +++ /dev/null @@ -1,88 +0,0 @@ -# anki-models.py - -from genanki import Deck, Note, Model, Package - - -# Constants - -CSS = """ -.card { - font-family: arial; - font-size: 20px; - text-align: center; - color: black; - background-color: white; -} -.simple { -font-family: Arial; -font-size: 100px; -} -.trad { -font-family: Arial; -font-size: 75px; -} -""" - -# Models - -PHRASE_MODEL = Model( - 2076166425, - "Phrase Model", - 
fields=[ - {"name": "Translated"}, - {"name": "Phrase"}, - {"name": "Audio"}, - ], - templates=[ - { - "name": "Card 1", - "qfmt": "{{Translated}}
{{Audio}}", - "afmt": '{{FrontSide}}
{{Phrase}}', - }, - { - "name": "Card 2", - "qfmt": "{{Phrase}}
{{Audio}}", - "afmt": '{{FrontSide}}
{{Translated}}', - }, - { - "name": "Card 3", - "qfmt": "{{Audio}}", - "afmt": '{{FrontSide}}
{{Phrase}}', - }, - ], - css=CSS, -) - - -HSK_MODEL = Model( - 1708536519, - "HSK Model", - fields=[ - {"name": "English"}, - {"name": "Pinyin"}, - {"name": "Simplified"}, - {"name": "Traditional"}, - {"name": "Audio"}, - ], - templates=[ - { - "name": "Card 1", - "qfmt": "{{Pinyin}}
{{English}}
{{Audio}}", - "afmt": "{{FrontSide}}
" - "{{Simplified}}

{{Traditional}}
", - }, - { - "name": "Card 2", - "qfmt": "
{{Simplified}}

" - "{{Traditional}}
", - "afmt": '{{FrontSide}}
{{Pinyin}}
{{English}}
{{Audio}}', - }, - { - "name": "Card 3", - "qfmt": "{{Audio}}", - "afmt": "{{FrontSide}}
{{Pinyin}}
" - "{{Simplified}}

{{Traditional}}
", - }, - ], - css=CSS, -) \ No newline at end of file diff --git a/src/anki-hsk-creator/constants.py b/src/anki-hsk-creator/constants.py deleted file mode 100644 index 0bac8ad..0000000 --- a/src/anki-hsk-creator/constants.py +++ /dev/null @@ -1,19 +0,0 @@ -## Imports -from pathlib import Path -import random -import importlib.resources - -CCCEDICT = importlib.resources.files("anki-hsk-creator").joinpath("cedict_ts.u8") - -DATA = Path(__file__).parent.parent / "data" -INPUT = DATA / "input" -OUTPUT = DATA / "output" -RESOURCES = DATA / "resources" - -# File Types -PHRASES_TYPE = ".phrases" -DICT_TYPE = ".dictionary" - -# Language codes -CN = "zh" -EN = "en" \ No newline at end of file diff --git a/src/anki-hsk-creator/untility.py b/src/anki-hsk-creator/untility.py deleted file mode 100644 index 63ecdf6..0000000 --- a/src/anki-hsk-creator/untility.py +++ /dev/null @@ -1,53 +0,0 @@ -from cedict_utils.cedict import CedictParser -import argostranslate.package -import argostranslate.translate -from chatterbox.mtl_tts import ChatterboxMultilingualTTS - -import torch -import torchaudio - -from . import ARGOS_UPDATED, ARGOS_PACKAGES -from . 
import CCCEDICT - -## Functions - - -def create_cedict(language_id="en"): - """Creates a create_cedict dictionary object""" - - parser = CedictParser() - parser.read_file(CCCEDICT) - entries = parser.parse() - - dictionary = {} - for entry in entries: - if entry.simplified not in dictionary: - dictionary[entry.simplified] = [entry] - else: - dictionary[entry.simplified].append(entry) - - return dictionary - - -def create_translator(from_code, to_code): - """Download and install Argos Translate package""" - if not ARGOS_UPDATED: - argostranslate.package.update_package_index() - ARGOS_PACKAGES = argostranslate.package.get_available_packages() - ARGOS_UPDATED = True - package_to_install = next( - filter(lambda x: x.from_code == CN and x.to_code == EN, ARGOS_PACKAGES) - ) - argostranslate.package.install_from_path(package_to_install.download()) - - -def create_tts(): - # Automatically detect the best available device - if torch.cuda.is_available(): - device = "cuda" - elif torch.backends.mps.is_available(): - device = "mps" - else: - device = "cpu" - tts = ChatterboxMultilingualTTS.from_pretrained(device=device, t3_model="v3") - return tts \ No newline at end of file diff --git a/src/anki-hsk-creator/__about__.py b/src/anki_hsk_creator/__about__.py similarity index 77% rename from src/anki-hsk-creator/__about__.py rename to src/anki_hsk_creator/__about__.py index ab19860..33859a3 100644 --- a/src/anki-hsk-creator/__about__.py +++ b/src/anki_hsk_creator/__about__.py @@ -1,4 +1,5 @@ +"""about.py""" # SPDX-FileCopyrightText: 2026-present Wolfang Torres # # SPDX-License-Identifier: GPL-3.0-or-later -__version__ = "0.0.1" +__version__ = "0.1.0" diff --git a/src/anki_hsk_creator/__init__.py b/src/anki_hsk_creator/__init__.py new file mode 100644 index 0000000..3210c4b --- /dev/null +++ b/src/anki_hsk_creator/__init__.py @@ -0,0 +1,20 @@ +"""anki_hsk_creator""" + +# Standard Library +import os +from pathlib import Path + +# Pip +from dotenv import load_dotenv + +load_dotenv() 
+ +# Globals + +# Get an HF_TOKEN from huggingface for TTS generation +HF_TOKEN = os.environ.get("HF_TOKEN") + +# Path +default_path = Path.home() / "anki-hsk-creator-data" +DATA_FOLDER = Path(os.environ.get("DATA_FOLDER", default_path)) +DATA_FOLDER.mkdir(exist_ok=True, parents=True) diff --git a/src/anki_hsk_creator/__main__.py b/src/anki_hsk_creator/__main__.py new file mode 100644 index 0000000..49d03ee --- /dev/null +++ b/src/anki_hsk_creator/__main__.py @@ -0,0 +1,52 @@ +"""__main__.py""" + +# Standard Library +from pathlib import Path + +# Local +from .api import list_input_files, process_a_file, select_file +from .constants import LANGUAGES + + +def cli_select_files(): + """Loops until it finds a valid input_file""" + print("Select data file:") + in_file = None + level = Path() + while not in_file: + files = list_input_files(level) + for n, file in enumerate(files): + print(f"{n+1} - {file}") + s = None + while not s or not s.isnumeric() or not (1 <= int(s) <= len(files)): + s = input(f"Please select the file [1-{len(files)}]: ") + selected = files[int(s) - 1] + if selected.is_file(): + in_file = selected + else: + level = selected + input_file = select_file(in_file) + return input_file + + +def cli_select_language(): + """Selects a language for the trasnlatatio""" + print("Select a language:") + for language_id, language in LANGUAGES.language_names.items(): + print(f"{language_id} - {language}") + s = None + while not s or s not in LANGUAGES.available_languages: + s = input(f"Please select the language: ({ LANGUAGES.available_languages})") + return s + + +def main(): + """CLI interface for the module""" + while True: + input_file = cli_select_files() + language_id = cli_select_language() + process_a_file(input_file, language_id) + + +if __name__ == "__main__": + main() diff --git a/src/anki_hsk_creator/anki_generation.py b/src/anki_hsk_creator/anki_generation.py new file mode 100644 index 0000000..44926f8 --- /dev/null +++ 
b/src/anki_hsk_creator/anki_generation.py @@ -0,0 +1,159 @@ +"""anki_generation.py + +Produces anki output +""" + +# Standard Library +import random + +# Pip +from genanki import Deck, Model, Note, Package + +# Local +from .utility import ProcessFile, TranslationResult + +# from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter + + +# Constants + +CSS = """ +.card { + font-family: arial; + font-size: 20px; + text-align: center; + color: black; + background-color: white; +} +.simple { +font-family: Arial; +font-size: 100px; +} +.trad { +font-family: Arial; +font-size: 75px; +} +""" + +# Models + +PHRASE_MODEL = Model( + 2076166425, + "Phrase Model", + fields=[ + {"name": "Translated"}, + {"name": "Phrase"}, + {"name": "Audio"}, + ], + templates=[ + { + "name": "Card 1", + "qfmt": "{{Translated}}
{{Audio}}", + "afmt": '{{FrontSide}}
{{Phrase}}', + }, + { + "name": "Card 2", + "qfmt": "{{Phrase}}
{{Audio}}", + "afmt": '{{FrontSide}}
{{Translated}}', + }, + { + "name": "Card 3", + "qfmt": "{{Audio}}", + "afmt": '{{FrontSide}}
{{Phrase}}', + }, + ], + css=CSS, +) + + +HSK_MODEL = Model( + 1708536519, + "HSK Model", + fields=[ + {"name": "English"}, + {"name": "Pinyin"}, + {"name": "Simplified"}, + {"name": "Traditional"}, + {"name": "Audio"}, + ], + templates=[ + { + "name": "Card 1", + "qfmt": "{{Pinyin}}
{{English}}
{{Audio}}", + "afmt": ( + "{{FrontSide}}
{{Simplified}}
" + "
{{Traditional}}
" + ), + }, + { + "name": "Card 2", + "qfmt": "
{{Simplified}}

" + "{{Traditional}}
", + "afmt": ( + "{{FrontSide}}
{{Pinyin}}" + "
{{English}}
{{Audio}}" + ), + }, + { + "name": "Card 3", + "qfmt": "{{Audio}}", + "afmt": ( + "{{FrontSide}}
{{Pinyin}}" + "
{{Simplified}}
" + "
{{Traditional}}
" + ), + }, + ], + css=CSS, +) + +# Proccess + + +# def output_anki_dictionary(out_file, results): +# """Creates an anki file from a dictionary results""" +# final_file = out_file.parent / f"{out_file.stem}.apkg" +# deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,)) +# deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name) +# package = Package(deck) +# audios = [] +# for entry, audio in results: +# note = Note( +# model=HSK_MODEL, +# fields=[ +# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)), +# PinyinToneConverter().convert_text(entry.pinyin), +# entry.simplified, +# entry.traditional, +# f"[sound:{audio.name}]", +# ], +# ) +# audios.append(audio) +# deck.add_note(note) +# package.media_files = audios +# package.write_to_file(final_file) + + +def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResult]): + """Creates an anki file from a phrases results""" + + final_file = process_file.output_name.with_suffix(".apkg") + deck_name = "::".join( + process_file.input_file.parts[:-1] + (process_file.input_fil.stem,) + ) + deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name) + package = Package(deck) + audios = [] + for result in results: + note = Note( + model=PHRASE_MODEL, + fields=[ + result.translated, + result.line, + f"[sound:{result.audio_path.name}]", + ], + ) + deck.add_note(note) + audios.append(result.audio_path) + package.media_files = audios + package.write_to_file(final_file) diff --git a/src/anki_hsk_creator/api.py b/src/anki_hsk_creator/api.py new file mode 100644 index 0000000..cee2aa0 --- /dev/null +++ b/src/anki_hsk_creator/api.py @@ -0,0 +1,72 @@ +"""api.py + +Interface for managuing and procesing files +""" + +# Standard Library +from pathlib import Path + +# Local +from . 
import DATA_FOLDER +from .anki_generation import output_anki_phrase +from .constants import DICT_TYPE, INPUT, LANGUAGES, PHRASES_TYPE +from .proccessor import translator_process +from .utility import TRANS, TTS, ProcessFile + +# interface + + +def get_data_folder() -> Path: + """Utility function, return the data folder""" + return DATA_FOLDER + + +def list_input_files(search_path: Path = Path()) -> list[Path]: + """Return a list of files relative to the INPUT path""" + level = INPUT / search_path + return [path.relative_to(INPUT) for path in level.glob("*")] + + +def select_file(file_path: Path) -> ProcessFile: + """Given a relative path from `list_input_files`, return a ProcessFile""" + if (INPUT / file_path).is_file(): + return ProcessFile(file_path) + else: + raise ValueError(f"{file_path} is not a file") + + +def create_input_file( + name: str, file_type: str, text: str, sub_folder: Path = Path() +) -> ProcessFile: + """Creates an input file, with a name and a type form the available ones, + writes a text to it, if a sub_folder is given, + it is created and the file placed inside. 
+ returns the relative path for future processing + + valid file_types: ".phrases", ".dictionary" + """ + if file_type not in (PHRASES_TYPE, DICT_TYPE): + raise ValueError(f"file_type {file_type} not in {(PHRASES_TYPE, DICT_TYPE)}") + filename = f"{name}{file_type}.txt" + relative = sub_folder / filename + # write file + file_path = INPUT / relative + file_path.parent.mkdir(exist_ok=True, parents=True) + file_path.write_text(text, encoding="utf8") + # create process_file for future + process_file = ProcessFile(relative) + return process_file + + +def process_a_file(process_file: ProcessFile, language_id: str): + """From a input_file, a language and an output type, process a file""" + process_file.language_id = language_id + if PHRASES_TYPE in process_file.input_file.suffix: + TTS.create_tts() + TRANS.create_translator(LANGUAGES.CN, language_id) + with process_file.absolute_input_file.open("r") as file: + text_lines = [line.strip() for line in file.readlines()] + results = translator_process(text_lines, process_file, language_id) + output_anki_phrase(process_file, results) + elif DICT_TYPE in process_file.input_file.suffix: + print("not implemented") diff --git a/src/anki-hsk-creator/cedict_ts.u8 b/src/anki_hsk_creator/cedict_ts.u8 similarity index 100% rename from src/anki-hsk-creator/cedict_ts.u8 rename to src/anki_hsk_creator/cedict_ts.u8 diff --git a/src/anki_hsk_creator/constants.py b/src/anki_hsk_creator/constants.py new file mode 100644 index 0000000..16c8f65 --- /dev/null +++ b/src/anki_hsk_creator/constants.py @@ -0,0 +1,51 @@ +"""constants.py""" + +# Standard Library +import importlib.resources + +# Local +from . 
import DATA_FOLDER + +# Resources +CCCEDICT_PATH = importlib.resources.files("anki-hsk-creator").joinpath("cedict_ts.u8") + +# Data folder structure +INPUT = DATA_FOLDER / "input" +INPUT.mkdir(exist_ok=True, parents=True) +OUTPUT = DATA_FOLDER / "output" +OUTPUT.mkdir(exist_ok=True, parents=True) +RESOURCES = DATA_FOLDER / "resources" +RESOURCES.mkdir(exist_ok=True, parents=True) + +# File Types +PHRASES_TYPE = ".phrases" +DICT_TYPE = ".dictionary" + + +class LANGUAGES: + """Available laguages for translation""" + + CN = "zh" + EN = "en" + ES = "es" + FR = "fr" + RU = "ru" + TR = "tr" + TH = "th" + + @property + def available_languages(self) -> tuple: + """Available laguages for translation""" + return (self.EN, self.ES, self.FR, self.RU, self.TR, self.TH) + + @property + def language_names(self) -> dict: + """Gets the name of a language code""" + return { + self.EN: "English", + self.ES: "Spanish", + self.FR: "French", + self.RU: "Russian", + self.TR: "Turkish", + self.TH: "Thai", + } diff --git a/src/anki_hsk_creator/proccessor.py b/src/anki_hsk_creator/proccessor.py new file mode 100644 index 0000000..47b6e82 --- /dev/null +++ b/src/anki_hsk_creator/proccessor.py @@ -0,0 +1,96 @@ +"""processor.py""" + +# Pip +import argostranslate.translate +import torchaudio + +# Local +from .constants import LANGUAGES +from .utility import TTS, ProcessFile, TranslationResult # , CCCEDICT + +# Results Classes + + +def translator_process( + text_lines: list[str], + process_file: ProcessFile, + language_id: str, +) -> list[TranslationResult]: + """Process for phases or sentence translation""" + results = [] + for n, line in enumerate(text_lines): + line = line.strip() + audio_path = process_file.resources / f"N{n::03.0n}.wav" + if not audio_path.exists(): + audio = TTS.MODEL.generate(f"{line}。", language_id=LANGUAGES.CN) + torchaudio.save(audio_path, audio, TTS.MODEL.sr) + translated = argostranslate.translate.translate(line, LANGUAGES.CN, language_id) + 
results.append(TranslationResult(language_id, translated, line, audio_path)) + return results + + +# def dictionary_process(dictionary, tts, in_file, resources): +# """Process dictionary files""" +# words_list = in_file.open(encoding="utf8").read().strip().split("\n") +# results = [] +# try: +# with in_file.open("w", encoding="utf8") as input_file: +# for words in words_list: +# word = words.split()[0] +# pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None +# if v := dictionary.get(word): +# if len(v) > 1: +# print(f"\nWARNING: {word} has multiple meanings:") +# if pinyin and pinyin != "ERROR": +# ml = list(filter(lambda x: x.pinyin == pinyin, v)) +# else: +# ml = v +# if len(ml) > 1: +# for n, w in enumerate(ml): +# print(f"{n+1} - {w}") +# for m in w.meanings: +# print(f"\t{m}") +# s = None +# while ( +# not s +# or not s.isnumeric() +# or not (1 <= int(s) <= len(v)) +# ): +# s = input( +# f"Please select the correct word [1-{len(v)}]: " +# ) +# v = v[int(s) - 1] +# else: +# v = ml[0] +# else: +# v = v[0] +# audio_path = resources / f"{word}.wav" +# if not audio_path.exists(): +# audio = tts.generate(f"{word}。", language_id="zh") +# torchaudio.save(audio_path, audio, tts.sr) +# input_file.write(f"{word}\t{v.pinyin}\n") +# results.append((v, audio_path)) +# else: +# print("============================================") +# print(f"===================>ERROR: {word} not found") +# print("============================================") +# input_file.write(f"{word}\tERROR\n") +# except Exception: +# with in_file.open("w", encoding="utf8") as input_file: +# input_file.write("\n".join(words_list)) +# return results + +# def output_tsv(out_file, results): +# """writes the output as a tsv file""" +# final_file = out_file.parent / f"{out_file.stem}.tsv" +# with final_file.open("w", encoding="utf8", newline="") as csvfile: +# writer = csv.writer(csvfile, delimiter="\t", quotechar='"') +# for entry in results: +# writer.writerow( +# [ +# "\n 
".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)), +# PinyinToneConverter().convert_text(entry.pinyin), +# entry.simplified, +# entry.traditional, +# ] +# ) diff --git a/src/anki_hsk_creator/utility.py b/src/anki_hsk_creator/utility.py new file mode 100644 index 0000000..545251d --- /dev/null +++ b/src/anki_hsk_creator/utility.py @@ -0,0 +1,154 @@ +"""utility.py + + +Static clasess and functions for general use +""" + +# Standard Library +from pathlib import Path + +# Pip +import argostranslate.package +import argostranslate.translate +import torch +from cedict_utils.cedict import CedictParser +from chatterbox.mtl_tts import ChatterboxMultilingualTTS + +# Local +from .constants import CCCEDICT_PATH, INPUT, LANGUAGES, OUTPUT, RESOURCES + +# Static Clases + + +class TRANS: + """Static Class for Argos translate""" + + UPDATED = False + PACKAGES = None + + @staticmethod + def create_translator(from_code, to_code): + """Download and install Argos Translate package""" + if not TRANS.UPDATED: + argostranslate.package.update_package_index() + TRANS.PACKAGES = argostranslate.package.get_available_packages() + TRANS.UPDATED = True + package_to_install = next( + filter( + lambda x: x.from_code == from_code and x.to_code == to_code, + TRANS.PACKAGES, + ) + ) + argostranslate.package.install_from_path(package_to_install.download()) + + +class CCCEDICT: + """Static Class for the CCCEDIT dictionary""" + + PARSER = None + ENTRIES = [] + DICTIONARY_LIST = {} + + @staticmethod + def create_cedict(language_id=LANGUAGES.EN): + """Creates a create_cedict dictionary object""" + if not CCCEDICT.PARSER: + CCCEDICT.PARSER = CedictParser() + CCCEDICT.PARSER.read_file(CCCEDICT_PATH) + CCCEDICT.ENTRIES = CCCEDICT.PARSER.parse() + if language_id not in CCCEDICT.DICTIONARY_LIST: + dictionary = {} + for entry in CCCEDICT.ENTRIES: + if language_id != LANGUAGES.EN: + TRANS.create_translator(LANGUAGES.EN, language_id) + entry = argostranslate.translate.translate( + entry, LANGUAGES.EN, 
language_id + ) + if entry.simplified not in dictionary: + dictionary[entry.simplified] = [entry] + else: + dictionary[entry.simplified].append(entry) + CCCEDICT.DICTIONARY_LIST[language_id] = dictionary + else: + dictionary = CCCEDICT.DICTIONARY_LIST[language_id] + return dictionary + + +class TTS: + """Static class for the the TTS engine""" + + MODEL = None + DEVICE = None + + @staticmethod + def create_tts(): + """Creates a TTS engine""" + if TTS.DEVICE is None: + # Automatically detect the best available device + if torch.cuda.is_available(): + TTS.DEVICE = "cuda" + elif torch.backends.mps.is_available(): + TTS.DEVICE = "mps" + else: + TTS.DEVICE = "cpu" + if TTS.MODEL is None: + TTS.MODEL = ChatterboxMultilingualTTS.from_pretrained( + device=TTS.DEVICE, t3_model="v3" + ) + + +# Clases + + +class ProcessFile: + """Class that represents a file to processs + + diferent input files has direfent process_files depending on language + """ + + def __init__(self, input_file: Path, language_id: str = None): + self.input_file = input_file + self._language_id = language_id + # process file type + self.out_folder = OUTPUT / input_file.parent + self.out_folder.mkdir(parents=True, exist_ok=True) + resources = RESOURCES / input_file + self.resources = resources.parent / resources.stem + self.resources.mkdir(parents=True, exist_ok=True) + + @property + def absolute_input_file(self): + """Absolute input file""" + return INPUT / self.input_file + + @property + def language_id(self): + """language for this trasnlation process""" + return self._language_id + + @language_id.setter + def language_id(self, value): + self._language_id = value + + @property + def output_name(self): + """Posible name for the output file, still missing the filetype""" + if self.language_id is None: + raise ValueError("Not a valid language selected") + return self.input_file.parent / f"{self.input_file.stem}.{self.language_id})." 
class TranslationResult:
    """Result of translating one input line.

    A plain value holder; every constructor argument is stored unchanged
    as a public attribute of the same name.

    Attributes:
        language_id: Code of the language the line was translated into.
        translated: The translated text.
        line: The original (untranslated) line.
        audio_path: Path of the audio file associated with the line.
    """

    def __init__(
        self,
        language_id: str,
        translated: str,
        line: str,
        audio_path: Path,
    ):
        self.language_id = language_id
        self.translated = translated
        self.line = line
        self.audio_path = audio_path

    def __repr__(self) -> str:
        """Return a constructor-style representation for logs and debugging."""
        return (
            f"{type(self).__name__}("
            f"language_id={self.language_id!r}, "
            f"translated={self.translated!r}, "
            f"line={self.line!r}, "
            f"audio_path={self.audio_path!r})"
        )