version 0.1

2026-06-11 21:23:34 +08:00
parent ea057668bc
commit 21c6416cfd
15 changed files with 645 additions and 367 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ dependencies = [
  "torch",
  "torchaudio",
  "torchcodec",
+  "python-dotenv",
 ]

 [project.optional-dependencies]
@@ -41,7 +42,9 @@ dev = [
    "pytest",
    "black",
    "pylint",
-    "flakehell"
+    "flake8",
+    "flake8-pyproject",
+    # "flakeheaven",
 ]

 [project.urls]
@@ -50,17 +53,18 @@ Issues = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator/issues"
 Source = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator"

 [tool.hatch.version]
-path = "src/anki-hsk-creator/__about__.py"
+path = "src/anki_hsk_creator/__about__.py"

 [tool.hatch.build.targets.sdist]
+packages = ["src/anki_hsk_creator"]
 include = [
-    "src/anki-hsk-creator/cedict_ts.u8",
+    "src/anki_hsk_creator/cedict_ts.u8",
 ]

 [tool.hatch.build.targets.wheel]
-packages = ["src/anki-hsk-creator"]
+packages = ["src/anki_hsk_creator"]
 include = [
-    "src/anki-hsk-creator/cedict_ts.u8",
+    "src/anki_hsk_creator/cedict_ts.u8",
 ]

 [tool.hatch.envs.default]
@@ -69,7 +73,8 @@ extra-dependencies = [
 ]

 [tool.hatch.envs.default.scripts]
-format = "black --target-version=py314 anki-hsk-creator tests && isort anki-hsk-creator tests"
+format = "black --target-version=py314 src tests && isort src tests"
+lint = "flake8 src" 

 [tool.hatch.envs.types]
 extra-dependencies = [
@@ -80,16 +85,16 @@ extra-dependencies = [
 check = "mypy --install-types --non-interactive {args:src/anki-hsk-creator tests}"

 [tool.coverage.run]
-source_pkgs = ["anki-hsk-creator", "tests"]
+source_pkgs = ["anki_hsk_creator", "tests"]
 branch = true
 parallel = true
 omit = [
-  "src/anki-hsk-creator/__about__.py",
+  "src/anki_hsk_creator/__about__.py",
 ]

 [tool.coverage.paths]
-anki-hsk-creator = ["src/anki-hsk-creator", "*/anki-hsk-creator/src/anki-hsk-creator"]
-tests = ["tests", "*src/anki-hsk-creator/tests"]
+anki-hsk-creator = ["src/anki_hsk_creator", "*/anki-hsk-creator/src/anki_hsk_creator"]
+tests = ["tests", "*src/anki_hsk_creator/tests"]

 [tool.coverage.report]
 exclude_lines = [
@@ -111,6 +116,7 @@ exclude = '''
 )
 '''

+
 [tool.isort]
 src_paths = ["src", "test"]
 skip_glob = [".git", "__pycache__", ".vscode", "*venv", "build", "dist", "old", "*.egg-info"]
@@ -145,9 +151,8 @@ msg-template="{path}:{module}:{line}: [{msg_id}({symbol}), {obj}] {msg}"
 logging-format-style="new"
 logging-modules="logging"

-[tool.flakehell]
+[tool.flake8]
 max_line_length = 88
-format = "grouped"
 show_source = false
 exclude = [
    ".git",
@@ -160,8 +165,29 @@ exclude = [
    "*.egg-info",
 ]

-[tool.flakehell.plugins]
+[tool.flake8.plugins]
 mccabe = ["+C*"]
 pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"]
 pyflakes = ["+F*"]
 flake8-bugbear = ["+*", "+B950"]
+
+# [tool.flakeheaven]
+# max_line_length = 88
+# format = "grouped"
+# show_source = false
+# exclude = [
+#     ".git",
+#     "__pycache__",
+#     ".vscode",
+#     "*venv",
+#     "build",
+#     "dist",
+#     "old",
+#     "*.egg-info",
+# ]
+
+# [tool.flakeheaven.plugins]
+# mccabe = ["+C*"]
+# pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"]
+# pyflakes = ["+F*"]
+# flake8-bugbear = ["+*", "+B950"]
--- a/src/anki-hsk-creator/init.py
+++ b/src/anki-hsk-creator/init.py
@@ -1,9 +0,0 @@
-"""anki-hsk-creator"""
-
-import os
-
-# Globals
-
-os.environ["HF_TOKEN"] = "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"
-ARGOS_UPDATED = False
-ARGOS_PACKAGES = None
--- a/src/anki-hsk-creator/main.py
+++ b/src/anki-hsk-creator/main.py
@@ -1,184 +0,0 @@
-## Imports
-from pathlib import Path
-import random
-import csv
-
-## PIP
-from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
-
-## Main
-
-
-def process_files():
-    print("Select data file:")
-    in_file = None
-    level = INPUT
-    while not in_file:
-        files = []
-        for n, file in enumerate(level.glob("*")):
-            files.append(file)
-            print(f"{n+1} - {file.relative_to(INPUT)}")
-        s = None
-        while not s or not s.isnumeric() or not (1 <= int(s) <= len(files)):
-            s = input(f"Please select the file [1-{len(files)}]: ")
-        selected = files[int(s) - 1]
-        if selected.is_file():
-            in_file = selected
-        else:
-            level = selected
-    relative = in_file.relative_to(INPUT)
-    out_file = OUTPUT / relative
-    resources = RESOURCES / relative
-    resources = resources.parent / resources.stem
-    resources.mkdir(parents=True, exist_ok=True)
-    out_file.parent.mkdir(parents=True, exist_ok=True)
-    with in_file.open(encoding="utf8") as input_file:
-        file_type = input_file.read().split()[0]
-    return in_file, out_file, resources, file_type
-
-
-def dictionary_process(dictionary, tts, in_file, resources):
-    """Process dictionary files"""
-    words_list = in_file.open(encoding="utf8").read().strip().split("\n")
-    results = []
-    try:
-        with in_file.open("w", encoding="utf8") as input_file:
-            for words in words_list:
-                word = words.split()[0]
-                pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None
-                if v := dictionary.get(word):
-                    if len(v) > 1:
-                        print(f"\nWARNING: {word} has multiple meanings:")
-                        if pinyin and pinyin != "ERROR":
-                            ml = list(filter(lambda x: x.pinyin == pinyin, v))
-                        else:
-                            ml = v
-                        if len(ml) > 1:
-                            for n, w in enumerate(ml):
-                                print(f"{n+1} - {w}")
-                                for m in w.meanings:
-                                    print(f"\t{m}")
-                            s = None
-                            while (
-                                not s
-                                or not s.isnumeric()
-                                or not (1 <= int(s) <= len(v))
-                            ):
-                                s = input(
-                                    f"Please select the correct word [1-{len(v)}]: "
-                                )
-                            v = v[int(s) - 1]
-                        else:
-                            v = ml[0]
-                    else:
-                        v = v[0]
-                    audio_path = resources / f"{word}.wav"
-                    if not audio_path.exists():
-                        audio = tts.generate(f"{word}。", language_id="zh")
-                        torchaudio.save(audio_path, audio, tts.sr)
-                    input_file.write(f"{word}\t{v.pinyin}\n")
-                    results.append((v, audio_path))
-                else:
-                    print("============================================")
-                    print(f"===================>ERROR: {word} not found")
-                    print("============================================")
-                    input_file.write(f"{word}\tERROR\n")
-    except Exception:
-        with in_file.open("w", encoding="utf8") as input_file:
-            input_file.write("\n".join(words_list))
-    return results
-
-
-def translator_process(tts, resources, in_file):
-    """Process for phases trasnlation"""
-    text_list = in_file.open(encoding="utf8").read().strip().split()
-    results = []
-    for n, phrase in enumerate(text_list):
-        phrase = phrase.strip()
-        audio_path = resources / f"N{n}.wav"
-        if not audio_path.exists():
-            audio = tts.generate(f"{phrase}。", language_id="zh")
-            torchaudio.save(audio_path, audio, tts.sr)
-        translated = argostranslate.translate.translate(phrase, CN, EN)
-        results.append([translated, phrase, audio_path])
-    return results
-
-
-# def output_tsv(out_file, results):
-#     """writes the output as a tsv file"""
-#     final_file = out_file.parent / f"{out_file.stem}.tsv"
-#     with final_file.open("w", encoding="utf8", newline="") as csvfile:
-#         writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
-#         for entry in results:
-#             writer.writerow(
-#                 [
-#                     "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
-#                     PinyinToneConverter().convert_text(entry.pinyin),
-#                     entry.simplified,
-#                     entry.traditional,
-#                 ]
-#             )
-
-
-def output_anki_dictionary(out_file, results):
-    final_file = out_file.parent / f"{out_file.stem}.apkg"
-    deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
-    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
-    package = Package(deck)
-    audios = []
-    for entry, audio in results:
-        note = Note(
-            model=HSK_MODEL,
-            fields=[
-                "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
-                PinyinToneConverter().convert_text(entry.pinyin),
-                entry.simplified,
-                entry.traditional,
-                f"[sound:{audio.name}]",
-            ],
-        )
-        audios.append(audio)
-        deck.add_note(note)
-    package.media_files = audios
-    package.write_to_file(final_file)
-
-
-def output_anki_phrase(out_file, results):
-    final_file = out_file.parent / f"{out_file.stem}.apkg"
-    deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
-    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
-    package = Package(deck)
-    audios = []
-    for translated, phrase, audio in results:
-        note = Note(
-            model=PHRASE_MODEL,
-            fields=[
-                translated,
-                phrase,
-                f"[sound:{audio.name}]",
-            ],
-        )
-        deck.add_note(note)
-        audios.append(audio)
-    package.media_files = audios
-    package.write_to_file(final_file)
-
-
-def main():
-    tts = create_tts()
-    dictionary = create_cedict()
-    create_translator()
-    while True:
-        in_file, out_file, resources, file_type = process_files()
-        if PHRASES_TYPE in in_file.suffixes:
-            results = translator_process(tts, resources, in_file)
-            output_anki_phrase(out_file, results)
-        elif DICT_TYPE in in_file.suffixes:
-            results = dictionary_process(dictionary, tts, in_file, resources)
-            output_anki_dictionary(out_file, results)
-        else:
-            raise TypeError("Error, filetype not especified!")
-
-
-if __name__ == "__main__":
-    main()
--- a/src/anki-hsk-creator/anki-models.py
+++ b/src/anki-hsk-creator/anki-models.py
@@ -1,88 +0,0 @@
-# anki-models.py
-
-from genanki import Deck, Note, Model, Package
-
-
-# Constants
-
-CSS = """
-.card {
- font-family: arial;
- font-size: 20px;
- text-align: center;
- color: black;
- background-color: white;
-}
-.simple {
-font-family: Arial;
-font-size: 100px;
-}
-.trad {
-font-family: Arial;
-font-size: 75px;
-}
-"""
-
-# Models
-
-PHRASE_MODEL = Model(
-    2076166425,
-    "Phrase Model",
-    fields=[
-        {"name": "Translated"},
-        {"name": "Phrase"},
-        {"name": "Audio"},
-    ],
-    templates=[
-        {
-            "name": "Card 1",
-            "qfmt": "{{Translated}}<br>{{Audio}}",
-            "afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
-        },
-        {
-            "name": "Card 2",
-            "qfmt": "{{Phrase}}<br>{{Audio}}",
-            "afmt": '{{FrontSide}}<hr id="answer">{{Translated}}',
-        },
-        {
-            "name": "Card 3",
-            "qfmt": "{{Audio}}",
-            "afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
-        },
-    ],
-    css=CSS,
-)
-
-
-HSK_MODEL = Model(
-    1708536519,
-    "HSK Model",
-    fields=[
-        {"name": "English"},
-        {"name": "Pinyin"},
-        {"name": "Simplified"},
-        {"name": "Traditional"},
-        {"name": "Audio"},
-    ],
-    templates=[
-        {
-            "name": "Card 1",
-            "qfmt": "<strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}",
-            "afmt": "{{FrontSide}}<hr id='answer''><div class='simple'>"
-            "{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
-        },
-        {
-            "name": "Card 2",
-            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
-            "{{Traditional}}</div>",
-            "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}',
-        },
-        {
-            "name": "Card 3",
-            "qfmt": "{{Audio}}",
-            "afmt": "{{FrontSide}}<hr id='answer''><strong>{{Pinyin}}</strong><br><div class='simple'>"
-            "{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
-        },
-    ],
-    css=CSS,
-)
--- a/src/anki-hsk-creator/constants.py
+++ b/src/anki-hsk-creator/constants.py
@@ -1,19 +0,0 @@
-## Imports
-from pathlib import Path
-import random
-import importlib.resources
-
-CCCEDICT = importlib.resources.files("anki-hsk-creator").joinpath("cedict_ts.u8")
-
-DATA = Path(__file__).parent.parent / "data"
-INPUT = DATA / "input"
-OUTPUT = DATA / "output"
-RESOURCES = DATA / "resources"
-
-# File Types
-PHRASES_TYPE = ".phrases"
-DICT_TYPE = ".dictionary"
-
-# Language codes
-CN = "zh"
-EN = "en"
--- a/src/anki-hsk-creator/untility.py
+++ b/src/anki-hsk-creator/untility.py
@@ -1,53 +0,0 @@
-from cedict_utils.cedict import CedictParser
-import argostranslate.package
-import argostranslate.translate
-from chatterbox.mtl_tts import ChatterboxMultilingualTTS
-
-import torch
-import torchaudio
-
-from . import ARGOS_UPDATED, ARGOS_PACKAGES
-from . import CCCEDICT
-
-## Functions
-
-
-def create_cedict(language_id="en"):
-    """Creates a create_cedict dictionary object"""
-
-    parser = CedictParser()
-    parser.read_file(CCCEDICT)
-    entries = parser.parse()
-
-    dictionary = {}
-    for entry in entries:
-        if entry.simplified not in dictionary:
-            dictionary[entry.simplified] = [entry]
-        else:
-            dictionary[entry.simplified].append(entry)
-
-    return dictionary
-
-
-def create_translator(from_code, to_code):
-    """Download and install Argos Translate package"""
-    if not ARGOS_UPDATED:
-        argostranslate.package.update_package_index()
-        ARGOS_PACKAGES = argostranslate.package.get_available_packages()
-        ARGOS_UPDATED = True
-    package_to_install = next(
-        filter(lambda x: x.from_code == CN and x.to_code == EN, ARGOS_PACKAGES)
-    )
-    argostranslate.package.install_from_path(package_to_install.download())
-
-
-def create_tts():
-    # Automatically detect the best available device
-    if torch.cuda.is_available():
-        device = "cuda"
-    elif torch.backends.mps.is_available():
-        device = "mps"
-    else:
-        device = "cpu"
-    tts = ChatterboxMultilingualTTS.from_pretrained(device=device, t3_model="v3")
-    return tts
--- a/src/anki_hsk_creator/about.py
+++ b/src/anki_hsk_creator/about.py
@@ -1,4 +1,5 @@
+"""about.py"""
 # SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com>
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
-__version__ = "0.0.1"
+__version__ = "0.1.0"
--- a/src/anki_hsk_creator/init.py
+++ b/src/anki_hsk_creator/init.py
@@ -0,0 +1,20 @@
+"""anki_hsk_creator"""
+
+# Standard Library
+import os
+from pathlib import Path
+
+# Pip
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Globals
+
+# Get an HF_TOKEN from huggingface for TTS generation
+HF_TOKEN = os.environ.get("HF_TOKEN")
+
+# Path
+default_path = Path.home() / "anki-hsk-creator-data"
+DATA_FOLDER = Path(os.environ.get("DATA_FOLDER", default_path))
+DATA_FOLDER.mkdir(exist_ok=True, parents=True)
--- a/src/anki_hsk_creator/main.py
+++ b/src/anki_hsk_creator/main.py
@@ -0,0 +1,52 @@
+"""__main__.py"""
+
+# Standard Library
+from pathlib import Path
+
+# Local
+from .api import list_input_files, process_a_file, select_file
+from .constants import LANGUAGES
+
+
+def cli_select_files():
+    """Loops until it finds a valid input_file"""
+    print("Select data file:")
+    in_file = None
+    level = Path()
+    while not in_file:
+        files = list_input_files(level)
+        for n, file in enumerate(files):
+            print(f"{n+1} - {file}")
+        s = None
+        while not s or not s.isnumeric() or not (1 <= int(s) <= len(files)):
+            s = input(f"Please select the file [1-{len(files)}]: ")
+        selected = files[int(s) - 1]
+        if selected.is_file():
+            in_file = selected
+        else:
+            level = selected
+    input_file = select_file(in_file)
+    return input_file
+
+
+def cli_select_language():
+    """Selects a language for the translation"""
+    print("Select a language:")
+    for language_id, language in LANGUAGES.language_names.items():
+        print(f"{language_id} - {language}")
+    s = None
+    while not s or s not in LANGUAGES.available_languages:
+        s = input(f"Please select the language: ({ LANGUAGES.available_languages})")
+    return s
+
+
+def main():
+    """CLI interface for the module"""
+    while True:
+        input_file = cli_select_files()
+        language_id = cli_select_language()
+        process_a_file(input_file, language_id)
+
+
+if __name__ == "__main__":
+    main()
--- a/src/anki_hsk_creator/anki_generation.py
+++ b/src/anki_hsk_creator/anki_generation.py
@@ -0,0 +1,159 @@
+"""anki_generation.py
+
+Produces anki output
+"""
+
+# Standard Library
+import random
+
+# Pip
+from genanki import Deck, Model, Note, Package
+
+# Local
+from .utility import ProcessFile, TranslationResult
+
+# from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
+
+
+# Constants
+
+CSS = """
+.card {
+ font-family: arial;
+ font-size: 20px;
+ text-align: center;
+ color: black;
+ background-color: white;
+}
+.simple {
+font-family: Arial;
+font-size: 100px;
+}
+.trad {
+font-family: Arial;
+font-size: 75px;
+}
+"""
+
+# Models
+
+PHRASE_MODEL = Model(
+    2076166425,
+    "Phrase Model",
+    fields=[
+        {"name": "Translated"},
+        {"name": "Phrase"},
+        {"name": "Audio"},
+    ],
+    templates=[
+        {
+            "name": "Card 1",
+            "qfmt": "{{Translated}}<br>{{Audio}}",
+            "afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
+        },
+        {
+            "name": "Card 2",
+            "qfmt": "{{Phrase}}<br>{{Audio}}",
+            "afmt": '{{FrontSide}}<hr id="answer">{{Translated}}',
+        },
+        {
+            "name": "Card 3",
+            "qfmt": "{{Audio}}",
+            "afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
+        },
+    ],
+    css=CSS,
+)
+
+
+HSK_MODEL = Model(
+    1708536519,
+    "HSK Model",
+    fields=[
+        {"name": "English"},
+        {"name": "Pinyin"},
+        {"name": "Simplified"},
+        {"name": "Traditional"},
+        {"name": "Audio"},
+    ],
+    templates=[
+        {
+            "name": "Card 1",
+            "qfmt": "<strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}",
+            "afmt": (
+                "{{FrontSide}}<hr id='answer'><div class='simple'>{{Simplified}}</div>"
+                "<br><div class='trad'>{{Traditional}}</div>"
+            ),
+        },
+        {
+            "name": "Card 2",
+            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
+            "{{Traditional}}</div>",
+            "afmt": (
+                "{{FrontSide}}<hr id='answer'><strong>{{Pinyin}}</strong>"
+                "<br>{{English}}<br>{{Audio}}"
+            ),
+        },
+        {
+            "name": "Card 3",
+            "qfmt": "{{Audio}}",
+            "afmt": (
+                "{{FrontSide}}<hr id='answer'><strong>{{Pinyin}}</strong>"
+                "<br><div class='simple'>{{Simplified}}</div>"
+                "<br><div class='trad'>{{Traditional}}</div>"
+            ),
+        },
+    ],
+    css=CSS,
+)
+
+# Proccess
+
+
+# def output_anki_dictionary(out_file, results):
+#     """Creates an anki file from a dictionary results"""
+#     final_file = out_file.parent / f"{out_file.stem}.apkg"
+#     deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
+#     deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
+#     package = Package(deck)
+#     audios = []
+#     for entry, audio in results:
+#         note = Note(
+#             model=HSK_MODEL,
+#             fields=[
+#                 "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
+#                 PinyinToneConverter().convert_text(entry.pinyin),
+#                 entry.simplified,
+#                 entry.traditional,
+#                 f"[sound:{audio.name}]",
+#             ],
+#         )
+#         audios.append(audio)
+#         deck.add_note(note)
+#     package.media_files = audios
+#     package.write_to_file(final_file)
+
+
+def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResult]):
+    """Write one PHRASE_MODEL note per result into an .apkg inside out_folder"""
+    # output_name is a relative path: anchor its file name in the output folder
+    final_name = process_file.output_name.with_suffix(".apkg").name
+    final_file = process_file.out_folder / final_name
+    source = process_file.input_file
+    deck_name = "::".join(source.parts[:-1] + (source.stem,))
+    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
+    package = Package(deck)
+    audios = []
+    for result in results:
+        note = Note(
+            model=PHRASE_MODEL,
+            fields=[
+                result.translated,
+                result.line,
+                f"[sound:{result.audio_path.name}]",
+            ],
+        )
+        deck.add_note(note)
+        audios.append(result.audio_path)
+    package.media_files = audios
+    package.write_to_file(final_file)
--- a/src/anki_hsk_creator/api.py
+++ b/src/anki_hsk_creator/api.py
@@ -0,0 +1,72 @@
+"""api.py
+
+Interface for managing and processing files
+"""
+
+# Standard Library
+from pathlib import Path
+
+# Local
+from . import DATA_FOLDER
+from .anki_generation import output_anki_phrase
+from .constants import DICT_TYPE, INPUT, LANGUAGES, PHRASES_TYPE
+from .proccessor import translator_process
+from .utility import TRANS, TTS, ProcessFile
+
+# interface
+
+
+def get_data_folder() -> Path:
+    """Utility function, return the data folder"""
+    return DATA_FOLDER
+
+
+def list_input_files(search_path: Path = Path()) -> list[Path]:
+    """Return a list of files relative to the INPUT path"""
+    level = INPUT / search_path
+    return [path.relative_to(INPUT) for path in level.glob("*")]
+
+
+def select_file(file_path: Path) -> ProcessFile:
+    """Given a relative path from `list_input_files`, return a ProcessFile"""
+    if (INPUT / file_path).is_file():
+        return ProcessFile(file_path)
+    else:
+        raise ValueError(f"{file_path} is not a file")
+
+
+def create_input_file(
+    name: str, file_type: str, text: str, sub_folder: Path = Path()
+) -> ProcessFile:
+    """Creates an input file, with a name and a type from the available ones,
+    writes a text to it, if a sub_folder is given,
+    it is created and the file placed inside.
+    returns the relative path for future processing
+
+    valid file_types: ".phrases", ".dictionary"
+    """
+    if file_type not in (PHRASES_TYPE, DICT_TYPE):
+        raise ValueError(f"file_type {file_type} not in {(PHRASES_TYPE, DICT_TYPE)}")
+    filename = f"{name}{file_type}.txt"
+    relative = sub_folder / filename
+    # write file
+    file_path = INPUT / relative
+    file_path.parent.mkdir(exist_ok=True, parents=True)
+    file_path.write_text(text, encoding="utf8")
+    # create process_file for future
+    process_file = ProcessFile(relative)
+    return process_file
+
+
+def process_a_file(process_file: ProcessFile, language_id: str):
+    """Process a file by type; the type is any suffix, e.g. name.phrases.txt"""
+    process_file.language_id = language_id
+    if PHRASES_TYPE in process_file.input_file.suffixes:
+        TTS.create_tts()
+        TRANS.create_translator(LANGUAGES.CN, language_id)
+        with process_file.absolute_input_file.open("r", encoding="utf8") as file:
+            text_lines = [line.strip() for line in file.readlines()]
+        results = translator_process(text_lines, process_file, language_id)
+        output_anki_phrase(process_file, results)
+    elif DICT_TYPE in process_file.input_file.suffixes:
+        print("not implemented")
--- a/src/anki_hsk_creator/cedict_ts.u8
+++ b/src/anki_hsk_creator/cedict_ts.u8
--- a/src/anki_hsk_creator/constants.py
+++ b/src/anki_hsk_creator/constants.py
@@ -0,0 +1,51 @@
+"""constants.py"""
+
+# Standard Library
+import importlib.resources
+
+# Local
+from . import DATA_FOLDER
+
+# Resources
+CCCEDICT_PATH = importlib.resources.files("anki_hsk_creator").joinpath("cedict_ts.u8")
+
+# Data folder structure
+INPUT = DATA_FOLDER / "input"
+INPUT.mkdir(exist_ok=True, parents=True)
+OUTPUT = DATA_FOLDER / "output"
+OUTPUT.mkdir(exist_ok=True, parents=True)
+RESOURCES = DATA_FOLDER / "resources"
+RESOURCES.mkdir(exist_ok=True, parents=True)
+
+# File Types
+PHRASES_TYPE = ".phrases"
+DICT_TYPE = ".dictionary"
+
+
+class LANGUAGES:
+    """Available languages for translation"""
+
+    CN = "zh"
+    EN = "en"
+    ES = "es"
+    FR = "fr"
+    RU = "ru"
+    TR = "tr"
+    TH = "th"
+
+    # Plain class attributes rather than properties: callers read them on the
+    # class itself, where a property would only yield the descriptor object.
+
+    # Codes selectable as translation target
+    # (CN is not listed: it is the language being translated from).
+    available_languages = (EN, ES, FR, RU, TR, TH)
+
+    # Display name for each selectable language code.
+    language_names = {
+        EN: "English",
+        ES: "Spanish",
+        FR: "French",
+        RU: "Russian",
+        TR: "Turkish",
+        TH: "Thai",
+    }
--- a/src/anki_hsk_creator/proccessor.py
+++ b/src/anki_hsk_creator/proccessor.py
@@ -0,0 +1,96 @@
+"""processor.py"""
+
+# Pip
+import argostranslate.translate
+import torchaudio
+
+# Local
+from .constants import LANGUAGES
+from .utility import TTS, ProcessFile, TranslationResult  # , CCCEDICT
+
+# Results Classes
+
+
+def translator_process(
+    text_lines: list[str],
+    process_file: ProcessFile,
+    language_id: str,
+) -> list[TranslationResult]:
+    """Generate audio (reused if already on disk) and a translation per line"""
+    results = []
+    for n, line in enumerate(text_lines):
+        line = line.strip()
+        audio_path = process_file.resources / f"N{n:03d}.wav"  # N000.wav, N001.wav
+        if not audio_path.exists():
+            audio = TTS.MODEL.generate(f"{line}。", language_id=LANGUAGES.CN)
+            torchaudio.save(audio_path, audio, TTS.MODEL.sr)
+        translated = argostranslate.translate.translate(line, LANGUAGES.CN, language_id)
+        results.append(TranslationResult(language_id, translated, line, audio_path))
+    return results
+
+
+# def dictionary_process(dictionary, tts, in_file, resources):
+#     """Process dictionary files"""
+#     words_list = in_file.open(encoding="utf8").read().strip().split("\n")
+#     results = []
+#     try:
+#         with in_file.open("w", encoding="utf8") as input_file:
+#             for words in words_list:
+#                 word = words.split()[0]
+#                 pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None
+#                 if v := dictionary.get(word):
+#                     if len(v) > 1:
+#                         print(f"\nWARNING: {word} has multiple meanings:")
+#                         if pinyin and pinyin != "ERROR":
+#                             ml = list(filter(lambda x: x.pinyin == pinyin, v))
+#                         else:
+#                             ml = v
+#                         if len(ml) > 1:
+#                             for n, w in enumerate(ml):
+#                                 print(f"{n+1} - {w}")
+#                                 for m in w.meanings:
+#                                     print(f"\t{m}")
+#                             s = None
+#                             while (
+#                                 not s
+#                                 or not s.isnumeric()
+#                                 or not (1 <= int(s) <= len(v))
+#                             ):
+#                                 s = input(
+#                                     f"Please select the correct word [1-{len(v)}]: "
+#                                 )
+#                             v = v[int(s) - 1]
+#                         else:
+#                             v = ml[0]
+#                     else:
+#                         v = v[0]
+#                     audio_path = resources / f"{word}.wav"
+#                     if not audio_path.exists():
+#                         audio = tts.generate(f"{word}。", language_id="zh")
+#                         torchaudio.save(audio_path, audio, tts.sr)
+#                     input_file.write(f"{word}\t{v.pinyin}\n")
+#                     results.append((v, audio_path))
+#                 else:
+#                     print("============================================")
+#                     print(f"===================>ERROR: {word} not found")
+#                     print("============================================")
+#                     input_file.write(f"{word}\tERROR\n")
+#     except Exception:
+#         with in_file.open("w", encoding="utf8") as input_file:
+#             input_file.write("\n".join(words_list))
+#     return results
+
+# def output_tsv(out_file, results):
+#     """writes the output as a tsv file"""
+#     final_file = out_file.parent / f"{out_file.stem}.tsv"
+#     with final_file.open("w", encoding="utf8", newline="") as csvfile:
+#         writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
+#         for entry in results:
+#             writer.writerow(
+#                 [
+#                     "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
+#                     PinyinToneConverter().convert_text(entry.pinyin),
+#                     entry.simplified,
+#                     entry.traditional,
+#                 ]
+#             )
--- a/src/anki_hsk_creator/utility.py
+++ b/src/anki_hsk_creator/utility.py
@@ -0,0 +1,154 @@
+"""utility.py
+
+
+Static classes and functions for general use
+"""
+
+# Standard Library
+from pathlib import Path
+
+# Pip
+import argostranslate.package
+import argostranslate.translate
+import torch
+from cedict_utils.cedict import CedictParser
+from chatterbox.mtl_tts import ChatterboxMultilingualTTS
+
+# Local
+from .constants import CCCEDICT_PATH, INPUT, LANGUAGES, OUTPUT, RESOURCES
+
+# Static Clases
+
+
+class TRANS:
+    """Static Class for Argos translate"""
+
+    UPDATED = False
+    PACKAGES = None
+
+    @staticmethod
+    def create_translator(from_code, to_code):
+        """Download and install the Argos package for from_code -> to_code"""
+        if not TRANS.UPDATED:
+            argostranslate.package.update_package_index()
+            TRANS.PACKAGES = argostranslate.package.get_available_packages()
+            TRANS.UPDATED = True
+        for package in TRANS.PACKAGES:
+            if package.from_code == from_code and package.to_code == to_code:
+                argostranslate.package.install_from_path(package.download())
+                return
+        # No matching package: fail with a readable error instead of the bare
+        # StopIteration that next() on an exhausted filter would raise.
+        raise ValueError(f"No Argos package for {from_code} -> {to_code}")
+
+
+class CCCEDICT:
+    """Static Class for the CC-CEDICT dictionary"""
+
+    PARSER = None
+    ENTRIES = []
+    DICTIONARY_LIST = {}
+
+    @staticmethod
+    def create_cedict(language_id=LANGUAGES.EN):
+        """Return {simplified: [entries]} for language_id, cached per language"""
+        import copy  # local import, only needed here
+        if not CCCEDICT.PARSER:
+            CCCEDICT.PARSER = CedictParser()
+            CCCEDICT.PARSER.read_file(CCCEDICT_PATH)
+            CCCEDICT.ENTRIES = CCCEDICT.PARSER.parse()
+        if language_id in CCCEDICT.DICTIONARY_LIST:
+            return CCCEDICT.DICTIONARY_LIST[language_id]
+        if language_id != LANGUAGES.EN:
+            TRANS.create_translator(LANGUAGES.EN, language_id)  # once, not per entry
+        dictionary = {}
+        for entry in CCCEDICT.ENTRIES:
+            if language_id != LANGUAGES.EN:
+                entry = copy.copy(entry)  # keep the shared English entries intact
+                entry.meanings = [
+                    argostranslate.translate.translate(m, LANGUAGES.EN, language_id)
+                    for m in entry.meanings
+                ]
+            dictionary.setdefault(entry.simplified, []).append(entry)
+        CCCEDICT.DICTIONARY_LIST[language_id] = dictionary
+        return dictionary
+
+
+class TTS:
+    """Static class for the TTS engine"""
+
+    MODEL = None
+    DEVICE = None
+
+    @staticmethod
+    def create_tts():
+        """Creates a TTS engine"""
+        if TTS.DEVICE is None:
+            # Automatically detect the best available device
+            if torch.cuda.is_available():
+                TTS.DEVICE = "cuda"
+            elif torch.backends.mps.is_available():
+                TTS.DEVICE = "mps"
+            else:
+                TTS.DEVICE = "cpu"
+        if TTS.MODEL is None:
+            TTS.MODEL = ChatterboxMultilingualTTS.from_pretrained(
+                device=TTS.DEVICE, t3_model="v3"
+            )
+
+
+# Clases
+
+
+class ProcessFile:
+    """Class that represents a file to process
+
+    the same input file gets a different process_file for each language
+    """
+
+    def __init__(self, input_file: Path, language_id: str | None = None):
+        self.input_file = input_file
+        self._language_id = language_id
+        # per-file output and resources folders, created on construction
+        self.out_folder = OUTPUT / input_file.parent
+        self.out_folder.mkdir(parents=True, exist_ok=True)
+        resources = RESOURCES / input_file
+        self.resources = resources.parent / resources.stem
+        self.resources.mkdir(parents=True, exist_ok=True)
+
+    @property
+    def absolute_input_file(self):
+        """Absolute input file"""
+        return INPUT / self.input_file
+
+    @property
+    def language_id(self):
+        """language for this translation process"""
+        return self._language_id
+
+    @language_id.setter
+    def language_id(self, value):
+        self._language_id = value
+
+    @property
+    def output_name(self):
+        """Relative output path without the file type; note it ends with a dot"""
+        if self.language_id is None:
+            raise ValueError("Not a valid language selected")
+        return self.input_file.parent / f"{self.input_file.stem}.{self.language_id}."
+
+
+class TranslationResult:
+    """Result of a translated process"""
+
+    def __init__(
+        self,
+        language_id: str,
+        translated: str,
+        line: str,
+        audio_path: Path,
+    ):
+        self.language_id = language_id
+        self.translated = translated
+        self.line = line
+        self.audio_path = audio_path