diff --git a/pyproject.toml b/pyproject.toml
index 15caf19..b0ce95d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ dependencies = [
"torch",
"torchaudio",
"torchcodec",
+ "python-dotenv",
]
[project.optional-dependencies]
@@ -41,7 +42,9 @@ dev = [
"pytest",
"black",
"pylint",
- "flakehell"
+ "flake8",
+ "flake8-pyproject",
+ # "flakeheaven",
]
[project.urls]
@@ -50,17 +53,18 @@ Issues = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator/issues"
Source = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator"
[tool.hatch.version]
-path = "src/anki-hsk-creator/__about__.py"
+path = "src/anki_hsk_creator/__about__.py"
[tool.hatch.build.targets.sdist]
+packages = ["src/anki_hsk_creator"]
include = [
- "src/anki-hsk-creator/cedict_ts.u8",
+ "src/anki_hsk_creator/cedict_ts.u8",
]
[tool.hatch.build.targets.wheel]
-packages = ["src/anki-hsk-creator"]
+packages = ["src/anki_hsk_creator"]
include = [
- "src/anki-hsk-creator/cedict_ts.u8",
+ "src/anki_hsk_creator/cedict_ts.u8",
]
[tool.hatch.envs.default]
@@ -69,7 +73,8 @@ extra-dependencies = [
]
[tool.hatch.envs.default.scripts]
-format = "black --target-version=py314 anki-hsk-creator tests && isort anki-hsk-creator tests"
+format = "black --target-version=py314 src tests && isort src tests"
+lint = "flake8 src"
[tool.hatch.envs.types]
extra-dependencies = [
@@ -80,16 +85,16 @@ extra-dependencies = [
check = "mypy --install-types --non-interactive {args:src/anki-hsk-creator tests}"
[tool.coverage.run]
-source_pkgs = ["anki-hsk-creator", "tests"]
+source_pkgs = ["anki_hsk_creator", "tests"]
branch = true
parallel = true
omit = [
- "src/anki-hsk-creator/__about__.py",
+ "src/anki_hsk_creator/__about__.py",
]
[tool.coverage.paths]
-anki-hsk-creator = ["src/anki-hsk-creator", "*/anki-hsk-creator/src/anki-hsk-creator"]
-tests = ["tests", "*src/anki-hsk-creator/tests"]
+anki-hsk-creator = ["src/anki_hsk_creator", "*/anki-hsk-creator/src/anki_hsk_creator"]
+tests = ["tests", "*src/anki_hsk_creator/tests"]
[tool.coverage.report]
exclude_lines = [
@@ -111,6 +116,7 @@ exclude = '''
)
'''
+
[tool.isort]
src_paths = ["src", "test"]
skip_glob = [".git", "__pycache__", ".vscode", "*venv", "build", "dist", "old", "*.egg-info"]
@@ -145,9 +151,8 @@ msg-template="{path}:{module}:{line}: [{msg_id}({symbol}), {obj}] {msg}"
logging-format-style="new"
logging-modules="logging"
-[tool.flakehell]
+[tool.flake8]
max_line_length = 88
-format = "grouped"
show_source = false
exclude = [
".git",
@@ -160,8 +165,29 @@ exclude = [
"*.egg-info",
]
-[tool.flakehell.plugins]
+[tool.flake8.plugins]
mccabe = ["+C*"]
pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"]
pyflakes = ["+F*"]
flake8-bugbear = ["+*", "+B950"]
+
+# [tool.flakeheaven]
+# max_line_length = 88
+# format = "grouped"
+# show_source = false
+# exclude = [
+# ".git",
+# "__pycache__",
+# ".vscode",
+# "*venv",
+# "build",
+# "dist",
+# "old",
+# "*.egg-info",
+# ]
+
+# [tool.flakeheaven.plugins]
+# mccabe = ["+C*"]
+# pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"]
+# pyflakes = ["+F*"]
+# flake8-bugbear = ["+*", "+B950"]
diff --git a/src/anki-hsk-creator/__init__.py b/src/anki-hsk-creator/__init__.py
deleted file mode 100644
index 9bac128..0000000
--- a/src/anki-hsk-creator/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-"""anki-hsk-creator"""
-
-import os
-
-# Globals
-
-os.environ["HF_TOKEN"] = "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"
-ARGOS_UPDATED = False
-ARGOS_PACKAGES = None
\ No newline at end of file
diff --git a/src/anki-hsk-creator/__main__.py b/src/anki-hsk-creator/__main__.py
deleted file mode 100644
index a8a763d..0000000
--- a/src/anki-hsk-creator/__main__.py
+++ /dev/null
@@ -1,184 +0,0 @@
-## Imports
-from pathlib import Path
-import random
-import csv
-
-## PIP
-from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
-
-## Main
-
-
-def process_files():
- print("Select data file:")
- in_file = None
- level = INPUT
- while not in_file:
- files = []
- for n, file in enumerate(level.glob("*")):
- files.append(file)
- print(f"{n+1} - {file.relative_to(INPUT)}")
- s = None
- while not s or not s.isnumeric() or not (1 <= int(s) <= len(files)):
- s = input(f"Please select the file [1-{len(files)}]: ")
- selected = files[int(s) - 1]
- if selected.is_file():
- in_file = selected
- else:
- level = selected
- relative = in_file.relative_to(INPUT)
- out_file = OUTPUT / relative
- resources = RESOURCES / relative
- resources = resources.parent / resources.stem
- resources.mkdir(parents=True, exist_ok=True)
- out_file.parent.mkdir(parents=True, exist_ok=True)
- with in_file.open(encoding="utf8") as input_file:
- file_type = input_file.read().split()[0]
- return in_file, out_file, resources, file_type
-
-
-def dictionary_process(dictionary, tts, in_file, resources):
- """Process dictionary files"""
- words_list = in_file.open(encoding="utf8").read().strip().split("\n")
- results = []
- try:
- with in_file.open("w", encoding="utf8") as input_file:
- for words in words_list:
- word = words.split()[0]
- pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None
- if v := dictionary.get(word):
- if len(v) > 1:
- print(f"\nWARNING: {word} has multiple meanings:")
- if pinyin and pinyin != "ERROR":
- ml = list(filter(lambda x: x.pinyin == pinyin, v))
- else:
- ml = v
- if len(ml) > 1:
- for n, w in enumerate(ml):
- print(f"{n+1} - {w}")
- for m in w.meanings:
- print(f"\t{m}")
- s = None
- while (
- not s
- or not s.isnumeric()
- or not (1 <= int(s) <= len(v))
- ):
- s = input(
- f"Please select the correct word [1-{len(v)}]: "
- )
- v = v[int(s) - 1]
- else:
- v = ml[0]
- else:
- v = v[0]
- audio_path = resources / f"{word}.wav"
- if not audio_path.exists():
- audio = tts.generate(f"{word}。", language_id="zh")
- torchaudio.save(audio_path, audio, tts.sr)
- input_file.write(f"{word}\t{v.pinyin}\n")
- results.append((v, audio_path))
- else:
- print("============================================")
- print(f"===================>ERROR: {word} not found")
- print("============================================")
- input_file.write(f"{word}\tERROR\n")
- except Exception:
- with in_file.open("w", encoding="utf8") as input_file:
- input_file.write("\n".join(words_list))
- return results
-
-
-def translator_process(tts, resources, in_file):
- """Process for phases trasnlation"""
- text_list = in_file.open(encoding="utf8").read().strip().split()
- results = []
- for n, phrase in enumerate(text_list):
- phrase = phrase.strip()
- audio_path = resources / f"N{n}.wav"
- if not audio_path.exists():
- audio = tts.generate(f"{phrase}。", language_id="zh")
- torchaudio.save(audio_path, audio, tts.sr)
- translated = argostranslate.translate.translate(phrase, CN, EN)
- results.append([translated, phrase, audio_path])
- return results
-
-
-# def output_tsv(out_file, results):
-# """writes the output as a tsv file"""
-# final_file = out_file.parent / f"{out_file.stem}.tsv"
-# with final_file.open("w", encoding="utf8", newline="") as csvfile:
-# writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
-# for entry in results:
-# writer.writerow(
-# [
-# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
-# PinyinToneConverter().convert_text(entry.pinyin),
-# entry.simplified,
-# entry.traditional,
-# ]
-# )
-
-
-def output_anki_dictionary(out_file, results):
- final_file = out_file.parent / f"{out_file.stem}.apkg"
- deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
- deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
- package = Package(deck)
- audios = []
- for entry, audio in results:
- note = Note(
- model=HSK_MODEL,
- fields=[
- "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
- PinyinToneConverter().convert_text(entry.pinyin),
- entry.simplified,
- entry.traditional,
- f"[sound:{audio.name}]",
- ],
- )
- audios.append(audio)
- deck.add_note(note)
- package.media_files = audios
- package.write_to_file(final_file)
-
-
-def output_anki_phrase(out_file, results):
- final_file = out_file.parent / f"{out_file.stem}.apkg"
- deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
- deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
- package = Package(deck)
- audios = []
- for translated, phrase, audio in results:
- note = Note(
- model=PHRASE_MODEL,
- fields=[
- translated,
- phrase,
- f"[sound:{audio.name}]",
- ],
- )
- deck.add_note(note)
- audios.append(audio)
- package.media_files = audios
- package.write_to_file(final_file)
-
-
-def main():
- tts = create_tts()
- dictionary = create_cedict()
- create_translator()
- while True:
- in_file, out_file, resources, file_type = process_files()
- if PHRASES_TYPE in in_file.suffixes:
- results = translator_process(tts, resources, in_file)
- output_anki_phrase(out_file, results)
- elif DICT_TYPE in in_file.suffixes:
- results = dictionary_process(dictionary, tts, in_file, resources)
- output_anki_dictionary(out_file, results)
- else:
- raise TypeError("Error, filetype not especified!")
-
-
-if __name__ == "__main__":
- main()
diff --git a/src/anki-hsk-creator/anki-models.py b/src/anki-hsk-creator/anki-models.py
deleted file mode 100644
index 555d00a..0000000
--- a/src/anki-hsk-creator/anki-models.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# anki-models.py
-
-from genanki import Deck, Note, Model, Package
-
-
-# Constants
-
-CSS = """
-.card {
- font-family: arial;
- font-size: 20px;
- text-align: center;
- color: black;
- background-color: white;
-}
-.simple {
-font-family: Arial;
-font-size: 100px;
-}
-.trad {
-font-family: Arial;
-font-size: 75px;
-}
-"""
-
-# Models
-
-PHRASE_MODEL = Model(
- 2076166425,
- "Phrase Model",
- fields=[
- {"name": "Translated"},
- {"name": "Phrase"},
- {"name": "Audio"},
- ],
- templates=[
- {
- "name": "Card 1",
- "qfmt": "{{Translated}}
{{Audio}}",
- "afmt": '{{FrontSide}}
{{Phrase}}',
- },
- {
- "name": "Card 2",
- "qfmt": "{{Phrase}}
{{Audio}}",
- "afmt": '{{FrontSide}}
{{Translated}}',
- },
- {
- "name": "Card 3",
- "qfmt": "{{Audio}}",
- "afmt": '{{FrontSide}}
{{Phrase}}',
- },
- ],
- css=CSS,
-)
-
-
-HSK_MODEL = Model(
- 1708536519,
- "HSK Model",
- fields=[
- {"name": "English"},
- {"name": "Pinyin"},
- {"name": "Simplified"},
- {"name": "Traditional"},
- {"name": "Audio"},
- ],
- templates=[
- {
- "name": "Card 1",
- "qfmt": "{{Pinyin}}
{{English}}
{{Audio}}",
- "afmt": "{{FrontSide}}
"
- "{{Simplified}}
{{Traditional}}
",
- },
- {
- "name": "Card 2",
- "qfmt": "{{Simplified}}
"
- "{{Traditional}}
",
- "afmt": '{{FrontSide}}
{{Pinyin}}
{{English}}
{{Audio}}',
- },
- {
- "name": "Card 3",
- "qfmt": "{{Audio}}",
- "afmt": "{{FrontSide}}
{{Pinyin}}
"
- "{{Simplified}}
{{Traditional}}
",
- },
- ],
- css=CSS,
-)
\ No newline at end of file
diff --git a/src/anki-hsk-creator/constants.py b/src/anki-hsk-creator/constants.py
deleted file mode 100644
index 0bac8ad..0000000
--- a/src/anki-hsk-creator/constants.py
+++ /dev/null
@@ -1,19 +0,0 @@
-## Imports
-from pathlib import Path
-import random
-import importlib.resources
-
-CCCEDICT = importlib.resources.files("anki-hsk-creator").joinpath("cedict_ts.u8")
-
-DATA = Path(__file__).parent.parent / "data"
-INPUT = DATA / "input"
-OUTPUT = DATA / "output"
-RESOURCES = DATA / "resources"
-
-# File Types
-PHRASES_TYPE = ".phrases"
-DICT_TYPE = ".dictionary"
-
-# Language codes
-CN = "zh"
-EN = "en"
\ No newline at end of file
diff --git a/src/anki-hsk-creator/untility.py b/src/anki-hsk-creator/untility.py
deleted file mode 100644
index 63ecdf6..0000000
--- a/src/anki-hsk-creator/untility.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from cedict_utils.cedict import CedictParser
-import argostranslate.package
-import argostranslate.translate
-from chatterbox.mtl_tts import ChatterboxMultilingualTTS
-
-import torch
-import torchaudio
-
-from . import ARGOS_UPDATED, ARGOS_PACKAGES
-from . import CCCEDICT
-
-## Functions
-
-
-def create_cedict(language_id="en"):
- """Creates a create_cedict dictionary object"""
-
- parser = CedictParser()
- parser.read_file(CCCEDICT)
- entries = parser.parse()
-
- dictionary = {}
- for entry in entries:
- if entry.simplified not in dictionary:
- dictionary[entry.simplified] = [entry]
- else:
- dictionary[entry.simplified].append(entry)
-
- return dictionary
-
-
-def create_translator(from_code, to_code):
- """Download and install Argos Translate package"""
- if not ARGOS_UPDATED:
- argostranslate.package.update_package_index()
- ARGOS_PACKAGES = argostranslate.package.get_available_packages()
- ARGOS_UPDATED = True
- package_to_install = next(
- filter(lambda x: x.from_code == CN and x.to_code == EN, ARGOS_PACKAGES)
- )
- argostranslate.package.install_from_path(package_to_install.download())
-
-
-def create_tts():
- # Automatically detect the best available device
- if torch.cuda.is_available():
- device = "cuda"
- elif torch.backends.mps.is_available():
- device = "mps"
- else:
- device = "cpu"
- tts = ChatterboxMultilingualTTS.from_pretrained(device=device, t3_model="v3")
- return tts
\ No newline at end of file
diff --git a/src/anki-hsk-creator/__about__.py b/src/anki_hsk_creator/__about__.py
similarity index 77%
rename from src/anki-hsk-creator/__about__.py
rename to src/anki_hsk_creator/__about__.py
index ab19860..33859a3 100644
--- a/src/anki-hsk-creator/__about__.py
+++ b/src/anki_hsk_creator/__about__.py
@@ -1,4 +1,5 @@
+"""about.py"""
# SPDX-FileCopyrightText: 2026-present Wolfang Torres
#
# SPDX-License-Identifier: GPL-3.0-or-later
-__version__ = "0.0.1"
+__version__ = "0.1.0"
diff --git a/src/anki_hsk_creator/__init__.py b/src/anki_hsk_creator/__init__.py
new file mode 100644
index 0000000..3210c4b
--- /dev/null
+++ b/src/anki_hsk_creator/__init__.py
@@ -0,0 +1,20 @@
+"""anki_hsk_creator"""
+
+# Standard Library
+import os
+from pathlib import Path
+
+# Pip
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Globals
+
+# Get an HF_TOKEN from huggingface for TTS generation
+HF_TOKEN = os.environ.get("HF_TOKEN")
+
+# Path
+default_path = Path.home() / "anki-hsk-creator-data"
+DATA_FOLDER = Path(os.environ.get("DATA_FOLDER", default_path)).expanduser()
+DATA_FOLDER.mkdir(exist_ok=True, parents=True)
diff --git a/src/anki_hsk_creator/__main__.py b/src/anki_hsk_creator/__main__.py
new file mode 100644
index 0000000..49d03ee
--- /dev/null
+++ b/src/anki_hsk_creator/__main__.py
@@ -0,0 +1,52 @@
+"""__main__.py"""
+
+# Standard Library
+from pathlib import Path
+
+# Local
+from .api import list_input_files, process_a_file, select_file
+from .constants import LANGUAGES
+
+
+def cli_select_files():
+    """Prompt until the user picks an input file; return it as a ProcessFile.
+
+    Listed paths are relative to INPUT, so file-vs-folder is decided by
+    ``select_file`` (which resolves against INPUT) rather than the cwd.
+    """
+    print("Select data file:")
+    level = Path()
+    while True:
+        files = list_input_files(level)
+        for n, file in enumerate(files):
+            print(f"{n+1} - {file}")
+        s = None
+        while not s or not s.isnumeric() or not (1 <= int(s) <= len(files)):
+            s = input(f"Please select the file [1-{len(files)}]: ")
+        try:
+            return select_file(files[int(s) - 1])
+        except ValueError:  # a folder was chosen: descend into it
+            level = files[int(s) - 1]
+
+
+def cli_select_language():
+    """Prompt until the user enters one of the available language codes"""
+    print("Select a language:")
+    for language_id, language in LANGUAGES.language_names.items():
+        print(f"{language_id} - {language}")
+    s = None
+    while not s or s not in LANGUAGES.available_languages:
+        s = input(f"Please select the language: ({ LANGUAGES.available_languages})")
+    return s
+
+
+def main():
+    """Run the CLI forever: pick a file, pick a language, process the file"""
+    while True:
+        input_file = cli_select_files()
+        language_id = cli_select_language()
+        process_a_file(input_file, language_id)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/anki_hsk_creator/anki_generation.py b/src/anki_hsk_creator/anki_generation.py
new file mode 100644
index 0000000..44926f8
--- /dev/null
+++ b/src/anki_hsk_creator/anki_generation.py
@@ -0,0 +1,159 @@
+"""anki_generation.py
+
+Produces anki output
+"""
+
+# Standard Library
+import random
+
+# Pip
+from genanki import Deck, Model, Note, Package
+
+# Local
+from .utility import ProcessFile, TranslationResult
+
+# from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
+
+
+# Constants
+
+CSS = """
+.card {
+ font-family: arial;
+ font-size: 20px;
+ text-align: center;
+ color: black;
+ background-color: white;
+}
+.simple {
+font-family: Arial;
+font-size: 100px;
+}
+.trad {
+font-family: Arial;
+font-size: 75px;
+}
+"""
+
+# Models
+
+PHRASE_MODEL = Model(
+ 2076166425,
+ "Phrase Model",
+ fields=[
+ {"name": "Translated"},
+ {"name": "Phrase"},
+ {"name": "Audio"},
+ ],
+ templates=[
+ {
+ "name": "Card 1",
+ "qfmt": "{{Translated}}
{{Audio}}",
+ "afmt": '{{FrontSide}}
{{Phrase}}',
+ },
+ {
+ "name": "Card 2",
+ "qfmt": "{{Phrase}}
{{Audio}}",
+ "afmt": '{{FrontSide}}
{{Translated}}',
+ },
+ {
+ "name": "Card 3",
+ "qfmt": "{{Audio}}",
+ "afmt": '{{FrontSide}}
{{Phrase}}',
+ },
+ ],
+ css=CSS,
+)
+
+
+HSK_MODEL = Model(
+ 1708536519,
+ "HSK Model",
+ fields=[
+ {"name": "English"},
+ {"name": "Pinyin"},
+ {"name": "Simplified"},
+ {"name": "Traditional"},
+ {"name": "Audio"},
+ ],
+ templates=[
+ {
+ "name": "Card 1",
+ "qfmt": "{{Pinyin}}
{{English}}
{{Audio}}",
+ "afmt": (
+ "{{FrontSide}}
{{Simplified}}
"
+ "
{{Traditional}}
"
+ ),
+ },
+ {
+ "name": "Card 2",
+ "qfmt": "{{Simplified}}
"
+ "{{Traditional}}
",
+ "afmt": (
+ "{{FrontSide}}
{{Pinyin}}"
+ "
{{English}}
{{Audio}}"
+ ),
+ },
+ {
+ "name": "Card 3",
+ "qfmt": "{{Audio}}",
+ "afmt": (
+ "{{FrontSide}}
{{Pinyin}}"
+ "
{{Simplified}}
"
+ "
{{Traditional}}
"
+ ),
+ },
+ ],
+ css=CSS,
+)
+
+# Proccess
+
+
+# def output_anki_dictionary(out_file, results):
+# """Creates an anki file from a dictionary results"""
+# final_file = out_file.parent / f"{out_file.stem}.apkg"
+# deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
+# deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
+# package = Package(deck)
+# audios = []
+# for entry, audio in results:
+# note = Note(
+# model=HSK_MODEL,
+# fields=[
+# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
+# PinyinToneConverter().convert_text(entry.pinyin),
+# entry.simplified,
+# entry.traditional,
+# f"[sound:{audio.name}]",
+# ],
+# )
+# audios.append(audio)
+# deck.add_note(note)
+# package.media_files = audios
+# package.write_to_file(final_file)
+
+
+def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResult]):
+    """Write an Anki ``.apkg`` deck with one note per translated phrase."""
+    # Strip output_name's trailing "." by hand; with_suffix treats it per-version.
+    base_name = process_file.output_name.name.rstrip(".")
+    final_file = process_file.out_folder / f"{base_name}.apkg"
+    deck_name = "::".join(
+        process_file.input_file.parts[:-1] + (process_file.input_file.stem,)
+    )
+    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
+    package = Package(deck)
+    for result in results:
+        deck.add_note(
+            Note(
+                model=PHRASE_MODEL,
+                fields=[
+                    result.translated,
+                    result.line,
+                    f"[sound:{result.audio_path.name}]",
+                ],
+            )
+        )
+    package.media_files = [result.audio_path for result in results]
+    package.write_to_file(final_file)
diff --git a/src/anki_hsk_creator/api.py b/src/anki_hsk_creator/api.py
new file mode 100644
index 0000000..cee2aa0
--- /dev/null
+++ b/src/anki_hsk_creator/api.py
@@ -0,0 +1,72 @@
+"""api.py
+
+Interface for managuing and procesing files
+"""
+
+# Standard Library
+from pathlib import Path
+
+# Local
+from . import DATA_FOLDER
+from .anki_generation import output_anki_phrase
+from .constants import DICT_TYPE, INPUT, LANGUAGES, PHRASES_TYPE
+from .proccessor import translator_process
+from .utility import TRANS, TTS, ProcessFile
+
+# interface
+
+
+def get_data_folder() -> Path:
+    """Return DATA_FOLDER, the root folder holding the package data"""
+    return DATA_FOLDER
+
+
+def list_input_files(search_path: Path = Path()) -> list[Path]:
+    """Return the sorted entries of INPUT / search_path, relative to INPUT"""
+    level = INPUT / search_path
+    return sorted(path.relative_to(INPUT) for path in level.glob("*"))
+
+
+def select_file(file_path: Path) -> ProcessFile:
+    """Wrap a relative path from `list_input_files` in a ProcessFile"""
+    # Guard clause: only regular files under INPUT can be processed
+    if not (INPUT / file_path).is_file():
+        raise ValueError(f"{file_path} is not a file")
+    return ProcessFile(file_path)
+
+
+def create_input_file(
+    name: str, file_type: str, text: str, sub_folder: Path = Path()
+) -> ProcessFile:
+    """Create ``<name><file_type>.txt`` under INPUT and write ``text`` to it.
+
+    If ``sub_folder`` is given it is created and the file is placed inside.
+    Returns a ProcessFile for the new file, ready for later processing.
+
+    Raises ValueError unless file_type is ".phrases" or ".dictionary".
+    """
+    if file_type not in (PHRASES_TYPE, DICT_TYPE):
+        raise ValueError(f"file_type {file_type} not in {(PHRASES_TYPE, DICT_TYPE)}")
+    filename = f"{name}{file_type}.txt"
+    relative = sub_folder / filename
+    # write file
+    file_path = INPUT / relative
+    file_path.parent.mkdir(exist_ok=True, parents=True)
+    file_path.write_text(text, encoding="utf8")
+    # create process_file for future
+    process_file = ProcessFile(relative)
+    return process_file
+
+
+def process_a_file(process_file: ProcessFile, language_id: str):
+    """Process an input file into an Anki deck translated to ``language_id``"""
+    process_file.language_id = language_id
+    if PHRASES_TYPE in process_file.input_file.suffixes:  # e.g. x.phrases.txt
+        TTS.create_tts()
+        TRANS.create_translator(LANGUAGES.CN, language_id)
+        with process_file.absolute_input_file.open(encoding="utf8") as file:
+            text_lines = [line.strip() for line in file if line.strip()]
+        results = translator_process(text_lines, process_file, language_id)
+        output_anki_phrase(process_file, results)
+    elif DICT_TYPE in process_file.input_file.suffixes:
+        print("not implemented")
diff --git a/src/anki-hsk-creator/cedict_ts.u8 b/src/anki_hsk_creator/cedict_ts.u8
similarity index 100%
rename from src/anki-hsk-creator/cedict_ts.u8
rename to src/anki_hsk_creator/cedict_ts.u8
diff --git a/src/anki_hsk_creator/constants.py b/src/anki_hsk_creator/constants.py
new file mode 100644
index 0000000..16c8f65
--- /dev/null
+++ b/src/anki_hsk_creator/constants.py
@@ -0,0 +1,51 @@
+"""constants.py"""
+
+# Standard Library
+import importlib.resources
+
+# Local
+from . import DATA_FOLDER
+
+# Resources
+CCCEDICT_PATH = importlib.resources.files("anki_hsk_creator").joinpath("cedict_ts.u8")
+
+# Data folder structure
+INPUT = DATA_FOLDER / "input"
+INPUT.mkdir(exist_ok=True, parents=True)
+OUTPUT = DATA_FOLDER / "output"
+OUTPUT.mkdir(exist_ok=True, parents=True)
+RESOURCES = DATA_FOLDER / "resources"
+RESOURCES.mkdir(exist_ok=True, parents=True)
+
+# File Types
+PHRASES_TYPE = ".phrases"
+DICT_TYPE = ".dictionary"
+
+
+class LANGUAGES:
+    """Available languages for translation.
+
+    Used as a static namespace: every member is a plain class attribute so it
+    can be read directly on the class (``LANGUAGES.available_languages``).
+    """
+
+    CN = "zh"
+    EN = "en"
+    ES = "es"
+    FR = "fr"
+    RU = "ru"
+    TR = "tr"
+    TH = "th"
+
+    # Code -> display name of every supported target language.
+    language_names = {
+        EN: "English",
+        ES: "Spanish",
+        FR: "French",
+        RU: "Russian",
+        TR: "Turkish",
+        TH: "Thai",
+    }
+
+    # Target language codes, in the same order as ``language_names``.
+    available_languages = tuple(language_names)
diff --git a/src/anki_hsk_creator/proccessor.py b/src/anki_hsk_creator/proccessor.py
new file mode 100644
index 0000000..47b6e82
--- /dev/null
+++ b/src/anki_hsk_creator/proccessor.py
@@ -0,0 +1,96 @@
+"""processor.py"""
+
+# Pip
+import argostranslate.translate
+import torchaudio
+
+# Local
+from .constants import LANGUAGES
+from .utility import TTS, ProcessFile, TranslationResult # , CCCEDICT
+
+# Results Classes
+
+
+def translator_process(
+    text_lines: list[str],
+    process_file: ProcessFile,
+    language_id: str,
+) -> list[TranslationResult]:
+    """Generate audio and a translation for every phrase in ``text_lines``"""
+    results = []
+    for n, line in enumerate(text_lines):
+        line = line.strip()
+        audio_path = process_file.resources / f"N{n:03d}.wav"  # N000.wav, N001.wav
+        if not audio_path.exists():  # an existing audio file is reused as-is
+            audio = TTS.MODEL.generate(f"{line}。", language_id=LANGUAGES.CN)
+            torchaudio.save(audio_path, audio, TTS.MODEL.sr)
+        translated = argostranslate.translate.translate(line, LANGUAGES.CN, language_id)
+        results.append(TranslationResult(language_id, translated, line, audio_path))
+    return results
+
+
+# def dictionary_process(dictionary, tts, in_file, resources):
+# """Process dictionary files"""
+# words_list = in_file.open(encoding="utf8").read().strip().split("\n")
+# results = []
+# try:
+# with in_file.open("w", encoding="utf8") as input_file:
+# for words in words_list:
+# word = words.split()[0]
+# pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None
+# if v := dictionary.get(word):
+# if len(v) > 1:
+# print(f"\nWARNING: {word} has multiple meanings:")
+# if pinyin and pinyin != "ERROR":
+# ml = list(filter(lambda x: x.pinyin == pinyin, v))
+# else:
+# ml = v
+# if len(ml) > 1:
+# for n, w in enumerate(ml):
+# print(f"{n+1} - {w}")
+# for m in w.meanings:
+# print(f"\t{m}")
+# s = None
+# while (
+# not s
+# or not s.isnumeric()
+# or not (1 <= int(s) <= len(v))
+# ):
+# s = input(
+# f"Please select the correct word [1-{len(v)}]: "
+# )
+# v = v[int(s) - 1]
+# else:
+# v = ml[0]
+# else:
+# v = v[0]
+# audio_path = resources / f"{word}.wav"
+# if not audio_path.exists():
+# audio = tts.generate(f"{word}。", language_id="zh")
+# torchaudio.save(audio_path, audio, tts.sr)
+# input_file.write(f"{word}\t{v.pinyin}\n")
+# results.append((v, audio_path))
+# else:
+# print("============================================")
+# print(f"===================>ERROR: {word} not found")
+# print("============================================")
+# input_file.write(f"{word}\tERROR\n")
+# except Exception:
+# with in_file.open("w", encoding="utf8") as input_file:
+# input_file.write("\n".join(words_list))
+# return results
+
+# def output_tsv(out_file, results):
+# """writes the output as a tsv file"""
+# final_file = out_file.parent / f"{out_file.stem}.tsv"
+# with final_file.open("w", encoding="utf8", newline="") as csvfile:
+# writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
+# for entry in results:
+# writer.writerow(
+# [
+# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
+# PinyinToneConverter().convert_text(entry.pinyin),
+# entry.simplified,
+# entry.traditional,
+# ]
+# )
diff --git a/src/anki_hsk_creator/utility.py b/src/anki_hsk_creator/utility.py
new file mode 100644
index 0000000..545251d
--- /dev/null
+++ b/src/anki_hsk_creator/utility.py
@@ -0,0 +1,154 @@
+"""utility.py
+
+
+Static clasess and functions for general use
+"""
+
+# Standard Library
+from pathlib import Path
+
+# Pip
+import argostranslate.package
+import argostranslate.translate
+import torch
+from cedict_utils.cedict import CedictParser
+from chatterbox.mtl_tts import ChatterboxMultilingualTTS
+
+# Local
+from .constants import CCCEDICT_PATH, INPUT, LANGUAGES, OUTPUT, RESOURCES
+
+# Static Clases
+
+
+class TRANS:
+    """Static class wrapping Argos Translate package management"""
+
+    UPDATED = False  # True once the package index has been refreshed
+    PACKAGES = None  # cached result of get_available_packages()
+
+    @staticmethod
+    def create_translator(from_code, to_code):
+        """Download and install the Argos package for from_code -> to_code.
+
+        Raises ValueError when the index offers no package for that pair."""
+        if not TRANS.UPDATED:
+            argostranslate.package.update_package_index()
+            TRANS.PACKAGES = argostranslate.package.get_available_packages()
+            TRANS.UPDATED = True
+        for pkg in TRANS.PACKAGES:
+            if pkg.from_code == from_code and pkg.to_code == to_code:
+                argostranslate.package.install_from_path(pkg.download())
+                return
+        raise ValueError(f"No Argos package for {from_code} -> {to_code}")
+
+
+class CCCEDICT:
+    """Static class caching the CC-CEDICT dictionary per language"""
+
+    PARSER = None
+    ENTRIES = []
+    DICTIONARY_LIST = {}
+
+    @staticmethod
+    def create_cedict(language_id=LANGUAGES.EN):
+        """Return ``{simplified: [entries]}`` with meanings in ``language_id``"""
+        import copy  # local import: clones entries before translating them
+
+        if not CCCEDICT.PARSER:
+            CCCEDICT.PARSER = CedictParser()
+            CCCEDICT.PARSER.read_file(CCCEDICT_PATH)
+            CCCEDICT.ENTRIES = CCCEDICT.PARSER.parse()
+        if language_id not in CCCEDICT.DICTIONARY_LIST:
+            translate = argostranslate.translate.translate
+            if language_id != LANGUAGES.EN:
+                TRANS.create_translator(LANGUAGES.EN, language_id)  # once only
+            dictionary = {}
+            for entry in CCCEDICT.ENTRIES:
+                if language_id != LANGUAGES.EN:
+                    entry = copy.copy(entry)  # keep the cached English entry intact
+                    entry.meanings = [
+                        translate(m, LANGUAGES.EN, language_id) for m in entry.meanings
+                    ]
+                dictionary.setdefault(entry.simplified, []).append(entry)
+            CCCEDICT.DICTIONARY_LIST[language_id] = dictionary
+        return CCCEDICT.DICTIONARY_LIST[language_id]
+
+
+class TTS:
+    """Static class for the TTS engine"""
+
+    MODEL = None
+    DEVICE = None
+
+    @staticmethod
+    def create_tts():
+        """Select a device and load the TTS model, each only on first use"""
+        if TTS.DEVICE is None:
+            # Automatically detect the best available device
+            if torch.cuda.is_available():
+                TTS.DEVICE = "cuda"
+            elif torch.backends.mps.is_available():
+                TTS.DEVICE = "mps"
+            else:
+                TTS.DEVICE = "cpu"
+        if TTS.MODEL is None:
+            TTS.MODEL = ChatterboxMultilingualTTS.from_pretrained(
+                device=TTS.DEVICE, t3_model="v3"
+            )
+
+
+# Clases
+
+
+class ProcessFile:
+    """A file queued for processing, plus the folders derived from it.
+
+    ``input_file`` is relative to INPUT; the target language can be set later.
+    """
+
+    def __init__(self, input_file: Path, language_id: str | None = None):
+        self.input_file = input_file
+        self._language_id = language_id
+        # Mirror the input's sub-folder under OUTPUT and RESOURCES
+        self.out_folder = OUTPUT / input_file.parent
+        self.out_folder.mkdir(parents=True, exist_ok=True)
+        resources = RESOURCES / input_file
+        self.resources = resources.parent / resources.stem
+        self.resources.mkdir(parents=True, exist_ok=True)
+
+    @property
+    def absolute_input_file(self):
+        """Absolute input file"""
+        return INPUT / self.input_file
+
+    @property
+    def language_id(self):
+        """Language for this translation process"""
+        return self._language_id
+
+    @language_id.setter
+    def language_id(self, value):
+        self._language_id = value
+
+    @property
+    def output_name(self):
+        """Output path inside ``out_folder``; the trailing dot awaits a suffix"""
+        if self.language_id is None:
+            raise ValueError("Not a valid language selected")
+        return self.out_folder / f"{self.input_file.stem}.{self.language_id}."
+
+
+class TranslationResult:
+    """One translated line: target language, both texts and the audio path"""
+
+    def __init__(
+        self,
+        language_id: str,
+        translated: str,
+        line: str,
+        audio_path: Path,
+    ):
+        self.language_id = language_id
+        self.translated = translated
+        self.line = line
+        self.audio_path = audio_path