version 0.1

This commit is contained in:
Wolfang Torres
2026-06-11 21:23:34 +08:00
parent ea057668bc
commit 21c6416cfd
15 changed files with 645 additions and 367 deletions

View File

@@ -34,6 +34,7 @@ dependencies = [
"torch", "torch",
"torchaudio", "torchaudio",
"torchcodec", "torchcodec",
"python-dotenv",
] ]
[project.optional-dependencies] [project.optional-dependencies]
@@ -41,7 +42,9 @@ dev = [
"pytest", "pytest",
"black", "black",
"pylint", "pylint",
"flakehell" "flake8",
"flake8-pyproject",
# "flakeheaven",
] ]
[project.urls] [project.urls]
@@ -50,17 +53,18 @@ Issues = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator/issues"
Source = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator" Source = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator"
[tool.hatch.version] [tool.hatch.version]
path = "src/anki-hsk-creator/__about__.py" path = "src/anki_hsk_creator/__about__.py"
[tool.hatch.build.targets.sdist] [tool.hatch.build.targets.sdist]
packages = ["src/anki_hsk_creator"]
include = [ include = [
"src/anki-hsk-creator/cedict_ts.u8", "src/anki_hsk_creator/cedict_ts.u8",
] ]
[tool.hatch.build.targets.wheel] [tool.hatch.build.targets.wheel]
packages = ["src/anki-hsk-creator"] packages = ["src/anki_hsk_creator"]
include = [ include = [
"src/anki-hsk-creator/cedict_ts.u8", "src/anki_hsk_creator/cedict_ts.u8",
] ]
[tool.hatch.envs.default] [tool.hatch.envs.default]
@@ -69,7 +73,8 @@ extra-dependencies = [
] ]
[tool.hatch.envs.default.scripts] [tool.hatch.envs.default.scripts]
format = "black --target-version=py314 anki-hsk-creator tests && isort anki-hsk-creator tests" format = "black --target-version=py314 src tests && isort src tests"
lint = "flake8 src"
[tool.hatch.envs.types] [tool.hatch.envs.types]
extra-dependencies = [ extra-dependencies = [
@@ -80,16 +85,16 @@ extra-dependencies = [
check = "mypy --install-types --non-interactive {args:src/anki-hsk-creator tests}" check = "mypy --install-types --non-interactive {args:src/anki-hsk-creator tests}"
[tool.coverage.run] [tool.coverage.run]
source_pkgs = ["anki-hsk-creator", "tests"] source_pkgs = ["src", "tests"]
branch = true branch = true
parallel = true parallel = true
omit = [ omit = [
"src/anki-hsk-creator/__about__.py", "src/anki_hsk_creator/__about__.py",
] ]
[tool.coverage.paths] [tool.coverage.paths]
anki-hsk-creator = ["src/anki-hsk-creator", "*/anki-hsk-creator/src/anki-hsk-creator"] anki-hsk-creator = ["src/anki_hsk_creator", "*/anki-hsk-creator/src/anki_hsk_creator"]
tests = ["tests", "*src/anki-hsk-creator/tests"] tests = ["tests", "*src/anki_hsk_creator/tests"]
[tool.coverage.report] [tool.coverage.report]
exclude_lines = [ exclude_lines = [
@@ -111,6 +116,7 @@ exclude = '''
) )
''' '''
[tool.isort] [tool.isort]
src_paths = ["src", "test"] src_paths = ["src", "test"]
skip_glob = [".git", "__pycache__", ".vscode", "*venv", "build", "dist", "old", "*.egg-info"] skip_glob = [".git", "__pycache__", ".vscode", "*venv", "build", "dist", "old", "*.egg-info"]
@@ -145,9 +151,8 @@ msg-template="{path}:{module}:{line}: [{msg_id}({symbol}), {obj}] {msg}"
logging-format-style="new" logging-format-style="new"
logging-modules="logging" logging-modules="logging"
[tool.flakehell] [tool.flake8]
max_line_length = 88 max_line_length = 88
format = "grouped"
show_source = false show_source = false
exclude = [ exclude = [
".git", ".git",
@@ -160,8 +165,29 @@ exclude = [
"*.egg-info", "*.egg-info",
] ]
[tool.flakehell.plugins] [tool.flake8.plugins]
mccabe = ["+C*"] mccabe = ["+C*"]
pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"] pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"]
pyflakes = ["+F*"] pyflakes = ["+F*"]
flake8-bugbear = ["+*", "+B950"] flake8-bugbear = ["+*", "+B950"]
# [tool.flakeheaven]
# max_line_length = 88
# format = "grouped"
# show_source = false
# exclude = [
# ".git",
# "__pycache__",
# ".vscode",
# "*venv",
# "build",
# "dist",
# "old",
# "*.egg-info",
# ]
# [tool.flakeheaven.plugins]
# mccabe = ["+C*"]
# pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"]
# pyflakes = ["+F*"]
# flake8-bugbear = ["+*", "+B950"]

View File

@@ -1,9 +0,0 @@
"""anki-hsk-creator"""
import os

# Globals
# SECURITY: the Hugging Face token must never be committed to the repository.
# It is read from the process environment instead; libraries that look up the
# HF_TOKEN environment variable keep working as long as it is exported before
# the program starts. Any token that was ever committed must be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Argos Translate package-index cache flags used by the translator setup
ARGOS_UPDATED = False
ARGOS_PACKAGES = None

View File

@@ -1,184 +0,0 @@
## Imports
from pathlib import Path
import random
import csv
## PIP
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
## Main
def process_files():
    """Ask the user to pick an input file and derive its working paths.

    Directories can be entered by selecting them; the prompt repeats until a
    regular file is chosen. The output folder and the per-file resources
    folder are created as a side effect.

    Returns a tuple ``(in_file, out_file, resources, file_type)`` where
    ``file_type`` is the first whitespace-separated token of the chosen file.
    """
    print("Select data file:")
    current_dir = INPUT
    chosen = None
    while not chosen:
        entries = []
        for position, candidate in enumerate(current_dir.glob("*"), start=1):
            entries.append(candidate)
            print(f"{position} - {candidate.relative_to(INPUT)}")
        answer = None
        # keep asking until the answer is a number inside the listed range
        while not (answer and answer.isnumeric() and 1 <= int(answer) <= len(entries)):
            answer = input(f"Please select the file [1-{len(entries)}]: ")
        picked = entries[int(answer) - 1]
        if picked.is_file():
            chosen = picked
        else:
            current_dir = picked

    relative = chosen.relative_to(INPUT)
    out_file = OUTPUT / relative
    resources_base = RESOURCES / relative
    # resources live in a folder named after the file without its last suffix
    resources = resources_base.parent / resources_base.stem
    resources.mkdir(parents=True, exist_ok=True)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with chosen.open(encoding="utf8") as handle:
        file_type = handle.read().split()[0]
    return chosen, out_file, resources, file_type
def _select_entry(word, pinyin, candidates):
    """Pick the dictionary entry for *word*, asking the user when ambiguous."""
    if len(candidates) == 1:
        return candidates[0]
    print(f"\nWARNING: {word} has multiple meanings:")
    matches = candidates
    if pinyin and pinyin != "ERROR":
        # A pinyin annotation that matches nothing falls back to every
        # candidate instead of failing on an empty list.
        matches = [c for c in candidates if c.pinyin == pinyin] or candidates
    if len(matches) == 1:
        return matches[0]
    for n, w in enumerate(matches):
        print(f"{n+1} - {w}")
        for m in w.meanings:
            print(f"\t{m}")
    s = None
    # The menu shows `matches`, so the answer is validated against and
    # indexed into `matches` (not the unfiltered candidate list).
    while not s or not s.isdecimal() or not (1 <= int(s) <= len(matches)):
        s = input(f"Please select the correct word [1-{len(matches)}]: ")
    return matches[int(s) - 1]


def dictionary_process(dictionary, tts, in_file, resources):
    """Process a dictionary word list and create one audio clip per word.

    Each line of *in_file* holds a word optionally followed by its pinyin.
    The file is rewritten in place as ``word<TAB>pinyin`` (or
    ``word<TAB>ERROR`` when the word is not in *dictionary*).

    Args:
        dictionary: mapping of word to a list of entries exposing ``pinyin``
            and ``meanings``.
        tts: engine exposing ``generate(text, language_id=...)`` and ``sr``;
            only used when the audio file does not exist yet.
        in_file: path of the word list, rewritten in place.
        resources: folder where ``<word>.wav`` files are stored.

    Returns:
        A list of ``(entry, audio_path)`` tuples for the words that were found.
        If processing fails the original file content is restored and the
        results gathered so far are returned.
    """
    # read_text closes the file; the original left the handle open
    words_list = in_file.read_text(encoding="utf8").strip().split("\n")
    results = []
    try:
        with in_file.open("w", encoding="utf8") as input_file:
            for words in words_list:
                parts = words.split()
                if not parts:
                    # keep blank lines instead of failing on parts[0]
                    input_file.write("\n")
                    continue
                word = parts[0]
                pinyin = " ".join(parts[1:]) if len(parts) > 1 else None
                candidates = dictionary.get(word)
                if not candidates:
                    print("============================================")
                    print(f"===================>ERROR: {word} not found")
                    print("============================================")
                    input_file.write(f"{word}\tERROR\n")
                    continue
                entry = _select_entry(word, pinyin, candidates)
                audio_path = resources / f"{word}.wav"
                if not audio_path.exists():
                    audio = tts.generate(f"{word}", language_id="zh")
                    torchaudio.save(audio_path, audio, tts.sr)
                input_file.write(f"{word}\t{entry.pinyin}\n")
                results.append((entry, audio_path))
    except Exception as err:  # best-effort rollback of the half-written file
        with in_file.open("w", encoding="utf8") as input_file:
            input_file.write("\n".join(words_list))
        print(f"ERROR: processing {in_file} failed, file restored: {err!r}")
    return results
def translator_process(tts, resources, in_file):
    """Translate phrases and create one audio clip per phrase.

    The content of *in_file* is split on whitespace; every resulting token is
    treated as one phrase. Audio is generated only when ``N<index>.wav`` does
    not already exist in *resources*.

    Returns:
        A list of ``[translated, phrase, audio_path]`` lists, in file order.
    """
    # read_text closes the file; the original left the handle open
    text_list = in_file.read_text(encoding="utf8").strip().split()
    results = []
    for n, phrase in enumerate(text_list):
        phrase = phrase.strip()
        audio_path = resources / f"N{n}.wav"
        if not audio_path.exists():
            audio = tts.generate(f"{phrase}", language_id="zh")
            torchaudio.save(audio_path, audio, tts.sr)
        translated = argostranslate.translate.translate(phrase, CN, EN)
        results.append([translated, phrase, audio_path])
    return results
# def output_tsv(out_file, results):
# """writes the output as a tsv file"""
# final_file = out_file.parent / f"{out_file.stem}.tsv"
# with final_file.open("w", encoding="utf8", newline="") as csvfile:
# writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
# for entry in results:
# writer.writerow(
# [
# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
# PinyinToneConverter().convert_text(entry.pinyin),
# entry.simplified,
# entry.traditional,
# ]
# )
def output_anki_dictionary(out_file, results):
    """Write an Anki package built from dictionary results.

    *results* yields ``(entry, audio)`` pairs. The deck name is the output
    path relative to OUTPUT joined with ``::``, and the package is written
    next to *out_file* with the ``.apkg`` extension.
    """
    hierarchy = out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,)
    deck = Deck(random.randrange(1 << 30, 1 << 31), "::".join(hierarchy))
    package = Package(deck)
    media = []
    for entry, audio in results:
        numbered = [
            f"{position}. {meaning}"
            for position, meaning in enumerate(entry.meanings, start=1)
        ]
        fields = [
            "\n ".join(numbered),
            PinyinToneConverter().convert_text(entry.pinyin),
            entry.simplified,
            entry.traditional,
            f"[sound:{audio.name}]",
        ]
        deck.add_note(Note(model=HSK_MODEL, fields=fields))
        media.append(audio)
    package.media_files = media
    package.write_to_file(out_file.parent / f"{out_file.stem}.apkg")
def output_anki_phrase(out_file, results):
    """Write an Anki package built from translated phrases.

    *results* yields ``(translated, phrase, audio)`` triples. The deck name
    is the output path relative to OUTPUT joined with ``::``, and the package
    is written next to *out_file* with the ``.apkg`` extension.
    """
    hierarchy = out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,)
    deck = Deck(random.randrange(1 << 30, 1 << 31), "::".join(hierarchy))
    package = Package(deck)
    media = []
    for translated, phrase, audio in results:
        fields = [translated, phrase, f"[sound:{audio.name}]"]
        deck.add_note(Note(model=PHRASE_MODEL, fields=fields))
        media.append(audio)
    package.media_files = media
    package.write_to_file(out_file.parent / f"{out_file.stem}.apkg")
def main():
    """Interactive entry point: pick a file, process it, write the deck.

    Loops forever; each iteration asks for an input file and dispatches on
    its suffixes to the phrase or the dictionary pipeline.

    Raises:
        TypeError: when the selected file carries neither known type suffix.
    """
    tts = create_tts()
    dictionary = create_cedict()
    # create_translator requires both language codes; calling it without
    # arguments raised TypeError before any file could be processed.
    create_translator(CN, EN)
    while True:
        # the file type token returned by process_files is not needed here
        in_file, out_file, resources, _file_type = process_files()
        if PHRASES_TYPE in in_file.suffixes:
            results = translator_process(tts, resources, in_file)
            output_anki_phrase(out_file, results)
        elif DICT_TYPE in in_file.suffixes:
            results = dictionary_process(dictionary, tts, in_file, resources)
            output_anki_dictionary(out_file, results)
        else:
            raise TypeError("Error, filetype not especified!")


if __name__ == "__main__":
    main()

View File

@@ -1,88 +0,0 @@
# anki-models.py
from genanki import Deck, Note, Model, Package
# Constants
# Stylesheet passed as `css=` to every model below: `.card` is the base
# card style, while `.simple` and `.trad` size the blocks that wrap the
# Simplified and Traditional fields in the HSK templates.
CSS = """
.card {
font-family: arial;
font-size: 20px;
text-align: center;
color: black;
background-color: white;
}
.simple {
font-family: Arial;
font-size: 100px;
}
.trad {
font-family: Arial;
font-size: 75px;
}
"""
# Models
# Note type for translated phrases: three fields rendered as three cards
# (translation -> phrase, phrase -> translation, audio -> phrase).
PHRASE_MODEL = Model(
    2076166425,
    "Phrase Model",
    fields=[{"name": field_name} for field_name in ("Translated", "Phrase", "Audio")],
    templates=[
        dict(
            name="Card 1",
            qfmt="{{Translated}}<br>{{Audio}}",
            afmt='{{FrontSide}}<hr id="answer">{{Phrase}}',
        ),
        dict(
            name="Card 2",
            qfmt="{{Phrase}}<br>{{Audio}}",
            afmt='{{FrontSide}}<hr id="answer">{{Translated}}',
        ),
        dict(
            name="Card 3",
            qfmt="{{Audio}}",
            afmt='{{FrontSide}}<hr id="answer">{{Phrase}}',
        ),
    ],
    css=CSS,
)
# Note type for HSK vocabulary: five fields rendered as three cards
# (pinyin/meaning -> characters, characters -> pinyin/meaning, audio -> all).
HSK_MODEL = Model(
    1708536519,
    "HSK Model",
    fields=[
        {"name": "English"},
        {"name": "Pinyin"},
        {"name": "Simplified"},
        {"name": "Traditional"},
        {"name": "Audio"},
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": "<strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}",
            # the separator used to read id='answer'' (stray quote), which is
            # malformed HTML
            "afmt": "{{FrontSide}}<hr id='answer'><div class='simple'>"
            "{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
        },
        {
            "name": "Card 2",
            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
            "{{Traditional}}</div>",
            "afmt": (
                '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong>'
                "<br>{{English}}<br>{{Audio}}"
            ),
        },
        {
            "name": "Card 3",
            "qfmt": "{{Audio}}",
            # same stray-quote fix as Card 1
            "afmt": "{{FrontSide}}<hr id='answer'><strong>{{Pinyin}}</strong><br>"
            "<div class='simple'>{{Simplified}}</div><br>"
            "<div class='trad'>{{Traditional}}</div>",
        },
    ],
    css=CSS,
)

View File

@@ -1,19 +0,0 @@
## Imports
from pathlib import Path
import random
import importlib.resources
# Dictionary file shipped inside the package
_PACKAGE_FILES = importlib.resources.files("anki-hsk-creator")
CCCEDICT = _PACKAGE_FILES.joinpath("cedict_ts.u8")

# Working folders, all located under <parent of this package>/data
DATA = Path(__file__).parent.parent.joinpath("data")
INPUT, OUTPUT, RESOURCES = (
    DATA / folder_name for folder_name in ("input", "output", "resources")
)

# File Types (matched against the suffixes of the selected input file)
PHRASES_TYPE, DICT_TYPE = ".phrases", ".dictionary"

# Language codes
CN, EN = "zh", "en"

View File

@@ -1,53 +0,0 @@
from cedict_utils.cedict import CedictParser
import argostranslate.package
import argostranslate.translate
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
import torch
import torchaudio
from . import ARGOS_UPDATED, ARGOS_PACKAGES
from . import CCCEDICT
## Functions
def create_cedict(language_id="en"):
    """Parse the CC-CEDICT file and index its entries by simplified form.

    *language_id* is accepted but not used by this implementation.

    Returns:
        A dict mapping each simplified word to the list of entries sharing
        it, in file order.
    """
    cedict_parser = CedictParser()
    cedict_parser.read_file(CCCEDICT)
    by_simplified = {}
    for item in cedict_parser.parse():
        # homographs share one key, so every key holds a list
        by_simplified.setdefault(item.simplified, []).append(item)
    return by_simplified
def create_translator(from_code, to_code):
    """Download and install the Argos Translate package for a language pair.

    The package index is refreshed only on the first call; later calls reuse
    the cached list of available packages.

    Args:
        from_code: source language code.
        to_code: target language code.

    Raises:
        ValueError: when no package exists for the requested pair.
    """
    # Both names are assigned below; without `global` they became locals and
    # the first read raised UnboundLocalError.
    global ARGOS_UPDATED, ARGOS_PACKAGES
    if not ARGOS_UPDATED:
        argostranslate.package.update_package_index()
        ARGOS_PACKAGES = argostranslate.package.get_available_packages()
        ARGOS_UPDATED = True
    # Honour the requested pair; the original ignored its parameters.
    package_to_install = next(
        (
            package
            for package in ARGOS_PACKAGES
            if package.from_code == from_code and package.to_code == to_code
        ),
        None,
    )
    if package_to_install is None:
        raise ValueError(
            f"No Argos Translate package available for {from_code} -> {to_code}"
        )
    argostranslate.package.install_from_path(package_to_install.download())
def create_tts():
    """Load the multilingual TTS model on the best available device.

    CUDA is preferred, then Apple MPS, and the CPU is the fallback.
    """
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    return ChatterboxMultilingualTTS.from_pretrained(device=device, t3_model="v3")

View File

@@ -1,4 +1,5 @@
"""about.py"""
# SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com> # SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com>
# #
# SPDX-License-Identifier: GPL-3.0-or-later # SPDX-License-Identifier: GPL-3.0-or-later
__version__ = "0.0.1" __version__ = "0.1.0"

View File

@@ -0,0 +1,20 @@
"""anki_hsk_creator"""
# Standard Library
import os
from pathlib import Path
# Pip
from dotenv import load_dotenv
# Load variables from a local .env file before any of them is read
load_dotenv()

# Globals
# Hugging Face token for TTS generation; None when HF_TOKEN is not set
HF_TOKEN = os.getenv("HF_TOKEN")

# Path
# DATA_FOLDER may be overridden through the environment; the folder is
# created on import so the rest of the package can rely on it existing.
default_path = Path.home().joinpath("anki-hsk-creator-data")
DATA_FOLDER = Path(os.getenv("DATA_FOLDER", default_path))
DATA_FOLDER.mkdir(parents=True, exist_ok=True)

View File

@@ -0,0 +1,52 @@
"""__main__.py"""
# Standard Library
from pathlib import Path
# Local
from .api import list_input_files, process_a_file, select_file
from .constants import LANGUAGES
def cli_select_files():
    """Prompt until the user selects an input file and return its ProcessFile.

    Entries are listed relative to the input folder; selecting a folder
    descends into it. An empty sub-folder sends the user back to the top
    folder.

    Raises:
        FileNotFoundError: when the top input folder contains nothing.
    """
    print("Select data file:")
    level = Path()
    while True:
        files = list_input_files(level)
        if not files:
            # Nothing to choose from: the original prompted "[1-0]" forever.
            if level == Path():
                raise FileNotFoundError("No input files available to select")
            print(f"{level} is empty, returning to the top folder")
            level = Path()
            continue
        for n, file in enumerate(files):
            print(f"{n+1} - {file}")
        s = None
        # isdecimal() only accepts characters that int() can parse
        while not s or not s.isdecimal() or not (1 <= int(s) <= len(files)):
            s = input(f"Please select the file [1-{len(files)}]: ")
        selected = files[int(s) - 1]
        # The listed paths are relative to the input folder, so calling
        # selected.is_file() would test them against the current working
        # directory. select_file() resolves them against the input folder and
        # raises ValueError for anything that is not a file.
        try:
            return select_file(selected)
        except ValueError:
            level = selected
def cli_select_language():
    """Prompt until the user enters one of the available language codes.

    Returns:
        The selected language code.
    """
    # Read the language tables through an instance: this works whether they
    # are declared as properties or as plain class attributes.
    languages = LANGUAGES()
    available = languages.available_languages
    print("Select a language:")
    for language_id, language in languages.language_names.items():
        print(f"{language_id} - {language}")
    s = None
    while s not in available:
        s = input(f"Please select the language: ({ available})")
    return s
def main():
    """Run the interactive CLI.

    Repeats forever: choose an input file, choose a language, process the
    file for that language.
    """
    while True:
        chosen_file = cli_select_files()
        chosen_language = cli_select_language()
        process_a_file(chosen_file, chosen_language)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,159 @@
"""anki_generation.py
Produces anki output
"""
# Standard Library
import random
# Pip
from genanki import Deck, Model, Note, Package
# Local
from .utility import ProcessFile, TranslationResult
# from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
# Constants
# Stylesheet passed as `css=` to every model below: `.card` is the base
# card style, while `.simple` and `.trad` size the blocks that wrap the
# Simplified and Traditional fields in the HSK templates.
CSS = """
.card {
font-family: arial;
font-size: 20px;
text-align: center;
color: black;
background-color: white;
}
.simple {
font-family: Arial;
font-size: 100px;
}
.trad {
font-family: Arial;
font-size: 75px;
}
"""
# Models
# Note type for translated phrases: three fields rendered as three cards
# (translation -> phrase, phrase -> translation, audio -> phrase).
PHRASE_MODEL = Model(
    2076166425,
    "Phrase Model",
    fields=[{"name": field_name} for field_name in ("Translated", "Phrase", "Audio")],
    templates=[
        dict(
            name="Card 1",
            qfmt="{{Translated}}<br>{{Audio}}",
            afmt='{{FrontSide}}<hr id="answer">{{Phrase}}',
        ),
        dict(
            name="Card 2",
            qfmt="{{Phrase}}<br>{{Audio}}",
            afmt='{{FrontSide}}<hr id="answer">{{Translated}}',
        ),
        dict(
            name="Card 3",
            qfmt="{{Audio}}",
            afmt='{{FrontSide}}<hr id="answer">{{Phrase}}',
        ),
    ],
    css=CSS,
)
# Note type for HSK vocabulary: five fields rendered as three cards
# (pinyin/meaning -> characters, characters -> pinyin/meaning, audio -> all).
HSK_MODEL = Model(
    1708536519,
    "HSK Model",
    fields=[
        {"name": "English"},
        {"name": "Pinyin"},
        {"name": "Simplified"},
        {"name": "Traditional"},
        {"name": "Audio"},
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": "<strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}",
            # the separator used to read id='answer'' (stray quote); it now
            # matches the well-formed markup of Card 2 and Card 3
            "afmt": (
                "{{FrontSide}}<hr id='answer'><div class='simple'>{{Simplified}}</div>"
                "<br><div class='trad'>{{Traditional}}</div>"
            ),
        },
        {
            "name": "Card 2",
            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
            "{{Traditional}}</div>",
            "afmt": (
                "{{FrontSide}}<hr id='answer'><strong>{{Pinyin}}</strong>"
                "<br>{{English}}<br>{{Audio}}"
            ),
        },
        {
            "name": "Card 3",
            "qfmt": "{{Audio}}",
            "afmt": (
                "{{FrontSide}}<hr id='answer'><strong>{{Pinyin}}</strong>"
                "<br><div class='simple'>{{Simplified}}</div>"
                "<br><div class='trad'>{{Traditional}}</div>"
            ),
        },
    ],
    css=CSS,
)
# Process
# def output_anki_dictionary(out_file, results):
# """Creates an anki file from a dictionary results"""
# final_file = out_file.parent / f"{out_file.stem}.apkg"
# deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
# deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
# package = Package(deck)
# audios = []
# for entry, audio in results:
# note = Note(
# model=HSK_MODEL,
# fields=[
# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
# PinyinToneConverter().convert_text(entry.pinyin),
# entry.simplified,
# entry.traditional,
# f"[sound:{audio.name}]",
# ],
# )
# audios.append(audio)
# deck.add_note(note)
# package.media_files = audios
# package.write_to_file(final_file)
def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResult]):
    """Create an Anki package from translated phrase results.

    Args:
        process_file: the file being processed; its ``out_folder`` receives
            the package and its ``input_file`` path parts name the deck.
        results: one entry per phrase, providing ``translated``, ``line``
            and ``audio_path``.

    The package is written to
    ``<process_file.out_folder>/<output name>.apkg`` and bundles every audio
    file referenced by the notes.
    """
    # output_name is relative and ends with a separator dot; build the file
    # name explicitly instead of relying on with_suffix(), and write inside
    # the output folder that ProcessFile already created rather than
    # relative to the current working directory.
    apkg_name = f"{process_file.output_name.name.rstrip('.')}.apkg"
    final_file = process_file.out_folder / apkg_name
    # `input_fil` was a typo that raised AttributeError on every call
    deck_name = "::".join(
        process_file.input_file.parts[:-1] + (process_file.input_file.stem,)
    )
    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
    package = Package(deck)
    audios = []
    for result in results:
        note = Note(
            model=PHRASE_MODEL,
            fields=[
                result.translated,
                result.line,
                f"[sound:{result.audio_path.name}]",
            ],
        )
        deck.add_note(note)
        audios.append(result.audio_path)
    package.media_files = audios
    package.write_to_file(final_file)

View File

@@ -0,0 +1,72 @@
"""api.py
Interface for managing and processing files
"""
# Standard Library
from pathlib import Path
# Local
from . import DATA_FOLDER
from .anki_generation import output_anki_phrase
from .constants import DICT_TYPE, INPUT, LANGUAGES, PHRASES_TYPE
from .proccessor import translator_process
from .utility import TRANS, TTS, ProcessFile
# interface
def get_data_folder() -> Path:
    """Return the configured data folder of the package."""
    data_folder = DATA_FOLDER
    return data_folder
def list_input_files(search_path: Path = Path()) -> list[Path]:
    """List the direct children of *search_path* inside the input folder.

    Every returned path is relative to INPUT, so it can be passed back as
    *search_path* or handed to `select_file`.
    """
    found = []
    for entry in (INPUT / search_path).glob("*"):
        found.append(entry.relative_to(INPUT))
    return found
def select_file(file_path: Path) -> ProcessFile:
    """Wrap a path relative to the input folder into a ProcessFile.

    Raises:
        ValueError: when the path does not point to a regular file inside
            the input folder.
    """
    if not (INPUT / file_path).is_file():
        raise ValueError(f"{file_path} is not a file")
    return ProcessFile(file_path)
def create_input_file(
    name: str, file_type: str, text: str, sub_folder: Path = Path()
) -> ProcessFile:
    """Create an input file and return the ProcessFile that refers to it.

    The file is named ``<name><file_type>.txt`` and placed in *sub_folder*
    (created when missing) under the input folder; *text* is written to it
    as UTF-8.

    Valid file types: ".phrases", ".dictionary".

    Raises:
        ValueError: when *file_type* is not one of the valid types.
    """
    allowed_types = (PHRASES_TYPE, DICT_TYPE)
    if file_type not in allowed_types:
        raise ValueError(f"file_type {file_type} not in {allowed_types}")

    relative = sub_folder / f"{name}{file_type}.txt"

    # write file
    target = INPUT / relative
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(text, encoding="utf8")

    # the relative path is what later processing steps expect
    return ProcessFile(relative)
def process_a_file(process_file: ProcessFile, language_id: str):
    """Process an input file for the given target language.

    Phrase files are translated, voiced and exported as an Anki package.
    Dictionary files are recognised but not implemented yet.

    Args:
        process_file: the file to process; its ``language_id`` is set here.
        language_id: target language code for the translation.
    """
    process_file.language_id = language_id
    # Inspect every suffix: files are named "<name>.phrases.txt", so the last
    # suffix alone (".txt") never contained the type marker.
    suffixes = process_file.input_file.suffixes
    if PHRASES_TYPE in suffixes:
        TTS.create_tts()
        TRANS.create_translator(LANGUAGES.CN, language_id)
        # Input files are written as UTF-8, so read them back the same way
        # instead of depending on the platform default encoding.
        with process_file.absolute_input_file.open("r", encoding="utf8") as file:
            # blank lines carry no phrase to translate or voice
            text_lines = [stripped for line in file if (stripped := line.strip())]
        results = translator_process(text_lines, process_file, language_id)
        output_anki_phrase(process_file, results)
    elif DICT_TYPE in suffixes:
        print("not implemented")
    else:
        # report instead of silently doing nothing
        print(f"{process_file.input_file}: unsupported file type, nothing done")

View File

@@ -0,0 +1,51 @@
"""constants.py"""
# Standard Library
import importlib.resources
# Local
from . import DATA_FOLDER
# Resources
# The importable package is named with underscores (src/anki_hsk_creator);
# the hyphenated project name cannot be resolved by importlib.
CCCEDICT_PATH = importlib.resources.files("anki_hsk_creator").joinpath("cedict_ts.u8")
# Data folder structure (each folder is created on import)
INPUT = DATA_FOLDER / "input"
INPUT.mkdir(exist_ok=True, parents=True)
OUTPUT = DATA_FOLDER / "output"
OUTPUT.mkdir(exist_ok=True, parents=True)
RESOURCES = DATA_FOLDER / "resources"
RESOURCES.mkdir(exist_ok=True, parents=True)
# File Types (markers looked up in the input file name)
PHRASES_TYPE = ".phrases"
DICT_TYPE = ".dictionary"
class LANGUAGES:
    """Language codes and the languages available for translation.

    The class is used as a static namespace (``LANGUAGES.CN``,
    ``LANGUAGES.available_languages``), so the lookup tables are plain class
    attributes: a ``@property`` is only evaluated on instances and yields a
    bare property object when read from the class. Instance access keeps
    working as before.
    """

    CN = "zh"
    EN = "en"
    ES = "es"
    FR = "fr"
    RU = "ru"
    TR = "tr"
    TH = "th"

    # Codes that can be chosen as translation target (CN is the source)
    available_languages = (EN, ES, FR, RU, TR, TH)

    # Display name of every available language code
    language_names = {
        EN: "English",
        ES: "Spanish",
        FR: "French",
        RU: "Russian",
        TR: "Turkish",
        TH: "Thai",
    }

View File

@@ -0,0 +1,96 @@
"""processor.py"""
# Pip
import argostranslate.translate
import torchaudio
# Local
from .constants import LANGUAGES
from .utility import TTS, ProcessFile, TranslationResult # , CCCEDICT
# Results Classes
def translator_process(
    text_lines: list[str],
    process_file: ProcessFile,
    language_id: str,
) -> list[TranslationResult]:
    """Translate phrases or sentences and create one audio clip per line.

    Args:
        text_lines: the source lines; blank lines are skipped.
        process_file: provides the ``resources`` folder for the audio files.
        language_id: target language code of the translation.

    Returns:
        One TranslationResult per non-blank line, in input order. Audio is
        generated only when the numbered ``N###.wav`` file is missing.
    """
    results = []
    for n, line in enumerate(text_lines):
        line = line.strip()
        if not line:
            # nothing to voice or translate
            continue
        # zero-padded index; the former spec "{n::03.0n}" was not a valid
        # format specification and raised ValueError on the first line
        audio_path = process_file.resources / f"N{n:03d}.wav"
        if not audio_path.exists():
            audio = TTS.MODEL.generate(f"{line}", language_id=LANGUAGES.CN)
            torchaudio.save(audio_path, audio, TTS.MODEL.sr)
        translated = argostranslate.translate.translate(line, LANGUAGES.CN, language_id)
        results.append(TranslationResult(language_id, translated, line, audio_path))
    return results
# def dictionary_process(dictionary, tts, in_file, resources):
# """Process dictionary files"""
# words_list = in_file.open(encoding="utf8").read().strip().split("\n")
# results = []
# try:
# with in_file.open("w", encoding="utf8") as input_file:
# for words in words_list:
# word = words.split()[0]
# pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None
# if v := dictionary.get(word):
# if len(v) > 1:
# print(f"\nWARNING: {word} has multiple meanings:")
# if pinyin and pinyin != "ERROR":
# ml = list(filter(lambda x: x.pinyin == pinyin, v))
# else:
# ml = v
# if len(ml) > 1:
# for n, w in enumerate(ml):
# print(f"{n+1} - {w}")
# for m in w.meanings:
# print(f"\t{m}")
# s = None
# while (
# not s
# or not s.isnumeric()
# or not (1 <= int(s) <= len(v))
# ):
# s = input(
# f"Please select the correct word [1-{len(v)}]: "
# )
# v = v[int(s) - 1]
# else:
# v = ml[0]
# else:
# v = v[0]
# audio_path = resources / f"{word}.wav"
# if not audio_path.exists():
# audio = tts.generate(f"{word}。", language_id="zh")
# torchaudio.save(audio_path, audio, tts.sr)
# input_file.write(f"{word}\t{v.pinyin}\n")
# results.append((v, audio_path))
# else:
# print("============================================")
# print(f"===================>ERROR: {word} not found")
# print("============================================")
# input_file.write(f"{word}\tERROR\n")
# except Exception:
# with in_file.open("w", encoding="utf8") as input_file:
# input_file.write("\n".join(words_list))
# return results
# def output_tsv(out_file, results):
# """writes the output as a tsv file"""
# final_file = out_file.parent / f"{out_file.stem}.tsv"
# with final_file.open("w", encoding="utf8", newline="") as csvfile:
# writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
# for entry in results:
# writer.writerow(
# [
# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
# PinyinToneConverter().convert_text(entry.pinyin),
# entry.simplified,
# entry.traditional,
# ]
# )

View File

@@ -0,0 +1,154 @@
"""utility.py
Static classes and functions for general use
"""
# Standard Library
from pathlib import Path
# Pip
import argostranslate.package
import argostranslate.translate
import torch
from cedict_utils.cedict import CedictParser
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
# Local
from .constants import CCCEDICT_PATH, INPUT, LANGUAGES, OUTPUT, RESOURCES
# Static Classes
class TRANS:
    """Static class wrapping Argos Translate package management."""

    # True once the remote package index has been refreshed
    UPDATED = False
    # Cached list of packages available for download
    PACKAGES = None
    # (from_code, to_code) pairs already installed by this process, so
    # repeated calls do not download and install the same package again
    INSTALLED = set()

    @staticmethod
    def create_translator(from_code, to_code):
        """Download and install the Argos Translate package for a pair.

        Args:
            from_code: source language code.
            to_code: target language code.

        Raises:
            ValueError: when no package exists for the requested pair.
        """
        pair = (from_code, to_code)
        if pair in TRANS.INSTALLED:
            return
        if not TRANS.UPDATED:
            argostranslate.package.update_package_index()
            TRANS.PACKAGES = argostranslate.package.get_available_packages()
            TRANS.UPDATED = True
        package_to_install = next(
            (
                package
                for package in TRANS.PACKAGES
                if package.from_code == from_code and package.to_code == to_code
            ),
            None,
        )
        if package_to_install is None:
            # a bare next() would leak StopIteration to the caller
            raise ValueError(
                f"No Argos Translate package available for {from_code} -> {to_code}"
            )
        argostranslate.package.install_from_path(package_to_install.download())
        TRANS.INSTALLED.add(pair)
class CCCEDICT:
    """Static class for the CC-CEDICT dictionary."""

    # Parser and parsed entries, loaded once on first use
    PARSER = None
    ENTRIES = []
    # Cache of built dictionaries, keyed by language code
    DICTIONARY_LIST = {}

    @staticmethod
    def create_cedict(language_id=LANGUAGES.EN):
        """Return the dictionary for *language_id*, building it on first use.

        The result maps each simplified word to the list of entries sharing
        it. For languages other than English the meanings are translated
        from English on copies of the entries, so the cached English entries
        stay untouched.
        """
        import copy  # local import: only this method needs it

        if not CCCEDICT.PARSER:
            CCCEDICT.PARSER = CedictParser()
            CCCEDICT.PARSER.read_file(CCCEDICT_PATH)
            CCCEDICT.ENTRIES = CCCEDICT.PARSER.parse()
        if language_id in CCCEDICT.DICTIONARY_LIST:
            return CCCEDICT.DICTIONARY_LIST[language_id]

        needs_translation = language_id != LANGUAGES.EN
        if needs_translation:
            # install the language pair once, not once per entry
            TRANS.create_translator(LANGUAGES.EN, language_id)
        dictionary = {}
        for entry in CCCEDICT.ENTRIES:
            if needs_translation:
                # Passing the whole entry to translate() and then reading
                # .simplified from the returned text could not work.
                # NOTE(review): assumes entry.meanings is a list of English
                # strings, as elsewhere in this commit - confirm against
                # cedict_utils.
                entry = copy.copy(entry)
                entry.meanings = [
                    argostranslate.translate.translate(
                        meaning, LANGUAGES.EN, language_id
                    )
                    for meaning in entry.meanings
                ]
            dictionary.setdefault(entry.simplified, []).append(entry)
        CCCEDICT.DICTIONARY_LIST[language_id] = dictionary
        return dictionary
class TTS:
    """Static holder of the shared text-to-speech engine."""

    # Loaded model and the device it runs on; both are filled lazily
    MODEL = None
    DEVICE = None

    @staticmethod
    def create_tts():
        """Pick a device and load the TTS model, each at most once."""
        if TTS.DEVICE is None:
            # prefer CUDA, then Apple MPS, and fall back to the CPU
            if torch.cuda.is_available():
                detected = "cuda"
            elif torch.backends.mps.is_available():
                detected = "mps"
            else:
                detected = "cpu"
            TTS.DEVICE = detected
        if TTS.MODEL is not None:
            return
        TTS.MODEL = ChatterboxMultilingualTTS.from_pretrained(
            device=TTS.DEVICE, t3_model="v3"
        )
# Classes
class ProcessFile:
"""Class that represents a file to processs
diferent input files has direfent process_files depending on language
"""
def __init__(self, input_file: Path, language_id: str = None):
self.input_file = input_file
self._language_id = language_id
# process file type
self.out_folder = OUTPUT / input_file.parent
self.out_folder.mkdir(parents=True, exist_ok=True)
resources = RESOURCES / input_file
self.resources = resources.parent / resources.stem
self.resources.mkdir(parents=True, exist_ok=True)
@property
def absolute_input_file(self):
"""Absolute input file"""
return INPUT / self.input_file
@property
def language_id(self):
"""language for this trasnlation process"""
return self._language_id
@language_id.setter
def language_id(self, value):
self._language_id = value
@property
def output_name(self):
"""Posible name for the output file, still missing the filetype"""
if self.language_id is None:
raise ValueError("Not a valid language selected")
return self.input_file.parent / f"{self.input_file.stem}.{self.language_id})."
class TranslationResult:
    """One translated line together with the audio file of its source text."""

    def __init__(
        self,
        language_id: str,
        translated: str,
        line: str,
        audio_path: Path,
    ):
        # source text and its spoken rendition
        self.line, self.audio_path = line, audio_path
        # translation and the language it was translated into
        self.translated, self.language_id = translated, language_id