version 0.1
This commit is contained in:
@@ -34,6 +34,7 @@ dependencies = [
|
|||||||
"torch",
|
"torch",
|
||||||
"torchaudio",
|
"torchaudio",
|
||||||
"torchcodec",
|
"torchcodec",
|
||||||
|
"python-dotenv",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
@@ -41,7 +42,9 @@ dev = [
|
|||||||
"pytest",
|
"pytest",
|
||||||
"black",
|
"black",
|
||||||
"pylint",
|
"pylint",
|
||||||
"flakehell"
|
"flake8",
|
||||||
|
"flake8-pyproject",
|
||||||
|
# "flakeheaven",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
@@ -50,17 +53,18 @@ Issues = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator/issues"
|
|||||||
Source = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator"
|
Source = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator"
|
||||||
|
|
||||||
[tool.hatch.version]
|
[tool.hatch.version]
|
||||||
path = "src/anki-hsk-creator/__about__.py"
|
path = "src/anki_hsk_creator/__about__.py"
|
||||||
|
|
||||||
[tool.hatch.build.targets.sdist]
|
[tool.hatch.build.targets.sdist]
|
||||||
|
packages = ["src/anki_hsk_creator"]
|
||||||
include = [
|
include = [
|
||||||
"src/anki-hsk-creator/cedict_ts.u8",
|
"src/anki_hsk_creator/cedict_ts.u8",
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.hatch.build.targets.wheel]
|
[tool.hatch.build.targets.wheel]
|
||||||
packages = ["src/anki-hsk-creator"]
|
packages = ["src/anki_hsk_creator"]
|
||||||
include = [
|
include = [
|
||||||
"src/anki-hsk-creator/cedict_ts.u8",
|
"src/anki_hsk_creator/cedict_ts.u8",
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.hatch.envs.default]
|
[tool.hatch.envs.default]
|
||||||
@@ -69,7 +73,8 @@ extra-dependencies = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[tool.hatch.envs.default.scripts]
|
[tool.hatch.envs.default.scripts]
|
||||||
format = "black --target-version=py314 anki-hsk-creator tests && isort anki-hsk-creator tests"
|
format = "black --target-version=py314 src tests && isort src tests"
|
||||||
|
lint = "flake8 src"
|
||||||
|
|
||||||
[tool.hatch.envs.types]
|
[tool.hatch.envs.types]
|
||||||
extra-dependencies = [
|
extra-dependencies = [
|
||||||
@@ -80,16 +85,16 @@ extra-dependencies = [
|
|||||||
check = "mypy --install-types --non-interactive {args:src/anki-hsk-creator tests}"
|
# Type-check the renamed package directory (underscores, not hyphens) and the tests.
check = "mypy --install-types --non-interactive {args:src/anki_hsk_creator tests}"
|
||||||
|
|
||||||
[tool.coverage.run]
|
[tool.coverage.run]
|
||||||
source_pkgs = ["anki-hsk-creator", "tests"]
|
# source_pkgs takes importable package names; "src" is only the layout directory.
source_pkgs = ["anki_hsk_creator", "tests"]
|
||||||
branch = true
|
branch = true
|
||||||
parallel = true
|
parallel = true
|
||||||
omit = [
|
omit = [
|
||||||
"src/anki-hsk-creator/__about__.py",
|
"src/anki_hsk_creator/__about__.py",
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.coverage.paths]
|
[tool.coverage.paths]
|
||||||
anki-hsk-creator = ["src/anki-hsk-creator", "*/anki-hsk-creator/src/anki-hsk-creator"]
|
anki-hsk-creator = ["src/anki_hsk_creator", "*/anki-hsk-creator/src/anki_hsk_creator"]
|
||||||
tests = ["tests", "*src/anki-hsk-creator/tests"]
|
tests = ["tests", "*src/anki_hsk_creator/tests"]
|
||||||
|
|
||||||
[tool.coverage.report]
|
[tool.coverage.report]
|
||||||
exclude_lines = [
|
exclude_lines = [
|
||||||
@@ -111,6 +116,7 @@ exclude = '''
|
|||||||
)
|
)
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
|
||||||
[tool.isort]
|
[tool.isort]
|
||||||
src_paths = ["src", "test"]
|
# Keep in sync with the directories passed to the format script ("src tests").
src_paths = ["src", "tests"]
|
||||||
skip_glob = [".git", "__pycache__", ".vscode", "*venv", "build", "dist", "old", "*.egg-info"]
|
skip_glob = [".git", "__pycache__", ".vscode", "*venv", "build", "dist", "old", "*.egg-info"]
|
||||||
@@ -145,9 +151,8 @@ msg-template="{path}:{module}:{line}: [{msg_id}({symbol}), {obj}] {msg}"
|
|||||||
logging-format-style="new"
|
logging-format-style="new"
|
||||||
logging-modules="logging"
|
logging-modules="logging"
|
||||||
|
|
||||||
[tool.flakehell]
|
[tool.flake8]
|
||||||
max_line_length = 88
|
max_line_length = 88
|
||||||
format = "grouped"
|
|
||||||
show_source = false
|
show_source = false
|
||||||
exclude = [
|
exclude = [
|
||||||
".git",
|
".git",
|
||||||
@@ -160,8 +165,29 @@ exclude = [
|
|||||||
"*.egg-info",
|
"*.egg-info",
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.flakehell.plugins]
|
[tool.flake8.plugins]
|
||||||
mccabe = ["+C*"]
|
mccabe = ["+C*"]
|
||||||
pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"]
|
pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"]
|
||||||
pyflakes = ["+F*"]
|
pyflakes = ["+F*"]
|
||||||
flake8-bugbear = ["+*", "+B950"]
|
flake8-bugbear = ["+*", "+B950"]
|
||||||
|
|
||||||
|
# [tool.flakeheaven]
|
||||||
|
# max_line_length = 88
|
||||||
|
# format = "grouped"
|
||||||
|
# show_source = false
|
||||||
|
# exclude = [
|
||||||
|
# ".git",
|
||||||
|
# "__pycache__",
|
||||||
|
# ".vscode",
|
||||||
|
# "*venv",
|
||||||
|
# "build",
|
||||||
|
# "dist",
|
||||||
|
# "old",
|
||||||
|
# "*.egg-info",
|
||||||
|
# ]
|
||||||
|
|
||||||
|
# [tool.flakeheaven.plugins]
|
||||||
|
# mccabe = ["+C*"]
|
||||||
|
# pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"]
|
||||||
|
# pyflakes = ["+F*"]
|
||||||
|
# flake8-bugbear = ["+*", "+B950"]
|
||||||
|
|||||||
@@ -1,9 +0,0 @@
|
|||||||
"""anki-hsk-creator"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
# Globals
|
|
||||||
|
|
||||||
os.environ["HF_TOKEN"] = "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"
|
|
||||||
ARGOS_UPDATED = False
|
|
||||||
ARGOS_PACKAGES = None
|
|
||||||
@@ -1,184 +0,0 @@
|
|||||||
## Imports
|
|
||||||
from pathlib import Path
|
|
||||||
import random
|
|
||||||
import csv
|
|
||||||
|
|
||||||
## PIP
|
|
||||||
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
|
|
||||||
|
|
||||||
## Main
|
|
||||||
|
|
||||||
|
|
||||||
def process_files():
|
|
||||||
print("Select data file:")
|
|
||||||
in_file = None
|
|
||||||
level = INPUT
|
|
||||||
while not in_file:
|
|
||||||
files = []
|
|
||||||
for n, file in enumerate(level.glob("*")):
|
|
||||||
files.append(file)
|
|
||||||
print(f"{n+1} - {file.relative_to(INPUT)}")
|
|
||||||
s = None
|
|
||||||
while not s or not s.isnumeric() or not (1 <= int(s) <= len(files)):
|
|
||||||
s = input(f"Please select the file [1-{len(files)}]: ")
|
|
||||||
selected = files[int(s) - 1]
|
|
||||||
if selected.is_file():
|
|
||||||
in_file = selected
|
|
||||||
else:
|
|
||||||
level = selected
|
|
||||||
relative = in_file.relative_to(INPUT)
|
|
||||||
out_file = OUTPUT / relative
|
|
||||||
resources = RESOURCES / relative
|
|
||||||
resources = resources.parent / resources.stem
|
|
||||||
resources.mkdir(parents=True, exist_ok=True)
|
|
||||||
out_file.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
with in_file.open(encoding="utf8") as input_file:
|
|
||||||
file_type = input_file.read().split()[0]
|
|
||||||
return in_file, out_file, resources, file_type
|
|
||||||
|
|
||||||
|
|
||||||
def dictionary_process(dictionary, tts, in_file, resources):
|
|
||||||
"""Process dictionary files"""
|
|
||||||
words_list = in_file.open(encoding="utf8").read().strip().split("\n")
|
|
||||||
results = []
|
|
||||||
try:
|
|
||||||
with in_file.open("w", encoding="utf8") as input_file:
|
|
||||||
for words in words_list:
|
|
||||||
word = words.split()[0]
|
|
||||||
pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None
|
|
||||||
if v := dictionary.get(word):
|
|
||||||
if len(v) > 1:
|
|
||||||
print(f"\nWARNING: {word} has multiple meanings:")
|
|
||||||
if pinyin and pinyin != "ERROR":
|
|
||||||
ml = list(filter(lambda x: x.pinyin == pinyin, v))
|
|
||||||
else:
|
|
||||||
ml = v
|
|
||||||
if len(ml) > 1:
|
|
||||||
for n, w in enumerate(ml):
|
|
||||||
print(f"{n+1} - {w}")
|
|
||||||
for m in w.meanings:
|
|
||||||
print(f"\t{m}")
|
|
||||||
s = None
|
|
||||||
while (
|
|
||||||
not s
|
|
||||||
or not s.isnumeric()
|
|
||||||
or not (1 <= int(s) <= len(v))
|
|
||||||
):
|
|
||||||
s = input(
|
|
||||||
f"Please select the correct word [1-{len(v)}]: "
|
|
||||||
)
|
|
||||||
v = v[int(s) - 1]
|
|
||||||
else:
|
|
||||||
v = ml[0]
|
|
||||||
else:
|
|
||||||
v = v[0]
|
|
||||||
audio_path = resources / f"{word}.wav"
|
|
||||||
if not audio_path.exists():
|
|
||||||
audio = tts.generate(f"{word}。", language_id="zh")
|
|
||||||
torchaudio.save(audio_path, audio, tts.sr)
|
|
||||||
input_file.write(f"{word}\t{v.pinyin}\n")
|
|
||||||
results.append((v, audio_path))
|
|
||||||
else:
|
|
||||||
print("============================================")
|
|
||||||
print(f"===================>ERROR: {word} not found")
|
|
||||||
print("============================================")
|
|
||||||
input_file.write(f"{word}\tERROR\n")
|
|
||||||
except Exception:
|
|
||||||
with in_file.open("w", encoding="utf8") as input_file:
|
|
||||||
input_file.write("\n".join(words_list))
|
|
||||||
return results
|
|
||||||
|
|
||||||
|
|
||||||
def translator_process(tts, resources, in_file):
|
|
||||||
"""Process for phases trasnlation"""
|
|
||||||
text_list = in_file.open(encoding="utf8").read().strip().split()
|
|
||||||
results = []
|
|
||||||
for n, phrase in enumerate(text_list):
|
|
||||||
phrase = phrase.strip()
|
|
||||||
audio_path = resources / f"N{n}.wav"
|
|
||||||
if not audio_path.exists():
|
|
||||||
audio = tts.generate(f"{phrase}。", language_id="zh")
|
|
||||||
torchaudio.save(audio_path, audio, tts.sr)
|
|
||||||
translated = argostranslate.translate.translate(phrase, CN, EN)
|
|
||||||
results.append([translated, phrase, audio_path])
|
|
||||||
return results
|
|
||||||
|
|
||||||
|
|
||||||
# def output_tsv(out_file, results):
|
|
||||||
# """writes the output as a tsv file"""
|
|
||||||
# final_file = out_file.parent / f"{out_file.stem}.tsv"
|
|
||||||
# with final_file.open("w", encoding="utf8", newline="") as csvfile:
|
|
||||||
# writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
|
|
||||||
# for entry in results:
|
|
||||||
# writer.writerow(
|
|
||||||
# [
|
|
||||||
# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
|
|
||||||
# PinyinToneConverter().convert_text(entry.pinyin),
|
|
||||||
# entry.simplified,
|
|
||||||
# entry.traditional,
|
|
||||||
# ]
|
|
||||||
# )
|
|
||||||
|
|
||||||
|
|
||||||
def output_anki_dictionary(out_file, results):
|
|
||||||
final_file = out_file.parent / f"{out_file.stem}.apkg"
|
|
||||||
deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
|
|
||||||
deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
|
|
||||||
package = Package(deck)
|
|
||||||
audios = []
|
|
||||||
for entry, audio in results:
|
|
||||||
note = Note(
|
|
||||||
model=HSK_MODEL,
|
|
||||||
fields=[
|
|
||||||
"\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
|
|
||||||
PinyinToneConverter().convert_text(entry.pinyin),
|
|
||||||
entry.simplified,
|
|
||||||
entry.traditional,
|
|
||||||
f"[sound:{audio.name}]",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
audios.append(audio)
|
|
||||||
deck.add_note(note)
|
|
||||||
package.media_files = audios
|
|
||||||
package.write_to_file(final_file)
|
|
||||||
|
|
||||||
|
|
||||||
def output_anki_phrase(out_file, results):
|
|
||||||
final_file = out_file.parent / f"{out_file.stem}.apkg"
|
|
||||||
deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
|
|
||||||
deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
|
|
||||||
package = Package(deck)
|
|
||||||
audios = []
|
|
||||||
for translated, phrase, audio in results:
|
|
||||||
note = Note(
|
|
||||||
model=PHRASE_MODEL,
|
|
||||||
fields=[
|
|
||||||
translated,
|
|
||||||
phrase,
|
|
||||||
f"[sound:{audio.name}]",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
deck.add_note(note)
|
|
||||||
audios.append(audio)
|
|
||||||
package.media_files = audios
|
|
||||||
package.write_to_file(final_file)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
tts = create_tts()
|
|
||||||
dictionary = create_cedict()
|
|
||||||
create_translator()
|
|
||||||
while True:
|
|
||||||
in_file, out_file, resources, file_type = process_files()
|
|
||||||
if PHRASES_TYPE in in_file.suffixes:
|
|
||||||
results = translator_process(tts, resources, in_file)
|
|
||||||
output_anki_phrase(out_file, results)
|
|
||||||
elif DICT_TYPE in in_file.suffixes:
|
|
||||||
results = dictionary_process(dictionary, tts, in_file, resources)
|
|
||||||
output_anki_dictionary(out_file, results)
|
|
||||||
else:
|
|
||||||
raise TypeError("Error, filetype not especified!")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -1,88 +0,0 @@
|
|||||||
# anki-models.py
|
|
||||||
|
|
||||||
from genanki import Deck, Note, Model, Package
|
|
||||||
|
|
||||||
|
|
||||||
# Constants
|
|
||||||
|
|
||||||
CSS = """
|
|
||||||
.card {
|
|
||||||
font-family: arial;
|
|
||||||
font-size: 20px;
|
|
||||||
text-align: center;
|
|
||||||
color: black;
|
|
||||||
background-color: white;
|
|
||||||
}
|
|
||||||
.simple {
|
|
||||||
font-family: Arial;
|
|
||||||
font-size: 100px;
|
|
||||||
}
|
|
||||||
.trad {
|
|
||||||
font-family: Arial;
|
|
||||||
font-size: 75px;
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Models
|
|
||||||
|
|
||||||
PHRASE_MODEL = Model(
|
|
||||||
2076166425,
|
|
||||||
"Phrase Model",
|
|
||||||
fields=[
|
|
||||||
{"name": "Translated"},
|
|
||||||
{"name": "Phrase"},
|
|
||||||
{"name": "Audio"},
|
|
||||||
],
|
|
||||||
templates=[
|
|
||||||
{
|
|
||||||
"name": "Card 1",
|
|
||||||
"qfmt": "{{Translated}}<br>{{Audio}}",
|
|
||||||
"afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Card 2",
|
|
||||||
"qfmt": "{{Phrase}}<br>{{Audio}}",
|
|
||||||
"afmt": '{{FrontSide}}<hr id="answer">{{Translated}}',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Card 3",
|
|
||||||
"qfmt": "{{Audio}}",
|
|
||||||
"afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
|
|
||||||
},
|
|
||||||
],
|
|
||||||
css=CSS,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
HSK_MODEL = Model(
|
|
||||||
1708536519,
|
|
||||||
"HSK Model",
|
|
||||||
fields=[
|
|
||||||
{"name": "English"},
|
|
||||||
{"name": "Pinyin"},
|
|
||||||
{"name": "Simplified"},
|
|
||||||
{"name": "Traditional"},
|
|
||||||
{"name": "Audio"},
|
|
||||||
],
|
|
||||||
templates=[
|
|
||||||
{
|
|
||||||
"name": "Card 1",
|
|
||||||
"qfmt": "<strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}",
|
|
||||||
"afmt": "{{FrontSide}}<hr id='answer''><div class='simple'>"
|
|
||||||
"{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Card 2",
|
|
||||||
"qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
|
|
||||||
"{{Traditional}}</div>",
|
|
||||||
"afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Card 3",
|
|
||||||
"qfmt": "{{Audio}}",
|
|
||||||
"afmt": "{{FrontSide}}<hr id='answer''><strong>{{Pinyin}}</strong><br><div class='simple'>"
|
|
||||||
"{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
|
|
||||||
},
|
|
||||||
],
|
|
||||||
css=CSS,
|
|
||||||
)
|
|
||||||
@@ -1,19 +0,0 @@
|
|||||||
## Imports
|
|
||||||
from pathlib import Path
|
|
||||||
import random
|
|
||||||
import importlib.resources
|
|
||||||
|
|
||||||
CCCEDICT = importlib.resources.files("anki-hsk-creator").joinpath("cedict_ts.u8")
|
|
||||||
|
|
||||||
DATA = Path(__file__).parent.parent / "data"
|
|
||||||
INPUT = DATA / "input"
|
|
||||||
OUTPUT = DATA / "output"
|
|
||||||
RESOURCES = DATA / "resources"
|
|
||||||
|
|
||||||
# File Types
|
|
||||||
PHRASES_TYPE = ".phrases"
|
|
||||||
DICT_TYPE = ".dictionary"
|
|
||||||
|
|
||||||
# Language codes
|
|
||||||
CN = "zh"
|
|
||||||
EN = "en"
|
|
||||||
@@ -1,53 +0,0 @@
|
|||||||
from cedict_utils.cedict import CedictParser
|
|
||||||
import argostranslate.package
|
|
||||||
import argostranslate.translate
|
|
||||||
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import torchaudio
|
|
||||||
|
|
||||||
from . import ARGOS_UPDATED, ARGOS_PACKAGES
|
|
||||||
from . import CCCEDICT
|
|
||||||
|
|
||||||
## Functions
|
|
||||||
|
|
||||||
|
|
||||||
def create_cedict(language_id="en"):
|
|
||||||
"""Creates a create_cedict dictionary object"""
|
|
||||||
|
|
||||||
parser = CedictParser()
|
|
||||||
parser.read_file(CCCEDICT)
|
|
||||||
entries = parser.parse()
|
|
||||||
|
|
||||||
dictionary = {}
|
|
||||||
for entry in entries:
|
|
||||||
if entry.simplified not in dictionary:
|
|
||||||
dictionary[entry.simplified] = [entry]
|
|
||||||
else:
|
|
||||||
dictionary[entry.simplified].append(entry)
|
|
||||||
|
|
||||||
return dictionary
|
|
||||||
|
|
||||||
|
|
||||||
def create_translator(from_code, to_code):
|
|
||||||
"""Download and install Argos Translate package"""
|
|
||||||
if not ARGOS_UPDATED:
|
|
||||||
argostranslate.package.update_package_index()
|
|
||||||
ARGOS_PACKAGES = argostranslate.package.get_available_packages()
|
|
||||||
ARGOS_UPDATED = True
|
|
||||||
package_to_install = next(
|
|
||||||
filter(lambda x: x.from_code == CN and x.to_code == EN, ARGOS_PACKAGES)
|
|
||||||
)
|
|
||||||
argostranslate.package.install_from_path(package_to_install.download())
|
|
||||||
|
|
||||||
|
|
||||||
def create_tts():
|
|
||||||
# Automatically detect the best available device
|
|
||||||
if torch.cuda.is_available():
|
|
||||||
device = "cuda"
|
|
||||||
elif torch.backends.mps.is_available():
|
|
||||||
device = "mps"
|
|
||||||
else:
|
|
||||||
device = "cpu"
|
|
||||||
tts = ChatterboxMultilingualTTS.from_pretrained(device=device, t3_model="v3")
|
|
||||||
return tts
|
|
||||||
@@ -1,4 +1,5 @@
|
|||||||
|
"""about.py"""
|
||||||
# SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com>
|
# SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com>
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
__version__ = "0.0.1"
|
__version__ = "0.1.0"
|
||||||
20
src/anki_hsk_creator/__init__.py
Normal file
20
src/anki_hsk_creator/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
"""anki_hsk_creator"""
|
||||||
|
|
||||||
|
# Standard Library
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Pip
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
# Globals
|
||||||
|
|
||||||
|
# Get an HF_TOKEN from huggingface for TTS generation
|
||||||
|
HF_TOKEN = os.environ.get("HF_TOKEN")
|
||||||
|
|
||||||
|
# Path
|
||||||
|
default_path = Path.home() / "anki-hsk-creator-data"
|
||||||
|
DATA_FOLDER = Path(os.environ.get("DATA_FOLDER", default_path))
|
||||||
|
DATA_FOLDER.mkdir(exist_ok=True, parents=True)
|
||||||
52
src/anki_hsk_creator/__main__.py
Normal file
52
src/anki_hsk_creator/__main__.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
"""__main__.py"""
|
||||||
|
|
||||||
|
# Standard Library
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Local
|
||||||
|
from .api import list_input_files, process_a_file, select_file
|
||||||
|
from .constants import LANGUAGES
|
||||||
|
|
||||||
|
|
||||||
|
def cli_select_files():
    """Interactively browse the input folder until a file is chosen.

    Lists the entries returned by ``list_input_files`` for the current level,
    asks the user for a 1-based index and either returns the selection (when it
    is a file) or descends into it (when it is not).

    Returns:
        The object returned by ``select_file`` for the chosen file.

    Raises:
        FileNotFoundError: if the top level of the input folder has no entries.
    """
    print("Select data file:")
    root = Path()
    level = root
    while True:
        files = list_input_files(level)
        if not files:
            # Nothing to choose from: the index prompt below could never be
            # satisfied, so bail out (top level) or go back to the top level.
            if level == root:
                raise FileNotFoundError("No input files available")
            level = root
            continue
        for n, file in enumerate(files, start=1):
            print(f"{n} - {file}")
        s = None
        # isdecimal() only accepts characters int() can parse (isnumeric()
        # also accepts e.g. superscripts, which make int() raise).
        while not s or not s.isdecimal() or not 1 <= int(s) <= len(files):
            s = input(f"Please select the file [1-{len(files)}]: ")
        selected = files[int(s) - 1]
        # The listed paths are relative to the input folder, so calling
        # is_file() on them directly would resolve against the current working
        # directory. select_file() checks against the input folder and raises
        # ValueError for anything that is not a file.
        try:
            return select_file(selected)
        except ValueError:
            level = selected
|
||||||
|
|
||||||
|
|
||||||
|
def cli_select_language():
    """Prompt for the target language of the translation.

    Prints every available language code with its name and keeps asking until
    the user types one of the available codes.

    Returns:
        The selected language code.
    """
    # Read the tables through an instance: on the class itself a ``property``
    # is just a descriptor object, not the tuple/dict it computes.
    languages = LANGUAGES()
    available = languages.available_languages
    print("Select a language:")
    for language_id, language in languages.language_names.items():
        print(f"{language_id} - {language}")
    s = None
    while s not in available:
        s = input(f"Please select the language: ({', '.join(available)})").strip()
    return s
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI interface for the module.

    Repeats "select a file, select a language, process it" until the user
    interrupts the program (Ctrl-C) or closes standard input.
    """
    try:
        while True:
            input_file = cli_select_files()
            language_id = cli_select_language()
            process_a_file(input_file, language_id)
    except (KeyboardInterrupt, EOFError):
        # The loop has no other exit; leave quietly instead of with a traceback.
        print()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
159
src/anki_hsk_creator/anki_generation.py
Normal file
159
src/anki_hsk_creator/anki_generation.py
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
"""anki_generation.py
|
||||||
|
|
||||||
|
Produces anki output
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Standard Library
|
||||||
|
import random
|
||||||
|
|
||||||
|
# Pip
|
||||||
|
from genanki import Deck, Model, Note, Package
|
||||||
|
|
||||||
|
# Local
|
||||||
|
from .utility import ProcessFile, TranslationResult
|
||||||
|
|
||||||
|
# from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
|
||||||
|
|
||||||
|
|
||||||
|
# Constants
|
||||||
|
|
||||||
|
CSS = """
|
||||||
|
.card {
|
||||||
|
font-family: arial;
|
||||||
|
font-size: 20px;
|
||||||
|
text-align: center;
|
||||||
|
color: black;
|
||||||
|
background-color: white;
|
||||||
|
}
|
||||||
|
.simple {
|
||||||
|
font-family: Arial;
|
||||||
|
font-size: 100px;
|
||||||
|
}
|
||||||
|
.trad {
|
||||||
|
font-family: Arial;
|
||||||
|
font-size: 75px;
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Models
|
||||||
|
|
||||||
|
PHRASE_MODEL = Model(
|
||||||
|
2076166425,
|
||||||
|
"Phrase Model",
|
||||||
|
fields=[
|
||||||
|
{"name": "Translated"},
|
||||||
|
{"name": "Phrase"},
|
||||||
|
{"name": "Audio"},
|
||||||
|
],
|
||||||
|
templates=[
|
||||||
|
{
|
||||||
|
"name": "Card 1",
|
||||||
|
"qfmt": "{{Translated}}<br>{{Audio}}",
|
||||||
|
"afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Card 2",
|
||||||
|
"qfmt": "{{Phrase}}<br>{{Audio}}",
|
||||||
|
"afmt": '{{FrontSide}}<hr id="answer">{{Translated}}',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Card 3",
|
||||||
|
"qfmt": "{{Audio}}",
|
||||||
|
"afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
|
||||||
|
},
|
||||||
|
],
|
||||||
|
css=CSS,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Note model for dictionary entries: five fields and three card templates
# (pinyin/English -> characters, characters -> pinyin/English, audio -> all).
HSK_MODEL = Model(
    1708536519,
    "HSK Model",
    fields=[
        {"name": "English"},
        {"name": "Pinyin"},
        {"name": "Simplified"},
        {"name": "Traditional"},
        {"name": "Audio"},
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": "<strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}",
            # The hr id previously read id='answer'' (stray quote), which is
            # malformed HTML; it now matches the other two cards.
            "afmt": (
                "{{FrontSide}}<hr id='answer'><div class='simple'>{{Simplified}}</div>"
                "<br><div class='trad'>{{Traditional}}</div>"
            ),
        },
        {
            "name": "Card 2",
            "qfmt": (
                "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
                "{{Traditional}}</div>"
            ),
            "afmt": (
                "{{FrontSide}}<hr id='answer'><strong>{{Pinyin}}</strong>"
                "<br>{{English}}<br>{{Audio}}"
            ),
        },
        {
            "name": "Card 3",
            "qfmt": "{{Audio}}",
            "afmt": (
                "{{FrontSide}}<hr id='answer'><strong>{{Pinyin}}</strong>"
                "<br><div class='simple'>{{Simplified}}</div>"
                "<br><div class='trad'>{{Traditional}}</div>"
            ),
        },
    ],
    css=CSS,
)
|
||||||
|
|
||||||
|
# Proccess
|
||||||
|
|
||||||
|
|
||||||
|
# def output_anki_dictionary(out_file, results):
|
||||||
|
# """Creates an anki file from a dictionary results"""
|
||||||
|
# final_file = out_file.parent / f"{out_file.stem}.apkg"
|
||||||
|
# deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
|
||||||
|
# deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
|
||||||
|
# package = Package(deck)
|
||||||
|
# audios = []
|
||||||
|
# for entry, audio in results:
|
||||||
|
# note = Note(
|
||||||
|
# model=HSK_MODEL,
|
||||||
|
# fields=[
|
||||||
|
# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
|
||||||
|
# PinyinToneConverter().convert_text(entry.pinyin),
|
||||||
|
# entry.simplified,
|
||||||
|
# entry.traditional,
|
||||||
|
# f"[sound:{audio.name}]",
|
||||||
|
# ],
|
||||||
|
# )
|
||||||
|
# audios.append(audio)
|
||||||
|
# deck.add_note(note)
|
||||||
|
# package.media_files = audios
|
||||||
|
# package.write_to_file(final_file)
|
||||||
|
|
||||||
|
|
||||||
|
def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResult]):
    """Write an Anki package (.apkg) built from phrase translation results.

    Args:
        process_file: supplies ``output_name`` (the package path, its suffix
            replaced by ``.apkg``) and ``input_file`` (used for the deck name).
        results: one entry per phrase; ``translated``, ``line`` and
            ``audio_path`` of each entry become one note.
    """
    final_file = process_file.output_name.with_suffix(".apkg")
    # Deck name: folders of the input path plus the file stem, joined by "::".
    # (The original read ``process_file.input_fil`` here - an AttributeError.)
    source = process_file.input_file
    deck_name = "::".join(source.parts[:-1] + (source.stem,))
    # A new random deck id is drawn on every run.
    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
    package = Package(deck)
    for result in results:
        deck.add_note(
            Note(
                model=PHRASE_MODEL,
                fields=[
                    result.translated,
                    result.line,
                    # Notes reference media by bare file name.
                    f"[sound:{result.audio_path.name}]",
                ],
            )
        )
    package.media_files = [result.audio_path for result in results]
    package.write_to_file(final_file)
|
||||||
72
src/anki_hsk_creator/api.py
Normal file
72
src/anki_hsk_creator/api.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
"""api.py
|
||||||
|
|
||||||
|
Interface for managing and processing files
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Standard Library
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Local
|
||||||
|
from . import DATA_FOLDER
|
||||||
|
from .anki_generation import output_anki_phrase
|
||||||
|
from .constants import DICT_TYPE, INPUT, LANGUAGES, PHRASES_TYPE
|
||||||
|
from .proccessor import translator_process
|
||||||
|
from .utility import TRANS, TTS, ProcessFile
|
||||||
|
|
||||||
|
# interface
|
||||||
|
|
||||||
|
|
||||||
|
def get_data_folder() -> Path:
|
||||||
|
"""Utility function, return the data folder"""
|
||||||
|
return DATA_FOLDER
|
||||||
|
|
||||||
|
|
||||||
|
def list_input_files(search_path: Path = Path()) -> list[Path]:
    """Return the entries directly under ``INPUT / search_path``.

    Args:
        search_path: folder to list, relative to INPUT (default: INPUT itself).

    Returns:
        The entries as paths relative to INPUT, sorted so that the numbering
        shown to the user is stable between calls (glob order is unspecified).
    """
    level = INPUT / search_path
    return sorted(path.relative_to(INPUT) for path in level.glob("*"))
|
||||||
|
|
||||||
|
|
||||||
|
def select_file(file_path: Path) -> ProcessFile:
    """Wrap a path relative to INPUT into a ProcessFile.

    Raises:
        ValueError: if ``INPUT / file_path`` is not an existing file.
    """
    candidate = INPUT / file_path
    if not candidate.is_file():
        raise ValueError(f"{file_path} is not a file")
    return ProcessFile(file_path)
|
||||||
|
|
||||||
|
|
||||||
|
def create_input_file(
    name: str, file_type: str, text: str, sub_folder: Path = Path()
) -> ProcessFile:
    """Create ``<name><file_type>.txt`` under INPUT and fill it with ``text``.

    Args:
        name: base name of the file.
        file_type: one of ".phrases" or ".dictionary".
        text: content written to the file (utf8).
        sub_folder: optional folder below INPUT; created when missing.

    Returns:
        A ProcessFile built from the path relative to INPUT.

    Raises:
        ValueError: if ``file_type`` is not one of the valid types.
    """
    allowed_types = (PHRASES_TYPE, DICT_TYPE)
    if file_type not in allowed_types:
        raise ValueError(f"file_type {file_type} not in {(PHRASES_TYPE, DICT_TYPE)}")
    relative = sub_folder / f"{name}{file_type}.txt"
    target = INPUT / relative
    target.parent.mkdir(exist_ok=True, parents=True)
    target.write_text(text, encoding="utf8")
    return ProcessFile(relative)
|
||||||
|
|
||||||
|
|
||||||
|
def process_a_file(process_file: ProcessFile, language_id: str):
    """Process one input file into its output for the given language.

    Phrase files are read line by line, translated and written as an Anki
    package. Dictionary files are not implemented yet. Files of any other type
    are ignored.

    Args:
        process_file: the file to process; its ``language_id`` is set here.
        language_id: target language code for the translation.
    """
    process_file.language_id = language_id
    # Input files are named "<name><type>.txt", so the type marker is never the
    # last suffix; look at all suffixes rather than only ``.suffix``.
    suffixes = process_file.input_file.suffixes
    if PHRASES_TYPE in suffixes:
        TTS.create_tts()
        TRANS.create_translator(LANGUAGES.CN, language_id)
        # Input files are written as utf8; do not depend on the locale default.
        with process_file.absolute_input_file.open("r", encoding="utf8") as file:
            # Skip blank lines so they do not become empty cards.
            text_lines = [text for line in file if (text := line.strip())]
        results = translator_process(text_lines, process_file, language_id)
        output_anki_phrase(process_file, results)
    elif DICT_TYPE in suffixes:
        print("not implemented")
|
||||||
51
src/anki_hsk_creator/constants.py
Normal file
51
src/anki_hsk_creator/constants.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
"""constants.py"""
|
||||||
|
|
||||||
|
# Standard Library
|
||||||
|
import importlib.resources
|
||||||
|
|
||||||
|
# Local
|
||||||
|
from . import DATA_FOLDER
|
||||||
|
|
||||||
|
# Resources
|
||||||
|
# The anchor must be the importable package name (underscores); the hyphenated
# project name cannot be imported.
CCCEDICT_PATH = importlib.resources.files("anki_hsk_creator").joinpath("cedict_ts.u8")
|
||||||
|
|
||||||
|
# Data folder structure
|
||||||
|
INPUT = DATA_FOLDER / "input"
|
||||||
|
INPUT.mkdir(exist_ok=True, parents=True)
|
||||||
|
OUTPUT = DATA_FOLDER / "output"
|
||||||
|
OUTPUT.mkdir(exist_ok=True, parents=True)
|
||||||
|
RESOURCES = DATA_FOLDER / "resources"
|
||||||
|
RESOURCES.mkdir(exist_ok=True, parents=True)
|
||||||
|
|
||||||
|
# File Types
|
||||||
|
PHRASES_TYPE = ".phrases"
|
||||||
|
DICT_TYPE = ".dictionary"
|
||||||
|
|
||||||
|
|
||||||
|
class LANGUAGES:
    """Language codes used for translation.

    The class is used as a namespace: every member is a class attribute, so
    both ``LANGUAGES.available_languages`` and
    ``LANGUAGES().available_languages`` work.
    """

    CN = "zh"
    EN = "en"
    ES = "es"
    FR = "fr"
    RU = "ru"
    TR = "tr"
    TH = "th"

    # Codes a phrase can be translated into (CN, the source, is excluded).
    # These were properties before; a property read through the class yields
    # the descriptor itself, not its value.
    available_languages: tuple = (EN, ES, FR, RU, TR, TH)

    # Display name for every available language code.
    language_names: dict = {
        EN: "English",
        ES: "Spanish",
        FR: "French",
        RU: "Russian",
        TR: "Turkish",
        TH: "Thai",
    }
|
||||||
96
src/anki_hsk_creator/proccessor.py
Normal file
96
src/anki_hsk_creator/proccessor.py
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
"""processor.py"""
|
||||||
|
|
||||||
|
# Pip
|
||||||
|
import argostranslate.translate
|
||||||
|
import torchaudio
|
||||||
|
|
||||||
|
# Local
|
||||||
|
from .constants import LANGUAGES
|
||||||
|
from .utility import TTS, ProcessFile, TranslationResult # , CCCEDICT
|
||||||
|
|
||||||
|
# Results Classes
|
||||||
|
|
||||||
|
|
||||||
|
def translator_process(
    text_lines: list[str],
    process_file: ProcessFile,
    language_id: str,
) -> list[TranslationResult]:
    """Translate phrases or sentences and generate their audio.

    Every line is stripped, spoken by the TTS model (an audio file that
    already exists in the resources folder is reused) and translated from
    ``LANGUAGES.CN`` into ``language_id`` with Argos Translate.

    Args:
        text_lines: Lines of source text, one phrase or sentence each.
        process_file: Supplies the ``resources`` folder where audio is stored.
        language_id: Code of the language to translate into.

    Returns:
        One ``TranslationResult`` per input line, in input order.
    """
    results = []
    for n, line in enumerate(text_lines):
        line = line.strip()
        # Zero-padded index (N000.wav, N001.wav, ...). The previous spec
        # "{n::03.0n}" is not a valid integer format and raised ValueError.
        audio_path = process_file.resources / f"N{n:03d}.wav"
        if not audio_path.exists():
            # The model is created lazily; without this the first call
            # would fail on ``None.generate`` when no one created it before.
            if TTS.MODEL is None:
                TTS.create_tts()
            audio = TTS.MODEL.generate(f"{line}。", language_id=LANGUAGES.CN)
            torchaudio.save(audio_path, audio, TTS.MODEL.sr)
        translated = argostranslate.translate.translate(line, LANGUAGES.CN, language_id)
        results.append(TranslationResult(language_id, translated, line, audio_path))
    return results
|
||||||
|
|
||||||
|
|
||||||
|
# def dictionary_process(dictionary, tts, in_file, resources):
|
||||||
|
# """Process dictionary files"""
|
||||||
|
# words_list = in_file.open(encoding="utf8").read().strip().split("\n")
|
||||||
|
# results = []
|
||||||
|
# try:
|
||||||
|
# with in_file.open("w", encoding="utf8") as input_file:
|
||||||
|
# for words in words_list:
|
||||||
|
# word = words.split()[0]
|
||||||
|
# pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None
|
||||||
|
# if v := dictionary.get(word):
|
||||||
|
# if len(v) > 1:
|
||||||
|
# print(f"\nWARNING: {word} has multiple meanings:")
|
||||||
|
# if pinyin and pinyin != "ERROR":
|
||||||
|
# ml = list(filter(lambda x: x.pinyin == pinyin, v))
|
||||||
|
# else:
|
||||||
|
# ml = v
|
||||||
|
# if len(ml) > 1:
|
||||||
|
# for n, w in enumerate(ml):
|
||||||
|
# print(f"{n+1} - {w}")
|
||||||
|
# for m in w.meanings:
|
||||||
|
# print(f"\t{m}")
|
||||||
|
# s = None
|
||||||
|
# while (
|
||||||
|
# not s
|
||||||
|
# or not s.isnumeric()
|
||||||
|
# or not (1 <= int(s) <= len(v))
|
||||||
|
# ):
|
||||||
|
# s = input(
|
||||||
|
# f"Please select the correct word [1-{len(v)}]: "
|
||||||
|
# )
|
||||||
|
# v = v[int(s) - 1]
|
||||||
|
# else:
|
||||||
|
# v = ml[0]
|
||||||
|
# else:
|
||||||
|
# v = v[0]
|
||||||
|
# audio_path = resources / f"{word}.wav"
|
||||||
|
# if not audio_path.exists():
|
||||||
|
# audio = tts.generate(f"{word}。", language_id="zh")
|
||||||
|
# torchaudio.save(audio_path, audio, tts.sr)
|
||||||
|
# input_file.write(f"{word}\t{v.pinyin}\n")
|
||||||
|
# results.append((v, audio_path))
|
||||||
|
# else:
|
||||||
|
# print("============================================")
|
||||||
|
# print(f"===================>ERROR: {word} not found")
|
||||||
|
# print("============================================")
|
||||||
|
# input_file.write(f"{word}\tERROR\n")
|
||||||
|
# except Exception:
|
||||||
|
# with in_file.open("w", encoding="utf8") as input_file:
|
||||||
|
# input_file.write("\n".join(words_list))
|
||||||
|
# return results
|
||||||
|
|
||||||
|
# def output_tsv(out_file, results):
|
||||||
|
# """writes the output as a tsv file"""
|
||||||
|
# final_file = out_file.parent / f"{out_file.stem}.tsv"
|
||||||
|
# with final_file.open("w", encoding="utf8", newline="") as csvfile:
|
||||||
|
# writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
|
||||||
|
# for entry in results:
|
||||||
|
# writer.writerow(
|
||||||
|
# [
|
||||||
|
# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
|
||||||
|
# PinyinToneConverter().convert_text(entry.pinyin),
|
||||||
|
# entry.simplified,
|
||||||
|
# entry.traditional,
|
||||||
|
# ]
|
||||||
|
# )
|
||||||
154
src/anki_hsk_creator/utility.py
Normal file
154
src/anki_hsk_creator/utility.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
"""utility.py
|
||||||
|
|
||||||
|
|
||||||
|
Static classes and functions for general use
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Standard Library
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Pip
|
||||||
|
import argostranslate.package
|
||||||
|
import argostranslate.translate
|
||||||
|
import torch
|
||||||
|
from cedict_utils.cedict import CedictParser
|
||||||
|
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
|
||||||
|
|
||||||
|
# Local
|
||||||
|
from .constants import CCCEDICT_PATH, INPUT, LANGUAGES, OUTPUT, RESOURCES
|
||||||
|
|
||||||
|
# Static Classes
|
||||||
|
|
||||||
|
|
||||||
|
class TRANS:
    """Static class that installs Argos Translate language packages.

    ``UPDATED`` records whether the package index was already refreshed and
    ``PACKAGES`` caches the list of available packages from that refresh.
    """

    UPDATED = False
    PACKAGES = None

    @staticmethod
    def create_translator(from_code, to_code):
        """Download and install the Argos Translate package for a language pair.

        Nothing is done when a package for the pair is already installed.

        Args:
            from_code: Code of the source language.
            to_code: Code of the target language.

        Raises:
            ValueError: If the package index has no package for the pair.
        """

        def matches(package):
            return package.from_code == from_code and package.to_code == to_code

        # Installing again for a pair that is already installed is wasted
        # work, and this method may be called many times for the same pair.
        if any(matches(p) for p in argostranslate.package.get_installed_packages()):
            return

        if not TRANS.UPDATED:
            argostranslate.package.update_package_index()
            TRANS.PACKAGES = argostranslate.package.get_available_packages()
            TRANS.UPDATED = True

        # A default avoids leaking a bare StopIteration to the caller.
        package_to_install = next(filter(matches, TRANS.PACKAGES), None)
        if package_to_install is None:
            raise ValueError(
                f"No Argos Translate package available for {from_code} -> {to_code}"
            )
        argostranslate.package.install_from_path(package_to_install.download())
|
||||||
|
|
||||||
|
|
||||||
|
class CCCEDICT:
    """Static class for the CC-CEDICT dictionary.

    ``PARSER`` and ``ENTRIES`` hold the parsed dictionary file, loaded once.
    ``DICTIONARY_LIST`` caches one dictionary per language code.
    """

    PARSER = None
    ENTRIES = []
    DICTIONARY_LIST = {}

    @staticmethod
    def create_cedict(language_id=LANGUAGES.EN):
        """Return the dictionary for ``language_id``, building it on first use.

        Args:
            language_id: Language of the meanings. For anything other than
                ``LANGUAGES.EN`` the meanings are translated from English.

        Returns:
            A dict mapping each simplified word to the list of its entries.
        """
        import copy  # local import: only used when entries are translated

        if not CCCEDICT.PARSER:
            CCCEDICT.PARSER = CedictParser()
            CCCEDICT.PARSER.read_file(CCCEDICT_PATH)
            CCCEDICT.ENTRIES = CCCEDICT.PARSER.parse()

        if language_id in CCCEDICT.DICTIONARY_LIST:
            return CCCEDICT.DICTIONARY_LIST[language_id]

        needs_translation = language_id != LANGUAGES.EN
        if needs_translation:
            # Install the language package once, not once per entry.
            TRANS.create_translator(LANGUAGES.EN, language_id)

        dictionary = {}
        for entry in CCCEDICT.ENTRIES:
            if needs_translation:
                # Translate the meanings only and keep the entry object, so
                # ``simplified`` stays available. Work on a copy because
                # ENTRIES is shared by the dictionaries of every language.
                # NOTE(review): assumes entries expose ``meanings`` as a list
                # of strings, as the dictionary code in processor.py does.
                entry = copy.copy(entry)
                entry.meanings = [
                    argostranslate.translate.translate(
                        meaning, LANGUAGES.EN, language_id
                    )
                    for meaning in entry.meanings
                ]
            dictionary.setdefault(entry.simplified, []).append(entry)

        CCCEDICT.DICTIONARY_LIST[language_id] = dictionary
        return dictionary
|
||||||
|
|
||||||
|
|
||||||
|
class TTS:
    """Static class holding the TTS engine and the device it runs on."""

    MODEL = None
    DEVICE = None

    @staticmethod
    def create_tts():
        """Select a torch device and load the TTS model.

        ``DEVICE`` and ``MODEL`` are only assigned while they are ``None``,
        so calling this again keeps the values that are already set.
        """
        if TTS.DEVICE is None:
            # Preference order: CUDA first, then MPS, otherwise the CPU
            TTS.DEVICE = (
                "cuda"
                if torch.cuda.is_available()
                else ("mps" if torch.backends.mps.is_available() else "cpu")
            )
        if TTS.MODEL is None:
            options = {"device": TTS.DEVICE, "t3_model": "v3"}
            TTS.MODEL = ChatterboxMultilingualTTS.from_pretrained(**options)
|
||||||
|
|
||||||
|
|
||||||
|
# Classes
|
||||||
|
|
||||||
|
|
||||||
|
class ProcessFile:
|
||||||
|
"""Class that represents a file to processs
|
||||||
|
|
||||||
|
diferent input files has direfent process_files depending on language
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, input_file: Path, language_id: str = None):
|
||||||
|
self.input_file = input_file
|
||||||
|
self._language_id = language_id
|
||||||
|
# process file type
|
||||||
|
self.out_folder = OUTPUT / input_file.parent
|
||||||
|
self.out_folder.mkdir(parents=True, exist_ok=True)
|
||||||
|
resources = RESOURCES / input_file
|
||||||
|
self.resources = resources.parent / resources.stem
|
||||||
|
self.resources.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def absolute_input_file(self):
|
||||||
|
"""Absolute input file"""
|
||||||
|
return INPUT / self.input_file
|
||||||
|
|
||||||
|
@property
|
||||||
|
def language_id(self):
|
||||||
|
"""language for this trasnlation process"""
|
||||||
|
return self._language_id
|
||||||
|
|
||||||
|
@language_id.setter
|
||||||
|
def language_id(self, value):
|
||||||
|
self._language_id = value
|
||||||
|
|
||||||
|
@property
|
||||||
|
def output_name(self):
|
||||||
|
"""Posible name for the output file, still missing the filetype"""
|
||||||
|
if self.language_id is None:
|
||||||
|
raise ValueError("Not a valid language selected")
|
||||||
|
return self.input_file.parent / f"{self.input_file.stem}.{self.language_id})."
|
||||||
|
|
||||||
|
|
||||||
|
class TranslationResult:
    """Result of translating a single line.

    Holds the language code, the translated text, the line that was
    translated and the path of its audio file.
    """

    def __init__(self, language_id: str, translated: str, line: str, audio_path: Path):
        # Store every argument under the attribute of the same name
        (
            self.language_id,
            self.translated,
            self.line,
            self.audio_path,
        ) = (language_id, translated, line, audio_path)
|
||||||
Reference in New Issue
Block a user