Compare commits

...

24 Commits

Author SHA1 Message Date
Wolfang Torres
bff05fca01 update speed generation for slower speed 2026-06-22 21:50:59 +08:00
Wolfang Torres
266bbbb370 update voice generation 2026-06-22 21:37:45 +08:00
d0c7da966d fix anki template 2026-06-22 19:02:38 +08:00
59075c0468 change hsk model, doesnt use to typeboxes per card 2026-06-22 18:42:45 +08:00
Wolfang Torres
deaa4c649f fix error on translator 2026-06-21 20:50:05 +08:00
Wolfang Torres
5fce71f44c test 2026-06-21 14:29:44 +08:00
Wolfang Torres
f81f48a5d9 debug 2026-06-21 14:25:08 +08:00
Wolfang Torres
e945d53f85 fix trasnlator issues 2026-06-21 13:31:35 +08:00
Wolfang Torres
5b52c5ae96 test 2026-06-21 10:32:28 +08:00
Wolfang Torres
f0b1d7c29c fix 2026-06-21 10:29:35 +08:00
Wolfang Torres
28eccd8d13 update for language creation 2026-06-21 10:26:12 +08:00
Wolfang Torres
43853678c1 updates 2026-06-21 10:09:06 +08:00
Wolfang Torres
ebc9aa77a7 add dictation model 2026-06-20 14:10:17 +08:00
Wolfang Torres
ebf2d58207 add create delete folder files 2026-06-20 12:29:41 +08:00
Wolfang Torres
d797e655c6 fix dot in name 2026-06-20 12:03:01 +08:00
Wolfang Torres
ab495b6d31 clean input and resources files 2026-06-20 11:51:49 +08:00
Wolfang Torres
bc1c4ff048 add debug 2026-06-20 11:40:26 +08:00
Wolfang Torres
a23e0dc34e fix cli 2026-06-20 10:19:59 +08:00
Wolfang Torres
62588298a8 fix cli 2026-06-20 09:27:08 +08:00
Wolfang Torres
30dd8c8671 update api endpoints 2026-06-19 20:17:37 +08:00
Wolfang Torres
dde819f1e6 fix bug with dictionary only translated on demand 2026-06-12 20:58:53 +08:00
Wolfang Torres
f9fc887d05 add dictionary file process 2026-06-12 20:26:50 +08:00
Wolfang Torres
9b0d23b8ac fix bug with name gneeration 2026-06-12 00:59:19 +08:00
Wolfang Torres
eb4cc8e6e0 update format for anki,
upgrade trasnlation package search,
fix small bugs
2026-06-12 00:43:55 +08:00
10 changed files with 601 additions and 167 deletions

2
.vscode/launch.json vendored
View File

@@ -9,7 +9,7 @@
"name": "Python Debugger: Module",
"type": "debugpy",
"request": "launch",
"module": "anki-hsk-creator"
"module": "anki_hsk_creator"
}
]
}

View File

@@ -14,7 +14,9 @@ creates anki hsk decks from a list of words
## Installation
```console
pip install anki-hsk-creator
git clone https://github.com/resemble-ai/chatterbox
git clone https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator
git clone https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator-data
```
## License

View File

@@ -35,6 +35,7 @@ dependencies = [
"torchaudio",
"torchcodec",
"python-dotenv",
]
[project.optional-dependencies]

View File

@@ -1,5 +1,6 @@
"""about.py"""
# SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com>
#
# SPDX-License-Identifier: GPL-3.0-or-later
__version__ = "0.1.0"
__version__ = "0.1.3"

View File

@@ -4,11 +4,19 @@
from pathlib import Path
# Local
from .api import list_input_files, process_a_file, select_file
from .constants import LANGUAGES
from .api import (
is_file,
list_input_files,
pre_process_a_dictionary_file,
process_a_dictionary_file,
process_a_phrases_file,
select_file,
)
from .constants import DICT_TYPE, LANGUAGES, PHRASES_TYPE
from .utility import ProcessFile
def cli_select_files():
def cli_select_files() -> ProcessFile:
"""Loops until it finds a valid input_file"""
print("Select data file:")
in_file = None
@@ -18,10 +26,10 @@ def cli_select_files():
for n, file in enumerate(files):
print(f"{n+1} - {file}")
s = None
while not s or not s.isnumeric() or not (1 <= int(s) <= len(files)):
while not s or not s.isnumeric() or not 1 <= int(s) <= len(files):
s = input(f"Please select the file [1-{len(files)}]: ")
selected = files[int(s) - 1]
if selected.is_file():
if is_file(selected):
in_file = selected
else:
level = selected
@@ -29,14 +37,36 @@ def cli_select_files():
return input_file
def cli_select_language():
"""Selects a language for the trasnlatatio"""
print("Select a language:")
for language_id, language in LANGUAGES.language_names.items():
print(f"{language_id} - {language}")
def cli_select_dictionay_tsv() -> bool:
"""If a dictionary file is selected, ask if the user wants to proccess it"""
s = None
while not s or s not in LANGUAGES.available_languages:
s = input(f"Please select the language: ({ LANGUAGES.available_languages})")
while s not in ("y", "yes", "no", "n"):
s = input("Do you want to Pre-Process a dictionary (y/n): ")
r = s in ("y", "yes")
return r
def cli_select_language(languages: list = None) -> str:
    """Prompt until the user enters one of the offered language codes.

    Args:
        languages: Optional list of language codes that restricts the choice
            (for example the languages a dictionary file was already
            pre-processed for). ``None`` offers every language in
            ``LANGUAGES.LanguageNames``.

    Returns:
        The language code typed by the user.

    Raises:
        ValueError: If the restriction leaves no language to choose from.
    """
    # ``is not None`` (rather than truthiness) so that an empty restriction
    # reaches the error below instead of silently offering every language.
    if languages is not None:
        avaliable_languages = {
            lan_id: lan
            for lan_id, lan in LANGUAGES.LanguageNames.items()
            if lan_id in languages
        }
    else:
        avaliable_languages = dict(LANGUAGES.LanguageNames)
    if not avaliable_languages:
        raise ValueError(
            "No languages are avaliable,\n"
            "if this is a dictionay file, you must preprocess it first"
        )
    print("Select a language:")
    # Always iterate (code, name) pairs; iterating the dict itself would
    # yield bare codes and unpack them character by character.
    for language_id, language in avaliable_languages.items():
        print(f"{language_id} - {language}")
    lan_codes = list(avaliable_languages)
    prompt = f"Please select the language {', '.join(lan_codes)}: "
    s = None
    # Only accept a code that was actually offered above.
    while s not in avaliable_languages:
        s = input(prompt)
    return s
@@ -44,8 +74,22 @@ def main():
"""CLI interface for the module"""
while True:
input_file = cli_select_files()
language_id = cli_select_language()
process_a_file(input_file, language_id)
if DICT_TYPE in input_file.input_file.suffixes:
dict_selected = cli_select_dictionay_tsv()
if dict_selected:
language_id = cli_select_language()
pre_process_a_dictionary_file(input_file, language_id)
else:
language_id = cli_select_language(
input_file.available_dictionary_languages
)
process_a_dictionary_file(input_file, language_id)
elif PHRASES_TYPE in input_file.input_file.suffixes:
language_id = cli_select_language()
print(
f"processing file {input_file.input_file} with language {language_id}"
)
process_a_phrases_file(input_file, language_id)
if __name__ == "__main__":

View File

@@ -5,15 +5,14 @@ Produces anki output
# Standard Library
import random
from pathlib import Path
# Pip
from genanki import Deck, Model, Note, Package
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
# Local
from .utility import ProcessFile, TranslationResult
# from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
from .utility import DictionaryResult, ProcessFile, TranslationResult
# Constants
@@ -48,29 +47,46 @@ PHRASE_MODEL = Model(
templates=[
{
"name": "Card 1",
"qfmt": "{{Translated}}<br>{{Audio}}",
"qfmt": "{{Translated}}<br>{{Audio}}<br>{{type:Phrase}}",
"afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
},
{
"name": "Card 2",
"qfmt": "{{Phrase}}<br>{{Audio}}",
"qfmt": "{{Phrase}}<br>{{Audio}}<br>{{type:Translated}}",
"afmt": '{{FrontSide}}<hr id="answer">{{Translated}}',
},
{
"name": "Card 3",
"qfmt": "{{Audio}}",
"qfmt": "{{Audio}}<br>{{type:Phrase}}",
"afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
},
],
css=CSS,
)
# Note type for dictation decks: a single card that shows the audio field and
# asks the learner to type the phrase; the answer side adds the translation.
# The display name is distinct from PHRASE_MODEL's so both note types can be
# told apart.
DICTATION_MODEL = Model(
    3187277536,
    "Dictation Model",
    fields=[
        {"name": "Translated"},
        {"name": "Phrase"},
        {"name": "Audio"},
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": "{{Audio}}<br>{{type:Phrase}}",
            "afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}<br>{{Translated}}',
        },
    ],
    css=CSS,
)
HSK_MODEL = Model(
1708536519,
"HSK Model",
fields=[
{"name": "English"},
{"name": "Translated"},
{"name": "Pinyin"},
{"name": "Simplified"},
{"name": "Traditional"},
@@ -79,7 +95,12 @@ HSK_MODEL = Model(
templates=[
{
"name": "Card 1",
"qfmt": "<strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}",
"qfmt": (
"<strong>{{Pinyin}}</strong>"
"<br>{{Translated}}"
"<br>{{Audio}}"
"<br>Simplified: {{type:Simplified}}"
),
"afmt": (
"{{FrontSide}}<hr id='answer''><div class='simple'>{{Simplified}}</div>"
"<br><div class='trad'>{{Traditional}}</div>"
@@ -87,16 +108,24 @@ HSK_MODEL = Model(
},
{
"name": "Card 2",
"qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
"{{Traditional}}</div>",
"qfmt": (
"<div class='simple'>{{Simplified}}</div>"
"<br><div class='trad'>{{Traditional}}</div>"
"<br>Pinyin: {{type:Pinyin}}"
# "<br>Translated: {{type:Translated}}"
),
"afmt": (
"{{FrontSide}}<hr id='answer'><strong>{{Pinyin}}</strong>"
"<br>{{English}}<br>{{Audio}}"
"<br>{{Translated}}<br>{{Audio}}"
),
},
{
"name": "Card 3",
"qfmt": "{{Audio}}",
"qfmt": (
"{{Audio}}"
# "<br>Pinyin: {{type:Pinyin}}"
"<br>Simplified: {{type:Simplified}}"
),
"afmt": (
"{{FrontSide}}<hr id='answer'><strong>{{Pinyin}}</strong>"
"<br><div class='simple'>{{Simplified}}</div>"
@@ -110,38 +139,87 @@ HSK_MODEL = Model(
# Proccess
# def output_anki_dictionary(out_file, results):
# """Creates an anki file from a dictionary results"""
# final_file = out_file.parent / f"{out_file.stem}.apkg"
# deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
# deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
# package = Package(deck)
# audios = []
# for entry, audio in results:
# note = Note(
# model=HSK_MODEL,
# fields=[
# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
# PinyinToneConverter().convert_text(entry.pinyin),
# entry.simplified,
# entry.traditional,
# f"[sound:{audio.name}]",
# ],
# )
# audios.append(audio)
# deck.add_note(note)
# package.media_files = audios
# package.write_to_file(final_file)
def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResult]):
"""Creates an anki file from a phrases results"""
def output_anki_dictation(
process_file: ProcessFile, results: list[DictionaryResult]
) -> Path:
"""Creates an anki file for dictation result"""
final_file = process_file.output_name.with_suffix(".apkg")
deck_name = "::".join(
process_file.input_file.parts[:-1] + (process_file.input_fil.stem,)
process_file.input_file.parts[:-1] + (process_file.output_name.stem,)
)
deck = Deck(
random.randrange(1 << 30, 1 << 31),
deck_name,
f"Deck for {final_file.name}, "
"created in https://www.wolfang.info.ve/hskankicreator/",
)
package = Package(deck)
audios = []
for result in results:
note = Note(
model=DICTATION_MODEL,
fields=[
result.translated,
result.line,
f"[sound:{result.audio_path.name}]",
],
)
deck.add_note(note)
audios.append(result.audio_path)
package.media_files = audios
package.write_to_file(final_file)
return final_file
def output_anki_dictionary(
    process_file: ProcessFile, results: list[DictionaryResult]
) -> Path:
    """Create an Anki package (.apkg) from dictionary results.

    Args:
        process_file: File being processed; supplies the output name and the
            parts used to build the deck name.
        results: One entry per note to create.

    Returns:
        Path of the written ``.apkg`` file.
    """
    final_file = process_file.output_name.with_suffix(".apkg")
    # Deck hierarchy: the input file's parent folders, then the output stem.
    deck_name = "::".join(
        process_file.input_file.parts[:-1] + (process_file.output_name.stem,)
    )
    deck = Deck(
        random.randrange(1 << 30, 1 << 31),
        deck_name,
        f"Deck for {final_file.name}, "
        "created in https://www.wolfang.info.ve/hskankicreator/",
    )
    package = Package(deck)
    # One converter instance is reused for every note instead of building a
    # new one per result.
    converter = PinyinToneConverter()
    audios = []
    for result in results:
        note = Note(
            model=HSK_MODEL,
            fields=[
                result.meaning,
                converter.convert_text(result.pinyin),
                result.simplified,
                result.traditional,
                f"[sound:{result.audio_path.name}]",
            ],
        )
        audios.append(result.audio_path)
        deck.add_note(note)
    package.media_files = audios
    package.write_to_file(final_file)
    return final_file
def output_anki_phrase(
process_file: ProcessFile, results: list[TranslationResult]
) -> Path:
"""Creates an anki file from a phrases results"""
final_file = process_file.output_name.with_suffix(".apkg")
deck_name = "::".join(
process_file.input_file.parts[:-1] + (process_file.output_name.stem,)
)
deck = Deck(
random.randrange(1 << 30, 1 << 31),
deck_name,
f"Deck for {final_file.name}, "
"created in https://www.wolfang.info.ve/hskankicreator/",
)
deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
package = Package(deck)
audios = []
for result in results:
@@ -157,3 +235,4 @@ def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResul
audios.append(result.audio_path)
package.media_files = audios
package.write_to_file(final_file)
return final_file

View File

@@ -8,10 +8,27 @@ from pathlib import Path
# Local
from . import DATA_FOLDER
from .anki_generation import output_anki_phrase
from .constants import DICT_TYPE, INPUT, LANGUAGES, PHRASES_TYPE
from .proccessor import translator_process
from .utility import TRANS, TTS, ProcessFile
from .anki_generation import (
output_anki_dictation,
output_anki_dictionary,
output_anki_phrase,
)
from .constants import (
DICTATION_TYPE,
DICT_TYPE,
INPUT,
LANGUAGES,
OUTPUT,
PHRASES_TYPE,
RESOURCES,
)
from .proccessor import (
dictation_process,
dictionary_pre_process,
dictionary_process,
translator_process,
)
from .utility import CCCEDICT, TRANS, TTS, ProcessFile
# interface
@@ -21,12 +38,40 @@ def get_data_folder() -> Path:
return DATA_FOLDER
def get_output_folder() -> Path:
    """Return the module-level ``OUTPUT`` folder path."""
    return OUTPUT
def get_resources_folder() -> Path:
    """Return the module-level ``RESOURCES`` folder path."""
    return RESOURCES
def list_input_files(search_path: Path = Path()) -> list[Path]:
    """List the direct children of ``INPUT / search_path``.

    Every returned path is expressed relative to ``INPUT``.
    """
    relative_entries = []
    for entry in (INPUT / search_path).glob("*"):
        relative_entries.append(entry.relative_to(INPUT))
    return relative_entries
def is_file(file_path: Path) -> bool:
    """Tell whether ``file_path``, resolved against ``INPUT``, is a file."""
    candidate = INPUT / file_path
    return candidate.is_file()
def read_input_file(file_path: Path) -> str:
    """Read an input file and return its whole text.

    Args:
        file_path: Path relative to ``INPUT``.

    Returns:
        The file content decoded as UTF-8.
    """
    # The context manager closes the handle, which the bare
    # ``open(...).read()`` chain never did explicitly.
    with (INPUT / file_path).open(encoding="utf8", newline="\n") as file:
        return file.read()
def read_dictionary_file(process_file: ProcessFile, language_id: str) -> str:
    """Read the dictionary resource file of ``process_file``.

    Side effect: sets ``process_file.language_id`` to ``language_id`` before
    ``process_file.dictionary_resource_file`` is looked up.

    Returns:
        The resource file content decoded as UTF-8.
    """
    process_file.language_id = language_id
    # The context manager closes the handle, which the bare
    # ``open(...).read()`` chain never did explicitly.
    with process_file.dictionary_resource_file.open(
        encoding="utf8", newline="\n"
    ) as file:
        return file.read()
def select_file(file_path: Path) -> ProcessFile:
"""Given a relative path from `list_input_files`, return a ProcessFile"""
if (INPUT / file_path).is_file():
@@ -35,6 +80,71 @@ def select_file(file_path: Path) -> ProcessFile:
raise ValueError(f"{file_path} is not a file")
def create_folder(file_path: Path) -> ProcessFile:
    """Create a folder (and any missing parents) under ``INPUT``.

    Args:
        file_path: Folder path relative to ``INPUT``.

    Returns:
        A ``ProcessFile`` built from the relative ``file_path``, matching how
        the other functions in this module construct it.

    Raises:
        ValueError: If the path already exists.
    """
    input_folder = INPUT / file_path
    # Let mkdir itself detect an existing path, so there is no gap between
    # the existence check and the creation.
    try:
        input_folder.mkdir(parents=True, exist_ok=False)
    except FileExistsError as err:
        raise ValueError(f"{file_path} already exists") from err
    return ProcessFile(file_path)
def delete_folder(file_path: Path):
    """Delete an empty folder under ``INPUT``.

    Args:
        file_path: Folder path relative to ``INPUT``.

    Raises:
        ValueError: If the folder does not exist or still has entries inside.
    """
    input_folder = INPUT / file_path
    if not input_folder.exists():
        raise ValueError(f"{file_path} doesn't exists")
    # Inspect the folder being deleted, not a placeholder path.
    if any(input_folder.iterdir()):
        raise ValueError(f"{file_path} has files inside")
    input_folder.rmdir()
def delete_file(file_path: Path):
    """Delete a file under ``INPUT``.

    Args:
        file_path: File path relative to ``INPUT``.

    Raises:
        ValueError: If the path is not an existing file.
    """
    input_file = INPUT / file_path
    # ``Path`` has no ``if_file``; ``is_file`` is the existence-and-type check.
    if not input_file.is_file():
        raise ValueError(f"{file_path} doesn't exists")
    input_file.unlink()
def list_file_resources(file_path: ProcessFile) -> list[Path]:
    """Return the entries found directly inside ``file_path.resources``."""
    # Materialise the glob; wrapping it in ``[...]`` would produce a list
    # whose single element is the generator itself.
    return list(file_path.resources.glob("*"))
def analize_input_files(search_path: Path = Path()) -> dict[str, list[Path]]:
    """Collect the input, resources and output entries tied to a path.

    For a file under ``INPUT`` the result holds the file itself, everything in
    its resources folder and the outputs whose name starts with the file stem.
    For an existing folder it holds the direct children of the matching
    ``INPUT``, ``RESOURCES`` and ``OUTPUT`` folders. Anything else (including
    ``None``) yields three empty lists. Paths are relative to their root.
    """
    if search_path is None:
        return {"input": [], "resources": [], "output": []}

    input_path = INPUT / search_path
    if input_path.is_file():
        process_file = ProcessFile(search_path)
        resources_dir = process_file.resources
        output_dir = process_file.out_folder
        input_entries = [search_path]
        output_pattern = f"{process_file.input_file.stem}*"
    elif input_path.exists():
        resources_dir = RESOURCES / search_path
        output_dir = OUTPUT / search_path
        input_entries = [
            entry.relative_to(INPUT) for entry in input_path.glob("*")
        ]
        output_pattern = "*"
    else:
        return {"input": [], "resources": [], "output": []}

    resource_entries = [
        entry.relative_to(RESOURCES) for entry in resources_dir.glob("*")
    ]
    output_entries = [
        entry.relative_to(OUTPUT) for entry in output_dir.glob(output_pattern)
    ]
    return {
        "input": input_entries,
        "resources": resource_entries,
        "output": output_entries,
    }
def create_input_file(
name: str, file_type: str, text: str, sub_folder: Path = Path()
) -> ProcessFile:
@@ -43,30 +153,87 @@ def create_input_file(
it is created and the file placed inside.
returns the relative path for future processing
valid file_types: ".phrases", ".dictionary"
valid file_types: ".phrases", ".dictionary" ".dictation"
"""
if file_type not in (PHRASES_TYPE, DICT_TYPE):
raise ValueError(f"file_type {file_type} not in {(PHRASES_TYPE, DICT_TYPE)}")
if file_type not in (PHRASES_TYPE, DICT_TYPE, DICTATION_TYPE):
raise ValueError(
f"file_type {file_type} not in {(PHRASES_TYPE, DICT_TYPE, DICTATION_TYPE)}"
)
filename = f"{name}{file_type}.txt"
relative = sub_folder / filename
# write file
file_path = INPUT / relative
file_path.parent.mkdir(exist_ok=True, parents=True)
file_path.write_text(text, encoding="utf8")
file_path.write_text(text, encoding="utf8", newline="\n")
# create process_file for future
process_file = ProcessFile(relative)
return process_file
def process_a_file(process_file: ProcessFile, language_id: str):
"""From a input_file, a language and an output type, process a file"""
def write_input_file(process_file: ProcessFile, text: str):
    """Overwrite the input file with the non-empty, stripped lines of ``text``.

    Each kept line is written followed by a single ``\\n``.
    """
    with process_file.absolute_input_file.open(
        "w", encoding="utf8", newline="\n"
    ) as file:
        file.writelines(
            f"{content}\n"
            for content in map(str.strip, text.split("\n"))
            if content
        )
def write_resource_file(process_file: ProcessFile, language_id: str, text: str):
"""Write a resource file"""
process_file.language_id = language_id
if PHRASES_TYPE in process_file.input_file.suffix:
TTS.create_tts()
TRANS.create_translator(LANGUAGES.CN, language_id)
with process_file.absolute_input_file.open("r") as file:
text_lines = [line.strip() for line in file.readlines()]
results = translator_process(text_lines, process_file, language_id)
output_anki_phrase(process_file, results)
elif DICT_TYPE in process_file.input_file.suffix:
print("not implemented")
with process_file.dictionary_resource_file.open(
"w", encoding="utf8", newline="\n"
) as file:
for line in text.split("\n"):
line = line.strip()
if line:
file.write(f"{line}\n")
def pre_process_a_dictionary_file(process_file: ProcessFile, language_id: str):
    """Pre-process a dictionary input file for ``language_id``.

    Sets ``process_file.language_id``, prepares the translator from
    ``LANGUAGES.EN`` to ``language_id`` and the CEDICT dictionary, then hands
    the words read from the input file to ``dictionary_pre_process``.
    """
    process_file.language_id = language_id
    TRANS.create_translator(LANGUAGES.EN, language_id)
    CCCEDICT.create_cedict(language_id)
    with process_file.absolute_input_file.open(
        "r", encoding="utf8", newline="\n"
    ) as file:
        # Filter on the stripped text: a raw line such as "\n" is truthy, so
        # testing the unstripped line let empty words through.
        words_list = [word for line in file if (word := line.strip())]
    dictionary_pre_process(words_list, process_file)
def process_a_dictionary_file(process_file: ProcessFile, language_id: str) -> Path:
    """Turn a pre-processed dictionary file into an Anki package.

    Prepares the TTS model, records ``language_id`` on ``process_file`` and
    returns whatever ``output_anki_dictionary`` returns for the processed
    results.
    """
    TTS.create_tts()
    process_file.language_id = language_id
    return output_anki_dictionary(process_file, dictionary_process(process_file))
def process_a_dictation_file(process_file: ProcessFile, language_id: str) -> Path:
    """Process a dictation file

    Prepares the TTS model and a translator from ``LANGUAGES.CN`` to
    ``language_id``, records ``language_id`` on ``process_file``, reads the
    input file, runs ``dictation_process`` on its pieces and returns the
    result of ``output_anki_dictation``.
    """
    TTS.create_tts()
    TRANS.create_translator(LANGUAGES.CN, language_id)
    process_file.language_id = language_id
    with process_file.absolute_input_file.open(
        "r", encoding="utf8", newline="\n"
    ) as file:
        # NOTE(review): ``str.split("")`` raises ``ValueError: empty
        # separator``; presumably the intended separator character is missing
        # from this literal - confirm and restore it.
        text_lines = [line.strip() for line in file.read().split("")]
    results = dictation_process(text_lines, process_file)
    return output_anki_dictation(process_file, results)
def process_a_phrases_file(process_file: ProcessFile, language_id: str) -> Path:
    """Turn a phrases file into an Anki package.

    Records ``language_id`` on ``process_file``, prepares the TTS model and a
    translator from ``LANGUAGES.CN`` to ``language_id``, then feeds every
    stripped input line to ``translator_process`` and returns the result of
    ``output_anki_phrase``.
    """
    process_file.language_id = language_id
    TTS.create_tts()
    TRANS.create_translator(LANGUAGES.CN, language_id)
    with process_file.absolute_input_file.open(
        "r", encoding="utf8", newline="\n"
    ) as file:
        text_lines = [raw_line.strip() for raw_line in file]
    return output_anki_phrase(
        process_file, translator_process(text_lines, process_file)
    )

View File

@@ -7,7 +7,7 @@ import importlib.resources
from . import DATA_FOLDER
# Resources
CCCEDICT_PATH = importlib.resources.files("anki-hsk-creator").joinpath("cedict_ts.u8")
CCCEDICT_PATH = importlib.resources.files("anki_hsk_creator").joinpath("cedict_ts.u8")
# Data folder structure
INPUT = DATA_FOLDER / "input"
@@ -20,6 +20,7 @@ RESOURCES.mkdir(exist_ok=True, parents=True)
# File Types
PHRASES_TYPE = ".phrases"
DICT_TYPE = ".dictionary"
DICTATION_TYPE = ".dictation"
class LANGUAGES:
@@ -33,19 +34,12 @@ class LANGUAGES:
TR = "tr"
TH = "th"
@property
def available_languages(self) -> tuple:
"""Available laguages for translation"""
return (self.EN, self.ES, self.FR, self.RU, self.TR, self.TH)
@property
def language_names(self) -> dict:
"""Gets the name of a language code"""
return {
self.EN: "English",
self.ES: "Spanish",
self.FR: "French",
self.RU: "Russian",
self.TR: "Turkish",
self.TH: "Thai",
}
AvailableLanguages = (EN, ES, FR, RU, TR, TH)
LanguageNames = {
EN: "English",
ES: "Spanish",
FR: "French",
RU: "Russian",
TR: "Turkish",
TH: "Thai",
}

View File

@@ -1,84 +1,135 @@
"""processor.py"""
# Standard Library
import csv
# Pip
import argostranslate.translate
import torchaudio
# Local
from .constants import LANGUAGES
from .utility import TTS, ProcessFile, TranslationResult # , CCCEDICT
from .utility import CCCEDICT, TTS, DictionaryResult, ProcessFile, TranslationResult
# Constants
FIELDNAMES = ["simplified", "traditional", "pinyin", "meaning"]
DIALECT = "excel-tab"
# Results Classes
def dictation_process(
    text_lines: list[str], process_file: ProcessFile
) -> list[TranslationResult]:
    """Process for Dictation translation

    For every line: normalises whitespace, generates a wav file in
    ``process_file.resources`` unless one already exists for that index, and
    translates the line from ``LANGUAGES.CN`` to ``process_file.language_id``.

    Returns one ``TranslationResult`` per input line, in input order.
    """
    results = []
    for n, line in enumerate(text_lines):
        line = line.strip()
        # Collapse every run of whitespace into a single space.
        line = " ".join(line.split())
        # NOTE(review): with an empty search string ``str.replace`` inserts
        # the replacement before every character and at the end of the line;
        # presumably a specific character was meant here - confirm.
        line = line.replace("", " 。。。 ")
        # Audio files are keyed by the line index, zero-padded to 3 digits.
        audio_path = process_file.resources / f"N{n:03n}.wav"
        # An existing file for this index is reused as-is, whatever the
        # current text of the line is.
        if not audio_path.exists():
            audio = TTS.MODEL.generate(
                f"{line}", language_id=LANGUAGES.CN, **TTS.DEFAULTS
            )
            torchaudio.save(audio_path, audio, TTS.MODEL.sr)
        translated = argostranslate.translate.translate(
            line, LANGUAGES.CN, process_file.language_id
        )
        results.append(
            TranslationResult(process_file.language_id, translated, line, audio_path)
        )
    return results
def translator_process(
text_lines: list[str],
process_file: ProcessFile,
language_id: str,
text_lines: list[str], process_file: ProcessFile
) -> list[TranslationResult]:
"""Process for phases or sentence translation"""
results = []
for n, line in enumerate(text_lines):
line = line.strip()
audio_path = process_file.resources / f"N{n::03.0n}.wav"
line = " ".join(line.split())
line = line.replace("", " 。。。 ")
audio_path = process_file.resources / f"N{n:03n}.wav"
if not audio_path.exists():
audio = TTS.MODEL.generate(f"{line}", language_id=LANGUAGES.CN)
audio = TTS.MODEL.generate(
f"{line}", language_id=LANGUAGES.CN, **TTS.DEFAULTS
)
torchaudio.save(audio_path, audio, TTS.MODEL.sr)
translated = argostranslate.translate.translate(line, LANGUAGES.CN, language_id)
results.append(TranslationResult(language_id, translated, line, audio_path))
translated = argostranslate.translate.translate(
line, LANGUAGES.CN, process_file.language_id
)
results.append(
TranslationResult(process_file.language_id, translated, line, audio_path)
)
return results
# def dictionary_process(dictionary, tts, in_file, resources):
# """Process dictionary files"""
# words_list = in_file.open(encoding="utf8").read().strip().split("\n")
# results = []
# try:
# with in_file.open("w", encoding="utf8") as input_file:
# for words in words_list:
# word = words.split()[0]
# pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None
# if v := dictionary.get(word):
# if len(v) > 1:
# print(f"\nWARNING: {word} has multiple meanings:")
# if pinyin and pinyin != "ERROR":
# ml = list(filter(lambda x: x.pinyin == pinyin, v))
# else:
# ml = v
# if len(ml) > 1:
# for n, w in enumerate(ml):
# print(f"{n+1} - {w}")
# for m in w.meanings:
# print(f"\t{m}")
# s = None
# while (
# not s
# or not s.isnumeric()
# or not (1 <= int(s) <= len(v))
# ):
# s = input(
# f"Please select the correct word [1-{len(v)}]: "
# )
# v = v[int(s) - 1]
# else:
# v = ml[0]
# else:
# v = v[0]
# audio_path = resources / f"{word}.wav"
# if not audio_path.exists():
# audio = tts.generate(f"{word}。", language_id="zh")
# torchaudio.save(audio_path, audio, tts.sr)
# input_file.write(f"{word}\t{v.pinyin}\n")
# results.append((v, audio_path))
# else:
# print("============================================")
# print(f"===================>ERROR: {word} not found")
# print("============================================")
# input_file.write(f"{word}\tERROR\n")
# except Exception:
# with in_file.open("w", encoding="utf8") as input_file:
# input_file.write("\n".join(words_list))
# return results
def dictionary_pre_process(words_list: list[str], process_file: ProcessFile):
    """Pre-process dictionary words into the intermediary resource TSV.

    Each entry of ``words_list`` is a word optionally followed by its pinyin.
    One row is written per (dictionary entry, meaning) pair; a word missing
    from the dictionary gets a single row with only ``simplified`` filled in.

    Args:
        words_list: Lines of the dictionary input file.
        process_file: Supplies ``language_id`` and the resource file to write.
    """
    dictionary = CCCEDICT.create_cedict(process_file.language_id)
    with process_file.dictionary_resource_file.open(
        "w", encoding="utf8", newline=""
    ) as resource_file:
        tsv_writer = csv.DictWriter(
            resource_file, dialect=DIALECT, fieldnames=FIELDNAMES
        )
        tsv_writer.writeheader()
        for words in words_list:
            # Split once; a blank entry has no word to look up and would
            # otherwise fail on the ``[0]`` index.
            parts = words.split()
            if not parts:
                continue
            word, *pinyin_parts = parts
            pinyin = " ".join(pinyin_parts) if pinyin_parts else None
            if entries := dictionary.get(word):
                # When the line carries a pinyin, keep only matching readings.
                if pinyin is not None:
                    entries = [entry for entry in entries if entry.pinyin == pinyin]
                if len(entries) > 1:
                    print(f"\nWARNING: {word} has multiple meanings:")
                for entry in entries:
                    for meaning in entry.meanings:
                        tsv_writer.writerow(
                            {
                                "simplified": entry.simplified,
                                "traditional": entry.traditional,
                                "pinyin": entry.pinyin,
                                "meaning": meaning,
                            }
                        )
            else:
                print("============================================")
                print(f"===================>ERROR: {word} not found")
                print("============================================")
                # Keep the word in the TSV so the miss stays visible there.
                tsv_writer.writerow(
                    {
                        "simplified": word,
                        "traditional": None,
                        "pinyin": None,
                        "meaning": None,
                    }
                )
def dictionary_process(process_file: ProcessFile) -> list[DictionaryResult]:
    """Build the final results from the dictionary resource TSV.

    For every row an audio file named after the row's pinyin is generated
    from the simplified text unless it already exists, the row is printed,
    and a ``DictionaryResult`` is collected.
    """
    collected = []
    with process_file.dictionary_resource_file.open(
        "r", encoding="utf8", newline=""
    ) as resource_file:
        for row in csv.DictReader(resource_file, dialect=DIALECT):
            audio_path = process_file.resources / f"{row['pinyin']}.wav"
            if not audio_path.exists():
                generated = TTS.MODEL.generate(
                    f"{row['simplified']}", language_id=LANGUAGES.CN, **TTS.DEFAULTS
                )
                torchaudio.save(audio_path, generated, TTS.MODEL.sr)
            print(row)
            collected.append(
                DictionaryResult(
                    **row,
                    audio_path=audio_path,
                    language_id=process_file.language_id,
                )
            )
    return collected
# def output_tsv(out_file, results):
# """writes the output as a tsv file"""

View File

@@ -11,7 +11,7 @@ from pathlib import Path
import argostranslate.package
import argostranslate.translate
import torch
from cedict_utils.cedict import CedictParser
from cedict_utils.cedict import CedictEntry, CedictParser
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
# Local
@@ -24,22 +24,81 @@ class TRANS:
"""Static Class for Argos translate"""
UPDATED = False
PACKAGES = None
PACKAGES = []
@staticmethod
def create_translator(from_code, to_code):
"""Download and install Argos Translate package"""
print(f"Create translator from {from_code} to {to_code}")
if from_code == to_code:
return
if not TRANS.UPDATED:
argostranslate.package.update_package_index()
TRANS.PACKAGES = argostranslate.package.get_available_packages()
TRANS.UPDATED = True
package_to_install = next(
packages = tuple(
filter(
lambda x: x.from_code == from_code and x.to_code == to_code,
lambda x: x.from_code == from_code or x.to_code == to_code,
TRANS.PACKAGES,
)
)
argostranslate.package.install_from_path(package_to_install.download())
print(f"available packages {packages[:5]}")
packages_to_install = []
for in_package in packages:
if in_package.from_code == from_code:
for out_package in packages:
if out_package.to_code == to_code:
if in_package.to_code == out_package.from_code:
print(
f"Check in_package {in_package.from_code} {in_package.to_code}"
)
print(
f"Check out_package {out_package.from_code} {out_package.to_code}"
)
packages_to_install.append(in_package)
packages_to_install.append(out_package)
for package in packages_to_install:
print(f"instaling package {package}")
argostranslate.package.install_from_path(package.download())
class TranslatedEntry:
    """Holder for a CC-CEDICT entry whose meanings are shown in `language_id`.

    The meanings are translated lazily, on first access, and then cached.
    """

    def __init__(self, entry: CedictEntry, language_id: str):
        self.entry = entry
        self.language_id = language_id
        # None means "not translated yet"; filled in by ``meanings``.
        self._translated_meanings = None

    @property
    def simplified(self):
        """Entry simplified"""
        return self.entry.simplified

    @property
    def traditional(self):
        """Entry traditional"""
        return self.entry.traditional

    @property
    def pinyin(self):
        """Entry pinyin"""
        return self.entry.pinyin

    @property
    def meanings(self):
        """Entry meanings, translated from English to ``language_id``.

        Computed once: appending to the cache on every access would translate
        again and return a list that grows with duplicates.
        """
        if self._translated_meanings is None:
            translated = []
            for meaning in self.entry.meanings:
                if self.language_id != LANGUAGES.EN:
                    print(f"translating from {LANGUAGES.EN} to {self.language_id}")
                    print(f"-> {meaning}")
                    trans_meaning = argostranslate.translate.translate(
                        meaning, LANGUAGES.EN, self.language_id
                    )
                else:
                    # The source meanings are already English.
                    trans_meaning = meaning
                translated.append(trans_meaning)
            self._translated_meanings = translated
        return self._translated_meanings
class CCCEDICT:
@@ -50,24 +109,23 @@ class CCCEDICT:
DICTIONARY_LIST = {}
@staticmethod
def create_cedict(language_id=LANGUAGES.EN):
def create_cedict(
language_id: str = LANGUAGES.EN,
) -> dict[str, list[TranslatedEntry]]:
"""Creates a create_cedict dictionary object"""
if not CCCEDICT.PARSER:
CCCEDICT.PARSER = CedictParser()
CCCEDICT.PARSER.read_file(CCCEDICT_PATH)
CCCEDICT.ENTRIES = CCCEDICT.PARSER.parse()
if language_id not in CCCEDICT.DICTIONARY_LIST:
TRANS.create_translator(LANGUAGES.EN, language_id)
dictionary = {}
for entry in CCCEDICT.ENTRIES:
if language_id != LANGUAGES.EN:
TRANS.create_translator(LANGUAGES.EN, language_id)
entry = argostranslate.translate.translate(
entry, LANGUAGES.EN, language_id
)
trans_entry = TranslatedEntry(entry, language_id)
if entry.simplified not in dictionary:
dictionary[entry.simplified] = [entry]
dictionary[entry.simplified] = [trans_entry]
else:
dictionary[entry.simplified].append(entry)
dictionary[entry.simplified].append(trans_entry)
CCCEDICT.DICTIONARY_LIST[language_id] = dictionary
else:
dictionary = CCCEDICT.DICTIONARY_LIST[language_id]
@@ -79,6 +137,7 @@ class TTS:
MODEL = None
DEVICE = None
DEFAULTS = {"cfg_weight": 0.2, "exaggeration": 0.8}
@staticmethod
def create_tts():
@@ -135,7 +194,23 @@ class ProcessFile:
"""Posible name for the output file, still missing the filetype"""
if self.language_id is None:
raise ValueError("Not a valid language selected")
return self.input_file.parent / f"{self.input_file.stem}.{self.language_id})."
return self.out_folder / f"{self.input_file.stem}.{self.language_id}.temp"
@property
def dictionary_resource_file(self):
    """Resource TSV path of a dictionary file for the current language."""
    resources_dir = self.resources
    return resources_dir / f"dictionary.{self.language_id}.tsv"
@property
def relative_dictionary_resource_file(self):
    """Dictionary resource TSV path, expressed relative to ``RESOURCES``."""
    resource_tsv = self.resources / f"dictionary.{self.language_id}.tsv"
    relative_tsv = resource_tsv.relative_to(RESOURCES)
    return relative_tsv
@property
def available_dictionary_languages(self):
    """Language codes that already have a ``dictionary.<code>.tsv`` resource."""
    language_codes = []
    for resource_tsv in self.resources.glob("dictionary.*.tsv"):
        first_suffix = resource_tsv.suffixes[0]
        # Drop the leading dot of the suffix (".es" -> "es").
        language_codes.append(first_suffix[1:])
    return language_codes
class TranslationResult:
@@ -152,3 +227,23 @@ class TranslationResult:
self.translated = translated
self.line = line
self.audio_path = audio_path
class DictionaryResult:
    """Result of a dictionary process: one meaning of one word plus its audio."""

    def __init__(
        self,
        language_id: str,
        simplified: str,
        traditional: str,
        pinyin: str,
        meaning: str,
        audio_path: Path,
    ):
        self.language_id = language_id
        self.simplified = simplified
        self.traditional = traditional
        self.pinyin = pinyin
        self.meaning = meaning
        self.audio_path = audio_path

    def __repr__(self) -> str:
        """Readable form listing every field, for debugging output."""
        return (
            f"{type(self).__name__}("
            f"language_id={self.language_id!r}, "
            f"simplified={self.simplified!r}, "
            f"traditional={self.traditional!r}, "
            f"pinyin={self.pinyin!r}, "
            f"meaning={self.meaning!r}, "
            f"audio_path={self.audio_path!r})"
        )