version 0.1
This commit is contained in:
@@ -34,6 +34,7 @@ dependencies = [
|
||||
"torch",
|
||||
"torchaudio",
|
||||
"torchcodec",
|
||||
"python-dotenv",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
@@ -41,7 +42,9 @@ dev = [
|
||||
"pytest",
|
||||
"black",
|
||||
"pylint",
|
||||
"flakehell"
|
||||
"flake8",
|
||||
"flake8-pyproject",
|
||||
# "flakeheaven",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
@@ -50,17 +53,18 @@ Issues = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator/issues"
|
||||
Source = "https://gitea.wolfang.info.ve/wolfang/anki-hsk-creator"
|
||||
|
||||
[tool.hatch.version]
|
||||
path = "src/anki-hsk-creator/__about__.py"
|
||||
path = "src/anki_hsk_creator/__about__.py"
|
||||
|
||||
[tool.hatch.build.targets.sdist]
|
||||
packages = ["src/anki_hsk_creator"]
|
||||
include = [
|
||||
"src/anki-hsk-creator/cedict_ts.u8",
|
||||
"src/anki_hsk_creator/cedict_ts.u8",
|
||||
]
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/anki-hsk-creator"]
|
||||
packages = ["src/anki_hsk_creator"]
|
||||
include = [
|
||||
"src/anki-hsk-creator/cedict_ts.u8",
|
||||
"src/anki_hsk_creator/cedict_ts.u8",
|
||||
]
|
||||
|
||||
[tool.hatch.envs.default]
|
||||
@@ -69,7 +73,8 @@ extra-dependencies = [
|
||||
]
|
||||
|
||||
[tool.hatch.envs.default.scripts]
|
||||
format = "black --target-version=py314 anki-hsk-creator tests && isort anki-hsk-creator tests"
|
||||
format = "black --target-version=py314 src tests && isort src tests"
|
||||
lint = "flake8 src"
|
||||
|
||||
[tool.hatch.envs.types]
|
||||
extra-dependencies = [
|
||||
@@ -80,16 +85,16 @@ extra-dependencies = [
|
||||
check = "mypy --install-types --non-interactive {args:src/anki_hsk_creator tests}"
|
||||
|
||||
[tool.coverage.run]
|
||||
source_pkgs = ["anki-hsk-creator", "tests"]
|
||||
source_pkgs = ["src", "tests"]
|
||||
branch = true
|
||||
parallel = true
|
||||
omit = [
|
||||
"src/anki-hsk-creator/__about__.py",
|
||||
"src/anki_hsk_creator/__about__.py",
|
||||
]
|
||||
|
||||
[tool.coverage.paths]
|
||||
anki-hsk-creator = ["src/anki-hsk-creator", "*/anki-hsk-creator/src/anki-hsk-creator"]
|
||||
tests = ["tests", "*src/anki-hsk-creator/tests"]
|
||||
anki-hsk-creator = ["src/anki_hsk_creator", "*/anki-hsk-creator/src/anki_hsk_creator"]
|
||||
tests = ["tests", "*src/anki_hsk_creator/tests"]
|
||||
|
||||
[tool.coverage.report]
|
||||
exclude_lines = [
|
||||
@@ -111,6 +116,7 @@ exclude = '''
|
||||
)
|
||||
'''
|
||||
|
||||
|
||||
[tool.isort]
|
||||
src_paths = ["src", "test"]
|
||||
skip_glob = [".git", "__pycache__", ".vscode", "*venv", "build", "dist", "old", "*.egg-info"]
|
||||
@@ -145,9 +151,8 @@ msg-template="{path}:{module}:{line}: [{msg_id}({symbol}), {obj}] {msg}"
|
||||
logging-format-style="new"
|
||||
logging-modules="logging"
|
||||
|
||||
[tool.flakehell]
|
||||
[tool.flake8]
|
||||
max_line_length = 88
|
||||
format = "grouped"
|
||||
show_source = false
|
||||
exclude = [
|
||||
".git",
|
||||
@@ -160,8 +165,29 @@ exclude = [
|
||||
"*.egg-info",
|
||||
]
|
||||
|
||||
[tool.flakehell.plugins]
|
||||
[tool.flake8.plugins]
|
||||
mccabe = ["+C*"]
|
||||
pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"]
|
||||
pyflakes = ["+F*"]
|
||||
flake8-bugbear = ["+*", "+B950"]
|
||||
|
||||
# [tool.flakeheaven]
|
||||
# max_line_length = 88
|
||||
# format = "grouped"
|
||||
# show_source = false
|
||||
# exclude = [
|
||||
# ".git",
|
||||
# "__pycache__",
|
||||
# ".vscode",
|
||||
# "*venv",
|
||||
# "build",
|
||||
# "dist",
|
||||
# "old",
|
||||
# "*.egg-info",
|
||||
# ]
|
||||
|
||||
# [tool.flakeheaven.plugins]
|
||||
# mccabe = ["+C*"]
|
||||
# pycodestyle = ["+E*", "+W*", "-E203", "-E501", "-W503"]
|
||||
# pyflakes = ["+F*"]
|
||||
# flake8-bugbear = ["+*", "+B950"]
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
"""anki-hsk-creator"""
|
||||
|
||||
import os
|
||||
|
||||
# Globals

# SECURITY: access tokens are secrets and must never be committed. The
# Hugging Face token is taken from the HF_TOKEN environment variable, which
# the user has to export (or keep in a local, untracked .env file).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Argos Translate state: whether the package index was refreshed, and the
# list of packages it offered.
ARGOS_UPDATED = False
ARGOS_PACKAGES = None
|
||||
@@ -1,184 +0,0 @@
|
||||
## Imports
|
||||
from pathlib import Path
|
||||
import random
|
||||
import csv
|
||||
|
||||
## PIP
|
||||
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
|
||||
|
||||
## Main
|
||||
|
||||
|
||||
def process_files():
    """Interactively pick an input file and derive the paths used to process it.

    The user browses the INPUT folder level by level until a regular file is
    chosen. The matching output folder and a per-file resources folder are
    created.

    Returns:
        A tuple ``(in_file, out_file, resources, file_type)`` where
        ``file_type`` is the first whitespace-separated token of the file,
        or ``""`` when the file is empty.

    Raises:
        FileNotFoundError: if the INPUT folder contains nothing to select.
    """
    print("Select data file:")
    in_file = None
    level = INPUT
    while not in_file:
        files = sorted(level.glob("*"))
        if not files:
            # Without this guard the prompt below would ask for a number in
            # [1-0] forever.
            if level == INPUT:
                raise FileNotFoundError(f"No input files found in {INPUT}")
            print(f"{level.relative_to(INPUT)} is empty, back to the top folder")
            level = INPUT
            continue
        for n, file in enumerate(files):
            print(f"{n+1} - {file.relative_to(INPUT)}")
        s = None
        # isdecimal() only accepts characters that int() can parse
        while not s or not s.isdecimal() or not (1 <= int(s) <= len(files)):
            s = input(f"Please select the file [1-{len(files)}]: ")
        selected = files[int(s) - 1]
        if selected.is_file():
            in_file = selected
        else:
            level = selected
    relative = in_file.relative_to(INPUT)
    out_file = OUTPUT / relative
    resources = RESOURCES / relative
    # One resources folder per input file, named after the file stem
    resources = resources.parent / resources.stem
    resources.mkdir(parents=True, exist_ok=True)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    tokens = in_file.read_text(encoding="utf8").split()
    file_type = tokens[0] if tokens else ""
    return in_file, out_file, resources, file_type
|
||||
|
||||
|
||||
def _choose_entry(word, pinyin, candidates):
    """Return the dictionary entry to use for *word*, asking the user on ties."""
    if len(candidates) == 1:
        return candidates[0]
    print(f"\nWARNING: {word} has multiple meanings:")
    matches = candidates
    if pinyin and pinyin != "ERROR":
        # Fall back to every candidate when the recorded pinyin matches none
        matches = [c for c in candidates if c.pinyin == pinyin] or candidates
    if len(matches) == 1:
        return matches[0]
    for n, w in enumerate(matches):
        print(f"{n+1} - {w}")
        for m in w.meanings:
            print(f"\t{m}")
    s = None
    # The valid range is the list that was actually shown to the user
    while not s or not s.isdecimal() or not (1 <= int(s) <= len(matches)):
        s = input(f"Please select the correct word [1-{len(matches)}]: ")
    return matches[int(s) - 1]


def dictionary_process(dictionary, tts, in_file, resources):
    """Process a dictionary file: resolve every word and generate its audio.

    Each line of *in_file* holds a word, optionally followed by its pinyin.
    On success the file is rewritten as ``word<TAB>pinyin`` lines, with
    ``ERROR`` in place of the pinyin for words missing from *dictionary*.
    The file is only written after every line was processed, so a failure
    leaves the input untouched.

    Args:
        dictionary: mapping of a word to the list of its candidate entries.
        tts: engine exposing ``generate(text, language_id=...)`` and ``sr``.
        in_file: path of the dictionary file to process.
        resources: folder where the ``<word>.wav`` files are stored.

    Returns:
        A list of ``(entry, audio_path)`` tuples, one per resolved word.
    """
    words_list = in_file.read_text(encoding="utf8").strip().split("\n")
    results = []
    updated_lines = []
    try:
        for words in words_list:
            fields = words.split()
            if not fields:
                continue  # blank line, nothing to look up
            word = fields[0]
            pinyin = " ".join(fields[1:]) or None
            candidates = dictionary.get(word)
            if not candidates:
                print("============================================")
                print(f"===================>ERROR: {word} not found")
                print("============================================")
                updated_lines.append(f"{word}\tERROR\n")
                continue
            entry = _choose_entry(word, pinyin, candidates)
            audio_path = resources / f"{word}.wav"
            if not audio_path.exists():
                audio = tts.generate(f"{word}。", language_id="zh")
                torchaudio.save(audio_path, audio, tts.sr)
            updated_lines.append(f"{word}\t{entry.pinyin}\n")
            results.append((entry, audio_path))
    except Exception as error:
        # Best effort: keep what was resolved so far, but say what went wrong
        print(f"ERROR: {in_file} left unchanged, processing stopped: {error!r}")
    else:
        in_file.write_text("".join(updated_lines), encoding="utf8")
    return results
|
||||
|
||||
|
||||
def translator_process(tts, resources, in_file):
    """Process a phrases file: one audio clip and one translation per line.

    Args:
        tts: engine exposing ``generate(text, language_id=...)`` and ``sr``.
        resources: folder where the ``N<index>.wav`` files are stored.
        in_file: path of the phrases file, one phrase per line.

    Returns:
        A list of ``[translated, phrase, audio_path]`` items.
    """
    # read_text closes the file; phrases are split per line so that a phrase
    # containing spaces stays in one piece, and blank lines are dropped.
    text = in_file.read_text(encoding="utf8")
    phrases = [line.strip() for line in text.splitlines() if line.strip()]
    results = []
    for n, phrase in enumerate(phrases):
        audio_path = resources / f"N{n}.wav"
        if not audio_path.exists():
            audio = tts.generate(f"{phrase}。", language_id="zh")
            torchaudio.save(audio_path, audio, tts.sr)
        translated = argostranslate.translate.translate(phrase, CN, EN)
        results.append([translated, phrase, audio_path])
    return results
|
||||
|
||||
|
||||
# def output_tsv(out_file, results):
|
||||
# """writes the output as a tsv file"""
|
||||
# final_file = out_file.parent / f"{out_file.stem}.tsv"
|
||||
# with final_file.open("w", encoding="utf8", newline="") as csvfile:
|
||||
# writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
|
||||
# for entry in results:
|
||||
# writer.writerow(
|
||||
# [
|
||||
# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
|
||||
# PinyinToneConverter().convert_text(entry.pinyin),
|
||||
# entry.simplified,
|
||||
# entry.traditional,
|
||||
# ]
|
||||
# )
|
||||
|
||||
|
||||
def output_anki_dictionary(out_file, results):
    """Write the dictionary results as an ``.apkg`` package beside *out_file*.

    The deck name is built from the path of *out_file* relative to OUTPUT,
    joined with ``::``. *results* holds ``(entry, audio)`` pairs.
    """
    target = out_file.parent / f"{out_file.stem}.apkg"
    name_parts = out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,)
    deck = Deck(random.randrange(1 << 30, 1 << 31), "::".join(name_parts))
    package = Package(deck)
    media = []
    for entry, audio in results:
        numbered = [f"{n+1}. {m}" for n, m in enumerate(entry.meanings)]
        fields = [
            "\n ".join(numbered),
            PinyinToneConverter().convert_text(entry.pinyin),
            entry.simplified,
            entry.traditional,
            f"[sound:{audio.name}]",
        ]
        card = Note(model=HSK_MODEL, fields=fields)
        media.append(audio)
        deck.add_note(card)
    package.media_files = media
    package.write_to_file(target)
|
||||
|
||||
|
||||
def output_anki_phrase(out_file, results):
    """Write the phrase results as an ``.apkg`` package beside *out_file*.

    The deck name is built from the path of *out_file* relative to OUTPUT,
    joined with ``::``. *results* holds ``(translated, phrase, audio)`` items.
    """
    target = out_file.parent / f"{out_file.stem}.apkg"
    name_parts = out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,)
    deck = Deck(random.randrange(1 << 30, 1 << 31), "::".join(name_parts))
    package = Package(deck)
    media = []
    for translated, phrase, audio in results:
        fields = [translated, phrase, f"[sound:{audio.name}]"]
        deck.add_note(Note(model=PHRASE_MODEL, fields=fields))
        media.append(audio)
    package.media_files = media
    package.write_to_file(target)
|
||||
|
||||
|
||||
def main():
    """Command line loop: select a file, process it and write the Anki deck.

    Raises:
        TypeError: if the selected file is neither a phrases nor a
            dictionary file.
    """
    tts = create_tts()
    dictionary = create_cedict()
    # create_translator requires the source and target language codes
    create_translator(CN, EN)
    while True:
        in_file, out_file, resources, _file_type = process_files()
        if PHRASES_TYPE in in_file.suffixes:
            results = translator_process(tts, resources, in_file)
            output_anki_phrase(out_file, results)
        elif DICT_TYPE in in_file.suffixes:
            results = dictionary_process(dictionary, tts, in_file, resources)
            output_anki_dictionary(out_file, results)
        else:
            raise TypeError("Error, filetype not especified!")


if __name__ == "__main__":
    main()
|
||||
@@ -1,88 +0,0 @@
|
||||
# anki-models.py
|
||||
|
||||
from genanki import Deck, Note, Model, Package
|
||||
|
||||
|
||||
# Constants
|
||||
|
||||
CSS = """
|
||||
.card {
|
||||
font-family: arial;
|
||||
font-size: 20px;
|
||||
text-align: center;
|
||||
color: black;
|
||||
background-color: white;
|
||||
}
|
||||
.simple {
|
||||
font-family: Arial;
|
||||
font-size: 100px;
|
||||
}
|
||||
.trad {
|
||||
font-family: Arial;
|
||||
font-size: 75px;
|
||||
}
|
||||
"""
|
||||
|
||||
# Models
|
||||
|
||||
# Note type for phrases: three fields and three cards per note.
PHRASE_MODEL = Model(
    model_id=2076166425,
    name="Phrase Model",
    fields=[{"name": "Translated"}, {"name": "Phrase"}, {"name": "Audio"}],
    templates=[
        # Translation (with audio) on the front, original phrase on the back
        {
            "name": "Card 1",
            "qfmt": "{{Translated}}<br>{{Audio}}",
            "afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
        },
        # Original phrase (with audio) on the front, translation on the back
        {
            "name": "Card 2",
            "qfmt": "{{Phrase}}<br>{{Audio}}",
            "afmt": '{{FrontSide}}<hr id="answer">{{Translated}}',
        },
        # Audio only on the front, original phrase on the back
        {
            "name": "Card 3",
            "qfmt": "{{Audio}}",
            "afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
        },
    ],
    css=CSS,
)
|
||||
|
||||
|
||||
# Note type for dictionary words: five fields and three cards per note.
# Every answer side uses a well-formed <hr id='answer'> separator (Cards 1
# and 3 previously emitted a stray quote inside the tag).
HSK_MODEL = Model(
    1708536519,
    "HSK Model",
    fields=[
        {"name": "English"},
        {"name": "Pinyin"},
        {"name": "Simplified"},
        {"name": "Traditional"},
        {"name": "Audio"},
    ],
    templates=[
        # Pinyin, meaning and audio on the front, characters on the back
        {
            "name": "Card 1",
            "qfmt": "<strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}",
            "afmt": (
                "{{FrontSide}}<hr id='answer'><div class='simple'>{{Simplified}}</div>"
                "<br><div class='trad'>{{Traditional}}</div>"
            ),
        },
        # Characters on the front, pinyin, meaning and audio on the back
        {
            "name": "Card 2",
            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
            "{{Traditional}}</div>",
            "afmt": (
                '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong>'
                "<br>{{English}}<br>{{Audio}}"
            ),
        },
        # Audio only on the front, pinyin and characters on the back
        {
            "name": "Card 3",
            "qfmt": "{{Audio}}",
            "afmt": (
                "{{FrontSide}}<hr id='answer'><strong>{{Pinyin}}</strong>"
                "<br><div class='simple'>{{Simplified}}</div>"
                "<br><div class='trad'>{{Traditional}}</div>"
            ),
        },
    ],
    css=CSS,
)
|
||||
@@ -1,19 +0,0 @@
|
||||
## Imports
|
||||
from pathlib import Path
|
||||
import random
|
||||
import importlib.resources
|
||||
|
||||
# Dictionary file looked up inside the package resources
_PACKAGE_FILES = importlib.resources.files("anki-hsk-creator")
CCCEDICT = _PACKAGE_FILES.joinpath("cedict_ts.u8")

# Data folders, two levels above this module
DATA = Path(__file__).parent.parent.joinpath("data")
INPUT = DATA.joinpath("input")
OUTPUT = DATA.joinpath("output")
RESOURCES = DATA.joinpath("resources")

# File Types
PHRASES_TYPE = ".phrases"
DICT_TYPE = ".dictionary"

# Language codes
CN = "zh"
EN = "en"
|
||||
@@ -1,53 +0,0 @@
|
||||
from cedict_utils.cedict import CedictParser
|
||||
import argostranslate.package
|
||||
import argostranslate.translate
|
||||
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
|
||||
|
||||
import torch
|
||||
import torchaudio
|
||||
|
||||
from . import ARGOS_UPDATED, ARGOS_PACKAGES
|
||||
from . import CCCEDICT
|
||||
|
||||
## Functions
|
||||
|
||||
|
||||
def create_cedict(language_id="en"):
    """Parse the CCCEDICT file and index its entries by simplified word.

    *language_id* is accepted but not used by this implementation.

    Returns:
        A dict mapping each simplified word to the list of its entries, in
        the order the parser produced them.
    """
    cedict_parser = CedictParser()
    cedict_parser.read_file(CCCEDICT)

    by_word = {}
    for item in cedict_parser.parse():
        by_word.setdefault(item.simplified, []).append(item)
    return by_word
|
||||
|
||||
|
||||
def create_translator(from_code, to_code):
    """Download and install the Argos Translate package for a language pair.

    The package index is refreshed only once per process.

    Args:
        from_code: language code of the source text.
        to_code: language code of the translation.

    Raises:
        ValueError: if Argos Translate offers no package for the pair.
    """
    # The flags are rebound below, so they must be declared global; without
    # this the first read of ARGOS_UPDATED raises UnboundLocalError.
    global ARGOS_UPDATED, ARGOS_PACKAGES
    if not ARGOS_UPDATED:
        argostranslate.package.update_package_index()
        ARGOS_PACKAGES = argostranslate.package.get_available_packages()
        ARGOS_UPDATED = True
    package_to_install = next(
        (
            package
            for package in ARGOS_PACKAGES
            if package.from_code == from_code and package.to_code == to_code
        ),
        None,
    )
    if package_to_install is None:
        raise ValueError(f"No translation package from {from_code} to {to_code}")
    argostranslate.package.install_from_path(package_to_install.download())
|
||||
|
||||
|
||||
def create_tts():
    """Load the multilingual TTS model on the first available device.

    The device is chosen in this order: CUDA, MPS, CPU.
    """
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    return ChatterboxMultilingualTTS.from_pretrained(device=device, t3_model="v3")
|
||||
@@ -1,4 +1,5 @@
|
||||
"""about.py"""
|
||||
# SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com>
|
||||
#
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
__version__ = "0.0.1"
|
||||
__version__ = "0.1.0"
|
||||
20
src/anki_hsk_creator/__init__.py
Normal file
20
src/anki_hsk_creator/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""anki_hsk_creator"""
|
||||
|
||||
# Standard Library
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Pip
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()

# Globals

# Hugging Face token used for TTS generation; None when HF_TOKEN is not set
HF_TOKEN = os.getenv("HF_TOKEN")

# Path
# The DATA_FOLDER environment variable overrides the folder in the user home
default_path = Path.home().joinpath("anki-hsk-creator-data")
DATA_FOLDER = Path(os.getenv("DATA_FOLDER", default_path))
DATA_FOLDER.mkdir(parents=True, exist_ok=True)
|
||||
52
src/anki_hsk_creator/__main__.py
Normal file
52
src/anki_hsk_creator/__main__.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""__main__.py"""
|
||||
|
||||
# Standard Library
|
||||
from pathlib import Path
|
||||
|
||||
# Local
|
||||
from .api import list_input_files, process_a_file, select_file
|
||||
from .constants import LANGUAGES
|
||||
|
||||
|
||||
def cli_select_files():
    """Browse the input folder interactively until a file is selected.

    Returns:
        The object returned by ``select_file`` for the chosen file.

    Raises:
        FileNotFoundError: if the top input folder has nothing to select.
    """
    print("Select data file:")
    level = Path()
    while True:
        files = sorted(list_input_files(level))
        if not files:
            # Without this guard the prompt below would ask for a number in
            # [1-0] forever.
            if level == Path():
                raise FileNotFoundError("There are no input files to select")
            print(f"{level} is empty, back to the top folder")
            level = Path()
            continue
        for n, file in enumerate(files):
            print(f"{n+1} - {file}")
        s = None
        # isdecimal() only accepts characters that int() can parse
        while not s or not s.isdecimal() or not (1 <= int(s) <= len(files)):
            s = input(f"Please select the file [1-{len(files)}]: ")
        selected = files[int(s) - 1]
        # The listed paths are relative to the input folder, so is_file()
        # cannot be used on them directly. select_file resolves the path
        # itself and raises ValueError for anything that is not a file.
        try:
            return select_file(selected)
        except ValueError:
            level = selected
|
||||
|
||||
|
||||
def cli_select_language():
    """Ask the user for the translation language and return its code."""
    # `available_languages` and `language_names` are read from an instance:
    # on the class itself they are `property` objects, which can neither be
    # iterated nor searched.
    languages = LANGUAGES()
    print("Select a language:")
    for language_id, language in languages.language_names.items():
        print(f"{language_id} - {language}")
    s = None
    while not s or s not in languages.available_languages:
        s = input(f"Please select the language: ({ languages.available_languages})")
    return s
|
||||
|
||||
|
||||
def main():
    """Run the command line loop: pick a file, pick a language, process it"""
    while True:
        chosen_file = cli_select_files()
        chosen_language = cli_select_language()
        process_a_file(chosen_file, chosen_language)


if __name__ == "__main__":
    main()
|
||||
159
src/anki_hsk_creator/anki_generation.py
Normal file
159
src/anki_hsk_creator/anki_generation.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""anki_generation.py
|
||||
|
||||
Produces anki output
|
||||
"""
|
||||
|
||||
# Standard Library
|
||||
import random
|
||||
|
||||
# Pip
|
||||
from genanki import Deck, Model, Note, Package
|
||||
|
||||
# Local
|
||||
from .utility import ProcessFile, TranslationResult
|
||||
|
||||
# from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
|
||||
|
||||
|
||||
# Constants
|
||||
|
||||
CSS = """
|
||||
.card {
|
||||
font-family: arial;
|
||||
font-size: 20px;
|
||||
text-align: center;
|
||||
color: black;
|
||||
background-color: white;
|
||||
}
|
||||
.simple {
|
||||
font-family: Arial;
|
||||
font-size: 100px;
|
||||
}
|
||||
.trad {
|
||||
font-family: Arial;
|
||||
font-size: 75px;
|
||||
}
|
||||
"""
|
||||
|
||||
# Models
|
||||
|
||||
# Note type for phrases: three fields and three cards per note.
PHRASE_MODEL = Model(
    model_id=2076166425,
    name="Phrase Model",
    fields=[{"name": "Translated"}, {"name": "Phrase"}, {"name": "Audio"}],
    templates=[
        # Translation (with audio) on the front, original phrase on the back
        {
            "name": "Card 1",
            "qfmt": "{{Translated}}<br>{{Audio}}",
            "afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
        },
        # Original phrase (with audio) on the front, translation on the back
        {
            "name": "Card 2",
            "qfmt": "{{Phrase}}<br>{{Audio}}",
            "afmt": '{{FrontSide}}<hr id="answer">{{Translated}}',
        },
        # Audio only on the front, original phrase on the back
        {
            "name": "Card 3",
            "qfmt": "{{Audio}}",
            "afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
        },
    ],
    css=CSS,
)
|
||||
|
||||
|
||||
# Note type for dictionary words: five fields and three cards per note.
# Every answer side uses the same well-formed <hr id='answer'> separator
# (Card 1 previously emitted a stray quote inside the tag).
HSK_MODEL = Model(
    1708536519,
    "HSK Model",
    fields=[
        {"name": "English"},
        {"name": "Pinyin"},
        {"name": "Simplified"},
        {"name": "Traditional"},
        {"name": "Audio"},
    ],
    templates=[
        # Pinyin, meaning and audio on the front, characters on the back
        {
            "name": "Card 1",
            "qfmt": "<strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}",
            "afmt": (
                "{{FrontSide}}<hr id='answer'><div class='simple'>{{Simplified}}</div>"
                "<br><div class='trad'>{{Traditional}}</div>"
            ),
        },
        # Characters on the front, pinyin, meaning and audio on the back
        {
            "name": "Card 2",
            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
            "{{Traditional}}</div>",
            "afmt": (
                "{{FrontSide}}<hr id='answer'><strong>{{Pinyin}}</strong>"
                "<br>{{English}}<br>{{Audio}}"
            ),
        },
        # Audio only on the front, pinyin and characters on the back
        {
            "name": "Card 3",
            "qfmt": "{{Audio}}",
            "afmt": (
                "{{FrontSide}}<hr id='answer'><strong>{{Pinyin}}</strong>"
                "<br><div class='simple'>{{Simplified}}</div>"
                "<br><div class='trad'>{{Traditional}}</div>"
            ),
        },
    ],
    css=CSS,
)
|
||||
|
||||
# Process
|
||||
|
||||
|
||||
# def output_anki_dictionary(out_file, results):
|
||||
# """Creates an anki file from a dictionary results"""
|
||||
# final_file = out_file.parent / f"{out_file.stem}.apkg"
|
||||
# deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
|
||||
# deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
|
||||
# package = Package(deck)
|
||||
# audios = []
|
||||
# for entry, audio in results:
|
||||
# note = Note(
|
||||
# model=HSK_MODEL,
|
||||
# fields=[
|
||||
# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
|
||||
# PinyinToneConverter().convert_text(entry.pinyin),
|
||||
# entry.simplified,
|
||||
# entry.traditional,
|
||||
# f"[sound:{audio.name}]",
|
||||
# ],
|
||||
# )
|
||||
# audios.append(audio)
|
||||
# deck.add_note(note)
|
||||
# package.media_files = audios
|
||||
# package.write_to_file(final_file)
|
||||
|
||||
|
||||
def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResult]):
    """Create an Anki package from the results of a phrases translation.

    Args:
        process_file: the processed file; its ``output_name`` gives the
            package file name and its ``out_folder`` the destination.
        results: one translation result per phrase.

    Raises:
        ValueError: from ``process_file.output_name`` when no language was
            selected.
    """
    # Only the file name of output_name is used, and the package is written
    # into the output folder prepared by ProcessFile rather than relative to
    # the current working directory. The trailing dot that output_name leaves
    # for the file type is stripped before the extension is appended.
    base_name = process_file.output_name.name.rstrip(".")
    final_file = process_file.out_folder / f"{base_name}.apkg"
    deck_name = "::".join(
        process_file.input_file.parts[:-1] + (process_file.input_file.stem,)
    )
    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
    package = Package(deck)
    audios = []
    for result in results:
        note = Note(
            model=PHRASE_MODEL,
            fields=[
                result.translated,
                result.line,
                f"[sound:{result.audio_path.name}]",
            ],
        )
        deck.add_note(note)
        audios.append(result.audio_path)
    package.media_files = audios
    package.write_to_file(final_file)
|
||||
72
src/anki_hsk_creator/api.py
Normal file
72
src/anki_hsk_creator/api.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""api.py
|
||||
|
||||
Interface for managing and processing files
|
||||
"""
|
||||
|
||||
# Standard Library
|
||||
from pathlib import Path
|
||||
|
||||
# Local
|
||||
from . import DATA_FOLDER
|
||||
from .anki_generation import output_anki_phrase
|
||||
from .constants import DICT_TYPE, INPUT, LANGUAGES, PHRASES_TYPE
|
||||
from .proccessor import translator_process
|
||||
from .utility import TRANS, TTS, ProcessFile
|
||||
|
||||
# interface
|
||||
|
||||
|
||||
def get_data_folder() -> Path:
    """Return the folder configured as the data folder of the package"""
    data_folder = DATA_FOLDER
    return data_folder
|
||||
|
||||
|
||||
def list_input_files(search_path: Path = Path()) -> list[Path]:
    """List the entries directly inside ``INPUT / search_path``.

    The returned paths are relative to the INPUT folder.
    """
    found = []
    for entry in (INPUT / search_path).glob("*"):
        found.append(entry.relative_to(INPUT))
    return found
|
||||
|
||||
|
||||
def select_file(file_path: Path) -> ProcessFile:
    """Wrap a path relative to INPUT into a ProcessFile.

    Raises:
        ValueError: if ``INPUT / file_path`` is not an existing file.
    """
    if not (INPUT / file_path).is_file():
        raise ValueError(f"{file_path} is not a file")
    return ProcessFile(file_path)
|
||||
|
||||
|
||||
def create_input_file(
    name: str, file_type: str, text: str, sub_folder: Path = Path()
) -> ProcessFile:
    """Write *text* to a new input file and return its ProcessFile.

    The file is stored as ``INPUT / sub_folder / <name><file_type>.txt``;
    missing folders are created.

    valid file_types: ".phrases", ".dictionary"

    Raises:
        ValueError: if *file_type* is not one of the valid file types.
    """
    allowed_types = (PHRASES_TYPE, DICT_TYPE)
    if file_type not in allowed_types:
        raise ValueError(f"file_type {file_type} not in {allowed_types}")
    relative = sub_folder / f"{name}{file_type}.txt"
    # write file
    target = INPUT / relative
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(text, encoding="utf8")
    # the relative path is what later processing works with
    return ProcessFile(relative)
|
||||
|
||||
|
||||
def process_a_file(process_file: ProcessFile, language_id: str):
    """Process an input file for the given language.

    Phrases files are synthesised, translated and written as an Anki
    package. Dictionary files are not implemented yet.

    Args:
        process_file: the file to process.
        language_id: language code of the translation.
    """
    process_file.language_id = language_id
    # Input files are named "<name><file_type>.txt", so the type is one of
    # the suffixes; `.suffix` alone is always ".txt" for them.
    suffixes = process_file.input_file.suffixes
    if PHRASES_TYPE in suffixes:
        TTS.create_tts()
        TRANS.create_translator(LANGUAGES.CN, language_id)
        # Input files are written as UTF-8, so read them back the same way
        # instead of relying on the platform default encoding.
        with process_file.absolute_input_file.open("r", encoding="utf8") as file:
            text_lines = [line.strip() for line in file]
        results = translator_process(text_lines, process_file, language_id)
        output_anki_phrase(process_file, results)
    elif DICT_TYPE in suffixes:
        print("not implemented")
    else:
        print(f"{process_file.input_file}: unsupported file type, nothing done")
|
||||
51
src/anki_hsk_creator/constants.py
Normal file
51
src/anki_hsk_creator/constants.py
Normal file
@@ -0,0 +1,51 @@
|
||||
"""constants.py"""
|
||||
|
||||
# Standard Library
|
||||
import importlib.resources
|
||||
|
||||
# Local
|
||||
from . import DATA_FOLDER
|
||||
|
||||
# Resources
|
||||
# The import package is named with underscores ("anki_hsk_creator"); the
# hyphenated project name cannot be resolved by importlib.resources.
CCCEDICT_PATH = importlib.resources.files("anki_hsk_creator").joinpath(
    "cedict_ts.u8"
)

# Data folder structure, created on import
INPUT = DATA_FOLDER / "input"
INPUT.mkdir(exist_ok=True, parents=True)
OUTPUT = DATA_FOLDER / "output"
OUTPUT.mkdir(exist_ok=True, parents=True)
RESOURCES = DATA_FOLDER / "resources"
RESOURCES.mkdir(exist_ok=True, parents=True)

# File Types
PHRASES_TYPE = ".phrases"
DICT_TYPE = ".dictionary"
|
||||
|
||||
|
||||
class LANGUAGES:
    """Language codes and the languages available for translation.

    The class is used as a static namespace (``LANGUAGES.CN``), so
    ``language_names`` and ``available_languages`` are class attributes:
    they can be read from the class itself as well as from an instance.
    """

    CN = "zh"
    EN = "en"
    ES = "es"
    FR = "fr"
    RU = "ru"
    TR = "tr"
    TH = "th"

    # Display name of every language available for translation.
    # NOTE: shared by all readers; treat it as read-only.
    language_names = {
        EN: "English",
        ES: "Spanish",
        FR: "French",
        RU: "Russian",
        TR: "Turkish",
        TH: "Thai",
    }

    # Codes of the languages available for translation, in display order
    available_languages = tuple(language_names)
|
||||
96
src/anki_hsk_creator/proccessor.py
Normal file
96
src/anki_hsk_creator/proccessor.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""processor.py"""
|
||||
|
||||
# Pip
|
||||
import argostranslate.translate
|
||||
import torchaudio
|
||||
|
||||
# Local
|
||||
from .constants import LANGUAGES
|
||||
from .utility import TTS, ProcessFile, TranslationResult # , CCCEDICT
|
||||
|
||||
# Results Classes
|
||||
|
||||
|
||||
def translator_process(
    text_lines: list[str],
    process_file: ProcessFile,
    language_id: str,
) -> list[TranslationResult]:
    """Generate the audio and the translation of every phrase.

    Args:
        text_lines: the phrases, one per item; blank items are skipped.
        process_file: gives the resources folder where audio is stored.
        language_id: language code of the translation.

    Returns:
        One TranslationResult per non-blank phrase.
    """
    results = []
    for n, line in enumerate(text_lines):
        line = line.strip()
        if not line:
            continue  # nothing to synthesise or translate
        # Zero-padded index, e.g. N007.wav
        audio_path = process_file.resources / f"N{n:03d}.wav"
        if not audio_path.exists():
            if TTS.MODEL is None:
                TTS.create_tts()
            audio = TTS.MODEL.generate(f"{line}。", language_id=LANGUAGES.CN)
            torchaudio.save(audio_path, audio, TTS.MODEL.sr)
        translated = argostranslate.translate.translate(line, LANGUAGES.CN, language_id)
        results.append(TranslationResult(language_id, translated, line, audio_path))
    return results
|
||||
|
||||
|
||||
# def dictionary_process(dictionary, tts, in_file, resources):
|
||||
# """Process dictionary files"""
|
||||
# words_list = in_file.open(encoding="utf8").read().strip().split("\n")
|
||||
# results = []
|
||||
# try:
|
||||
# with in_file.open("w", encoding="utf8") as input_file:
|
||||
# for words in words_list:
|
||||
# word = words.split()[0]
|
||||
# pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None
|
||||
# if v := dictionary.get(word):
|
||||
# if len(v) > 1:
|
||||
# print(f"\nWARNING: {word} has multiple meanings:")
|
||||
# if pinyin and pinyin != "ERROR":
|
||||
# ml = list(filter(lambda x: x.pinyin == pinyin, v))
|
||||
# else:
|
||||
# ml = v
|
||||
# if len(ml) > 1:
|
||||
# for n, w in enumerate(ml):
|
||||
# print(f"{n+1} - {w}")
|
||||
# for m in w.meanings:
|
||||
# print(f"\t{m}")
|
||||
# s = None
|
||||
# while (
|
||||
# not s
|
||||
# or not s.isnumeric()
|
||||
# or not (1 <= int(s) <= len(v))
|
||||
# ):
|
||||
# s = input(
|
||||
# f"Please select the correct word [1-{len(v)}]: "
|
||||
# )
|
||||
# v = v[int(s) - 1]
|
||||
# else:
|
||||
# v = ml[0]
|
||||
# else:
|
||||
# v = v[0]
|
||||
# audio_path = resources / f"{word}.wav"
|
||||
# if not audio_path.exists():
|
||||
# audio = tts.generate(f"{word}。", language_id="zh")
|
||||
# torchaudio.save(audio_path, audio, tts.sr)
|
||||
# input_file.write(f"{word}\t{v.pinyin}\n")
|
||||
# results.append((v, audio_path))
|
||||
# else:
|
||||
# print("============================================")
|
||||
# print(f"===================>ERROR: {word} not found")
|
||||
# print("============================================")
|
||||
# input_file.write(f"{word}\tERROR\n")
|
||||
# except Exception:
|
||||
# with in_file.open("w", encoding="utf8") as input_file:
|
||||
# input_file.write("\n".join(words_list))
|
||||
# return results
|
||||
|
||||
# def output_tsv(out_file, results):
|
||||
# """writes the output as a tsv file"""
|
||||
# final_file = out_file.parent / f"{out_file.stem}.tsv"
|
||||
# with final_file.open("w", encoding="utf8", newline="") as csvfile:
|
||||
# writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
|
||||
# for entry in results:
|
||||
# writer.writerow(
|
||||
# [
|
||||
# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
|
||||
# PinyinToneConverter().convert_text(entry.pinyin),
|
||||
# entry.simplified,
|
||||
# entry.traditional,
|
||||
# ]
|
||||
# )
|
||||
154
src/anki_hsk_creator/utility.py
Normal file
154
src/anki_hsk_creator/utility.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""utility.py
|
||||
|
||||
|
||||
Static classes and functions for general use
|
||||
"""
|
||||
|
||||
# Standard Library
|
||||
from pathlib import Path
|
||||
|
||||
# Pip
|
||||
import argostranslate.package
|
||||
import argostranslate.translate
|
||||
import torch
|
||||
from cedict_utils.cedict import CedictParser
|
||||
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
|
||||
|
||||
# Local
|
||||
from .constants import CCCEDICT_PATH, INPUT, LANGUAGES, OUTPUT, RESOURCES
|
||||
|
||||
# Static Classes
|
||||
|
||||
|
||||
class TRANS:
    """Static Class for Argos translate"""

    # Whether the package index was already refreshed in this process
    UPDATED = False
    # Packages offered by the index, filled on the first refresh
    PACKAGES = None

    @staticmethod
    def create_translator(from_code, to_code):
        """Make sure the Argos Translate package for a language pair exists.

        Nothing is downloaded when the pair is already installed.

        Args:
            from_code: language code of the source text.
            to_code: language code of the translation.

        Raises:
            ValueError: if Argos Translate offers no package for the pair.
        """
        installed = argostranslate.package.get_installed_packages()
        if any(p.from_code == from_code and p.to_code == to_code for p in installed):
            return
        if not TRANS.UPDATED:
            argostranslate.package.update_package_index()
            TRANS.PACKAGES = argostranslate.package.get_available_packages()
            TRANS.UPDATED = True
        package_to_install = next(
            (
                package
                for package in TRANS.PACKAGES
                if package.from_code == from_code and package.to_code == to_code
            ),
            None,
        )
        if package_to_install is None:
            raise ValueError(f"No translation package from {from_code} to {to_code}")
        argostranslate.package.install_from_path(package_to_install.download())
|
||||
|
||||
|
||||
class CCCEDICT:
    """Static Class for the CCCEDICT dictionary"""

    # Parser and parsed entries, loaded on first use
    PARSER = None
    ENTRIES = []
    # One word -> entries dictionary per language code
    DICTIONARY_LIST = {}

    @staticmethod
    def create_cedict(language_id=LANGUAGES.EN):
        """Return the dictionary for *language_id*, building it on first use.

        For a language other than English a copy of every entry is stored
        with its meanings translated from English.
        NOTE(review): this runs one translation per meaning of every entry,
        which is presumably slow on the full dictionary.

        Returns:
            A dict mapping each simplified word to the list of its entries.
        """
        import copy

        if not CCCEDICT.PARSER:
            parser = CedictParser()
            parser.read_file(CCCEDICT_PATH)
            CCCEDICT.ENTRIES = parser.parse()
            CCCEDICT.PARSER = parser
        if language_id in CCCEDICT.DICTIONARY_LIST:
            return CCCEDICT.DICTIONARY_LIST[language_id]
        translate_meanings = language_id != LANGUAGES.EN
        if translate_meanings:
            # Set up the translator once, not once per entry
            TRANS.create_translator(LANGUAGES.EN, language_id)
        dictionary = {}
        for entry in CCCEDICT.ENTRIES:
            if translate_meanings:
                # Translate the meanings only and keep the entry object, so
                # that `simplified` and the other attributes stay available.
                # Assumes entries expose a `meanings` list of strings.
                entry = copy.copy(entry)
                entry.meanings = [
                    argostranslate.translate.translate(
                        meaning, LANGUAGES.EN, language_id
                    )
                    for meaning in entry.meanings
                ]
            dictionary.setdefault(entry.simplified, []).append(entry)
        CCCEDICT.DICTIONARY_LIST[language_id] = dictionary
        return dictionary
|
||||
|
||||
|
||||
class TTS:
    """Static holder of the text-to-speech engine"""

    # Loaded model and the device it was loaded on; None until create_tts runs
    MODEL = None
    DEVICE = None

    @staticmethod
    def create_tts():
        """Select the device and load the model, each only once"""
        if TTS.DEVICE is None:
            # Preference order: CUDA, then MPS, then CPU
            if torch.cuda.is_available():
                chosen = "cuda"
            elif torch.backends.mps.is_available():
                chosen = "mps"
            else:
                chosen = "cpu"
            TTS.DEVICE = chosen
        if TTS.MODEL is not None:
            return
        TTS.MODEL = ChatterboxMultilingualTTS.from_pretrained(
            device=TTS.DEVICE, t3_model="v3"
        )
|
||||
|
||||
|
||||
# Classes
|
||||
|
||||
|
||||
class ProcessFile:
    """Class that represents a file to process.

    The same input file gives a different output name for each language.
    """

    def __init__(self, input_file: Path, language_id: str | None = None):
        """Store the input path and create the output and resources folders.

        Args:
            input_file: path of the input file, relative to the INPUT folder.
            language_id: language code of the translation, if already known.
        """
        self.input_file = input_file
        self._language_id = language_id
        # process file type
        self.out_folder = OUTPUT / input_file.parent
        self.out_folder.mkdir(parents=True, exist_ok=True)
        resources = RESOURCES / input_file
        # One resources folder per input file, named after the file stem
        self.resources = resources.parent / resources.stem
        self.resources.mkdir(parents=True, exist_ok=True)

    @property
    def absolute_input_file(self):
        """Absolute input file"""
        return INPUT / self.input_file

    @property
    def language_id(self):
        """Language for this translation process"""
        return self._language_id

    @language_id.setter
    def language_id(self, value):
        self._language_id = value

    @property
    def output_name(self):
        """Possible name for the output file, still missing the file type.

        The name ends with a dot, where the file type is expected to go.

        Raises:
            ValueError: if no language was selected yet.
        """
        if self.language_id is None:
            raise ValueError("Not a valid language selected")
        return self.input_file.parent / f"{self.input_file.stem}.{self.language_id}."
|
||||
|
||||
|
||||
class TranslationResult:
    """Outcome of translating one line of an input file"""

    def __init__(
        self,
        language_id: str,
        translated: str,
        line: str,
        audio_path: Path,
    ):
        """Keep the language code, both texts and the audio file path"""
        # translation side
        self.language_id, self.translated = language_id, translated
        # source side
        self.line, self.audio_path = line, audio_path
|
||||
Reference in New Issue
Block a user