Add Chatterbox audio generation

This commit is contained in:
Wolfang Torres
2026-05-27 12:49:31 +08:00
parent da472a6a8d
commit b1e0ed45b7
7 changed files with 68 additions and 43 deletions

View File

@@ -1 +1,2 @@
"""anki-hsk-creator""" """anki-hsk-creator"""
import os

# Hugging Face access token, read from the environment so that no credential
# is stored in version control. Falls back to an empty string when the
# HF_TOKEN environment variable is unset.
# NOTE(review): a token literal was previously committed on this line; it
# should be revoked and rotated, since it remains in the repository history.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

View File

@@ -9,6 +9,7 @@ from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
from genanki import Deck, Note, Model, Package from genanki import Deck, Note, Model, Package
import argostranslate.package import argostranslate.package
import argostranslate.translate import argostranslate.translate
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
## Constants ## Constants
@@ -16,6 +17,7 @@ CCCEDICT = Path(__file__).parent / "cedict_ts.u8"
DATA = Path(__file__).parent.parent / "data" DATA = Path(__file__).parent.parent / "data"
INPUT = DATA / "input" INPUT = DATA / "input"
OUTPUT = DATA / "output" OUTPUT = DATA / "output"
RESOURCES = DATA / "resources"
CN = "zh" CN = "zh"
EN = "en" EN = "en"
PHRASES_TYPE = ".phrases" PHRASES_TYPE = ".phrases"
@@ -65,6 +67,8 @@ HSK_FRONT_TEMPLATE = """
<tts service="android" voice="en-US"> <tts service="android" voice="en-US">
{{English}} {{English}}
</tts> </tts>
<br>
{{MyMedia}}
""" """
HSK_MODEL = Model( HSK_MODEL = Model(
@@ -75,6 +79,7 @@ HSK_MODEL = Model(
{"name": "Pinyin"}, {"name": "Pinyin"},
{"name": "Simplified"}, {"name": "Simplified"},
{"name": "Traditional"}, {"name": "Traditional"},
{'name': 'Audio'},
], ],
templates=[ templates=[
{ {
@@ -87,7 +92,7 @@ HSK_MODEL = Model(
"name": "Card 2", "name": "Card 2",
"qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>" "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
"{{Traditional}}</div>", "{{Traditional}}</div>",
"afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}', "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}<br>{{MyMedia}}',
}, },
], ],
css=CSS, css=CSS,
@@ -122,6 +127,9 @@ def create_translator():
) )
argostranslate.package.install_from_path(package_to_install.download()) argostranslate.package.install_from_path(package_to_install.download())
def create_tts(device=None):
    """Load the pretrained Chatterbox multilingual TTS model.

    Args:
        device: Device string passed to ``from_pretrained``. When omitted,
            ``"cuda"`` is used if PyTorch reports an available GPU and
            ``"cpu"`` otherwise, so the tool also runs on machines
            without CUDA.

    Returns:
        The object returned by ``ChatterboxMultilingualTTS.from_pretrained``.
    """
    if device is None:
        # Imported locally so this function does not depend on a module-level
        # torch import being present.
        import torch

        device = "cuda" if torch.cuda.is_available() else "cpu"
    return ChatterboxMultilingualTTS.from_pretrained(device=device)
## Main ## Main
@@ -145,15 +153,18 @@ def process_files():
level = selected level = selected
relative = in_file.relative_to(INPUT) relative = in_file.relative_to(INPUT)
out_file = OUTPUT / relative out_file = OUTPUT / relative
resources = RESOURCES / relative
resources = resources.parent / resources.stem
resources.mkdir(parents=True, exist_ok=True)
out_file.parent.mkdir(parents=True, exist_ok=True) out_file.parent.mkdir(parents=True, exist_ok=True)
with in_file.open(encoding="utf8") as input_file: with in_file.open(encoding="utf8") as input_file:
file_type = input_file.read().split()[0] file_type = input_file.read().split()[0]
return in_file, out_file, file_type return in_file, out_file, resources, file_type
def dictionary_process(dictionary, in_file): def dictionary_process(dictionary, tts, in_file, resources):
"""Process dictionary files""" """Process dictionary files"""
words_list = in_file.open(encoding="utf8").read().split("\n") words_list = in_file.open(encoding="utf8").read().strip().split("\n")
results = [] results = []
with in_file.open("w", encoding="utf8") as input_file: with in_file.open("w", encoding="utf8") as input_file:
for words in words_list: for words in words_list:
@@ -163,9 +174,10 @@ def dictionary_process(dictionary, in_file):
if len(v) > 1: if len(v) > 1:
print(f"\nWARNING: {word} has multiple meanings:") print(f"\nWARNING: {word} has multiple meanings:")
if pinyin and pinyin != "ERROR": if pinyin and pinyin != "ERROR":
ml = filter(lambda x: v.pinyin == pinyin, v) ml = list(filter(lambda x: x.pinyin == pinyin, v))
else: else:
ml = v ml = v
if len(ml) > 1:
for n, w in enumerate(ml): for n, w in enumerate(ml):
print(f"{n+1} - {w}") print(f"{n+1} - {w}")
for m in w.meanings: for m in w.meanings:
@@ -174,10 +186,15 @@ def dictionary_process(dictionary, in_file):
while not s or not s.isnumeric() or not (1 <= int(s) <= len(v)): while not s or not s.isnumeric() or not (1 <= int(s) <= len(v)):
s = input(f"Please select the correct word [1-{len(v)}]: ") s = input(f"Please select the correct word [1-{len(v)}]: ")
v = v[int(s) - 1] v = v[int(s) - 1]
else:
v = ml[0]
else: else:
v = v[0] v = v[0]
audio = tts.generate(word, language_id="zh")
audio_path = resources / f"{word}.wav"
ta.save(audio_path, audio, tts.sr)
input_file.write(f"{word}\t{v.pinyin}\n") input_file.write(f"{word}\t{v.pinyin}\n")
results.append(v) results.append((v, audio_path))
else: else:
print("============================================") print("============================================")
print(f"===================>ERROR: {word} not found") print(f"===================>ERROR: {word} not found")
@@ -199,27 +216,29 @@ def translator_process(in_file):
return results return results
def output_tsv(out_file, results): # def output_tsv(out_file, results):
"""writes the output as a tsv file""" # """writes the output as a tsv file"""
final_file = out_file.parent / f"{out_file.stem}.tsv" # final_file = out_file.parent / f"{out_file.stem}.tsv"
with final_file.open("w", encoding="utf8", newline="") as csvfile: # with final_file.open("w", encoding="utf8", newline="") as csvfile:
writer = csv.writer(csvfile, delimiter="\t", quotechar='"') # writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
for entry in results: # for entry in results:
writer.writerow( # writer.writerow(
[ # [
"\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)), # "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
PinyinToneConverter().convert_text(entry.pinyin), # PinyinToneConverter().convert_text(entry.pinyin),
entry.simplified, # entry.simplified,
entry.traditional, # entry.traditional,
] # ]
) # )
def output_anki_dictionary(out_file, results): def output_anki_dictionary(out_file, results):
final_file = out_file.parent / f"{out_file.stem}.apkg" final_file = out_file.parent / f"{out_file.stem}.apkg"
deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,)) deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name) deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
for entry in results: package = Package(deck)
audios = []
for entry, audio in results:
note = Note( note = Note(
model=HSK_MODEL, model=HSK_MODEL,
fields=[ fields=[
@@ -227,10 +246,13 @@ def output_anki_dictionary(out_file, results):
PinyinToneConverter().convert_text(entry.pinyin), PinyinToneConverter().convert_text(entry.pinyin),
entry.simplified, entry.simplified,
entry.traditional, entry.traditional,
f"[sound:{audio.name}]"
], ],
) )
audios.append(audio)
deck.add_note(note) deck.add_note(note)
Package(deck).write_to_file(final_file) package.media_files = audios
package.write_to_file(final_file)
def output_anki_text(out_file, results): def output_anki_text(out_file, results):
@@ -247,14 +269,15 @@ def output_anki_text(out_file, results):
def main(): def main():
in_file, out_file, file_type = process_files() in_file, out_file, resources, file_type = process_files()
if PHRASES_TYPE in in_file.suffixes: if PHRASES_TYPE in in_file.suffixes:
create_translator() create_translator()
results = translator_process(in_file) results = translator_process(in_file)
output_anki_text(out_file, results) output_anki_text(out_file, results)
elif DICT_TYPE in in_file.suffixes: elif DICT_TYPE in in_file.suffixes:
tts = create_tts()
dictionary = create_cedict() dictionary = create_cedict()
results = dictionary_process(dictionary, in_file) results = dictionary_process(dictionary, tts, in_file, resources)
output_anki_dictionary(out_file, results) output_anki_dictionary(out_file, results)
else: else:
raise TypeError("Error, filetype not especified!") raise TypeError("Error, filetype not especified!")

View File

@@ -6,5 +6,6 @@ Requires-Dist: cedict-utils
Requires-Dist: pinyin-tone-converter Requires-Dist: pinyin-tone-converter
Requires-Dist: genanki Requires-Dist: genanki
Requires-Dist: argostranslate Requires-Dist: argostranslate
Requires-Dist: chatterbox-tts
Dynamic: license-file Dynamic: license-file
Dynamic: requires-dist Dynamic: requires-dist

View File

@@ -2,3 +2,4 @@ cedict-utils
pinyin-tone-converter pinyin-tone-converter
genanki genanki
argostranslate argostranslate
chatterbox-tts

View File

@@ -1,9 +1,7 @@
ni3
hao3
nin2
你好 你好 ni3 hao3
您好 您好 nin2 hao3
你们好 对不起 dui4 bu5 qi3
您们好 没关系 mei2 guan1 xi5
对不起
没关系

View File

@@ -9,6 +9,7 @@ setup(
"pinyin-tone-converter", "pinyin-tone-converter",
"genanki", "genanki",
"argostranslate", "argostranslate",
"chattts", "chatterbox-tts",
"torchaudio",
], ],
) )