Add Chatterbox audio generation

2026-05-27 12:49:31 +08:00
parent da472a6a8d
commit b1e0ed45b7
7 changed files with 68 additions and 43 deletions
--- a/anki-hsk-creator/init.py
+++ b/anki-hsk-creator/init.py
@@ -1 +1,2 @@
 """anki-hsk-creator"""
+HF_TOKEN = "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"
--- a/anki-hsk-creator/main.py
+++ b/anki-hsk-creator/main.py
@@ -9,6 +9,7 @@ from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
 from genanki import Deck, Note, Model, Package
 import argostranslate.package
 import argostranslate.translate
+from chatterbox.mtl_tts import ChatterboxMultilingualTTS

 ## Constants

@@ -16,6 +17,7 @@ CCCEDICT = Path(__file__).parent / "cedict_ts.u8"
 DATA = Path(__file__).parent.parent / "data"
 INPUT = DATA / "input"
 OUTPUT = DATA / "output"
+RESOURCES = DATA / "resources"
 CN = "zh"
 EN = "en"
 PHRASES_TYPE = ".phrases"
@@ -65,6 +67,8 @@ HSK_FRONT_TEMPLATE = """
 <tts service="android" voice="en-US">
 {{English}}
 </tts>
+<br>
+{{MyMedia}}
 """

 HSK_MODEL = Model(
@@ -75,6 +79,7 @@ HSK_MODEL = Model(
        {"name": "Pinyin"},
        {"name": "Simplified"},
        {"name": "Traditional"},
+        {'name': 'Audio'},
    ],
    templates=[
        {
@@ -87,7 +92,7 @@ HSK_MODEL = Model(
            "name": "Card 2",
            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
            "{{Traditional}}</div>",
-            "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}',
+            "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}<br>{{MyMedia}}',
        },
    ],
    css=CSS,
@@ -122,6 +127,9 @@ def create_translator():
    )
    argostranslate.package.install_from_path(package_to_install.download())

+def create_tts():
+    tts = ChatterboxMultilingualTTS.from_pretrained(device="cuda")
+    return tts

 ## Main

@@ -145,15 +153,18 @@ def process_files():
            level = selected
    relative = in_file.relative_to(INPUT)
    out_file = OUTPUT / relative
+    resources = RESOURCES / relative 
+    resources = resources.parent / resources.stem
+    resources.mkdir(parents=True, exist_ok=True)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with in_file.open(encoding="utf8") as input_file:
        file_type = input_file.read().split()[0]
-    return in_file, out_file, file_type
+    return in_file, out_file, resources, file_type


-def dictionary_process(dictionary, in_file):
+def dictionary_process(dictionary, tts, in_file, resources):
    """Process dictionary files"""
-    words_list = in_file.open(encoding="utf8").read().split("\n")
+    words_list = in_file.open(encoding="utf8").read().strip().split("\n")
    results = [] 
    with in_file.open("w", encoding="utf8") as input_file:
        for words in words_list:
@@ -163,21 +174,27 @@ def dictionary_process(dictionary, in_file):
                if len(v) > 1:
                    print(f"\nWARNING: {word} has multiple meanings:")
                    if pinyin and pinyin != "ERROR":
-                        ml = filter(lambda x: v.pinyin == pinyin, v)
+                        ml = list(filter(lambda x: x.pinyin == pinyin, v))
                    else:
                        ml = v
-                    for n, w in enumerate(ml):
-                        print(f"{n+1} - {w}")
-                        for m in w.meanings:
-                            print(f"\t{m}")
-                    s = None
-                    while not s or not s.isnumeric() or not (1 <= int(s) <= len(v)):
-                        s = input(f"Please select the correct word [1-{len(v)}]: ")
-                    v = v[int(s) - 1]
+                    if len(ml) > 1:
+                        for n, w in enumerate(ml):
+                            print(f"{n+1} - {w}")
+                            for m in w.meanings:
+                                print(f"\t{m}")
+                        s = None
+                        while not s or not s.isnumeric() or not (1 <= int(s) <= len(v)):
+                            s = input(f"Please select the correct word [1-{len(v)}]: ")
+                        v = v[int(s) - 1]
+                    else:
+                        v = ml[0]
                else:
                    v = v[0]
+                audio = tts.generate(word, language_id="zh")
+                audio_path = resources / f"{word}.wav"
+                ta.save(audio_path, audio, tts.sr)
                input_file.write(f"{word}\t{v.pinyin}\n")
-                results.append(v)
+                results.append((v, audio_path))
            else:
                print("============================================")
                print(f"===================>ERROR: {word} not found")
@@ -199,27 +216,29 @@ def translator_process(in_file):
    return results


-def output_tsv(out_file, results):
-    """writes the output as a tsv file"""
-    final_file = out_file.parent / f"{out_file.stem}.tsv"
-    with final_file.open("w", encoding="utf8", newline="") as csvfile:
-        writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
-        for entry in results:
-            writer.writerow(
-                [
-                    "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
-                    PinyinToneConverter().convert_text(entry.pinyin),
-                    entry.simplified,
-                    entry.traditional,
-                ]
-            )
+# def output_tsv(out_file, results):
+#     """writes the output as a tsv file"""
+#     final_file = out_file.parent / f"{out_file.stem}.tsv"
+#     with final_file.open("w", encoding="utf8", newline="") as csvfile:
+#         writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
+#         for entry in results:
+#             writer.writerow(
+#                 [
+#                     "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
+#                     PinyinToneConverter().convert_text(entry.pinyin),
+#                     entry.simplified,
+#                     entry.traditional,
+#                 ]
+#             )


 def output_anki_dictionary(out_file, results):
    final_file = out_file.parent / f"{out_file.stem}.apkg"
    deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
-    for entry in results:
+    package = Package(deck)
+    audios = []
+    for entry, audio in results:
        note = Note(
            model=HSK_MODEL,
            fields=[
@@ -227,11 +246,14 @@ def output_anki_dictionary(out_file, results):
                PinyinToneConverter().convert_text(entry.pinyin),
                entry.simplified,
                entry.traditional,
+                f"[sound:{audio.name}]"
            ],
        )
+        audios.append(audio)
        deck.add_note(note)
-    Package(deck).write_to_file(final_file)
-
+    package.media_files = audios
+    package.write_to_file(final_file)
+    

 def output_anki_text(out_file, results):
    final_file = out_file.parent / f"{out_file.stem}.apkg"
@@ -247,14 +269,15 @@ def output_anki_text(out_file, results):


 def main():
-    in_file, out_file, file_type = process_files()
+    in_file, out_file, resources, file_type = process_files()
    if PHRASES_TYPE in in_file.suffixes:
        create_translator()
        results = translator_process(in_file)
        output_anki_text(out_file, results)
    elif DICT_TYPE in in_file.suffixes:
+        tts = create_tts()
        dictionary = create_cedict()
-        results = dictionary_process(dictionary, in_file)
+        results = dictionary_process(dictionary, tts, in_file, resources)
        output_anki_dictionary(out_file, results)
    else:
        raise TypeError("Error, filetype not especified!")
--- a/anki_hsk_creator.egg-info/PKG-INFO
+++ b/anki_hsk_creator.egg-info/PKG-INFO
@@ -6,5 +6,6 @@ Requires-Dist: cedict-utils
 Requires-Dist: pinyin-tone-converter
 Requires-Dist: genanki
 Requires-Dist: argostranslate
+Requires-Dist: chatterbox-tts
 Dynamic: license-file
 Dynamic: requires-dist
--- a/anki_hsk_creator.egg-info/requires.txt
+++ b/anki_hsk_creator.egg-info/requires.txt
@@ -2,3 +2,4 @@ cedict-utils
 pinyin-tone-converter
 genanki
 argostranslate
+chatterbox-tts
--- a/data/input/HSK1/HSK1-1.dictionary.txt
+++ b/data/input/HSK1/HSK1-1.dictionary.txt
@@ -1,9 +1,7 @@
-你
-好
-您
-你好
-您好
-你们好
-您们好
-对不起
-没关系
+你	ni3
+好	hao3
+您	nin2
+你好	ni3 hao3
+您好	nin2 hao3
+对不起	dui4 bu5 qi3
+没关系	mei2 guan1 xi5
--- a/data/output/HSK1/HSK1-1.dictionary.apkg
+++ b/data/output/HSK1/HSK1-1.dictionary.apkg
--- a/setup.py
+++ b/setup.py
@@ -9,6 +9,7 @@ setup(
        "pinyin-tone-converter",
        "genanki",
        "argostranslate",
-        "chattts",
+        "chatterbox-tts",
+        "torchaudio",
    ],
 )