add audio generation support

2026-05-27 14:42:39 +08:00
parent b1e0ed45b7
commit e35bcf6d74
13 changed files with 61 additions and 36 deletions
--- a/anki-hsk-creator/init.py
+++ b/anki-hsk-creator/init.py
@@ -1,2 +1,5 @@
 """anki-hsk-creator"""
-HF_TOKEN = "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"
+
+import os
+
+os.environ["HF_TOKEN"] = "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"
--- a/anki-hsk-creator/main.py
+++ b/anki-hsk-creator/main.py
@@ -10,6 +10,8 @@ from genanki import Deck, Note, Model, Package
 import argostranslate.package
 import argostranslate.translate
 from chatterbox.mtl_tts import ChatterboxMultilingualTTS
+import torch
+import torchaudio

 ## Constants

@@ -68,7 +70,7 @@ HSK_FRONT_TEMPLATE = """
 {{English}}
 </tts>
 <br>
-{{MyMedia}}
+{{Audio}}
 """

 HSK_MODEL = Model(
@@ -92,7 +94,7 @@ HSK_MODEL = Model(
            "name": "Card 2",
            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
            "{{Traditional}}</div>",
-            "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}<br>{{MyMedia}}',
+            "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}',
        },
    ],
    css=CSS,
@@ -128,7 +130,14 @@ def create_translator():
    argostranslate.package.install_from_path(package_to_install.download())

 def create_tts():
-    tts = ChatterboxMultilingualTTS.from_pretrained(device="cuda")
+    # Automatically detect the best available device
+    if torch.cuda.is_available():
+        device = "cuda"
+    elif torch.backends.mps.is_available():
+        device = "mps"
+    else:
+        device = "cpu"
+    tts = ChatterboxMultilingualTTS.from_pretrained(device=device)
    return tts

 ## Main
@@ -166,40 +175,45 @@ def dictionary_process(dictionary, tts, in_file, resources):
    """Process dictionary files"""
    words_list = in_file.open(encoding="utf8").read().strip().split("\n")
    results = [] 
-    with in_file.open("w", encoding="utf8") as input_file:
-        for words in words_list:
-            word = words.split()[0]
-            pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None
-            if v := dictionary.get(word):
-                if len(v) > 1:
-                    print(f"\nWARNING: {word} has multiple meanings:")
-                    if pinyin and pinyin != "ERROR":
-                        ml = list(filter(lambda x: x.pinyin == pinyin, v))
+    try:
+        with in_file.open("w", encoding="utf8") as input_file:
+            for words in words_list:
+                word = words.split()[0]
+                pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None
+                if v := dictionary.get(word):
+                    if len(v) > 1:
+                        print(f"\nWARNING: {word} has multiple meanings:")
+                        if pinyin and pinyin != "ERROR":
+                            ml = list(filter(lambda x: x.pinyin == pinyin, v))
+                        else:
+                            ml = v
+                        if len(ml) > 1:
+                            for n, w in enumerate(ml):
+                                print(f"{n+1} - {w}")
+                                for m in w.meanings:
+                                    print(f"\t{m}")
+                            s = None
+                            while not s or not s.isnumeric() or not (1 <= int(s) <= len(v)):
+                                s = input(f"Please select the correct word [1-{len(v)}]: ")
+                            v = v[int(s) - 1]
+                        else:
+                            v = ml[0]
                    else:
-                        ml = v
-                    if len(ml) > 1:
-                        for n, w in enumerate(ml):
-                            print(f"{n+1} - {w}")
-                            for m in w.meanings:
-                                print(f"\t{m}")
-                        s = None
-                        while not s or not s.isnumeric() or not (1 <= int(s) <= len(v)):
-                            s = input(f"Please select the correct word [1-{len(v)}]: ")
-                        v = v[int(s) - 1]
-                    else:
-                        v = ml[0]
+                        v = v[0]
+                    audio_path = resources / f"{word}.wav"
+                    if not audio_path.exists():
+                        audio = tts.generate(word, language_id="zh")
+                        torchaudio.save(audio_path, audio, tts.sr)
+                    input_file.write(f"{word}\t{v.pinyin}\n")
+                    results.append((v, audio_path))
                else:
-                    v = v[0]
-                audio = tts.generate(word, language_id="zh")
-                audio_path = resources / f"{word}.wav"
-                ta.save(audio_path, audio, tts.sr)
-                input_file.write(f"{word}\t{v.pinyin}\n")
-                results.append((v, audio_path))
-            else:
-                print("============================================")
-                print(f"===================>ERROR: {word} not found")
-                print("============================================")
-                input_file.write(f"{word}\tERROR\n")
+                    print("============================================")
+                    print(f"===================>ERROR: {word} not found")
+                    print("============================================")
+                    input_file.write(f"{word}\tERROR\n")
+    except Exception:
+        with in_file.open("w", encoding="utf8") as input_file:
+            input_file.write("\n".join(words_list))
    return results


--- a/anki_hsk_creator.egg-info/PKG-INFO
+++ b/anki_hsk_creator.egg-info/PKG-INFO
@@ -7,5 +7,8 @@ Requires-Dist: pinyin-tone-converter
 Requires-Dist: genanki
 Requires-Dist: argostranslate
 Requires-Dist: chatterbox-tts
+Requires-Dist: torch
+Requires-Dist: torchaudio
+Requires-Dist: torchcodec
 Dynamic: license-file
 Dynamic: requires-dist
--- a/anki_hsk_creator.egg-info/requires.txt
+++ b/anki_hsk_creator.egg-info/requires.txt
@@ -3,3 +3,6 @@ pinyin-tone-converter
 genanki
 argostranslate
 chatterbox-tts
+torch
+torchaudio
+torchcodec
--- a/data/output/HSK1/HSK1-1.dictionary.apkg
+++ b/data/output/HSK1/HSK1-1.dictionary.apkg
--- a/data/resources/HSK1/HSK1-1.dictionary/你.wav
+++ b/data/resources/HSK1/HSK1-1.dictionary/你.wav
--- a/data/resources/HSK1/HSK1-1.dictionary/你好.wav
+++ b/data/resources/HSK1/HSK1-1.dictionary/你好.wav
--- a/data/resources/HSK1/HSK1-1.dictionary/好.wav
+++ b/data/resources/HSK1/HSK1-1.dictionary/好.wav
--- a/data/resources/HSK1/HSK1-1.dictionary/对不起.wav
+++ b/data/resources/HSK1/HSK1-1.dictionary/对不起.wav
--- a/data/resources/HSK1/HSK1-1.dictionary/您.wav
+++ b/data/resources/HSK1/HSK1-1.dictionary/您.wav
--- a/data/resources/HSK1/HSK1-1.dictionary/您好.wav
+++ b/data/resources/HSK1/HSK1-1.dictionary/您好.wav
--- a/data/resources/HSK1/HSK1-1.dictionary/没关系.wav
+++ b/data/resources/HSK1/HSK1-1.dictionary/没关系.wav
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,8 @@ setup(
        "genanki",
        "argostranslate",
        "chatterbox-tts",
+        "torch",
        "torchaudio",
+        "torchcodec"
    ],
 )