From b1e0ed45b776b87147e3d9646abbfcf9cdf4ce1b Mon Sep 17 00:00:00 2001
From: Wolfang Torres <wolfangtorres@qq.com>
Date: Wed, 27 May 2026 12:49:31 +0800
Subject: [PATCH] add chatterbox audio generation

---
 anki-hsk-creator/__init__.py            |   1 +
 anki-hsk-creator/__main__.py            |  89 +++++++++++++++---------
 anki_hsk_creator.egg-info/PKG-INFO      |   1 +
 anki_hsk_creator.egg-info/requires.txt  |   1 +
 data/input/HSK1/HSK1-1.dictionary.txt   |  16 ++---
 data/output/HSK1/HSK1-1.dictionary.apkg | Bin 53466 -> 53466 bytes
 setup.py                                |   3 +-
 7 files changed, 68 insertions(+), 43 deletions(-)
diff --git a/anki-hsk-creator/__init__.py b/anki-hsk-creator/__init__.py
index e0f45dd..5ced5d1 100644
--- a/anki-hsk-creator/__init__.py
+++ b/anki-hsk-creator/__init__.py
@@ -1 +1,5 @@
 """anki-hsk-creator"""
+import os
+
+# Read the Hugging Face token from the environment; never commit credentials.
+HF_TOKEN = os.environ.get("HF_TOKEN", "")
diff --git a/anki-hsk-creator/__main__.py b/anki-hsk-creator/__main__.py
index b5da595..a352c35 100644
--- a/anki-hsk-creator/__main__.py
+++ b/anki-hsk-creator/__main__.py
@@ -9,6 +9,8 @@ from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
 from genanki import Deck, Note, Model, Package
 import argostranslate.package
 import argostranslate.translate
+import torchaudio as ta
+from chatterbox.mtl_tts import ChatterboxMultilingualTTS
 
 ## Constants
 
@@ -16,6 +17,7 @@ CCCEDICT = Path(__file__).parent / "cedict_ts.u8"
 DATA = Path(__file__).parent.parent / "data"
 INPUT = DATA / "input"
 OUTPUT = DATA / "output"
+RESOURCES = DATA / "resources"
 CN = "zh"
 EN = "en"
 PHRASES_TYPE = ".phrases"
@@ -65,6 +67,8 @@ HSK_FRONT_TEMPLATE = """
 <tts service="android" voice="en-US">
 {{English}}
 </tts>
+<br>
+{{Audio}}
 """
 
 HSK_MODEL = Model(
@@ -75,6 +79,7 @@ HSK_MODEL = Model(
         {"name": "Pinyin"},
         {"name": "Simplified"},
         {"name": "Traditional"},
+        {"name": "Audio"},
     ],
     templates=[
         {
@@ -87,7 +92,7 @@ HSK_MODEL = Model(
             "name": "Card 2",
             "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
             "{{Traditional}}</div>",
-            "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}',
+            "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}',
         },
     ],
     css=CSS,
@@ -122,6 +127,17 @@ def create_translator():
     )
     argostranslate.package.install_from_path(package_to_install.download())
 
+
+def create_tts():
+    """Load the multilingual Chatterbox TTS model.
+
+    Uses the GPU when CUDA is available and falls back to the CPU otherwise.
+    """
+    import torch  # dependency of chatterbox-tts; only needed to pick the device
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    return ChatterboxMultilingualTTS.from_pretrained(device=device)
+
 
 ## Main
 
@@ -145,15 +153,18 @@ def process_files():
             level = selected
     relative = in_file.relative_to(INPUT)
     out_file = OUTPUT / relative
+    resources = RESOURCES / relative
+    resources = resources.parent / resources.stem
+    resources.mkdir(parents=True, exist_ok=True)
     out_file.parent.mkdir(parents=True, exist_ok=True)
     with in_file.open(encoding="utf8") as input_file:
         file_type = input_file.read().split()[0]
-    return in_file, out_file, file_type
+    return in_file, out_file, resources, file_type
 
 
-def dictionary_process(dictionary, in_file):
+def dictionary_process(dictionary, tts, in_file, resources):
     """Process dictionary files"""
-    words_list = in_file.open(encoding="utf8").read().split("\n")
+    words_list = in_file.read_text(encoding="utf8").strip().split("\n")
     results = [] 
     with in_file.open("w", encoding="utf8") as input_file:
         for words in words_list:
@@ -163,21 +174,27 @@ def dictionary_process(dictionary, in_file):
                 if len(v) > 1:
                     print(f"\nWARNING: {word} has multiple meanings:")
                     if pinyin and pinyin != "ERROR":
-                        ml = filter(lambda x: v.pinyin == pinyin, v)
+                        ml = [x for x in v if x.pinyin == pinyin] or v
                     else:
                         ml = v
-                    for n, w in enumerate(ml):
-                        print(f"{n+1} - {w}")
-                        for m in w.meanings:
-                            print(f"\t{m}")
-                    s = None
-                    while not s or not s.isnumeric() or not (1 <= int(s) <= len(v)):
-                        s = input(f"Please select the correct word [1-{len(v)}]: ")
-                    v = v[int(s) - 1]
+                    if len(ml) > 1:
+                        for n, w in enumerate(ml):
+                            print(f"{n+1} - {w}")
+                            for m in w.meanings:
+                                print(f"\t{m}")
+                        s = None
+                        while not s or not s.isnumeric() or not (1 <= int(s) <= len(ml)):
+                            s = input(f"Please select the correct word [1-{len(ml)}]: ")
+                        v = ml[int(s) - 1]
+                    else:
+                        v = ml[0]
                 else:
                     v = v[0]
+                audio = tts.generate(word, language_id="zh")
+                audio_path = resources / f"{word}.wav"
+                ta.save(audio_path, audio, tts.sr)
                 input_file.write(f"{word}\t{v.pinyin}\n")
-                results.append(v)
+                results.append((v, audio_path))
             else:
                 print("============================================")
                 print(f"===================>ERROR: {word} not found")
@@ -199,27 +216,29 @@ def translator_process(in_file):
     return results
 
 
-def output_tsv(out_file, results):
-    """writes the output as a tsv file"""
-    final_file = out_file.parent / f"{out_file.stem}.tsv"
-    with final_file.open("w", encoding="utf8", newline="") as csvfile:
-        writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
-        for entry in results:
-            writer.writerow(
-                [
-                    "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
-                    PinyinToneConverter().convert_text(entry.pinyin),
-                    entry.simplified,
-                    entry.traditional,
-                ]
-            )
+# def output_tsv(out_file, results):
+#     """writes the output as a tsv file"""
+#     final_file = out_file.parent / f"{out_file.stem}.tsv"
+#     with final_file.open("w", encoding="utf8", newline="") as csvfile:
+#         writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
+#         for entry in results:
+#             writer.writerow(
+#                 [
+#                     "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
+#                     PinyinToneConverter().convert_text(entry.pinyin),
+#                     entry.simplified,
+#                     entry.traditional,
+#                 ]
+#             )
 
 
 def output_anki_dictionary(out_file, results):
     final_file = out_file.parent / f"{out_file.stem}.apkg"
     deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
     deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
-    for entry in results:
+    package = Package(deck)
+    audios = []
+    for entry, audio in results:
         note = Note(
             model=HSK_MODEL,
             fields=[
@@ -227,11 +246,14 @@ def output_anki_dictionary(out_file, results):
                 PinyinToneConverter().convert_text(entry.pinyin),
                 entry.simplified,
                 entry.traditional,
+                f"[sound:{audio.name}]",
             ],
         )
+        audios.append(audio)
         deck.add_note(note)
-    Package(deck).write_to_file(final_file)
+    package.media_files = audios
+    package.write_to_file(final_file)
 
 
 def output_anki_text(out_file, results):
     final_file = out_file.parent / f"{out_file.stem}.apkg"
@@ -247,14 +269,15 @@ def output_anki_text(out_file, results):
 
 
 def main():
-    in_file, out_file, file_type = process_files()
+    in_file, out_file, resources, file_type = process_files()
     if PHRASES_TYPE in in_file.suffixes:
         create_translator()
         results = translator_process(in_file)
         output_anki_text(out_file, results)
     elif DICT_TYPE in in_file.suffixes:
+        tts = create_tts()
         dictionary = create_cedict()
-        results = dictionary_process(dictionary, in_file)
+        results = dictionary_process(dictionary, tts, in_file, resources)
         output_anki_dictionary(out_file, results)
     else:
         raise TypeError("Error, filetype not especified!")
diff --git a/anki_hsk_creator.egg-info/PKG-INFO b/anki_hsk_creator.egg-info/PKG-INFO
index bef5ae1..faee22e 100644
--- a/anki_hsk_creator.egg-info/PKG-INFO
+++ b/anki_hsk_creator.egg-info/PKG-INFO
@@ -6,5 +6,6 @@ Requires-Dist: cedict-utils
 Requires-Dist: pinyin-tone-converter
 Requires-Dist: genanki
 Requires-Dist: argostranslate
+Requires-Dist: chatterbox-tts
 Dynamic: license-file
 Dynamic: requires-dist
diff --git a/anki_hsk_creator.egg-info/requires.txt b/anki_hsk_creator.egg-info/requires.txt
index de742a5..4406c32 100644
--- a/anki_hsk_creator.egg-info/requires.txt
+++ b/anki_hsk_creator.egg-info/requires.txt
@@ -2,3 +2,4 @@ cedict-utils
 pinyin-tone-converter
 genanki
 argostranslate
+chatterbox-tts
diff --git a/data/input/HSK1/HSK1-1.dictionary.txt b/data/input/HSK1/HSK1-1.dictionary.txt
index 3ec388f..828c31f 100644
--- a/data/input/HSK1/HSK1-1.dictionary.txt
+++ b/data/input/HSK1/HSK1-1.dictionary.txt
@@ -1,9 +1,7 @@
-你
-好
-您
-你好
-您好
-你们好
-您们好
-对不起
-没关系
\ No newline at end of file
+你	ni3
+好	hao3
+您	nin2
+你好	ni3 hao3
+您好	nin2 hao3
+对不起	dui4 bu5 qi3
+没关系	mei2 guan1 xi5
diff --git a/data/output/HSK1/HSK1-1.dictionary.apkg b/data/output/HSK1/HSK1-1.dictionary.apkg
index d5ae27839e387d0abdb872f189a92bbe0586d00d..e13640e7ae956b786f26daaca3e41be91ec27f50 100644
GIT binary patch
delta 1359
zcmZvcO-~a+9EWLlH`!`RBdZ*I!xx0ocH2^Hq10#(dN3G8!iJz;44Os`MB+^kz@Z<&
zOiVnX1e4mAZM&Vm(RvXN#>7KUCh`G1d-Di0({aZ9<DQ=9H@}&k*-7?bM>*J0hUVuK
z_2dA3(st+3^5XuYGlUG!)J$?R9-EHGcFTsMBx6zR40}`=qc5i)OT*@wdp`BUn_?1e
z2F|*7{b)OIf0TG)aNwe<x=w68+nUgIZ|2qAJL{QrWYz!lN9^nBDN;0o`*h+I`Wa*s
z{{++2qbL<(z8p$iCT@E9Lf#i-o!cv8*3!!Q=-Pr<rxtoi=LJyIJWn=&=gpgL8jPd2
zk#jU?MXn9fAcbDv7&fxj-op29zv&hZFX3M#pG6n0|1Y26{U+#7^FGYuypKWuD(}NQ
z#`{&!pX7a*M|j@`{V?ytJjDA2&=2xH%qMt12l{^6Kc2Njj?MI^$Sv|Ak0~zg(6#*}
zYc(ZeiB*ZV#Dc^e4nJS>ZDg&k%sLX=5?c}*66+Fc5*>*ZiDii;iA912KSW-}A8~kd
ze1vRlj~%b;Q#6fH-#fyaENugC3CC=_0lY4}%EoKJoxVRHU)E-YiYWBGEWE(ROTdf5
zb8I{hoSg6dm~VsLG#l^wG*#C%;T_;@;4R<{;C0|N;12VL@`V*pC<8A6F9Od4CkL#4
s9+@K-=u!0k)iniSK~+Jvpn{;>zyC-y+q<Iw{Z*n&FX2wx=v^;$1Yu(K8~^|S

delta 1359
zcmZvcO=}ZD9EY3iE^N%cWM8w{m+Y#wYLhgYloYIJ4hl*|t03YFf>sIkP%Pdw(Dss(
z7nzHv+R|$k>mCI0q8_}a7r}mjegH2X>+EElkw5O~d4BVo*_j>Ky-i_nQ!wY}1koPL
zp5peWdzW_azuhsBRp?C5%$#nw+U>1(mLRmJTj`El5P6Kg`470Ijo_LtZvA>$MXQrx
zeq$f4PhKC#9%y5SMKNcrJy|;`%WCh%)ho-rZgcU(;~%Z>iw-VYwXfMk5B;o>iN9;z
z%pEjaCw{x$If~s>=|bu=yw2r?iJP|;mdBTFuysDvpJ(%QbgEIs8%XCH7xG!4px4bP
z3qCf_YFUt=SLa=;Kd3(J1iRT54vwVzIFC_s?tghi`vK^Ov=4Ki_C3(A&_2w|wC{kv
zOZzalY2O5Wi}qn|(7q1(CEAC%M*9lrt69I;AFw=abU!yQ;tzR9$Vmse^+)|dz|rIA
za5OpU9F=tH>5{(EA0#}BIYt~qjy}f<$1+Ekqs`IcXmBiHwCEu;9;@lp>hTzFtV)iT
z=z=5>8hOk-AZY|VWbTo1A9#hiL&nR%-I3qHU)CfAn-xZGG1tks0ldUqA>$fwe7=nd
zeI>hTGM?y?D9aM_7<dFc1nvW`051b~iQnZfv_ZiFZU8R<*MQ>#zW)->;R`GX^62UU
iMjj)Fk;zDBr2PAjL}!M_<iEd4)Ejp4#clLvIQs|i#l1rS

diff --git a/setup.py b/setup.py
index 196a4ac..7776c9d 100644
--- a/setup.py
+++ b/setup.py
@@ -9,6 +9,7 @@ setup(
         "pinyin-tone-converter",
         "genanki",
         "argostranslate",
-        "chattts",
+        "chatterbox-tts",
+        "torchaudio",
     ],
 )