remove data files to separate git

2026-06-11 16:41:34 +08:00
parent 4822c5bbed
commit 5ada53913a
13 changed files with 1061 additions and 318 deletions
--- a/src/anki-hsk-creator/about.py
+++ b/src/anki-hsk-creator/about.py
@@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+__version__ = "0.0.1"  # Package version string.
--- a/src/anki-hsk-creator/init.py
+++ b/src/anki-hsk-creator/init.py
@@ -0,0 +1,9 @@
+"""anki-hsk-creator"""
+
+import os
+
+# Globals
+
+os.environ["HF_TOKEN"] = "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"
+ARGOS_UPDATED = False
+ARGOS_PACKAGES = None
--- a/src/anki-hsk-creator/main.py
+++ b/src/anki-hsk-creator/main.py
@@ -0,0 +1,184 @@
+## Imports
+from pathlib import Path
+import random
+import csv
+
+## PIP
+from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
+
+## Main
+
+
+def process_files():
+    print("Select data file:")
+    in_file = None
+    level = INPUT
+    while not in_file:
+        files = []
+        for n, file in enumerate(level.glob("*")):
+            files.append(file)
+            print(f"{n+1} - {file.relative_to(INPUT)}")
+        s = None
+        while not s or not s.isnumeric() or not (1 <= int(s) <= len(files)):
+            s = input(f"Please select the file [1-{len(files)}]: ")
+        selected = files[int(s) - 1]
+        if selected.is_file():
+            in_file = selected
+        else:
+            level = selected
+    relative = in_file.relative_to(INPUT)
+    out_file = OUTPUT / relative
+    resources = RESOURCES / relative
+    resources = resources.parent / resources.stem
+    resources.mkdir(parents=True, exist_ok=True)
+    out_file.parent.mkdir(parents=True, exist_ok=True)
+    with in_file.open(encoding="utf8") as input_file:
+        file_type = input_file.read().split()[0]
+    return in_file, out_file, resources, file_type
+
+
+def dictionary_process(dictionary, tts, in_file, resources):
+    """Process dictionary files"""
+    words_list = in_file.open(encoding="utf8").read().strip().split("\n")
+    results = []
+    try:
+        with in_file.open("w", encoding="utf8") as input_file:
+            for words in words_list:
+                word = words.split()[0]
+                pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None
+                if v := dictionary.get(word):
+                    if len(v) > 1:
+                        print(f"\nWARNING: {word} has multiple meanings:")
+                        if pinyin and pinyin != "ERROR":
+                            ml = list(filter(lambda x: x.pinyin == pinyin, v))
+                        else:
+                            ml = v
+                        if len(ml) > 1:
+                            for n, w in enumerate(ml):
+                                print(f"{n+1} - {w}")
+                                for m in w.meanings:
+                                    print(f"\t{m}")
+                            s = None
+                            while (
+                                not s
+                                or not s.isnumeric()
+                                or not (1 <= int(s) <= len(v))
+                            ):
+                                s = input(
+                                    f"Please select the correct word [1-{len(v)}]: "
+                                )
+                            v = v[int(s) - 1]
+                        else:
+                            v = ml[0]
+                    else:
+                        v = v[0]
+                    audio_path = resources / f"{word}.wav"
+                    if not audio_path.exists():
+                        audio = tts.generate(f"{word}。", language_id="zh")
+                        torchaudio.save(audio_path, audio, tts.sr)
+                    input_file.write(f"{word}\t{v.pinyin}\n")
+                    results.append((v, audio_path))
+                else:
+                    print("============================================")
+                    print(f"===================>ERROR: {word} not found")
+                    print("============================================")
+                    input_file.write(f"{word}\tERROR\n")
+    except Exception:
+        with in_file.open("w", encoding="utf8") as input_file:
+            input_file.write("\n".join(words_list))
+    return results
+
+
+def translator_process(tts, resources, in_file):
+    """Process for phases trasnlation"""
+    text_list = in_file.open(encoding="utf8").read().strip().split()
+    results = []
+    for n, phrase in enumerate(text_list):
+        phrase = phrase.strip()
+        audio_path = resources / f"N{n}.wav"
+        if not audio_path.exists():
+            audio = tts.generate(f"{phrase}。", language_id="zh")
+            torchaudio.save(audio_path, audio, tts.sr)
+        translated = argostranslate.translate.translate(phrase, CN, EN)
+        results.append([translated, phrase, audio_path])
+    return results
+
+
+# def output_tsv(out_file, results):
+#     """writes the output as a tsv file"""
+#     final_file = out_file.parent / f"{out_file.stem}.tsv"
+#     with final_file.open("w", encoding="utf8", newline="") as csvfile:
+#         writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
+#         for entry in results:
+#             writer.writerow(
+#                 [
+#                     "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
+#                     PinyinToneConverter().convert_text(entry.pinyin),
+#                     entry.simplified,
+#                     entry.traditional,
+#                 ]
+#             )
+
+
+def output_anki_dictionary(out_file, results):
+    final_file = out_file.parent / f"{out_file.stem}.apkg"
+    deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
+    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
+    package = Package(deck)
+    audios = []
+    for entry, audio in results:
+        note = Note(
+            model=HSK_MODEL,
+            fields=[
+                "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
+                PinyinToneConverter().convert_text(entry.pinyin),
+                entry.simplified,
+                entry.traditional,
+                f"[sound:{audio.name}]",
+            ],
+        )
+        audios.append(audio)
+        deck.add_note(note)
+    package.media_files = audios
+    package.write_to_file(final_file)
+
+
+def output_anki_phrase(out_file, results):
+    final_file = out_file.parent / f"{out_file.stem}.apkg"
+    deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
+    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
+    package = Package(deck)
+    audios = []
+    for translated, phrase, audio in results:
+        note = Note(
+            model=PHRASE_MODEL,
+            fields=[
+                translated,
+                phrase,
+                f"[sound:{audio.name}]",
+            ],
+        )
+        deck.add_note(note)
+        audios.append(audio)
+    package.media_files = audios
+    package.write_to_file(final_file)
+
+
+def main():
+    tts = create_tts()
+    dictionary = create_cedict()
+    create_translator()
+    while True:
+        in_file, out_file, resources, file_type = process_files()
+        if PHRASES_TYPE in in_file.suffixes:
+            results = translator_process(tts, resources, in_file)
+            output_anki_phrase(out_file, results)
+        elif DICT_TYPE in in_file.suffixes:
+            results = dictionary_process(dictionary, tts, in_file, resources)
+            output_anki_dictionary(out_file, results)
+        else:
+            raise TypeError("Error, filetype not especified!")
+
+
+if __name__ == "__main__":
+    main()
--- a/src/anki-hsk-creator/anki-models.py
+++ b/src/anki-hsk-creator/anki-models.py
@@ -0,0 +1,88 @@
+# anki-models.py
+
+from genanki import Deck, Note, Model, Package
+
+
+# Constants
+
+# Stylesheet passed as ``css=`` to the note models defined in this module.
+# ``.simple`` and ``.trad`` are the classes the HSK templates put on the
+# simplified and traditional character blocks.
+CSS = """
+.card {
+ font-family: arial;
+ font-size: 20px;
+ text-align: center;
+ color: black;
+ background-color: white;
+}
+.simple {
+font-family: Arial;
+font-size: 100px;
+}
+.trad {
+font-family: Arial;
+font-size: 75px;
+}
+"""
+
+# Models
+
+# Note model for translated phrases: three fields and three card templates.
+PHRASE_MODEL = Model(
+    # Hard-coded numeric model ID, followed by the model name.
+    2076166425,
+    "Phrase Model",
+    fields=[
+        {"name": "Translated"},
+        {"name": "Phrase"},
+        {"name": "Audio"},
+    ],
+    templates=[
+        {
+            # Front: translation plus audio; back adds the phrase.
+            "name": "Card 1",
+            "qfmt": "{{Translated}}<br>{{Audio}}",
+            "afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
+        },
+        {
+            # Front: phrase plus audio; back adds the translation.
+            "name": "Card 2",
+            "qfmt": "{{Phrase}}<br>{{Audio}}",
+            "afmt": '{{FrontSide}}<hr id="answer">{{Translated}}',
+        },
+        {
+            # Front: audio only; back adds the phrase.
+            "name": "Card 3",
+            "qfmt": "{{Audio}}",
+            "afmt": '{{FrontSide}}<hr id="answer">{{Phrase}}',
+        },
+    ],
+    css=CSS,
+)
+
+
+HSK_MODEL = Model(
+    1708536519,
+    "HSK Model",
+    fields=[
+        {"name": "English"},
+        {"name": "Pinyin"},
+        {"name": "Simplified"},
+        {"name": "Traditional"},
+        {"name": "Audio"},
+    ],
+    templates=[
+        {
+            "name": "Card 1",
+            "qfmt": "<strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}",
+            "afmt": "{{FrontSide}}<hr id='answer''><div class='simple'>"
+            "{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
+        },
+        {
+            "name": "Card 2",
+            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
+            "{{Traditional}}</div>",
+            "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}',
+        },
+        {
+            "name": "Card 3",
+            "qfmt": "{{Audio}}",
+            "afmt": "{{FrontSide}}<hr id='answer''><strong>{{Pinyin}}</strong><br><div class='simple'>"
+            "{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
+        },
+    ],
+    css=CSS,
+)
--- a/src/anki-hsk-creator/cedict_ts.u8
+++ b/src/anki-hsk-creator/cedict_ts.u8
--- a/src/anki-hsk-creator/constants.py
+++ b/src/anki-hsk-creator/constants.py
@@ -0,0 +1,19 @@
+## Imports
+from pathlib import Path
+import random
+import importlib.resources
+
+# Handle to the dictionary data file ``cedict_ts.u8`` inside the package.
+CCCEDICT = importlib.resources.files("anki-hsk-creator").joinpath("cedict_ts.u8")
+
+# Working directories: a ``data`` folder two levels above this module, with
+# one sub-folder each for input files, generated output and resources.
+DATA = Path(__file__).parent.parent / "data"
+INPUT = DATA / "input"
+OUTPUT = DATA / "output"
+RESOURCES = DATA / "resources"
+
+# File Types
+# These are file-name suffixes, matched against ``Path.suffixes``.
+PHRASES_TYPE = ".phrases"
+DICT_TYPE = ".dictionary"
+
+# Language codes
+CN = "zh"
+EN = "en"
--- a/src/anki-hsk-creator/untility.py
+++ b/src/anki-hsk-creator/untility.py
@@ -0,0 +1,53 @@
+from cedict_utils.cedict import CedictParser
+import argostranslate.package
+import argostranslate.translate
+from chatterbox.mtl_tts import ChatterboxMultilingualTTS
+
+import torch
+import torchaudio
+
+from . import ARGOS_UPDATED, ARGOS_PACKAGES
+from . import CCCEDICT
+
+## Functions
+
+
+def create_cedict(language_id="en"):
+    """Creates a create_cedict dictionary object"""
+
+    parser = CedictParser()
+    parser.read_file(CCCEDICT)
+    entries = parser.parse()
+
+    dictionary = {}
+    for entry in entries:
+        if entry.simplified not in dictionary:
+            dictionary[entry.simplified] = [entry]
+        else:
+            dictionary[entry.simplified].append(entry)
+
+    return dictionary
+
+
+def create_translator(from_code, to_code):
+    """Download and install Argos Translate package"""
+    if not ARGOS_UPDATED:
+        argostranslate.package.update_package_index()
+        ARGOS_PACKAGES = argostranslate.package.get_available_packages()
+        ARGOS_UPDATED = True
+    package_to_install = next(
+        filter(lambda x: x.from_code == CN and x.to_code == EN, ARGOS_PACKAGES)
+    )
+    argostranslate.package.install_from_path(package_to_install.download())
+
+
+def create_tts():
+    # Automatically detect the best available device
+    if torch.cuda.is_available():
+        device = "cuda"
+    elif torch.backends.mps.is_available():
+        device = "mps"
+    else:
+        device = "cpu"
+    tts = ChatterboxMultilingualTTS.from_pretrained(device=device, t3_model="v3")
+    return tts