add dictionary file process

2026-06-12 20:26:50 +08:00
parent 9b0d23b8ac
commit f9fc887d05
6 changed files with 194 additions and 111 deletions
--- a/src/anki_hsk_creator/about.py
+++ b/src/anki_hsk_creator/about.py
@@ -3,4 +3,4 @@
 # SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com>
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
-__version__ = "0.1.1"
+__version__ = "0.1.2"
--- a/src/anki_hsk_creator/anki_generation.py
+++ b/src/anki_hsk_creator/anki_generation.py
@@ -8,12 +8,10 @@ import random

 # Pip
 from genanki import Deck, Model, Note, Package
+from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter

 # Local
-from .utility import ProcessFile, TranslationResult
-
-# from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
-
+from .utility import DictionaryResult, ProcessFile, TranslationResult

 # Constants

@@ -123,33 +121,34 @@ HSK_MODEL = Model(
 # Proccess


-# def output_anki_dictionary(out_file, results):
-#     """Creates an anki file from a dictionary results"""
-#     final_file = out_file.parent / f"{out_file.stem}.apkg"
-#     deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
-#     deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
-#     package = Package(deck)
-#     audios = []
-#     for entry, audio in results:
-#         note = Note(
-#             model=HSK_MODEL,
-#             fields=[
-#                 "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
-#                 PinyinToneConverter().convert_text(entry.pinyin),
-#                 entry.simplified,
-#                 entry.traditional,
-#                 f"[sound:{audio.name}]",
-#             ],
-#         )
-#         audios.append(audio)
-#         deck.add_note(note)
-#     package.media_files = audios
-#     package.write_to_file(final_file)
+def output_anki_dictionary(process_file: ProcessFile, results: list[DictionaryResult]):
+    """Create an Anki ``.apkg`` package from dictionary results.
+
+    Args:
+        process_file: Supplies ``output_name`` (base path of the package) and
+            ``input_file`` (its parent parts become the ``::`` deck hierarchy).
+        results: One item per note; each supplies the meanings, pinyin,
+            simplified/traditional forms and the audio file to embed.
+    """
+    final_file = process_file.output_name.with_suffix(".apkg")
+    deck_name = "::".join(
+        process_file.input_file.parts[:-1] + (process_file.output_name.stem,)
+    )
+    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
+    package = Package(deck)
+    # One converter serves every note; creating it per note was
+    # loop-invariant work.
+    converter = PinyinToneConverter()
+    audios = []
+    for result in results:
+        # A bare string would be enumerated character by character, so a
+        # single meaning is wrapped into a one-element list first.
+        meanings = result.meanings
+        if isinstance(meanings, str):
+            meanings = [meanings]
+        note = Note(
+            model=HSK_MODEL,
+            fields=[
+                "\n ".join(f"{n}. {m}" for n, m in enumerate(meanings, start=1)),
+                converter.convert_text(result.pinyin),
+                result.simplified,
+                result.traditional,
+                f"[sound:{result.audio_path.name}]",
+            ],
+        )
+        audios.append(result.audio_path)
+        deck.add_note(note)
+    package.media_files = audios
+    package.write_to_file(final_file)


 def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResult]):
    """Creates an anki file from a phrases results"""
-
    final_file = process_file.output_name.with_suffix(".apkg")
    deck_name = "::".join(
        process_file.input_file.parts[:-1] + (process_file.output_name.stem,)
--- a/src/anki_hsk_creator/api.py
+++ b/src/anki_hsk_creator/api.py
@@ -8,10 +8,10 @@ from pathlib import Path

 # Local
 from . import DATA_FOLDER
-from .anki_generation import output_anki_phrase
+from .anki_generation import output_anki_dictionary, output_anki_phrase
 from .constants import DICT_TYPE, INPUT, LANGUAGES, PHRASES_TYPE
-from .proccessor import translator_process
-from .utility import TRANS, TTS, ProcessFile
+from .proccessor import dictionary_pre_process, dictionary_process, translator_process
+from .utility import CCCEDICT, TRANS, TTS, ProcessFile

 # interface

@@ -66,14 +66,21 @@ def create_input_file(
 def process_a_file(process_file: ProcessFile, language_id: str):
     """From a input_file, a language and an output type, process a file"""
     process_file.language_id = language_id
+    TTS.create_tts()
     if PHRASES_TYPE in process_file.input_file.suffixes:
-        TTS.create_tts()
         TRANS.create_translator(LANGUAGES.CN, language_id)
         with process_file.absolute_input_file.open("r", encoding="utf8") as file:
             text_lines = [line.strip() for line in file.readlines()]
-        results = translator_process(text_lines, process_file, language_id)
+        results = translator_process(text_lines, process_file)
         output_anki_phrase(process_file, results)
     elif DICT_TYPE in process_file.input_file.suffixes:
-        print("not implemented")
+        # Two-pass flow: while the intermediary TSV is missing, only write it;
+        # once it exists, a later run turns it into the Anki package.
+        if not process_file.dictionary_resource_file.exists():
+            CCCEDICT.create_cedict(language_id)
+            with process_file.absolute_input_file.open("r", encoding="utf8") as file:
+                words_list = [word.strip() for word in file.readlines()]
+            dictionary_pre_process(words_list, process_file)
+        else:
+            # The resource file is present, so it can be read back safely.
+            results = dictionary_process(process_file)
+            output_anki_dictionary(process_file, results)
     else:
-        print("no identified")
+        print("filetype not identified")
--- a/src/anki_hsk_creator/constants.py
+++ b/src/anki_hsk_creator/constants.py
@@ -35,10 +35,10 @@ class LANGUAGES:

    AvailableLanguages = (EN, ES, FR, RU, TR, TH)
    LanguageNames = {
-            EN: "English",
-            ES: "Spanish",
-            FR: "French",
-            RU: "Russian",
-            TR: "Turkish",
-            TH: "Thai",
-        }
+        EN: "English",
+        ES: "Spanish",
+        FR: "French",
+        RU: "Russian",
+        TR: "Turkish",
+        TH: "Thai",
+    }
--- a/src/anki_hsk_creator/proccessor.py
+++ b/src/anki_hsk_creator/proccessor.py
@@ -1,20 +1,25 @@
 """processor.py"""

+# Standard Library
+import csv
+
 # Pip
 import argostranslate.translate
 import torchaudio

 # Local
 from .constants import LANGUAGES
-from .utility import TTS, ProcessFile, TranslationResult  # , CCCEDICT
+from .utility import CCCEDICT, TTS, DictionaryResult, ProcessFile, TranslationResult
+
+# Constants
+
+FIELDNAMES = ["simplified", "traditional", "pinyin", "meaning"]

 # Results Classes


 def translator_process(
-    text_lines: list[str],
-    process_file: ProcessFile,
-    language_id: str,
+    text_lines: list[str], process_file: ProcessFile
 ) -> list[TranslationResult]:
    """Process for phases or sentence translation"""
    results = []
@@ -24,61 +29,73 @@ def translator_process(
        if not audio_path.exists():
            audio = TTS.MODEL.generate(f"{line}。", language_id=LANGUAGES.CN)
            torchaudio.save(audio_path, audio, TTS.MODEL.sr)
-        translated = argostranslate.translate.translate(line, LANGUAGES.CN, language_id)
-        results.append(TranslationResult(language_id, translated, line, audio_path))
+        translated = argostranslate.translate.translate(
+            line, LANGUAGES.CN, process_file.language_id
+        )
+        results.append(
+            TranslationResult(process_file.language_id, translated, line, audio_path)
+        )
    return results


-# def dictionary_process(dictionary, tts, in_file, resources):
-#     """Process dictionary files"""
-#     words_list = in_file.open(encoding="utf8").read().strip().split("\n")
-#     results = []
-#     try:
-#         with in_file.open("w", encoding="utf8") as input_file:
-#             for words in words_list:
-#                 word = words.split()[0]
-#                 pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None
-#                 if v := dictionary.get(word):
-#                     if len(v) > 1:
-#                         print(f"\nWARNING: {word} has multiple meanings:")
-#                         if pinyin and pinyin != "ERROR":
-#                             ml = list(filter(lambda x: x.pinyin == pinyin, v))
-#                         else:
-#                             ml = v
-#                         if len(ml) > 1:
-#                             for n, w in enumerate(ml):
-#                                 print(f"{n+1} - {w}")
-#                                 for m in w.meanings:
-#                                     print(f"\t{m}")
-#                             s = None
-#                             while (
-#                                 not s
-#                                 or not s.isnumeric()
-#                                 or not (1 <= int(s) <= len(v))
-#                             ):
-#                                 s = input(
-#                                     f"Please select the correct word [1-{len(v)}]: "
-#                                 )
-#                             v = v[int(s) - 1]
-#                         else:
-#                             v = ml[0]
-#                     else:
-#                         v = v[0]
-#                     audio_path = resources / f"{word}.wav"
-#                     if not audio_path.exists():
-#                         audio = tts.generate(f"{word}。", language_id="zh")
-#                         torchaudio.save(audio_path, audio, tts.sr)
-#                     input_file.write(f"{word}\t{v.pinyin}\n")
-#                     results.append((v, audio_path))
-#                 else:
-#                     print("============================================")
-#                     print(f"===================>ERROR: {word} not found")
-#                     print("============================================")
-#                     input_file.write(f"{word}\tERROR\n")
-#     except Exception:
-#         with in_file.open("w", encoding="utf8") as input_file:
-#             input_file.write("\n".join(words_list))
-#     return results
+def dictionary_pre_process(words_list: list[str], process_file: ProcessFile):
+    """Write the dictionary entries of each word to the intermediary TSV.
+
+    Each item of ``words_list`` is a word optionally followed by its pinyin,
+    separated by whitespace.  One row per meaning is written to
+    ``process_file.dictionary_resource_file``; a word that is not in the
+    dictionary gets a single row with only ``simplified`` filled in.
+
+    Args:
+        words_list: Lines of the input file; blank lines are skipped.
+        process_file: Supplies the language and the resource file path.
+    """
+    dictionary = CCCEDICT.create_cedict(process_file.language_id)
+    with process_file.dictionary_resource_file.open(
+        "w", encoding="utf8", newline=""
+    ) as resource_file:
+        # csv.writer accepts no ``fieldnames``; writeheader() and dict rows
+        # need a DictWriter.
+        tsv_writer = csv.DictWriter(
+            resource_file, fieldnames=FIELDNAMES, dialect="excel-tab"
+        )
+        tsv_writer.writeheader()
+        for words in words_list:
+            parts = words.split()
+            if not parts:
+                # Blank line: there is no word to look up.
+                continue
+            word = parts[0]
+            pinyin = " ".join(parts[1:]) if len(parts) > 1 else None
+            if entries := dictionary.get(word):
+                if pinyin is not None:
+                    # Keep only the readings that match the requested pinyin.
+                    entries = [entry for entry in entries if entry.pinyin == pinyin]
+                if len(entries) > 1:
+                    print(f"\nWARNING: {word} has multiple meanings:")
+                for entry in entries:
+                    for meaning in entry.meanings:
+                        tsv_writer.writerow(
+                            {
+                                "simplified": entry.simplified,
+                                "traditional": entry.traditional,
+                                "pinyin": entry.pinyin,
+                                "meaning": meaning,
+                            }
+                        )
+            else:
+                print("============================================")
+                print(f"===================>ERROR: {word} not found")
+                print("============================================")
+                # Placeholder row so the missing word stays visible in the TSV.
+                tsv_writer.writerow(
+                    {
+                        "simplified": word,
+                        "traditional": None,
+                        "pinyin": None,
+                        "meaning": None,
+                    }
+                )
+
+
+def dictionary_process(process_file: ProcessFile) -> list[DictionaryResult]:
+    """Turn the rows of the intermediary TSV into dictionary results.
+
+    Reads ``process_file.dictionary_resource_file`` and, for every row,
+    generates the audio file when it is missing and builds a result.
+
+    Returns:
+        One ``DictionaryResult`` per row of the resource file.
+    """
+    results = []
+    # Read mode: opening with "w" would truncate the file before reading it.
+    with process_file.dictionary_resource_file.open(
+        "r", encoding="utf8", newline=""
+    ) as resource_file:
+        # Tab-separated, the dialect the pre-process step writes with.
+        reader = csv.DictReader(resource_file, dialect="excel-tab")
+        for line in reader:
+            audio_path = process_file.resources / f"{line['pinyin']}.wav"
+            if not audio_path.exists():
+                audio = TTS.MODEL.generate(f"{line['simplified']}。", language_id="zh")
+                torchaudio.save(audio_path, audio, TTS.MODEL.sr)
+            # The TSV column is "meaning" (one per row) while the result takes
+            # "meanings"; it is wrapped in a list because the Anki output
+            # numbers the items of ``meanings``.
+            result = DictionaryResult(
+                language_id=process_file.language_id,
+                simplified=line["simplified"],
+                traditional=line["traditional"],
+                pinyin=line["pinyin"],
+                meanings=[line["meaning"]],
+                audio_path=audio_path,
+            )
+            results.append(result)
+    return results
+

 # def output_tsv(out_file, results):
 #     """writes the output as a tsv file"""
--- a/src/anki_hsk_creator/utility.py
+++ b/src/anki_hsk_creator/utility.py
@@ -11,7 +11,7 @@ from pathlib import Path
 import argostranslate.package
 import argostranslate.translate
 import torch
-from cedict_utils.cedict import CedictParser
+from cedict_utils.cedict import CedictEntry, CedictParser
 from chatterbox.mtl_tts import ChatterboxMultilingualTTS

 # Local
@@ -34,9 +34,9 @@ class TRANS:
            TRANS.PACKAGES = argostranslate.package.get_available_packages()
            TRANS.UPDATED = True
        packages = filter(
-                lambda x: x.from_code == from_code or x.to_code == to_code,
-                TRANS.PACKAGES,
-            )
+            lambda x: x.from_code == from_code or x.to_code == to_code,
+            TRANS.PACKAGES,
+        )
        packages_to_install = []
        for in_package in packages:
            if in_package.from_code == from_code:
@@ -49,6 +49,43 @@ class TRANS:
            argostranslate.package.install_from_path(package.download())


+class TranslatedEntry:
+    """Holder for a CC-CEDICT entry whose meanings are shown in `language_id`.
+
+    The simplified, traditional and pinyin values are passed through from the
+    wrapped entry.  Meanings are translated from English with argostranslate
+    the first time ``meanings`` is read and then cached, so creating a holder
+    does not pay for translating an entry nobody looks up.
+    """
+
+    def __init__(self, entry: CedictEntry, language_id: str):
+        self.entry = entry
+        self.language_id = language_id
+        # Filled lazily by ``meanings``; None means "not translated yet".
+        self._translated_meanings = None
+
+    @property
+    def simplified(self):
+        """Entry simplified"""
+        return self.entry.simplified
+
+    @property
+    def traditional(self):
+        """Entry traditional"""
+        return self.entry.traditional
+
+    @property
+    def pinyin(self):
+        """Entry pinyin"""
+        return self.entry.pinyin
+
+    @property
+    def meanings(self):
+        """Entry translated meaning list"""
+        if self._translated_meanings is None:
+            if self.language_id == LANGUAGES.EN:
+                # English is the source language here: copy without translating.
+                self._translated_meanings = list(self.entry.meanings)
+            else:
+                self._translated_meanings = [
+                    argostranslate.translate.translate(
+                        meaning, LANGUAGES.EN, self.language_id
+                    )
+                    for meaning in self.entry.meanings
+                ]
+        return self._translated_meanings
+
+
 class CCCEDICT:
    """Static Class for the CCCEDIT dictionary"""

@@ -57,7 +94,9 @@ class CCCEDICT:
    DICTIONARY_LIST = {}

    @staticmethod
-    def create_cedict(language_id=LANGUAGES.EN):
+    def create_cedict(
+        language_id: str = LANGUAGES.EN,
+    ) -> dict[str, list[TranslatedEntry]]:
        """Creates a create_cedict dictionary object"""
        if not CCCEDICT.PARSER:
            CCCEDICT.PARSER = CedictParser()
@@ -66,15 +105,11 @@ class CCCEDICT:
        if language_id not in CCCEDICT.DICTIONARY_LIST:
            dictionary = {}
            for entry in CCCEDICT.ENTRIES:
-                if language_id != LANGUAGES.EN:
-                    TRANS.create_translator(LANGUAGES.EN, language_id)
-                    entry = argostranslate.translate.translate(
-                        entry, LANGUAGES.EN, language_id
-                    )
+                trans_entry = TranslatedEntry(entry, language_id)
                if entry.simplified not in dictionary:
-                    dictionary[entry.simplified] = [entry]
+                    dictionary[entry.simplified] = [trans_entry]
                else:
-                    dictionary[entry.simplified].append(entry)
+                    dictionary[entry.simplified].append(trans_entry)
            CCCEDICT.DICTIONARY_LIST[language_id] = dictionary
        else:
            dictionary = CCCEDICT.DICTIONARY_LIST[language_id]
@@ -144,6 +179,11 @@ class ProcessFile:
            raise ValueError("Not a valid language selected")
        return self.out_folder / f"{self.input_file.stem}.{self.language_id}."

+    @property
+    def dictionary_resource_file(self):
+        """Location of the per-language TSV used by the dictionary process"""
+        folder = self.resources
+        return folder / f"dictionary.{self.language_id}.tsv"
+

 class TranslationResult:
    """Result of a translated process"""
@@ -159,3 +199,23 @@ class TranslationResult:
        self.translated = translated
        self.line = line
        self.audio_path = audio_path
+
+
+class DictionaryResult:
+    """Outcome of the dictionary process for one entry.
+
+    Plain value holder: every constructor argument is stored unchanged under
+    the attribute of the same name.
+    """
+
+    def __init__(
+        self,
+        language_id: str,
+        simplified: str,
+        traditional: str,
+        pinyin: str,
+        meanings: str,
+        audio_path: Path,
+    ):
+        # Targets are bound left to right, keeping the attribute order.
+        self.language_id, self.simplified, self.traditional = (
+            language_id,
+            simplified,
+            traditional,
+        )
+        self.pinyin, self.meanings, self.audio_path = (
+            pinyin,
+            meanings,
+            audio_path,
+        )