diff --git a/src/anki_hsk_creator/__about__.py b/src/anki_hsk_creator/__about__.py index 32158c0..f9e9255 100644 --- a/src/anki_hsk_creator/__about__.py +++ b/src/anki_hsk_creator/__about__.py @@ -3,4 +3,4 @@ # SPDX-FileCopyrightText: 2026-present Wolfang Torres # # SPDX-License-Identifier: GPL-3.0-or-later -__version__ = "0.1.1" +__version__ = "0.1.2" diff --git a/src/anki_hsk_creator/anki_generation.py b/src/anki_hsk_creator/anki_generation.py index 02abaea..cedf13b 100644 --- a/src/anki_hsk_creator/anki_generation.py +++ b/src/anki_hsk_creator/anki_generation.py @@ -8,12 +8,10 @@ import random # Pip from genanki import Deck, Model, Note, Package +from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter # Local -from .utility import ProcessFile, TranslationResult - -# from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter - +from .utility import DictionaryResult, ProcessFile, TranslationResult # Constants @@ -123,33 +121,34 @@ HSK_MODEL = Model( # Proccess -# def output_anki_dictionary(out_file, results): -# """Creates an anki file from a dictionary results""" -# final_file = out_file.parent / f"{out_file.stem}.apkg" -# deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,)) -# deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name) -# package = Package(deck) -# audios = [] -# for entry, audio in results: -# note = Note( -# model=HSK_MODEL, -# fields=[ -# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)), -# PinyinToneConverter().convert_text(entry.pinyin), -# entry.simplified, -# entry.traditional, -# f"[sound:{audio.name}]", -# ], -# ) -# audios.append(audio) -# deck.add_note(note) -# package.media_files = audios -# package.write_to_file(final_file) +def output_anki_dictionary(process_file: ProcessFile, results: list[DictionaryResult]): + """Creates an anki file from a dictionary results""" + final_file = process_file.output_name.with_suffix(".apkg") + deck_name = "::".join( + process_file.input_file.parts[:-1] + (process_file.output_name.stem,) + ) + deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name) + package = Package(deck) + audios = [] + for result in results: + note = Note( + model=HSK_MODEL, + fields=[ + "\n ".join(f"{n+1}. {m}" for n, m in enumerate(result.meanings)), + PinyinToneConverter().convert_text(result.pinyin), + result.simplified, + result.traditional, + f"[sound:{result.audio_path.name}]", + ], + ) + audios.append(result.audio_path) + deck.add_note(note) + package.media_files = audios + package.write_to_file(final_file) def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResult]): """Creates an anki file from a phrases results""" - final_file = process_file.output_name.with_suffix(".apkg") deck_name = "::".join( process_file.input_file.parts[:-1] + (process_file.output_name.stem,) diff --git a/src/anki_hsk_creator/api.py b/src/anki_hsk_creator/api.py index e19484c..77c4c39 100644 --- a/src/anki_hsk_creator/api.py +++ b/src/anki_hsk_creator/api.py @@ -8,10 +8,10 @@ from pathlib import Path # Local from . import DATA_FOLDER -from .anki_generation import output_anki_phrase +from .anki_generation import output_anki_dictionary, output_anki_phrase from .constants import DICT_TYPE, INPUT, LANGUAGES, PHRASES_TYPE -from .proccessor import translator_process -from .utility import TRANS, TTS, ProcessFile +from .proccessor import dictionary_pre_process, dictionary_process, translator_process +from .utility import CCCEDICT, TRANS, TTS, ProcessFile # interface @@ -66,14 +66,21 @@ def create_input_file( def process_a_file(process_file: ProcessFile, language_id: str): """From a input_file, a language and an output type, process a file""" process_file.language_id = language_id + TTS.create_tts() if PHRASES_TYPE in process_file.input_file.suffixes: - TTS.create_tts() TRANS.create_translator(LANGUAGES.CN, language_id) with process_file.absolute_input_file.open("r", encoding="utf8") as file: text_lines = [line.strip() for line in file.readlines()] - results = translator_process(text_lines, process_file, language_id) + results = translator_process(text_lines, process_file) output_anki_phrase(process_file, results) elif DICT_TYPE in process_file.input_file.suffixes: - print("not implemented") + if process_file.dictionary_resource_file.exists(): + CCCEDICT.create_cedict(language_id) + with process_file.absolute_input_file.open("r", encoding="utf8") as file: + words_list = [word.strip() for word in file.readlines()] + dictionary_pre_process(words_list, process_file) + else: + results = dictionary_process(process_file) + output_anki_dictionary(process_file, results) else: - print("no identified") + print("filetype not identified") diff --git a/src/anki_hsk_creator/constants.py b/src/anki_hsk_creator/constants.py index bbeae01..2d728be 100644 --- a/src/anki_hsk_creator/constants.py +++ b/src/anki_hsk_creator/constants.py @@ -35,10 +35,10 @@ class LANGUAGES: AvailableLanguages = (EN, ES, FR, RU, TR, TH) LanguageNames = { - EN: "English", - ES: "Spanish", - FR: "French", - RU: "Russian", - TR: "Turkish", - TH: "Thai", - } + EN: "English", + ES: "Spanish", + FR: "French", + RU: "Russian", + TR: "Turkish", + TH: "Thai", + } diff --git a/src/anki_hsk_creator/proccessor.py b/src/anki_hsk_creator/proccessor.py index e83dbac..57e4275 100644 --- a/src/anki_hsk_creator/proccessor.py +++ b/src/anki_hsk_creator/proccessor.py @@ -1,84 +1,101 @@ """processor.py""" +# Standard Library +import csv + # Pip import argostranslate.translate import torchaudio # Local from .constants import LANGUAGES -from .utility import TTS, ProcessFile, TranslationResult # , CCCEDICT +from .utility import CCCEDICT, TTS, DictionaryResult, ProcessFile, TranslationResult + +# Constants + +FIELDNAMES = ["simplified", "traditional", "pinyin", "meaning"] # Results Classes def translator_process( - text_lines: list[str], - process_file: ProcessFile, - language_id: str, + text_lines: list[str], process_file: ProcessFile ) -> list[TranslationResult]: """Process for phases or sentence translation""" results = [] for n, line in enumerate(text_lines): line = line.strip() - audio_path = process_file.resources / f"N{n:03n}.wav" + audio_path = process_file.resources / f"N{n:03n}.wav" if not audio_path.exists(): audio = TTS.MODEL.generate(f"{line}。", language_id=LANGUAGES.CN) torchaudio.save(audio_path, audio, TTS.MODEL.sr) - translated = argostranslate.translate.translate(line, LANGUAGES.CN, language_id) - results.append(TranslationResult(language_id, translated, line, audio_path)) + translated = argostranslate.translate.translate( + line, LANGUAGES.CN, process_file.language_id + ) + results.append( + TranslationResult(process_file.language_id, translated, line, audio_path) + ) return results -# def dictionary_process(dictionary, tts, in_file, resources): -# """Process dictionary files""" -# words_list = in_file.open(encoding="utf8").read().strip().split("\n") -# results = [] -# try: -# with in_file.open("w", encoding="utf8") as input_file: -# for words in words_list: -# word = words.split()[0] -# pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None -# if v := dictionary.get(word): -# if len(v) > 1: -# print(f"\nWARNING: {word} has multiple meanings:") -# if pinyin and pinyin != "ERROR": -# ml = list(filter(lambda x: x.pinyin == pinyin, v)) -# else: -# ml = v -# if len(ml) > 1: -# for n, w in enumerate(ml): -# print(f"{n+1} - {w}") -# for m in w.meanings: -# print(f"\t{m}") -# s = None -# while ( -# not s -# or not s.isnumeric() -# or not (1 <= int(s) <= len(v)) -# ): -# s = input( -# f"Please select the correct word [1-{len(v)}]: " -# ) -# v = v[int(s) - 1] -# else: -# v = ml[0] -# else: -# v = v[0] -# audio_path = resources / f"{word}.wav" -# if not audio_path.exists(): -# audio = tts.generate(f"{word}。", language_id="zh") -# torchaudio.save(audio_path, audio, tts.sr) -# input_file.write(f"{word}\t{v.pinyin}\n") -# results.append((v, audio_path)) -# else: -# print("============================================") -# print(f"===================>ERROR: {word} not found") -# print("============================================") -# input_file.write(f"{word}\tERROR\n") -# except Exception: -# with in_file.open("w", encoding="utf8") as input_file: -# input_file.write("\n".join(words_list)) -# return results +def dictionary_pre_process(words_list: list[str], process_file: ProcessFile): + """Pre Process dictionary files into a intermediary resources file""" + dictionary = CCCEDICT.create_cedict(process_file.language_id) + with process_file.dictionary_resource_file.open( + "w", encoding="utf8", newline="" + ) as resource_file: + tsv_writer = csv.writer( + resource_file, dialect="excel-tab", fieldnames=FIELDNAMES + ) + tsv_writer.writeheader() + for words in words_list: + word = words.split()[0] + pinyin = " ".join(words.split()[1:]) if len(words.split()) > 1 else None + if entries := dictionary.get(word): + if pinyin is not None: + entries = list(filter(lambda x: x.pinyin == pinyin, entries)) + if len(entries) > 1: + print(f"\nWARNING: {word} has multiple meanings:") + for entry in entries: + for meaning in entry.meanings: + tsv_writer.writerow( + { + "simplified": entry.simplified, + "traditional": entry.traditional, + "pinyin": entry.pinyin, + "meaning": meaning, + } + ) + else: + print("============================================") + print(f"===================>ERROR: {word} not found") + print("============================================") + tsv_writer.writerow( + { + "simplified": word, + "traditional": None, + "pinyin": None, + "meaning": None, + } + ) + + +def dictionary_process(process_file: ProcessFile) -> list[DictionaryResult]: + """Process a dictionary_resource_file into a final result""" + results = [] + with process_file.dictionary_resource_file.open( + "w", encoding="utf8", newline="" + ) as resource_file: + reader = csv.DictReader(resource_file) + for line in reader: + audio_path = process_file.resources / f"{line['pinyin']}.wav" + if not audio_path.exists(): + audio = TTS.MODEL.generate(f"{line['simplified']}。", language_id="zh") + torchaudio.save(audio_path, audio, TTS.MODEL.sr) + result = DictionaryResult(**line, audio_path=audio_path) + results.append(result) + return results + # def output_tsv(out_file, results): # """writes the output as a tsv file""" diff --git a/src/anki_hsk_creator/utility.py b/src/anki_hsk_creator/utility.py index ccac88a..e3f9973 100644 --- a/src/anki_hsk_creator/utility.py +++ b/src/anki_hsk_creator/utility.py @@ -11,7 +11,7 @@ from pathlib import Path import argostranslate.package import argostranslate.translate import torch -from cedict_utils.cedict import CedictParser +from cedict_utils.cedict import CedictEntry, CedictParser from chatterbox.mtl_tts import ChatterboxMultilingualTTS # Local @@ -34,9 +34,9 @@ class TRANS: TRANS.PACKAGES = argostranslate.package.get_available_packages() TRANS.UPDATED = True packages = filter( - lambda x: x.from_code == from_code or x.to_code == to_code, - TRANS.PACKAGES, - ) + lambda x: x.from_code == from_code or x.to_code == to_code, + TRANS.PACKAGES, + ) packages_to_install = [] for in_package in packages: if in_package.from_code == from_code: @@ -49,6 +49,43 @@ class TRANS: argostranslate.package.install_from_path(package.download()) +class TranslatedEntry: + """Holder class for CCCEDIT entry translated to `language_id`""" + + def __init__(self, entry: CedictEntry, language_id: str): + self.entry = entry + self.language_id = language_id + self._translated_meanings = [] + for meaning in entry.meanings: + if language_id != LANGUAGES.EN: + trans_meaning = argostranslate.translate.translate( + meaning, LANGUAGES.EN, language_id + ) + else: + trans_meaning = meaning + self._translated_meanings.append(trans_meaning) + + @property + def simplified(self): + """Entry simplified""" + return self.entry.simplified + + @property + def traditional(self): + """Entry traditional""" + return self.entry.traditional + + @property + def pinyin(self): + """Entry piying""" + return self.entry.pinyin + + @property + def meanings(self): + """Entry translated meaning list""" + return self._translated_meanings + + class CCCEDICT: """Static Class for the CCCEDIT dictionary""" @@ -57,7 +94,9 @@ class CCCEDICT: DICTIONARY_LIST = {} @staticmethod - def create_cedict(language_id=LANGUAGES.EN): + def create_cedict( + language_id: str = LANGUAGES.EN, + ) -> dict[str, list[TranslatedEntry]]: """Creates a create_cedict dictionary object""" if not CCCEDICT.PARSER: CCCEDICT.PARSER = CedictParser() @@ -66,15 +105,11 @@ class CCCEDICT: if language_id not in CCCEDICT.DICTIONARY_LIST: dictionary = {} for entry in CCCEDICT.ENTRIES: - if language_id != LANGUAGES.EN: - TRANS.create_translator(LANGUAGES.EN, language_id) - entry = argostranslate.translate.translate( - entry, LANGUAGES.EN, language_id - ) + trans_entry = TranslatedEntry(entry, language_id) if entry.simplified not in dictionary: - dictionary[entry.simplified] = [entry] + dictionary[entry.simplified] = [trans_entry] else: - dictionary[entry.simplified].append(entry) + dictionary[entry.simplified].append(trans_entry) CCCEDICT.DICTIONARY_LIST[language_id] = dictionary else: dictionary = CCCEDICT.DICTIONARY_LIST[language_id] @@ -144,6 +179,11 @@ class ProcessFile: raise ValueError("Not a valid language selected") return self.out_folder / f"{self.input_file.stem}.{self.language_id}." + @property + def dictionary_resource_file(self): + """The path for the resource tsv for dictionary files""" + return self.resources / f"dictionary.{self.language_id}.tsv" + class TranslationResult: """Result of a translated process""" @@ -159,3 +199,23 @@ class TranslationResult: self.translated = translated self.line = line self.audio_path = audio_path + + +class DictionaryResult: + """Result of a dictionaty process""" + + def __init__( + self, + language_id: str, + simplified: str, + traditional: str, + pinyin: str, + meanings: str, + audio_path: Path, + ): + self.language_id = language_id + self.simplified = simplified + self.traditional = traditional + self.pinyin = pinyin + self.meanings = meanings + self.audio_path = audio_path