add dictionary file process

This commit is contained in:
Wolfang Torres
2026-06-12 20:26:50 +08:00
parent 9b0d23b8ac
commit f9fc887d05
6 changed files with 194 additions and 111 deletions

View File

@@ -3,4 +3,4 @@
# SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com> # SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com>
# #
# SPDX-License-Identifier: GPL-3.0-or-later # SPDX-License-Identifier: GPL-3.0-or-later
__version__ = "0.1.1" __version__ = "0.1.2"

View File

@@ -8,12 +8,10 @@ import random
# Pip # Pip
from genanki import Deck, Model, Note, Package from genanki import Deck, Model, Note, Package
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
# Local # Local
from .utility import ProcessFile, TranslationResult from .utility import DictionaryResult, ProcessFile, TranslationResult
# from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
# Constants # Constants
@@ -123,33 +121,34 @@ HSK_MODEL = Model(
# Proccess # Proccess
def output_anki_dictionary(process_file: ProcessFile, results: list[DictionaryResult]):
    """Create an Anki package (.apkg) from dictionary results.

    Args:
        process_file: Supplies ``output_name`` (the package path, re-suffixed
            to ``.apkg``) and ``input_file``, whose parent parts are joined
            with ``::`` to form the deck name.
        results: One item per note. Each must expose ``meanings``, ``pinyin``,
            ``simplified``, ``traditional`` and ``audio_path``.
    """
    final_file = process_file.output_name.with_suffix(".apkg")
    deck_name = "::".join(
        process_file.input_file.parts[:-1] + (process_file.output_name.stem,)
    )
    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
    package = Package(deck)
    # The converter does not depend on the note, so build it once.
    tone_converter = PinyinToneConverter()
    audios = []
    for result in results:
        # A bare string would otherwise be numbered character by character.
        meanings = result.meanings
        if isinstance(meanings, str):
            meanings = [meanings]
        note = Note(
            model=HSK_MODEL,
            # NOTE(review): field order is assumed to match HSK_MODEL's
            # field list, which is declared outside this function.
            fields=[
                "\n ".join(f"{n+1}. {m}" for n, m in enumerate(meanings)),
                tone_converter.convert_text(result.pinyin),
                result.simplified,
                result.traditional,
                f"[sound:{result.audio_path.name}]",
            ],
        )
        audios.append(result.audio_path)
        deck.add_note(note)
    # Several notes may point at the same audio file; ship each file once,
    # keeping first-seen order.
    package.media_files = list(dict.fromkeys(audios))
    package.write_to_file(final_file)
def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResult]): def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResult]):
"""Creates an anki file from a phrases results""" """Creates an anki file from a phrases results"""
final_file = process_file.output_name.with_suffix(".apkg") final_file = process_file.output_name.with_suffix(".apkg")
deck_name = "::".join( deck_name = "::".join(
process_file.input_file.parts[:-1] + (process_file.output_name.stem,) process_file.input_file.parts[:-1] + (process_file.output_name.stem,)

View File

@@ -8,10 +8,10 @@ from pathlib import Path
# Local # Local
from . import DATA_FOLDER from . import DATA_FOLDER
from .anki_generation import output_anki_phrase from .anki_generation import output_anki_dictionary, output_anki_phrase
from .constants import DICT_TYPE, INPUT, LANGUAGES, PHRASES_TYPE from .constants import DICT_TYPE, INPUT, LANGUAGES, PHRASES_TYPE
from .proccessor import translator_process from .proccessor import dictionary_pre_process, dictionary_process, translator_process
from .utility import TRANS, TTS, ProcessFile from .utility import CCCEDICT, TRANS, TTS, ProcessFile
# interface # interface
@@ -66,14 +66,21 @@ def create_input_file(
def process_a_file(process_file: ProcessFile, language_id: str):
    """Process one input file according to the type found in its suffixes.

    Phrase files are translated and exported to an Anki package in one go.
    Dictionary files take two runs: while the intermediate resource file
    does not exist it is generated from the word list; once it exists it is
    read back and exported to an Anki package.

    Args:
        process_file: The file being processed; its ``language_id`` is set
            from the ``language_id`` argument.
        language_id: Target language of the translations.
    """
    process_file.language_id = language_id
    TTS.create_tts()
    suffixes = process_file.input_file.suffixes
    if PHRASES_TYPE in suffixes:
        TRANS.create_translator(LANGUAGES.CN, language_id)
        with process_file.absolute_input_file.open("r", encoding="utf8") as file:
            text_lines = [line.strip() for line in file]
        results = translator_process(text_lines, process_file)
        output_anki_phrase(process_file, results)
    elif DICT_TYPE in suffixes:
        resource_file = process_file.dictionary_resource_file
        # The resource file must be *missing* to be generated; the original
        # condition was inverted, overwriting an existing file and trying to
        # read a non-existent one.
        if not resource_file.exists():
            with process_file.absolute_input_file.open("r", encoding="utf8") as file:
                # Blank lines carry no word and would break the word/pinyin split.
                words_list = [word.strip() for word in file if word.strip()]
            dictionary_pre_process(words_list, process_file)
            print(f"dictionary resource written to {resource_file}, run again to build the deck")
        else:
            results = dictionary_process(process_file)
            output_anki_dictionary(process_file, results)
    else:
        print("filetype not identified")

View File

@@ -1,20 +1,25 @@
"""processor.py""" """processor.py"""
# Standard Library
import csv
# Pip # Pip
import argostranslate.translate import argostranslate.translate
import torchaudio import torchaudio
# Local # Local
from .constants import LANGUAGES from .constants import LANGUAGES
from .utility import TTS, ProcessFile, TranslationResult # , CCCEDICT from .utility import CCCEDICT, TTS, DictionaryResult, ProcessFile, TranslationResult
# Constants
FIELDNAMES = ["simplified", "traditional", "pinyin", "meaning"]
# Results Classes # Results Classes
def translator_process( def translator_process(
text_lines: list[str], text_lines: list[str], process_file: ProcessFile
process_file: ProcessFile,
language_id: str,
) -> list[TranslationResult]: ) -> list[TranslationResult]:
"""Process for phases or sentence translation""" """Process for phases or sentence translation"""
results = [] results = []
@@ -24,61 +29,73 @@ def translator_process(
if not audio_path.exists(): if not audio_path.exists():
audio = TTS.MODEL.generate(f"{line}", language_id=LANGUAGES.CN) audio = TTS.MODEL.generate(f"{line}", language_id=LANGUAGES.CN)
torchaudio.save(audio_path, audio, TTS.MODEL.sr) torchaudio.save(audio_path, audio, TTS.MODEL.sr)
translated = argostranslate.translate.translate(line, LANGUAGES.CN, language_id) translated = argostranslate.translate.translate(
results.append(TranslationResult(language_id, translated, line, audio_path)) line, LANGUAGES.CN, process_file.language_id
)
results.append(
TranslationResult(process_file.language_id, translated, line, audio_path)
)
return results return results
def dictionary_pre_process(words_list: list[str], process_file: ProcessFile):
    """Write the intermediate TSV resource file for a dictionary word list.

    Each item of ``words_list`` is a word, optionally followed by whitespace
    and a pinyin hint used to narrow the dictionary entries. One row is
    written per (entry, meaning) pair with the columns in ``FIELDNAMES``.
    Words missing from the dictionary get a single row with only
    ``simplified`` filled in.

    Args:
        words_list: Lines of the dictionary input file.
        process_file: Supplies ``language_id`` and the
            ``dictionary_resource_file`` path that is (over)written.
    """
    dictionary = CCCEDICT.create_cedict(process_file.language_id)
    with process_file.dictionary_resource_file.open(
        "w", encoding="utf8", newline=""
    ) as resource_file:
        # Rows are dicts and a header is needed: that is csv.DictWriter.
        # csv.writer accepts neither ``fieldnames`` nor ``writeheader()``.
        tsv_writer = csv.DictWriter(
            resource_file, fieldnames=FIELDNAMES, dialect="excel-tab"
        )
        tsv_writer.writeheader()
        for words in words_list:
            parts = words.split()
            if not parts:
                # Blank line: nothing to look up.
                continue
            word, *pinyin_parts = parts
            pinyin = " ".join(pinyin_parts) if pinyin_parts else None
            entries = dictionary.get(word)
            if not entries:
                print("============================================")
                print(f"===================>ERROR: {word} not found")
                print("============================================")
                tsv_writer.writerow(
                    {
                        "simplified": word,
                        "traditional": None,
                        "pinyin": None,
                        "meaning": None,
                    }
                )
                continue
            if pinyin is not None:
                matching = [entry for entry in entries if entry.pinyin == pinyin]
                if matching:
                    entries = matching
                else:
                    # Keep every reading rather than silently dropping the word.
                    print(f"\nWARNING: {word} has no entry with pinyin {pinyin}")
            if len(entries) > 1:
                print(f"\nWARNING: {word} has multiple meanings:")
            for entry in entries:
                for meaning in entry.meanings:
                    tsv_writer.writerow(
                        {
                            "simplified": entry.simplified,
                            "traditional": entry.traditional,
                            "pinyin": entry.pinyin,
                            "meaning": meaning,
                        }
                    )
def dictionary_process(process_file: ProcessFile) -> list[DictionaryResult]:
    """Build the final dictionary results from the intermediate TSV file.

    The resource file holds one row per meaning. Rows sharing the same
    simplified/traditional/pinyin triple are merged into one result whose
    ``meanings`` keeps the row order. Rows without pinyin or meaning (the
    placeholders written for words that were not found) are skipped.

    Args:
        process_file: Supplies ``dictionary_resource_file``, ``resources``
            (where audio files are cached) and ``language_id``.

    Returns:
        One ``DictionaryResult`` per distinct entry, in first-seen order.
    """
    grouped = {}
    # Read mode: opening with "w" would truncate the file before reading it.
    with process_file.dictionary_resource_file.open(
        "r", encoding="utf8", newline=""
    ) as resource_file:
        # The file is tab separated, so the reader needs the same dialect
        # the writer used.
        reader = csv.DictReader(resource_file, dialect="excel-tab")
        for row in reader:
            simplified = (row.get("simplified") or "").strip()
            traditional = (row.get("traditional") or "").strip()
            pinyin = (row.get("pinyin") or "").strip()
            meaning = (row.get("meaning") or "").strip()
            if not (simplified and pinyin and meaning):
                continue
            key = (simplified, traditional, pinyin)
            result = grouped.get(key)
            if result is None:
                # Audio is synthesized from the characters, so cache it under
                # the characters; pinyin may contain ':' or spaces.
                audio_path = process_file.resources / f"{simplified}.wav"
                if not audio_path.exists():
                    audio = TTS.MODEL.generate(simplified, language_id=LANGUAGES.CN)
                    torchaudio.save(audio_path, audio, TTS.MODEL.sr)
                result = DictionaryResult(
                    language_id=process_file.language_id,
                    simplified=simplified,
                    traditional=traditional,
                    pinyin=pinyin,
                    meanings=[],
                    audio_path=audio_path,
                )
                grouped[key] = result
            result.meanings.append(meaning)
    return list(grouped.values())
# def output_tsv(out_file, results): # def output_tsv(out_file, results):
# """writes the output as a tsv file""" # """writes the output as a tsv file"""

View File

@@ -11,7 +11,7 @@ from pathlib import Path
import argostranslate.package import argostranslate.package
import argostranslate.translate import argostranslate.translate
import torch import torch
from cedict_utils.cedict import CedictParser from cedict_utils.cedict import CedictEntry, CedictParser
from chatterbox.mtl_tts import ChatterboxMultilingualTTS from chatterbox.mtl_tts import ChatterboxMultilingualTTS
# Local # Local
@@ -49,6 +49,43 @@ class TRANS:
argostranslate.package.install_from_path(package.download()) argostranslate.package.install_from_path(package.download())
class TranslatedEntry:
    """Holder class for a CC-CEDICT entry with meanings translated to `language_id`.

    Translation is deferred until ``meanings`` is first read and then cached,
    so wrapping an entry costs nothing and only entries that are actually
    looked up are translated.
    """

    def __init__(self, entry: CedictEntry, language_id: str):
        self.entry = entry
        self.language_id = language_id
        # None means "not translated yet"; filled in by ``meanings``.
        self._translated_meanings = None

    @property
    def simplified(self):
        """Entry simplified"""
        return self.entry.simplified

    @property
    def traditional(self):
        """Entry traditional"""
        return self.entry.traditional

    @property
    def pinyin(self):
        """Entry pinyin"""
        return self.entry.pinyin

    @property
    def meanings(self):
        """Entry translated meaning list (translated on first access)"""
        if self._translated_meanings is None:
            if self.language_id == LANGUAGES.EN:
                # Source meanings are used untranslated for LANGUAGES.EN.
                self._translated_meanings = list(self.entry.meanings)
            else:
                # NOTE(review): assumes the EN -> language_id translation
                # package is already installed; nothing in this class
                # installs it - confirm the caller does.
                self._translated_meanings = [
                    argostranslate.translate.translate(
                        meaning, LANGUAGES.EN, self.language_id
                    )
                    for meaning in self.entry.meanings
                ]
        return self._translated_meanings
class CCCEDICT: class CCCEDICT:
"""Static Class for the CCCEDIT dictionary""" """Static Class for the CCCEDIT dictionary"""
@@ -57,7 +94,9 @@ class CCCEDICT:
DICTIONARY_LIST = {} DICTIONARY_LIST = {}
@staticmethod @staticmethod
def create_cedict(language_id=LANGUAGES.EN): def create_cedict(
language_id: str = LANGUAGES.EN,
) -> dict[str, list[TranslatedEntry]]:
"""Creates a create_cedict dictionary object""" """Creates a create_cedict dictionary object"""
if not CCCEDICT.PARSER: if not CCCEDICT.PARSER:
CCCEDICT.PARSER = CedictParser() CCCEDICT.PARSER = CedictParser()
@@ -66,15 +105,11 @@ class CCCEDICT:
if language_id not in CCCEDICT.DICTIONARY_LIST: if language_id not in CCCEDICT.DICTIONARY_LIST:
dictionary = {} dictionary = {}
for entry in CCCEDICT.ENTRIES: for entry in CCCEDICT.ENTRIES:
if language_id != LANGUAGES.EN: trans_entry = TranslatedEntry(entry, language_id)
TRANS.create_translator(LANGUAGES.EN, language_id)
entry = argostranslate.translate.translate(
entry, LANGUAGES.EN, language_id
)
if entry.simplified not in dictionary: if entry.simplified not in dictionary:
dictionary[entry.simplified] = [entry] dictionary[entry.simplified] = [trans_entry]
else: else:
dictionary[entry.simplified].append(entry) dictionary[entry.simplified].append(trans_entry)
CCCEDICT.DICTIONARY_LIST[language_id] = dictionary CCCEDICT.DICTIONARY_LIST[language_id] = dictionary
else: else:
dictionary = CCCEDICT.DICTIONARY_LIST[language_id] dictionary = CCCEDICT.DICTIONARY_LIST[language_id]
@@ -144,6 +179,11 @@ class ProcessFile:
raise ValueError("Not a valid language selected") raise ValueError("Not a valid language selected")
return self.out_folder / f"{self.input_file.stem}.{self.language_id}." return self.out_folder / f"{self.input_file.stem}.{self.language_id}."
@property
def dictionary_resource_file(self):
    """Path of the dictionary resource tsv, named after the current language"""
    file_name = f"dictionary.{self.language_id}.tsv"
    return self.resources / file_name
class TranslationResult: class TranslationResult:
"""Result of a translated process""" """Result of a translated process"""
@@ -159,3 +199,23 @@ class TranslationResult:
self.translated = translated self.translated = translated
self.line = line self.line = line
self.audio_path = audio_path self.audio_path = audio_path
class DictionaryResult:
    """Result of a dictionary process.

    Holds one dictionary entry ready to become a note: the target language,
    the simplified and traditional forms, the pinyin, the list of meanings
    and the path of the audio file.
    """

    def __init__(
        self,
        language_id: str,
        simplified: str,
        traditional: str,
        pinyin: str,
        meanings: list[str],
        audio_path: Path,
    ):
        self.language_id = language_id
        self.simplified = simplified
        self.traditional = traditional
        self.pinyin = pinyin
        # Consumers enumerate the meanings, so always store a list; a single
        # string is wrapped instead of being iterated character by character.
        self.meanings = [meanings] if isinstance(meanings, str) else list(meanings)
        self.audio_path = audio_path