add dictionary file process
This commit is contained in:
@@ -3,4 +3,4 @@
|
|||||||
# SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com>
|
# SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com>
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
__version__ = "0.1.1"
|
__version__ = "0.1.2"
|
||||||
|
|||||||
@@ -8,12 +8,10 @@ import random
|
|||||||
|
|
||||||
# Pip
|
# Pip
|
||||||
from genanki import Deck, Model, Note, Package
|
from genanki import Deck, Model, Note, Package
|
||||||
|
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
|
||||||
|
|
||||||
# Local
|
# Local
|
||||||
from .utility import ProcessFile, TranslationResult
|
from .utility import DictionaryResult, ProcessFile, TranslationResult
|
||||||
|
|
||||||
# from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
|
|
||||||
|
|
||||||
|
|
||||||
# Constants
|
# Constants
|
||||||
|
|
||||||
@@ -123,33 +121,34 @@ HSK_MODEL = Model(
|
|||||||
# Process
|
# Process
|
||||||
|
|
||||||
|
|
||||||
def output_anki_dictionary(process_file: ProcessFile, results: list[DictionaryResult]):
    """Create an Anki package (.apkg) from dictionary results.

    Args:
        process_file: Supplies ``output_name`` (the package is written next to
            it with an ``.apkg`` suffix) and ``input_file`` (its parent
            folders are joined with ``::`` to build the deck name).
        results: One item per note; each provides ``meanings``, ``pinyin``,
            ``simplified``, ``traditional`` and ``audio_path``.
    """
    final_file = process_file.output_name.with_suffix(".apkg")
    deck_name = "::".join(
        process_file.input_file.parts[:-1] + (process_file.output_name.stem,)
    )
    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
    package = Package(deck)
    # NOTE(review): one converter is reused for every note; this assumes
    # convert_text keeps no state between calls -- confirm.
    converter = PinyinToneConverter()
    audios = []
    for result in results:
        # A bare string would be enumerated character by character, so it is
        # treated as a single meaning instead.
        meanings = result.meanings
        if isinstance(meanings, str):
            meanings = [meanings]
        note = Note(
            model=HSK_MODEL,
            fields=[
                "\n ".join(f"{n}. {m}" for n, m in enumerate(meanings, start=1)),
                converter.convert_text(result.pinyin),
                result.simplified,
                result.traditional,
                f"[sound:{result.audio_path.name}]",
            ],
        )
        audios.append(result.audio_path)
        deck.add_note(note)
    package.media_files = audios
    package.write_to_file(final_file)
|
||||||
|
|
||||||
|
|
||||||
def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResult]):
|
def output_anki_phrase(process_file: ProcessFile, results: list[TranslationResult]):
|
||||||
"""Creates an anki file from a phrases results"""
|
"""Creates an anki file from a phrases results"""
|
||||||
|
|
||||||
final_file = process_file.output_name.with_suffix(".apkg")
|
final_file = process_file.output_name.with_suffix(".apkg")
|
||||||
deck_name = "::".join(
|
deck_name = "::".join(
|
||||||
process_file.input_file.parts[:-1] + (process_file.output_name.stem,)
|
process_file.input_file.parts[:-1] + (process_file.output_name.stem,)
|
||||||
|
|||||||
@@ -8,10 +8,10 @@ from pathlib import Path
|
|||||||
|
|
||||||
# Local
|
# Local
|
||||||
from . import DATA_FOLDER
|
from . import DATA_FOLDER
|
||||||
from .anki_generation import output_anki_phrase
|
from .anki_generation import output_anki_dictionary, output_anki_phrase
|
||||||
from .constants import DICT_TYPE, INPUT, LANGUAGES, PHRASES_TYPE
|
from .constants import DICT_TYPE, INPUT, LANGUAGES, PHRASES_TYPE
|
||||||
from .proccessor import translator_process
|
from .proccessor import dictionary_pre_process, dictionary_process, translator_process
|
||||||
from .utility import TRANS, TTS, ProcessFile
|
from .utility import CCCEDICT, TRANS, TTS, ProcessFile
|
||||||
|
|
||||||
# interface
|
# interface
|
||||||
|
|
||||||
@@ -66,14 +66,21 @@ def create_input_file(
|
|||||||
def process_a_file(process_file: ProcessFile, language_id: str):
    """Process one input file for `language_id`, based on its file type.

    The type is read from the suffixes of ``process_file.input_file``:

    * phrases files are translated line by line and written as an Anki deck;
    * dictionary files take two runs: while the intermediary resource file
      does not exist it is created from the word list, and once it exists it
      is turned into an Anki deck.

    Any other file type only prints a message.
    """
    process_file.language_id = language_id

    TTS.create_tts()
    suffixes = process_file.input_file.suffixes
    if PHRASES_TYPE in suffixes:
        TRANS.create_translator(LANGUAGES.CN, language_id)
        with process_file.absolute_input_file.open("r", encoding="utf8") as file:
            text_lines = [line.strip() for line in file]
        results = translator_process(text_lines, process_file)
        output_anki_phrase(process_file, results)
    elif DICT_TYPE in suffixes:
        # The pre-process step WRITES the resource file and the process step
        # READS it, so pre-processing must run only while the file is missing.
        if not process_file.dictionary_resource_file.exists():
            CCCEDICT.create_cedict(language_id)
            with process_file.absolute_input_file.open("r", encoding="utf8") as file:
                words_list = [word.strip() for word in file]
            dictionary_pre_process(words_list, process_file)
        else:
            results = dictionary_process(process_file)
            output_anki_dictionary(process_file, results)
    else:
        print("filetype not identified")
|
||||||
|
|||||||
@@ -1,20 +1,25 @@
|
|||||||
"""processor.py"""
|
"""processor.py"""
|
||||||
|
|
||||||
|
# Standard Library
|
||||||
|
import csv
|
||||||
|
|
||||||
# Pip
|
# Pip
|
||||||
import argostranslate.translate
|
import argostranslate.translate
|
||||||
import torchaudio
|
import torchaudio
|
||||||
|
|
||||||
# Local
|
# Local
|
||||||
from .constants import LANGUAGES
|
from .constants import LANGUAGES
|
||||||
from .utility import TTS, ProcessFile, TranslationResult # , CCCEDICT
|
from .utility import CCCEDICT, TTS, DictionaryResult, ProcessFile, TranslationResult
|
||||||
|
|
||||||
|
# Constants
|
||||||
|
|
||||||
|
FIELDNAMES = ["simplified", "traditional", "pinyin", "meaning"]
|
||||||
|
|
||||||
# Results Classes
|
# Results Classes
|
||||||
|
|
||||||
|
|
||||||
def translator_process(
|
def translator_process(
|
||||||
text_lines: list[str],
|
text_lines: list[str], process_file: ProcessFile
|
||||||
process_file: ProcessFile,
|
|
||||||
language_id: str,
|
|
||||||
) -> list[TranslationResult]:
|
) -> list[TranslationResult]:
|
||||||
"""Process for phases or sentence translation"""
|
"""Process for phases or sentence translation"""
|
||||||
results = []
|
results = []
|
||||||
@@ -24,61 +29,73 @@ def translator_process(
|
|||||||
if not audio_path.exists():
|
if not audio_path.exists():
|
||||||
audio = TTS.MODEL.generate(f"{line}。", language_id=LANGUAGES.CN)
|
audio = TTS.MODEL.generate(f"{line}。", language_id=LANGUAGES.CN)
|
||||||
torchaudio.save(audio_path, audio, TTS.MODEL.sr)
|
torchaudio.save(audio_path, audio, TTS.MODEL.sr)
|
||||||
translated = argostranslate.translate.translate(line, LANGUAGES.CN, language_id)
|
translated = argostranslate.translate.translate(
|
||||||
results.append(TranslationResult(language_id, translated, line, audio_path))
|
line, LANGUAGES.CN, process_file.language_id
|
||||||
|
)
|
||||||
|
results.append(
|
||||||
|
TranslationResult(process_file.language_id, translated, line, audio_path)
|
||||||
|
)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def dictionary_pre_process(words_list: list[str], process_file: ProcessFile):
    """Pre-process dictionary words into the intermediary resource file.

    Each item of `words_list` is a word, optionally followed by its pinyin
    (whitespace separated). Every meaning of every matching dictionary entry
    is written as its own row of the tab-separated
    ``process_file.dictionary_resource_file``. A word with no matching entry
    gets a row with only ``simplified`` filled in.

    Args:
        words_list: Lines read from the input file; blank lines are ignored.
        process_file: Supplies ``language_id`` and the resource file path.
    """
    dictionary = CCCEDICT.create_cedict(process_file.language_id)
    with process_file.dictionary_resource_file.open(
        "w", encoding="utf8", newline=""
    ) as resource_file:
        # csv.writer has no fieldnames/writeheader and cannot take dict rows;
        # DictWriter provides all three.
        tsv_writer = csv.DictWriter(
            resource_file, fieldnames=FIELDNAMES, dialect="excel-tab"
        )
        tsv_writer.writeheader()
        for words in words_list:
            parts = words.split()
            if not parts:
                # Blank input line: nothing to look up.
                continue
            word, *pinyin_parts = parts
            pinyin = " ".join(pinyin_parts) if pinyin_parts else None

            entries = dictionary.get(word) or []
            if pinyin is not None:
                entries = [entry for entry in entries if entry.pinyin == pinyin]

            if not entries:
                # Also reached when the requested pinyin matched no entry, so
                # the word is reported instead of silently disappearing.
                print("============================================")
                print(f"===================>ERROR: {word} not found")
                print("============================================")
                tsv_writer.writerow(
                    {
                        "simplified": word,
                        "traditional": None,
                        "pinyin": None,
                        "meaning": None,
                    }
                )
                continue

            if len(entries) > 1:
                print(f"\nWARNING: {word} has multiple meanings:")
            for entry in entries:
                for meaning in entry.meanings:
                    tsv_writer.writerow(
                        {
                            "simplified": entry.simplified,
                            "traditional": entry.traditional,
                            "pinyin": entry.pinyin,
                            "meaning": meaning,
                        }
                    )
|
||||||
# results.append((v, audio_path))
|
|
||||||
# else:
|
|
||||||
# print("============================================")
|
def dictionary_process(process_file: ProcessFile) -> list[DictionaryResult]:
    """Turn the dictionary resource file into final results.

    Reads the tab-separated ``process_file.dictionary_resource_file`` (one row
    per meaning), groups rows that share simplified, traditional and pinyin
    into a single result, and generates the audio file when it is missing.

    Args:
        process_file: Supplies the resource file path, the ``resources``
            folder for audio files and the ``language_id`` of the meanings.

    Returns:
        One ``DictionaryResult`` per distinct entry, in the order entries
        first appear in the resource file; ``meanings`` is a list of strings.
    """
    grouped: dict[tuple[str, str, str], list[str]] = {}
    # Opened for reading ("w" would truncate the file), with the same
    # excel-tab dialect the resource file is written in.
    with process_file.dictionary_resource_file.open(
        "r", encoding="utf8", newline=""
    ) as resource_file:
        reader = csv.DictReader(resource_file, dialect="excel-tab")
        for line in reader:
            if not line.get("meaning"):
                # Placeholder row of a word that was not found in the dictionary.
                print(f"WARNING: {line.get('simplified')} has no meaning, skipped")
                continue
            key = (line["simplified"], line["traditional"], line["pinyin"])
            grouped.setdefault(key, []).append(line["meaning"])

    results = []
    for (simplified, traditional, pinyin), meanings in grouped.items():
        # NOTE(review): audio is named after the pinyin, so entries sharing a
        # pinyin share one file -- confirm this is intended.
        audio_path = process_file.resources / f"{pinyin}.wav"
        if not audio_path.exists():
            audio = TTS.MODEL.generate(f"{simplified}。", language_id="zh")
            torchaudio.save(audio_path, audio, TTS.MODEL.sr)
        results.append(
            DictionaryResult(
                language_id=process_file.language_id,
                simplified=simplified,
                traditional=traditional,
                pinyin=pinyin,
                meanings=meanings,
                audio_path=audio_path,
            )
        )
    return results
|
||||||
|
|
||||||
|
|
||||||
# def output_tsv(out_file, results):
|
# def output_tsv(out_file, results):
|
||||||
# """writes the output as a tsv file"""
|
# """writes the output as a tsv file"""
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ from pathlib import Path
|
|||||||
import argostranslate.package
|
import argostranslate.package
|
||||||
import argostranslate.translate
|
import argostranslate.translate
|
||||||
import torch
|
import torch
|
||||||
from cedict_utils.cedict import CedictParser
|
from cedict_utils.cedict import CedictEntry, CedictParser
|
||||||
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
|
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
|
||||||
|
|
||||||
# Local
|
# Local
|
||||||
@@ -49,6 +49,43 @@ class TRANS:
|
|||||||
argostranslate.package.install_from_path(package.download())
|
argostranslate.package.install_from_path(package.download())
|
||||||
|
|
||||||
|
|
||||||
|
class TranslatedEntry:
    """Holder for a CC-CEDICT entry whose meanings are translated to `language_id`.

    Meanings are translated from ``LANGUAGES.EN`` on the first access of
    ``meanings`` and cached afterwards, so creating an entry never triggers a
    translation by itself.
    """

    def __init__(self, entry: CedictEntry, language_id: str):
        self.entry = entry
        self.language_id = language_id
        # None means "not translated yet"; filled in by `meanings`.
        self._translated_meanings = None

    @property
    def simplified(self):
        """Entry simplified"""
        return self.entry.simplified

    @property
    def traditional(self):
        """Entry traditional"""
        return self.entry.traditional

    @property
    def pinyin(self):
        """Entry pinyin"""
        return self.entry.pinyin

    @property
    def meanings(self):
        """Entry translated meaning list"""
        if self._translated_meanings is None:
            if self.language_id != LANGUAGES.EN:
                self._translated_meanings = [
                    argostranslate.translate.translate(
                        meaning, LANGUAGES.EN, self.language_id
                    )
                    for meaning in self.entry.meanings
                ]
            else:
                # Already in the target language: keep a copy, untranslated.
                self._translated_meanings = list(self.entry.meanings)
        return self._translated_meanings
|
||||||
|
|
||||||
|
|
||||||
class CCCEDICT:
|
class CCCEDICT:
|
||||||
"""Static Class for the CCCEDIT dictionary"""
|
"""Static Class for the CCCEDIT dictionary"""
|
||||||
|
|
||||||
@@ -57,7 +94,9 @@ class CCCEDICT:
|
|||||||
DICTIONARY_LIST = {}
|
DICTIONARY_LIST = {}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_cedict(language_id=LANGUAGES.EN):
|
def create_cedict(
|
||||||
|
language_id: str = LANGUAGES.EN,
|
||||||
|
) -> dict[str, list[TranslatedEntry]]:
|
||||||
"""Creates a create_cedict dictionary object"""
|
"""Creates a create_cedict dictionary object"""
|
||||||
if not CCCEDICT.PARSER:
|
if not CCCEDICT.PARSER:
|
||||||
CCCEDICT.PARSER = CedictParser()
|
CCCEDICT.PARSER = CedictParser()
|
||||||
@@ -66,15 +105,11 @@ class CCCEDICT:
|
|||||||
if language_id not in CCCEDICT.DICTIONARY_LIST:
|
if language_id not in CCCEDICT.DICTIONARY_LIST:
|
||||||
dictionary = {}
|
dictionary = {}
|
||||||
for entry in CCCEDICT.ENTRIES:
|
for entry in CCCEDICT.ENTRIES:
|
||||||
if language_id != LANGUAGES.EN:
|
trans_entry = TranslatedEntry(entry, language_id)
|
||||||
TRANS.create_translator(LANGUAGES.EN, language_id)
|
|
||||||
entry = argostranslate.translate.translate(
|
|
||||||
entry, LANGUAGES.EN, language_id
|
|
||||||
)
|
|
||||||
if entry.simplified not in dictionary:
|
if entry.simplified not in dictionary:
|
||||||
dictionary[entry.simplified] = [entry]
|
dictionary[entry.simplified] = [trans_entry]
|
||||||
else:
|
else:
|
||||||
dictionary[entry.simplified].append(entry)
|
dictionary[entry.simplified].append(trans_entry)
|
||||||
CCCEDICT.DICTIONARY_LIST[language_id] = dictionary
|
CCCEDICT.DICTIONARY_LIST[language_id] = dictionary
|
||||||
else:
|
else:
|
||||||
dictionary = CCCEDICT.DICTIONARY_LIST[language_id]
|
dictionary = CCCEDICT.DICTIONARY_LIST[language_id]
|
||||||
@@ -144,6 +179,11 @@ class ProcessFile:
|
|||||||
raise ValueError("Not a valid language selected")
|
raise ValueError("Not a valid language selected")
|
||||||
return self.out_folder / f"{self.input_file.stem}.{self.language_id}."
|
return self.out_folder / f"{self.input_file.stem}.{self.language_id}."
|
||||||
|
|
||||||
|
@property
def dictionary_resource_file(self):
    """Path of the per-language TSV resource used for dictionary files"""
    tsv_name = f"dictionary.{self.language_id}.tsv"
    return self.resources / tsv_name
|
||||||
|
|
||||||
|
|
||||||
class TranslationResult:
|
class TranslationResult:
|
||||||
"""Result of a translated process"""
|
"""Result of a translated process"""
|
||||||
@@ -159,3 +199,23 @@ class TranslationResult:
|
|||||||
self.translated = translated
|
self.translated = translated
|
||||||
self.line = line
|
self.line = line
|
||||||
self.audio_path = audio_path
|
self.audio_path = audio_path
|
||||||
|
|
||||||
|
|
||||||
|
class DictionaryResult:
    """Result of a dictionary process"""

    def __init__(
        self,
        language_id: str,
        simplified: str,
        traditional: str,
        pinyin: str,
        meanings: list[str],
        audio_path: Path,
    ):
        # Language the meanings are written in.
        self.language_id = language_id
        # Word in simplified and traditional characters.
        self.simplified = simplified
        self.traditional = traditional
        self.pinyin = pinyin
        # One item per meaning; the Anki output numbers them item by item.
        self.meanings = meanings
        # Audio file attached to the note.
        self.audio_path = audio_path
|
||||||
|
|||||||
Reference in New Issue
Block a user