|
|
|
|
@@ -9,6 +9,7 @@ from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
|
|
|
|
|
from genanki import Deck, Note, Model, Package
|
|
|
|
|
import argostranslate.package
|
|
|
|
|
import argostranslate.translate
|
|
|
|
|
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
|
|
|
|
|
|
|
|
|
|
## Constants
|
|
|
|
|
|
|
|
|
|
@@ -16,6 +17,7 @@ CCCEDICT = Path(__file__).parent / "cedict_ts.u8"
|
|
|
|
|
DATA = Path(__file__).parent.parent / "data"
|
|
|
|
|
INPUT = DATA / "input"
|
|
|
|
|
OUTPUT = DATA / "output"
|
|
|
|
|
RESOURCES = DATA / "resources"
|
|
|
|
|
CN = "zh"
|
|
|
|
|
EN = "en"
|
|
|
|
|
PHRASES_TYPE = ".phrases"
|
|
|
|
|
@@ -65,6 +67,8 @@ HSK_FRONT_TEMPLATE = """
|
|
|
|
|
<tts service="android" voice="en-US">
|
|
|
|
|
{{English}}
|
|
|
|
|
</tts>
|
|
|
|
|
<br>
|
|
|
|
|
{{MyMedia}}
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
HSK_MODEL = Model(
|
|
|
|
|
@@ -75,6 +79,7 @@ HSK_MODEL = Model(
|
|
|
|
|
{"name": "Pinyin"},
|
|
|
|
|
{"name": "Simplified"},
|
|
|
|
|
{"name": "Traditional"},
|
|
|
|
|
{'name': 'Audio'},
|
|
|
|
|
],
|
|
|
|
|
templates=[
|
|
|
|
|
{
|
|
|
|
|
@@ -87,7 +92,7 @@ HSK_MODEL = Model(
|
|
|
|
|
"name": "Card 2",
|
|
|
|
|
"qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
|
|
|
|
|
"{{Traditional}}</div>",
|
|
|
|
|
"afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}',
|
|
|
|
|
"afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}<br>{{MyMedia}}',
|
|
|
|
|
},
|
|
|
|
|
],
|
|
|
|
|
css=CSS,
|
|
|
|
|
@@ -122,6 +127,9 @@ def create_translator():
|
|
|
|
|
)
|
|
|
|
|
argostranslate.package.install_from_path(package_to_install.download())
|
|
|
|
|
|
|
|
|
|
def create_tts(device=None):
    """Load the pretrained Chatterbox multilingual text-to-speech model.

    Args:
        device: Device string handed to
            ``ChatterboxMultilingualTTS.from_pretrained``. When omitted,
            ``"cuda"`` is chosen if PyTorch reports an available CUDA device
            and ``"cpu"`` otherwise, rather than unconditionally requesting
            CUDA.

    Returns:
        The model instance returned by ``from_pretrained``.
    """
    if device is None:
        # Imported lazily: torch is only needed here to pick a default device.
        import torch

        device = "cuda" if torch.cuda.is_available() else "cpu"
    return ChatterboxMultilingualTTS.from_pretrained(device=device)
|
|
|
|
|
|
|
|
|
|
## Main
|
|
|
|
|
|
|
|
|
|
@@ -145,15 +153,18 @@ def process_files():
|
|
|
|
|
level = selected
|
|
|
|
|
relative = in_file.relative_to(INPUT)
|
|
|
|
|
out_file = OUTPUT / relative
|
|
|
|
|
resources = RESOURCES / relative
|
|
|
|
|
resources = resources.parent / resources.stem
|
|
|
|
|
resources.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
out_file.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
with in_file.open(encoding="utf8") as input_file:
|
|
|
|
|
file_type = input_file.read().split()[0]
|
|
|
|
|
return in_file, out_file, file_type
|
|
|
|
|
return in_file, out_file, resources, file_type
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def dictionary_process(dictionary, in_file):
|
|
|
|
|
def dictionary_process(dictionary, tts, in_file, resources):
|
|
|
|
|
"""Process dictionary files"""
|
|
|
|
|
words_list = in_file.open(encoding="utf8").read().split("\n")
|
|
|
|
|
words_list = in_file.open(encoding="utf8").read().strip().split("\n")
|
|
|
|
|
results = []
|
|
|
|
|
with in_file.open("w", encoding="utf8") as input_file:
|
|
|
|
|
for words in words_list:
|
|
|
|
|
@@ -163,21 +174,27 @@ def dictionary_process(dictionary, in_file):
|
|
|
|
|
if len(v) > 1:
|
|
|
|
|
print(f"\nWARNING: {word} has multiple meanings:")
|
|
|
|
|
if pinyin and pinyin != "ERROR":
|
|
|
|
|
ml = filter(lambda x: v.pinyin == pinyin, v)
|
|
|
|
|
ml = list(filter(lambda x: x.pinyin == pinyin, v))
|
|
|
|
|
else:
|
|
|
|
|
ml = v
|
|
|
|
|
for n, w in enumerate(ml):
|
|
|
|
|
print(f"{n+1} - {w}")
|
|
|
|
|
for m in w.meanings:
|
|
|
|
|
print(f"\t{m}")
|
|
|
|
|
s = None
|
|
|
|
|
while not s or not s.isnumeric() or not (1 <= int(s) <= len(v)):
|
|
|
|
|
s = input(f"Please select the correct word [1-{len(v)}]: ")
|
|
|
|
|
v = v[int(s) - 1]
|
|
|
|
|
if len(ml) > 1:
|
|
|
|
|
for n, w in enumerate(ml):
|
|
|
|
|
print(f"{n+1} - {w}")
|
|
|
|
|
for m in w.meanings:
|
|
|
|
|
print(f"\t{m}")
|
|
|
|
|
s = None
|
|
|
|
|
while not s or not s.isnumeric() or not (1 <= int(s) <= len(v)):
|
|
|
|
|
s = input(f"Please select the correct word [1-{len(v)}]: ")
|
|
|
|
|
v = v[int(s) - 1]
|
|
|
|
|
else:
|
|
|
|
|
v = ml[0]
|
|
|
|
|
else:
|
|
|
|
|
v = v[0]
|
|
|
|
|
audio = tts.generate(word, language_id="zh")
|
|
|
|
|
audio_path = resources / f"{word}.wav"
|
|
|
|
|
ta.save(audio_path, audio, tts.sr)
|
|
|
|
|
input_file.write(f"{word}\t{v.pinyin}\n")
|
|
|
|
|
results.append(v)
|
|
|
|
|
results.append((v, audio_path))
|
|
|
|
|
else:
|
|
|
|
|
print("============================================")
|
|
|
|
|
print(f"===================>ERROR: {word} not found")
|
|
|
|
|
@@ -199,27 +216,29 @@ def translator_process(in_file):
|
|
|
|
|
return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def output_tsv(out_file, results):
    """Write the dictionary entries to a tab-separated file.

    The destination lives in ``out_file``'s directory and reuses its stem
    with a ``.tsv`` extension. Each entry in ``results`` becomes one row with
    four columns: the numbered meanings joined by newlines, the entry's pinyin
    passed through ``PinyinToneConverter().convert_text``, the simplified
    form and the traditional form.
    """
    tsv_path = out_file.parent.joinpath(f"{out_file.stem}.tsv")

    def as_row(item):
        # Meanings are numbered from 1 and kept in a single multi-line cell.
        numbered = "\n ".join(
            f"{idx+1}. {meaning}" for idx, meaning in enumerate(item.meanings)
        )
        converted = PinyinToneConverter().convert_text(item.pinyin)
        return [numbered, converted, item.simplified, item.traditional]

    with tsv_path.open("w", encoding="utf8", newline="") as handle:
        tsv_writer = csv.writer(handle, delimiter="\t", quotechar='"')
        for item in results:
            tsv_writer.writerow(as_row(item))
|
|
|
|
|
# def output_tsv(out_file, results):
|
|
|
|
|
# """writes the output as a tsv file"""
|
|
|
|
|
# final_file = out_file.parent / f"{out_file.stem}.tsv"
|
|
|
|
|
# with final_file.open("w", encoding="utf8", newline="") as csvfile:
|
|
|
|
|
# writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
|
|
|
|
|
# for entry in results:
|
|
|
|
|
# writer.writerow(
|
|
|
|
|
# [
|
|
|
|
|
# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
|
|
|
|
|
# PinyinToneConverter().convert_text(entry.pinyin),
|
|
|
|
|
# entry.simplified,
|
|
|
|
|
# entry.traditional,
|
|
|
|
|
# ]
|
|
|
|
|
# )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def output_anki_dictionary(out_file, results):
|
|
|
|
|
final_file = out_file.parent / f"{out_file.stem}.apkg"
|
|
|
|
|
deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
|
|
|
|
|
deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
|
|
|
|
|
for entry in results:
|
|
|
|
|
package = Package(deck)
|
|
|
|
|
audios = []
|
|
|
|
|
for entry, audio in results:
|
|
|
|
|
note = Note(
|
|
|
|
|
model=HSK_MODEL,
|
|
|
|
|
fields=[
|
|
|
|
|
@@ -227,11 +246,14 @@ def output_anki_dictionary(out_file, results):
|
|
|
|
|
PinyinToneConverter().convert_text(entry.pinyin),
|
|
|
|
|
entry.simplified,
|
|
|
|
|
entry.traditional,
|
|
|
|
|
f"[sound:{audio.name}]"
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
audios.append(audio)
|
|
|
|
|
deck.add_note(note)
|
|
|
|
|
Package(deck).write_to_file(final_file)
|
|
|
|
|
|
|
|
|
|
package.media_files = audios
|
|
|
|
|
package.write_to_file(final_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def output_anki_text(out_file, results):
|
|
|
|
|
final_file = out_file.parent / f"{out_file.stem}.apkg"
|
|
|
|
|
@@ -247,14 +269,15 @@ def output_anki_text(out_file, results):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Convert the file selected by ``process_files`` into an Anki package.

    Dispatch is based on the input file's suffixes:

    * ``PHRASES_TYPE`` present: set up the translator, translate the file and
      write the result with ``output_anki_text``.
    * ``DICT_TYPE`` present: load the TTS model and the dictionary, process
      the file (passing along the resources path returned by
      ``process_files``) and write the result with ``output_anki_dictionary``.

    Raises:
        TypeError: If the input file carries neither suffix.
    """
    # process_files() also returns a file type, but dispatch below relies on
    # the file suffixes only, so that value is intentionally unused.
    in_file, out_file, resources, _file_type = process_files()
    suffixes = in_file.suffixes

    if PHRASES_TYPE in suffixes:
        create_translator()
        results = translator_process(in_file)
        output_anki_text(out_file, results)
    elif DICT_TYPE in suffixes:
        tts = create_tts()
        dictionary = create_cedict()
        # dictionary_process takes the TTS model and the resources path in
        # addition to the dictionary and input file.
        results = dictionary_process(dictionary, tts, in_file, resources)
        output_anki_dictionary(out_file, results)
    else:
        raise TypeError("Error, filetype not especified!")
|
|
|
|
|
|