107 lines
3.3 KiB
Python
107 lines
3.3 KiB
Python
## Imports
|
|
from pathlib import Path
|
|
import csv
|
|
|
|
from cedict_utils.cedict import CedictParser
|
|
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
|
|
|
|
# from genanki import Deck, Note
|
|
|
|
# import argostranslate.package
|
|
# import argostranslate.translate
|
|
|
|
## Constants
|
|
|
|
# Directory that contains this module; every path below is derived from it.
_MODULE_DIR = Path(__file__).parent

# CC-CEDICT dictionary file, expected to sit next to this module.
CCCEDICT = _MODULE_DIR / "cedict_ts.u8"

# Data tree lives one level above the module directory.
DATA = _MODULE_DIR.parent / "data"
INPUT = DATA.joinpath("input")    # word lists to process (*.txt)
OUTPUT = DATA.joinpath("output")  # mirrored destination tree

# Language codes; in this file they are referenced only by the commented-out
# Argos Translate setup further down.
CN, EN = "cn", "en"
|
|
|
|
## Classes
|
|
|
|
## Main
|
|
|
|
# Download and install Argos Translate package
|
|
# argostranslate.package.update_package_index()
|
|
# available_packages = argostranslate.package.get_available_packages()
|
|
# package_to_install = next(
|
|
# filter(
|
|
# lambda x: x.from_code == CN and x.to_code == EN, available_packages
|
|
# )
|
|
# )
|
|
# argostranslate.package.install_from_path(package_to_install.download())
|
|
|
|
def process_files():
    """Interactively pick an input file and derive its output location.

    Lists every ``*.txt`` file found recursively under ``INPUT``, asks the
    user to choose one by number, and mirrors the file's path (relative to
    ``INPUT``) under ``OUTPUT``. The parent directory of the output path is
    created if it does not exist yet.

    Returns:
        A ``(in_file, out_file)`` tuple of ``Path`` objects.

    Raises:
        FileNotFoundError: If no ``*.txt`` file exists under ``INPUT``.
    """
    # Sort so the menu numbering is stable between runs; glob order is
    # filesystem-dependent.
    files = sorted(INPUT.glob("**/*.txt"))
    if not files:
        # Without this guard the prompt below could never be satisfied
        # (no number lies in the range [1, 0]) and would loop forever.
        raise FileNotFoundError(f"No .txt files found under {INPUT}")

    print("Select data file:")
    for number, file in enumerate(files, start=1):
        print(f"{number} - {file.relative_to(INPUT)}")

    while True:
        answer = input(f"Please select the file [1-{len(files)}]: ").strip()
        # isdecimal() only accepts characters that int() can parse, whereas
        # isnumeric() also lets through e.g. "½" and crashes int().
        if answer.isdecimal() and 1 <= int(answer) <= len(files):
            break

    in_file = files[int(answer) - 1]
    out_file = OUTPUT / in_file.relative_to(INPUT)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    return in_file, out_file
|
|
|
|
def dictionary_process(in_file, out_file):
    """Look up each word of *in_file* in CC-CEDICT and write the hits as TSV.

    The dictionary at ``CCCEDICT`` is parsed and indexed by simplified form.
    *in_file* is read as UTF-8 and split on whitespace; every resulting word
    is looked up in that index. Words with several dictionary entries prompt
    the user to choose one; words with no entry are reported on stdout and
    skipped.

    Each selected entry becomes one tab-separated row with the columns:
    numbered meanings, pinyin passed through
    ``PinyinToneConverter.convert_text``, simplified form, traditional form.

    Args:
        in_file: ``Path`` of the whitespace-separated word list.
        out_file: ``Path`` of the destination; its suffix is replaced by
            ``.tsv``. (Previously this argument was ignored and the result
            was always written to ``DATA / "<stem>.tsv"``.)
    """
    parser = CedictParser()
    parser.read_file(CCCEDICT)

    # Group entries by simplified form: one headword may have several entries.
    entries_by_word = {}
    for entry in parser.parse():
        entries_by_word.setdefault(entry.simplified, []).append(entry)

    # Honour the destination chosen by the caller instead of overwriting it.
    out_file = out_file.with_suffix(".tsv")
    out_file.parent.mkdir(parents=True, exist_ok=True)

    # read_text() closes the file handle, unlike a bare open().read().
    words_list = in_file.read_text(encoding="utf8").split()

    results = []
    for word in words_list:
        candidates = entries_by_word.get(word)
        if not candidates:
            print("============================================")
            print(f"===================>ERROR: {word} not found")
            print("============================================")
            continue
        results.append(_select_entry(word, candidates))

    # One converter instance is reused for every row.
    converter = PinyinToneConverter()
    with out_file.open("w", encoding="utf8", newline="") as csvfile:
        writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
        for entry in results:
            numbered_meanings = "\n ".join(
                f"{n+1}. {m}" for n, m in enumerate(entry.meanings)
            )
            writer.writerow(
                [
                    numbered_meanings,
                    converter.convert_text(entry.pinyin),
                    entry.simplified,
                    entry.traditional,
                ]
            )


def _select_entry(word, candidates):
    """Return the entry to use for *word*, asking the user when ambiguous."""
    if len(candidates) == 1:
        return candidates[0]

    print(f"\nWARNING: {word} has multiple meanings:")
    for number, candidate in enumerate(candidates, start=1):
        print(f"{number} - {candidate}")
        for meaning in candidate.meanings:
            print(f"\t{meaning}")

    while True:
        answer = input(
            f"Please select the correct word [1-{len(candidates)}]: "
        ).strip()
        # isdecimal() only accepts characters that int() can parse, whereas
        # isnumeric() also lets through e.g. "½" and crashes int().
        if answer.isdecimal() and 1 <= int(answer) <= len(candidates):
            return candidates[int(answer) - 1]
|
|
|
|
def main():
    """Run the interactive workflow: pick a word list, then process it."""
    source_path, destination_path = process_files()
    dictionary_process(source_path, destination_path)
|
|
|
|
# Start the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|