Add support for paragraph translations

This commit is contained in:
Wolfang Torres
2026-05-20 02:06:39 +08:00
parent 6382d03475
commit d0e3693574
13 changed files with 173 additions and 44 deletions

View File

@@ -1,14 +1,14 @@
## Imports
from pathlib import Path
import random
import csv
## PIP
from cedict_utils.cedict import CedictParser
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
# from genanki import Deck, Note
# import argostranslate.package
# import argostranslate.translate
from genanki import Deck, Note, Model, Package
import argostranslate.package
import argostranslate.translate
## Constants
@@ -16,40 +16,76 @@ CCCEDICT = Path(__file__).parent / "cedict_ts.u8"
# Data directory: a "data" folder two levels above this module's file.
DATA = Path(__file__).parent.parent / "data"
# Input text files are discovered under INPUT; generated files are written
# under OUTPUT using the same relative layout.
INPUT = DATA / "input"
OUTPUT = DATA / "output"
# Language codes used both to select the Argos Translate package and as the
# source/target of each translation call.
CN = "zh"
EN = "en"
# Marker compared against the first whitespace-separated token of an input
# file; a match selects the text-translation path instead of the dictionary
# path.
TEXT_TYPE = "TEXT_TYPE"
# Stylesheet passed to the Anki note models defined in this module.
CSS = """
.card {
font-family: arial;
font-size: 20px;
text-align: center;
color: black;
background-color: white;
}
.simple {
font-family: Arial;
font-size: 100px;
}
.trad {
font-family: Arial;
font-size: 75px;
}
"""
## Classes
## Main
# Two-field note model: the card front shows the "Question" field and the
# back repeats the front followed by a divider and the "Answer" field.
SIMPLE_MODEL = Model(
    model_id=2076166425,
    name="Simple Model",
    fields=[{"name": field_name} for field_name in ("Question", "Answer")],
    templates=[
        dict(
            name="Card 1",
            qfmt="{{Question}}",
            afmt='{{FrontSide}}<hr id="answer">{{Answer}}',
        ),
    ],
    css=CSS,
)
# Download and install Argos Translate package
# argostranslate.package.update_package_index()
# available_packages = argostranslate.package.get_available_packages()
# package_to_install = next(
# filter(
# lambda x: x.from_code == CN and x.to_code == EN, available_packages
# )
# )
# argostranslate.package.install_from_path(package_to_install.download())
# Four-field vocabulary model that generates two cards per note:
#   Card 1 prompts with pinyin + English and reveals the characters.
#   Card 2 prompts with the characters and reveals pinyin + English.
HSK_MODEL = Model(
    1708536519,
    "HSK Model",
    fields=[
        {"name": "English"},
        {"name": "Pinyin"},
        {"name": "Simplified"},
        {"name": "Traditional"},
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": "<strong>{{Pinyin}}</strong><br>{{English}}",
            # The divider's id attribute previously carried a stray closing
            # quote (id='answer''), which is malformed HTML.
            "afmt": "{{FrontSide}}<hr id='answer'><div class='simple'>{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
        },
        {
            "name": "Card 2",
            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
            "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}',
        },
    ],
    css=CSS,
)
def process_files():
    """Ask the user to pick an input file and derive its output path.

    Every ``*.txt`` file found recursively under ``INPUT`` is listed with a
    1-based number, and the user is prompted on stdin until a valid number is
    entered. The parent directory of the output path is created if needed.

    Returns:
        A tuple ``(in_file, out_file)`` where ``out_file`` is ``OUTPUT``
        joined with the path of ``in_file`` relative to ``INPUT``.

    Raises:
        FileNotFoundError: if no ``*.txt`` file exists under ``INPUT``.
    """
    # Sorted so the menu numbering is the same on every run.
    files = sorted(INPUT.glob("**/*.txt"))
    if not files:
        # With an empty list the prompt below could never be satisfied and
        # would loop forever.
        raise FileNotFoundError(f"No .txt files found under {INPUT}")
    print("Select data file:")
    for n, file in enumerate(files, start=1):
        print(f"{n} - {file.relative_to(INPUT)}")
    s = None
    # isdecimal() only accepts characters that int() can parse, unlike
    # isnumeric(), which also accepts e.g. superscripts and fractions.
    while not s or not s.isdecimal() or not (1 <= int(s) <= len(files)):
        s = input(f"Please select the file [1-{len(files)}]: ")
    in_file = files[int(s) - 1]
    relative = in_file.relative_to(INPUT)
    out_file = OUTPUT / relative
    out_file.parent.mkdir(parents=True, exist_ok=True)
    return in_file, out_file
def dictionary_process(in_file, out_file):
"""Process dictionary files"""
## Functions
def create_cedict():
"""Creates a create_cedict dictionary object"""
parser = CedictParser()
parser.read_file(CCCEDICT)
entries = parser.parse()
@@ -61,16 +97,48 @@ def dictionary_process(in_file, out_file):
else:
dictionary[entry.simplified].append(entry)
out_file = DATA / f"{in_file.stem}.tsv"
words_list = in_file.open(encoding="utf8").read().split()
return dictionary
def create_translator():
    """Download and install the Argos Translate package for CN -> EN.

    Refreshes the Argos package index, selects the first available package
    whose source language is ``CN`` and target language is ``EN``, downloads
    it and installs it.

    Raises:
        RuntimeError: if the package index lists no CN -> EN package.
    """
    argostranslate.package.update_package_index()
    available_packages = argostranslate.package.get_available_packages()
    # A default of None avoids leaking a bare StopIteration when the index
    # has no matching language pair.
    package_to_install = next(
        (
            package
            for package in available_packages
            if package.from_code == CN and package.to_code == EN
        ),
        None,
    )
    if package_to_install is None:
        raise RuntimeError(
            f"No Argos Translate package available for {CN} -> {EN}"
        )
    argostranslate.package.install_from_path(package_to_install.download())
## Main
def process_files():
    """Ask the user to pick an input file and derive its output path and type.

    Every ``*.txt`` file found recursively under ``INPUT`` is listed with a
    1-based number, and the user is prompted on stdin until a valid number is
    entered. The parent directory of the output path is created if needed.

    Returns:
        A tuple ``(in_file, out_file, file_type)`` where ``out_file`` is
        ``OUTPUT`` joined with the path of ``in_file`` relative to ``INPUT``
        and ``file_type`` is the first whitespace-separated token of the
        selected file (an empty string when the file has no tokens).

    Raises:
        FileNotFoundError: if no ``*.txt`` file exists under ``INPUT``.
    """
    # Sorted so the menu numbering is the same on every run.
    files = sorted(INPUT.glob("**/*.txt"))
    if not files:
        # With an empty list the prompt below could never be satisfied and
        # would loop forever.
        raise FileNotFoundError(f"No .txt files found under {INPUT}")
    print("Select data file:")
    for n, file in enumerate(files, start=1):
        print(f"{n} - {file.relative_to(INPUT)}")
    s = None
    # isdecimal() only accepts characters that int() can parse, unlike
    # isnumeric(), which also accepts e.g. superscripts and fractions.
    while not s or not s.isdecimal() or not (1 <= int(s) <= len(files)):
        s = input(f"Please select the file [1-{len(files)}]: ")
    in_file = files[int(s) - 1]
    relative = in_file.relative_to(INPUT)
    out_file = OUTPUT / relative
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with in_file.open(encoding="utf8") as input_file:
        # Only the leading token is needed; maxsplit avoids tokenising the
        # whole file.
        tokens = input_file.read().split(maxsplit=1)
    # An empty or whitespace-only file has no marker token.
    file_type = tokens[0] if tokens else ""
    return in_file, out_file, file_type
def dictionary_process(dictionary, in_file):
"""Process dictionary files"""
words_list = in_file.open(encoding="utf8").read().split()
results = []
for word in words_list:
if v := dictionary.get(word):
if len(v) > 1:
print(
f"\nWARNING: {word} has multiple meanings:"
)
print(f"\nWARNING: {word} has multiple meanings:")
for n, w in enumerate(v):
print(f"{n+1} - {w}")
for m in w.meanings:
@@ -78,7 +146,7 @@ def dictionary_process(in_file, out_file):
s = None
while not s or not s.isnumeric() or not (1 <= int(s) <= len(v)):
s = input(f"Please select the correct word [1-{len(v)}]: ")
v = v[int(s)-1]
v = v[int(s) - 1]
else:
v = v[0]
results.append(v)
@@ -86,21 +154,80 @@ def dictionary_process(in_file, out_file):
print("============================================")
print(f"===================>ERROR: {word} not found")
print("============================================")
with out_file.open("w", encoding="utf8", newline="") as csvfile:
return results
def trasnlator_process(in_file, separator="。"):
    """Translate the text of a ``TEXT_TYPE`` file piece by piece.

    The file is read as UTF-8 and split on whitespace; the first token (the
    file-type marker) is discarded. Each remaining chunk is split on
    ``separator`` and every non-empty piece is translated from ``CN`` to
    ``EN`` with Argos Translate.

    Args:
        in_file: Path-like object exposing ``open(encoding=...)``.
        separator: Non-empty string that delimits the pieces translated
            individually. Defaults to the Chinese full stop. It is not kept
            in the pieces.

    Returns:
        A list of ``[translated_text, original_text]`` pairs in file order.
    """
    # Context manager so the handle is closed deterministically.
    with in_file.open(encoding="utf8") as input_file:
        text_list = input_file.read().split()[1:]
    results = []
    for text in text_list:
        # str.split("") raises ValueError, so an explicit separator is
        # required here.
        for par in text.split(separator):
            if par:
                translated_text = argostranslate.translate.translate(par, CN, EN)
                results.append([translated_text, par])
    return results
def output_tsv(out_file, results):
    """Write dictionary entries to a tab-separated file.

    The file is created next to ``out_file`` with the same stem and a
    ``.tsv`` extension. Each row holds four cells: numbered meanings, pinyin
    converted by ``PinyinToneConverter``, simplified form, traditional form.

    Args:
        out_file: Path whose parent directory and stem name the output file.
        results: Iterable of entries exposing ``meanings``, ``pinyin``,
            ``simplified`` and ``traditional`` attributes.
    """
    final_file = out_file.parent / f"{out_file.stem}.tsv"
    # One converter is enough for every row.
    converter = PinyinToneConverter()
    with final_file.open("w", encoding="utf8", newline="") as csvfile:
        writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
        for entry in results:
            writer.writerow(
                [
                    "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
                    converter.convert_text(entry.pinyin),
                    entry.simplified,
                    entry.traditional,
                ]
            )
def output_anki_dictionary(out_file, results):
    """Write dictionary entries to an Anki package using ``HSK_MODEL``.

    The package is created next to ``out_file`` with the same stem and an
    ``.apkg`` extension. The deck name is the path of ``out_file`` relative
    to ``OUTPUT`` with its directories joined by ``::`` and the file stem as
    the last component. The deck id is drawn at random on every call.

    Args:
        out_file: Path located under ``OUTPUT``.
        results: Iterable of entries exposing ``meanings``, ``pinyin``,
            ``simplified`` and ``traditional`` attributes.
    """
    final_file = out_file.parent / f"{out_file.stem}.apkg"
    deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
    # One converter is enough for every note.
    converter = PinyinToneConverter()
    for entry in results:
        note = Note(
            model=HSK_MODEL,
            # Field order must match the field list declared on HSK_MODEL.
            fields=[
                "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
                converter.convert_text(entry.pinyin),
                entry.simplified,
                entry.traditional,
            ],
        )
        deck.add_note(note)
    Package(deck).write_to_file(final_file)
def output_anki_text(out_file, results):
    """Write translated text pairs to an Anki package using ``SIMPLE_MODEL``.

    Each item of ``results`` is passed unchanged as the field list of one
    note. The deck is named after the path of ``out_file`` relative to
    ``OUTPUT`` (directories joined by ``::``, file stem last) and saved next
    to ``out_file`` with an ``.apkg`` extension.
    """
    stem = out_file.stem
    folders = out_file.relative_to(OUTPUT).parts[:-1]
    deck_name = "::".join((*folders, stem))
    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
    for fields in results:
        deck.add_note(Note(model=SIMPLE_MODEL, fields=fields))
    target = out_file.parent / f"{stem}.apkg"
    Package(deck).write_to_file(target)
def main():
    """Run the interactive pipeline: pick a file, process it, write a deck.

    Files whose first token equals ``TEXT_TYPE`` are machine-translated and
    exported with the text deck writer; all other files are treated as word
    lists, looked up in the CC-CEDICT dictionary and exported with the
    dictionary deck writer.
    """
    in_file, out_file, file_type = process_files()
    if TEXT_TYPE == file_type:
        # The translation package is only installed when it is needed.
        create_translator()
        results = trasnlator_process(in_file)
        output_anki_text(out_file, results)
    else:
        dictionary = create_cedict()
        results = dictionary_process(dictionary, in_file)
        output_anki_dictionary(out_file, results)


if __name__ == "__main__":
    main()

0
data/HSK1-1.txt Normal file
View File

View File

@@ -0,0 +1,2 @@
TEXT_TYPE
周六那场篮球比在,对手很厉害。前半场他们一直赢,后半场我们对才超过他们,领先得并不轻松。

Binary file not shown.

Binary file not shown.