Add support for paragraph translations
This commit is contained in:
@@ -1,14 +1,14 @@
|
|||||||
## Imports
|
## Imports
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import random
|
||||||
import csv
|
import csv
|
||||||
|
|
||||||
|
## PIP
|
||||||
from cedict_utils.cedict import CedictParser
|
from cedict_utils.cedict import CedictParser
|
||||||
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
|
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
|
||||||
|
from genanki import Deck, Note, Model, Package
|
||||||
# from genanki import Deck, Note
|
import argostranslate.package
|
||||||
|
import argostranslate.translate
|
||||||
# import argostranslate.package
|
|
||||||
# import argostranslate.translate
|
|
||||||
|
|
||||||
## Constants
|
## Constants
|
||||||
|
|
||||||
@@ -16,40 +16,76 @@ CCCEDICT = Path(__file__).parent / "cedict_ts.u8"
|
|||||||
DATA = Path(__file__).parent.parent / "data"
|
DATA = Path(__file__).parent.parent / "data"
|
||||||
INPUT = DATA / "input"
|
INPUT = DATA / "input"
|
||||||
OUTPUT = DATA / "output"
|
OUTPUT = DATA / "output"
|
||||||
CN = "cn"
|
CN = "zh"
|
||||||
EN = "en"
|
EN = "en"
|
||||||
|
TEXT_TYPE = "TEXT_TYPE"
|
||||||
|
CSS = """
|
||||||
|
.card {
|
||||||
|
font-family: arial;
|
||||||
|
font-size: 20px;
|
||||||
|
text-align: center;
|
||||||
|
color: black;
|
||||||
|
background-color: white;
|
||||||
|
}
|
||||||
|
.simple {
|
||||||
|
font-family: Arial;
|
||||||
|
font-size: 100px;
|
||||||
|
}
|
||||||
|
.trad {
|
||||||
|
font-family: Arial;
|
||||||
|
font-size: 75px;
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
## Classes
|
## Classes
|
||||||
|
|
||||||
## Main
|
# Two-field question/answer note type: the question is shown on the front,
# and the back repeats the front followed by the answer.
SIMPLE_MODEL = Model(
    model_id=2076166425,
    name="Simple Model",
    fields=[{"name": "Question"}, {"name": "Answer"}],
    templates=[
        {
            "name": "Card 1",
            "qfmt": "{{Question}}",
            "afmt": '{{FrontSide}}<hr id="answer">{{Answer}}',
        },
    ],
    css=CSS,
)
|
||||||
|
|
||||||
# Download and install Argos Translate package
|
# Vocabulary note type with four fields and two card templates:
#   Card 1 shows pinyin + English and reveals the characters;
#   Card 2 shows the characters and reveals pinyin + English.
HSK_MODEL = Model(
    1708536519,
    "HSK Model",
    fields=[
        {"name": "English"},
        {"name": "Pinyin"},
        {"name": "Simplified"},
        {"name": "Traditional"},
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": "<strong>{{Pinyin}}</strong><br>{{English}}",
            # The id attribute previously carried a stray extra quote
            # (id='answer''), which is malformed HTML.
            "afmt": "{{FrontSide}}<hr id='answer'><div class='simple'>{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
        },
        {
            "name": "Card 2",
            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
            "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}',
        },
    ],
    css=CSS,
)
|
||||||
|
|
||||||
def process_files():
|
|
||||||
print("Select data file:")
|
|
||||||
files = []
|
|
||||||
for n, file in enumerate(INPUT.glob('**/*.txt')):
|
|
||||||
files.append(file)
|
|
||||||
print(f"{n+1} - {file.relative_to(INPUT)}")
|
|
||||||
s = None
|
|
||||||
while not s or not s.isnumeric() or not (1 <= int(s) <= len(files)):
|
|
||||||
s = input(f"Please select the file [1-{len(files)}]: ")
|
|
||||||
in_file = files[int(s)-1]
|
|
||||||
relative = in_file.relative_to(INPUT)
|
|
||||||
out_file = OUTPUT / relative
|
|
||||||
out_file.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
return in_file, out_file
|
|
||||||
|
|
||||||
def dictionary_process(in_file, out_file):
|
## Functions
|
||||||
"""Process dictionary files"""
|
|
||||||
|
|
||||||
|
def create_cedict():
|
||||||
|
"""Creates a create_cedict dictionary object"""
|
||||||
parser = CedictParser()
|
parser = CedictParser()
|
||||||
parser.read_file(CCCEDICT)
|
parser.read_file(CCCEDICT)
|
||||||
entries = parser.parse()
|
entries = parser.parse()
|
||||||
@@ -61,16 +97,48 @@ def dictionary_process(in_file, out_file):
|
|||||||
else:
|
else:
|
||||||
dictionary[entry.simplified].append(entry)
|
dictionary[entry.simplified].append(entry)
|
||||||
|
|
||||||
out_file = DATA / f"{in_file.stem}.tsv"
|
return dictionary
|
||||||
words_list = in_file.open(encoding="utf8").read().split()
|
|
||||||
|
|
||||||
|
|
||||||
|
def create_translator():
    """Ensure the Argos Translate CN -> EN language package is installed.

    If a package translating from ``CN`` to ``EN`` is already installed,
    nothing is downloaded. Otherwise the package index is refreshed and the
    first matching available package is downloaded and installed.

    Raises:
        LookupError: if the package index offers no CN -> EN package.
    """

    def _matches(package):
        return package.from_code == CN and package.to_code == EN

    # Skip the index refresh and download when a previous run already
    # installed the package; this also lets the tool work offline.
    if any(_matches(p) for p in argostranslate.package.get_installed_packages()):
        return

    argostranslate.package.update_package_index()
    available_packages = argostranslate.package.get_available_packages()
    package_to_install = next(filter(_matches, available_packages), None)
    if package_to_install is None:
        # A bare StopIteration from next() gave no hint about what went wrong.
        raise LookupError(
            f"No Argos Translate package available for {CN} -> {EN}"
        )
    argostranslate.package.install_from_path(package_to_install.download())
|
||||||
|
|
||||||
|
|
||||||
|
## Main
|
||||||
|
|
||||||
|
|
||||||
|
def process_files():
    """Ask the user to pick an input file and derive its output location.

    Lists every ``*.txt`` file found recursively under ``INPUT``, prompts
    until a valid menu number is entered, and creates the matching output
    directory under ``OUTPUT``.

    Returns:
        A tuple ``(in_file, out_file, file_type)`` where ``out_file`` is
        ``OUTPUT`` joined with the input path relative to ``INPUT`` and
        ``file_type`` is the first whitespace-delimited token of the input
        file (an empty string when the file has no tokens).

    Raises:
        FileNotFoundError: if no ``*.txt`` file exists under ``INPUT``.
    """
    # Sorted so the menu numbering is stable between runs; glob order is not.
    files = sorted(INPUT.glob("**/*.txt"))
    if not files:
        # Without this guard the prompt below could never be satisfied.
        raise FileNotFoundError(f"No .txt files found under {INPUT}")

    print("Select data file:")
    for n, file in enumerate(files, start=1):
        print(f"{n} - {file.relative_to(INPUT)}")

    s = None
    # isdecimal() only accepts characters int() can parse; isnumeric() also
    # accepts e.g. fractions, which would make int() raise ValueError.
    while not s or not s.isdecimal() or not (1 <= int(s) <= len(files)):
        s = input(f"Please select the file [1-{len(files)}]: ").strip()

    in_file = files[int(s) - 1]
    relative = in_file.relative_to(INPUT)
    out_file = OUTPUT / relative
    out_file.parent.mkdir(parents=True, exist_ok=True)

    # Scan line by line for the first token instead of loading the whole
    # file, and tolerate a file that contains no tokens at all.
    file_type = ""
    with in_file.open(encoding="utf8") as input_file:
        for line in input_file:
            tokens = line.split()
            if tokens:
                file_type = tokens[0]
                break
    return in_file, out_file, file_type
|
||||||
|
|
||||||
|
|
||||||
|
def dictionary_process(dictionary, in_file):
|
||||||
|
"""Process dictionary files"""
|
||||||
|
words_list = in_file.open(encoding="utf8").read().split()
|
||||||
results = []
|
results = []
|
||||||
for word in words_list:
|
for word in words_list:
|
||||||
if v := dictionary.get(word):
|
if v := dictionary.get(word):
|
||||||
if len(v) > 1:
|
if len(v) > 1:
|
||||||
print(
|
print(f"\nWARNING: {word} has multiple meanings:")
|
||||||
f"\nWARNING: {word} has multiple meanings:"
|
|
||||||
)
|
|
||||||
for n, w in enumerate(v):
|
for n, w in enumerate(v):
|
||||||
print(f"{n+1} - {w}")
|
print(f"{n+1} - {w}")
|
||||||
for m in w.meanings:
|
for m in w.meanings:
|
||||||
@@ -86,7 +154,26 @@ def dictionary_process(in_file, out_file):
|
|||||||
print("============================================")
|
print("============================================")
|
||||||
print(f"===================>ERROR: {word} not found")
|
print(f"===================>ERROR: {word} not found")
|
||||||
print("============================================")
|
print("============================================")
|
||||||
with out_file.open("w", encoding="utf8", newline="") as csvfile:
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def trasnlator_process(in_file):
    """Translate the sentences of a text file from ``CN`` to ``EN``.

    The file is split on whitespace and the first token is skipped. Each
    remaining chunk is split on the full stop "。" and every non-empty piece
    is translated with Argos Translate.

    Args:
        in_file: path object of the UTF-8 text file to translate.

    Returns:
        A list of ``[translated_text, original_text]`` pairs, in file order.
    """
    # Context manager so the file handle is closed deterministically.
    with in_file.open(encoding="utf8") as input_file:
        text_list = input_file.read().split()[1:]

    results = []
    for text in text_list:
        # split() already removed surrounding whitespace, so no strip needed.
        for par in text.split("。"):
            if par:
                translated_text = argostranslate.translate.translate(par, CN, EN)
                results.append([translated_text, par])
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def output_tsv(out_file, results):
|
||||||
|
"""writes the output as a tsv file"""
|
||||||
|
final_file = out_file.parent / f"{out_file.stem}.tsv"
|
||||||
|
with final_file.open("w", encoding="utf8", newline="") as csvfile:
|
||||||
writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
|
writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
|
||||||
for entry in results:
|
for entry in results:
|
||||||
writer.writerow(
|
writer.writerow(
|
||||||
@@ -98,9 +185,49 @@ def dictionary_process(in_file, out_file):
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def output_anki_dictionary(out_file, results):
    """Write dictionary entries to an Anki package built on ``HSK_MODEL``.

    The package is written next to ``out_file`` with an ``.apkg`` extension.
    The deck name is the path of ``out_file`` relative to ``OUTPUT`` with the
    file extension dropped and the parts joined by "::".

    Args:
        out_file: output path located under ``OUTPUT``.
        results: iterable of entries exposing ``meanings``, ``pinyin``,
            ``simplified`` and ``traditional`` attributes.
    """
    final_file = out_file.parent / f"{out_file.stem}.apkg"
    deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
    # One converter is enough; building it per entry was loop-invariant work.
    converter = PinyinToneConverter()
    for entry in results:
        note = Note(
            model=HSK_MODEL,
            # Field order must match HSK_MODEL's field list.
            fields=[
                "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
                converter.convert_text(entry.pinyin),
                entry.simplified,
                entry.traditional,
            ],
        )
        deck.add_note(note)
    Package(deck).write_to_file(final_file)
|
||||||
|
|
||||||
|
|
||||||
|
def output_anki_text(out_file, results):
    """Write translated text pairs to an Anki package built on ``SIMPLE_MODEL``.

    Each item of ``results`` is used directly as the field list of one note.
    The package is written next to ``out_file`` with an ``.apkg`` extension,
    and the deck name is the path of ``out_file`` relative to ``OUTPUT`` with
    the extension dropped and the parts joined by "::".
    """
    stem = out_file.stem
    parent_parts = out_file.relative_to(OUTPUT).parts[:-1]
    deck_name = "::".join([*parent_parts, stem])

    deck_id = random.randrange(1 << 30, 1 << 31)
    deck = Deck(deck_id, deck_name)
    for fields in results:
        deck.add_note(Note(model=SIMPLE_MODEL, fields=fields))

    target = out_file.with_name(f"{stem}.apkg")
    Package(deck).write_to_file(target)
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Pick an input file and build the matching Anki package.

    Files whose type marker equals ``TEXT_TYPE`` are machine-translated into
    a text deck; every other file is looked up in the CEDICT dictionary and
    written as a vocabulary deck.
    """
    in_file, out_file, file_type = process_files()

    if file_type != TEXT_TYPE:
        # Word-list path: dictionary lookup.
        dictionary = create_cedict()
        results = dictionary_process(dictionary, in_file)
        output_anki_dictionary(out_file, results)
        return

    # Paragraph path: machine translation.
    create_translator()
    results = trasnlator_process(in_file)
    output_anki_text(out_file, results)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
0
data/HSK1-1.txt
Normal file
0
data/HSK1-1.txt
Normal file
2
data/input/口语/口语-第9课.-text.txt
Normal file
2
data/input/口语/口语-第9课.-text.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
TEXT_TYPE
|
||||||
|
周六那场篮球比在,对手很厉害。前半场他们一直赢,后半场我们对才超过他们,领先得并不轻松。
|
||||||
BIN
data/output/HSK1/HSK1-1.apkg
Normal file
BIN
data/output/HSK1/HSK1-1.apkg
Normal file
Binary file not shown.
BIN
data/output/口语/口语-第9课.-text.apkg
Normal file
BIN
data/output/口语/口语-第9课.-text.apkg
Normal file
Binary file not shown.
Reference in New Issue
Block a user