remove data files to separate git

This commit is contained in:
Wolfang Torres
2026-06-11 16:41:34 +08:00
parent 4822c5bbed
commit 5ada53913a
13 changed files with 1061 additions and 318 deletions

View File

@@ -0,0 +1,4 @@
# SPDX-FileCopyrightText: 2026-present Wolfang Torres <wolfang.torres@gmail.com>
#
# SPDX-License-Identifier: GPL-3.0-or-later
# Version string of the package.
__version__ = "0.0.1"

View File

@@ -0,0 +1,9 @@
"""anki-hsk-creator"""
import os
# Globals
# SECURITY: a Hugging Face access token used to be hard-coded here and written
# into ``os.environ["HF_TOKEN"]`` on import.  Credentials must never live in
# source control: revoke the token that was committed and supply ``HF_TOKEN``
# through the process environment instead.  Not touching the variable here also
# means a token the user already exported is no longer overwritten.

# True once the Argos Translate package index has been refreshed.
ARGOS_UPDATED = False
# Cached list of available Argos Translate packages (None until fetched).
ARGOS_PACKAGES = None

View File

@@ -0,0 +1,184 @@
## Imports
from pathlib import Path
import random
import csv
## PIP
from pinyin_tone_converter.pinyin_tone_converter import PinyinToneConverter
## Main
def process_files():
    """Interactively pick an input file and derive its output locations.

    Starting at ``INPUT``, the entries of the current directory are listed and
    the user picks one by number.  Picking a directory descends into it;
    picking a regular file ends the selection.

    Returns:
        tuple: ``(in_file, out_file, resources, file_type)`` where ``out_file``
        mirrors ``in_file`` under ``OUTPUT``, ``resources`` is a directory
        under ``RESOURCES`` named after the file (without its last suffix) and
        ``file_type`` is the first whitespace-delimited token of the file
        (``""`` when the file has no content).

    Raises:
        FileNotFoundError: if the directory being browsed has no entries, as
            there would be nothing valid to select.
    """
    print("Select data file:")
    in_file = None
    level = INPUT
    while not in_file:
        # glob() yields entries in arbitrary order; sort so the numbering is
        # the same on every run.
        files = sorted(level.glob("*"))
        if not files:
            # Without this guard the prompt below could never be satisfied.
            raise FileNotFoundError(f"No entries to select in {level}")
        for n, file in enumerate(files, start=1):
            print(f"{n} - {file.relative_to(INPUT)}")
        s = None
        # isdecimal() (unlike isnumeric()) only accepts characters that int()
        # can parse, so input such as "²" re-prompts instead of crashing.
        while not s or not s.isdecimal() or not (1 <= int(s) <= len(files)):
            s = input(f"Please select the file [1-{len(files)}]: ")
        selected = files[int(s) - 1]
        if selected.is_file():
            in_file = selected
        else:
            level = selected
    relative = in_file.relative_to(INPUT)
    out_file = OUTPUT / relative
    resources = RESOURCES / relative
    # Resources live in a directory named after the file, minus its suffix.
    resources = resources.parent / resources.stem
    resources.mkdir(parents=True, exist_ok=True)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with in_file.open(encoding="utf8") as input_file:
        tokens = input_file.read().split()
    # An empty (or whitespace-only) file has no first token.
    file_type = tokens[0] if tokens else ""
    return in_file, out_file, resources, file_type
def _select_entry(word, pinyin, candidates):
    """Return the dictionary entry to use for *word*, prompting only if needed."""
    if len(candidates) == 1:
        return candidates[0]
    print(f"\nWARNING: {word} has multiple meanings:")
    matches = candidates
    if pinyin and pinyin != "ERROR":
        # Narrow by the pinyin stored in the file; if it matches nothing, fall
        # back to every candidate rather than indexing an empty list.
        matches = [c for c in candidates if c.pinyin == pinyin] or candidates
    if len(matches) == 1:
        return matches[0]
    for n, entry in enumerate(matches, start=1):
        print(f"{n} - {entry}")
        for meaning in entry.meanings:
            print(f"\t{meaning}")
    s = None
    # Validate against (and index into) the list that was actually displayed.
    while not s or not s.isdecimal() or not (1 <= int(s) <= len(matches)):
        s = input(f"Please select the correct word [1-{len(matches)}]: ")
    return matches[int(s) - 1]


def dictionary_process(dictionary, tts, in_file, resources):
    """Resolve every word of a dictionary file and make sure its audio exists.

    Each line of *in_file* holds a word optionally followed by its pinyin.
    The word is looked up in *dictionary*; ambiguous words are disambiguated
    by the stored pinyin or, failing that, by asking the user.  Missing audio
    is generated with *tts* into ``resources / "<word>.wav"``.  On success
    *in_file* is rewritten as ``word<TAB>pinyin`` lines (``ERROR`` in place of
    the pinyin for unknown words) so later runs do not ask again.

    Args:
        dictionary: mapping of word -> list of entries exposing ``pinyin``
            and ``meanings``.
        tts: object providing ``generate(text, language_id=...)`` and ``sr``;
            only used when an audio file is missing.
        in_file: path of the word list; read and then rewritten.
        resources: directory that receives the audio files.

    Returns:
        list: ``(entry, audio_path)`` tuples for the words that were found.
        If processing fails part-way, the tuples gathered so far are returned
        and *in_file* is left untouched.
    """
    with in_file.open(encoding="utf8") as input_file:
        text = input_file.read().strip()
    words_list = text.split("\n") if text else []
    results = []
    # The new file content is assembled in memory and written only once all
    # words were processed, so an error or Ctrl-C never truncates the input.
    rewritten = []
    try:
        for words in words_list:
            tokens = words.split()
            if not tokens:
                # Keep blank lines as they are instead of crashing on them.
                rewritten.append("")
                continue
            word = tokens[0]
            pinyin = " ".join(tokens[1:]) or None
            candidates = dictionary.get(word)
            if not candidates:
                print("============================================")
                print(f"===================>ERROR: {word} not found")
                print("============================================")
                rewritten.append(f"{word}\tERROR")
                continue
            entry = _select_entry(word, pinyin, candidates)
            audio_path = resources / f"{word}.wav"
            if not audio_path.exists():
                audio = tts.generate(f"{word}", language_id="zh")
                torchaudio.save(audio_path, audio, tts.sr)
            rewritten.append(f"{word}\t{entry.pinyin}")
            results.append((entry, audio_path))
    except Exception as error:
        # Best effort, as before: keep what was resolved, but say what failed.
        print(f"ERROR: processing stopped, {in_file} left unchanged: {error!r}")
        return results
    with in_file.open("w", encoding="utf8") as input_file:
        input_file.write("".join(f"{line}\n" for line in rewritten))
    return results
def translator_process(tts, resources, in_file):
    """Translate every phrase of a phrases file and make sure its audio exists.

    The file is split on whitespace, so each whitespace-delimited token is
    treated as one phrase.  Audio is stored as ``resources / "N<index>.wav"``
    and only generated when that file does not exist yet; because the name
    depends on the position alone, reordering the input reuses existing audio
    for a different phrase.

    Args:
        tts: object providing ``generate(text, language_id=...)`` and ``sr``.
        resources: directory that receives the audio files.
        in_file: path of the phrases file.

    Returns:
        list: one ``[translated, phrase, audio_path]`` list per phrase.
    """
    # Use a context manager so the file handle is closed deterministically.
    with in_file.open(encoding="utf8") as input_file:
        # split() already discards surrounding whitespace of every token.
        text_list = input_file.read().split()
    results = []
    for n, phrase in enumerate(text_list):
        audio_path = resources / f"N{n}.wav"
        if not audio_path.exists():
            audio = tts.generate(f"{phrase}", language_id="zh")
            torchaudio.save(audio_path, audio, tts.sr)
        translated = argostranslate.translate.translate(phrase, CN, EN)
        results.append([translated, phrase, audio_path])
    return results
# def output_tsv(out_file, results):
# """writes the output as a tsv file"""
# final_file = out_file.parent / f"{out_file.stem}.tsv"
# with final_file.open("w", encoding="utf8", newline="") as csvfile:
# writer = csv.writer(csvfile, delimiter="\t", quotechar='"')
# for entry in results:
# writer.writerow(
# [
# "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
# PinyinToneConverter().convert_text(entry.pinyin),
# entry.simplified,
# entry.traditional,
# ]
# )
def output_anki_dictionary(out_file, results):
    """Write the dictionary results as an Anki package next to *out_file*.

    The package is written to ``<out_file stem>.apkg`` in ``out_file``'s
    directory.  The deck name is the path of *out_file* relative to
    ``OUTPUT`` (without the suffix), joined with ``::``.  The deck id is
    drawn at random on every call.

    Args:
        out_file: output path located under ``OUTPUT``.
        results: iterable of ``(entry, audio_path)`` pairs; ``entry`` exposes
            ``meanings``, ``pinyin``, ``simplified`` and ``traditional``.
    """
    final_file = out_file.parent / f"{out_file.stem}.apkg"
    deck_name = "::".join(out_file.relative_to(OUTPUT).parts[:-1] + (out_file.stem,))
    deck = Deck(random.randrange(1 << 30, 1 << 31), deck_name)
    package = Package(deck)
    # One converter serves every note; no need to rebuild it per entry.
    converter = PinyinToneConverter()
    audios = []
    for entry, audio in results:
        note = Note(
            model=HSK_MODEL,
            fields=[
                # Numbered list of meanings, one per line.
                "\n ".join(f"{n+1}. {m}" for n, m in enumerate(entry.meanings)),
                converter.convert_text(entry.pinyin),
                entry.simplified,
                entry.traditional,
                f"[sound:{audio.name}]",
            ],
        )
        audios.append(audio)
        deck.add_note(note)
    # The audio files are bundled as the package's media.
    package.media_files = audios
    package.write_to_file(final_file)
def output_anki_phrase(out_file, results):
    """Write the phrase results as an Anki package next to *out_file*.

    The package lands in ``out_file``'s directory as ``<stem>.apkg``; the
    deck is named after the path of *out_file* relative to ``OUTPUT`` (suffix
    dropped), with ``::`` between the parts, and gets a random id.

    Args:
        out_file: output path located under ``OUTPUT``.
        results: iterable of ``(translated, phrase, audio_path)`` triples.
    """
    stem = out_file.stem
    name_parts = out_file.relative_to(OUTPUT).parts[:-1] + (stem,)
    deck_id = random.randrange(1 << 30, 1 << 31)
    deck = Deck(deck_id, "::".join(name_parts))
    package = Package(deck)

    media = []
    for translated, phrase, audio in results:
        fields = [translated, phrase, f"[sound:{audio.name}]"]
        deck.add_note(Note(model=PHRASE_MODEL, fields=fields))
        media.append(audio)

    # Ship the audio files together with the deck.
    package.media_files = media
    package.write_to_file(out_file.parent / f"{stem}.apkg")
def main():
    """Run the interactive deck builder.

    Builds the TTS engine, the dictionary and the translator once, then
    repeatedly asks for an input file and writes the matching Anki package.
    The kind of processing is chosen from the file's suffixes.  The loop has
    no exit condition of its own.

    Raises:
        TypeError: if the selected file carries neither the phrases nor the
            dictionary suffix.
    """
    tts = create_tts()
    dictionary = create_cedict()
    # create_translator is declared with two language-code parameters; pass
    # the same codes that are used when translating.
    create_translator(CN, EN)
    while True:
        in_file, out_file, resources, file_type = process_files()
        if PHRASES_TYPE in in_file.suffixes:
            results = translator_process(tts, resources, in_file)
            output_anki_phrase(out_file, results)
        elif DICT_TYPE in in_file.suffixes:
            results = dictionary_process(dictionary, tts, in_file, resources)
            output_anki_dictionary(out_file, results)
        else:
            raise TypeError("Error, filetype not especified!")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,88 @@
# anki-models.py
from genanki import Deck, Note, Model, Package
# Constants
# Stylesheet shared by the note models below (each one is built with css=CSS).
# Defines the base ``.card`` look plus the ``.simple`` and ``.trad`` classes
# referenced by the HSK model's templates.
CSS = """
.card {
font-family: arial;
font-size: 20px;
text-align: center;
color: black;
background-color: white;
}
.simple {
font-family: Arial;
font-size: 100px;
}
.trad {
font-family: Arial;
font-size: 75px;
}
"""
# Models
# Note type for translated phrases: three fields and three card templates
# (translation -> phrase, phrase -> translation, audio -> phrase).
_PHRASE_FIELDS = [
    {"name": field_name} for field_name in ("Translated", "Phrase", "Audio")
]
_PHRASE_TEMPLATES = [
    dict(
        name="Card 1",
        qfmt="{{Translated}}<br>{{Audio}}",
        afmt='{{FrontSide}}<hr id="answer">{{Phrase}}',
    ),
    dict(
        name="Card 2",
        qfmt="{{Phrase}}<br>{{Audio}}",
        afmt='{{FrontSide}}<hr id="answer">{{Translated}}',
    ),
    dict(
        name="Card 3",
        qfmt="{{Audio}}",
        afmt='{{FrontSide}}<hr id="answer">{{Phrase}}',
    ),
]
PHRASE_MODEL = Model(
    2076166425,
    "Phrase Model",
    fields=_PHRASE_FIELDS,
    templates=_PHRASE_TEMPLATES,
    css=CSS,
)
# Note type for dictionary words: five fields and three card templates
# (pinyin/English -> characters, characters -> pinyin/English, audio -> all).
HSK_MODEL = Model(
    1708536519,
    "HSK Model",
    fields=[
        {"name": "English"},
        {"name": "Pinyin"},
        {"name": "Simplified"},
        {"name": "Traditional"},
        {"name": "Audio"},
    ],
    templates=[
        {
            "name": "Card 1",
            "qfmt": "<strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}",
            # The separator used to read <hr id='answer''>; the stray quote
            # made the markup malformed.
            "afmt": "{{FrontSide}}<hr id='answer'><div class='simple'>"
            "{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
        },
        {
            "name": "Card 2",
            "qfmt": "<div class='simple'>{{Simplified}}</div><br><div class='trad'>"
            "{{Traditional}}</div>",
            "afmt": '{{FrontSide}}<hr id="answer"><strong>{{Pinyin}}</strong><br>{{English}}<br>{{Audio}}',
        },
        {
            "name": "Card 3",
            "qfmt": "{{Audio}}",
            # Same stray-quote fix as in "Card 1".
            "afmt": "{{FrontSide}}<hr id='answer'><strong>{{Pinyin}}</strong><br><div class='simple'>"
            "{{Simplified}}</div><br><div class='trad'>{{Traditional}}</div>",
        },
    ],
    css=CSS,
)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,19 @@
## Imports
from pathlib import Path
import random
import importlib.resources
# Dictionary data file, located through the package's resources.
# NOTE(review): the package name passed here contains hyphens - confirm it
# matches the name the package is actually importable under.
_PACKAGE_FILES = importlib.resources.files("anki-hsk-creator")
CCCEDICT = _PACKAGE_FILES.joinpath("cedict_ts.u8")

# Data directories: a "data" folder beside this module's parent directory.
_MODULE_DIR = Path(__file__).parent
DATA = _MODULE_DIR.parent.joinpath("data")
INPUT = DATA.joinpath("input")
OUTPUT = DATA.joinpath("output")
RESOURCES = DATA.joinpath("resources")

# File Types (compared against the suffixes of the selected input file)
PHRASES_TYPE = ".phrases"
DICT_TYPE = ".dictionary"

# Language codes
CN = "zh"
EN = "en"

View File

@@ -0,0 +1,53 @@
from cedict_utils.cedict import CedictParser
import argostranslate.package
import argostranslate.translate
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
import torch
import torchaudio
from . import ARGOS_UPDATED, ARGOS_PACKAGES
from . import CCCEDICT
## Functions
def create_cedict(language_id="en"):
    """Parse the CEDICT file into a lookup table keyed by simplified form.

    Args:
        language_id: accepted but not used by the current implementation.

    Returns:
        dict: simplified word -> list of every parsed entry sharing that
        simplified form, in the order the parser produced them.
    """
    cedict_parser = CedictParser()
    cedict_parser.read_file(CCCEDICT)

    by_simplified = {}
    for item in cedict_parser.parse():
        # Start a new bucket on first sight, then append in either case.
        by_simplified.setdefault(item.simplified, []).append(item)
    return by_simplified
def create_translator(from_code="zh", to_code="en"):
    """Download and install the Argos Translate package for a language pair.

    The package index is refreshed and the list of available packages is
    fetched only on the first call; later calls reuse the cached list.

    Args:
        from_code: source language code (defaults to Chinese, ``"zh"``).
        to_code: target language code (defaults to English, ``"en"``).

    Raises:
        LookupError: if no available package translates *from_code* into
            *to_code*.
    """
    # Both names are rebound below, so they must be declared global; without
    # this they are function locals and the first read raises
    # UnboundLocalError.  The cache lives in this module's namespace.
    global ARGOS_UPDATED, ARGOS_PACKAGES
    if not ARGOS_UPDATED:
        argostranslate.package.update_package_index()
        ARGOS_PACKAGES = argostranslate.package.get_available_packages()
        ARGOS_UPDATED = True
    package_to_install = next(
        (
            package
            for package in ARGOS_PACKAGES
            if package.from_code == from_code and package.to_code == to_code
        ),
        None,
    )
    if package_to_install is None:
        # Clearer than the bare StopIteration that next() would raise.
        raise LookupError(
            f"No Argos Translate package available for {from_code} -> {to_code}"
        )
    argostranslate.package.install_from_path(package_to_install.download())
def create_tts():
    """Load the multilingual TTS model on the best available device.

    CUDA is preferred, then MPS; the CPU is used when neither is available.

    Returns:
        The object returned by ``ChatterboxMultilingualTTS.from_pretrained``.
    """
    # Start from the CPU fallback and upgrade if an accelerator is present.
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    return ChatterboxMultilingualTTS.from_pretrained(device=device, t3_model="v3")