| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183 |
- import argparse
- import shutil
- from pathlib import Path
- from opencc import OpenCC
- from translator import (
- DEFAULT_MODEL_CACHE,
- SUPPORTED_AI_LANGS,
- TranslatorConfig,
- build_translator,
- )
# Input directory holding the simplified-Chinese .srt sources.
SOURCE_DIR = Path("./Subtitle_Trans_CN")

# OpenCC conversion targets that always run: (label, output dir, converter).
# "s2t" = simplified -> generic Traditional Chinese.
BASE_OPENCC_TARGETS = [
    ("通用繁體", Path("./Subtitle_ZH_Hant"), OpenCC("s2t")),
]

# Optional regional Traditional Chinese targets, enabled via --include-tw-hk.
# "s2tw" = Taiwan standard, "s2hk" = Hong Kong standard.
TW_HK_TARGETS = [
    ("台灣繁體", Path("./Subtitle_ZH_Hant_TW"), OpenCC("s2tw")),
    ("香港繁體", Path("./Subtitle_ZH_Hant_HK"), OpenCC("s2hk")),
]

# Output directory per AI-translated language; keys must appear in
# SUPPORTED_AI_LANGS (validated in main()).
AI_LANG_DIRS = {
    "en": Path("./Subtitle_EN"),
    "jp": Path("./Subtitle_JP"),
    "korean": Path("./Subtitle_KOREAN"),
}

# NOTE(review): AI_LANGS is not referenced anywhere in this file —
# presumably imported by another module; verify before removing.
AI_LANGS = ["en", "jp", "korean"]
def parse_args() -> argparse.Namespace:
    """Build and evaluate the CLI for the batch subtitle translator.

    Returns the parsed ``argparse.Namespace`` with backend selection,
    OpenAI settings, local-model cache location, batching limits and
    the optional output/progress flags.
    """
    cli = argparse.ArgumentParser(description="批量翻译 SRT 字幕为多种语言。")
    add = cli.add_argument
    add("--backend", choices=["local", "openai"], default="local")
    add("--openai-model", default="gpt-4o-mini")
    add("--openai-api-key", default=None)
    add(
        "--model-cache-dir",
        default=str(DEFAULT_MODEL_CACHE),
        help="NLLB 本地模型存储目录。",
    )
    add("--batch-size", type=int, default=4)
    add("--max-length", type=int, default=512)
    add(
        "--include-tw-hk",
        action="store_true",
        help="额外生成台繁与港繁字幕。",
    )
    add(
        "--show-language-progress",
        action="store_true",
        help="打印正在翻译的语言。",
    )
    return cli.parse_args()
def ensure_source_dir(source_dir: Path) -> list[Path]:
    """Validate the source directory and collect its subtitle files.

    Returns the sorted list of ``*.srt`` paths inside *source_dir*.
    Prints a message and exits with status 1 when the directory is
    missing or contains no subtitle files.
    """
    if not source_dir.is_dir():
        print(f"❌ 目录 {source_dir} 不存在,请检查路径。")
        raise SystemExit(1)
    srt_files = sorted(source_dir.glob("*.srt"))
    if srt_files:
        return srt_files
    print(f"❌ 目录 {source_dir} 中没有找到任何 .srt 文件。")
    raise SystemExit(1)
def prepare_target_dir(path: Path) -> None:
    """Ensure *path* exists as an empty directory.

    A missing directory is created (with parents); an existing one is
    emptied in place — files and symlinks are unlinked, real
    subdirectories are removed recursively.
    """
    if not path.exists():
        path.mkdir(parents=True, exist_ok=True)
        return
    for entry in path.iterdir():
        if entry.is_file() or entry.is_symlink():
            entry.unlink()
        else:
            shutil.rmtree(entry)
def is_dialog_line(line: str) -> bool:
    """Return True when *line* carries subtitle dialog text.

    Blank lines, cue numbers (digits only) and timing lines
    (containing ``-->``) are structural SRT elements, not dialog.
    """
    text = line.strip()
    return bool(text) and not text.isdigit() and "-->" not in text
def split_line(line: str) -> tuple[str, str]:
    """Split *line* into ``(text, line_ending)``.

    Checks ``\\r\\n`` before the single-character endings so CRLF is
    never mistaken for a bare LF; lines without a terminator get an
    empty ending, letting callers re-attach the exact original EOL.
    """
    for eol in ("\r\n", "\n", "\r"):
        if line.endswith(eol):
            return line[: -len(eol)], eol
    return line, ""
def translate_dialogs(content: str, translator, target_lang: str) -> str:
    """Translate only the dialog lines of an SRT document.

    Cue numbers, timing lines and blank lines pass through untouched.
    Dialog lines are batch-translated in one ``translator.translate``
    call and spliced back with their original line endings preserved.
    Returns *content* unchanged when no dialog lines are present.
    """
    lines = content.splitlines(keepends=True)
    # (line index, bare text, original EOL) for every dialog line.
    dialog = [
        (idx, *split_line(line))
        for idx, line in enumerate(lines)
        if is_dialog_line(line)
    ]
    if not dialog:
        return content
    originals = [text for _, text, _ in dialog]
    translated = translator.translate(originals, target_lang=target_lang)
    for (idx, _, ending), new_text in zip(dialog, translated):
        lines[idx] = new_text + ending
    return "".join(lines)
def main() -> None:
    """Batch-convert every source .srt into Traditional Chinese variants
    and AI-translated English/Japanese/Korean copies.

    Reads simplified-Chinese subtitles from SOURCE_DIR, wipes/creates one
    output directory per target, then writes a converted file per target
    for each source file. Exits via ensure_source_dir when the source
    directory is missing or empty.
    """
    args = parse_args()
    files = ensure_source_dir(SOURCE_DIR)
    # Copy so the optional regional targets don't mutate the module constant.
    opencc_targets = list(BASE_OPENCC_TARGETS)
    if args.include_tw_hk:
        opencc_targets.extend(TW_HK_TARGETS)
    for _, target_dir, _ in opencc_targets:
        prepare_target_dir(target_dir)
    ai_targets = AI_LANG_DIRS
    # Validate every AI language before doing any translation work.
    for lang, path in ai_targets.items():
        if lang not in SUPPORTED_AI_LANGS:
            raise SystemExit(f"❌ 不支持的 AI 语言: {lang}")
        prepare_target_dir(path)
    translator = build_translator(
        TranslatorConfig(
            backend=args.backend,
            openai_model=args.openai_model,
            openai_api_key=args.openai_api_key,
            model_cache_dir=Path(args.model_cache_dir),
            batch_size=args.batch_size,
            max_length=args.max_length,
        )
    )
    total = len(files)
    print(f"🔄 开始转换 {total} 个 .srt 文件...")
    for idx, file_path in enumerate(files, 1):
        percent = idx * 100 // total
        print(f"🔄 转换中: {idx}/{total} ({percent}%) - {file_path}")
        try:
            content = file_path.read_text(encoding="utf-8")
        except UnicodeDecodeError as exc:
            # Skip undecodable files rather than aborting the whole batch.
            print(f"❌ 转换失败: {file_path} (无法以 UTF-8 读取: {exc})")
            continue
        # Traditional Chinese variants (OpenCC, full-text conversion).
        for label, target_dir, converter in opencc_targets:
            if args.show_language_progress:
                print(f"  ↳ 正在生成 {label}")
            out_path = target_dir / file_path.name
            converted = converter.convert(content)
            out_path.write_text(converted, encoding="utf-8")
        # English/Japanese/Korean (AI translation of dialog lines only).
        for lang, target_dir in ai_targets.items():
            if args.show_language_progress:
                lang_label = SUPPORTED_AI_LANGS[lang]["label"]
                print(f"  ↳ 正在翻译 {lang_label}")
            translated_content = translate_dialogs(content, translator, lang)
            out_path = target_dir / file_path.name
            out_path.write_text(translated_content, encoding="utf-8")
        print(f"✅ 转换成功: {file_path}")
# Script entry point: run the batch conversion when executed directly.
if __name__ == "__main__":
    main()
|