# srt_to_zh_hant.py
# Batch-converts the .srt subtitles in ./Subtitle_Trans_CN to Traditional
# Chinese with OpenCC and translates their dialogue lines into English,
# Japanese and Korean.

  1. import argparse
  2. import shutil
  3. from pathlib import Path
  4. from opencc import OpenCC
  5. from translator import (
  6. DEFAULT_MODEL_CACHE,
  7. SUPPORTED_AI_LANGS,
  8. TranslatorConfig,
  9. build_translator,
  10. )
  11. SOURCE_DIR = Path("./Subtitle_Trans_CN")
  12. BASE_OPENCC_TARGETS = [
  13. ("通用繁體", Path("./Subtitle_ZH_Hant"), OpenCC("s2t")),
  14. ]
  15. TW_HK_TARGETS = [
  16. ("台灣繁體", Path("./Subtitle_ZH_Hant_TW"), OpenCC("s2tw")),
  17. ("香港繁體", Path("./Subtitle_ZH_Hant_HK"), OpenCC("s2hk")),
  18. ]
  19. AI_LANG_DIRS = {
  20. "en": Path("./Subtitle_EN"),
  21. "jp": Path("./Subtitle_JP"),
  22. "korean": Path("./Subtitle_KOREAN"),
  23. }
  24. AI_LANGS = ["en", "jp", "korean"]
  25. def parse_args() -> argparse.Namespace:
  26. parser = argparse.ArgumentParser(description="批量翻译 SRT 字幕为多种语言。")
  27. parser.add_argument("--backend", choices=["local", "openai"], default="local")
  28. parser.add_argument("--openai-model", default="gpt-4o-mini")
  29. parser.add_argument("--openai-api-key", default=None)
  30. parser.add_argument(
  31. "--model-cache-dir",
  32. default=str(DEFAULT_MODEL_CACHE),
  33. help="NLLB 本地模型存储目录。",
  34. )
  35. parser.add_argument("--batch-size", type=int, default=4)
  36. parser.add_argument("--max-length", type=int, default=512)
  37. parser.add_argument(
  38. "--include-tw-hk",
  39. action="store_true",
  40. help="额外生成台繁与港繁字幕。",
  41. )
  42. parser.add_argument(
  43. "--show-language-progress",
  44. action="store_true",
  45. help="打印正在翻译的语言。",
  46. )
  47. return parser.parse_args()
  48. def ensure_source_dir(source_dir: Path) -> list[Path]:
  49. if not source_dir.is_dir():
  50. print(f"❌ 目录 {source_dir} 不存在,请检查路径。")
  51. raise SystemExit(1)
  52. files = sorted(source_dir.glob("*.srt"))
  53. if not files:
  54. print(f"❌ 目录 {source_dir} 中没有找到任何 .srt 文件。")
  55. raise SystemExit(1)
  56. return files
  57. def prepare_target_dir(path: Path) -> None:
  58. if path.exists():
  59. for child in path.iterdir():
  60. if child.is_file() or child.is_symlink():
  61. child.unlink()
  62. else:
  63. shutil.rmtree(child)
  64. else:
  65. path.mkdir(parents=True, exist_ok=True)
  66. def is_dialog_line(line: str) -> bool:
  67. stripped = line.strip()
  68. if not stripped:
  69. return False
  70. if stripped.isdigit():
  71. return False
  72. if "-->" in stripped:
  73. return False
  74. return True
  75. def split_line(line: str) -> tuple[str, str]:
  76. if line.endswith("\r\n"):
  77. return line[:-2], "\r\n"
  78. if line.endswith("\n"):
  79. return line[:-1], "\n"
  80. if line.endswith("\r"):
  81. return line[:-1], "\r"
  82. return line, ""
  83. def translate_dialogs(content: str, translator, target_lang: str) -> str:
  84. lines = content.splitlines(keepends=True)
  85. indices: list[int] = []
  86. texts: list[str] = []
  87. endings: list[str] = []
  88. for idx, line in enumerate(lines):
  89. if is_dialog_line(line):
  90. text, ending = split_line(line)
  91. indices.append(idx)
  92. texts.append(text)
  93. endings.append(ending)
  94. if not texts:
  95. return content
  96. translated = translator.translate(texts, target_lang=target_lang)
  97. for pos, new_text in enumerate(translated):
  98. idx = indices[pos]
  99. lines[idx] = new_text + endings[pos]
  100. return "".join(lines)
  101. def main() -> None:
  102. args = parse_args()
  103. files = ensure_source_dir(SOURCE_DIR)
  104. opencc_targets = list(BASE_OPENCC_TARGETS)
  105. if args.include_tw_hk:
  106. opencc_targets.extend(TW_HK_TARGETS)
  107. for _, target_dir, _ in opencc_targets:
  108. prepare_target_dir(target_dir)
  109. ai_targets = AI_LANG_DIRS
  110. for lang, path in ai_targets.items():
  111. if lang not in SUPPORTED_AI_LANGS:
  112. raise SystemExit(f"❌ 不支持的 AI 语言: {lang}")
  113. prepare_target_dir(path)
  114. translator = build_translator(
  115. TranslatorConfig(
  116. backend=args.backend,
  117. openai_model=args.openai_model,
  118. openai_api_key=args.openai_api_key,
  119. model_cache_dir=Path(args.model_cache_dir),
  120. batch_size=args.batch_size,
  121. max_length=args.max_length,
  122. )
  123. )
  124. total = len(files)
  125. print(f"🔄 开始转换 {total} 个 .srt 文件...")
  126. for idx, file_path in enumerate(files, 1):
  127. percent = idx * 100 // total
  128. print(f"🔄 转换中: {idx}/{total} ({percent}%) - {file_path}")
  129. try:
  130. content = file_path.read_text(encoding="utf-8")
  131. except UnicodeDecodeError as exc:
  132. print(f"❌ 转换失败: {file_path} (无法以 UTF-8 读取: {exc})")
  133. continue
  134. # 繁体系
  135. for label, target_dir, converter in opencc_targets:
  136. if args.show_language_progress:
  137. print(f" ↳ 正在生成 {label}")
  138. out_path = target_dir / file_path.name
  139. converted = converter.convert(content)
  140. out_path.write_text(converted, encoding="utf-8")
  141. # 英日韩
  142. for lang, target_dir in ai_targets.items():
  143. if args.show_language_progress:
  144. lang_label = SUPPORTED_AI_LANGS[lang]["label"]
  145. print(f" ↳ 正在翻译 {lang_label}")
  146. translated_content = translate_dialogs(content, translator, lang)
  147. out_path = target_dir / file_path.name
  148. out_path.write_text(translated_content, encoding="utf-8")
  149. print(f"✅ 转换成功: {file_path}")
# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()