Initial import from local codebase (keep remote .gitignore)

Krystic Cong 1 month ago
commit
2a65fd155d
7 changed files with 792 additions and 0 deletions
  1. .gitignore (+107 −0)
  2. README.md (+70 −0)
  3. convert_srt_to_t.sh (+77 −0)
  4. excel_to_zh_hant.py (+154 −0)
  5. requirements.txt (+7 −0)
  6. srt_to_zh_hant.py (+183 −0)
  7. translator.py (+194 −0)

+ 107 - 0
.gitignore

@@ -0,0 +1,107 @@
+# ---> macOS
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+# ---> VisualStudioCode
+.settings
+
+
+# ---> Python
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# ---> Vim
+[._]*.s[a-w][a-z]
+[._]s[a-w][a-z]
+*.un~
+Session.vim
+.netrwhist
+*~
+
+.models
+Subtitle_ZH_CN
+Subtitle_Trans_CN
+Subtitle_ZH_Hant
+Subtitle_EN
+Subtitle_JP
+Subtitle_KOREAN
+excel

+ 70 - 0
README.md

@@ -0,0 +1,70 @@
+# Local Multilingual Translation Tools
+
+This repository contains two scripts that translate Simplified Chinese subtitles (`srt_to_zh_hant.py`) and Excel UI text (`excel_to_zh_hant.py`) into Traditional Chinese, English, Japanese, and Korean. The OpenAI API can optionally be used as the translation backend.
+
+## Environment setup
+
+1. **Python & virtual environment**
+   ```bash
+   python3 -m venv venv
+   source venv/bin/activate
+   python -m pip install --upgrade pip
+   ```
+2. **Install dependencies**
+   ```bash
+   python -m pip install -r requirements.txt
+   ```
+   - The first run automatically downloads the NLLB-200 distilled model (~2.6 GB) from Hugging Face into `.models/nllb-200`.
+   - In mainland China, set up a proxy or download the model files in advance and place them in `.models/nllb-200` manually (see the sketch below).
+
+3. **OpenAI (optional)**
+   - To translate with ChatGPT, set `OPENAI_API_KEY` or pass `--openai-api-key` on the command line, and run with `--backend openai`.
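+
+If you need to pre-download the model, here is a minimal sketch using `huggingface_hub` (the same call `translator.py` issues; the target path is the scripts' default `--model-cache-dir`):
+
+```python
+from huggingface_hub import snapshot_download
+
+# Fetch the NLLB-200 distilled checkpoint into the cache directory the
+# scripts look in by default (override with --model-cache-dir if moved).
+snapshot_download(
+    repo_id="facebook/nllb-200-distilled-600M",
+    local_dir=".models/nllb-200",
+)
+```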
+
+## `srt_to_zh_hant.py` usage
+
+| Option | Default | Description |
+| --- | --- | --- |
+| `--backend {local,openai}` | `local` | Use the local NLLB model or the OpenAI API. |
+| `--include-tw-hk` | off | Additionally generate `Subtitle_ZH_Hant_TW` and `Subtitle_ZH_Hant_HK`. |
+| `--show-language-progress` | off | Print a "now translating xx" notice for each language. |
+| `--openai-model` | `gpt-4o-mini` | OpenAI model name. |
+| `--openai-api-key` | env var | OpenAI API key. |
+| `--model-cache-dir` | `.models/nllb-200` | Local model cache directory. |
+| `--batch-size`/`--max-length` | 4 / 512 | Tune inference performance. |
+
+### Directory conventions
+- Simplified Chinese subtitles to translate go in `Subtitle_Trans_CN`.
+- Output directories:
+  - `Subtitle_ZH_Hant` (generic Traditional Chinese, default)
+  - `Subtitle_ZH_Hant_TW`, `Subtitle_ZH_Hant_HK` (generated with `--include-tw-hk`)
+  - `Subtitle_EN`, `Subtitle_JP`, `Subtitle_KOREAN`
+
+### Examples
+```bash
+# Default: local model + generic Traditional / English / Japanese / Korean
+python srt_to_zh_hant.py
+
+# Also emit Taiwan and Hong Kong variants, with per-language progress
+python srt_to_zh_hant.py --include-tw-hk --show-language-progress
+
+# Use ChatGPT instead
+python srt_to_zh_hant.py --backend openai --openai-model gpt-4o-mini
+```
+
+## `excel_to_zh_hant.py` in brief
+- Scans `./excel` for `.xlsx` files and creates a `_zh-Hant.xlsx` copy for each file not yet translated.
+- Inserts six translation columns: generic Traditional, Taiwan Traditional, Hong Kong Traditional, English, Japanese, and Korean; the AI column headers reference `SUPPORTED_AI_LANGS` in `translator.py`.
+- Skips rows whose remark column contains "无需翻译" or "不翻译" ("no translation needed" / "do not translate"); see the sketch after the run examples.
+- Accepts the same CLI options as the subtitle script (`--backend`, `--openai-*`, `--model-cache-dir`, etc.).
+
+Run examples:
+```bash
+python excel_to_zh_hant.py
+# or with ChatGPT
+python excel_to_zh_hant.py --backend openai --openai-model gpt-4o-mini
+```
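+
+A minimal sketch of the remark-based skip rule (it simply calls `should_skip` from `excel_to_zh_hant.py`; running it requires the project dependencies to be installed):
+
+```python
+from excel_to_zh_hant import should_skip
+
+# A row is skipped when its remark mentions either keyword.
+assert should_skip("此行无需翻译")    # "no translation needed" -> skipped
+assert not should_skip(None)          # empty remark -> translated
+assert not should_skip("普通备注")    # ordinary note -> translated
+```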
+
+## FAQ
+- **Cannot reach PyPI / Hugging Face**: configure a system proxy, use a local mirror, or download the model files manually (see the sketches above and below).
+- **Torch / SentencePiece missing**: make sure `pip install -r requirements.txt` ran inside the virtual environment; if the network is unreliable, download the wheels manually and install them.
+- **The first run takes a while**: the NLLB model is large, so the initial download and first load can take several minutes.
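+
+Once the model files are in place, a quick offline sanity check (a sketch; note `HF_HUB_OFFLINE` must be set before `transformers`/`huggingface_hub` are imported):
+
+```python
+import os
+from pathlib import Path
+
+# Forbid network access so a missing file fails fast instead of re-downloading.
+os.environ["HF_HUB_OFFLINE"] = "1"
+
+cache = Path(".models/nllb-200")
+assert (cache / "config.json").exists(), "model files missing; pre-download first"
+
+from transformers import AutoTokenizer
+
+# Same load parameters translator.py uses for the source language.
+tokenizer = AutoTokenizer.from_pretrained(str(cache), src_lang="zho_Hans", use_fast=False)
+print("offline load OK:", tokenizer.src_lang)
+```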

+ 77 - 0
convert_srt_to_t.sh

@@ -0,0 +1,77 @@
+#!/bin/bash
+
+DIR="./zh-Hans"
+TDIR="./zh-Hant"
+TWDIR="./zh-Hant-TW"
+HKDIR="./zh-Hant-HK"
+
+# Check that opencc is installed
+if ! command -v opencc &> /dev/null; then
+  echo "❌ opencc 命令未找到,请先安装 OpenCC。"
+  exit 1
+fi
+
+# Check that $DIR exists and contains .srt files
+if [ ! -d "$DIR" ]; then
+  echo "❌ 目录 $DIR 不存在,请检查路径。"
+  exit 1
+fi
+if [ -z "$(ls -A "$DIR"/*.srt 2>/dev/null)" ]; then
+  echo "❌ 目录 $DIR 中没有找到任何 .srt 文件。"
+  exit 1
+fi
+
+# Recreate the target directories: clear them if they already exist,
+# create them if they don't
+for dir in "$TDIR" "$TWDIR" "$HKDIR"; do
+  rm -rf "$dir"
+  mkdir -p "$dir"
+done
+
+total_files=$(ls -1q "$DIR"/*.srt | wc -l)
+current_file=0
+echo "🔄 开始转换 $total_files 个 .srt 文件..."
+
+# Loop over every .srt file in $DIR and convert it
+for file in "$DIR"/*.srt; do
+  if [[ -f "$file" ]]; then
+    out="${file##*/}"
+    out="${out%.srt}.srt"  # 从 out 中取出文件名部分并替换扩展名为 .srt
+
+    # Report loop progress as a percentage
+    current_file=$((current_file + 1))
+    percent=$((current_file * 100 / total_files))
+    echo "🔄 转换中: $current_file/$total_files ($percent%) - $file"
+
+    opencc -i "$file" -o "$TDIR/$out" -c s2t.json
+    if [ $? -ne 0 ]; then
+      echo "❌ 转换失败: $file by s2t.json"
+      continue
+    fi
+    opencc -i "$file" -o "$TWDIR/$out" -c s2tw.json
+    if [ $? -ne 0 ]; then
+      echo "❌ 转换失败: $file by s2tw.json"
+      continue
+    fi
+    opencc -i "$file" -o "$HKDIR/$out" -c s2hk.json
+    if [ $? -ne 0 ]; then
+      echo "❌ 转换失败: $file by s2hk.json"
+      continue
+    fi
+    echo "✅ 转换成功: $file"
+  fi
+done

+ 154 - 0
excel_to_zh_hant.py

@@ -0,0 +1,154 @@
+import argparse
+import os
+from copy import copy
+from pathlib import Path
+
+from opencc import OpenCC
+from openpyxl import load_workbook
+from openpyxl.utils import get_column_letter
+
+from translator import (
+    DEFAULT_MODEL_CACHE,
+    SUPPORTED_AI_LANGS,
+    TranslatorConfig,
+    build_translator,
+)
+
+INPUT_DIR = Path("./excel")
+OUTPUT_DIR = Path(".")
+SOURCE_COLUMN_INDEX = 2
+TARGET_COLUMN_INDEX = 3
+REMARK_COLUMN_INDEX = 6
+OPENCC_HEADERS = [
+    ("译文(ZH-Hant)", OpenCC("s2t")),
+    ("译文(ZH-TW)", OpenCC("s2tw")),
+    ("译文(ZH-HK)", OpenCC("s2hk")),
+]
+AI_LANGS = ["en", "jp", "korean"]
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="翻译 Excel UI 文本到多语言。")
+    parser.add_argument("--backend", choices=["local", "openai"], default="local")
+    parser.add_argument("--openai-model", default="gpt-4o-mini")
+    parser.add_argument("--openai-api-key", default=None)
+    parser.add_argument(
+        "--model-cache-dir",
+        default=str(DEFAULT_MODEL_CACHE),
+        help="NLLB 本地模型存储目录。",
+    )
+    parser.add_argument("--batch-size", type=int, default=4)
+    parser.add_argument("--max-length", type=int, default=512)
+    return parser.parse_args()
+
+
+def insert_translation_columns(ws, start_col: int, total_cols: int) -> None:
+    for _ in range(total_cols):
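+        # openpyxl's insert_cols shifts cell values but not column widths,
+        # so snapshot the widths first, insert, then re-apply them shifted.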
+        column_widths = {}
+        for col in range(1, ws.max_column + 1):
+            width = ws.column_dimensions[get_column_letter(col)].width
+            if width is not None:
+                column_widths[col] = width
+
+        ws.insert_cols(start_col)
+
+        for col, width in column_widths.items():
+            new_col = col + 1 if col >= start_col else col
+            ws.column_dimensions[get_column_letter(new_col)].width = width
+
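+        # Copy styles from the cell just left of the inserted column so the
+        # new column inherits the sheet's existing formatting.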
+        for row in ws.iter_rows():
+            source_cell = row[start_col - 2]
+            target_cell = row[start_col - 1]
+            if source_cell.has_style:
+                target_cell.font = copy(source_cell.font)
+                target_cell.border = copy(source_cell.border)
+                target_cell.fill = copy(source_cell.fill)
+                target_cell.number_format = copy(source_cell.number_format)
+                target_cell.protection = copy(source_cell.protection)
+                target_cell.alignment = copy(source_cell.alignment)
+
+
+def should_skip(remark_value) -> bool:
+    remark_text = str(remark_value or "").strip()
+    return any(keyword in remark_text for keyword in ("无需翻译", "不翻译"))
+
+
+def translate_row(
+    ws,
+    row_idx: int,
+    simplified_text: str,
+    translator,
+) -> None:
+    for offset, (_, converter) in enumerate(OPENCC_HEADERS):
+        value = converter.convert(simplified_text)
+        ws.cell(row=row_idx, column=TARGET_COLUMN_INDEX + offset, value=value)
+
+    for lang_offset, lang in enumerate(AI_LANGS, start=len(OPENCC_HEADERS)):
+        translated = translator.translate_text(simplified_text, lang)
+        ws.cell(
+            row=row_idx,
+            column=TARGET_COLUMN_INDEX + lang_offset,
+            value=translated,
+        )
+
+
+def main() -> None:
+    args = parse_args()
+    translator = build_translator(
+        TranslatorConfig(
+            backend=args.backend,
+            openai_model=args.openai_model,
+            openai_api_key=args.openai_api_key,
+            model_cache_dir=Path(args.model_cache_dir),
+            batch_size=args.batch_size,
+            max_length=args.max_length,
+        )
+    )
+
+    total_new_cols = len(OPENCC_HEADERS) + len(AI_LANGS)
+    for lang in AI_LANGS:
+        if lang not in SUPPORTED_AI_LANGS:
+            raise SystemExit(f"❌ 不支持的 AI 语言: {lang}")
+
+    for filename in os.listdir(INPUT_DIR):
+        if not filename.endswith(".xlsx"):
+            continue
+        if "_zh-Hant" in filename or filename.startswith("~$"):
+            continue
+
+        input_path = INPUT_DIR / filename
+        output_path = OUTPUT_DIR / filename.replace(".xlsx", "_zh-Hant.xlsx")
+
+        print(f"📝 处理文件: {filename}")
+        wb = load_workbook(input_path)
+        ws = wb.active
+
+        insert_translation_columns(ws, TARGET_COLUMN_INDEX, total_new_cols)
+
+        headers = [header for header, _ in OPENCC_HEADERS] + [
+            SUPPORTED_AI_LANGS[lang]["label"] for lang in AI_LANGS
+        ]
+        for offset, header in enumerate(headers):
+            ws.cell(row=1, column=TARGET_COLUMN_INDEX + offset, value=header)
+
+        remark_col = REMARK_COLUMN_INDEX + total_new_cols
+
+        for row in ws.iter_rows(
+            min_row=2,
+            min_col=SOURCE_COLUMN_INDEX,
+            max_col=remark_col,
+        ):
+            remark_cell = row[remark_col - SOURCE_COLUMN_INDEX]
+            if should_skip(remark_cell.value):
+                continue
+            cell = row[0]
+            simplified = cell.value
+            if isinstance(simplified, str) and simplified.strip():
+                translate_row(ws, cell.row, simplified, translator)
+
+        wb.save(output_path)
+        print(f"✅ 已输出: {output_path}\n")
+
+
+if __name__ == "__main__":
+    main()

+ 7 - 0
requirements.txt

@@ -0,0 +1,7 @@
+opencc-python-reimplemented>=0.1.7
+openpyxl>=3.1.2
+transformers>=4.40.0
+huggingface_hub>=0.24.0
+sentencepiece>=0.2.0
+torch>=2.1.0
+openai>=1.14.0

+ 183 - 0
srt_to_zh_hant.py

@@ -0,0 +1,183 @@
+import argparse
+import shutil
+from pathlib import Path
+
+from opencc import OpenCC
+
+from translator import (
+    DEFAULT_MODEL_CACHE,
+    SUPPORTED_AI_LANGS,
+    TranslatorConfig,
+    build_translator,
+)
+
+SOURCE_DIR = Path("./Subtitle_Trans_CN")
+BASE_OPENCC_TARGETS = [
+    ("通用繁體", Path("./Subtitle_ZH_Hant"), OpenCC("s2t")),
+]
+TW_HK_TARGETS = [
+    ("台灣繁體", Path("./Subtitle_ZH_Hant_TW"), OpenCC("s2tw")),
+    ("香港繁體", Path("./Subtitle_ZH_Hant_HK"), OpenCC("s2hk")),
+]
+AI_LANG_DIRS = {
+    "en": Path("./Subtitle_EN"),
+    "jp": Path("./Subtitle_JP"),
+    "korean": Path("./Subtitle_KOREAN"),
+}
+AI_LANGS = ["en", "jp", "korean"]
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="批量翻译 SRT 字幕为多种语言。")
+    parser.add_argument("--backend", choices=["local", "openai"], default="local")
+    parser.add_argument("--openai-model", default="gpt-4o-mini")
+    parser.add_argument("--openai-api-key", default=None)
+    parser.add_argument(
+        "--model-cache-dir",
+        default=str(DEFAULT_MODEL_CACHE),
+        help="NLLB 本地模型存储目录。",
+    )
+    parser.add_argument("--batch-size", type=int, default=4)
+    parser.add_argument("--max-length", type=int, default=512)
+    parser.add_argument(
+        "--include-tw-hk",
+        action="store_true",
+        help="额外生成台繁与港繁字幕。",
+    )
+    parser.add_argument(
+        "--show-language-progress",
+        action="store_true",
+        help="打印正在翻译的语言。",
+    )
+    return parser.parse_args()
+
+
+def ensure_source_dir(source_dir: Path) -> list[Path]:
+    if not source_dir.is_dir():
+        print(f"❌ 目录 {source_dir} 不存在,请检查路径。")
+        raise SystemExit(1)
+
+    files = sorted(source_dir.glob("*.srt"))
+    if not files:
+        print(f"❌ 目录 {source_dir} 中没有找到任何 .srt 文件。")
+        raise SystemExit(1)
+    return files
+
+
+def prepare_target_dir(path: Path) -> None:
+    if path.exists():
+        for child in path.iterdir():
+            if child.is_file() or child.is_symlink():
+                child.unlink()
+            else:
+                shutil.rmtree(child)
+    else:
+        path.mkdir(parents=True, exist_ok=True)
+
+
+def is_dialog_line(line: str) -> bool:
+    stripped = line.strip()
+    if not stripped:
+        return False
+    if stripped.isdigit():
+        return False
+    if "-->" in stripped:
+        return False
+    return True
+
+
+def split_line(line: str) -> tuple[str, str]:
+    if line.endswith("\r\n"):
+        return line[:-2], "\r\n"
+    if line.endswith("\n"):
+        return line[:-1], "\n"
+    if line.endswith("\r"):
+        return line[:-1], "\r"
+    return line, ""
+
+
+def translate_dialogs(content: str, translator, target_lang: str) -> str:
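+    # Only dialog lines are translated; cue numbers, timecodes, and blank
+    # lines pass through unchanged, and each line keeps its original ending.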
+    lines = content.splitlines(keepends=True)
+    indices: list[int] = []
+    texts: list[str] = []
+    endings: list[str] = []
+
+    for idx, line in enumerate(lines):
+        if is_dialog_line(line):
+            text, ending = split_line(line)
+            indices.append(idx)
+            texts.append(text)
+            endings.append(ending)
+
+    if not texts:
+        return content
+
+    translated = translator.translate(texts, target_lang=target_lang)
+    for pos, new_text in enumerate(translated):
+        idx = indices[pos]
+        lines[idx] = new_text + endings[pos]
+    return "".join(lines)
+
+
+def main() -> None:
+    args = parse_args()
+    files = ensure_source_dir(SOURCE_DIR)
+
+    opencc_targets = list(BASE_OPENCC_TARGETS)
+    if args.include_tw_hk:
+        opencc_targets.extend(TW_HK_TARGETS)
+
+    for _, target_dir, _ in opencc_targets:
+        prepare_target_dir(target_dir)
+
+    ai_targets = AI_LANG_DIRS
+    for lang, path in ai_targets.items():
+        if lang not in SUPPORTED_AI_LANGS:
+            raise SystemExit(f"❌ 不支持的 AI 语言: {lang}")
+        prepare_target_dir(path)
+
+    translator = build_translator(
+        TranslatorConfig(
+            backend=args.backend,
+            openai_model=args.openai_model,
+            openai_api_key=args.openai_api_key,
+            model_cache_dir=Path(args.model_cache_dir),
+            batch_size=args.batch_size,
+            max_length=args.max_length,
+        )
+    )
+
+    total = len(files)
+    print(f"🔄 开始转换 {total} 个 .srt 文件...")
+
+    for idx, file_path in enumerate(files, 1):
+        percent = idx * 100 // total
+        print(f"🔄 转换中: {idx}/{total} ({percent}%) - {file_path}")
+        try:
+            content = file_path.read_text(encoding="utf-8")
+        except UnicodeDecodeError as exc:
+            print(f"❌ 转换失败: {file_path} (无法以 UTF-8 读取: {exc})")
+            continue
+
+        # Traditional Chinese variants via OpenCC
+        for label, target_dir, converter in opencc_targets:
+            if args.show_language_progress:
+                print(f"   ↳ 正在生成 {label}")
+            out_path = target_dir / file_path.name
+            converted = converter.convert(content)
+            out_path.write_text(converted, encoding="utf-8")
+
+        # English / Japanese / Korean via the translation backend
+        for lang, target_dir in ai_targets.items():
+            if args.show_language_progress:
+                lang_label = SUPPORTED_AI_LANGS[lang]["label"]
+                print(f"   ↳ 正在翻译 {lang_label}")
+            translated_content = translate_dialogs(content, translator, lang)
+            out_path = target_dir / file_path.name
+            out_path.write_text(translated_content, encoding="utf-8")
+
+        print(f"✅ 转换成功: {file_path}")
+
+
+if __name__ == "__main__":
+    main()

+ 194 - 0
translator.py

@@ -0,0 +1,194 @@
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable
+
+from huggingface_hub import snapshot_download
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+try:
+    from openai import OpenAI
+except ImportError:
+    OpenAI = None  # defer the error until the OpenAI backend is actually selected
+
+
+NLLB_MODEL_NAME = "facebook/nllb-200-distilled-600M"
+DEFAULT_MODEL_CACHE = Path(".models") / "nllb-200"
+SOURCE_LANG_CODE = "zho_Hans"
+SUPPORTED_AI_LANGS = {
+    "en": {"nllb": "eng_Latn", "name": "English", "label": "译文(EN)"},
+    "jp": {"nllb": "jpn_Jpan", "name": "Japanese", "label": "訳文(JP)"},
+    "korean": {"nllb": "kor_Hang", "name": "Korean", "label": "번역(KOREAN)"},
+}
+
+
+class TranslationError(Exception):
+    """Generic translation failure."""
+
+
+@dataclass
+class TranslatorConfig:
+    backend: str = "local"
+    openai_model: str = "gpt-4o-mini"
+    openai_api_key: str | None = None
+    model_cache_dir: Path = DEFAULT_MODEL_CACHE
+    batch_size: int = 4
+    max_length: int = 512
+
+
+class BaseTranslator:
+    def __init__(self) -> None:
+        self._cache: dict[tuple[str, str], str] = {}
+
+    def translate(self, texts: Iterable[str], target_lang: str) -> list[str]:
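+        # Serve repeated texts from the in-memory cache; only unseen texts go
+        # to the backend, and results are merged back in input order.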
+        normalized_lang = target_lang.lower()
+        missing: list[str] = []
+        order: list[int] = []
+        results: list[str] = []
+
+        for idx, text in enumerate(texts):
+            key = (text, normalized_lang)
+            if key in self._cache:
+                results.append(self._cache[key])
+            else:
+                order.append(idx)
+                missing.append(text)
+                results.append("")  # placeholder
+
+        if missing:
+            new_values = self._translate_impl(missing, normalized_lang)
+            if len(new_values) != len(missing):
+                raise TranslationError("翻译服务返回的结果数量与输入不匹配。")
+            for idx, translated in enumerate(new_values):
+                pos = order[idx]
+                text = missing[idx]
+                key = (text, normalized_lang)
+                self._cache[key] = translated
+                results[pos] = translated
+
+        return results
+
+    def translate_text(self, text: str, target_lang: str) -> str:
+        return self.translate([text], target_lang)[0]
+
+    def _translate_impl(self, texts: list[str], target_lang: str) -> list[str]:
+        raise NotImplementedError
+
+
+class LocalNLLBTranslator(BaseTranslator):
+    def __init__(self, config: TranslatorConfig) -> None:
+        super().__init__()
+        try:
+            import torch
+        except ImportError as exc:
+            raise RuntimeError("需要安装 torch 才能加载本地 NLLB 模型。") from exc
+
+        self.torch = torch
+        self.config = config
+        model_dir = self._ensure_model_files(config.model_cache_dir)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_dir,
+            src_lang=SOURCE_LANG_CODE,
+            use_fast=False,
+        )
+        self.tokenizer.src_lang = SOURCE_LANG_CODE
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
+        self.device = self.torch.device(
+            "cuda" if self.torch.cuda.is_available() else "cpu"
+        )
+        self.model.to(self.device)
+
+    def _ensure_model_files(self, cache_dir: Path) -> Path:
+        cache_dir = Path(cache_dir)
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        config_file = cache_dir / "config.json"
+        if not config_file.exists():
+            snapshot_download(
+                repo_id=NLLB_MODEL_NAME,
+                local_dir=str(cache_dir),
+                local_dir_use_symlinks=False,
+                resume_download=True,
+            )
+        return cache_dir
+
+    def _translate_impl(self, texts: list[str], target_lang: str) -> list[str]:
+        if target_lang not in SUPPORTED_AI_LANGS:
+            raise TranslationError(f"未支持的目标语言: {target_lang}")
+
+        target_code = SUPPORTED_AI_LANGS[target_lang]["nllb"]
+        forced_bos = self.tokenizer.convert_tokens_to_ids(target_code)
+        if forced_bos == self.tokenizer.unk_token_id:
+            raise TranslationError(f"NLLB 模型不支持目标语言代码: {target_code}")
+
+        outputs: list[str] = []
+        batch_size = max(1, self.config.batch_size)
+        for start in range(0, len(texts), batch_size):
+            batch = texts[start : start + batch_size]
+            encoded = self.tokenizer(
+                batch,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=self.config.max_length,
+            ).to(self.device)
+            with self.torch.no_grad():
+                generated = self.model.generate(
+                    **encoded,
+                    forced_bos_token_id=forced_bos,
+                    max_length=self.config.max_length,
+                )
+            decoded = self.tokenizer.batch_decode(generated, skip_special_tokens=True)
+            outputs.extend(decoded)
+        return outputs
+
+
+class OpenAITranslator(BaseTranslator):
+    def __init__(self, config: TranslatorConfig) -> None:
+        super().__init__()
+        if OpenAI is None:
+            raise RuntimeError("需要安装 openai 包才可以使用 OpenAI backend。")
+        api_key = config.openai_api_key or os.getenv("OPENAI_API_KEY")
+        if not api_key:
+            raise RuntimeError("缺少 OPENAI_API_KEY,无法连接 OpenAI API。")
+        self.client = OpenAI(api_key=api_key)
+        self.model = config.openai_model
+
+    def _translate_impl(self, texts: list[str], target_lang: str) -> list[str]:
+        if target_lang not in SUPPORTED_AI_LANGS:
+            raise TranslationError(f"未支持的目标语言: {target_lang}")
+
+        language_name = SUPPORTED_AI_LANGS[target_lang]["name"]
+        results: list[str] = []
+        for text in texts:
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a professional video game localization translator.",
+                    },
+                    {
+                        "role": "user",
+                        "content": (
+                            f"Translate the following Simplified Chinese text to {language_name}. "
+                            "Preserve the tone, keep placeholders or tags unchanged, and return only the translation.\n"
+                            f"Text:\n{text}"
+                        ),
+                    },
+                ],
+                temperature=0.2,
+            )
+            translated = response.choices[0].message.content.strip()
+            results.append(translated)
+        return results
+
+
+def build_translator(config: TranslatorConfig) -> BaseTranslator:
+    backend = config.backend.lower()
+    if backend == "local":
+        return LocalNLLBTranslator(config)
+    if backend == "openai":
+        return OpenAITranslator(config)
+    raise ValueError(f"未知的翻译 backend: {config.backend}")