feat: improve subject generation workflow

2026-05-18 17:44:52 +08:00
parent 78bd294d57
commit 1f600ae436
12 changed files with 682 additions and 372 deletions
--- a/api/.env.example
+++ b/api/.env.example
@@ -17,7 +17,9 @@ LOCAL_ASR_BIN=/opt/homebrew/bin/mlx_whisper
 LOCAL_ASR_MODEL=mlx-community/whisper-tiny
 LOCAL_ASR_TIMEOUT_SECONDS=180
 TRANSLATE_MODEL=gemini-2.5-flash
-REWRITE_MODEL=gemini-2.5-pro
+GPT_TEXT_MODEL=gpt-4o
+REWRITE_MODEL=gpt-4o
+VISION_MODEL=gpt-4o
 PRODUCT_VIEW_MODEL=gpt-image-2
 IMAGE_BASE_URL=https://ai.skg.com/ezlink/v1
 IMAGE_API_KEY=
@@ -27,6 +29,8 @@ SUBJECT_ASSET_IMAGE_MODEL=gpt-image-2
 SUBJECT_ASSET_IMAGE_MODELS=gpt-image-2
 # 可选：本地网络需要代理访问 ai.skg.com 时配置；launchd 不一定继承 shell 代理变量。
 AI_HTTP_PROXY=
+YTDLP_COOKIES_FILE=
+YTDLP_COOKIES_FROM_BROWSER=
 VIDEO_MODEL=seedance
 VIDEO_MODEL_SEEDANCE=seedance-2-fast
 VIDEO_MODEL_KLING=kling-omni
@@ -35,6 +39,7 @@ VIDEO_MODEL_VEO3=veo-3.1-fast
 # 音频文案改写 + Azure OpenAI 配音
 AUDIO_REWRITE_MODEL=gemini-2.5-pro
 AUDIO_PRODUCT_BRIEF="SKG 智能按摩产品，主打日常肩颈、腰背、眼部、膝盖或足部放松；广告表达要高级、干净、可信，不做医疗疗效承诺。"
+# 语音通道服务端固定为 Azure OpenAI。
 VOICE_PROVIDER=azure_openai
 AZURE_OPENAI_BASE_URL=https://ai.skg.com/azure
 AZURE_OPENAI_API_KEY=
@@ -42,13 +47,7 @@ AZURE_TTS_MODEL=gpt-4o-mini-tts
 AZURE_TTS_VOICE_ID=alloy
 AZURE_TTS_VOICE_POOL=alloy,verse,shimmer
 AZURE_TTS_PATH=/audio/speech
-
-# MiniMax 旧配音通道，保留兼容；默认不走
-MINIMAX_API_KEY=
-MINIMAX_TTS_BASE_URL=https://api.minimax.io
-MINIMAX_TTS_MODEL=speech-2.8-turbo
-MINIMAX_TTS_VOICE_ID=English_expressive_narrator
-MINIMAX_TTS_VOICE_POOL=English_magnetic_voiced_man,English_Upbeat_Woman,English_MaturePartner
+AZURE_TTS_PATHS=/audio/speech,/v1/audio/speech

 # Poe 视频 API（优先用于 Seedance / Kling / Veo）
 POE_API_BASE_URL=https://api.poe.com/v1
--- a/api/README.md
+++ b/api/README.md
@@ -1,6 +1,6 @@
 # SKG TK 二创 API

-FastAPI 后端，跑 yt-dlp + ffmpeg + ASR/翻译/英文 SKG 产品介绍文案 + MiniMax 英文配音管线。
+FastAPI 后端，跑 yt-dlp + ffmpeg + ASR/翻译/英文 SKG 产品介绍文案 + Azure OpenAI 英文配音管线。

 ## 启动

@@ -9,7 +9,7 @@ cd api
 python3 -m venv .venv
 source .venv/bin/activate
 pip install -r requirements.txt
-cp .env.example .env  # 按需填 LLM_API_KEY / MINIMAX_API_KEY
+cp .env.example .env  # 按需填 LLM_API_KEY / AZURE_OPENAI_API_KEY
 uvicorn main:app --host 127.0.0.1 --port 4291
 ```

@@ -20,19 +20,19 @@ uvicorn main:app --host 127.0.0.1 --port 4291
 - `GET  /health` — 健康检查 + 配置状态
 - `POST /jobs` `{url}` — 创建 job，后台下载源视频，视频就绪后可手动解析或提取音频
 - `GET  /jobs/{id}` — 当前状态 + 产物；若原始音轨已拆出，会返回 `source_audio_url`
- `POST /jobs/{id}/transcribe` — 触发音频提取 + ASR + 翻译 + SKG 英文产品介绍文案；文案长度按原音频时长估算，配置 MiniMax 后从英文随机音色池生成配音。前端 Audio 节点提供“提取音频 / 重新提取音频”按钮，可与抽帧并行，不自动触发
+- `POST /jobs/{id}/transcribe` — 触发音频提取 + ASR + 翻译 + SKG 英文产品介绍文案；文案长度按原音频时长估算，配置 Azure OpenAI TTS 后从 Azure 音色池生成配音。前端 Audio 节点提供“提取音频 / 重新提取音频”按钮，可与抽帧并行，不自动触发
 - `GET  /jobs/{id}/video.mp4` — 原视频
 - `GET  /jobs/{id}/audio.wav` — 拆轨后的原始音频，供前端底部音频条生成波形
- `GET  /jobs/{id}/audio-script.mp3` — 英文改写文案的 MiniMax 配音
+- `GET  /jobs/{id}/audio-script.mp3` — 英文改写文案的 Azure OpenAI TTS 配音
 - `GET  /jobs/{id}/frames/{i}.jpg` — 第 i 张关键帧（0-9）

 ## Mock 模式

-未设 `LLM_API_KEY` 时，转录走本地 mock，便于 UI 联调；未设 `MINIMAX_API_KEY` 时只生成改写文案，不生成配音文件。
+未设 `LLM_API_KEY` 时，转录走本地 mock，便于 UI 联调；未设 `AZURE_OPENAI_API_KEY` 且无法复用 `LLM_API_KEY` 时只生成改写文案，不生成配音文件。

 ## 依赖

 - `ffmpeg` 系统二进制（拆轨 / 抽帧）
 - `yt-dlp` 系统二进制（也可走 Python 包）
 - OpenAI 兼容 LLM 网关（ASR / 翻译 / 文案改写）；如果 `/audio/transcriptions` 不可用，会用 `ASR_FALLBACK_MODEL` 走 Gemini 多模态音频识别
- MiniMax T2A HTTP（英文产品介绍文案配音，使用 `MINIMAX_API_KEY`；默认随机音色池 `English_magnetic_voiced_man,English_Upbeat_Woman,English_MaturePartner`）
+- Azure OpenAI TTS（英文产品介绍文案配音，使用 `AZURE_OPENAI_API_KEY` 或回退复用 `LLM_API_KEY`；默认音色池 `alloy,verse,shimmer`）
--- a/api/character_library/skg-characters/manifest.json
+++ b/api/character_library/skg-characters/manifest.json
@@ -8,6 +8,7 @@
      "name": "运动阳光男",
      "folder": "01_运动阳光男",
      "description": "运动阳光男透明骨架人角色，含正面、左右45度、侧面、背面、半身近景和背部特写参考。",
+      "prompt_brief": "Athletic sunny male transparent wellness character, young adult energy, lean fit proportions, open and upbeat posture, clean translucent skin shell with visible white skeleton. The character should feel friendly, active, outdoor-sport inspired, bright, healthy, and suitable for premium SKG neck-and-shoulder wearable device ads. Keep neck, collarbone, shoulders, upper back, and cervical spine readable without bulky clothing or props.",
      "primary_image": "character-01-front",
      "images": [
        {
@@ -80,6 +81,7 @@
      "name": "都市型男",
      "folder": "02_都市型男",
      "description": "都市型男透明骨架人角色，含正面、左右45度、侧面、背面、半身近景和背部特写参考。",
+      "prompt_brief": "Urban stylish male transparent wellness character, adult metropolitan feel, clean confident posture, refined proportions, translucent body shell with visible white skeleton. The commercial mood is premium city lifestyle, composed, sharp, and modern, suitable for office or commute-oriented SKG neck-and-shoulder massage ads. Keep shoulder line, side neck, collarbone, and upper back clear for wearable device placement.",
      "primary_image": "character-02-front",
      "images": [
        {
@@ -152,6 +154,7 @@
      "name": "优雅白领女",
      "folder": "03_优雅白领女",
      "description": "优雅白领女透明骨架人角色，含正面、左右45度、侧面、背面、半身近景和背部特写参考。",
+      "prompt_brief": "Elegant professional female transparent wellness character, young adult to adult office-worker mood, slim balanced proportions, calm poised posture, translucent outer body with a clean visible white skeleton. The style should feel premium, gentle, trustworthy, and workplace-friendly for SKG neck-and-shoulder wearable device ads. Keep hair, collars, and accessories from hiding the neck, shoulders, collarbone, upper back, and cervical spine.",
      "primary_image": "character-03-front",
      "images": [
        {
@@ -224,6 +227,7 @@
      "name": "运动辣妹",
      "folder": "04_运动辣妹",
      "description": "运动辣妹透明骨架人角色，含正面、左右45度、侧面、背面、半身近景和背部特写参考。",
+      "prompt_brief": "Sporty confident female transparent wellness character, energetic young adult fitness mood, toned proportions, expressive posture, translucent skin shell with visible white skeleton. The character should feel active, fashionable, bright, and creator-ad friendly while remaining premium and non-horror. Keep the neck, side neck, shoulders, collarbone, upper trapezius, and upper back open and readable for SKG wearable massage device scenes.",
      "primary_image": "character-04-front",
      "images": [
        {
@@ -296,6 +300,7 @@
      "name": "绅士大叔",
      "folder": "05_绅士大叔",
      "description": "绅士大叔透明骨架人角色，含正面、左右45度、侧面、背面、半身近景和背部特写参考。",
+      "prompt_brief": "Mature gentleman transparent wellness character, adult to middle-aged presence without exact age, steady confident posture, slightly stronger build, translucent body shell with a clean visible white skeleton. The commercial mood is calm, trustworthy, premium, and lifestyle-oriented for SKG neck-and-shoulder wearable device ads. Keep collars and styling minimal so the neck, shoulders, upper back, cervical spine, and shoulder blades remain visible.",
      "primary_image": "character-05-front",
      "images": [
        {
@@ -364,4 +369,4 @@
      ]
    }
  ]
-}
+}
--- a/api/main.py
+++ b/api/main.py
@@ -52,8 +52,18 @@ LOCAL_ASR_BIN = os.getenv("LOCAL_ASR_BIN", "").strip()
 LOCAL_ASR_MODEL = os.getenv("LOCAL_ASR_MODEL", "mlx-community/whisper-tiny").strip() or "mlx-community/whisper-tiny"
 LOCAL_ASR_TIMEOUT_SECONDS = max(30, int(os.getenv("LOCAL_ASR_TIMEOUT_SECONDS", "180")))
 TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
-REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
-VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
+# Default GPT text model: GPT_TEXT_MODEL, or "gpt-4o" when unset or blank.
+DEFAULT_GPT_TEXT_MODEL = os.getenv("GPT_TEXT_MODEL", "gpt-4o").strip() or "gpt-4o"
+
+
+def gpt_model_env(name: str, default: str | None = None) -> str:
+    """Resolve a GPT model name from the environment variable ``name``.
+
+    The fallback is ``default`` when it is truthy, otherwise
+    ``DEFAULT_GPT_TEXT_MODEL``. When the variable is unset the fallback is
+    used as its value. The fallback is returned whenever the stripped value
+    is blank or starts with ``gemini-`` (case-insensitive); any other value
+    is returned stripped.
+    """
+    fallback = default or DEFAULT_GPT_TEXT_MODEL
+    configured = os.getenv(name, fallback).strip()
+    if configured and not configured.lower().startswith("gemini-"):
+        return configured
+    return fallback
+
+
+# Both resolve through gpt_model_env, so an unset, blank, or "gemini-*" value
+# ends up as DEFAULT_GPT_TEXT_MODEL.
+REWRITE_MODEL = gpt_model_env("REWRITE_MODEL")
+VISION_MODEL = gpt_model_env("VISION_MODEL")
 IMAGE_BASE_URL = os.getenv("IMAGE_BASE_URL", LLM_BASE_URL).strip()
 IMAGE_API_KEY = os.getenv("IMAGE_API_KEY", LLM_API_KEY).strip()
 AI_HTTP_PROXY = (
@@ -77,29 +87,14 @@ PRODUCT_ASSET_MIN_LONG_SIDE = max(512, int(os.getenv("PRODUCT_ASSET_MIN_LONG_SID
 PRODUCT_ASSET_MIN_SHORT_SIDE = max(320, int(os.getenv("PRODUCT_ASSET_MIN_SHORT_SIDE", "600")))
 PRODUCT_ASSET_JPEG_QUALITY = max(80, min(95, int(os.getenv("PRODUCT_ASSET_JPEG_QUALITY", "92"))))
 VIDEO_MODEL = os.getenv("VIDEO_MODEL", "seedance").strip() or "seedance"
+# Optional yt-dlp login cookies: a cookies file path (--cookies) and a browser
+# spec (--cookies-from-browser). Both default to empty, which disables them.
+YTDLP_COOKIES_FILE = os.getenv("YTDLP_COOKIES_FILE", "").strip()
+YTDLP_COOKIES_FROM_BROWSER = os.getenv("YTDLP_COOKIES_FROM_BROWSER", "").strip()
 AUDIO_PRODUCT_BRIEF = os.getenv(
    "AUDIO_PRODUCT_BRIEF",
    "SKG 智能按摩产品，主打日常肩颈、腰背、眼部、膝盖或足部放松；广告表达要高级、干净、可信，不做医疗疗效承诺。",
 ).strip()
-AUDIO_REWRITE_MODEL = os.getenv("AUDIO_REWRITE_MODEL", REWRITE_MODEL).strip() or REWRITE_MODEL
-MINIMAX_API_KEY = os.getenv("MINIMAX_API_KEY", "").strip()
-MINIMAX_TTS_BASE_URL = os.getenv("MINIMAX_TTS_BASE_URL", "https://api.minimax.io").strip().rstrip("/")
-MINIMAX_TTS_MODEL = os.getenv("MINIMAX_TTS_MODEL", "speech-2.8-turbo").strip() or "speech-2.8-turbo"
-MINIMAX_TTS_VOICE_ID = os.getenv(
-    "MINIMAX_TTS_VOICE_ID",
-    "English_expressive_narrator",
-).strip() or "English_expressive_narrator"
-DEFAULT_MINIMAX_TTS_VOICE_POOL = [
-    "English_magnetic_voiced_man",
-    "English_Upbeat_Woman",
-    "English_MaturePartner",
-]
-MINIMAX_TTS_VOICE_POOL = [
-    v.strip()
-    for v in os.getenv("MINIMAX_TTS_VOICE_POOL", ",".join(DEFAULT_MINIMAX_TTS_VOICE_POOL)).split(",")
-    if v.strip()
-]
-VOICE_PROVIDER = os.getenv("VOICE_PROVIDER", "azure_openai").strip().lower() or "azure_openai"
+# The audio rewrite model falls back to REWRITE_MODEL (via gpt_model_env).
+# The voice provider is a fixed constant; no environment variable is read.
+AUDIO_REWRITE_MODEL = gpt_model_env("AUDIO_REWRITE_MODEL", REWRITE_MODEL)
+VOICE_PROVIDER = "azure_openai"
 AZURE_OPENAI_BASE_URL = os.getenv("AZURE_OPENAI_BASE_URL", "https://ai.skg.com/azure").strip().rstrip("/")
 AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", LLM_API_KEY).strip()
 AZURE_TTS_MODEL = os.getenv("AZURE_TTS_MODEL", "gpt-4o-mini-tts").strip() or "gpt-4o-mini-tts"
@@ -111,6 +106,11 @@ AZURE_TTS_VOICE_POOL = [
    if v.strip()
 ]
 AZURE_TTS_PATH = os.getenv("AZURE_TTS_PATH", "/audio/speech").strip() or "/audio/speech"
+# Candidate TTS endpoint paths, comma-separated, blanks dropped. The default
+# puts AZURE_TTS_PATH first and may therefore repeat "/audio/speech";
+# _azure_tts_urls() drops duplicate URLs when building the request list.
+AZURE_TTS_PATHS = [
+    p.strip()
+    for p in os.getenv("AZURE_TTS_PATHS", f"{AZURE_TTS_PATH},/audio/speech,/v1/audio/speech").split(",")
+    if p.strip()
+]

 POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
 POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
@@ -452,6 +452,7 @@ class CharacterLibraryItem(BaseModel):
    name: str
    folder: str = ""
    description: str = ""
+    prompt_brief: str = ""
    primary_image: str = ""
    images: list[CharacterLibraryImage] = Field(default_factory=list)

@@ -477,6 +478,7 @@ class SubjectTemplateItem(BaseModel):
    name: str
    description: str = ""
    note: str = ""
+    prompt_brief: str = ""
    source: Literal["database"] = "database"
    source_job_id: str = ""
    source_frame_idx: int = -1
@@ -1075,6 +1077,35 @@ def run(cmd: list[str], cwd: Path | None = None) -> str:
    return res.stdout


+def ytdlp_cookie_args() -> list[str]:
+    """Build the yt-dlp command-line arguments that supply login cookies.
+
+    ``YTDLP_COOKIES_FILE`` takes precedence over
+    ``YTDLP_COOKIES_FROM_BROWSER``.
+
+    Returns:
+        ``["--cookies", <path>]``, ``["--cookies-from-browser", <spec>]``, or
+        an empty list when neither setting is configured.
+
+    Raises:
+        RuntimeError: ``YTDLP_COOKIES_FILE`` is set but does not point to an
+            existing regular file.
+    """
+    if YTDLP_COOKIES_FILE:
+        cookies = Path(YTDLP_COOKIES_FILE).expanduser()
+        # is_file() rather than exists(): a directory (or other non-file) path
+        # is as unusable as a missing one, so reject it here with the
+        # configuration hint instead of handing it to yt-dlp.
+        if not cookies.is_file():
+            raise RuntimeError("TikTok cookies 文件不可用，请检查 YTDLP_COOKIES_FILE 配置。")
+        return ["--cookies", str(cookies)]
+    if YTDLP_COOKIES_FROM_BROWSER:
+        return ["--cookies-from-browser", YTDLP_COOKIES_FROM_BROWSER]
+    return []
+
+
+def normalize_download_error(error: Exception) -> str:
+    """Turn a download exception into the error text stored on the job.
+
+    Errors whose message looks like a login/cookies requirement get a hint to
+    upload the file or configure the yt-dlp cookie settings, followed by the
+    original message. Every other error is returned as ``str(error)``.
+    Matching is case-insensitive.
+    """
+    raw = str(error)
+    text = raw.lower()
+
+    def mentions(*needles: str) -> bool:
+        # True only when every needle occurs somewhere in the message.
+        return all(needle in text for needle in needles)
+
+    needs_login = (
+        mentions("log in for access")
+        or mentions("login", "cookies")
+        or mentions("cookies-from-browser")
+        or mentions("sign in", "tiktok")
+    )
+    if not needs_login:
+        return raw
+    return (
+        "TikTok 下载需要登录态。请上传视频文件，或在后端配置 "
+        "YTDLP_COOKIES_FILE / YTDLP_COOKIES_FROM_BROWSER 后重试。"
+        f"原始错误：{raw}"
+    )
+
+
 # ---- 启发式选帧工具 ----
 import imagehash
 import numpy as np
@@ -1728,13 +1759,15 @@ def pipeline_download(job_id: str) -> None:
            update(job, status="downloading", message="本地上传 · 跳过下载", progress=15)
        else:
            update(job, status="downloading", message="yt-dlp 下载中…", progress=5)
-            run([
+            cmd = [
                "yt-dlp", "-f", "best[ext=mp4]/best",
                "-o", str(mp4),
                "--no-warnings", "--no-playlist",
                "--retries", "3",
+                *ytdlp_cookie_args(),
                job.url,
-            ])
+            ]
+            run(cmd)
            if not mp4.exists():
                raise RuntimeError("下载完成但找不到 source.mp4")

@@ -1757,7 +1790,7 @@ def pipeline_download(job_id: str) -> None:
        )
    except Exception as e:
        message = "视频元数据解析失败" if stage == "metadata" else "下载失败"
-        update(job, status="failed", error=str(e), message=message)
+        update(job, status="failed", error=normalize_download_error(e), message=message)


 def pipeline_analyze(
@@ -1929,7 +1962,7 @@ def analyze_queue_worker() -> None:
        ANALYZE_WORKER_RUNNING = False


-# ---------- 音频转写 + 翻译 + SKG 改写 + MiniMax 配音 ----------
+# ---------- 音频转写 + 翻译 + SKG 改写 + Azure OpenAI 配音 ----------

 class TranscriptionUnavailable(RuntimeError):
    pass
@@ -2385,18 +2418,6 @@ def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds
        return fallback, f"改写失败，使用本地模板：{e}"


-def _minimax_tts_url() -> str:
-    if MINIMAX_TTS_BASE_URL.endswith("/v1/t2a_v2"):
-        return MINIMAX_TTS_BASE_URL
-    return f"{MINIMAX_TTS_BASE_URL}/v1/t2a_v2"
-
-
-def _choose_minimax_voice_id() -> str:
-    if MINIMAX_TTS_VOICE_POOL:
-        return random.choice(MINIMAX_TTS_VOICE_POOL)
-    return MINIMAX_TTS_VOICE_ID
-
-
 def _choose_azure_voice_id() -> str:
    if AZURE_TTS_VOICE_POOL:
        return random.choice(AZURE_TTS_VOICE_POOL)
@@ -2404,9 +2425,7 @@ def _choose_azure_voice_id() -> str:


 def _choose_tts_voice_id() -> str:
-    if VOICE_PROVIDER == "azure_openai":
-        return _choose_azure_voice_id()
-    return _choose_minimax_voice_id()
+    return _choose_azure_voice_id()


 def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
@@ -2423,60 +2442,22 @@ def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
    return 0.99


-def _minimax_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
-    if not MINIMAX_API_KEY:
-        raise RuntimeError("MINIMAX_API_KEY 未配置，未生成配音")
-    if not text.strip():
-        raise RuntimeError("改写文案为空，未生成配音")
-    payload = {
-        "model": MINIMAX_TTS_MODEL,
-        "text": text.strip()[:9500],
-        "stream": False,
-        "language_boost": "English",
-        "output_format": "hex",
-        "voice_setting": {
-            "voice_id": voice_id,
-            "speed": _voice_speed_for(voice_id, target_seconds, text),
-            "vol": 1,
-            "pitch": 0,
-        },
-        "audio_setting": {
-            "sample_rate": 32000,
-            "bitrate": 128000,
-            "format": "mp3",
-            "channel": 1,
-        },
-    }
-    resp = httpx.post(
-        _minimax_tts_url(),
-        headers={"Authorization": f"Bearer {MINIMAX_API_KEY}", "Content-Type": "application/json"},
-        json=payload,
-        timeout=90,
-    )
-    resp.raise_for_status()
-    data = resp.json()
-    base_resp = data.get("base_resp") or {}
-    if int(base_resp.get("status_code", 0) or 0) != 0:
-        raise RuntimeError(base_resp.get("status_msg") or "MiniMax TTS 返回失败")
-    audio_hex = ((data.get("data") or {}).get("audio") or "").strip()
-    if not audio_hex:
-        raise RuntimeError("MiniMax TTS 未返回 audio hex")
-    try:
-        audio_bytes = bytes.fromhex(audio_hex)
-    except ValueError as e:
-        raise RuntimeError(f"MiniMax TTS audio hex 无法解析：{e}") from e
-    out = job_dir(job_id) / "audio_script.mp3"
-    out.write_bytes(audio_bytes)
-    return f"/jobs/{job_id}/audio-script.mp3"
-
-
-def _azure_tts_url() -> str:
-    path = AZURE_TTS_PATH if AZURE_TTS_PATH.startswith("/") else f"/{AZURE_TTS_PATH}"
+def _azure_tts_url_for(path_value: str) -> str:
+    path = path_value if path_value.startswith("/") else f"/{path_value}"
    if AZURE_OPENAI_BASE_URL.endswith(path):
        return AZURE_OPENAI_BASE_URL
    return f"{AZURE_OPENAI_BASE_URL}{path}"


+def _azure_tts_urls() -> list[str]:
+    """Return the candidate Azure TTS endpoint URLs, de-duplicated in order.
+
+    Paths come from ``AZURE_TTS_PATHS``, or from ``AZURE_TTS_PATH`` alone when
+    that list is empty. Each path is expanded with ``_azure_tts_url_for``.
+    """
+    candidate_paths = AZURE_TTS_PATHS or [AZURE_TTS_PATH]
+    # dict.fromkeys keeps the first occurrence of each URL and preserves order.
+    return list(dict.fromkeys(_azure_tts_url_for(p) for p in candidate_paths))
+
+
 def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
    if not AZURE_OPENAI_API_KEY:
        raise RuntimeError("AZURE_OPENAI_API_KEY 或 LLM_API_KEY 未配置，未生成配音")
@@ -2489,18 +2470,32 @@ def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds
        "response_format": "mp3",
        "speed": _voice_speed_for(voice_id, target_seconds, text),
    }
-    resp = httpx.post(
-        _azure_tts_url(),
-        headers={
-            "Authorization": f"Bearer {AZURE_OPENAI_API_KEY}",
-            "api-key": AZURE_OPENAI_API_KEY,
-            "Content-Type": "application/json",
-        },
-        json=payload,
-        timeout=120,
-    )
+    headers = {
+        "Authorization": f"Bearer {AZURE_OPENAI_API_KEY}",
+        "api-key": AZURE_OPENAI_API_KEY,
+        "Content-Type": "application/json",
+    }
+    resp: httpx.Response | None = None
+    errors: list[str] = []
+    with ai_http_client(timeout=120) as client:
+        for url in _azure_tts_urls():
+            try:
+                current = client.post(url, headers=headers, json=payload)
+            except Exception as e:
+                errors.append(f"{url}: {type(e).__name__}: {e}")
+                continue
+            if current.status_code < 400:
+                resp = current
+                break
+            errors.append(f"{url}: HTTP {current.status_code}: {current.text[:180]}")
+            if current.status_code not in {404, 405}:
+                resp = current
+                break
+    if resp is None:
+        raise RuntimeError("Azure OpenAI TTS 不可用；已尝试 " + " | ".join(errors))
    if resp.status_code >= 400:
-        raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {resp.text[:300]}")
+        detail = " | ".join(errors) or resp.text[:300]
+        raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {detail[:600]}")
    audio_bytes = resp.content
    if not audio_bytes:
        raise RuntimeError("Azure OpenAI TTS 未返回音频内容")
@@ -2517,9 +2512,7 @@ def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds


 def _tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> tuple[str, str, str]:
-    if VOICE_PROVIDER == "azure_openai":
-        return _azure_openai_tts_sync(job_id, text, voice_id, target_seconds), "azure_openai", AZURE_TTS_MODEL
-    return _minimax_tts_sync(job_id, text, voice_id, target_seconds), "minimax", MINIMAX_TTS_MODEL
+    return _azure_openai_tts_sync(job_id, text, voice_id, target_seconds), "azure_openai", AZURE_TTS_MODEL


 def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], target_seconds: float = 12.0) -> AudioScript:
@@ -2531,8 +2524,8 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], tar
    speaker_profile, rhythm_profile = _audio_delivery_profile(segments, duration, selected_voice_id)
    voice_url = ""
    voice_error = ""
-    voice_provider = "azure_openai" if VOICE_PROVIDER == "azure_openai" else "minimax"
-    voice_model = AZURE_TTS_MODEL if voice_provider == "azure_openai" else MINIMAX_TTS_MODEL
+    voice_provider = "azure_openai"
+    voice_model = AZURE_TTS_MODEL
    try:
        voice_url, voice_provider, voice_model = _tts_sync(job_id, rewritten, selected_voice_id, duration)
    except Exception as e:
@@ -2944,6 +2937,83 @@ def _image_text_call(
    raise RuntimeError(_image_failure_message("image text", max_attempts, last_err, capacity_seen))


+def _image_path_to_data_url(path: Path) -> str:
+    """Encode an image file as a base64 ``data:`` URL.
+
+    The media type is chosen from the file suffix (case-insensitive): PNG,
+    WebP and GIF are labelled as such, and every other suffix is labelled
+    ``image/jpeg``, which keeps the previous behaviour for ``.jpg``/``.jpeg``
+    and for unknown suffixes.
+
+    Raises:
+        OSError: the file cannot be read.
+    """
+    # Previously anything that was not ".png" was declared JPEG, so WebP or
+    # GIF bytes were sent under a wrong media type.
+    media_types = {
+        ".png": "image/png",
+        ".webp": "image/webp",
+        ".gif": "image/gif",
+    }
+    media_type = media_types.get(path.suffix.lower(), "image/jpeg")
+    encoded = base64.b64encode(path.read_bytes()).decode("ascii")
+    return f"data:{media_type};base64,{encoded}"
+
+
+def _vision_brief_from_images(image_paths: list[Path], prompt: str, max_images: int = 8) -> str:
+    """Ask the vision model for a short text brief describing the given images.
+
+    Best-effort: returns ``""`` when no image file exists, when ``LLM_API_KEY``
+    is not set, when the request or JSON parsing fails, or when the reply
+    carries no usable text.
+
+    Args:
+        image_paths: Candidate image files; paths that do not exist are dropped.
+        prompt: Instruction text sent ahead of the images.
+        max_images: Upper bound on how many existing images are sent.
+
+    Returns:
+        The reply's ``brief`` string, or a ``"; "``-joined summary of the
+        known descriptive keys, truncated to 1800 characters; ``""`` otherwise.
+    """
+    paths = [path for path in image_paths if path.exists()][:max_images]
+    if not paths:
+        return ""
+    if not LLM_API_KEY:
+        return ""
+    # One user message: the prompt text followed by each image as a data URL.
+    content: list[dict] = [{"type": "text", "text": prompt}]
+    for path in paths:
+        content.append({"type": "image_url", "image_url": {"url": _image_path_to_data_url(path)}})
+    try:
+        resp = llm().chat.completions.create(
+            model=VISION_MODEL,
+            messages=[{"role": "user", "content": content}],
+            response_format={"type": "json_object"},
+            temperature=0.1,
+            max_tokens=1400,
+        )
+        raw = (resp.choices[0].message.content or "").strip()
+        if not raw:
+            # Fall back to reasoning_content when the message content is empty.
+            raw = (getattr(resp.choices[0].message, "reasoning_content", "") or "").strip()
+        # Greedy match from the first "{" to the last "}" so any text around
+        # the JSON object is discarded before parsing.
+        match = re.search(r"\{[\s\S]*\}", raw)
+        raw = match.group(0) if match else raw
+        data = json.loads(raw)
+    except Exception as e:
+        # Any failure (request, missing choices, invalid JSON) degrades to
+        # "no brief" instead of propagating to the caller.
+        print(f"[vision brief failed] {e}", flush=True)
+        return ""
+
+    if isinstance(data, dict):
+        # Prefer the model's own summary when it is present and non-blank.
+        if isinstance(data.get("brief"), str) and data["brief"].strip():
+            return data["brief"].strip()[:1800]
+        # Otherwise assemble "key: value" pairs from the known string fields.
+        # NOTE(review): these keys match the source-subject prompt; the
+        # template prompt requests some differently named keys (for example
+        # material_or_skin, camera_readability) that are not collected here.
+        parts: list[str] = []
+        for key in (
+            "gender_presentation", "age_range", "body_proportion", "hair", "skin_tone",
+            "wardrobe_style", "pose_language", "camera_visibility", "commercial_mood",
+            "neck_shoulder_readiness", "style_constraints",
+        ):
+            value = data.get(key)
+            if isinstance(value, str) and value.strip():
+                parts.append(f"{key.replace('_', ' ')}: {value.strip()}")
+        if parts:
+            return "; ".join(parts)[:1800]
+    return ""
+
+
+def _describe_source_subject(job_id: str, source_indices: list[int]) -> str:
+    """Turn source keyframes into a non-identifying visual brief for similar-subject text generation.
+
+    Args:
+        job_id: Job whose source frames are described.
+        source_indices: Frame indices to resolve with ``_source_frame_path``.
+
+    Returns:
+        The text brief from ``_vision_brief_from_images`` (at most 8 images
+        are sent), or ``""`` when that helper cannot produce one.
+    """
+    paths = [_source_frame_path(job_id, idx) for idx in source_indices]
+    # The prompt asks for strict JSON and forbids identifying the person.
+    prompt = (
+        "You are preparing a non-identifying character brief for generating a NEW similar but non-identical ad subject. "
+        "Look at these source video keyframes as evidence of one role and style, not as a person to identify. "
+        "Do NOT identify the person, do NOT estimate exact age, do NOT describe biometric identity, and do NOT mention celebrity or real-person likeness. "
+        "Output strict JSON only. Use broad style traits suitable for text-to-image generation.\n"
+        "Required keys: gender_presentation, age_range, body_proportion, hair, skin_tone, wardrobe_style, "
+        "pose_language, camera_visibility, commercial_mood, neck_shoulder_readiness, style_constraints, brief.\n"
+        "The brief should be 80-140 words and should preserve category, role, energy, camera readability, and commercial atmosphere while explicitly allowing a new non-identical subject."
+    )
+    return _vision_brief_from_images(paths, prompt, max_images=8)
+
+
+def _describe_subject_template_from_images(name: str, subject_style: str, image_paths: list[Path], note: str = "") -> str:
+    """Summarize a saved subject template's images as a reusable text brief.
+
+    Args:
+        name: Template name, embedded in the prompt.
+        subject_style: Template style label, embedded in the prompt.
+        image_paths: Template view images to describe (at most 10 are sent).
+        note: Optional user note; only its first 500 characters are embedded.
+
+    Returns:
+        The text brief from ``_vision_brief_from_images``, or ``""`` when that
+        helper cannot produce one.
+    """
+    prompt = (
+        f"You are summarizing a saved SKG subject template named '{name}' for future text-to-image generation. "
+        f"Subject style: {subject_style}. User note: {note[:500]}. "
+        "Look at the subject views and describe the reusable creative direction without copying identity or pixels. "
+        "Do NOT identify a person and do NOT describe exact facial identity. "
+        "Output strict JSON only with keys: gender_presentation, age_range, body_proportion, material_or_skin, "
+        "wardrobe_or_surface_style, pose_language, camera_readability, neck_shoulder_readiness, commercial_mood, brief. "
+        "The brief should be 80-140 words and must be useful as a reference character brief for creating a new innovative variation."
+    )
+    return _vision_brief_from_images(image_paths, prompt, max_images=10)
+
+
 # ---------- API 路由 ----------

 class CreateJobReq(BaseModel):
@@ -3130,7 +3200,7 @@ def health() -> dict:
        "auth_configured": WEB_AUTH_CONFIGURED,
        "base_url": LLM_BASE_URL or "openai-default",
        "image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default",
-        "voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
+        "voice_base_url": AZURE_OPENAI_BASE_URL,
        "models": {
            "asr": ASR_MODEL,
            "local_asr": LOCAL_ASR_MODEL,
@@ -3147,15 +3217,12 @@ def health() -> dict:
            "subject_image": SUBJECT_ASSET_IMAGE_MODEL,
            "subject_image_fallbacks": SUBJECT_ASSET_IMAGE_MODELS,
            "voice_provider": VOICE_PROVIDER,
-            "voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
-            "voice_tts": AZURE_TTS_MODEL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_MODEL,
-            "voice_id": AZURE_TTS_VOICE_ID if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_VOICE_ID,
-            "voice_pool": AZURE_TTS_VOICE_POOL if VOICE_PROVIDER == "azure_openai" else (MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
-            "voice_configured": bool(AZURE_OPENAI_API_KEY) if VOICE_PROVIDER == "azure_openai" else bool(MINIMAX_API_KEY),
-            "minimax_tts": MINIMAX_TTS_MODEL,
-            "minimax_voice": MINIMAX_TTS_VOICE_ID,
-            "minimax_voice_pool": MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID],
-            "minimax_configured": bool(MINIMAX_API_KEY),
+            "voice_base_url": AZURE_OPENAI_BASE_URL,
+            "voice_tts": AZURE_TTS_MODEL,
+            "voice_tts_paths": AZURE_TTS_PATHS,
+            "voice_id": AZURE_TTS_VOICE_ID,
+            "voice_pool": AZURE_TTS_VOICE_POOL,
+            "voice_configured": bool(AZURE_OPENAI_API_KEY),
            "video": VIDEO_MODEL,
            "video_aliases": VIDEO_MODEL_ALIASES,
            "video_provider": video_provider_name(),
@@ -3225,6 +3292,31 @@ async def create_job(req: CreateJobReq, bg: BackgroundTasks) -> Job:
    return job


+# Re-run the source download for an existing URL-based job.
+# Responds 404 for an unknown job and 409 for uploaded jobs or jobs whose
+# status is downloading / splitting / transcribing; otherwise resets the job
+# state and schedules pipeline_download as a background task.
+@app.post("/jobs/{job_id}/download/retry", response_model=Job)
+async def retry_job_download(job_id: str, bg: BackgroundTasks) -> Job:
+    job = JOBS.get(job_id)
+    if not job:
+        raise HTTPException(404, "job not found")
+    # Uploaded sources have no remote URL to fetch again.
+    if job.source_kind == "upload" or job.url.startswith("upload://"):
+        raise HTTPException(409, "uploaded videos cannot be redownloaded; upload the file again")
+    if job.status in {"downloading", "splitting", "transcribing"}:
+        raise HTTPException(409, f"job is busy: {job.status}")
+
+    # Remove only a zero-byte source.mp4; a non-empty file is left in place.
+    mp4 = job_dir(job_id) / "source.mp4"
+    if mp4.exists() and mp4.stat().st_size == 0:
+        mp4.unlink()
+    # Clear the previous error and video URL before the download restarts.
+    update(
+        job,
+        status="downloading",
+        progress=1,
+        error="",
+        message="重新提交下载…",
+        video_url="",
+    )
+    bg.add_task(pipeline_download, job_id)
+    return job
+
+
@app.post("/jobs/upload", response_model=Job)
 async def create_job_from_upload(bg: BackgroundTasks, file: UploadFile = File(...)) -> Job:
    if not file.filename:
@@ -4308,43 +4400,56 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
        source_indices = [idx] + source_indices
    source_indices = list(dict.fromkeys(source_indices))[:12]

+    similar_mode = req.reconstruction_mode == "similar"
    character_reference_paths: list[Path] = []
-    character_reference_clause = ""
+    template_brief_clause = ""
    character_label = ""
    subject_template_id = (req.subject_template_id or "").strip()
    character_id = (req.character_id or "").strip()
    if subject_template_id:
        template = find_subject_template_item(subject_template_id)
        character_label = template.name
-        for image in template.images[:10]:
-            character_reference_paths.append(subject_template_image_file(image.filename))
-        character_reference_clause = (
-            f"Selected reusable subject template from database: {template.name}. "
-            "Use these saved generated subject views as a high-quality creative direction and identity bible only; "
-            "do not copy pixels, file artifacts, exact pose, labels, or accidental defects. "
-            "Create a new innovative variation that keeps the same broad subject type, transparent wellness character language, "
-            "camera readability, shoulder/neck product compatibility, and commercial role. "
+        template_paths = [subject_template_image_file(image.filename) for image in template.images[:10]]
+        character_reference_paths.extend(template_paths)
+        brief = template.prompt_brief.strip() or template.note.strip() or template.description.strip()
+        if similar_mode and not brief:
+            brief = _describe_subject_template_from_images(template.name, template.subject_style, template_paths, template.note)
+        template_brief_clause = (
+            f"Reference character brief from saved database template '{template.name}': {brief}. "
+            "Use this as a high-quality creative direction and identity bible only; do not copy a face, exact pose, pixels, file artifacts, labels, or accidental defects. "
+            "Create a new innovative variation that keeps the same broad subject type, transparent wellness character language, camera readability, shoulder/neck product compatibility, and commercial role. "
+            if brief else
+            f"Selected reusable subject template from database: {template.name}. Create a new innovative variation, not a duplicate. "
        )
    elif character_id:
        character = find_character_library_item(character_id)
        character_label = character.name
-        for image in character.images[:7]:
-            character_reference_paths.append(character_library_file(image.filename))
-        character_reference_clause = (
-            f"Selected built-in creative character reference: {character.name}. "
-            "Use these planned character images as a high-quality creative direction and anatomy/style bible only; "
+        character_reference_paths.extend(character_library_file(image.filename) for image in character.images[:7])
+        brief = character.prompt_brief.strip() or character.description.strip()
+        template_brief_clause = (
+            f"Reference character brief from built-in creative character '{character.name}': {brief}. "
+            "Use this planned character brief as a high-quality creative direction and anatomy/style bible only; "
            "do not copy the exact face, exact pose, exact silhouette, pixels, or make a duplicate. "
-            "Create a new innovative variation that keeps the same broad role, transparent wellness character language, "
-            "camera readability, and shoulder/neck product compatibility. "
+            "Create a new innovative variation that keeps the same broad role, transparent wellness character language, camera readability, and shoulder/neck product compatibility. "
        )

-    model_src, tmp_focus = _focus_source_for_element(job_id, idx, el)
+    tmp_focus: Path | None = None
+    model_src: Path | list[Path] | None = None
    frame_reference_paths = [p for p in (_source_frame_path(job_id, i) for i in source_indices) if p.exists()]
-    if character_reference_paths:
-        remaining = max(0, 10 - len(character_reference_paths))
-        model_src = character_reference_paths + frame_reference_paths[:remaining]
-    elif len(frame_reference_paths) > 1:
-        model_src = frame_reference_paths[:10]
+    source_subject_brief = _describe_source_subject(job_id, source_indices) if similar_mode else ""
+    source_subject_clause = (
+        f"Source video role brief from selected keyframes: {source_subject_brief}. "
+        "Use this brief to preserve role category, creator-ad energy, camera readability, and broad styling, while creating a new non-identical subject. "
+        if source_subject_brief else
+        "Source video role brief unavailable; create a new non-identical ad subject guided by the user direction, template brief, and requested view. "
+    )
+    if not similar_mode:
+        model_src, tmp_focus = _focus_source_for_element(job_id, idx, el)
+        if character_reference_paths:
+            remaining = max(0, 10 - len(character_reference_paths))
+            model_src = character_reference_paths + frame_reference_paths[:remaining]
+        elif len(frame_reference_paths) > 1:
+            model_src = frame_reference_paths[:10]

    try:
        with Image.open(_source_frame_path(job_id, idx)) as src_im:
@@ -4371,7 +4476,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
    )
    actor_style_clause = (
        "Generate a believable normal commercial video actor, not a transparent or skeleton character. "
-        "Use the references to understand the source video's casting direction, age range, gender presentation, body proportion, wardrobe category, gesture vocabulary, framing, energy, lighting, and creator-ad style. "
+        "Use the text briefs to understand the source video's casting direction, age range, gender presentation, body proportion, wardrobe category, gesture vocabulary, framing, energy, lighting, and creator-ad style. "
        "Do not recreate the exact person's face, biometric identity, unique likeness, tattoos, scars, logos, watermarks, captions, or platform UI. "
        "The output must be a newly designed similar actor that could play the same role in a new ad, with consistent identity across all views. "
        if similar_actor
@@ -4386,7 +4491,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
    prompt_extra_clause = f"User direction: {prompt_extra[:1200]} " if prompt_extra else ""
    identity_lock_clause = (
        "Identity lock: these API calls generate one high-definition multi-view pack for ONE single subject, but each individual output file must show only its one requested view. "
-        "Before rendering, infer one consistent character bible from the reference image(s): gender presentation, age range, body proportions, head shape, face direction cues, material, silhouette, wardrobe/material style, and commercial mood. "
+        "Before rendering, infer one consistent character bible from the supplied text brief and generation instructions: gender presentation, age range, body proportions, head shape, face direction cues, material, silhouette, wardrobe/material style, and commercial mood. "
        "Keep that same character bible unchanged across every generated view in separate files. "
        "If user direction requests a gender, age, or style change, apply that one change uniformly to all views; never mix male/female, young/old, or multiple style identities inside the same pack. "
        "For transparent humanoids, keep the same transparent skin shell, skeleton proportions, visible spine/rib cage/pelvis/limb bones, and non-horror wellness character style in every view. "
@@ -4427,14 +4532,22 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
                if closeup_view and req.subject_kind == "living"
                else "The subject must be complete, centered, full body or full object, head-to-feet visible when applicable, not cropped by the canvas. Make the subject large and readable: it should occupy about 85-95% of the image height with only small margins. "
            )
+            reference_strategy_clause = (
+                "Text-only generation mode: no source image is attached to this image request. Use only the written source/video/template briefs below as creative constraints. "
+                "This is intentionally NOT image editing and NOT identity replication. "
+                + source_subject_clause
+                + template_brief_clause
+                if similar_mode else
+                "Use the reference image(s) only as visual evidence; do not crop, cut out, paste, trace, or extract pixels from the source. "
+            )
            prompt = (
-                f"Use the reference image(s) only as visual evidence; do not crop, cut out, paste, trace, or extract pixels from the source. "
+                reference_strategy_clause
+                +
                f"Generate one newly rendered {view_prompt} for {target}. "
-                f"The subject is a {kind_phrase}. If multiple frames are shown, treat them as evidence of one same subject, not multiple subjects. "
+                f"The subject is a {kind_phrase}. Treat all source evidence as one role and one consistent subject bible, not multiple subjects. "
                + single_view_clause
                + identity_clause
                + identity_lock_clause
-                + character_reference_clause
                + neck_product_clause
                + canvas_clause
                + prompt_extra_clause
@@ -4447,7 +4560,16 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
                + transparent_character_clause
            )
            try:
-                img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
+                if similar_mode:
+                    print(
+                        f"[subject assets] reconstruction_mode=similar endpoint=/images/generations view={view} image_refs=0 model={GPT_IMAGE_MODEL}",
+                        flush=True,
+                    )
+                    img_bytes, _mode = _image_text_call(prompt, models=models, max_attempts=3)
+                else:
+                    if model_src is None:
+                        raise RuntimeError("subject asset edit reference image missing")
+                    img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
            except RuntimeError as e:
                raise HTTPException(_image_error_status(e), f"subject asset {view} failed: {e}")

@@ -5026,6 +5148,7 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
    template_dir.mkdir(parents=True, exist_ok=True)
    now = _time.time()
    images: list[SubjectTemplateImage] = []
+    saved_image_paths: list[Path] = []
    for asset in selected_assets:
        src = job_dir(job_id) / "assets" / f"{asset.id}.jpg"
        if not src.exists():
@@ -5034,6 +5157,7 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
        filename = f"{template_id}/{image_id}.jpg"
        dst = SUBJECT_TEMPLATE_IMAGE_DIR / filename
        shutil.copy2(src, dst)
+        saved_image_paths.append(dst)
        images.append(SubjectTemplateImage(
            id=image_id,
            view=asset.view,
@@ -5053,11 +5177,18 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
        raise HTTPException(404, "subject asset files missing")

    primary = next((image.id for image in images if image.view == "front"), images[0].id)
+    prompt_brief = _describe_subject_template_from_images(
+        name,
+        req.subject_style,
+        saved_image_paths,
+        req.note.strip(),
+    ) or req.note.strip()
    item = SubjectTemplateItem(
        id=template_id,
        name=name,
        description=req.note.strip(),
        note=req.note.strip(),
+        prompt_brief=prompt_brief,
        source_job_id=job_id,
        source_frame_idx=frame.index,
        source_element_id=element.id,