feat: improve subject generation workflow

2026-05-18 17:44:52 +08:00
parent 78bd294d57
commit 1f600ae436
12 changed files with 682 additions and 372 deletions
--- a/api/main.py
+++ b/api/main.py
@@ -52,8 +52,18 @@ LOCAL_ASR_BIN = os.getenv("LOCAL_ASR_BIN", "").strip()
 LOCAL_ASR_MODEL = os.getenv("LOCAL_ASR_MODEL", "mlx-community/whisper-tiny").strip() or "mlx-community/whisper-tiny"
 LOCAL_ASR_TIMEOUT_SECONDS = max(30, int(os.getenv("LOCAL_ASR_TIMEOUT_SECONDS", "180")))
 TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
-REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
-VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
+DEFAULT_GPT_TEXT_MODEL = os.getenv("GPT_TEXT_MODEL", "gpt-4o").strip() or "gpt-4o"
+
+
+def gpt_model_env(name: str, default: str | None = None) -> str:
+    """Read a model name from env var `name`; blank or gemini-* values yield the fallback."""
+    fallback = default or DEFAULT_GPT_TEXT_MODEL
+    configured = os.getenv(name, fallback).strip()
+    return fallback if not configured or configured.lower().startswith("gemini-") else configured
+
+
+REWRITE_MODEL = gpt_model_env("REWRITE_MODEL")
+VISION_MODEL = gpt_model_env("VISION_MODEL")
 IMAGE_BASE_URL = os.getenv("IMAGE_BASE_URL", LLM_BASE_URL).strip()
 IMAGE_API_KEY = os.getenv("IMAGE_API_KEY", LLM_API_KEY).strip()
 AI_HTTP_PROXY = (
@@ -77,29 +87,14 @@ PRODUCT_ASSET_MIN_LONG_SIDE = max(512, int(os.getenv("PRODUCT_ASSET_MIN_LONG_SID
 PRODUCT_ASSET_MIN_SHORT_SIDE = max(320, int(os.getenv("PRODUCT_ASSET_MIN_SHORT_SIDE", "600")))
 PRODUCT_ASSET_JPEG_QUALITY = max(80, min(95, int(os.getenv("PRODUCT_ASSET_JPEG_QUALITY", "92"))))
 VIDEO_MODEL = os.getenv("VIDEO_MODEL", "seedance").strip() or "seedance"
+YTDLP_COOKIES_FILE = os.getenv("YTDLP_COOKIES_FILE", "").strip()
+YTDLP_COOKIES_FROM_BROWSER = os.getenv("YTDLP_COOKIES_FROM_BROWSER", "").strip()
 AUDIO_PRODUCT_BRIEF = os.getenv(
    "AUDIO_PRODUCT_BRIEF",
    "SKG 智能按摩产品，主打日常肩颈、腰背、眼部、膝盖或足部放松；广告表达要高级、干净、可信，不做医疗疗效承诺。",
 ).strip()
-AUDIO_REWRITE_MODEL = os.getenv("AUDIO_REWRITE_MODEL", REWRITE_MODEL).strip() or REWRITE_MODEL
-MINIMAX_API_KEY = os.getenv("MINIMAX_API_KEY", "").strip()
-MINIMAX_TTS_BASE_URL = os.getenv("MINIMAX_TTS_BASE_URL", "https://api.minimax.io").strip().rstrip("/")
-MINIMAX_TTS_MODEL = os.getenv("MINIMAX_TTS_MODEL", "speech-2.8-turbo").strip() or "speech-2.8-turbo"
-MINIMAX_TTS_VOICE_ID = os.getenv(
-    "MINIMAX_TTS_VOICE_ID",
-    "English_expressive_narrator",
-).strip() or "English_expressive_narrator"
-DEFAULT_MINIMAX_TTS_VOICE_POOL = [
-    "English_magnetic_voiced_man",
-    "English_Upbeat_Woman",
-    "English_MaturePartner",
-]
-MINIMAX_TTS_VOICE_POOL = [
-    v.strip()
-    for v in os.getenv("MINIMAX_TTS_VOICE_POOL", ",".join(DEFAULT_MINIMAX_TTS_VOICE_POOL)).split(",")
-    if v.strip()
-]
-VOICE_PROVIDER = os.getenv("VOICE_PROVIDER", "azure_openai").strip().lower() or "azure_openai"
+AUDIO_REWRITE_MODEL = gpt_model_env("AUDIO_REWRITE_MODEL", REWRITE_MODEL)
+VOICE_PROVIDER = "azure_openai"
 AZURE_OPENAI_BASE_URL = os.getenv("AZURE_OPENAI_BASE_URL", "https://ai.skg.com/azure").strip().rstrip("/")
 AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", LLM_API_KEY).strip()
 AZURE_TTS_MODEL = os.getenv("AZURE_TTS_MODEL", "gpt-4o-mini-tts").strip() or "gpt-4o-mini-tts"
@@ -111,6 +106,11 @@ AZURE_TTS_VOICE_POOL = [
    if v.strip()
 ]
 AZURE_TTS_PATH = os.getenv("AZURE_TTS_PATH", "/audio/speech").strip() or "/audio/speech"
+AZURE_TTS_PATHS = [  # candidate TTS endpoint paths; AZURE_TTS_PATH is listed first by default
+    p.strip()
+    for p in os.getenv("AZURE_TTS_PATHS", f"{AZURE_TTS_PATH},/audio/speech,/v1/audio/speech").split(",")
+    if p.strip()  # skip blank entries produced by stray commas
+]

 POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
 POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
@@ -452,6 +452,7 @@ class CharacterLibraryItem(BaseModel):
    name: str
    folder: str = ""
    description: str = ""
+    prompt_brief: str = ""
    primary_image: str = ""
    images: list[CharacterLibraryImage] = Field(default_factory=list)

@@ -477,6 +478,7 @@ class SubjectTemplateItem(BaseModel):
    name: str
    description: str = ""
    note: str = ""
+    prompt_brief: str = ""
    source: Literal["database"] = "database"
    source_job_id: str = ""
    source_frame_idx: int = -1
@@ -1075,6 +1077,35 @@ def run(cmd: list[str], cwd: Path | None = None) -> str:
    return res.stdout


+def ytdlp_cookie_args() -> list[str]:
+    """Build yt-dlp cookie flags: cookies file first, then browser; RuntimeError if the file is unusable."""
+    if YTDLP_COOKIES_FILE:
+        cookies = Path(YTDLP_COOKIES_FILE).expanduser()
+        if not cookies.exists() or cookies.is_dir():  # a directory can never be a cookie jar
+            raise RuntimeError("TikTok cookies 文件不可用，请检查 YTDLP_COOKIES_FILE 配置。")
+        return ["--cookies", str(cookies)]
+    browser = YTDLP_COOKIES_FROM_BROWSER
+    return ["--cookies-from-browser", browser] if browser else []
+
+
+def normalize_download_error(error: Exception) -> str:
+    """Return the error text, prefixed with a login hint when it looks like an auth wall."""
+    raw = str(error)
+    lowered = raw.lower()
+    # Every token of a group must be present; any one matching group marks an auth failure.
+    auth_markers = (
+        ("log in for access",),
+        ("login", "cookies"),
+        ("cookies-from-browser",),
+        ("sign in", "tiktok"),
+    )
+    if not any(all(token in lowered for token in group) for group in auth_markers):
+        return raw
+    hint = "TikTok 下载需要登录态。请上传视频文件，或在后端配置 "
+    hint += "YTDLP_COOKIES_FILE / YTDLP_COOKIES_FROM_BROWSER 后重试。"
+    return f"{hint}原始错误：{raw}"
+
+
 # ---- 启发式选帧工具 ----
 import imagehash
 import numpy as np
@@ -1728,13 +1759,15 @@ def pipeline_download(job_id: str) -> None:
            update(job, status="downloading", message="本地上传 · 跳过下载", progress=15)
        else:
            update(job, status="downloading", message="yt-dlp 下载中…", progress=5)
-            run([
+            cmd = [
                "yt-dlp", "-f", "best[ext=mp4]/best",
                "-o", str(mp4),
                "--no-warnings", "--no-playlist",
                "--retries", "3",
+                *ytdlp_cookie_args(),
                job.url,
-            ])
+            ]
+            run(cmd)
            if not mp4.exists():
                raise RuntimeError("下载完成但找不到 source.mp4")

@@ -1757,7 +1790,7 @@ def pipeline_download(job_id: str) -> None:
        )
    except Exception as e:
        message = "视频元数据解析失败" if stage == "metadata" else "下载失败"
-        update(job, status="failed", error=str(e), message=message)
+        update(job, status="failed", error=normalize_download_error(e), message=message)


 def pipeline_analyze(
@@ -1929,7 +1962,7 @@ def analyze_queue_worker() -> None:
        ANALYZE_WORKER_RUNNING = False


-# ---------- 音频转写 + 翻译 + SKG 改写 + MiniMax 配音 ----------
+# ---------- 音频转写 + 翻译 + SKG 改写 + Azure OpenAI 配音 ----------

 class TranscriptionUnavailable(RuntimeError):
    pass
@@ -2385,18 +2418,6 @@ def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds
        return fallback, f"改写失败，使用本地模板：{e}"


-def _minimax_tts_url() -> str:
-    if MINIMAX_TTS_BASE_URL.endswith("/v1/t2a_v2"):
-        return MINIMAX_TTS_BASE_URL
-    return f"{MINIMAX_TTS_BASE_URL}/v1/t2a_v2"
-
-
-def _choose_minimax_voice_id() -> str:
-    if MINIMAX_TTS_VOICE_POOL:
-        return random.choice(MINIMAX_TTS_VOICE_POOL)
-    return MINIMAX_TTS_VOICE_ID
-
-
 def _choose_azure_voice_id() -> str:
    if AZURE_TTS_VOICE_POOL:
        return random.choice(AZURE_TTS_VOICE_POOL)
@@ -2404,9 +2425,7 @@ def _choose_azure_voice_id() -> str:


 def _choose_tts_voice_id() -> str:
-    if VOICE_PROVIDER == "azure_openai":
-        return _choose_azure_voice_id()
-    return _choose_minimax_voice_id()
+    return _choose_azure_voice_id()  # MiniMax voices were removed; always delegate to the Azure chooser


 def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
@@ -2423,60 +2442,22 @@ def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
    return 0.99


-def _minimax_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
-    if not MINIMAX_API_KEY:
-        raise RuntimeError("MINIMAX_API_KEY 未配置，未生成配音")
-    if not text.strip():
-        raise RuntimeError("改写文案为空，未生成配音")
-    payload = {
-        "model": MINIMAX_TTS_MODEL,
-        "text": text.strip()[:9500],
-        "stream": False,
-        "language_boost": "English",
-        "output_format": "hex",
-        "voice_setting": {
-            "voice_id": voice_id,
-            "speed": _voice_speed_for(voice_id, target_seconds, text),
-            "vol": 1,
-            "pitch": 0,
-        },
-        "audio_setting": {
-            "sample_rate": 32000,
-            "bitrate": 128000,
-            "format": "mp3",
-            "channel": 1,
-        },
-    }
-    resp = httpx.post(
-        _minimax_tts_url(),
-        headers={"Authorization": f"Bearer {MINIMAX_API_KEY}", "Content-Type": "application/json"},
-        json=payload,
-        timeout=90,
-    )
-    resp.raise_for_status()
-    data = resp.json()
-    base_resp = data.get("base_resp") or {}
-    if int(base_resp.get("status_code", 0) or 0) != 0:
-        raise RuntimeError(base_resp.get("status_msg") or "MiniMax TTS 返回失败")
-    audio_hex = ((data.get("data") or {}).get("audio") or "").strip()
-    if not audio_hex:
-        raise RuntimeError("MiniMax TTS 未返回 audio hex")
-    try:
-        audio_bytes = bytes.fromhex(audio_hex)
-    except ValueError as e:
-        raise RuntimeError(f"MiniMax TTS audio hex 无法解析：{e}") from e
-    out = job_dir(job_id) / "audio_script.mp3"
-    out.write_bytes(audio_bytes)
-    return f"/jobs/{job_id}/audio-script.mp3"
-
-
-def _azure_tts_url() -> str:
-    path = AZURE_TTS_PATH if AZURE_TTS_PATH.startswith("/") else f"/{AZURE_TTS_PATH}"
+def _azure_tts_url_for(path_value: str) -> str:  # full TTS endpoint URL for one candidate path
+    path = path_value if path_value.startswith("/") else f"/{path_value}"  # force a leading slash
     if AZURE_OPENAI_BASE_URL.endswith(path):
         return AZURE_OPENAI_BASE_URL
     return f"{AZURE_OPENAI_BASE_URL}{path}"


+def _azure_tts_urls() -> list[str]:
+    """List the candidate TTS endpoint URLs, in configured order, without duplicates.
+
+    Falls back to the single AZURE_TTS_PATH when AZURE_TTS_PATHS is empty.
+    """
+    candidates = AZURE_TTS_PATHS or [AZURE_TTS_PATH]
+    return list(dict.fromkeys(_azure_tts_url_for(path) for path in candidates))
+
+
 def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
    if not AZURE_OPENAI_API_KEY:
        raise RuntimeError("AZURE_OPENAI_API_KEY 或 LLM_API_KEY 未配置，未生成配音")
@@ -2489,18 +2470,32 @@ def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds
        "response_format": "mp3",
        "speed": _voice_speed_for(voice_id, target_seconds, text),
    }
-    resp = httpx.post(
-        _azure_tts_url(),
-        headers={
-            "Authorization": f"Bearer {AZURE_OPENAI_API_KEY}",
-            "api-key": AZURE_OPENAI_API_KEY,
-            "Content-Type": "application/json",
-        },
-        json=payload,
-        timeout=120,
-    )
+    headers = {
+        "Authorization": f"Bearer {AZURE_OPENAI_API_KEY}",
+        "api-key": AZURE_OPENAI_API_KEY,
+        "Content-Type": "application/json",
+    }
+    resp: httpx.Response | None = None
+    errors: list[str] = []
+    with ai_http_client(timeout=120) as client:
+        for url in _azure_tts_urls():
+            try:
+                current = client.post(url, headers=headers, json=payload)
+            except Exception as e:
+                errors.append(f"{url}: {type(e).__name__}: {e}")
+                continue
+            if current.status_code < 400:
+                resp = current
+                break
+            errors.append(f"{url}: HTTP {current.status_code}: {current.text[:180]}")
+            if current.status_code not in {404, 405}:
+                resp = current
+                break
+    if resp is None:
+        raise RuntimeError("Azure OpenAI TTS 不可用；已尝试 " + " | ".join(errors))
    if resp.status_code >= 400:
-        raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {resp.text[:300]}")
+        detail = " | ".join(errors) or resp.text[:300]
+        raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {detail[:600]}")
    audio_bytes = resp.content
    if not audio_bytes:
        raise RuntimeError("Azure OpenAI TTS 未返回音频内容")
@@ -2517,9 +2512,7 @@ def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds


 def _tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> tuple[str, str, str]:
-    if VOICE_PROVIDER == "azure_openai":
-        return _azure_openai_tts_sync(job_id, text, voice_id, target_seconds), "azure_openai", AZURE_TTS_MODEL
-    return _minimax_tts_sync(job_id, text, voice_id, target_seconds), "minimax", MINIMAX_TTS_MODEL
+    return _azure_openai_tts_sync(job_id, text, voice_id, target_seconds), "azure_openai", AZURE_TTS_MODEL  # (voice URL, provider, model); Azure OpenAI is the only backend left


 def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], target_seconds: float = 12.0) -> AudioScript:
@@ -2531,8 +2524,8 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], tar
    speaker_profile, rhythm_profile = _audio_delivery_profile(segments, duration, selected_voice_id)
    voice_url = ""
    voice_error = ""
-    voice_provider = "azure_openai" if VOICE_PROVIDER == "azure_openai" else "minimax"
-    voice_model = AZURE_TTS_MODEL if voice_provider == "azure_openai" else MINIMAX_TTS_MODEL
+    voice_provider = "azure_openai"
+    voice_model = AZURE_TTS_MODEL
    try:
        voice_url, voice_provider, voice_model = _tts_sync(job_id, rewritten, selected_voice_id, duration)
    except Exception as e:
@@ -2944,6 +2937,83 @@ def _image_text_call(
    raise RuntimeError(_image_failure_message("image text", max_attempts, last_err, capacity_seen))


+def _image_path_to_data_url(path: Path) -> str:
+    """Inline an image file as a base64 data URL: PNG for a .png suffix, JPEG for anything else."""
+    return f"data:{'image/png' if path.suffix.lower() == '.png' else 'image/jpeg'};base64,{base64.b64encode(path.read_bytes()).decode('ascii')}"
+
+
+def _vision_brief_from_images(image_paths: list[Path], prompt: str, max_images: int = 8) -> str:
+    """Ask the vision model for a short JSON brief of the images; return "" when unavailable."""
+    existing = [candidate for candidate in image_paths if candidate.exists()][:max_images]
+    if not existing or not LLM_API_KEY:
+        return ""
+    image_parts = [
+        {"type": "image_url", "image_url": {"url": _image_path_to_data_url(item)}} for item in existing
+    ]
+    content: list[dict] = [{"type": "text", "text": prompt}, *image_parts]
+    try:
+        resp = llm().chat.completions.create(
+            model=VISION_MODEL,
+            messages=[{"role": "user", "content": content}],
+            response_format={"type": "json_object"},
+            temperature=0.1,
+            max_tokens=1400,
+        )
+        message = resp.choices[0].message
+        # When `content` is empty, fall back to the optional `reasoning_content` attribute.
+        raw = (message.content or "").strip() or (getattr(message, "reasoning_content", "") or "").strip()
+        found = re.search(r"\{[\s\S]*\}", raw)
+        data = json.loads(found.group(0) if found else raw)
+    except Exception as e:
+        print(f"[vision brief failed] {e}", flush=True)
+        return ""
+    if not isinstance(data, dict):
+        return ""
+    brief = data.get("brief")
+    if isinstance(brief, str) and brief.strip():
+        return brief.strip()[:1800]
+    trait_keys = (
+        "gender_presentation", "age_range", "body_proportion", "hair", "skin_tone",
+        "wardrobe_style", "pose_language", "camera_visibility", "commercial_mood",
+        "neck_shoulder_readiness", "style_constraints",
+    )
+    # No usable "brief" key: stitch the individual string-valued traits together instead.
+    traits = [
+        f"{key.replace('_', ' ')}: {data[key].strip()}"
+        for key in trait_keys
+        if isinstance(data.get(key), str) and data[key].strip()
+    ]
+    return "; ".join(traits)[:1800]
+
+
+def _describe_source_subject(job_id: str, source_indices: list[int]) -> str:
+    """Summarise the selected source keyframes as a non-identifying brief for similar-subject generation."""
+    instructions = (
+        "You are preparing a non-identifying character brief for generating a NEW similar but non-identical ad subject. "
+        "Look at these source video keyframes as evidence of one role and style, not as a person to identify. "
+        "Do NOT identify the person, do NOT estimate exact age, do NOT describe biometric identity, and do NOT mention celebrity or real-person likeness. "
+        "Output strict JSON only. Use broad style traits suitable for text-to-image generation.\n"
+        "Required keys: gender_presentation, age_range, body_proportion, hair, skin_tone, wardrobe_style, "
+        "pose_language, camera_visibility, commercial_mood, neck_shoulder_readiness, style_constraints, brief.\n"
+        "The brief should be 80-140 words and should preserve category, role, energy, camera readability, and commercial atmosphere while explicitly allowing a new non-identical subject."
+    )
+    keyframes = [_source_frame_path(job_id, frame_idx) for frame_idx in source_indices]
+    return _vision_brief_from_images(keyframes, instructions, max_images=8)
+
+
+def _describe_subject_template_from_images(name: str, subject_style: str, image_paths: list[Path], note: str = "") -> str:
+    """Ask the vision model for a reusable brief of a saved template's views (at most 10 images)."""
+    return _vision_brief_from_images(image_paths, (
+        f"You are summarizing a saved SKG subject template named '{name}' for future text-to-image generation. "
+        f"Subject style: {subject_style}. User note: {note[:500]}. "
+        "Look at the subject views and describe the reusable creative direction without copying identity or pixels. "
+        "Do NOT identify a person and do NOT describe exact facial identity. "
+        "Output strict JSON only with keys: gender_presentation, age_range, body_proportion, material_or_skin, "
+        "wardrobe_or_surface_style, pose_language, camera_readability, neck_shoulder_readiness, commercial_mood, brief. "
+        "The brief should be 80-140 words and must be useful as a reference character brief for creating a new innovative variation."
+    ), max_images=10)
+
+
 # ---------- API 路由 ----------

 class CreateJobReq(BaseModel):
@@ -3130,7 +3200,7 @@ def health() -> dict:
        "auth_configured": WEB_AUTH_CONFIGURED,
        "base_url": LLM_BASE_URL or "openai-default",
        "image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default",
-        "voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
+        "voice_base_url": AZURE_OPENAI_BASE_URL,
        "models": {
            "asr": ASR_MODEL,
            "local_asr": LOCAL_ASR_MODEL,
@@ -3147,15 +3217,12 @@ def health() -> dict:
            "subject_image": SUBJECT_ASSET_IMAGE_MODEL,
            "subject_image_fallbacks": SUBJECT_ASSET_IMAGE_MODELS,
            "voice_provider": VOICE_PROVIDER,
-            "voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
-            "voice_tts": AZURE_TTS_MODEL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_MODEL,
-            "voice_id": AZURE_TTS_VOICE_ID if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_VOICE_ID,
-            "voice_pool": AZURE_TTS_VOICE_POOL if VOICE_PROVIDER == "azure_openai" else (MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
-            "voice_configured": bool(AZURE_OPENAI_API_KEY) if VOICE_PROVIDER == "azure_openai" else bool(MINIMAX_API_KEY),
-            "minimax_tts": MINIMAX_TTS_MODEL,
-            "minimax_voice": MINIMAX_TTS_VOICE_ID,
-            "minimax_voice_pool": MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID],
-            "minimax_configured": bool(MINIMAX_API_KEY),
+            "voice_base_url": AZURE_OPENAI_BASE_URL,
+            "voice_tts": AZURE_TTS_MODEL,
+            "voice_tts_paths": AZURE_TTS_PATHS,
+            "voice_id": AZURE_TTS_VOICE_ID,
+            "voice_pool": AZURE_TTS_VOICE_POOL,
+            "voice_configured": bool(AZURE_OPENAI_API_KEY),
            "video": VIDEO_MODEL,
            "video_aliases": VIDEO_MODEL_ALIASES,
            "video_provider": video_provider_name(),
@@ -3225,6 +3292,31 @@ async def create_job(req: CreateJobReq, bg: BackgroundTasks) -> Job:
    return job


+@app.post("/jobs/{job_id}/download/retry", response_model=Job)
+async def retry_job_download(job_id: str, bg: BackgroundTasks) -> Job:
+    """Re-queue the download step for an existing URL-based job.
+
+    Responds 404 for an unknown job and 409 for uploaded or busy jobs.
+    """
+    job = JOBS.get(job_id)
+    if not job:
+        raise HTTPException(404, "job not found")
+    if job.source_kind == "upload" or job.url.startswith("upload://"):
+        raise HTTPException(409, "uploaded videos cannot be redownloaded; upload the file again")
+    if job.status in {"downloading", "splitting", "transcribing"}:
+        raise HTTPException(409, f"job is busy: {job.status}")
+    # Remove a zero-byte source.mp4 before the download is scheduled again.
+    stale = job_dir(job_id) / "source.mp4"
+    if stale.exists() and stale.stat().st_size == 0:
+        stale.unlink()
+    reset_fields = {
+        "status": "downloading", "progress": 1, "error": "", "message": "重新提交下载…", "video_url": "",
+    }
+    update(job, **reset_fields)
+    bg.add_task(pipeline_download, job_id)
+    return job
+
+
 @app.post("/jobs/upload", response_model=Job)
 async def create_job_from_upload(bg: BackgroundTasks, file: UploadFile = File(...)) -> Job:
    if not file.filename:
@@ -4308,43 +4400,56 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
        source_indices = [idx] + source_indices
    source_indices = list(dict.fromkeys(source_indices))[:12]

+    similar_mode = req.reconstruction_mode == "similar"
    character_reference_paths: list[Path] = []
-    character_reference_clause = ""
+    template_brief_clause = ""
    character_label = ""
    subject_template_id = (req.subject_template_id or "").strip()
    character_id = (req.character_id or "").strip()
    if subject_template_id:
        template = find_subject_template_item(subject_template_id)
        character_label = template.name
-        for image in template.images[:10]:
-            character_reference_paths.append(subject_template_image_file(image.filename))
-        character_reference_clause = (
-            f"Selected reusable subject template from database: {template.name}. "
-            "Use these saved generated subject views as a high-quality creative direction and identity bible only; "
-            "do not copy pixels, file artifacts, exact pose, labels, or accidental defects. "
-            "Create a new innovative variation that keeps the same broad subject type, transparent wellness character language, "
-            "camera readability, shoulder/neck product compatibility, and commercial role. "
+        template_paths = [subject_template_image_file(image.filename) for image in template.images[:10]]
+        character_reference_paths.extend(template_paths)
+        brief = template.prompt_brief.strip() or template.note.strip() or template.description.strip()
+        if similar_mode and not brief:
+            brief = _describe_subject_template_from_images(template.name, template.subject_style, template_paths, template.note)
+        template_brief_clause = (
+            f"Reference character brief from saved database template '{template.name}': {brief}. "
+            "Use this as a high-quality creative direction and identity bible only; do not copy a face, exact pose, pixels, file artifacts, labels, or accidental defects. "
+            "Create a new innovative variation that keeps the same broad subject type, transparent wellness character language, camera readability, shoulder/neck product compatibility, and commercial role. "
+            if brief else
+            f"Selected reusable subject template from database: {template.name}. Create a new innovative variation, not a duplicate. "
        )
    elif character_id:
        character = find_character_library_item(character_id)
        character_label = character.name
-        for image in character.images[:7]:
-            character_reference_paths.append(character_library_file(image.filename))
-        character_reference_clause = (
-            f"Selected built-in creative character reference: {character.name}. "
-            "Use these planned character images as a high-quality creative direction and anatomy/style bible only; "
+        character_reference_paths.extend(character_library_file(image.filename) for image in character.images[:7])
+        brief = character.prompt_brief.strip() or character.description.strip()
+        template_brief_clause = (
+            f"Reference character brief from built-in creative character '{character.name}': {brief}. "
+            "Use this planned character brief as a high-quality creative direction and anatomy/style bible only; "
            "do not copy the exact face, exact pose, exact silhouette, pixels, or make a duplicate. "
-            "Create a new innovative variation that keeps the same broad role, transparent wellness character language, "
-            "camera readability, and shoulder/neck product compatibility. "
+            "Create a new innovative variation that keeps the same broad role, transparent wellness character language, camera readability, and shoulder/neck product compatibility. "
        )

-    model_src, tmp_focus = _focus_source_for_element(job_id, idx, el)
+    tmp_focus: Path | None = None
+    model_src: Path | list[Path] | None = None
    frame_reference_paths = [p for p in (_source_frame_path(job_id, i) for i in source_indices) if p.exists()]
-    if character_reference_paths:
-        remaining = max(0, 10 - len(character_reference_paths))
-        model_src = character_reference_paths + frame_reference_paths[:remaining]
-    elif len(frame_reference_paths) > 1:
-        model_src = frame_reference_paths[:10]
+    source_subject_brief = _describe_source_subject(job_id, source_indices) if similar_mode else ""
+    source_subject_clause = (
+        f"Source video role brief from selected keyframes: {source_subject_brief}. "
+        "Use this brief to preserve role category, creator-ad energy, camera readability, and broad styling, while creating a new non-identical subject. "
+        if source_subject_brief else
+        "Source video role brief unavailable; create a new non-identical ad subject guided by the user direction, template brief, and requested view. "
+    )
+    if not similar_mode:
+        model_src, tmp_focus = _focus_source_for_element(job_id, idx, el)
+        if character_reference_paths:
+            remaining = max(0, 10 - len(character_reference_paths))
+            model_src = character_reference_paths + frame_reference_paths[:remaining]
+        elif len(frame_reference_paths) > 1:
+            model_src = frame_reference_paths[:10]

    try:
        with Image.open(_source_frame_path(job_id, idx)) as src_im:
@@ -4371,7 +4476,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
    )
    actor_style_clause = (
        "Generate a believable normal commercial video actor, not a transparent or skeleton character. "
-        "Use the references to understand the source video's casting direction, age range, gender presentation, body proportion, wardrobe category, gesture vocabulary, framing, energy, lighting, and creator-ad style. "
+        "Use the text briefs to understand the source video's casting direction, age range, gender presentation, body proportion, wardrobe category, gesture vocabulary, framing, energy, lighting, and creator-ad style. "
        "Do not recreate the exact person's face, biometric identity, unique likeness, tattoos, scars, logos, watermarks, captions, or platform UI. "
        "The output must be a newly designed similar actor that could play the same role in a new ad, with consistent identity across all views. "
        if similar_actor
@@ -4386,7 +4491,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
    prompt_extra_clause = f"User direction: {prompt_extra[:1200]} " if prompt_extra else ""
    identity_lock_clause = (
        "Identity lock: these API calls generate one high-definition multi-view pack for ONE single subject, but each individual output file must show only its one requested view. "
-        "Before rendering, infer one consistent character bible from the reference image(s): gender presentation, age range, body proportions, head shape, face direction cues, material, silhouette, wardrobe/material style, and commercial mood. "
+        "Before rendering, infer one consistent character bible from the supplied text brief and generation instructions: gender presentation, age range, body proportions, head shape, face direction cues, material, silhouette, wardrobe/material style, and commercial mood. "
        "Keep that same character bible unchanged across every generated view in separate files. "
        "If user direction requests a gender, age, or style change, apply that one change uniformly to all views; never mix male/female, young/old, or multiple style identities inside the same pack. "
        "For transparent humanoids, keep the same transparent skin shell, skeleton proportions, visible spine/rib cage/pelvis/limb bones, and non-horror wellness character style in every view. "
@@ -4427,14 +4532,22 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
                if closeup_view and req.subject_kind == "living"
                else "The subject must be complete, centered, full body or full object, head-to-feet visible when applicable, not cropped by the canvas. Make the subject large and readable: it should occupy about 85-95% of the image height with only small margins. "
            )
+            reference_strategy_clause = (
+                "Text-only generation mode: no source image is attached to this image request. Use only the written source/video/template briefs below as creative constraints. "
+                "This is intentionally NOT image editing and NOT identity replication. "
+                + source_subject_clause
+                + template_brief_clause
+                if similar_mode else
+                "Use the reference image(s) only as visual evidence; do not crop, cut out, paste, trace, or extract pixels from the source. "
+            )
            prompt = (
-                f"Use the reference image(s) only as visual evidence; do not crop, cut out, paste, trace, or extract pixels from the source. "
+                reference_strategy_clause
+                +
                f"Generate one newly rendered {view_prompt} for {target}. "
-                f"The subject is a {kind_phrase}. If multiple frames are shown, treat them as evidence of one same subject, not multiple subjects. "
+                f"The subject is a {kind_phrase}. Treat all source evidence as one role and one consistent subject bible, not multiple subjects. "
                + single_view_clause
                + identity_clause
                + identity_lock_clause
-                + character_reference_clause
                + neck_product_clause
                + canvas_clause
                + prompt_extra_clause
@@ -4447,7 +4560,16 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
                + transparent_character_clause
            )
            try:
-                img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
+                if similar_mode:
+                    print(
+                        f"[subject assets] reconstruction_mode=similar endpoint=/images/generations view={view} image_refs=0 model={GPT_IMAGE_MODEL}",
+                        flush=True,
+                    )
+                    img_bytes, _mode = _image_text_call(prompt, models=models, max_attempts=3)
+                else:
+                    if model_src is None:
+                        raise RuntimeError("subject asset edit reference image missing")
+                    img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
            except RuntimeError as e:
                raise HTTPException(_image_error_status(e), f"subject asset {view} failed: {e}")

@@ -5026,6 +5148,7 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
    template_dir.mkdir(parents=True, exist_ok=True)
    now = _time.time()
    images: list[SubjectTemplateImage] = []
+    saved_image_paths: list[Path] = []
    for asset in selected_assets:
        src = job_dir(job_id) / "assets" / f"{asset.id}.jpg"
        if not src.exists():
@@ -5034,6 +5157,7 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
        filename = f"{template_id}/{image_id}.jpg"
        dst = SUBJECT_TEMPLATE_IMAGE_DIR / filename
        shutil.copy2(src, dst)
+        saved_image_paths.append(dst)
        images.append(SubjectTemplateImage(
            id=image_id,
            view=asset.view,
@@ -5053,11 +5177,18 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
        raise HTTPException(404, "subject asset files missing")

    primary = next((image.id for image in images if image.view == "front"), images[0].id)
+    prompt_brief = _describe_subject_template_from_images(
+        name,
+        req.subject_style,
+        saved_image_paths,
+        req.note.strip(),
+    ) or req.note.strip()
    item = SubjectTemplateItem(
        id=template_id,
        name=name,
        description=req.note.strip(),
        note=req.note.strip(),
+        prompt_brief=prompt_brief,
        source_job_id=job_id,
        source_frame_idx=frame.index,
        source_element_id=element.id,