From 801b194bffadb1871906393390ee32975e49f710 Mon Sep 17 00:00:00 2001
From: kang <wankang2050@gmail.com>
Date: Thu, 14 May 2026 11:53:14 +0800
Subject: [PATCH] auto-save 2026-05-14 11:53 (~4)

---
 .memory/worklog.json           |  27 ++++-----
 api/main.py                    | 107 +++++++++++++++++++++++++++------
 web/components/nodes/index.tsx |   6 +-
 web/lib/api.ts                 |   7 +++
 4 files changed, 113 insertions(+), 34 deletions(-)

diff --git a/.memory/worklog.json b/.memory/worklog.json
index 0c5319d..cd5f001 100644
--- a/.memory/worklog.json
+++ b/.memory/worklog.json
@@ -1,19 +1,5 @@
 {
   "entries": [
-    {
-      "files_changed": 1,
-      "hash": "7a5b09a",
-      "message": "auto-save 2026-05-13 04:05 (~1)",
-      "ts": "2026-05-13T04:06:09+08:00",
-      "type": "commit"
-    },
-    {
-      "files_changed": 1,
-      "hash": "6304eab",
-      "message": "auto-save 2026-05-13 04:11 (~1)",
-      "ts": "2026-05-13T04:12:02+08:00",
-      "type": "commit"
-    },
     {
       "files_changed": 1,
       "hash": "9fcc418",
@@ -3298,6 +3284,19 @@
       "type": "session-heartbeat",
       "message": "Codex 会话活跃 · 最近命令：codex · 6 项未提交变更 · 最近提交：auto-save 2026-05-14 11:41 (~1)",
       "files_changed": 6
+    },
+    {
+      "ts": "2026-05-14T11:47:40+08:00",
+      "type": "commit",
+      "message": "auto-save 2026-05-14 11:47 (~7)",
+      "hash": "ba491c0",
+      "files_changed": 7
+    },
+    {
+      "ts": "2026-05-14T03:48:39Z",
+      "type": "session-heartbeat",
+      "message": "Codex 会话活跃 · 最近命令：codex · 2 项未提交变更 · 最近提交：auto-save 2026-05-14 11:47 (~7)",
+      "files_changed": 2
     }
   ]
 }
diff --git a/api/main.py b/api/main.py
index 84805b6..211574b 100644
--- a/api/main.py
+++ b/api/main.py
@@ -120,6 +120,7 @@ SubjectKind = Literal["object", "living"]
 SubjectView = str
 SceneMode = Literal["remove_subject", "similar", "style"]
 SceneStyle = Literal["source", "premium_product", "clean_studio", "warm_lifestyle", "cinematic"]
+SceneAssetRole = Literal["scene", "first_frame", "last_frame"]
 FRAME_TARGET_LABELS: dict[FrameExtractTarget, str] = {
     "transparent_human": "透明骨架人",
     "balanced": "综合关键帧",
@@ -256,6 +257,7 @@ class SceneAsset(BaseModel):
     size: AssetSize = "source"
     scene_mode: SceneMode = "remove_subject"
     scene_style: SceneStyle = "source"
+    asset_role: SceneAssetRole = "scene"
     quality_report: QualityReport | None = None
     created_at: float = 0.0
 
@@ -302,6 +304,9 @@ class ProductFusionRegion(BaseModel):
 
 class ProductFusionShot(BaseModel):
     id: str = ""
+    first_image: dict | None = None
+    last_image: dict | None = None
+    product_images: list[dict] = Field(default_factory=list)
     product_image: dict | None = None
     person_image: dict | None = None
     product_region: ProductFusionRegion | None = None
@@ -1897,6 +1902,40 @@ def _image_edit_call(
     return b64lib.b64decode(b64), effective_mode
 
 
+def _image_text_call(
+    prompt: str,
+    model: str | None = None,
+    models: list[str] | None = None,
+    max_attempts: int = 3,
+) -> tuple[bytes, str]:
+    """Generate one image from a text prompt, rotating through `models` on retry; return (decoded image bytes, "text") or raise RuntimeError."""
+    import base64 as b64lib
+    import time as _time
+    if not LLM_API_KEY:
+        raise RuntimeError("LLM_API_KEY 未配置")
+    models_cycle = list(models) if models else [model or IMAGE_MODEL]  # explicit list wins; otherwise a single model, defaulting to IMAGE_MODEL
+    last_err = ""
+    resp_data: dict = {}
+    for attempt in range(max_attempts):
+        current_model = models_cycle[min(attempt, len(models_cycle) - 1)]  # advance one model per attempt, then stay on the last entry
+        try:
+            resp = llm().images.generate(model=current_model, prompt=prompt, n=1)
+            resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]}
+            if resp_data.get("data"):
+                b64 = resp_data["data"][0].get("b64_json")
+                if b64:
+                    return b64lib.b64decode(b64), "text"
+            err_obj = resp_data.get("error") or {}  # reached only when the response carried no base64 payload
+            last_err = f"empty data · {err_obj.get('code', '')} · {str(err_obj.get('message', ''))[:200]} · model={current_model}"
+        except Exception as e:  # any failure is recorded in last_err and retried instead of propagating
+            last_err = f"{type(e).__name__}: {e} · model={current_model}"
+        if attempt < max_attempts - 1:  # skip the log line and the pause after the final attempt
+            next_model = models_cycle[min(attempt + 1, len(models_cycle) - 1)]
+            print(f"[image text retry {attempt + 1}/{max_attempts} → {next_model}] {last_err}", flush=True)
+            _time.sleep(1.0)
+    raise RuntimeError(f"image text failed after {max_attempts} attempts: {last_err}")
+
+
 # ---------- API 路由 ----------
 
 class CreateJobReq(BaseModel):
@@ -2642,6 +2681,7 @@ class GenerateSceneAssetReq(BaseModel):
     size: AssetSize = "source"
     scene_mode: SceneMode = "remove_subject"
     scene_style: SceneStyle = "source"
+    asset_role: SceneAssetRole = "scene"
     prompt: str = ""
     source_frame_indices: list[int] | None = None
 
@@ -2772,8 +2812,8 @@ def delete_element(job_id: str, idx: int, element_id: str) -> Job:
 
 @app.post("/jobs/{job_id}/frames/{idx}/scene-asset", response_model=Job)
 def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> Job:
-    """为关键帧生成一张干净、高清的场景参考图。默认一帧只需要一张，重跑会保留历史供人工比对。
-    场景图排在主体资产之后：优先依据已确认主体，去主体并补全背景，再按模式生成原场景/相似场景/换风格场景。"""
+    """为关键帧生成一张资产图。
+    scene: 去主体背景板；first_frame/last_frame: 纯文字生成视频首尾帧，参考帧只用于理解统一人物形象。"""
     import time as _time
     job = JOBS.get(job_id)
     if not job:
@@ -2814,6 +2854,11 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
         if confirmed_subjects
         else "Remove the main foreground subject from the frame if present. "
     )
+    identity_clause = (
+        "Known character identity cues: " + ", ".join(confirmed_subjects) + ". "
+        if confirmed_subjects
+        else "Infer one consistent friendly transparent human character identity from the provided references. "
+    )
     mode_clause = {
         "remove_subject": (
             "Keep the original environment, camera angle, perspective, composition, lighting direction, color mood, and spatial layout. "
@@ -2846,22 +2891,44 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
         if len(source_indices) > 1
         else "Use the provided frame as the primary visual reference. "
     )
-    prompt = (
-        "Create one clean high-definition scene/background reference image from this frame. "
-        + subject_clause
-        + "Do not include the removed subject, duplicate people, animals, products, text, watermark, platform UI, captions, usernames, hashtags, logos, or overlay graphics. "
-        + reference_clause
-        + user_prompt_clause
-        + mode_clause + " "
-        + style_clause + " "
-        + "Enhance clarity and texture while avoiding over-smoothing, warped geometry, or changing important perspective details. "
-        + "Do not create multiple views. Do not isolate objects."
-    )
+    if req.asset_role == "scene":
+        prompt = (
+            "Create one clean high-definition scene/background reference image from this frame. "
+            + subject_clause
+            + "Do not include the removed subject, duplicate people, animals, products, text, watermark, platform UI, captions, usernames, hashtags, logos, or overlay graphics. "
+            + reference_clause
+            + user_prompt_clause
+            + mode_clause + " "
+            + style_clause + " "
+            + "Enhance clarity and texture while avoiding over-smoothing, warped geometry, or changing important perspective details. "
+            + "Do not create multiple views. Do not isolate objects."
+        )
+    else:
+        role_clause = (
+            "This is the FIRST frame for an image-to-video clip: create a clear beginning pose and composition. "
+            if req.asset_role == "first_frame"
+            else "This is the LAST frame for an image-to-video clip: create a clear ending pose that can naturally follow the first frame, not a duplicate. "
+        )
+        prompt = (
+            "Create one premium 9:16 high-definition video endpoint frame from text direction. "
+            + role_clause
+            + identity_clause
+            + reference_clause
+            + user_prompt_clause
+            + style_clause + " "
+            + "The frame must feature the same friendly transparent or translucent human character: glass/acrylic/vinyl-like transparent outer body, visible clean white skeleton inside, clean commercial wellness style, non-horror. "
+            + "Use the references only to understand character identity, proportions, transparent shell, white bones, pose vocabulary, camera language, and lighting; do not copy watermarks, subtitles, platform UI, logos, or accidental artifacts. "
+            + "Do not create a plain background plate. Do not remove the character. Do not include SKG product unless the user prompt explicitly asks for it. "
+            + "The output should be ready as a first/last frame for Seedance video generation, with stable composition, believable perspective, clear subject, no text, no watermark, no gore, no medical surgery imagery."
+        )
     models = [IMAGE_MODEL, "gemini-3.1-flash-image-preview", "gemini-2.5-flash-image"]
     try:
-        img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
+        if req.asset_role == "scene":
+            img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
+        else:
+            img_bytes, _mode = _image_text_call(prompt, models=models, max_attempts=3)
     except RuntimeError as e:
-        raise HTTPException(500, f"scene asset failed: {e}")
+        raise HTTPException(500, f"{req.asset_role} asset failed: {e}")
     finally:
         if sheet_tmp and sheet_tmp.exists():
             try: sheet_tmp.unlink()
@@ -2873,7 +2940,11 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
     report = _image_quality_report(out_path)
     scene = SceneAsset(
         id=asset_id,
-        label=f"分镜 {idx + 1} 场景图",
+        label=(
+            f"分镜 {idx + 1} 场景图"
+            if req.asset_role == "scene"
+            else f"分镜 {idx + 1} {'首帧' if req.asset_role == 'first_frame' else '尾帧'}"
+        ),
         url=_asset_url(job_id, asset_id),
         width=width,
         height=height,
@@ -2881,6 +2952,7 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
         size=req.size,
         scene_mode=req.scene_mode,
         scene_style=req.scene_style,
+        asset_role=req.asset_role,
         quality_report=report,
         created_at=_time.time(),
     )
@@ -2891,7 +2963,8 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
             f.quality_report = _image_quality_report(src)
             f.scene_assets = (f.scene_assets or []) + [scene]
         new_frames.append(f)
-    update(job, frames=new_frames, message=f"场景图生成完成 · 分镜 {idx + 1}")
+    asset_label = "场景图" if req.asset_role == "scene" else ("首帧" if req.asset_role == "first_frame" else "尾帧")
+    update(job, frames=new_frames, message=f"{asset_label}生成完成 · 分镜 {idx + 1}")
     return job
 
 
diff --git a/web/components/nodes/index.tsx b/web/components/nodes/index.tsx
index 32d626e..8222a0e 100644
--- a/web/components/nodes/index.tsx
+++ b/web/components/nodes/index.tsx
@@ -2143,13 +2143,13 @@ export function AudioNode({ data, selected }: any) {
       onTogglePin={() => d.onToggleNodePin?.("audio")}
     >
       <div
-        className="space-y-2 text-[11px] text-[var(--text-soft)] leading-snug"
+        className="cursor-pointer space-y-2 text-[11px] text-[var(--text-soft)] leading-snug"
         onClick={() => {
           if (job?.video_url) d.onOpenAudioStrip?.(job.id)
         }}
       >
         <div>
-          音轨 → ASR 转录 → 英中翻译 → SKG 口播改写 → MiniMax 配音<br />
+          音轨 → ASR 转录 → 英中翻译 → SKG 英文口播 → MiniMax 英文配音<br />
           <span className="text-[var(--text-faint)] font-mono">
             {audioScript?.rewrite_model || "AUDIO_REWRITE_MODEL"} → {audioScript?.voice_model || "MiniMax T2A"}
           </span>
@@ -2186,7 +2186,7 @@ export function AudioNode({ data, selected }: any) {
             )}
             {rewrittenText && (
               <div className="rounded-md border border-emerald-400/25 bg-emerald-400/10 px-2.5 py-2">
-                <div className="mb-1 text-[9.5px] uppercase tracking-widest text-emerald-200/80">改后 · SKG 口播</div>
+                <div className="mb-1 text-[9.5px] uppercase tracking-widest text-emerald-200/80">改后 · SKG English VO</div>
                 <div className="line-clamp-4 text-[11.5px] leading-relaxed text-[var(--text-strong)] break-words">{rewrittenText}</div>
               </div>
             )}
diff --git a/web/lib/api.ts b/web/lib/api.ts
index 7ddbd43..8cd08cb 100644
--- a/web/lib/api.ts
+++ b/web/lib/api.ts
@@ -67,6 +67,9 @@ export interface ProductFusionRegion {
 
 export interface ProductFusionShot {
   id: string
+  first_image?: ImageRef | null
+  last_image?: ImageRef | null
+  product_images?: ImageRef[]
   product_image?: ImageRef | null
   person_image?: ImageRef | null
   product_region?: ProductFusionRegion | null
@@ -218,6 +221,7 @@ export type SubjectKind = "object" | "living"
 export type SubjectView = string
 export type SceneMode = "remove_subject" | "similar" | "style"
 export type SceneStyle = "source" | "premium_product" | "clean_studio" | "warm_lifestyle" | "cinematic"
+export type SceneAssetRole = "scene" | "first_frame" | "last_frame"
 
 export interface QualityReport {
   width: number
@@ -251,6 +255,7 @@ export interface SceneAsset {
   size: AssetSize
   scene_mode?: SceneMode
   scene_style?: SceneStyle
+  asset_role?: SceneAssetRole
   quality_report?: QualityReport | null
   created_at: number
 }
@@ -794,6 +799,7 @@ export async function generateSceneAsset(
     size?: AssetSize
     scene_mode?: SceneMode
     scene_style?: SceneStyle
+    asset_role?: SceneAssetRole
     prompt?: string
     source_frame_indices?: number[]
   } = {},
@@ -806,6 +812,7 @@ export async function generateSceneAsset(
       size: body.size ?? "source",
       scene_mode: body.scene_mode ?? "remove_subject",
       scene_style: body.scene_style ?? "source",
+      asset_role: body.asset_role ?? "scene",
       prompt: body.prompt ?? "",
       source_frame_indices: body.source_frame_indices ?? null,
     }),