auto-save 2026-05-14 12:04 (~4)

2026-05-14 12:04:20 +08:00
parent f0c6c5b916
commit 9f3e28d230
4 changed files with 105 additions and 138 deletions
--- a/.memory/worklog.json
+++ b/.memory/worklog.json
@@ -1,12 +1,5 @@
 {
  "entries": [
-    {
-      "files_changed": 1,
-      "hash": "c7af450",
-      "message": "auto-save 2026-05-13 04:35 (~1)",
-      "ts": "2026-05-13T04:35:34+08:00",
-      "type": "commit"
-    },
    {
      "files_changed": 1,
      "hash": "fed62f1",
@@ -3295,6 +3288,13 @@
      "type": "session-heartbeat",
      "message": "Codex 会话活跃 · 最近命令：codex · 4 项未提交变更 · 最近提交：auto-save 2026-05-14 11:53 (~4)",
      "files_changed": 4
+    },
+    {
+      "ts": "2026-05-14T11:58:48+08:00",
+      "type": "commit",
+      "message": "auto-save 2026-05-14 11:58 (~4)",
+      "hash": "f0c6c5b",
+      "files_changed": 4
    }
  ]
 }
--- a/api/main.py
+++ b/api/main.py
@@ -2919,6 +2919,8 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
            + reference_clause
            + user_prompt_clause
            + style_clause + " "
+            + TRANSPARENT_HUMAN_POSITIVE_PROMPT + " "
+            + TRANSPARENT_HUMAN_NEGATIVE_PROMPT + " "
            + "The frame must feature the same friendly transparent or translucent human character: glass/acrylic/vinyl-like transparent outer body, visible clean white skeleton inside, clean commercial wellness style, non-horror. "
            + "Use the references only to understand character identity, proportions, transparent shell, white bones, pose vocabulary, camera language, and lighting; do not copy watermarks, subtitles, platform UI, logos, or accidental artifacts. "
            + "Do not create a plain background plate. Do not remove the character. Do not include SKG product unless the user prompt explicitly asks for it. "
@@ -3697,12 +3699,12 @@ def create_product_fusion_guide(job_id: str, req: ProductFusionShot) -> dict:

 def fallback_product_fusion_descriptions() -> list[str]:
    return [
-        "人物双手拿起 SKG 颈部按摩仪，准备戴到脖子上，镜头轻微推近产品。",
-        "人物把 SKG 按摩仪贴合到肩颈位置，手部轻轻调整两侧机身角度。",
-        "人物坐在场景中轻按侧边控制区，产品保持在画框指定区域内清晰可见。",
-        "人物闭眼放松，肩颈从紧绷变舒展，产品佩戴位置稳定不漂移。",
-        "镜头靠近展示 SKG 产品材质、按键和内侧触点，手部不要遮挡产品主体。",
-        "使用后的放松状态收尾，人物自然抬头，产品仍保持白色 U 形外观和真实比例。",
+        "透明骨架人双手拿起 SKG 颈部按摩仪，准备戴到脖子上，镜头轻微推近产品。",
+        "透明骨架人把 SKG 按摩仪贴合到肩颈位置，手部轻轻调整两侧机身角度。",
+        "透明骨架人坐在场景中轻按侧边控制区，产品保持真实比例并清晰可见。",
+        "透明骨架人闭眼放松，肩颈从紧绷变舒展，产品佩戴位置稳定不漂移。",
+        "镜头靠近展示 SKG 产品材质、按键和内侧触点，透明骨架人的手部不要遮挡产品主体。",
+        "使用后的放松状态收尾，透明骨架人自然抬头，产品仍保持白色 U 形外观和真实比例。",
    ]


@@ -3716,16 +3718,19 @@ def generate_product_fusion_descriptions(job_id: str, req: ProductFusionDescript
        return {"descriptions": fallback, "mode": "fallback"}
    shot_lines = []
    for i, shot in enumerate(shots, start=1):
-        product = (shot.product_image or {}).get("label") or "SKG 产品图"
-        person = (shot.person_image or {}).get("label") or "白底人物姿态图"
-        scene = (shot.scene_image or {}).get("label") or "场景图"
-        region = shot.product_region
-        region_text = f"x={region.x:.2f}, y={region.y:.2f}, w={region.w:.2f}, h={region.h:.2f}" if region else "未画区域"
-        shot_lines.append(f"{i}. 产品={product}；人物={person}；区域={region_text}；场景={scene}；已有描述={shot.action_text or '空'}")
+        first = (shot.first_image or {}).get("label") or "首帧未填"
+        last = (shot.last_image or {}).get("label") or "尾帧未填"
+        products = [
+            (ref or {}).get("label") or f"产品角度{idx + 1}未填"
+            for idx, ref in enumerate((shot.product_images or [])[:3])
+        ]
+        while len(products) < 3:
+            products.append(f"产品角度{len(products) + 1}未填")
+        shot_lines.append(f"{i}. 首帧={first}；尾帧={last}；产品角度={products[0]} / {products[1]} / {products[2]}；已有描述={shot.action_text or '空'}")
    prompt = (
        "你是 SKG 产品短视频分镜导演。请为 6 条产品融合镜头各写一条中文动作描述，"
-        "每条 20-40 字，必须说明人物在做什么、产品如何佩戴/展示、动作如何自然连续。"
-        "产品是 SKG 白色 U 形颈部/肩颈按摩仪，不要写医疗治疗承诺，不要出现竞品。"
+        "每条 20-45 字，必须说明透明骨架人在做什么、产品如何佩戴/展示、动作如何从首帧自然过渡到尾帧。"
+        "产品是 SKG 白色 U 形颈部/肩颈按摩仪，三张产品角度图是同一产品的身份真源；不要写医疗治疗承诺，不要出现竞品。"
        "输出 JSON：{\"descriptions\":[\"...\", \"...\"]}。\n\n"
        + "\n".join(shot_lines)
    )
--- a/docs/source-analysis.html
+++ b/docs/source-analysis.html
@@ -629,7 +629,7 @@ api/main.py
          </div>
          <div class="flow-row">
            <div><strong>你看到的区域</strong><span>关键帧素材审核面板</span></div>
-            <div><strong>主要源码</strong><span><code>FrameLightbox</code>；按“原图/清洗、主体资产、场景图、产品融合、审核”五个页签组织；左侧只放主图/框选画布，但主体资产页左侧改为全部已清洗/已选参考帧网格，场景图页左侧显示全部关键帧并可勾选场景参考。主体识别页会显示透明骨架人目标和 Vision 验收分数。清洗页右侧支持一键清洗未处理帧、单张替换清洗版和一键替换全部待应用清洗版；批量替换顺序调用 <code>applyCleanedFrame</code>，不新增后端接口。产品融合页左侧改为纵向 6 行镜头工作表：每行直接显示产品图、白底人物图、人物图上的产品区域、场景图、描述词、秒数和单条生成按钮，便于一次看完 6 条视频。产品融合槽位的“粘贴”优先使用应用内 <code>clipboard</code>，也支持选中槽位后 Cmd+V 粘贴系统图片。右侧只保留 GPT Image 2 / Seedance 固定模型、当前镜头状态、AI 描述草稿、批量排队和产品图库选用。主体资产页只确认一个统一主体，后端按参考重绘六张纯背景、占满画面的标准站立透明骨架人资产图；场景图依赖主体资产，右侧通过地点、生成方式、风格和参考要素拼出可编辑 prompt，再按当前关键帧生成去主体原场景、相似新场景或同构换风格。相关接口包括 <code>cleanupFrame</code>、<code>applyCleanedFrame</code>、<code>addElement</code>、<code>generateSubjectAssets</code>、<code>generateSceneAsset</code>、<code>listProductLibrary</code>、<code>copyProductLibraryAsset</code>、<code>createProductFusionGuide</code> 和 <code>generateProductFusionDescriptions</code>。</span></div>
+            <div><strong>主要源码</strong><span><code>FrameLightbox</code>；按“原图/清洗、主体资产、首尾帧、产品融合、审核”五个页签组织；左侧只放主图/框选画布，但主体资产页左侧改为全部已清洗/已选参考帧网格，首尾帧页左侧显示全部关键帧并可勾选人物/机位参考。主体识别页会显示透明骨架人目标和 Vision 验收分数。清洗页右侧支持一键清洗未处理帧、单张替换清洗版和一键替换全部待应用清洗版；批量替换顺序调用 <code>applyCleanedFrame</code>，不新增后端接口。产品融合页左侧是纵向 6 行镜头工作表：每行直接显示首帧、尾帧、同一产品 3 个角度图、描述词、秒数和单条生成按钮，便于一次看完 6 条视频。产品融合槽位的“粘贴”优先使用应用内 <code>clipboard</code>，也支持选中槽位后 Cmd+V 粘贴系统图片。右侧保留 GPT Image 2 / Seedance 固定模型、当前镜头状态、AI 描述草稿、批量排队和产品图库选用；产品图库选中后会填入当前镜头下一个产品角度槽。主体资产页只确认一个统一主体，后端按参考重绘六张纯背景、占满画面的标准站立透明骨架人资产图；首尾帧页通过地点、风格、参考要素和可编辑 prompt 做文字生图，生成结果写入 <code>scene_assets</code> 但以 <code>asset_role=first_frame/last_frame</code> 标记，并自动传入当前产品融合镜头。相关接口包括 <code>cleanupFrame</code>、<code>applyCleanedFrame</code>、<code>addElement</code>、<code>generateSubjectAssets</code>、<code>generateSceneAsset</code>、<code>listProductLibrary</code>、<code>copyProductLibraryAsset</code> 和 <code>generateProductFusionDescriptions</code>。</span></div>
            <div><strong>适合怎么描述</strong><span>“这一组关键帧如何共同生成一个统一主体包；某张关键帧的水印、去主体场景图、产品融合镜头组和质量风险应该如何审核”。</span></div>
          </div>
          <div class="flow-row">
@@ -748,18 +748,17 @@ SubjectAsset {
          </div>
          <div class="card">
            <h3>ProductFusionShot</h3>
-            <p>产品融合镜头组的单行数据。每个关键帧最多 6 行，产品图、人物图、产品区域、场景图、动作描述和秒数一一对应；生成时先创建融合引导图，再提交 Seedance。</p>
+            <p>产品融合镜头组的单行数据。每个关键帧最多 6 行，首帧、尾帧、三张同一产品不同角度图、动作描述和秒数一一对应；生成时直接把首尾帧和产品角度图作为 Seedance 垫图提交。</p>
            <pre>ProductFusionShot {
  id,
-  product_image,
-  person_image,
-  product_region: { x, y, w, h },
-  scene_image,
+  first_image,
+  last_image,
+  product_images[3],
  action_text,
  duration,
  image_model: gpt-image-2,
  video_model: seedance,
-  guide_image
+  // legacy: product_image, person_image, product_region, scene_image, guide_image
 }</pre>
          </div>
          <div class="card">
@@ -803,11 +802,11 @@ SubjectAsset {
            <tr><td>元素增改删</td><td><code>POST/PATCH/DELETE /elements</code></td><td><code>addElement/updateElement/deleteElement</code></td><td>让用户修正 Vision 错误，避免候选结果锁死。</td></tr>
            <tr><td>元素提取</td><td><code>POST /elements/{element_id}/cutout</code></td><td><code>cutoutElement</code></td><td>调用图像模型生成独立白底素材图，每次累积一张 cutout。</td></tr>
            <tr><td>主体资产包</td><td><code>POST /elements/{element_id}/subject-assets</code></td><td><code>generateSubjectAssets</code></td><td>根据参考帧重新绘制一个统一主体资产包；前端默认把全部关键帧作为 <code>source_frame_indices</code>，如果用户手动选择了关键帧则只传已选帧，后端拼参考板。人物默认输出六张身份标准图，另有表情补充和动作补充分组可选；纯白/黑背景，不含其他元素，并裁去空白让主体占满画面。</td></tr>
-            <tr><td>场景资产</td><td><code>POST /frames/{idx}/scene-asset</code></td><td><code>generateSceneAsset</code></td><td>在统一主体资产之后，按当前关键帧生成去主体背景板；请求包含 <code>scene_mode</code>、<code>scene_style</code>、<code>prompt</code> 和 <code>source_frame_indices</code>，可用左侧选择的参考帧 + 右侧关键词生成原场景补背景、相似新场景或同构换风格，保留历史版本用于人工审核。</td></tr>
+            <tr><td>首尾帧资产</td><td><code>POST /frames/{idx}/scene-asset</code></td><td><code>generateSceneAsset</code></td><td>同一接口兼容旧场景图和新首尾帧；新流程传 <code>asset_role=first_frame/last_frame</code>，后端走文字生图，参考帧只用于理解透明骨架人形象、比例、机位和光线，生成结果仍保存在 <code>scene_assets</code> 并自动填入产品融合镜头。</td></tr>
            <tr><td>产品图库</td><td><code>GET /product-library/skg</code></td><td><code>listProductLibrary</code></td><td>读取内置 SKG 白底图库 manifest，返回产品标题、品类、尺寸、白底评分和预览图 URL。</td></tr>
            <tr><td>产品图入库到 job</td><td><code>POST /jobs/{id}/assets/product-library</code></td><td><code>copyProductLibraryAsset</code></td><td>把一个内置产品图库条目复制为当前 job 的普通 asset，返回 <code>ImageRef(kind="asset")</code>，用于画面工作台产品融合和分镜产品参考组。</td></tr>
-            <tr><td>产品融合引导图</td><td><code>POST /jobs/{id}/product-fusion/guide</code></td><td><code>createProductFusionGuide</code></td><td>读取产品图和白底人物图，按用户在人物图上画出的 <code>product_region</code> 合成一张位置引导图；前端固定显示图片模型为 GPT Image 2，返回普通 <code>asset</code> 作为 Seedance 首帧。</td></tr>
-            <tr><td>产品融合描述词</td><td><code>POST /jobs/{id}/product-fusion/descriptions</code></td><td><code>generateProductFusionDescriptions</code></td><td>为 6 行产品融合镜头生成动作描述草稿；有 LLM 配置时用 <code>REWRITE_MODEL</code> 生成 JSON，无配置或失败时回退到本地镜头模板。</td></tr>
+            <tr><td>产品融合引导图</td><td><code>POST /jobs/{id}/product-fusion/guide</code></td><td><code>createProductFusionGuide</code></td><td>旧流程兼容接口：读取产品图和白底人物图，按 <code>product_region</code> 合成位置引导图。当前首尾帧流程不再主动调用它。</td></tr>
+            <tr><td>产品融合描述词</td><td><code>POST /jobs/{id}/product-fusion/descriptions</code></td><td><code>generateProductFusionDescriptions</code></td><td>为 6 行产品融合镜头生成动作描述草稿；输入重点变为首帧、尾帧和三张产品角度图，有 LLM 配置时用 <code>REWRITE_MODEL</code> 生成 JSON，无配置或失败时回退到本地镜头模板。</td></tr>
            <tr><td>分镜保存</td><td><code>PUT /frames/{idx}/storyboard</code></td><td><code>updateStoryboard</code></td><td>保存 4 图槽、时长和改造说明。</td></tr>
            <tr><td>生图</td><td><code>POST /frames/{idx}/generate</code></td><td><code>generateImage</code></td><td>基于关键帧或已选生成图做 image-to-image，目前可用。</td></tr>
          </tbody>
@@ -918,6 +917,19 @@ SubjectAsset {
        <h2>变更记录</h2>
        <p>这个记录不是 git log 的替代品。它记录“产品理解发生了什么变化、影响了哪些源码、你以后描述需求时该怎么说”。后续每次改功能都要补一条。</p>
        <div class="changelog">
+          <article class="change">
+            <header>
+              <h3>2026-05-14 · 产品融合改为首尾帧加三产品角度垫图</h3>
+              <span class="tag violet">FrameLightbox</span>
+              <span class="tag orange">产品融合</span>
+            </header>
+            <div class="body">
+              <p><strong>问题：</strong>原产品融合依赖白底人物、手动画区域、场景图和融合引导图，但当前透明骨架人二创流程更需要文字生成首尾帧，再把产品真源作为垫图传给视频模型。</p>
+              <p><strong>改动：</strong>“场景图”页签改名为“首尾帧”，右侧用地点、风格、参考要素和 prompt 生成首帧/尾帧，生成后自动填入当前产品融合镜头。产品融合 6 行工作表改为首帧、尾帧、三张同一产品不同角度图、描述词、秒数和生成按钮。</p>
+              <p><strong>后端：</strong><code>generateSceneAsset</code> 新增 <code>asset_role</code>，<code>first_frame/last_frame</code> 走文字生图并标记资产角色；<code>ProductFusionShot</code> 新增 <code>first_image</code>、<code>last_image</code>、<code>product_images</code>，视频提交直接把首尾帧和三张产品图交给 Seedance。</p>
+              <p><strong>影响：</strong><code>api/main.py</code>、<code>web/lib/api.ts</code>、<code>web/app/page.tsx</code>、<code>web/components/lightbox.tsx</code>、<code>docs/source-analysis.html</code>。</p>
+            </div>
+          </article>
          <article class="change">
            <header>
              <h3>2026-05-14 · 本地抽帧改为展示友好算力档</h3>
--- a/web/components/lightbox.tsx
+++ b/web/components/lightbox.tsx
@@ -6,7 +6,7 @@ import {
  frameUrl, cleanedFrameUrl, apiAssetUrl,
  describeFrame, cleanupFrame, applyCleanedFrame, discardCleanedFrame, addElement, updateElement, deleteElement,
  generateSceneAsset, generateSubjectAssets, generateProductFusionDescriptions, resolveImageRefUrl, uploadStoryboardAsset, updateStoryboard,
-  type AssetBackground, type AssetSize, type KeyFrame, type Job, type ImageRef, type ProductFusionShot, type SceneAssetRole, type SceneMode, type SceneStyle, type SubjectKind,
+  type AssetBackground, type AssetSize, type KeyFrame, type Job, type ImageRef, type ProductFusionShot, type SceneAssetRole, type SceneStyle, type SubjectKind,
 } from "@/lib/api"
 import { ProductLibraryPicker } from "@/components/product-library-picker"
 import { TRANSPARENT_HUMAN_FRAME_STANDARD, TRANSPARENT_HUMAN_UI_SUMMARY } from "@/lib/workflow-target"
@@ -77,12 +77,6 @@ const LIGHTBOX_TABS: Array<{ key: LightboxTab; label: string }> = [
  { key: "review", label: "审核" },
 ]

-const SCENE_MODE_OPTIONS: Array<[SceneMode, string]> = [
-  ["remove_subject", "去主体原场景"],
-  ["similar", "相似新场景"],
-  ["style", "同构换风格"],
-]
-
 const SCENE_STYLE_OPTIONS: Array<[SceneStyle, string]> = [
  ["source", "跟随原图"],
  ["premium_product", "高端产品感"],
@@ -162,7 +156,6 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
  const [sceneGenerating, setSceneGenerating] = useState<SceneAssetRole | null>(null)
  const [subjectGenerating, setSubjectGenerating] = useState<string | null>(null)
  const [assetSize, setAssetSize] = useState<AssetSize>("source")
-  const [sceneMode, setSceneMode] = useState<SceneMode>("remove_subject")
  const [sceneStyle, setSceneStyle] = useState<SceneStyle>("source")
  const [sceneLocation, setSceneLocation] = useState("modern living room")
  const [sceneReferenceKeys, setSceneReferenceKeys] = useState<string[]>(["camera angle and composition", "lighting direction", "spatial layout"])
@@ -177,8 +170,6 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
  const [fusionUploadTarget, setFusionUploadTarget] = useState<FusionUploadTarget | null>(null)
  const [fusionGenerating, setFusionGenerating] = useState<number | "all" | null>(null)
  const [fusionSaving, setFusionSaving] = useState(false)
-  const [fusionDraftRegion, setFusionDraftRegion] = useState<{ x: number; y: number; w: number; h: number } | null>(null)
-  const [fusionDragStart, setFusionDragStart] = useState<{ x: number; y: number } | null>(null)
  const [editingElement, setEditingElement] = useState<{
    frameIndex: number
    id: string
@@ -194,7 +185,6 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
  const [draftRegion, setDraftRegion] = useState<Region | null>(null)  // 当前正在拖的
  const [dragStart, setDragStart] = useState<{ x: number; y: number } | null>(null)
  const imgWrapRef = useRef<HTMLDivElement>(null)
-  const fusionPersonWrapRef = useRef<HTMLDivElement>(null)
  const fusionFileInputRef = useRef<HTMLInputElement | null>(null)
  const loadedFusionKey = useRef("")
  const activeIndexRef = useRef<number | null>(activeIndex)
@@ -321,9 +311,6 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
  const currentFusionProductCount = currentFusionProducts.filter(Boolean).length
  const currentFusionFirstUrl = currentFusionShot?.first_image ? resolveImageRefUrl(jobId, currentFusionShot.first_image) : ""
  const currentFusionLastUrl = currentFusionShot?.last_image ? resolveImageRefUrl(jobId, currentFusionShot.last_image) : ""
-  const currentFusionProductUrl = currentFusionProducts[0] ? resolveImageRefUrl(jobId, currentFusionProducts[0]) : ""
-  const currentFusionPersonUrl = currentFusionShot?.person_image ? resolveImageRefUrl(jobId, currentFusionShot.person_image) : ""
-  const currentFusionSceneUrl = currentFusionShot?.scene_image ? resolveImageRefUrl(jobId, currentFusionShot.scene_image) : ""
  const fusionReadyCount = fusionShots.filter((shot) =>
    shot.first_image && shot.last_image && (shot.product_images ?? []).filter(Boolean).length >= 3 && shot.action_text?.trim()
  ).length
@@ -355,13 +342,15 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
    if (!current) return
    if (target.slot === "product_images") {
      const productImages = [...(current.product_images ?? [])].slice(0, 3)
-      const productIndex = Math.max(0, Math.min(2, target.productIndex ?? productImages.findIndex((item) => !item)))
-      const safeIndex = productIndex >= 0 ? productIndex : 0
+      const inferredIndex = [0, 1, 2].find((idx) => !productImages[idx]) ?? 0
+      const safeIndex = Math.max(0, Math.min(2, target.productIndex ?? inferredIndex))
      productImages[safeIndex] = ref
      updateFusionShot(index, { product_images: productImages, product_image: productImages[0] ?? null, guide_image: null }, true)
      return
    }
-    updateFusionShot(index, { [target.slot]: ref, guide_image: null }, true)
+    updateFusionShot(index, target.slot === "first_image"
+      ? { first_image: ref, guide_image: null }
+      : { last_image: ref, guide_image: null }, true)
  }

  const uploadFusionFiles = async (files: FileList | File[]) => {
@@ -390,12 +379,12 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o

  const draftFusionDescriptions = async () => {
    const actions = [
-      "人物双手拿起 SKG 颈部按摩仪，准备戴到脖子上，镜头轻微推近产品。",
-      "人物把 SKG 按摩仪贴合到肩颈位置，手部轻轻调整两侧机身角度。",
-      "人物坐在场景中轻按侧边控制区，产品保持在画框指定区域内清晰可见。",
-      "人物闭眼放松，肩颈从紧绷变舒展，产品佩戴位置稳定不漂移。",
-      "镜头靠近展示 SKG 产品材质、按键和内侧触点，手部不要遮挡产品主体。",
-      "使用后的放松状态收尾，人物自然抬头，产品仍保持白色 U 形外观和真实比例。",
+      "透明骨架人双手拿起 SKG 颈部按摩仪，准备戴到脖子上，镜头轻微推近产品。",
+      "透明骨架人把 SKG 按摩仪贴合到肩颈位置，手部轻轻调整两侧机身角度。",
+      "透明骨架人坐在场景中轻按侧边控制区，产品保持真实比例并清晰可见。",
+      "透明骨架人闭眼放松，肩颈从紧绷变舒展，产品佩戴位置稳定不漂移。",
+      "镜头靠近展示 SKG 产品材质、按键和内侧触点，透明骨架人的手部不要遮挡产品主体。",
+      "使用后的放松状态收尾，透明骨架人自然抬头，产品仍保持白色 U 形外观和真实比例。",
    ]
    let descriptions = actions
    try {
@@ -413,44 +402,6 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
    toast.success("已生成 6 条动作描述草稿，可继续手工修改")
  }

-  const fusionPointerPosition = (ev: React.MouseEvent<HTMLDivElement>) => {
-    const rect = fusionPersonWrapRef.current?.getBoundingClientRect()
-    if (!rect || rect.width <= 0 || rect.height <= 0) return null
-    return {
-      x: Math.max(0, Math.min(1, (ev.clientX - rect.left) / rect.width)),
-      y: Math.max(0, Math.min(1, (ev.clientY - rect.top) / rect.height)),
-    }
-  }
-
-  const onFusionRegionDown = (ev: React.MouseEvent<HTMLDivElement>) => {
-    if (activeTab !== "product" || !currentFusionPersonUrl) return
-    ev.preventDefault()
-    const p = fusionPointerPosition(ev)
-    if (!p) return
-    setFusionDragStart(p)
-    setFusionDraftRegion({ x: p.x, y: p.y, w: 0, h: 0 })
-  }
-
-  const onFusionRegionMove = (ev: React.MouseEvent<HTMLDivElement>) => {
-    if (!fusionDragStart) return
-    const p = fusionPointerPosition(ev)
-    if (!p) return
-    setFusionDraftRegion({
-      x: Math.min(fusionDragStart.x, p.x),
-      y: Math.min(fusionDragStart.y, p.y),
-      w: Math.abs(p.x - fusionDragStart.x),
-      h: Math.abs(p.y - fusionDragStart.y),
-    })
-  }
-
-  const onFusionRegionUp = () => {
-    if (!fusionDraftRegion || !fusionDragStart) return
-    const region = fusionDraftRegion.w >= 0.02 && fusionDraftRegion.h >= 0.02 ? fusionDraftRegion : null
-    if (region) updateFusionShot(activeFusionShot, { product_region: region, guide_image: null }, true)
-    setFusionDraftRegion(null)
-    setFusionDragStart(null)
-  }
-
  const runFusionVideo = async (index: number) => {
    const shot = fusionShots[index]
    if (!shot?.first_image || !shot.last_image || (shot.product_images ?? []).filter(Boolean).length < 3 || !shot.action_text?.trim()) {
@@ -585,6 +536,7 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
  const handleGenerateSceneAsset = async (role: Exclude<SceneAssetRole, "scene">) => {
    const roleLabel = role === "first_frame" ? "首帧" : "尾帧"
    const targetSlot: FusionFrameRole = role === "first_frame" ? "first_image" : "last_image"
+    const targetShotIndex = activeFusionShot
    if (!hasSubjectAssets) {
      toast.message("还没有主体资产，也会按当前参考帧理解人物；一致性可能弱一些")
    }
@@ -608,7 +560,7 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
      const asset = [...(updatedFrame?.scene_assets ?? [])].reverse().find((item) => item.asset_role === role)
      if (asset) {
        assignFusionImage({
-          shotIndex: activeFusionShot,
+          shotIndex: targetShotIndex,
          slot: targetSlot,
        }, {
          kind: "asset",
@@ -618,7 +570,7 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
          label: asset.label,
        })
      }
-      toast.success(`分镜 ${f.index + 1} ${roleLabel}已生成，并填入镜头 ${activeFusionShot + 1}`)
+      toast.success(`分镜 ${f.index + 1} ${roleLabel}已生成，并填入镜头 ${targetShotIndex + 1}`)
    } catch (e) {
      toast.error(`${roleLabel}生成失败：` + (e instanceof Error ? e.message : String(e)))
    } finally {
@@ -1442,7 +1394,7 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
          {activeTab === "scene" && (
            <section className="rounded-lg border border-emerald-300/15 bg-emerald-500/[0.08] p-2.5 text-[10.5px] leading-relaxed text-white/58">
              <div className="mb-2 flex items-center justify-between gap-2">
-                <div className="text-[12px] font-semibold text-white">场景图</div>
+                <div className="text-[12px] font-semibold text-white">首尾帧生图</div>
                <select
                  value={assetSize}
                  onChange={(e) => setAssetSize(e.target.value as AssetSize)}
@@ -1456,7 +1408,7 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
                </select>
              </div>
 	              <div className="mb-2 rounded-md border border-white/10 bg-black/25 px-2 py-1.5 text-[10px] leading-relaxed text-white/50">
-	                左侧选择场景参考图，右侧选择地点和参考关键词；下方 prompt 可自动拼好，也可以手动改。
+	                这里只做文字生图：用前面参考帧理解透明骨架人形象，生成首帧/尾帧并自动填入当前产品融合镜头。
 	              </div>
 	              <div className="mb-2 grid grid-cols-2 gap-1.5">
 	                <label className="space-y-1">
@@ -1471,20 +1423,6 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
 	                    ))}
 	                  </select>
 	                </label>
-	                <label className="space-y-1">
-	                  <span className="block text-[9px] text-white/35">生成方式</span>
-                  <select
-                    value={sceneMode}
-                    onChange={(e) => setSceneMode(e.target.value as SceneMode)}
-                    className="w-full rounded border border-white/10 bg-black/35 px-1.5 py-1 text-[10px] text-white/75 outline-none"
-                  >
-                    {SCENE_MODE_OPTIONS.map(([value, label]) => (
-                      <option key={value} value={value}>{label}</option>
-	                    ))}
-	                  </select>
-	                </label>
-	              </div>
-	              <div className="mb-2 grid grid-cols-2 gap-1.5">
 	                <label className="space-y-1">
 	                  <span className="block text-[9px] text-white/35">风格</span>
                  <select
@@ -1497,12 +1435,14 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
 	                    ))}
 	                  </select>
 	                </label>
-	                <label className="space-y-1">
+	              </div>
+	              <div className="mb-2">
+	                <label className="block space-y-1">
 	                  <span className="block text-[9px] text-white/35">额外关键词</span>
 	                  <input
 	                    value={sceneExtraKeywords}
 	                    onChange={(e) => setSceneExtraKeywords(e.target.value)}
-	                    placeholder="例如：玻璃、金属、夜景"
+	                    placeholder="例如：人物站在客厅，抬手准备佩戴颈部按摩仪，镜头慢慢推近"
 	                    className="w-full rounded border border-white/10 bg-black/35 px-1.5 py-1 text-[10px] text-white/75 outline-none placeholder:text-white/25"
 	                  />
 	                </label>
@@ -1535,7 +1475,7 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
 	              </div>
 	              <label className="mb-2 block">
 	                <div className="mb-1 flex items-center justify-between gap-2">
-	                  <span className="text-[9px] text-white/35">场景 prompt</span>
+	                  <span className="text-[9px] text-white/35">首尾帧 prompt</span>
 	                  <button
 	                    type="button"
 	                    onClick={() => setScenePrompt(scenePromptDraft)}
@@ -1552,7 +1492,7 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
 	              </label>
 	              {!hasSubjectAssets && (
                <div className="mb-2 rounded border border-amber-300/25 bg-amber-500/10 px-2 py-1.5 text-[10px] leading-snug text-amber-100/85">
-                  还没有主体资产。先在“主体资产”页生成主体图，场景图才能更准确地去主体和补背景。
+                  还没有主体资产。仍可生成首尾帧，但人物一致性会更依赖当前参考帧。
                </div>
              )}
              {latestSceneAsset ? (
@@ -1561,8 +1501,8 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
                  <div className="flex items-center justify-between gap-2 border-t border-white/10 px-2 py-1 text-[9.5px] text-white/50">
                    <span>
                      {latestSceneAsset.width}×{latestSceneAsset.height}
-                      {latestSceneAsset.scene_mode && (
-                        <> · {SCENE_MODE_OPTIONS.find(([value]) => value === latestSceneAsset.scene_mode)?.[1] ?? latestSceneAsset.scene_mode}</>
+                      {latestSceneAsset.asset_role && (
+                        <> · {latestSceneAsset.asset_role === "first_frame" ? "首帧" : latestSceneAsset.asset_role === "last_frame" ? "尾帧" : "场景图"}</>
                      )}
                    </span>
                    {onCopyImage && (
@@ -1579,7 +1519,7 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
                </div>
              ) : (
                <div className="mb-2 rounded border border-white/10 bg-black/25 px-2 py-2 text-white/45">
-                  当前帧还没有场景图。
+                  当前帧还没有首尾帧资产。
                </div>
              )}
              {latestSceneAsset?.quality_report?.warnings?.length ? (
@@ -1587,25 +1527,28 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
                  {latestSceneAsset.quality_report.warnings[0]}
                </div>
              ) : null}
-              <button
-                type="button"
-                onClick={handleGenerateSceneAsset}
-                disabled={sceneGenerating || isCleaningCurrentFrame || batchCleaning || !hasSubjectAssets}
-                className="w-full rounded-md bg-emerald-500/65 px-2 py-1.5 text-[11px] font-medium text-white transition hover:bg-emerald-400 disabled:cursor-wait disabled:opacity-45 inline-flex items-center justify-center gap-1"
-                title={hasSubjectAssets ? "基于主体资产去主体、补背景并生成场景参考图" : "先生成主体资产"}
-              >
-                {sceneGenerating ? <Loader2 className="h-3 w-3 animate-spin" /> : <Sparkles className="h-3 w-3" />}
-                {sceneGenerating ? "生成场景图中…" : latestSceneAsset ? "重新生成场景图" : "生成去主体场景图"}
-              </button>
-              {!hasSubjectAssets && (
+              <div className="grid grid-cols-2 gap-1.5">
                <button
                  type="button"
-                  onClick={() => setActiveTab("subject")}
-                  className="mt-1.5 w-full rounded-md border border-violet-300/25 bg-violet-500/15 px-2 py-1.5 text-[10.5px] font-medium text-violet-100 transition hover:bg-violet-500/25"
+                  onClick={() => void handleGenerateSceneAsset("first_frame")}
+                  disabled={!!sceneGenerating || isCleaningCurrentFrame || batchCleaning}
+                  className="w-full rounded-md bg-emerald-500/65 px-2 py-1.5 text-[11px] font-medium text-white transition hover:bg-emerald-400 disabled:cursor-wait disabled:opacity-45 inline-flex items-center justify-center gap-1"
+                  title={`生成后填入产品融合镜头 ${activeFusionShot + 1} 的首帧`}
                >
-                  去生成主体资产
+                  {sceneGenerating === "first_frame" ? <Loader2 className="h-3 w-3 animate-spin" /> : <Sparkles className="h-3 w-3" />}
+                  {sceneGenerating === "first_frame" ? "生成首帧中…" : "生成首帧并填入"}
                </button>
-              )}
+                <button
+                  type="button"
+                  onClick={() => void handleGenerateSceneAsset("last_frame")}
+                  disabled={!!sceneGenerating || isCleaningCurrentFrame || batchCleaning}
+                  className="w-full rounded-md bg-cyan-500/65 px-2 py-1.5 text-[11px] font-medium text-white transition hover:bg-cyan-400 disabled:cursor-wait disabled:opacity-45 inline-flex items-center justify-center gap-1"
+                  title={`生成后填入产品融合镜头 ${activeFusionShot + 1} 的尾帧`}
+                >
+                  {sceneGenerating === "last_frame" ? <Loader2 className="h-3 w-3 animate-spin" /> : <Sparkles className="h-3 w-3" />}
+                  {sceneGenerating === "last_frame" ? "生成尾帧中…" : "生成尾帧并填入"}
+                </button>
+              </div>
            </section>
          )}
          {activeTab === "product" && (
@@ -1631,10 +1574,10 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
                    <span className="text-[9px] text-white/38">{currentFusionShot?.duration ?? 5}s</span>
                  </div>
                  <div className="grid grid-cols-2 gap-x-2 gap-y-1 text-[9.5px]">
-                    <span className={currentFusionShot?.product_image ? "text-emerald-200/80" : "text-white/35"}>产品图</span>
-                    <span className={currentFusionShot?.person_image ? "text-emerald-200/80" : "text-white/35"}>白底人物</span>
-                    <span className={currentFusionShot?.product_region ? "text-emerald-200/80" : "text-white/35"}>产品区域</span>
-                    <span className={currentFusionShot?.scene_image ? "text-emerald-200/80" : "text-white/35"}>场景图</span>
+                    <span className={currentFusionFirstUrl ? "text-emerald-200/80" : "text-white/35"}>首帧</span>
+                    <span className={currentFusionLastUrl ? "text-emerald-200/80" : "text-white/35"}>尾帧</span>
+                    <span className={currentFusionProductCount >= 3 ? "text-emerald-200/80" : "text-white/35"}>产品角度 {currentFusionProductCount}/3</span>
+                    <span className={currentFusionShot?.action_text?.trim() ? "text-emerald-200/80" : "text-white/35"}>描述词</span>
                  </div>
                  <div className={`mt-1 truncate text-[9.5px] ${currentFusionShot?.action_text?.trim() ? "text-white/58" : "text-white/32"}`}>
                    {currentFusionShot?.action_text?.trim() || "描述词未填写"}
@@ -1664,8 +1607,15 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
                jobId={jobId}
                compact
                buttonLabel="选用"
-                title={`镜头 ${activeFusionShot + 1} 产品图`}
-                onPick={(ref) => assignFusionImage("product_image", ref, activeFusionShot)}
+                title={`镜头 ${activeFusionShot + 1} 产品角度图`}
+                onPick={(ref) => {
+                  const nextEmpty = [0, 1, 2].find((idx) => !currentFusionProducts[idx]) ?? 0
+                  assignFusionImage({
+                    shotIndex: activeFusionShot,
+                    slot: "product_images",
+                    productIndex: nextEmpty,
+                  }, ref)
+                }}
              />
            </>
          )}