feat: gate video generation on endpoint frames

2026-05-18 11:37:13 +08:00
parent 4c8cb066d6
commit 8f917d52b8
6 changed files with 373 additions and 80 deletions
--- a/RULES.md
+++ b/RULES.md
@@ -11,7 +11,7 @@
 - 详见 `CLAUDE.md` 立项决策段 + `.memory/plan.md` 七步管线拆解
 - 风格：`04-Dark-Gallery-Ambient`（路径：`~/Projects/research/20260305-网页风格库/04-Dark-Gallery-Ambient.md`）
 - 第一冲刺：步骤 1-4（下载 / 拆轨 / 关键帧 / ASR+翻译）
- 当前产品方向（2026-05-18 再确认）：先解决信息流广告快速复刻的第一步，不再沿用“开始后线性完成抽帧、分镜、元素生成、合成”的旧做法。主界面为“左侧素材输入列 + 右侧信息流复刻工作表”。用户粘贴 TK 链接或上传视频后点击“开始分析”，系统自动下载源视频；下载完成后并行启动两条路：音频文案路提取原音频文案/字幕，并分析讲话人、语速节奏、背景音乐/环境声/音效；视频视觉路自动抽取 12 张参考帧，供人工选择可用主体并生成相似主体视图。产品图上传后独立形成产品资产包，自动识别视角/结构/比例并补缺角度。分镜工作台按逐句时间轴规划新口播、镜头类型、首帧/尾帧、人物需求和产品出现方式；单条或“一键提交全部”生成视频时，按该行规划自动调取产品图、人物主体和参考帧。
+- 当前产品方向（2026-05-18 再确认）：先解决信息流广告快速复刻的第一步，不再沿用“开始后线性完成抽帧、分镜、元素生成、合成”的旧做法。主界面为“左侧素材输入列 + 右侧信息流复刻工作表”。用户粘贴 TK 链接或上传视频后点击“开始分析”，系统自动下载源视频；下载完成后并行启动两条路：音频文案路提取原音频文案/字幕，并分析讲话人、语速节奏、背景音乐/环境声/音效；视频视觉路自动抽取参考帧，供人工选择可用主体并生成相似主体白底视图。产品图上传后独立形成产品资产包，自动识别视角/结构/比例并补缺角度。分镜工作台按逐句时间轴规划新口播、镜头类型、首帧/尾帧、人物需求和产品出现方式；当前暂停直接调视频模型，先逐条用“相似主体视图 + 产品素材池 + 首尾帧文字规划”生成并审核首帧/尾帧，保存规划后再决定哪些分镜进入单条视频候选。

 ## 部署事实
 - 平台：VPS `76.13.31.179`（Ubuntu 24.04 / Docker Compose / Coolify Traefik）
--- a/api/main.py
+++ b/api/main.py
@@ -330,6 +330,7 @@ class StoryboardScene(BaseModel):
    first_image: dict | None = None
    last_image: dict | None = None
    product_images: list[dict] = Field(default_factory=list)
+    subject_images: list[dict] = Field(default_factory=list)
    product_fusion_shots: list[dict] = Field(default_factory=list)
    visual_mode: Literal["person_only", "person_product", "product_only", "environment"] = "person_product"
    needs_product: bool = True
@@ -1274,6 +1275,44 @@ def _make_reference_contact_sheet(job_id: str, frame_indices: list[int], out_pat
    return out_path


+def _make_paths_contact_sheet(paths: list[Path], out_path: Path, max_items: int = 10) -> Path | None:
+    """Tile distinct, existing images from ``paths`` into one JPEG contact sheet.
+
+    Uses at most ``max_items`` images (clamped to 2..12). Returns ``out_path`` when two or more
+    decode, otherwise a single source path (nothing is written), or ``None`` if none exist.
+    """
+    cell, background = 420, (245, 245, 245)  # square cell edge in px, padding colour
+    max_items = max(2, min(12, int(max_items or 10)))
+    unique = {str(p): p for p in paths}  # de-duplicate by string form, keep first-seen order
+    usable = [p for p in unique.values() if p.exists()][:max_items]
+    if len(usable) <= 1:
+        return usable[0] if usable else None
+
+    decoded: list[tuple[Path, Image.Image]] = []
+    for p in usable:
+        try:
+            with Image.open(p) as src:  # release the file handle once pixels are copied
+                im = src.convert("RGB")
+            im.thumbnail((cell, cell), Image.Resampling.LANCZOS)
+            canvas = Image.new("RGB", (cell, cell), background)
+            canvas.paste(im, ((cell - im.width) // 2, (cell - im.height) // 2))
+            decoded.append((p, canvas))
+        except Exception:  # best effort: an unreadable reference is skipped, not fatal
+            continue
+    if len(decoded) <= 1:
+        # Return the one image that actually decoded rather than a possibly corrupt usable[0].
+        return decoded[0][0] if decoded else usable[0]
+
+    cols = 4 if len(decoded) > 6 else (3 if len(decoded) > 2 else 2)
+    rows = (len(decoded) + cols - 1) // cols
+    sheet = Image.new("RGB", (cols * cell, rows * cell), background)
+    for i, (_, thumb) in enumerate(decoded):
+        sheet.paste(thumb, ((i % cols) * cell, (i // cols) * cell))
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    sheet.save(out_path, "JPEG", quality=92)
+    return out_path
+
+
 SUBJECT_VIEW_LABELS: dict[str, str] = {
    "front": "正面",
    "back": "背面",
@@ -3732,6 +3771,8 @@ class GenerateSceneAssetReq(BaseModel):
    asset_role: SceneAssetRole = "scene"
    prompt: str = ""
    source_frame_indices: list[int] | None = None
+    subject_images: list[dict] = Field(default_factory=list)
+    product_images: list[dict] = Field(default_factory=list)


 class GenerateSubjectAssetsReq(BaseModel):
@@ -3899,11 +3940,20 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
    source_indices = list(dict.fromkeys(source_indices))[:8]
    model_src = src
    sheet_tmp: Path | None = None
+    asset_sheet_tmp: Path | None = None
    if len(source_indices) > 1:
        sheet_tmp = job_dir(job_id) / "tmp" / f"scene_refs_{idx:03d}_{uuid.uuid4().hex[:6]}.jpg"
        sheet = _make_reference_contact_sheet(job_id, source_indices, sheet_tmp)
        if sheet:
            model_src = sheet
+    subject_ref_paths = [p for p in (storyboard_ref_path(job_id, r) for r in req.subject_images[:8]) if p and p.exists()]
+    product_ref_paths = [p for p in (storyboard_ref_path(job_id, r) for r in req.product_images[:6]) if p and p.exists()]
+    asset_ref_paths = [*subject_ref_paths, *product_ref_paths]
+    if req.asset_role != "scene" and asset_ref_paths:
+        asset_sheet_tmp = job_dir(job_id) / "tmp" / f"endpoint_refs_{idx:03d}_{uuid.uuid4().hex[:6]}.jpg"
+        asset_sheet = _make_paths_contact_sheet(asset_ref_paths, asset_sheet_tmp, max_items=10)
+        if asset_sheet:
+            model_src = asset_sheet

    confirmed_subjects = [
        (e.name_en or e.name_zh).strip()
@@ -3925,9 +3975,13 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
        else "Remove the main foreground subject from the frame if present. "
    )
    identity_clause = (
-        "Known character identity cues: " + ", ".join(confirmed_subjects) + ". "
-        if confirmed_subjects
-        else "Infer one consistent friendly transparent human character identity from the provided references. "
+        f"Use the generated subject asset references as the primary character identity lock ({len(subject_ref_paths)} image(s)); keep the same transparent body shell, clean visible skeleton, proportions, material, and ad-friendly non-horror identity. "
+        if subject_ref_paths
+        else (
+            "Known character identity cues: " + ", ".join(confirmed_subjects) + ". "
+            if confirmed_subjects
+            else "Infer one consistent friendly transparent human character identity from the provided references. "
+        )
    )
    mode_clause = {
        "remove_subject": (
@@ -3956,10 +4010,21 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
        if user_prompt
        else ""
    )
-    reference_clause = (
-        f"Use the selected reference frame contact sheet as visual evidence for location, composition, lighting, materials, and atmosphere. Reference frame indices: {', '.join(str(i + 1) for i in source_indices)}. "
-        if len(source_indices) > 1
-        else "Use the provided frame as the primary visual reference. "
+    if req.asset_role != "scene" and asset_ref_paths:
+        reference_clause = (
+            f"Use the provided asset contact sheet as the primary visual reference: {len(subject_ref_paths)} generated subject image(s) and {len(product_ref_paths)} SKG product image(s). "
+            "Do not use the original keyframe as the first/last-frame truth; it is only a storage anchor for this row. "
+        )
+    else:
+        reference_clause = (
+            f"Use the selected reference frame contact sheet as visual evidence for location, composition, lighting, materials, and atmosphere. Reference frame indices: {', '.join(str(i + 1) for i in source_indices)}. "
+            if len(source_indices) > 1
+            else "Use the provided frame as the primary visual reference. "
+        )
+    product_asset_clause = (
+        "Use the provided SKG product references as the rigid product truth when the user prompt asks for product presence: a white U-shaped neck-and-shoulder wearable massage device worn around the neck/shoulders, not headphones, a collar pillow, skincare, food, or a medical prop. Keep product scale believable, preserve left/right asymmetry, side thickness, inner contact pads, buttons, white material, and real wearable placement. "
+        if product_ref_paths
+        else "Do not invent a random product. Only include an SKG product if the user prompt explicitly asks for it. "
    )
    if req.asset_role == "scene":
        prompt = (
@@ -3986,6 +4051,7 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
            + reference_clause
            + user_prompt_clause
            + style_clause + " "
+            + product_asset_clause
            + TRANSPARENT_HUMAN_POSITIVE_PROMPT + " "
            + TRANSPARENT_HUMAN_NEGATIVE_PROMPT + " "
            + "The frame must feature the same friendly transparent or translucent human character: glass/acrylic/vinyl-like transparent outer body, visible clean white skeleton inside, clean commercial wellness style, non-horror. "
@@ -3997,6 +4063,8 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
    try:
        if req.asset_role == "scene":
            img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
+        elif asset_ref_paths:
+            img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1600)
        else:
            img_bytes, _mode = _image_text_call(prompt, models=models, max_attempts=3)
    except RuntimeError as e:
@@ -4005,6 +4073,9 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
        if sheet_tmp and sheet_tmp.exists():
            try: sheet_tmp.unlink()
            except OSError: pass
+        if asset_sheet_tmp and asset_sheet_tmp.exists():
+            try: asset_sheet_tmp.unlink()
+            except OSError: pass

    asset_id = f"scene_{idx:03d}_{uuid.uuid4().hex[:8]}"
    out_path = job_dir(job_id) / "assets" / f"{asset_id}.jpg"
@@ -4387,6 +4458,7 @@ class UpdateStoryboardReq(BaseModel):
    first_image: dict | None = None
    last_image: dict | None = None
    product_images: list[dict] = Field(default_factory=list)
+    subject_images: list[dict] = Field(default_factory=list)
    product_fusion_shots: list[dict] = Field(default_factory=list)
    visual_mode: Literal["person_only", "person_product", "product_only", "environment"] = "person_product"
    needs_product: bool = True
@@ -5562,6 +5634,7 @@ def update_storyboard(job_id: str, idx: int, req: UpdateStoryboardReq) -> Job:
                first_image=req.first_image,
                last_image=req.last_image,
                product_images=list(req.product_images),
+                subject_images=list(req.subject_images),
                product_fusion_shots=list(req.product_fusion_shots),
                visual_mode=req.visual_mode,
                needs_product=bool(req.needs_product),
--- a/docs/source-analysis.html
+++ b/docs/source-analysis.html
--- a/web/app/page.tsx
+++ b/web/app/page.tsx
@@ -38,6 +38,7 @@ const NODE_TYPES = {
 const KEYFRAME_PANEL_ID = "keyframe-detail-panel"
 const VIDEO_FRAME_PANEL_ID = "video-frame-panel"
 const FLOATING_PANEL_IDS = new Set([KEYFRAME_PANEL_ID, VIDEO_FRAME_PANEL_ID])
+const DIRECT_VIDEO_GENERATION_PAUSED = true
 const FRAME_TARGET_LABELS: Record<FrameExtractTarget, string> = {
  transparent_human: "透明骨架人",
  balanced: "综合关键帧",
@@ -592,6 +593,10 @@ export default function Home() {
  }, [jobs, productionJobIds, startProductionLanesForJob])

  const handleQuickGenerateVideo = useCallback(async (frameIdx: number, scene: StoryboardScene, model: string) => {
+    if (DIRECT_VIDEO_GENERATION_PAUSED) {
+      toast.info("视频生成调用已暂停：先生成并审核每条分镜的首帧/尾帧，再开放单条提交")
+      return
+    }
    if (!job) return
    const frame = job.frames.find((f) => f.index === frameIdx)
    if (!frame) return
@@ -728,6 +733,10 @@ export default function Home() {
  }, [ensureDefaultProductRefs, job, selectedFrames, updateJobInList])

  const handleGenerateProductFusionVideo = useCallback(async (frameIdx: number, shot: ProductFusionShot) => {
+    if (DIRECT_VIDEO_GENERATION_PAUSED) {
+      toast.info("视频生成调用已暂停：当前只做首尾帧和素材规划")
+      return
+    }
    if (!job) return
    const frame = job.frames.find((f) => f.index === frameIdx)
    if (!frame) return
--- a/web/components/ad-recreation-board.tsx
+++ b/web/components/ad-recreation-board.tsx
@@ -32,6 +32,7 @@ import {
  cutoutElement,
  deleteSubjectAsset,
  effectiveFrameUrl,
+  generateSceneAsset,
  generateProductAngleAsset,
  generateSubjectAssets,
  generatedImageUrl,
@@ -471,7 +472,7 @@ function videoModelTrace(models: RuntimeModels | undefined, model: string): Mode
      `前端选择：${model}`,
      `后端解析：${resolveVideoModelLabel(models, model)}`,
      `服务商：${modelValue(models?.video_provider)} · ${modelValue(models?.video_base_url)}`,
-      "输入：当前分镜文案、参考帧、产品素材、产品方向标注和画面规划",
+      "输入：已确认的首尾帧、当前分镜文案、产品素材、相似主体资产和画面规划",
      "输出：异步候选视频，完成后回填到对应分镜行",
    ],
  }
@@ -479,7 +480,6 @@ function videoModelTrace(models: RuntimeModels | undefined, model: string): Mode

 function buildFallbackScene(job: Job, frame: KeyFrame, order: number): StoryboardScene {
  const frames = [...job.frames].sort((a, b) => a.timestamp - b.timestamp)
-  const nextFrame = frames.find((item) => item.timestamp > frame.timestamp) ?? null
  const duration = Math.max(3.5, Math.min(7.5, Math.max(job.duration || 0, frames.length * 5) / Math.max(frames.length, 1)))
  const audio = job.audio_script?.rewritten_text?.trim()
    || job.transcript?.slice(0, 4).map((item) => item.en || item.zh).filter(Boolean).join(" ")
@@ -487,10 +487,10 @@ function buildFallbackScene(job: Job, frame: KeyFrame, order: number): Storyboar
  const objects = frame.description?.objects?.slice(0, 5).map((item) => item.name).filter(Boolean).join("、")
  return {
    duration: Number(duration.toFixed(1)),
-    first_image: { kind: "keyframe", frame_idx: frame.index, label: `分镜 ${order + 1} 首帧` },
-    last_image: nextFrame ? { kind: "keyframe", frame_idx: nextFrame.index, label: `分镜 ${order + 1} 尾帧` } : null,
+    first_image: null,
+    last_image: null,
    subject: objects ? `关键元素候选：${objects}` : "保留原视频最重要的主体动作和构图关系。",
-    scene: `${frame.description?.scene || `参考第 ${order + 1} 个关键画面规划 SKG 信息流广告分镜。`}\n音频节奏依据：${audio.slice(0, 220)}`,
+    scene: `${frame.description?.scene || `按第 ${order + 1} 段音频规划 SKG 信息流广告分镜。`}\n音频节奏依据：${audio.slice(0, 220)}`,
    product: "把原素材里的产品/痛点转成 SKG 颈部/肩颈按摩仪表达，默认使用 SKG 四张产品角度图做产品真源。",
    action: frame.description?.style
      ? `沿用原画面的讲话节奏、动作节点和 ${frame.description.style}，突出使用前紧绷、使用后放松。`
@@ -929,7 +929,60 @@ function selectProductItemsForRow(row: AudioStoryboardRow, items: ProductRefItem
  return picked
 }

-function buildStoryboardSceneFromAudioRow(row: AudioStoryboardRow, frame: KeyFrame, nextFrame?: KeyFrame | null, productItems: ProductRefItem[] = []): StoryboardScene {
+// Turns up to 10 subject_assets of the chosen source element into "asset" image refs.
+function subjectAssetRefsForPlanning(source: { frame: KeyFrame; element: KeyElement } | null): ImageRef[] {
+  if (!source) return []
+  const { frame, element } = source
+  const views = element.subject_assets ?? []
+  return views.slice(0, 10).map((view) => ({
+    kind: "asset", frame_idx: frame.index, element_id: view.id, cutout_id: view.id,
+    label: view.label || view.view || "相似主体视图",
+  }))
+}
+
+// Returns the row's saved non-keyframe endpoint ref, else a ref to the last matching scene asset.
+function endpointAssetRef(frame: KeyFrame | null, role: "first_frame" | "last_frame"): ImageRef | null {
+  if (!frame) return null
+  const isFirst = role === "first_frame"
+  const saved = isFirst ? frame.storyboard?.first_image : frame.storyboard?.last_image
+  if (saved && saved.kind !== "keyframe") return saved
+  // filter() keeps array order, so pop() yields the last scene_assets entry with this role.
+  const asset = (frame.scene_assets ?? []).filter((item) => item.asset_role === role).pop()
+  if (!asset) return null
+  return {
+    kind: "asset", frame_idx: frame.index, element_id: asset.id, cutout_id: asset.id,
+    label: asset.label || (isFirst ? "首帧" : "尾帧"),
+  }
+}
+
+function buildEndpointFramePrompt(row: AudioStoryboardRow, role: "first_frame" | "last_frame", selectedProductItems: ProductRefItem[], subjectRefs: ImageRef[]) {
+  // Prompt lines joined by "\n": header, copy, shot type, both endpoint plans, visual plan, subject, product, output spec.
+  const [target, opposite] = role === "first_frame" ? [row.firstFramePlan, row.lastFramePlan] : [row.lastFramePlan, row.firstFramePlan]
+  const productNotes = selectedProductItems.length ? productReferenceNotes(selectedProductItems) : ""
+  const shotType = VISUAL_MODE_OPTIONS.find((item) => item.value === row.visualMode)?.label ?? row.visualMode
+  const lines = [`分镜 ${row.index + 1} ${role === "first_frame" ? "首帧" : "尾帧"}。`, `新口播文案：${row.skgCopy}`, `镜头类型：${shotType}。`]
+  lines.push(`当前要生成的画面：${target}`, `另一端画面用于连续性参考：${opposite}`, `画面规划：${row.visualPlan}`)
+  lines.push(
+    row.needsSubject
+      ? `人物主体：必须使用已生成的相似主体白底视图作为人物真源；已提供 ${subjectRefs.length} 张主体参考。不要回到原视频关键帧复刻人物。`
+      : "本条不需要主角人物；如出现人物，只能是局部手部、背影或环境人物，不要生成透明骨架主角。",
+  )
+  lines.push(
+    row.needsProduct
+      ? `产品融入：${row.productPlacement}。${row.productIntegration}。已提供 ${selectedProductItems.length} 张同一 SKG 肩颈按摩仪产品参考；${productNotes}。产品是套在脖子上的 U 形肩颈按摩仪，必须保持真实佩戴大小、左右非对称和贴颈位置。`
+      : "本条不露出产品，不要强行生成 SKG 产品、包装、白底图或随机商品。",
+  )
+  lines.push("输出一张单独的 9:16 高清首/尾帧，不要拼图，不要字幕，不要平台 UI，不要水印。画面要能作为后续视频生成的明确起止帧。")
+  return lines.join("\n")
+}
+
+function buildStoryboardSceneFromAudioRow(
+  row: AudioStoryboardRow,
+  frame: KeyFrame,
+  productItems: ProductRefItem[] = [],
+  subjectRefs: ImageRef[] = [],
+  endpointRefs: { firstImage?: ImageRef | null; lastImage?: ImageRef | null } = {},
+): StoryboardScene {
  const selectedProductItems = row.needsProduct ? selectProductItemsForRow(row, productItems) : []
  const productRefs = selectedProductItems.map((item) => item.ref)
  const notes = productReferenceNotes(selectedProductItems)
@@ -940,8 +993,8 @@ function buildStoryboardSceneFromAudioRow(row: AudioStoryboardRow, frame: KeyFra
    : "未上传产品图时使用默认 SKG 产品图；生成前建议先建立同一产品素材池，锁定左右差异、厚度和佩戴比例。"
  return {
    duration: Number(Math.max(3.2, Math.min(6.5, row.end - row.start || 4.5)).toFixed(1)),
-    first_image: { kind: "keyframe", frame_idx: frame.index, label: `分镜 ${row.index + 1} 参考帧` },
-    last_image: nextFrame ? { kind: "keyframe", frame_idx: nextFrame.index, label: `分镜 ${row.index + 1} 尾帧` } : null,
+    first_image: endpointRefs.firstImage ?? null,
+    last_image: endpointRefs.lastImage ?? null,
    visual_mode: row.visualMode,
    needs_product: row.needsProduct,
    needs_subject: row.needsSubject,
@@ -950,7 +1003,11 @@ function buildStoryboardSceneFromAudioRow(row: AudioStoryboardRow, frame: KeyFra
    product_placement: row.productPlacement,
    product_images: productRefs,
    product_image: productRefs[0] ?? null,
-    subject: row.needsSubject ? row.keyElements : "本条不需要人物主体或相似主体参考；如画面里出现人物，只作为背景或局部，不作为主角。",
+    subject_images: row.needsSubject ? subjectRefs : [],
+    subject_image: row.needsSubject ? subjectRefs[0] ?? null : null,
+    subject: row.needsSubject
+      ? `${row.keyElements}\n主体真源：使用已生成的相似主体白底视图，共 ${subjectRefs.length} 张；关键帧只用于前置主体提取，不作为后续视频首尾帧参考。`
+      : "本条不需要人物主体或相似主体参考；如画面里出现人物，只作为背景或局部，不作为主角。",
    scene: `镜头类型：${VISUAL_MODE_OPTIONS.find((item) => item.value === row.visualMode)?.label ?? row.visualMode}\n${row.visualPlan}\n首帧规划：${row.firstFramePlan}\n尾帧规划：${row.lastFramePlan}\n原音频依据：${row.source}`,
    product: `产品需求：${row.needsProduct ? "需要产品参考" : "本条不需要产品"}\n产品出现方式：${row.productPlacement}\n${row.needsProduct ? row.productIntegration : "本条以情绪、人物状态、空间或节奏过渡为主，不露出产品。"}\n${productGuidance}`,
    action: `${row.skgCopy}\n连续动作：从首帧规划自然过渡到尾帧规划，镜头类型和产品/人物需求不能中途改变。`,
@@ -988,6 +1045,9 @@ export function AdRecreationBoard({
  const visualReady = (job?.frames.length ?? 0) > 0
  const subjectAssetCount = countSubjectAssetViews(job)
  const productAssetCount = job?.product_refs?.length ?? 0
+  const statusMessage = job?.message?.startsWith("视频生成已提交")
+    ? "历史候选视频已保留；当前已暂停直接提交视频，先逐条生成并审核首尾帧。"
+    : job?.message

  useEffect(() => {
    setDraftSegments([])
@@ -1203,8 +1263,8 @@ export function AdRecreationBoard({
                    <span className="font-mono text-[12px] text-white/36">02</span>
                    <h2 className="text-[15px] font-semibold leading-tight text-white">源视频解析与参考帧</h2>
                  </div>
-                  <div className="mt-1 truncate text-[11px] text-white/38" title={job?.message}>
-                    {job?.message || "下载源视频后解析音频，再抽参考帧并生成相似主体。"}
+                  <div className="mt-1 truncate text-[11px] text-white/38" title={statusMessage}>
+                    {statusMessage || "下载源视频后解析音频，再抽参考帧并生成相似主体。"}
                  </div>
                </div>
                <div className="flex shrink-0 items-center gap-2">
@@ -1260,7 +1320,6 @@ export function AdRecreationBoard({
                job={job}
                selectedFrames={data.selectedFrames}
                onJobUpdate={data.onJobUpdate}
-                onGenerateVideo={onGenerateVideo}
                runtimeModels={runtimeModels}
              />
            </div>
@@ -2157,17 +2216,16 @@ function AudioStoryboardPlanPanel({
  job,
  selectedFrames,
  onJobUpdate,
-  onGenerateVideo,
  runtimeModels,
 }: {
  job: Job | null
  selectedFrames: Set<number>
  onJobUpdate?: (job: Job) => void
-  onGenerateVideo?: (frameIdx: number, scene: StoryboardScene, model: string) => Promise<void> | void
  runtimeModels?: RuntimeModels
 }) {
-  const [videoBusyRow, setVideoBusyRow] = useState<number | null>(null)
-  const [batchVideoBusy, setBatchVideoBusy] = useState(false)
+  const [storyboardSaveBusyRow, setStoryboardSaveBusyRow] = useState<number | null>(null)
+  const [batchStoryboardSaveBusy, setBatchStoryboardSaveBusy] = useState(false)
+  const [endpointFrameBusy, setEndpointFrameBusy] = useState<string | null>(null)
  const [productItems, setProductItems] = useState<ProductRefItem[]>([])
  const [productUploading, setProductUploading] = useState(false)
  const [productAnalyzing, setProductAnalyzing] = useState(false)
@@ -2185,6 +2243,11 @@ function AudioStoryboardPlanPanel({
    [orderedFrames, selectedFrames],
  )
  const rowReferencePool = selectedReferenceFrames.length ? selectedReferenceFrames : orderedFrames
+  const similarActorSource = useMemo(
+    () => findSimilarActorSource(selectedReferenceFrames, orderedFrames),
+    [selectedReferenceFrames, orderedFrames],
+  )
+  const subjectRefs = useMemo(() => subjectAssetRefsForPlanning(similarActorSource), [similarActorSource])

  useEffect(() => {
    setProductItems((job?.product_refs ?? []).map(normalizeStoredProductItem))
@@ -2463,56 +2526,100 @@ function AudioStoryboardPlanPanel({
    }
  }

-  const submitRowVideo = async (row: AudioStoryboardRow, frame: KeyFrame) => {
-    if (!job || !onGenerateVideo) return
-    const nextFrame = orderedFrames.find((item) => item.timestamp > frame.timestamp) ?? null
+  const saveRowStoryboardDraft = async (row: AudioStoryboardRow, frame: KeyFrame) => {
+    if (!job) return
    const plannedRow = { ...planForRow(row, frame), skgCopy: copyForRow(row) }
-    const scene = buildStoryboardSceneFromAudioRow(plannedRow, frame, nextFrame, productItems)
+    const scene = buildStoryboardSceneFromAudioRow(plannedRow, frame, productItems, subjectRefs, {
+      firstImage: endpointAssetRef(frame, "first_frame"),
+      lastImage: endpointAssetRef(frame, "last_frame"),
+    })
    const updated = await updateStoryboard(job.id, frame.index, scene)
    onJobUpdate?.(updated)
-    await onGenerateVideo(frame.index, scene, "seedance")
  }

-  const generateRowVideo = async (row: AudioStoryboardRow, frame: KeyFrame | null) => {
-    if (!job || !frame || !onGenerateVideo) return
-    setVideoBusyRow(row.index)
+  const generateEndpointFrameForRow = async (row: AudioStoryboardRow, frame: KeyFrame | null, role: "first_frame" | "last_frame") => {
+    if (!job || !frame) return
+    const plannedRow = { ...planForRow(row, frame), skgCopy: copyForRow(row) }
+    if (plannedRow.needsSubject && !subjectRefs.length) {
+      toast.warning("先在上方生成相似主体白底视图，再生成首尾帧")
+      return
+    }
+    if (plannedRow.needsProduct && !productItems.length) {
+      toast.warning("本条需要产品，请先上传并识别产品素材池")
+      return
+    }
+    const selectedProductItems = plannedRow.needsProduct ? selectProductItemsForRow(plannedRow, productItems) : []
+    const busyKey = `${row.index}:${role}`
+    setEndpointFrameBusy(busyKey)
    try {
-      await submitRowVideo(row, frame)
+      await saveRowStoryboardDraft(plannedRow, frame)
+      const updated = await generateSceneAsset(job.id, frame.index, {
+        size: SUBJECT_ASSET_SIZE,
+        scene_mode: "similar",
+        scene_style: "premium_product",
+        asset_role: role,
+        prompt: buildEndpointFramePrompt(plannedRow, role, selectedProductItems, subjectRefs),
+        subject_images: plannedRow.needsSubject ? subjectRefs : [],
+        product_images: selectedProductItems.map((item) => item.ref),
+        source_frame_indices: [],
+      })
+      const updatedFrame = updated.frames.find((item) => item.index === frame.index) ?? frame
+      const generatedRef = endpointAssetRef(updatedFrame, role)
+      const scene = buildStoryboardSceneFromAudioRow(plannedRow, updatedFrame, productItems, subjectRefs, {
+        firstImage: role === "first_frame" ? generatedRef : endpointAssetRef(updatedFrame, "first_frame"),
+        lastImage: role === "last_frame" ? generatedRef : endpointAssetRef(updatedFrame, "last_frame"),
+      })
+      const saved = await updateStoryboard(job.id, frame.index, scene)
+      onJobUpdate?.(saved)
+      toast.success(`分镜 ${row.index + 1} ${role === "first_frame" ? "首帧" : "尾帧"}已生成`)
    } catch (e) {
-      toast.error("生成本条视频失败：" + (e instanceof Error ? e.message : String(e)))
+      toast.error(`${role === "first_frame" ? "首帧" : "尾帧"}生成失败：` + (e instanceof Error ? e.message : String(e)))
    } finally {
-      setVideoBusyRow(null)
+      setEndpointFrameBusy(null)
    }
  }

-  const generateAllRowVideos = async () => {
-    if (!job || !onGenerateVideo || !rows.length) return
+  const saveSingleRowStoryboardDraft = async (row: AudioStoryboardRow, frame: KeyFrame | null) => {
+    if (!job || !frame) return
+    setStoryboardSaveBusyRow(row.index)
+    try {
+      await saveRowStoryboardDraft(row, frame)
+      toast.success("已保存本条分镜规划；视频生成入口已暂停，等待首尾帧资产")
+    } catch (e) {
+      toast.error("保存本条规划失败：" + (e instanceof Error ? e.message : String(e)))
+    } finally {
+      setStoryboardSaveBusyRow(null)
+    }
+  }
+
+  const saveAllStoryboardDrafts = async () => {
+    if (!job || !rows.length) return
    const jobsToSubmit = rows
      .map((row) => ({ row: planForRow(row, referenceFrameForRow(row)), frame: referenceFrameForRow(row) }))
      .filter((item): item is { row: AudioStoryboardRow; frame: KeyFrame } => !!item.frame)
    if (!jobsToSubmit.length) {
-      toast.warning("先完成自动抽帧，或在原版视频上手动补参考帧")
+      toast.warning("先完成前置抽帧，让每条分镜有可保存的承载位置")
      return
    }
-    setBatchVideoBusy(true)
+    setBatchStoryboardSaveBusy(true)
    let ok = 0
    let failed = 0
    try {
      for (const item of jobsToSubmit) {
-        setVideoBusyRow(item.row.index)
+        setStoryboardSaveBusyRow(item.row.index)
        try {
-          await submitRowVideo(item.row, item.frame)
+          await saveRowStoryboardDraft(item.row, item.frame)
          ok += 1
        } catch (e) {
          failed += 1
-          console.warn("批量提交分镜失败", item.row.index, e)
+          console.warn("批量保存分镜规划失败", item.row.index, e)
        }
      }
-      if (failed) toast.warning(`已提交 ${ok} 条，${failed} 条失败`)
-      else toast.success(`已提交全部 ${ok} 条分镜视频`)
+      if (failed) toast.warning(`已保存 ${ok} 条规划，${failed} 条失败`)
+      else toast.success(`已保存全部 ${ok} 条分镜规划；视频生成入口已暂停`)
    } finally {
-      setVideoBusyRow(null)
-      setBatchVideoBusy(false)
+      setStoryboardSaveBusyRow(null)
+      setBatchStoryboardSaveBusy(false)
    }
  }

@@ -2528,7 +2635,7 @@ function AudioStoryboardPlanPanel({
        <div className="grid shrink-0 grid-cols-3 gap-2 text-[11px] text-white/45">
          <Requirement label="分镜" ready={rows.length > 0} detail={rows.length ? `${rows.length} 条` : "待音频"} />
          <Requirement label="参考帧" ready={orderedFrames.length > 0} detail={orderedFrames.length ? `${orderedFrames.length} 张` : "待抽帧"} />
-          <Requirement label="生成" ready={(job.generated_videos?.length ?? 0) > 0} detail={`${job.generated_videos?.length ?? 0} 条`} />
+          <Requirement label="候选" ready={(job.generated_videos?.length ?? 0) > 0} detail={`${job.generated_videos?.length ?? 0} 条历史`} />
        </div>
      </div>

@@ -2636,12 +2743,12 @@ function AudioStoryboardPlanPanel({
            </button>
            <button
              type="button"
-              onClick={() => void generateAllRowVideos()}
-              disabled={batchVideoBusy || !onGenerateVideo || !rows.length || !orderedFrames.length}
-              className="inline-flex h-9 items-center justify-center gap-1 rounded-md bg-rose-600 px-2.5 text-[11px] font-semibold text-white transition hover:bg-rose-500 disabled:cursor-not-allowed disabled:opacity-40"
+              onClick={() => void saveAllStoryboardDrafts()}
+              disabled={batchStoryboardSaveBusy || !rows.length || !orderedFrames.length}
+              className="inline-flex h-9 items-center justify-center gap-1 rounded-md bg-white px-2.5 text-[11px] font-semibold text-black transition hover:bg-white/90 disabled:cursor-not-allowed disabled:opacity-40"
            >
-              {batchVideoBusy ? <Loader2 className="h-3.5 w-3.5 animate-spin" /> : <Play className="h-3.5 w-3.5" />}
-              一键提交全部
+              {batchStoryboardSaveBusy ? <Loader2 className="h-3.5 w-3.5 animate-spin" /> : <Check className="h-3.5 w-3.5" />}
+              保存全部规划
            </button>
          </div>
        </div>
@@ -2650,13 +2757,13 @@ function AudioStoryboardPlanPanel({
            const referenceFrame = referenceFrameForRow(row)
            const plannedRow = planForRow(row, referenceFrame)
            const rowVideos = videosForFrame(referenceFrame)
-            const generating = videoBusyRow === row.index
+            const savingStoryboard = storyboardSaveBusyRow === row.index
            const copyText = copyForRow(row)
            const selectedProductCount = plannedRow.needsProduct ? selectProductItemsForRow(plannedRow, productItems).length : 0
            return (
              <article
                key={row.index}
-                className="grid overflow-hidden rounded-md border border-white/10 bg-black/24 text-[11px] leading-snug text-white/64 xl:grid-cols-[56px_140px_minmax(220px,0.75fr)_minmax(240px,0.8fr)_minmax(320px,1fr)] 2xl:grid-cols-[58px_170px_minmax(360px,0.8fr)_minmax(380px,1fr)_520px]"
+                className="grid overflow-hidden rounded-md border border-white/10 bg-black/24 text-[11px] leading-snug text-white/64 xl:grid-cols-[54px_120px_minmax(170px,0.48fr)_minmax(420px,1.2fr)_360px] 2xl:grid-cols-[56px_140px_280px_minmax(560px,1fr)_420px]"
              >
                <StoryboardPlanCell label="分镜">
                  <div className="font-mono text-[11px] text-white/40">{row.start.toFixed(1)}-{row.end.toFixed(1)}s</div>
@@ -2744,8 +2851,37 @@ function AudioStoryboardPlanPanel({
                      placeholder="产品出现方式：不出现 / 首帧出现 / 尾帧出现 / 全程佩戴 / 产品特写"
                      className="min-h-[38px] w-full resize-y rounded border border-white/10 bg-black/32 px-2 py-1.5 text-[10.5px] leading-snug text-white/68 outline-none placeholder:text-white/25 focus:border-rose-300/45"
                    />
+                    <div className="grid gap-1.5 md:grid-cols-[minmax(0,1fr)_88px_88px]">
+                      <div className="rounded border border-white/10 bg-black/24 px-2 py-1.5 text-[10px] leading-snug text-white/42">
+                        <div className="mb-1 flex items-center justify-between gap-2">
+                          <span className="text-white/54">首尾帧闸门</span>
+                          <span className={endpointAssetRef(referenceFrame, "first_frame") && endpointAssetRef(referenceFrame, "last_frame") ? "text-emerald-100/75" : "text-amber-100/72"}>
+                            {endpointAssetRef(referenceFrame, "first_frame") && endpointAssetRef(referenceFrame, "last_frame") ? "可进入视频候选" : "先看图再生视频"}
+                          </span>
+                        </div>
+                        <p>
+                          关键帧只用于前置主体重构；这里用相似主体视图{plannedRow.needsProduct ? " + 产品素材池" : ""}生成首尾帧。
+                        </p>
+                      </div>
+                      <EndpointFrameSlot
+                        job={job}
+                        frame={referenceFrame}
+                        role="first_frame"
+                        busy={endpointFrameBusy === `${row.index}:first_frame`}
+                        disabled={!referenceFrame || (plannedRow.needsSubject && !subjectRefs.length) || (plannedRow.needsProduct && !productItems.length)}
+                        onGenerate={() => void generateEndpointFrameForRow(plannedRow, referenceFrame, "first_frame")}
+                      />
+                      <EndpointFrameSlot
+                        job={job}
+                        frame={referenceFrame}
+                        role="last_frame"
+                        busy={endpointFrameBusy === `${row.index}:last_frame`}
+                        disabled={!referenceFrame || (plannedRow.needsSubject && !subjectRefs.length) || (plannedRow.needsProduct && !productItems.length)}
+                        onGenerate={() => void generateEndpointFrameForRow(plannedRow, referenceFrame, "last_frame")}
+                      />
+                    </div>
                    <div className="flex items-center justify-between gap-2 text-[10px] text-white/34">
-                      <span>{plannedRow.needsProduct ? `将自动挑选 ${selectedProductCount || 0} 张产品参考图` : "本条不传产品图"}</span>
+                      <span>{plannedRow.needsSubject ? `主体视图 ${subjectRefs.length} 张` : "本条不传主体"} · {plannedRow.needsProduct ? `产品参考 ${selectedProductCount || 0} 张` : "本条不传产品图"}</span>
                      <button
                        type="button"
                        onClick={() => patchRowPlan(row.index, visualModeDefaults(plannedRow.visualMode))}
@@ -2757,23 +2893,32 @@ function AudioStoryboardPlanPanel({
                  </div>
                </StoryboardPlanCell>

-                <StoryboardPlanCell label="生成视频" className="xl:border-r-0">
-                  <StoryboardVideoSlots job={job} videos={rowVideos} enabled={!!referenceFrame} />
-                  <div className="mt-1 truncate text-[10px] text-white/34" title={referenceFrame ? `参考 ${referenceFrame.timestamp.toFixed(1)}s` : row.referencePlan}>
-                    {referenceFrame ? `参考 ${referenceFrame.timestamp.toFixed(1)}s · 可多次生成候选` : "先在关键帧区自动抽帧 12 张"}
+                <StoryboardPlanCell label="视频候选 / 待生成" className="xl:border-r-0">
+                  <StoryboardVideoSlots
+                    job={job}
+                    videos={rowVideos}
+                    enabled={!!endpointAssetRef(referenceFrame, "first_frame") && !!endpointAssetRef(referenceFrame, "last_frame")}
+                  />
+                  <div className="mt-1 truncate text-[10px] text-white/34" title="视频生成已暂停，首尾帧确认后再开放单条提交">
+                    {endpointAssetRef(referenceFrame, "first_frame") && endpointAssetRef(referenceFrame, "last_frame")
+                      ? "首尾帧已就绪 · 待开放单条视频提交"
+                      : "先生成并确认首帧 / 尾帧"}
                  </div>
                  <div className="mt-1 flex items-center justify-between gap-2">
-                    <span className="text-[10px] text-white/34">生视频模型</span>
-                    <ModelTrace trace={videoModelTrace(runtimeModels, "seedance")} compact />
+                    <span className="text-[10px] text-white/34">视频生成</span>
+                    <span className="rounded border border-amber-300/18 bg-amber-300/[0.07] px-1.5 py-0.5 text-[10px] text-amber-100/70">已暂停</span>
+                  </div>
+                  <div className="mt-1 rounded border border-amber-300/12 bg-amber-300/[0.045] px-2 py-1 text-[10px] leading-snug text-amber-100/62">
+                    先保存画面规划；等 SKG 首帧/尾帧资产确认后再开放单条视频提交。
                  </div>
                  <button
                    type="button"
-                    onClick={() => generateRowVideo(plannedRow, referenceFrame)}
-                    disabled={!referenceFrame || !onGenerateVideo || generating}
+                    onClick={() => void saveSingleRowStoryboardDraft(plannedRow, referenceFrame)}
+                    disabled={!referenceFrame || savingStoryboard}
                    className="mt-1.5 inline-flex h-8 w-full items-center justify-center gap-1 rounded-md bg-white px-2 text-[11px] font-semibold text-black transition hover:bg-white/90 disabled:cursor-not-allowed disabled:opacity-40"
                  >
-                    {generating ? <Loader2 className="h-3.5 w-3.5 animate-spin" /> : <Play className="h-3.5 w-3.5" />}
-                    生成本条 · Seedance
+                    {savingStoryboard ? <Loader2 className="h-3.5 w-3.5 animate-spin" /> : <Check className="h-3.5 w-3.5" />}
+                    保存本条规划
                  </button>
                </StoryboardPlanCell>
              </article>
@@ -2782,7 +2927,7 @@ function AudioStoryboardPlanPanel({
        </div>
        </>
      ) : (
-        <EmptyState text="音频解析完成后，这里会按逐句时间轴生成信息流复刻分镜工作台。先在关键帧区自动抽帧 12 张并选择主角参考，再按分镜生成视频候选。" />
+        <EmptyState text="音频解析完成后，这里会按逐句时间轴生成信息流复刻分镜工作台。先抽帧并生成相似主体，再逐条规划首尾帧。" />
      )}
    </section>
  )
@@ -2951,7 +3096,7 @@ function StoryboardVideoSlots({ job, videos, enabled }: { job: Job; videos: Gene
        ))}
        {Array.from({ length: emptyCount }).map((_, index) => (
          <div key={`empty-video-${index}`} className="flex aspect-[9/16] min-h-[86px] min-w-0 items-center justify-center rounded border border-dashed border-white/12 bg-black/25 px-1 text-center text-[9.5px] leading-tight text-white/26">
-            {enabled ? `候选 ${visible.length + index + 1}` : "先抽 12 帧"}
+            {enabled ? `候选 ${visible.length + index + 1}` : "待首尾帧"}
          </div>
        ))}
      </div>
@@ -2962,6 +3107,53 @@ function StoryboardVideoSlots({ job, videos, enabled }: { job: Job; videos: Gene
  )
 }

+function EndpointFrameSlot({ // Preview tile plus generate/regenerate button for one endpoint frame (first or last).
+  job,
+  frame,
+  role,
+  busy,
+  disabled,
+  onGenerate,
+}: {
+  job: Job // only job.id is read here, to resolve the preview image URL
+  frame: KeyFrame | null // frame whose endpoint asset is looked up; null is passed straight to endpointAssetRef
+  role: "first_frame" | "last_frame" // which endpoint asset to look up; also selects the label text
+  busy: boolean // true shows the spinner overlay and disables the button
+  disabled: boolean // caller-side gate; the button is disabled when this OR busy is true
+  onGenerate: () => void // click handler of the generate/regenerate button
+}) {
+  const ref = endpointAssetRef(frame, role) // helper defined elsewhere; presumably falsy when no asset exists for this role (verify)
+  const src = ref ? resolveImageRefUrl(job.id, ref) : "" // "" means no image: placeholder is rendered and the button reads "generate" instead of "regenerate"
+  const label = role === "first_frame" ? "首帧" : "尾帧" // UI label: "首帧" = first frame, "尾帧" = last frame
+  return (
+    <div className="overflow-hidden rounded border border-white/10 bg-black/32">
+      <div className="relative flex aspect-[9/16] min-h-[112px] items-center justify-center bg-black">
+        {src ? (
+          <a href={src} target="_blank" rel="noreferrer" className="group h-full w-full">
+            <img src={src} alt={`${label}资产`} className="h-full w-full object-contain transition group-hover:scale-[1.02]" />
+          </a>
+        ) : (
+          <div className="px-2 text-center text-[10px] leading-snug text-white/28">先生成{label}</div>
+        )}
+        {busy && (
+          <div className="absolute inset-0 flex items-center justify-center bg-black/65">
+            <Loader2 className="h-4 w-4 animate-spin text-white/80" />
+          </div>
+        )}
+      </div>
+      <button
+        type="button"
+        onClick={onGenerate}
+        disabled={disabled || busy}
+        className="flex h-7 w-full items-center justify-center gap-1 border-t border-white/10 bg-white/[0.045] px-1 text-[10px] font-semibold text-white/62 transition hover:bg-white/[0.09] hover:text-white disabled:cursor-not-allowed disabled:opacity-35"
+      >
+        {busy ? <Loader2 className="h-3 w-3 animate-spin" /> : <Sparkles className="h-3 w-3" />}
+        {src ? `重生${label}` : `生成${label}`}
+      </button>
+    </div>
+  )
+}
+
 function StoryboardVideoPreview({ job, video, className = "h-20 w-12" }: { job: Job; video: GeneratedVideo; className?: string }) {
  const src = videoSrc(video)
  const poster = videoPoster(job, video)
--- a/web/lib/api.ts
+++ b/web/lib/api.ts
@@ -117,6 +117,7 @@ export interface StoryboardScene {
  first_image?: ImageRef | null
  last_image?: ImageRef | null
  product_images?: ImageRef[]
+  subject_images?: ImageRef[]
  product_fusion_shots?: ProductFusionShot[]
  visual_mode?: "person_only" | "person_product" | "product_only" | "environment"
  needs_product?: boolean
@@ -1049,6 +1050,8 @@ export async function generateSceneAsset(
    asset_role?: SceneAssetRole
    prompt?: string
    source_frame_indices?: number[]
+    subject_images?: ImageRef[]
+    product_images?: ImageRef[]
  } = {},
 ): Promise<Job> {
  const res = await fetch(`${API_BASE}/jobs/${jobId}/frames/${frameIdx}/scene-asset`, {
@@ -1062,6 +1065,8 @@ export async function generateSceneAsset(
      asset_role: body.asset_role ?? "scene",
      prompt: body.prompt ?? "",
      source_frame_indices: body.source_frame_indices ?? null,
+      subject_images: body.subject_images ?? [],
+      product_images: body.product_images ?? [],
    }),
  })
  if (!res.ok) {