auto-save 2026-05-14 10:08 (~4)

2026-05-14 10:09:11 +08:00
parent e45c1d5c5b
commit 96784f9df1
4 changed files with 2560 additions and 2374 deletions
--- a/api/main.py
+++ b/api/main.py
@@ -92,7 +92,7 @@ JobStatus = Literal[
 ]

 KEYFRAME_COUNT = int(os.getenv("KEYFRAME_COUNT", "5"))
-FrameExtractTarget = Literal["balanced", "subject", "transition", "expression", "motion"]
+FrameExtractTarget = Literal["transparent_human", "balanced", "subject", "transition", "expression", "motion"]
 FrameExtractMode = Literal["replace", "append"]
 FrameExtractQuality = Literal["auto", "fast", "accurate", "ultra"]
 AnalyzeTask = tuple[str, int, FrameExtractTarget, FrameExtractMode, FrameExtractQuality]
@@ -104,12 +104,38 @@ SubjectView = str
 SceneMode = Literal["remove_subject", "similar", "style"]
 SceneStyle = Literal["source", "premium_product", "clean_studio", "warm_lifestyle", "cinematic"]
 FRAME_TARGET_LABELS: dict[FrameExtractTarget, str] = {
+    "transparent_human": "透明骨架人",
    "balanced": "综合关键帧",
    "subject": "清晰主体",
    "transition": "转场变化",
    "expression": "表情瞬间",
    "motion": "动作峰值",
 }
+
+TRANSPARENT_HUMAN_POSITIVE_PROMPT = (  # Bilingual (English + Chinese) description of the wanted subject; spliced into the frame-scoring, frame-description and subject-asset prompts.
+    "Target subject: transparent human character, translucent human body, glass-like human body, clear acrylic skin, "
+    "transparent vinyl skin, visible clean white skeleton inside, skeleton visible inside transparent body, "
+    "white bones inside clear body, non-horror skeleton character, friendly transparent humanoid, 3D commercial character, "
+    "premium wellness character, transparent body with visible spine, transparent body with visible rib cage. "
+    "中文目标：透明人体、半透明人体、玻璃人体、亚克力人体、果冻质感人体、外层透明皮肤、身体内部可见骨架、"
+    "透明身体里的白色骨骼、干净白色骨架、非恐怖骷髅人、3D广告角色、透明骨架人、可见脊柱、可见肋骨、"
+    "可见颈椎、可见骨盆、可见四肢骨骼、透明皮肤包裹骨架。"
+)
+TRANSPARENT_HUMAN_NEGATIVE_PROMPT = (  # Bilingual list of content to exclude; every prompt that uses it also includes the positive prompt.
+    "Avoid: normal human, ordinary skeleton, skeleton only without transparent body, horror skeleton, gore, blood, corpse, "
+    "zombie, organs, veins, autopsy, surgery, hospital, dark horror scene, blurry person, heavily occluded person, "
+    "person too small, product only, background only, no visible skeleton, no transparent body, transparent clothing only. "
+    "反向排除：普通真人、普通骷髅、只有骨架没有透明外壳、恐怖骷髅、血腥、腐烂、僵尸、尸体、器官、血管、"
+    "解剖、医院、手术、黑暗恐怖场景、模糊人物、遮挡严重、人物太远、只有产品没有人、只有背景没有人、"
+    "看不到骨架、看不到透明身体、透明衣服但不是透明身体。"
+)
+TRANSPARENT_HUMAN_QUALIFIED_STANDARD = (  # Seven acceptance conditions for a frame; used in the frame-scoring and frame-description prompts.
+    "A qualified frame must satisfy all core conditions: 1) there is a humanoid character; "
+    "2) the outer body is transparent or translucent; 3) a clean white skeleton is clearly visible inside the body; "
+    "4) the transparent body and inner skeleton belong to the same character, not a background overlay; "
+    "5) the character should occupy at least about 35% of frame height and be easy to inspect; "
+    "6) no severe blur, occlusion, or deformation; 7) clean premium commercial wellness style, non-horror."
+)
 FRAME_QUALITY_LABELS: dict[FrameExtractQuality, str] = {
    "auto": "自动",
    "fast": "快速",
@@ -190,6 +216,19 @@ class QualityReport(BaseModel):
    warnings: list[str] = Field(default_factory=list)


+class TransparentHumanFrameScore(BaseModel):  # Per-frame rubric result built by _score_transparent_human_frame.
+    transparent_body_score: int = 0  # 0-25: transparent/translucent outer body shell
+    skeleton_visible_score: int = 0  # 0-25: clean white skeleton visible inside the body
+    human_prominence_score: int = 0  # 0-15: character centred / large / easy to identify
+    clarity_score: int = 0  # 0-15: no severe motion blur, occlusion or deformation
+    commercial_style_score: int = 0  # 0-10: clean premium, non-horror advertising style
+    product_usefulness_score: int = 0  # 0-10: usefulness for later product video generation
+    total_score: int = 0  # sum of the six sub-scores above (maximum 100)
+    qualified: bool = False  # set only when the model approves and every score threshold is met
+    reject_reason: str = ""  # rejection explanation: model-supplied, or a generated fallback
+    notes: str = ""  # free-form remarks returned by the model
+
+
 class SceneAsset(BaseModel):
    id: str
    label: str = ""
@@ -280,6 +319,7 @@ class KeyFrame(BaseModel):
    timestamp: float
    url: str
    description: dict | None = None  # vision 模型识别结果 {scene, objects, style, suggested_prompt}
+    transparent_human_score: TransparentHumanFrameScore | None = None
    cleaned_url: str | None = None   # 清洗后干净版（待应用）→ /jobs/{id}/frames/{idx}/cleaned.jpg
    cleaned_applied: bool = False    # 是否已用清洗版替换原图（替换后 cleaned_url=null）
    quality_report: QualityReport | None = None
@@ -870,7 +910,11 @@ def _target_score(item: dict, target: FrameExtractTarget) -> float:
    scene = float(item.get("scene_score_n", 0.0))
    motion = float(item.get("motion_n", 0.0))

-    if target == "subject":
+    if target == "transparent_human":
+        # 透明骨架人仍先依赖本地清晰度 / 中心主体 / 对比度筛候选，
+        # 后续再交给 Vision 逐张语义验收。
+        score = center * 0.45 + sharp * 0.30 + contrast * 0.15 + color * 0.10
+    elif target == "subject":
        score = center * 0.48 + sharp * 0.25 + contrast * 0.17 + color * 0.10
    elif target == "transition":
        score = scene * 0.55 + sharp * 0.28 + contrast * 0.12 + color * 0.05
@@ -942,6 +986,100 @@ def _select_keyframes(candidates: list[dict], n: int, target: FrameExtractTarget
    return selected


+def _rank_keyframe_candidates(candidates: list[dict], target: FrameExtractTarget, limit: int, dup_threshold: int = 8) -> list[dict]:
+    """Score ``candidates`` in place for ``target`` and return at most ``limit`` of them, best first, without near-duplicates."""
+    if not candidates or limit <= 0:  # the append-then-check loop below would otherwise yield one frame for limit <= 0
+        return []
+    _attach_temporal_metrics(candidates)
+    _normalize_item_metrics(candidates)
+    for cand in candidates:
+        cand["score"] = _target_score(cand, target)
+    ranked: list[dict] = []
+    for cand in sorted(candidates, key=lambda c: -float(c.get("score", 0.0))):
+        if all((cand["hash"] - kept["hash"]) >= dup_threshold for kept in ranked):  # presumably a perceptual-hash distance — TODO confirm
+            ranked.append(cand)
+        if len(ranked) >= limit:
+            break
+    return ranked
+
+
+def _score_transparent_human_frame(img_path: Path) -> TransparentHumanFrameScore:
+    """Grade one extracted frame against the transparent-human rubric using the vision model.
+
+    Scoring failures (missing API key, request error, malformed reply) return an unqualified result instead of raising.
+    """
+    if not LLM_API_KEY:
+        return TransparentHumanFrameScore(qualified=False, reject_reason="LLM_API_KEY 未配置，无法进行透明骨架人语义验收")
+    img_b64 = base64.b64encode(img_path.read_bytes()).decode("ascii")
+    prompt = (
+        "You are a strict keyframe quality inspector for a SKG transparent-human video recreation workflow. "
+        + TRANSPARENT_HUMAN_POSITIVE_PROMPT + " "
+        + TRANSPARENT_HUMAN_NEGATIVE_PROMPT + " "
+        + TRANSPARENT_HUMAN_QUALIFIED_STANDARD + "\n\n"
+        "Score this single frame using exactly these dimensions:\n"
+        "- transparent_body_score: 0-25, clear transparent/translucent outer human body shell.\n"
+        "- skeleton_visible_score: 0-25, clean white skeleton clearly visible inside the body.\n"
+        "- human_prominence_score: 0-15, character centered/large/easy to identify, ideally >=35% frame height.\n"
+        "- clarity_score: 0-15, no severe motion blur, occlusion, or deformation.\n"
+        "- commercial_style_score: 0-10, clean premium non-horror advertising/wellness style.\n"
+        "- product_usefulness_score: 0-10, useful for later SKG product video generation; neck/shoulder/waist/eye/foot/knee area visible when relevant.\n"
+        "Reject if any of these is true: normal human only; ordinary skeleton only; product/background only; transparent person too far; severe blur; more than half occluded; horror/corpse/autopsy/surgery/hospital; unable to judge.\n"
+        "Output strict JSON only with keys: transparent_body_score, skeleton_visible_score, human_prominence_score, clarity_score, commercial_style_score, product_usefulness_score, qualified, reject_reason, notes."
+    )
+    try:
+        resp = llm().chat.completions.create(
+            model=VISION_MODEL,
+            messages=[{"role": "user", "content": [
+                {"type": "text", "text": prompt},
+                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
+            ]}],
+            response_format={"type": "json_object"},
+            temperature=0.1,
+            max_tokens=1200,
+        )
+        data = json.loads((resp.choices[0].message.content or "").strip())
+        if not isinstance(data, dict):  # valid JSON that is not an object has no fields to read below
+            raise ValueError("model reply is not a JSON object")
+    except Exception as e:
+        return TransparentHumanFrameScore(qualified=False, reject_reason=f"AI 评分失败：{e}")
+
+    def score(name: str, cap: int) -> int:  # missing / non-numeric / non-finite values count as 0; result is clamped to [0, cap]
+        try:
+            value = int(round(float(data.get(name, 0))))
+        except (TypeError, ValueError, OverflowError):
+            value = 0
+        return max(0, min(cap, value))
+
+    item = TransparentHumanFrameScore(
+        transparent_body_score=score("transparent_body_score", 25),
+        skeleton_visible_score=score("skeleton_visible_score", 25),
+        human_prominence_score=score("human_prominence_score", 15),
+        clarity_score=score("clarity_score", 15),
+        commercial_style_score=score("commercial_style_score", 10),
+        product_usefulness_score=score("product_usefulness_score", 10),
+        reject_reason=str(data.get("reject_reason", "") or ""),
+        notes=str(data.get("notes", "") or ""),
+    )
+    sub_scores = (item.transparent_body_score, item.skeleton_visible_score, item.human_prominence_score,
+                  item.clarity_score, item.commercial_style_score, item.product_usefulness_score)
+    item.total_score = sum(sub_scores)
+    # bool("false") is True, so a verdict spelled as a negative string must not count as approval; the score floors below still apply.
+    verdict = data.get("qualified")
+    approved = bool(verdict) and str(verdict).strip().lower() not in {"false", "0", "no", "null", "none"}
+    item.qualified = approved and (
+        item.transparent_body_score >= 18
+        and item.skeleton_visible_score >= 18
+        and item.human_prominence_score >= 8
+        and item.clarity_score >= 8
+        and item.commercial_style_score >= 6
+        and item.product_usefulness_score >= 4
+        and item.total_score >= 72
+    )
+    if not item.qualified and not item.reject_reason:
+        item.reject_reason = f"透明骨架人评分不足，总分 {item.total_score}/100"
+    return item
+
+
 def ffprobe_meta(mp4: Path) -> dict:
    out = run([
        "ffprobe", "-v", "error", "-print_format", "json", "-show_streams", "-show_format", str(mp4),
@@ -989,7 +1127,7 @@ async def pipeline_download(job_id: str) -> None:
 async def pipeline_analyze(
    job_id: str,
    frame_count: int = KEYFRAME_COUNT,
-    target: FrameExtractTarget = "balanced",
+    target: FrameExtractTarget = "transparent_human",
    mode: FrameExtractMode = "replace",
    quality: FrameExtractQuality = "auto",
 ) -> None:
@@ -1053,17 +1191,25 @@ async def pipeline_analyze(
        if not candidates:
            raise RuntimeError("候选帧评分失败")

-        # 2) 目标化筛选：pHash 去重 + 清晰度 / 中心细节 / 转场变化 / 动作强度 + 时序分桶。
-        selection_count = n if replacing else min(len(candidates), max(n * 4, n + len(existing_frames) + 2))
-        update(job, message=f"{quality_label}筛选 · {target_label} · {n} / {len(candidates)} 张…", progress=60)
-        chosen = _select_keyframes(candidates, selection_count, target)
+        # 2) 目标化筛选：pHash 去重 + 清晰度 / 中心细节 / 转场变化 / 动作强度。
+        # 透明骨架人目标会先扩大候选池，再用 Vision 逐张验收；不合格自动换下一帧。
+        semantic_transparent = target == "transparent_human"
+        if semantic_transparent:
+            selection_count = min(len(candidates), min(max(n * 10, 24), 48))
+            update(job, message=f"{quality_label}筛选透明骨架人候选 · 本地 {selection_count} / {len(candidates)} 张…", progress=58)
+            chosen = _rank_keyframe_candidates(candidates, target, selection_count)
+        else:
+            selection_count = n if replacing else min(len(candidates), max(n * 4, n + len(existing_frames) + 2))
+            update(job, message=f"{quality_label}筛选 · {target_label} · {n} / {len(candidates)} 张…", progress=60)
+            chosen = _select_keyframes(candidates, selection_count, target)

        # 3) 只对最终选中的时间点，从原视频抽高质量关键帧。
        renamed: list[KeyFrame] = []
-        chosen_sorted = sorted(chosen, key=lambda it: float(it["timestamp"]))
+        chosen_sorted = chosen if semantic_transparent else sorted(chosen, key=lambda it: float(it["timestamp"]))
        existing_timestamps = [float(f.timestamp) for f in existing_frames]
        next_idx = max((int(f.index) for f in existing_frames), default=-1) + 1
-        for item in chosen_sorted:
+        rejected_by_ai = 0
+        for attempt, item in enumerate(chosen_sorted, start=1):
            if len(renamed) >= n:
                break
            t = float(item["timestamp"])
@@ -1077,25 +1223,53 @@ async def pipeline_analyze(
                "-pix_fmt", "yuvj420p", "-q:v", "3",
                str(dst),
            ])
+            transparent_score: TransparentHumanFrameScore | None = None
+            if semantic_transparent:
+                update(
+                    job,
+                    message=f"AI 验收透明骨架人 · 已通过 {len(renamed)}/{n} · 候选 {attempt}/{len(chosen_sorted)}…",
+                    progress=min(68, 60 + int(attempt / max(1, len(chosen_sorted)) * 8)),
+                )
+                transparent_score = _score_transparent_human_frame(dst)
+                if not transparent_score.qualified:
+                    rejected_by_ai += 1
+                    try:
+                        dst.unlink()
+                    except OSError:
+                        pass
+                    reason = transparent_score.reject_reason or f"总分 {transparent_score.total_score}/100"
+                    update(job, message=f"AI 退回候选帧 · {reason[:48]} · 自动换下一帧", progress=65)
+                    continue
            renamed.append(KeyFrame(
                index=idx,
                timestamp=round(t, 2),
                url=f"/jobs/{job_id}/frames/{idx}.jpg",
+                transparent_human_score=transparent_score,
            ))
            existing_timestamps.append(t)

+        if semantic_transparent and not renamed:
+            raise RuntimeError("AI 未找到合格透明骨架人帧：需要透明/半透明人体外壳 + 清楚白色骨架 + 非恐怖广告感")
+
        # 4) 清理扫描目录
        shutil.rmtree(scan_dir, ignore_errors=True)

        merged_frames = sorted(existing_frames + renamed, key=lambda f: f.timestamp)
        action_label = "追加" if not replacing else "抽取"

+        final_message = (
+            f"已按「{quality_label} · {target_label}」AI验收 {action_label} {len(renamed)} 张"
+            + (f" · 退回 {rejected_by_ai} 张" if semantic_transparent else "")
+            + f" · 共 {len(merged_frames)} 张"
+        ) if semantic_transparent else (
+            f"已按「{quality_label} · {target_label}」{action_label} {len(renamed)} 张关键帧 · 共 {len(merged_frames)} 张"
+        )
        update(
            job,
            status="frames_extracted",
            frames=merged_frames,
            progress=70,
-            message=f"已按「{quality_label} · {target_label}」{action_label} {len(renamed)} 张关键帧 · 共 {len(merged_frames)} 张",
+            message=final_message,
        )

    except Exception as e:
@@ -1486,7 +1660,7 @@ async def trigger_analyze(
    job_id: str,
    bg: BackgroundTasks,
    frames: int = KEYFRAME_COUNT,
-    target: FrameExtractTarget = "balanced",
+    target: FrameExtractTarget = "transparent_human",
    mode: FrameExtractMode = "replace",
    quality: FrameExtractQuality = "auto",
 ) -> Job:
@@ -1792,9 +1966,14 @@ def describe_frame(job_id: str, idx: int) -> Job:
        '  "scene": "一句话描述场景",\n'
        '  "objects": [{"name": "物体名（中文）", "position": "在画面哪里", "color": "颜色", "extract_prompt": "用于提取该元素的英文 prompt"}],\n'
        '  "style": "整体风格 / 打光 / 色调（一句话）",\n'
-        '  "suggested_prompt": "适合用作下游生图的完整英文 prompt"\n'
+        '  "suggested_prompt": "适合用作下游生图的完整英文 prompt",\n'
+        '  "transparent_human_assessment": {"transparent_body_score": 0, "skeleton_visible_score": 0, "human_prominence_score": 0, "clarity_score": 0, "commercial_style_score": 0, "product_usefulness_score": 0, "qualified": false, "reject_reason": "如果不合格说明原因"}\n'
        '}\n'
        "要求：objects 列出 3-8 个画面里**可独立提取**的主要元素，extract_prompt 用于后续 image edit 模型。"
+        "transparent_human_assessment 按透明骨架人标准评分："
+        + TRANSPARENT_HUMAN_POSITIVE_PROMPT + " "
+        + TRANSPARENT_HUMAN_NEGATIVE_PROMPT + " "
+        + TRANSPARENT_HUMAN_QUALIFIED_STANDARD
    )

    last_err = ""
@@ -2409,6 +2588,14 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
    target = (el.name_en or el.name_zh).strip()
    bg_phrase = "pure white" if req.background == "white" else "pure black"
    kind_phrase = "person, animal, or living character" if req.subject_kind == "living" else "object or product-like subject"
+    transparent_character_clause = (
+        TRANSPARENT_HUMAN_POSITIVE_PROMPT
+        + " The generated living character must be a friendly transparent humanoid with transparent or translucent outer body and clean white skeleton visible inside the same body. "
+        + TRANSPARENT_HUMAN_NEGATIVE_PROMPT
+        + " Do not render a normal human, ordinary skeleton-only character, horror skeleton, medical anatomy, organs, veins, blood, corpse, zombie, hospital, surgery, or autopsy visual. "
+        if req.subject_kind == "living"
+        else ""
+    )
    models = [IMAGE_MODEL, "gemini-3.1-flash-image-preview", "gemini-2.5-flash-image"]
    generated: list[SubjectAsset] = []
    try:
@@ -2433,7 +2620,8 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
                f"Create a high-definition standalone asset on a solid {bg_phrase} background. "
                "No extra objects, no props, no additional products, no background elements, no original scene fragments, no shadows from the original scene, no text, no watermark, no UI. "
                "If the source is incomplete, partially visible, occluded, or low resolution, reconstruct the missing parts by redrawing a clean complete subject while staying consistent with the reference. "
-                "For living subjects, keep a normal upright standing pose for the standard views; do not create sitting, walking, medical, horror, or distorted anatomy unless explicitly requested by the view label."
+                "For living subjects, keep a normal upright standing pose for the standard views; do not create sitting, walking, medical, horror, or distorted anatomy unless explicitly requested by the view label. "
+                + transparent_character_clause
            )
            try:
                img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)