auto-save 2026-05-14 10:08 (~4)
This commit is contained in:
214
api/main.py
214
api/main.py
@@ -92,7 +92,7 @@ JobStatus = Literal[
|
||||
]
|
||||
|
||||
KEYFRAME_COUNT = int(os.getenv("KEYFRAME_COUNT", "5"))
|
||||
FrameExtractTarget = Literal["balanced", "subject", "transition", "expression", "motion"]
|
||||
FrameExtractTarget = Literal["transparent_human", "balanced", "subject", "transition", "expression", "motion"]
|
||||
FrameExtractMode = Literal["replace", "append"]
|
||||
FrameExtractQuality = Literal["auto", "fast", "accurate", "ultra"]
|
||||
AnalyzeTask = tuple[str, int, FrameExtractTarget, FrameExtractMode, FrameExtractQuality]
|
||||
@@ -104,12 +104,38 @@ SubjectView = str
|
||||
SceneMode = Literal["remove_subject", "similar", "style"]
|
||||
SceneStyle = Literal["source", "premium_product", "clean_studio", "warm_lifestyle", "cinematic"]
|
||||
# Display label for each frame-extraction target; keys mirror the
# FrameExtractTarget literal values.
FRAME_TARGET_LABELS: dict[FrameExtractTarget, str] = {
    "transparent_human": "透明骨架人",
    "balanced": "综合关键帧",
    "subject": "清晰主体",
    "transition": "转场变化",
    "expression": "表情瞬间",
    "motion": "动作峰值",
}
|
||||
|
||||
# Positive description of the wanted subject (English followed by Chinese).
# The fragments are joined without a separator, so each fragment carries its
# own trailing space or punctuation.
TRANSPARENT_HUMAN_POSITIVE_PROMPT = "".join((
    "Target subject: transparent human character, translucent human body, glass-like human body, clear acrylic skin, ",
    "transparent vinyl skin, visible clean white skeleton inside, skeleton visible inside transparent body, ",
    "white bones inside clear body, non-horror skeleton character, friendly transparent humanoid, 3D commercial character, ",
    "premium wellness character, transparent body with visible spine, transparent body with visible rib cage. ",
    "中文目标:透明人体、半透明人体、玻璃人体、亚克力人体、果冻质感人体、外层透明皮肤、身体内部可见骨架、",
    "透明身体里的白色骨骼、干净白色骨架、非恐怖骷髅人、3D广告角色、透明骨架人、可见脊柱、可见肋骨、",
    "可见颈椎、可见骨盆、可见四肢骨骼、透明皮肤包裹骨架。",
))
|
||||
# Exclusion list of what a frame must NOT show (English followed by Chinese).
# The fragments are joined without a separator, so each fragment carries its
# own trailing space or punctuation.
TRANSPARENT_HUMAN_NEGATIVE_PROMPT = "".join((
    "Avoid: normal human, ordinary skeleton, skeleton only without transparent body, horror skeleton, gore, blood, corpse, ",
    "zombie, organs, veins, autopsy, surgery, hospital, dark horror scene, blurry person, heavily occluded person, ",
    "person too small, product only, background only, no visible skeleton, no transparent body, transparent clothing only. ",
    "反向排除:普通真人、普通骷髅、只有骨架没有透明外壳、恐怖骷髅、血腥、腐烂、僵尸、尸体、器官、血管、",
    "解剖、医院、手术、黑暗恐怖场景、模糊人物、遮挡严重、人物太远、只有产品没有人、只有背景没有人、",
    "看不到骨架、看不到透明身体、透明衣服但不是透明身体。",
))
|
||||
# Seven-point acceptance checklist that is appended to the vision prompts.
# The fragments are joined without a separator, so each fragment carries its
# own trailing space.
TRANSPARENT_HUMAN_QUALIFIED_STANDARD = "".join((
    "A qualified frame must satisfy all core conditions: 1) there is a humanoid character; ",
    "2) the outer body is transparent or translucent; 3) a clean white skeleton is clearly visible inside the body; ",
    "4) the transparent body and inner skeleton belong to the same character, not a background overlay; ",
    "5) the character should occupy at least about 35% of frame height and be easy to inspect; ",
    "6) no severe blur, occlusion, or deformation; 7) clean premium commercial wellness style, non-horror.",
))
|
||||
FRAME_QUALITY_LABELS: dict[FrameExtractQuality, str] = {
|
||||
"auto": "自动",
|
||||
"fast": "快速",
|
||||
@@ -190,6 +216,19 @@ class QualityReport(BaseModel):
|
||||
warnings: list[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
class TransparentHumanFrameScore(BaseModel):
    """Semantic review result for one frame under the "transparent_human" target.

    Every field has a default, so a bare instance represents an unscored,
    unqualified frame. The model itself does not enforce score ranges;
    whoever fills it in is responsible for clamping the values.
    """

    # Per-dimension sub-scores (integers).
    transparent_body_score: int = 0
    skeleton_visible_score: int = 0
    human_prominence_score: int = 0
    clarity_score: int = 0
    commercial_style_score: int = 0
    product_usefulness_score: int = 0

    # Aggregate of the sub-scores and the final accept/reject verdict.
    total_score: int = 0
    qualified: bool = False

    # Free-text explanation for a rejection, and any extra reviewer notes.
    reject_reason: str = ""
    notes: str = ""
|
||||
|
||||
|
||||
class SceneAsset(BaseModel):
|
||||
id: str
|
||||
label: str = ""
|
||||
@@ -280,6 +319,7 @@ class KeyFrame(BaseModel):
|
||||
timestamp: float
|
||||
url: str
|
||||
description: dict | None = None # vision 模型识别结果 {scene, objects, style, suggested_prompt}
|
||||
transparent_human_score: TransparentHumanFrameScore | None = None
|
||||
cleaned_url: str | None = None # 清洗后干净版(待应用)→ /jobs/{id}/frames/{idx}/cleaned.jpg
|
||||
cleaned_applied: bool = False # 是否已用清洗版替换原图(替换后 cleaned_url=null)
|
||||
quality_report: QualityReport | None = None
|
||||
@@ -870,7 +910,11 @@ def _target_score(item: dict, target: FrameExtractTarget) -> float:
|
||||
scene = float(item.get("scene_score_n", 0.0))
|
||||
motion = float(item.get("motion_n", 0.0))
|
||||
|
||||
if target == "subject":
|
||||
if target == "transparent_human":
|
||||
# 透明骨架人仍先依赖本地清晰度 / 中心主体 / 对比度筛候选,
|
||||
# 后续再交给 Vision 逐张语义验收。
|
||||
score = center * 0.45 + sharp * 0.30 + contrast * 0.15 + color * 0.10
|
||||
elif target == "subject":
|
||||
score = center * 0.48 + sharp * 0.25 + contrast * 0.17 + color * 0.10
|
||||
elif target == "transition":
|
||||
score = scene * 0.55 + sharp * 0.28 + contrast * 0.12 + color * 0.05
|
||||
@@ -942,6 +986,100 @@ def _select_keyframes(candidates: list[dict], n: int, target: FrameExtractTarget
|
||||
return selected
|
||||
|
||||
|
||||
def _rank_keyframe_candidates(candidates: list[dict], target: FrameExtractTarget, limit: int, dup_threshold: int = 8) -> list[dict]:
|
||||
if not candidates:
|
||||
return []
|
||||
_attach_temporal_metrics(candidates)
|
||||
_normalize_item_metrics(candidates)
|
||||
for it in candidates:
|
||||
it["score"] = _target_score(it, target)
|
||||
deduped: list[dict] = []
|
||||
for it in sorted(candidates, key=lambda x: -float(x.get("score", 0.0))):
|
||||
if any((it["hash"] - kept["hash"]) < dup_threshold for kept in deduped):
|
||||
continue
|
||||
deduped.append(it)
|
||||
if len(deduped) >= limit:
|
||||
break
|
||||
return deduped
|
||||
|
||||
|
||||
def _score_transparent_human_frame(img_path: Path) -> TransparentHumanFrameScore:
    """Ask the vision model to grade one frame against the transparent-human standard.

    The image is sent base64-encoded as a JPEG data URL together with a
    rubric prompt. The model's JSON reply is clamped per dimension, summed,
    and checked against fixed minimums.

    Args:
        img_path: Path of the frame image to grade. It is read in full; a
            read error propagates to the caller.

    Returns:
        A ``TransparentHumanFrameScore``. ``qualified`` is true only when the
        model says so AND every dimension meets its minimum AND the total is
        at least 72. If ``LLM_API_KEY`` is unset, or the model call or JSON
        parsing fails, the result is unqualified with the cause in
        ``reject_reason``.
    """
    if not LLM_API_KEY:
        return TransparentHumanFrameScore(
            qualified=False,
            reject_reason="LLM_API_KEY 未配置,无法进行透明骨架人语义验收",
        )

    img_b64 = base64.b64encode(img_path.read_bytes()).decode("ascii")
    prompt = (
        "You are a strict keyframe quality inspector for a SKG transparent-human video recreation workflow. "
        + TRANSPARENT_HUMAN_POSITIVE_PROMPT + " "
        + TRANSPARENT_HUMAN_NEGATIVE_PROMPT + " "
        + TRANSPARENT_HUMAN_QUALIFIED_STANDARD + "\n\n"
        "Score this single frame using exactly these dimensions:\n"
        "- transparent_body_score: 0-25, clear transparent/translucent outer human body shell.\n"
        "- skeleton_visible_score: 0-25, clean white skeleton clearly visible inside the body.\n"
        "- human_prominence_score: 0-15, character centered/large/easy to identify, ideally >=35% frame height.\n"
        "- clarity_score: 0-15, no severe motion blur, occlusion, or deformation.\n"
        "- commercial_style_score: 0-10, clean premium non-horror advertising/wellness style.\n"
        "- product_usefulness_score: 0-10, useful for later SKG product video generation; neck/shoulder/waist/eye/foot/knee area visible when relevant.\n"
        "Reject if any of these is true: normal human only; ordinary skeleton only; product/background only; transparent person too far; severe blur; more than half occluded; horror/corpse/autopsy/surgery/hospital; unable to judge.\n"
        "Output strict JSON only with keys: transparent_body_score, skeleton_visible_score, human_prominence_score, clarity_score, commercial_style_score, product_usefulness_score, qualified, reject_reason, notes."
    )

    try:
        resp = llm().chat.completions.create(
            model=VISION_MODEL,
            messages=[{"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
            ]}],
            response_format={"type": "json_object"},
            temperature=0.1,
            max_tokens=1200,
        )
        raw = (resp.choices[0].message.content or "").strip()
        data = json.loads(raw)
        # A JSON array or scalar would break the .get() calls below, so it is
        # treated like any other scoring failure.
        if not isinstance(data, dict):
            raise TypeError(f"expected a JSON object, got {type(data).__name__}")
    except Exception as e:
        # Boundary with an external service: any failure becomes a rejection.
        return TransparentHumanFrameScore(qualified=False, reject_reason=f"AI 评分失败:{e}")

    def clamp_score(name: str, cap: int) -> int:
        """Read ``name`` from the reply as an int clamped to ``[0, cap]``."""
        try:
            value = int(round(float(data.get(name, 0))))
        except Exception:
            # Missing, non-numeric, NaN, or infinite values count as zero.
            value = 0
        return max(0, min(cap, value))

    def as_bool(value: object) -> bool:
        """Interpret the model's ``qualified`` flag without trusting its type."""
        if isinstance(value, bool):
            return value
        if isinstance(value, (int, float)):
            return value != 0
        if isinstance(value, str):
            # bool("false") is True, so strings need an explicit check.
            return value.strip().lower() in {"true", "yes", "1"}
        return False

    # (field name, maximum score, minimum required to qualify)
    dimensions = (
        ("transparent_body_score", 25, 18),
        ("skeleton_visible_score", 25, 18),
        ("human_prominence_score", 15, 8),
        ("clarity_score", 15, 8),
        ("commercial_style_score", 10, 6),
        ("product_usefulness_score", 10, 4),
    )
    scores = {name: clamp_score(name, cap) for name, cap, _minimum in dimensions}
    total = sum(scores.values())
    meets_minimums = all(scores[name] >= minimum for name, _cap, minimum in dimensions)
    qualified = as_bool(data.get("qualified")) and meets_minimums and total >= 72

    reject_reason = str(data.get("reject_reason", "") or "")
    if not qualified and not reject_reason:
        reject_reason = f"透明骨架人评分不足,总分 {total}/100"

    return TransparentHumanFrameScore(
        **scores,
        total_score=total,
        qualified=qualified,
        reject_reason=reject_reason,
        notes=str(data.get("notes", "") or ""),
    )
|
||||
|
||||
|
||||
def ffprobe_meta(mp4: Path) -> dict:
|
||||
out = run([
|
||||
"ffprobe", "-v", "error", "-print_format", "json", "-show_streams", "-show_format", str(mp4),
|
||||
@@ -989,7 +1127,7 @@ async def pipeline_download(job_id: str) -> None:
|
||||
async def pipeline_analyze(
|
||||
job_id: str,
|
||||
frame_count: int = KEYFRAME_COUNT,
|
||||
target: FrameExtractTarget = "balanced",
|
||||
target: FrameExtractTarget = "transparent_human",
|
||||
mode: FrameExtractMode = "replace",
|
||||
quality: FrameExtractQuality = "auto",
|
||||
) -> None:
|
||||
@@ -1053,17 +1191,25 @@ async def pipeline_analyze(
|
||||
if not candidates:
|
||||
raise RuntimeError("候选帧评分失败")
|
||||
|
||||
# 2) 目标化筛选:pHash 去重 + 清晰度 / 中心细节 / 转场变化 / 动作强度 + 时序分桶。
|
||||
selection_count = n if replacing else min(len(candidates), max(n * 4, n + len(existing_frames) + 2))
|
||||
update(job, message=f"{quality_label}筛选 · {target_label} · {n} / {len(candidates)} 张…", progress=60)
|
||||
chosen = _select_keyframes(candidates, selection_count, target)
|
||||
# 2) 目标化筛选:pHash 去重 + 清晰度 / 中心细节 / 转场变化 / 动作强度。
|
||||
# 透明骨架人目标会先扩大候选池,再用 Vision 逐张验收;不合格自动换下一帧。
|
||||
semantic_transparent = target == "transparent_human"
|
||||
if semantic_transparent:
|
||||
selection_count = min(len(candidates), min(max(n * 10, 24), 48))
|
||||
update(job, message=f"{quality_label}筛选透明骨架人候选 · 本地 {selection_count} / {len(candidates)} 张…", progress=58)
|
||||
chosen = _rank_keyframe_candidates(candidates, target, selection_count)
|
||||
else:
|
||||
selection_count = n if replacing else min(len(candidates), max(n * 4, n + len(existing_frames) + 2))
|
||||
update(job, message=f"{quality_label}筛选 · {target_label} · {n} / {len(candidates)} 张…", progress=60)
|
||||
chosen = _select_keyframes(candidates, selection_count, target)
|
||||
|
||||
# 3) 只对最终选中的时间点,从原视频抽高质量关键帧。
|
||||
renamed: list[KeyFrame] = []
|
||||
chosen_sorted = sorted(chosen, key=lambda it: float(it["timestamp"]))
|
||||
chosen_sorted = chosen if semantic_transparent else sorted(chosen, key=lambda it: float(it["timestamp"]))
|
||||
existing_timestamps = [float(f.timestamp) for f in existing_frames]
|
||||
next_idx = max((int(f.index) for f in existing_frames), default=-1) + 1
|
||||
for item in chosen_sorted:
|
||||
rejected_by_ai = 0
|
||||
for attempt, item in enumerate(chosen_sorted, start=1):
|
||||
if len(renamed) >= n:
|
||||
break
|
||||
t = float(item["timestamp"])
|
||||
@@ -1077,25 +1223,53 @@ async def pipeline_analyze(
|
||||
"-pix_fmt", "yuvj420p", "-q:v", "3",
|
||||
str(dst),
|
||||
])
|
||||
transparent_score: TransparentHumanFrameScore | None = None
|
||||
if semantic_transparent:
|
||||
update(
|
||||
job,
|
||||
message=f"AI 验收透明骨架人 · 已通过 {len(renamed)}/{n} · 候选 {attempt}/{len(chosen_sorted)}…",
|
||||
progress=min(68, 60 + int(attempt / max(1, len(chosen_sorted)) * 8)),
|
||||
)
|
||||
transparent_score = _score_transparent_human_frame(dst)
|
||||
if not transparent_score.qualified:
|
||||
rejected_by_ai += 1
|
||||
try:
|
||||
dst.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
reason = transparent_score.reject_reason or f"总分 {transparent_score.total_score}/100"
|
||||
update(job, message=f"AI 退回候选帧 · {reason[:48]} · 自动换下一帧", progress=65)
|
||||
continue
|
||||
renamed.append(KeyFrame(
|
||||
index=idx,
|
||||
timestamp=round(t, 2),
|
||||
url=f"/jobs/{job_id}/frames/{idx}.jpg",
|
||||
transparent_human_score=transparent_score,
|
||||
))
|
||||
existing_timestamps.append(t)
|
||||
|
||||
if semantic_transparent and not renamed:
|
||||
raise RuntimeError("AI 未找到合格透明骨架人帧:需要透明/半透明人体外壳 + 清楚白色骨架 + 非恐怖广告感")
|
||||
|
||||
# 4) 清理扫描目录
|
||||
shutil.rmtree(scan_dir, ignore_errors=True)
|
||||
|
||||
merged_frames = sorted(existing_frames + renamed, key=lambda f: f.timestamp)
|
||||
action_label = "追加" if not replacing else "抽取"
|
||||
|
||||
final_message = (
|
||||
f"已按「{quality_label} · {target_label}」AI验收 {action_label} {len(renamed)} 张"
|
||||
+ (f" · 退回 {rejected_by_ai} 张" if semantic_transparent else "")
|
||||
+ f" · 共 {len(merged_frames)} 张"
|
||||
) if semantic_transparent else (
|
||||
f"已按「{quality_label} · {target_label}」{action_label} {len(renamed)} 张关键帧 · 共 {len(merged_frames)} 张"
|
||||
)
|
||||
update(
|
||||
job,
|
||||
status="frames_extracted",
|
||||
frames=merged_frames,
|
||||
progress=70,
|
||||
message=f"已按「{quality_label} · {target_label}」{action_label} {len(renamed)} 张关键帧 · 共 {len(merged_frames)} 张",
|
||||
message=final_message,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@@ -1486,7 +1660,7 @@ async def trigger_analyze(
|
||||
job_id: str,
|
||||
bg: BackgroundTasks,
|
||||
frames: int = KEYFRAME_COUNT,
|
||||
target: FrameExtractTarget = "balanced",
|
||||
target: FrameExtractTarget = "transparent_human",
|
||||
mode: FrameExtractMode = "replace",
|
||||
quality: FrameExtractQuality = "auto",
|
||||
) -> Job:
|
||||
@@ -1792,9 +1966,14 @@ def describe_frame(job_id: str, idx: int) -> Job:
|
||||
' "scene": "一句话描述场景",\n'
|
||||
' "objects": [{"name": "物体名(中文)", "position": "在画面哪里", "color": "颜色", "extract_prompt": "用于提取该元素的英文 prompt"}],\n'
|
||||
' "style": "整体风格 / 打光 / 色调(一句话)",\n'
|
||||
' "suggested_prompt": "适合用作下游生图的完整英文 prompt"\n'
|
||||
' "suggested_prompt": "适合用作下游生图的完整英文 prompt",\n'
|
||||
' "transparent_human_assessment": {"transparent_body_score": 0, "skeleton_visible_score": 0, "human_prominence_score": 0, "clarity_score": 0, "commercial_style_score": 0, "product_usefulness_score": 0, "qualified": false, "reject_reason": "如果不合格说明原因"}\n'
|
||||
'}\n'
|
||||
"要求:objects 列出 3-8 个画面里**可独立提取**的主要元素,extract_prompt 用于后续 image edit 模型。"
|
||||
"transparent_human_assessment 按透明骨架人标准评分:"
|
||||
+ TRANSPARENT_HUMAN_POSITIVE_PROMPT + " "
|
||||
+ TRANSPARENT_HUMAN_NEGATIVE_PROMPT + " "
|
||||
+ TRANSPARENT_HUMAN_QUALIFIED_STANDARD
|
||||
)
|
||||
|
||||
last_err = ""
|
||||
@@ -2409,6 +2588,14 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
|
||||
target = (el.name_en or el.name_zh).strip()
|
||||
bg_phrase = "pure white" if req.background == "white" else "pure black"
|
||||
kind_phrase = "person, animal, or living character" if req.subject_kind == "living" else "object or product-like subject"
|
||||
transparent_character_clause = (
|
||||
TRANSPARENT_HUMAN_POSITIVE_PROMPT
|
||||
+ " The generated living character must be a friendly transparent humanoid with transparent or translucent outer body and clean white skeleton visible inside the same body. "
|
||||
+ TRANSPARENT_HUMAN_NEGATIVE_PROMPT
|
||||
+ " Do not render a normal human, ordinary skeleton-only character, horror skeleton, medical anatomy, organs, veins, blood, corpse, zombie, hospital, surgery, or autopsy visual. "
|
||||
if req.subject_kind == "living"
|
||||
else ""
|
||||
)
|
||||
models = [IMAGE_MODEL, "gemini-3.1-flash-image-preview", "gemini-2.5-flash-image"]
|
||||
generated: list[SubjectAsset] = []
|
||||
try:
|
||||
@@ -2433,7 +2620,8 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
|
||||
f"Create a high-definition standalone asset on a solid {bg_phrase} background. "
|
||||
"No extra objects, no props, no additional products, no background elements, no original scene fragments, no shadows from the original scene, no text, no watermark, no UI. "
|
||||
"If the source is incomplete, partially visible, occluded, or low resolution, reconstruct the missing parts by redrawing a clean complete subject while staying consistent with the reference. "
|
||||
"For living subjects, keep a normal upright standing pose for the standard views; do not create sitting, walking, medical, horror, or distorted anatomy unless explicitly requested by the view label."
|
||||
"For living subjects, keep a normal upright standing pose for the standard views; do not create sitting, walking, medical, horror, or distorted anatomy unless explicitly requested by the view label. "
|
||||
+ transparent_character_clause
|
||||
)
|
||||
try:
|
||||
img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
|
||||
|
||||
Reference in New Issue
Block a user