From 801b194bffadb1871906393390ee32975e49f710 Mon Sep 17 00:00:00 2001 From: kang Date: Thu, 14 May 2026 11:53:14 +0800 Subject: [PATCH] auto-save 2026-05-14 11:53 (~4) --- .memory/worklog.json | 27 ++++----- api/main.py | 107 +++++++++++++++++++++++++++------ web/components/nodes/index.tsx | 6 +- web/lib/api.ts | 7 +++ 4 files changed, 113 insertions(+), 34 deletions(-) diff --git a/.memory/worklog.json b/.memory/worklog.json index 0c5319d..cd5f001 100644 --- a/.memory/worklog.json +++ b/.memory/worklog.json @@ -1,19 +1,5 @@ { "entries": [ - { - "files_changed": 1, - "hash": "7a5b09a", - "message": "auto-save 2026-05-13 04:05 (~1)", - "ts": "2026-05-13T04:06:09+08:00", - "type": "commit" - }, - { - "files_changed": 1, - "hash": "6304eab", - "message": "auto-save 2026-05-13 04:11 (~1)", - "ts": "2026-05-13T04:12:02+08:00", - "type": "commit" - }, { "files_changed": 1, "hash": "9fcc418", @@ -3298,6 +3284,19 @@ "type": "session-heartbeat", "message": "Codex 会话活跃 · 最近命令:codex · 6 项未提交变更 · 最近提交:auto-save 2026-05-14 11:41 (~1)", "files_changed": 6 + }, + { + "ts": "2026-05-14T11:47:40+08:00", + "type": "commit", + "message": "auto-save 2026-05-14 11:47 (~7)", + "hash": "ba491c0", + "files_changed": 7 + }, + { + "ts": "2026-05-14T03:48:39Z", + "type": "session-heartbeat", + "message": "Codex 会话活跃 · 最近命令:codex · 2 项未提交变更 · 最近提交:auto-save 2026-05-14 11:47 (~7)", + "files_changed": 2 } ] } diff --git a/api/main.py b/api/main.py index 84805b6..211574b 100644 --- a/api/main.py +++ b/api/main.py @@ -120,6 +120,7 @@ SubjectKind = Literal["object", "living"] SubjectView = str SceneMode = Literal["remove_subject", "similar", "style"] SceneStyle = Literal["source", "premium_product", "clean_studio", "warm_lifestyle", "cinematic"] +SceneAssetRole = Literal["scene", "first_frame", "last_frame"] FRAME_TARGET_LABELS: dict[FrameExtractTarget, str] = { "transparent_human": "透明骨架人", "balanced": "综合关键帧", @@ -256,6 +257,7 @@ class SceneAsset(BaseModel): size: AssetSize = "source" scene_mode: SceneMode = "remove_subject" scene_style: SceneStyle = "source" + asset_role: SceneAssetRole = "scene" quality_report: QualityReport | None = None created_at: float = 0.0 @@ -302,6 +304,9 @@ class ProductFusionRegion(BaseModel): class ProductFusionShot(BaseModel): id: str = "" + first_image: dict | None = None + last_image: dict | None = None + product_images: list[dict] = Field(default_factory=list) product_image: dict | None = None person_image: dict | None = None product_region: ProductFusionRegion | None = None @@ -1897,6 +1902,40 @@ def _image_edit_call( return b64lib.b64decode(b64), effective_mode +def _image_text_call( + prompt: str, + model: str | None = None, + models: list[str] | None = None, + max_attempts: int = 3, +) -> tuple[bytes, str]: + """Text-only image generation with light model rotation.""" + import base64 as b64lib + import time as _time + if not LLM_API_KEY: + raise RuntimeError("LLM_API_KEY 未配置") + models_cycle = list(models) if models else [model or IMAGE_MODEL] + last_err = "" + resp_data: dict = {} + for attempt in range(max_attempts): + current_model = models_cycle[min(attempt, len(models_cycle) - 1)] + try: + resp = llm().images.generate(model=current_model, prompt=prompt, n=1) + resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]} + if resp_data.get("data"): + b64 = resp_data["data"][0].get("b64_json") + if b64: + return b64lib.b64decode(b64), "text" + err_obj = resp_data.get("error") or {} + last_err = f"empty data · {err_obj.get('code', '')} · {str(err_obj.get('message', ''))[:200]} · model={current_model}" + except Exception as e: + last_err = f"{type(e).__name__}: {e} · model={current_model}" + if attempt < max_attempts - 1: + next_model = models_cycle[min(attempt + 1, len(models_cycle) - 1)] + print(f"[image text retry {attempt + 1}/{max_attempts} → {next_model}] {last_err}", flush=True) + _time.sleep(1.0) + raise RuntimeError(f"image text failed after {max_attempts} attempts: {last_err}") + + # ---------- API 路由 ---------- class CreateJobReq(BaseModel): @@ -2642,6 +2681,7 @@ class GenerateSceneAssetReq(BaseModel): size: AssetSize = "source" scene_mode: SceneMode = "remove_subject" scene_style: SceneStyle = "source" + asset_role: SceneAssetRole = "scene" prompt: str = "" source_frame_indices: list[int] | None = None @@ -2772,8 +2812,8 @@ def delete_element(job_id: str, idx: int, element_id: str) -> Job: @app.post("/jobs/{job_id}/frames/{idx}/scene-asset", response_model=Job) def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> Job: - """为关键帧生成一张干净、高清的场景参考图。默认一帧只需要一张,重跑会保留历史供人工比对。 - 场景图排在主体资产之后:优先依据已确认主体,去主体并补全背景,再按模式生成原场景/相似场景/换风格场景。""" + """为关键帧生成一张资产图。 + scene: 去主体背景板;first_frame/last_frame: 纯文字生成视频首尾帧,参考帧只用于理解统一人物形象。""" import time as _time job = JOBS.get(job_id) if not job: @@ -2814,6 +2854,11 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J if confirmed_subjects else "Remove the main foreground subject from the frame if present. " ) + identity_clause = ( + "Known character identity cues: " + ", ".join(confirmed_subjects) + ". " + if confirmed_subjects + else "Infer one consistent friendly transparent human character identity from the provided references. " + ) mode_clause = { "remove_subject": ( "Keep the original environment, camera angle, perspective, composition, lighting direction, color mood, and spatial layout. " @@ -2846,22 +2891,44 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J if len(source_indices) > 1 else "Use the provided frame as the primary visual reference. " ) - prompt = ( - "Create one clean high-definition scene/background reference image from this frame. " - + subject_clause - + "Do not include the removed subject, duplicate people, animals, products, text, watermark, platform UI, captions, usernames, hashtags, logos, or overlay graphics. " - + reference_clause - + user_prompt_clause - + mode_clause + " " - + style_clause + " " - + "Enhance clarity and texture while avoiding over-smoothing, warped geometry, or changing important perspective details. " - + "Do not create multiple views. Do not isolate objects." - ) + if req.asset_role == "scene": + prompt = ( + "Create one clean high-definition scene/background reference image from this frame. " + + subject_clause + + "Do not include the removed subject, duplicate people, animals, products, text, watermark, platform UI, captions, usernames, hashtags, logos, or overlay graphics. " + + reference_clause + + user_prompt_clause + + mode_clause + " " + + style_clause + " " + + "Enhance clarity and texture while avoiding over-smoothing, warped geometry, or changing important perspective details. " + + "Do not create multiple views. Do not isolate objects." + ) + else: + role_clause = ( + "This is the FIRST frame for an image-to-video clip: create a clear beginning pose and composition. " + if req.asset_role == "first_frame" + else "This is the LAST frame for an image-to-video clip: create a clear ending pose that can naturally follow the first frame, not a duplicate. " + ) + prompt = ( + "Create one premium 9:16 high-definition video endpoint frame from text direction. " + + role_clause + + identity_clause + + reference_clause + + user_prompt_clause + + style_clause + " " + + "The frame must feature the same friendly transparent or translucent human character: glass/acrylic/vinyl-like transparent outer body, visible clean white skeleton inside, clean commercial wellness style, non-horror. " + + "Use the references only to understand character identity, proportions, transparent shell, white bones, pose vocabulary, camera language, and lighting; do not copy watermarks, subtitles, platform UI, logos, or accidental artifacts. " + + "Do not create a plain background plate. Do not remove the character. Do not include SKG product unless the user prompt explicitly asks for it. " + + "The output should be ready as a first/last frame for Seedance video generation, with stable composition, believable perspective, clear subject, no text, no watermark, no gore, no medical surgery imagery." + ) models = [IMAGE_MODEL, "gemini-3.1-flash-image-preview", "gemini-2.5-flash-image"] try: - img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280) + if req.asset_role == "scene": + img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280) + else: + img_bytes, _mode = _image_text_call(prompt, models=models, max_attempts=3) except RuntimeError as e: - raise HTTPException(500, f"scene asset failed: {e}") + raise HTTPException(500, f"{req.asset_role} asset failed: {e}") finally: if sheet_tmp and sheet_tmp.exists(): try: sheet_tmp.unlink() @@ -2873,7 +2940,11 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J report = _image_quality_report(out_path) scene = SceneAsset( id=asset_id, - label=f"分镜 {idx + 1} 场景图", + label=( + f"分镜 {idx + 1} 场景图" + if req.asset_role == "scene" + else f"分镜 {idx + 1} {'首帧' if req.asset_role == 'first_frame' else '尾帧'}" + ), url=_asset_url(job_id, asset_id), width=width, height=height, @@ -2881,6 +2952,7 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J size=req.size, scene_mode=req.scene_mode, scene_style=req.scene_style, + asset_role=req.asset_role, quality_report=report, created_at=_time.time(), ) @@ -2891,7 +2963,8 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J f.quality_report = _image_quality_report(src) f.scene_assets = (f.scene_assets or []) + [scene] new_frames.append(f) - update(job, frames=new_frames, message=f"场景图生成完成 · 分镜 {idx + 1}") + asset_label = "场景图" if req.asset_role == "scene" else ("首帧" if req.asset_role == "first_frame" else "尾帧") + update(job, frames=new_frames, message=f"{asset_label}生成完成 · 分镜 {idx + 1}") return job diff --git a/web/components/nodes/index.tsx b/web/components/nodes/index.tsx index 32d626e..8222a0e 100644 --- a/web/components/nodes/index.tsx +++ b/web/components/nodes/index.tsx @@ -2143,13 +2143,13 @@ export function AudioNode({ data, selected }: any) { onTogglePin={() => d.onToggleNodePin?.("audio")} >
{ if (job?.video_url) d.onOpenAudioStrip?.(job.id) }} >
- 音轨 → ASR 转录 → 英中翻译 → SKG 口播改写 → MiniMax 配音
+ 音轨 → ASR 转录 → 英中翻译 → SKG 英文口播 → MiniMax 英文配音
{audioScript?.rewrite_model || "AUDIO_REWRITE_MODEL"} → {audioScript?.voice_model || "MiniMax T2A"} @@ -2186,7 +2186,7 @@ export function AudioNode({ data, selected }: any) { )} {rewrittenText && (
-
改后 · SKG 口播
+
改后 · SKG English VO
{rewrittenText}
)} diff --git a/web/lib/api.ts b/web/lib/api.ts index 7ddbd43..8cd08cb 100644 --- a/web/lib/api.ts +++ b/web/lib/api.ts @@ -67,6 +67,9 @@ export interface ProductFusionRegion { export interface ProductFusionShot { id: string + first_image?: ImageRef | null + last_image?: ImageRef | null + product_images?: ImageRef[] product_image?: ImageRef | null person_image?: ImageRef | null product_region?: ProductFusionRegion | null @@ -218,6 +221,7 @@ export type SubjectKind = "object" | "living" export type SubjectView = string export type SceneMode = "remove_subject" | "similar" | "style" export type SceneStyle = "source" | "premium_product" | "clean_studio" | "warm_lifestyle" | "cinematic" +export type SceneAssetRole = "scene" | "first_frame" | "last_frame" export interface QualityReport { width: number @@ -251,6 +255,7 @@ export interface SceneAsset { size: AssetSize scene_mode?: SceneMode scene_style?: SceneStyle + asset_role?: SceneAssetRole quality_report?: QualityReport | null created_at: number } @@ -794,6 +799,7 @@ export async function generateSceneAsset( size?: AssetSize scene_mode?: SceneMode scene_style?: SceneStyle + asset_role?: SceneAssetRole prompt?: string source_frame_indices?: number[] } = {}, @@ -806,6 +812,7 @@ export async function generateSceneAsset( size: body.size ?? "source", scene_mode: body.scene_mode ?? "remove_subject", scene_style: body.scene_style ?? "source", + asset_role: body.asset_role ?? "scene", prompt: body.prompt ?? "", source_frame_indices: body.source_frame_indices ?? null, }),