auto-save 2026-05-14 11:53 (~4)

This commit is contained in:
2026-05-14 11:53:14 +08:00
parent ba491c0c5a
commit 801b194bff
4 changed files with 113 additions and 34 deletions

View File

@@ -1,19 +1,5 @@
{
"entries": [
{
"files_changed": 1,
"hash": "7a5b09a",
"message": "auto-save 2026-05-13 04:05 (~1)",
"ts": "2026-05-13T04:06:09+08:00",
"type": "commit"
},
{
"files_changed": 1,
"hash": "6304eab",
"message": "auto-save 2026-05-13 04:11 (~1)",
"ts": "2026-05-13T04:12:02+08:00",
"type": "commit"
},
{
"files_changed": 1,
"hash": "9fcc418",
@@ -3298,6 +3284,19 @@
"type": "session-heartbeat",
"message": "Codex 会话活跃 · 最近命令codex · 6 项未提交变更 · 最近提交auto-save 2026-05-14 11:41 (~1)",
"files_changed": 6
},
{
"ts": "2026-05-14T11:47:40+08:00",
"type": "commit",
"message": "auto-save 2026-05-14 11:47 (~7)",
"hash": "ba491c0",
"files_changed": 7
},
{
"ts": "2026-05-14T03:48:39Z",
"type": "session-heartbeat",
"message": "Codex 会话活跃 · 最近命令codex · 2 项未提交变更 · 最近提交auto-save 2026-05-14 11:47 (~7)",
"files_changed": 2
}
]
}

View File

@@ -120,6 +120,7 @@ SubjectKind = Literal["object", "living"]
SubjectView = str
SceneMode = Literal["remove_subject", "similar", "style"]
SceneStyle = Literal["source", "premium_product", "clean_studio", "warm_lifestyle", "cinematic"]
SceneAssetRole = Literal["scene", "first_frame", "last_frame"]
FRAME_TARGET_LABELS: dict[FrameExtractTarget, str] = {
"transparent_human": "透明骨架人",
"balanced": "综合关键帧",
@@ -256,6 +257,7 @@ class SceneAsset(BaseModel):
size: AssetSize = "source"
scene_mode: SceneMode = "remove_subject"
scene_style: SceneStyle = "source"
asset_role: SceneAssetRole = "scene"
quality_report: QualityReport | None = None
created_at: float = 0.0
@@ -302,6 +304,9 @@ class ProductFusionRegion(BaseModel):
class ProductFusionShot(BaseModel):
id: str = ""
first_image: dict | None = None
last_image: dict | None = None
product_images: list[dict] = Field(default_factory=list)
product_image: dict | None = None
person_image: dict | None = None
product_region: ProductFusionRegion | None = None
@@ -1897,6 +1902,40 @@ def _image_edit_call(
return b64lib.b64decode(b64), effective_mode
def _image_text_call(
    prompt: str,
    model: str | None = None,
    models: list[str] | None = None,
    max_attempts: int = 3,
) -> tuple[bytes, str]:
    """Generate one image from a text prompt, rotating models across retries.

    Args:
        prompt: Text prompt sent to the image generation endpoint.
        model: Single model to use when ``models`` is empty or None; falls
            back to ``IMAGE_MODEL`` when this is also empty.
        models: Ordered list of models to rotate through. Attempt ``i`` uses
            ``models[i]``; once the list is exhausted the last model is reused.
        max_attempts: Maximum number of generation attempts.

    Returns:
        A ``(image_bytes, "text")`` tuple, where ``image_bytes`` is the
        base64-decoded payload of the first returned image.

    Raises:
        RuntimeError: If ``LLM_API_KEY`` is not configured, or if every
            attempt fails or returns no image data. When the final attempt
            failed with an exception, it is chained as ``__cause__``.
    """
    import base64 as b64lib
    import time as _time

    if not LLM_API_KEY:
        raise RuntimeError("LLM_API_KEY 未配置")

    models_cycle = list(models) if models else [model or IMAGE_MODEL]
    last_index = len(models_cycle) - 1
    last_err = ""
    last_exc: Exception | None = None

    for attempt in range(max_attempts):
        current_model = models_cycle[min(attempt, last_index)]
        # Track only the most recent failure so the chained cause always
        # matches the message reported in last_err.
        last_exc = None
        try:
            resp = llm().images.generate(model=current_model, prompt=prompt, n=1)
            resp_data = (
                resp.model_dump()
                if hasattr(resp, "model_dump")
                else {"data": [{"b64_json": resp.data[0].b64_json}]}
            )
            if resp_data.get("data"):
                b64 = resp_data["data"][0].get("b64_json")
                if b64:
                    return b64lib.b64decode(b64), "text"
            err_obj = resp_data.get("error") or {}
            last_err = (
                f"empty data · {err_obj.get('code', '')} · "
                f"{str(err_obj.get('message', ''))[:200]} · model={current_model}"
            )
        except Exception as e:
            # Deliberately broad: any failure of this attempt (transport error,
            # malformed response, bad base64) should move on to the next model.
            last_exc = e
            last_err = f"{type(e).__name__}: {e} · model={current_model}"

        if attempt < max_attempts - 1:
            next_model = models_cycle[min(attempt + 1, last_index)]
            print(
                f"[image text retry {attempt + 1}/{max_attempts} → {next_model}] {last_err}",
                flush=True,
            )
            # Short pause before trying the next model.
            _time.sleep(1.0)

    raise RuntimeError(
        f"image text failed after {max_attempts} attempts: {last_err}"
    ) from last_exc
# ---------- API 路由 ----------
class CreateJobReq(BaseModel):
@@ -2642,6 +2681,7 @@ class GenerateSceneAssetReq(BaseModel):
size: AssetSize = "source"
scene_mode: SceneMode = "remove_subject"
scene_style: SceneStyle = "source"
asset_role: SceneAssetRole = "scene"
prompt: str = ""
source_frame_indices: list[int] | None = None
@@ -2772,8 +2812,8 @@ def delete_element(job_id: str, idx: int, element_id: str) -> Job:
@app.post("/jobs/{job_id}/frames/{idx}/scene-asset", response_model=Job)
def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> Job:
"""为关键帧生成一张干净、高清的场景参考图。默认一帧只需要一张,重跑会保留历史供人工比对
场景图排在主体资产之后:优先依据已确认主体,去主体并补全背景,再按模式生成原场景/相似场景/换风格场景"""
"""为关键帧生成一张资产图
scene: 去主体背景板first_frame/last_frame: 纯文字生成视频首尾帧,参考帧只用于理解统一人物形象"""
import time as _time
job = JOBS.get(job_id)
if not job:
@@ -2814,6 +2854,11 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
if confirmed_subjects
else "Remove the main foreground subject from the frame if present. "
)
identity_clause = (
"Known character identity cues: " + ", ".join(confirmed_subjects) + ". "
if confirmed_subjects
else "Infer one consistent friendly transparent human character identity from the provided references. "
)
mode_clause = {
"remove_subject": (
"Keep the original environment, camera angle, perspective, composition, lighting direction, color mood, and spatial layout. "
@@ -2846,22 +2891,44 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
if len(source_indices) > 1
else "Use the provided frame as the primary visual reference. "
)
prompt = (
"Create one clean high-definition scene/background reference image from this frame. "
+ subject_clause
+ "Do not include the removed subject, duplicate people, animals, products, text, watermark, platform UI, captions, usernames, hashtags, logos, or overlay graphics. "
+ reference_clause
+ user_prompt_clause
+ mode_clause + " "
+ style_clause + " "
+ "Enhance clarity and texture while avoiding over-smoothing, warped geometry, or changing important perspective details. "
+ "Do not create multiple views. Do not isolate objects."
)
if req.asset_role == "scene":
prompt = (
"Create one clean high-definition scene/background reference image from this frame. "
+ subject_clause
+ "Do not include the removed subject, duplicate people, animals, products, text, watermark, platform UI, captions, usernames, hashtags, logos, or overlay graphics. "
+ reference_clause
+ user_prompt_clause
+ mode_clause + " "
+ style_clause + " "
+ "Enhance clarity and texture while avoiding over-smoothing, warped geometry, or changing important perspective details. "
+ "Do not create multiple views. Do not isolate objects."
)
else:
role_clause = (
"This is the FIRST frame for an image-to-video clip: create a clear beginning pose and composition. "
if req.asset_role == "first_frame"
else "This is the LAST frame for an image-to-video clip: create a clear ending pose that can naturally follow the first frame, not a duplicate. "
)
prompt = (
"Create one premium 9:16 high-definition video endpoint frame from text direction. "
+ role_clause
+ identity_clause
+ reference_clause
+ user_prompt_clause
+ style_clause + " "
+ "The frame must feature the same friendly transparent or translucent human character: glass/acrylic/vinyl-like transparent outer body, visible clean white skeleton inside, clean commercial wellness style, non-horror. "
+ "Use the references only to understand character identity, proportions, transparent shell, white bones, pose vocabulary, camera language, and lighting; do not copy watermarks, subtitles, platform UI, logos, or accidental artifacts. "
+ "Do not create a plain background plate. Do not remove the character. Do not include SKG product unless the user prompt explicitly asks for it. "
+ "The output should be ready as a first/last frame for Seedance video generation, with stable composition, believable perspective, clear subject, no text, no watermark, no gore, no medical surgery imagery."
)
models = [IMAGE_MODEL, "gemini-3.1-flash-image-preview", "gemini-2.5-flash-image"]
try:
img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
if req.asset_role == "scene":
img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
else:
img_bytes, _mode = _image_text_call(prompt, models=models, max_attempts=3)
except RuntimeError as e:
raise HTTPException(500, f"scene asset failed: {e}")
raise HTTPException(500, f"{req.asset_role} asset failed: {e}")
finally:
if sheet_tmp and sheet_tmp.exists():
try: sheet_tmp.unlink()
@@ -2873,7 +2940,11 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
report = _image_quality_report(out_path)
scene = SceneAsset(
id=asset_id,
label=f"分镜 {idx + 1} 场景图",
label=(
f"分镜 {idx + 1} 场景图"
if req.asset_role == "scene"
else f"分镜 {idx + 1} {'首帧' if req.asset_role == 'first_frame' else '尾帧'}"
),
url=_asset_url(job_id, asset_id),
width=width,
height=height,
@@ -2881,6 +2952,7 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
size=req.size,
scene_mode=req.scene_mode,
scene_style=req.scene_style,
asset_role=req.asset_role,
quality_report=report,
created_at=_time.time(),
)
@@ -2891,7 +2963,8 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
f.quality_report = _image_quality_report(src)
f.scene_assets = (f.scene_assets or []) + [scene]
new_frames.append(f)
update(job, frames=new_frames, message=f"场景图生成完成 · 分镜 {idx + 1}")
asset_label = "场景图" if req.asset_role == "scene" else ("首帧" if req.asset_role == "first_frame" else "尾帧")
update(job, frames=new_frames, message=f"{asset_label}生成完成 · 分镜 {idx + 1}")
return job

View File

@@ -2143,13 +2143,13 @@ export function AudioNode({ data, selected }: any) {
onTogglePin={() => d.onToggleNodePin?.("audio")}
>
<div
className="space-y-2 text-[11px] text-[var(--text-soft)] leading-snug"
className="cursor-pointer space-y-2 text-[11px] text-[var(--text-soft)] leading-snug"
onClick={() => {
if (job?.video_url) d.onOpenAudioStrip?.(job.id)
}}
>
<div>
ASR SKG MiniMax <br />
ASR SKG MiniMax <br />
<span className="text-[var(--text-faint)] font-mono">
{audioScript?.rewrite_model || "AUDIO_REWRITE_MODEL"} {audioScript?.voice_model || "MiniMax T2A"}
</span>
@@ -2186,7 +2186,7 @@ export function AudioNode({ data, selected }: any) {
)}
{rewrittenText && (
<div className="rounded-md border border-emerald-400/25 bg-emerald-400/10 px-2.5 py-2">
<div className="mb-1 text-[9.5px] uppercase tracking-widest text-emerald-200/80"> · SKG </div>
<div className="mb-1 text-[9.5px] uppercase tracking-widest text-emerald-200/80"> · SKG English VO</div>
<div className="line-clamp-4 text-[11.5px] leading-relaxed text-[var(--text-strong)] break-words">{rewrittenText}</div>
</div>
)}

View File

@@ -67,6 +67,9 @@ export interface ProductFusionRegion {
export interface ProductFusionShot {
id: string
first_image?: ImageRef | null
last_image?: ImageRef | null
product_images?: ImageRef[]
product_image?: ImageRef | null
person_image?: ImageRef | null
product_region?: ProductFusionRegion | null
@@ -218,6 +221,7 @@ export type SubjectKind = "object" | "living"
export type SubjectView = string
export type SceneMode = "remove_subject" | "similar" | "style"
export type SceneStyle = "source" | "premium_product" | "clean_studio" | "warm_lifestyle" | "cinematic"
export type SceneAssetRole = "scene" | "first_frame" | "last_frame"
export interface QualityReport {
width: number
@@ -251,6 +255,7 @@ export interface SceneAsset {
size: AssetSize
scene_mode?: SceneMode
scene_style?: SceneStyle
asset_role?: SceneAssetRole
quality_report?: QualityReport | null
created_at: number
}
@@ -794,6 +799,7 @@ export async function generateSceneAsset(
size?: AssetSize
scene_mode?: SceneMode
scene_style?: SceneStyle
asset_role?: SceneAssetRole
prompt?: string
source_frame_indices?: number[]
} = {},
@@ -806,6 +812,7 @@ export async function generateSceneAsset(
size: body.size ?? "source",
scene_mode: body.scene_mode ?? "remove_subject",
scene_style: body.scene_style ?? "source",
asset_role: body.asset_role ?? "scene",
prompt: body.prompt ?? "",
source_frame_indices: body.source_frame_indices ?? null,
}),