From f1f3a0fbe526fd67b158b7aa4088b858940748f6 Mon Sep 17 00:00:00 2001 From: kang Date: Thu, 14 May 2026 04:59:53 +0800 Subject: [PATCH] auto-save 2026-05-14 04:59 (~3) --- .memory/worklog.json | 13 ++ api/main.py | 384 ++++++++++++++++++++++++++++++++++++++++++- web/lib/api.ts | 96 ++++++++++- 3 files changed, 491 insertions(+), 2 deletions(-) diff --git a/.memory/worklog.json b/.memory/worklog.json index 40418ae..0ea53f1 100644 --- a/.memory/worklog.json +++ b/.memory/worklog.json @@ -3270,6 +3270,19 @@ "type": "session-heartbeat", "message": "Claude 会话活跃 · 最近命令:claude · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 04:48 (~1)", "files_changed": 1 + }, + { + "ts": "2026-05-14T04:54:24+08:00", + "type": "commit", + "message": "auto-save 2026-05-14 04:54 (~1)", + "hash": "f5ac97b", + "files_changed": 1 + }, + { + "ts": "2026-05-13T20:58:50Z", + "type": "session-heartbeat", + "message": "Codex 会话活跃 · 最近命令:codex · 2 项未提交变更 · 最近提交:auto-save 2026-05-14 04:54 (~1)", + "files_changed": 2 } ] } diff --git a/api/main.py b/api/main.py index dc8f045..3948db2 100644 --- a/api/main.py +++ b/api/main.py @@ -92,6 +92,11 @@ FrameExtractTarget = Literal["balanced", "subject", "transition", "expression", FrameExtractMode = Literal["replace", "append"] FrameExtractQuality = Literal["auto", "fast", "accurate", "ultra"] AnalyzeTask = tuple[str, int, FrameExtractTarget, FrameExtractMode, FrameExtractQuality] +AssetBackground = Literal["white", "black"] +AssetSize = Literal["source", "1024", "1536", "2048"] +AssetQuality = Literal["hd"] +SubjectKind = Literal["object", "living"] +SubjectView = Literal["front", "back", "left", "right", "side", "side_walk", "top", "bottom", "expression"] FRAME_TARGET_LABELS: dict[FrameExtractTarget, str] = { "balanced": "综合关键帧", "subject": "清晰主体", @@ -161,7 +166,7 @@ class StoryboardScene(BaseModel): class StoryboardImage(BaseModel): """用户从各处"上推"到分镜头编排区的图片""" ref_id: str # uuid hex 8 - kind: Literal["keyframe", "cutout"] # keyframe = 关键帧本身 / cutout = 元素提取图 + kind: Literal["keyframe", "cutout", "asset"] # asset = 场景 / 主体视角等组图素材 frame_idx: int element_id: str | None = None # cutout 时 cutout_id: str | None = None # cutout 时(versioned id;老数据可能 == element_id) @@ -169,6 +174,42 @@ class StoryboardImage(BaseModel): created_at: float = 0.0 +class QualityReport(BaseModel): + width: int = 0 + height: int = 0 + short_side: int = 0 + sharpness: float = 0.0 + risk: Literal["ok", "warn", "bad"] = "ok" + warnings: list[str] = Field(default_factory=list) + + +class SceneAsset(BaseModel): + id: str + label: str = "" + url: str = "" + width: int = 0 + height: int = 0 + quality: AssetQuality = "hd" + size: AssetSize = "source" + quality_report: QualityReport | None = None + created_at: float = 0.0 + + +class SubjectAsset(BaseModel): + id: str + view: SubjectView + label: str = "" + url: str = "" + width: int = 0 + height: int = 0 + background: AssetBackground = "white" + quality: AssetQuality = "hd" + size: AssetSize = "source" + source_frame_indices: list[int] = Field(default_factory=list) + ai_completed: bool = True + created_at: float = 0.0 + + class KeyElement(BaseModel): """关键帧里识别 / 用户提取的元素 · 多次提取累积多张图,让用户挑选满意的""" id: str # uuid hex 8 @@ -182,6 +223,8 @@ class KeyElement(BaseModel): # 旧字段兼容(v1 单图)· 渲染时 fallback 用,新提取不再写入 cutout_id: str | None = None cutout_background: Literal["white", "black"] = "white" + subject_kind: SubjectKind = "object" + subject_assets: list[SubjectAsset] = Field(default_factory=list) created_at: float = 0.0 @@ -192,6 +235,8 @@ class KeyFrame(BaseModel): description: dict | None = None # vision 模型识别结果 {scene, objects, style, suggested_prompt} cleaned_url: str | None = None # 清洗后干净版(待应用)→ /jobs/{id}/frames/{idx}/cleaned.jpg cleaned_applied: bool = False # 是否已用清洗版替换原图(替换后 cleaned_url=null) + quality_report: QualityReport | None = None + scene_assets: list[SceneAsset] = Field(default_factory=list) elements: list[KeyElement] = [] # 提取的元素清单(持久化) storyboard: StoryboardScene | None = None # 分镜头编排字段 generated_images: list[GeneratedImage] = [] @@ -483,6 +528,189 @@ def _scan_profile(duration: float, quality: FrameExtractQuality) -> tuple[float, return scan_fps, scan_width, metric_width, estimated +def _image_quality_report(img_path: Path, region: dict | None = None) -> QualityReport: + warnings: list[str] = [] + try: + with Image.open(img_path) as raw: + img = raw.convert("RGB") + width, height = img.size + metric_width = min(512, width) + metric_height = max(1, round(metric_width * height / max(width, 1))) + small = img.resize((metric_width, metric_height)) + gray = np.asarray(ImageOps.grayscale(small), dtype=np.float32) + sharp = _sharpness_from_gray(gray) + except Exception: + return QualityReport(risk="bad", warnings=["无法读取图片质量信息"]) + + short_side = min(width, height) + if short_side < 720: + warnings.append(f"短边 {short_side}px 低于 720px,生视频可能偏糊") + if sharp < 30: + warnings.append("清晰度偏低,高清增强后仍可能有细节损失") + + if region: + try: + rw = int(float(region.get("w", 0)) * width) + rh = int(float(region.get("h", 0)) * height) + if min(rw, rh) < 512: + warnings.append(f"主体框约 {rw}×{rh}px,主体素材偏小") + except Exception: + pass + + risk: Literal["ok", "warn", "bad"] = "ok" + if any("低于" in w or "偏小" in w for w in warnings): + risk = "warn" + if short_side < 480 or sharp < 12: + risk = "bad" + return QualityReport(width=width, height=height, short_side=short_side, sharpness=round(sharp, 2), risk=risk, warnings=warnings) + + +def _asset_target_size(source_path: Path, size: AssetSize, square: bool = False) -> tuple[int, int]: + try: + with Image.open(source_path) as raw: + src_w, src_h = raw.size + except Exception: + src_w, src_h = 1024, 1024 + if size == "source": + return max(1, src_w), max(1, src_h) + side = int(size) + if square: + return side, side + if src_w >= src_h: + return side, max(1, round(side * src_h / max(src_w, 1))) + return max(1, round(side * src_w / max(src_h, 1))), side + + +def _normalize_asset_image( + img_bytes: bytes, + out_path: Path, + source_path: Path, + size: AssetSize, + background: AssetBackground = "white", + square: bool = False, +) -> tuple[int, int]: + import io as _io + target_w, target_h = _asset_target_size(source_path, size, square=square) + bg = (255, 255, 255) if background == "white" else (0, 0, 0) + out_path.parent.mkdir(parents=True, exist_ok=True) + with Image.open(_io.BytesIO(img_bytes)) as raw: + img = raw.convert("RGB") + img.thumbnail((target_w, target_h), Image.Resampling.LANCZOS) + canvas = Image.new("RGB", (target_w, target_h), bg) + canvas.paste(img, ((target_w - img.width) // 2, (target_h - img.height) // 2)) + canvas.save(out_path, "JPEG", quality=95) + return target_w, target_h + + +def _asset_url(job_id: str, asset_id: str) -> str: + return f"/jobs/{job_id}/assets/{asset_id}.jpg" + + +def _find_frame(job: Job, idx: int) -> KeyFrame: + frame = next((f for f in job.frames if f.index == idx), None) + if not frame: + raise HTTPException(404, "frame not found") + return frame + + +def _source_frame_path(job_id: str, idx: int) -> Path: + cleaned_path = job_dir(job_id) / "cleaned" / f"{idx:03d}.jpg" + if cleaned_path.exists(): + return cleaned_path + return job_dir(job_id) / "frames" / f"{idx:03d}.jpg" + + +def _focus_source_for_element(job_id: str, idx: int, el: KeyElement) -> tuple[Path, Path | None]: + import tempfile as _tempfile + src = _source_frame_path(job_id, idx) + tmp_focus: Path | None = None + model_src = src + if not el.region: + return model_src, tmp_focus + try: + im = Image.open(src).convert("RGB") + W, H = im.size + r = el.region + x = max(0.0, min(1.0, float(r.get("x", 0)))) + y = max(0.0, min(1.0, float(r.get("y", 0)))) + w = max(0.0, min(1.0 - x, float(r.get("w", 0)))) + h = max(0.0, min(1.0 - y, float(r.get("h", 0)))) + cx, cy = x + w / 2, y + h / 2 + ew, eh = w * 1.6, h * 1.6 + x0 = max(0.0, cx - ew / 2); y0 = max(0.0, cy - eh / 2) + x1 = min(1.0, cx + ew / 2); y1 = min(1.0, cy + eh / 2) + left, top, right, bottom = int(x0 * W), int(y0 * H), int(x1 * W), int(y1 * H) + if right - left > 8 and bottom - top > 8: + cropped = im.crop((left, top, right, bottom)) + tmp = _tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) + cropped.save(tmp.name, format="JPEG", quality=92) + tmp.close() + tmp_focus = Path(tmp.name) + model_src = tmp_focus + except Exception as e: + print(f"[focus source crop failed, fallback to full frame] {e}", flush=True) + return model_src, tmp_focus + + +def _make_reference_contact_sheet(job_id: str, frame_indices: list[int], out_path: Path) -> Path | None: + paths: list[Path] = [] + seen: set[int] = set() + for idx in frame_indices: + if idx in seen: + continue + seen.add(idx) + p = _source_frame_path(job_id, idx) + if p.exists(): + paths.append(p) + if len(paths) >= 6: + break + if len(paths) <= 1: + return None + + thumbs: list[Image.Image] = [] + for p in paths: + try: + im = Image.open(p).convert("RGB") + im.thumbnail((420, 420), Image.Resampling.LANCZOS) + canvas = Image.new("RGB", (420, 420), (245, 245, 245)) + canvas.paste(im, ((420 - im.width) // 2, (420 - im.height) // 2)) + thumbs.append(canvas) + except Exception: + continue + if len(thumbs) <= 1: + return None + + cols = 3 if len(thumbs) > 2 else 2 + rows = (len(thumbs) + cols - 1) // cols + sheet = Image.new("RGB", (cols * 420, rows * 420), (245, 245, 245)) + for i, thumb in enumerate(thumbs): + sheet.paste(thumb, ((i % cols) * 420, (i // cols) * 420)) + out_path.parent.mkdir(parents=True, exist_ok=True) + sheet.save(out_path, "JPEG", quality=92) + return out_path + + +def _subject_view_labels(kind: SubjectKind) -> list[tuple[SubjectView, str]]: + if kind == "living": + return [ + ("front", "正面站立"), + ("back", "背面站立"), + ("side", "侧面站立"), + ("side_walk", "侧面走路"), + ("top", "顶部视角"), + ("bottom", "底部视角"), + ("expression", "表情参考"), + ] + return [ + ("front", "正面"), + ("back", "背面"), + ("left", "左侧"), + ("right", "右侧"), + ("top", "顶部"), + ("bottom", "底部"), + ] + + def _attach_temporal_metrics(items: list[dict]) -> None: """相邻低清帧差异:转场 / 动作目标依赖它,不需要逐帧高分辨率扫描。""" for i, it in enumerate(items): @@ -1672,6 +1900,19 @@ class UpdateElementReq(BaseModel): position: str | None = None +class GenerateSceneAssetReq(BaseModel): + quality: AssetQuality = "hd" + size: AssetSize = "source" + + +class GenerateSubjectAssetsReq(BaseModel): + subject_kind: SubjectKind = "object" + background: AssetBackground = "white" + quality: AssetQuality = "hd" + size: AssetSize = "source" + source_frame_indices: list[int] | None = None + + @app.post("/jobs/{job_id}/frames/{idx}/elements", response_model=Job) def add_element(job_id: str, idx: int, req: AddElementReq) -> Job: """加一条元素 · 若 name_en 缺则自动 zh→en 翻译""" @@ -1787,6 +2028,57 @@ def delete_element(job_id: str, idx: int, element_id: str) -> Job: return job +@app.post("/jobs/{job_id}/frames/{idx}/scene-asset", response_model=Job) +def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> Job: + """为关键帧生成一张干净、高清的场景参考图。默认一帧只需要一张,重跑会保留历史供人工比对。""" + import time as _time + job = JOBS.get(job_id) + if not job: + raise HTTPException(404, "job not found") + frame = _find_frame(job, idx) + src = _source_frame_path(job_id, idx) + if not src.exists(): + raise HTTPException(404, "source frame file missing") + + prompt = ( + "Create one clean high-definition scene reference image from this frame. " + "Remove watermarks, platform UI, captions, usernames, hashtags, logos, and overlay graphics. " + "Preserve the original camera angle, composition, environment, lighting style, and believable spatial layout. " + "Do not create multiple views. Do not isolate objects. Keep it useful as the scene/background reference for image-to-video generation. " + "Enhance clarity and texture while avoiding over-smoothing or changing important visual details." + ) + models = [IMAGE_MODEL, "gemini-3.1-flash-image-preview", "gemini-2.5-flash-image"] + try: + img_bytes, _mode = _image_edit_call(src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280) + except RuntimeError as e: + raise HTTPException(500, f"scene asset failed: {e}") + + asset_id = f"scene_{idx:03d}_{uuid.uuid4().hex[:8]}" + out_path = job_dir(job_id) / "assets" / f"{asset_id}.jpg" + width, height = _normalize_asset_image(img_bytes, out_path, src, req.size, "white", square=False) + report = _image_quality_report(out_path) + scene = SceneAsset( + id=asset_id, + label=f"分镜 {idx + 1} 场景图", + url=_asset_url(job_id, asset_id), + width=width, + height=height, + quality=req.quality, + size=req.size, + quality_report=report, + created_at=_time.time(), + ) + + new_frames = [] + for f in job.frames: + if f.index == idx: + f.quality_report = _image_quality_report(src) + f.scene_assets = (f.scene_assets or []) + [scene] + new_frames.append(f) + update(job, frames=new_frames, message=f"场景图生成完成 · 分镜 {idx + 1}") + return job + + @app.post("/jobs/{job_id}/frames/{idx}/elements/{element_id}/cutout", response_model=Job) def cutout_element(job_id: str, idx: int, element_id: str) -> Job: """AI 提取元素 · 每次累积一张新图: @@ -1881,6 +2173,96 @@ def cutout_element(job_id: str, idx: int, element_id: str) -> Job: return job +@app.post("/jobs/{job_id}/frames/{idx}/elements/{element_id}/subject-assets", response_model=Job) +def generate_subject_assets(job_id: str, idx: int, element_id: str, req: GenerateSubjectAssetsReq) -> Job: + """为一个主体生成多视角资产包。 + 如果传入 source_frame_indices,则把多张已选关键帧拼成参考板,表示这些帧都在服务同一个主体。""" + import time as _time + job = JOBS.get(job_id) + if not job: + raise HTTPException(404, "job not found") + frame = _find_frame(job, idx) + el = next((e for e in frame.elements if e.id == element_id), None) + if not el: + raise HTTPException(404, "element not found") + + source_indices = [int(x) for x in (req.source_frame_indices or [idx]) if isinstance(x, int) or str(x).isdigit()] + if idx not in source_indices: + source_indices = [idx] + source_indices + source_indices = list(dict.fromkeys(source_indices))[:6] + + model_src, tmp_focus = _focus_source_for_element(job_id, idx, el) + sheet_tmp: Path | None = None + if len(source_indices) > 1: + sheet_tmp = job_dir(job_id) / "tmp" / f"subject_refs_{idx:03d}_{element_id}_{uuid.uuid4().hex[:6]}.jpg" + sheet = _make_reference_contact_sheet(job_id, source_indices, sheet_tmp) + if sheet: + model_src = sheet + + target = (el.name_en or el.name_zh).strip() + bg_phrase = "pure white" if req.background == "white" else "pure black" + kind_phrase = "person, animal, or living character" if req.subject_kind == "living" else "object or product-like subject" + models = [IMAGE_MODEL, "gemini-3.1-flash-image-preview", "gemini-2.5-flash-image"] + generated: list[SubjectAsset] = [] + try: + for view, view_label in _subject_view_labels(req.subject_kind): + if view == "side_walk": + view_prompt = "side view in a natural walking pose, same identity and proportions" + elif view == "expression": + view_prompt = "clear expression reference, frontal or three-quarter standing pose, preserving the same identity" + else: + view_prompt = f"{view_label} view" + prompt = ( + f"Use the reference image(s) to generate a single {view_prompt} of the same {target}. " + f"The subject is a {kind_phrase}. If multiple frames are shown, treat them as evidence of one same subject, not multiple subjects. " + "Preserve identity, proportions, silhouette, material, colors, styling, and distinctive details across all generated views. " + f"Create a high-definition standalone asset on a {bg_phrase} background. " + "No extra objects, no original scene fragments, no text, no watermark, no UI. " + "If the source is incomplete or occluded, intelligently complete missing parts while staying consistent with the reference. " + "For living subjects, keep the body standing and readable; do not create medical, horror, or distorted anatomy." + ) + try: + img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280) + except RuntimeError as e: + raise HTTPException(500, f"subject asset {view} failed: {e}") + + asset_id = f"subject_{idx:03d}_{element_id}_{view}_{uuid.uuid4().hex[:8]}" + out_path = job_dir(job_id) / "assets" / f"{asset_id}.jpg" + width, height = _normalize_asset_image(img_bytes, out_path, _source_frame_path(job_id, idx), req.size, req.background, square=False) + generated.append(SubjectAsset( + id=asset_id, + view=view, + label=f"{el.name_zh} · {view_label}", + url=_asset_url(job_id, asset_id), + width=width, + height=height, + background=req.background, + quality=req.quality, + size=req.size, + source_frame_indices=source_indices, + created_at=_time.time(), + )) + finally: + for p in (tmp_focus, sheet_tmp): + if p and p.exists(): + try: p.unlink() + except OSError: pass + + src = _source_frame_path(job_id, idx) + new_frames = [] + for f in job.frames: + if f.index == idx: + f.quality_report = _image_quality_report(src, el.region) + for e in f.elements: + if e.id == element_id: + e.subject_kind = req.subject_kind + e.cutout_background = req.background + e.subject_assets = (e.subject_assets or []) + generated + new_frames.append(f) + update(job, frames=new_frames, message=f"主体资产包生成完成 · {el.name_zh} · {len(generated)} 张") + return job + + @app.delete("/jobs/{job_id}/frames/{idx}/elements/{element_id}/cutouts/{cutout_id}", response_model=Job) def delete_cutout(job_id: str, idx: int, element_id: str, cutout_id: str) -> Job: """删除该元素的某张提取图""" diff --git a/web/lib/api.ts b/web/lib/api.ts index bd4901b..e82e5b6 100644 --- a/web/lib/api.ts +++ b/web/lib/api.ts @@ -44,6 +44,8 @@ export interface KeyElement { cutouts?: string[] // v2 多张提取图 id 列表 cutout_id?: string | null // v1 兼容字段 cutout_background?: "white" | "black" + subject_kind?: SubjectKind + subject_assets?: SubjectAsset[] created_at?: number } @@ -123,6 +125,8 @@ export interface KeyFrame { description?: FrameDescription | null cleaned_url?: string | null cleaned_applied?: boolean + quality_report?: QualityReport | null + scene_assets?: SceneAsset[] elements?: KeyElement[] storyboard?: StoryboardScene | null generated_images?: GeneratedImage[] @@ -131,6 +135,46 @@ export interface KeyFrame { export type FrameExtractTarget = "balanced" | "subject" | "transition" | "expression" | "motion" export type FrameExtractMode = "replace" | "append" export type FrameExtractQuality = "auto" | "fast" | "accurate" | "ultra" +export type AssetBackground = "white" | "black" +export type AssetSize = "source" | "1024" | "1536" | "2048" +export type SubjectKind = "object" | "living" +export type SubjectView = "front" | "back" | "left" | "right" | "side" | "side_walk" | "top" | "bottom" | "expression" + +export interface QualityReport { + width: number + height: number + short_side: number + sharpness: number + risk: "ok" | "warn" | "bad" + warnings: string[] +} + +export interface SceneAsset { + id: string + label: string + url: string + width: number + height: number + quality: "hd" + size: AssetSize + quality_report?: QualityReport | null + created_at: number +} + +export interface SubjectAsset { + id: string + view: SubjectView + label: string + url: string + width: number + height: number + background: AssetBackground + quality: "hd" + size: AssetSize + source_frame_indices?: number[] + ai_completed?: boolean + created_at: number +} export interface TranscriptSegment { index: number @@ -142,7 +186,7 @@ export interface TranscriptSegment { export interface StoryboardImage { ref_id: string - kind: "keyframe" | "cutout" + kind: "keyframe" | "cutout" | "asset" frame_idx: number element_id?: string | null cutout_id?: string | null @@ -373,6 +417,10 @@ export function cutoutUrl(jobId: string, frameIndex: number, elementId: string, return `${API_BASE}/jobs/${jobId}/frames/${frameIndex}/elements/${elementId}/cutout.jpg` } +export function jobAssetUrl(jobId: string, assetId: string): string { + return `${API_BASE}/jobs/${jobId}/assets/${assetId}.jpg` +} + // 兼容 v1 (cutout_id) / v2 (cutouts 数组) — 返回"有没有提取图" export function hasCutout(e: KeyElement): boolean { return (Array.isArray(e.cutouts) && e.cutouts.length > 0) || !!e.cutout_id @@ -601,3 +649,49 @@ export async function cutoutElement(jobId: string, frameIdx: number, elementId: } return res.json() } + +export async function generateSceneAsset( + jobId: string, + frameIdx: number, + body: { size?: AssetSize } = {}, +): Promise { + const res = await fetch(`${API_BASE}/jobs/${jobId}/frames/${frameIdx}/scene-asset`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ quality: "hd", size: body.size ?? "source" }), + }) + if (!res.ok) { + const txt = await res.text().catch(() => "") + throw new Error(`sceneAsset ${res.status} ${txt.slice(0, 300)}`) + } + return res.json() +} + +export async function generateSubjectAssets( + jobId: string, + frameIdx: number, + elementId: string, + body: { + subject_kind?: SubjectKind + background?: AssetBackground + size?: AssetSize + source_frame_indices?: number[] + } = {}, +): Promise { + const res = await fetch(`${API_BASE}/jobs/${jobId}/frames/${frameIdx}/elements/${elementId}/subject-assets`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + quality: "hd", + subject_kind: body.subject_kind ?? "object", + background: body.background ?? "white", + size: body.size ?? "source", + source_frame_indices: body.source_frame_indices ?? null, + }), + }) + if (!res.ok) { + const txt = await res.text().catch(() => "") + throw new Error(`subjectAssets ${res.status} ${txt.slice(0, 300)}`) + } + return res.json() +}