def _parse_vision_json(content: str | None) -> dict:
    """Parse the vision model's reply into a JSON object.

    Args:
        content: Raw message content returned by the model; ``None`` or an
            empty string is treated as an empty object (``"{}"``).

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the text is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass) or decodes to something other than
            a JSON object.
    """
    text = (content or "{}").strip()
    # The prompt asks for bare JSON, but a reply wrapped in a Markdown code
    # fence is still usable: drop the opening fence line and the closing fence.
    if text.startswith("```"):
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0].strip()
    data = json.loads(text)
    # KeyFrame.description is declared as ``dict | None``; an array or scalar
    # must be rejected here rather than stored on the job.
    if not isinstance(data, dict):
        raise ValueError(f"expected a JSON object, got {type(data).__name__}")
    return data


@app.post("/jobs/{job_id}/frames/{idx}/describe", response_model=Job)
def describe_frame(job_id: str, idx: int) -> Job:
    """Run the vision model on one key frame and store its structured description.

    The frame's JPEG is sent inline (base64 data URL) to ``VISION_MODEL``; the
    decoded JSON object is written to the matching frame's ``description`` and
    the job is passed through ``update`` before being returned.

    Args:
        job_id: Key of the job in ``JOBS``.
        idx: ``index`` of the key frame to describe.

    Returns:
        The job, with the frame's ``description`` filled in.

    Raises:
        HTTPException: 404 if the job, the frame entry, or the frame file is
            missing; 500 if the model call fails or its reply is not a JSON
            object.
    """
    job = JOBS.get(job_id)
    if not job:
        raise HTTPException(404, "job not found")
    frame = next((f for f in job.frames if f.index == idx), None)
    if frame is None:
        raise HTTPException(404, "frame not found")
    p = job_dir(job_id) / "frames" / f"{idx:03d}.jpg"
    if not p.exists():
        raise HTTPException(404, "frame file not found")

    import base64 as b64lib
    img_b64 = b64lib.b64encode(p.read_bytes()).decode("ascii")

    prompt = (
        "请识别这张图,输出严格 JSON(不要 markdown 不要解释):\n"
        '{\n'
        ' "scene": "一句话描述场景",\n'
        ' "objects": [{"name": "物体名(中文)", "position": "在画面哪里", "color": "颜色", "extract_prompt": "用于提取该元素的英文 prompt"}],\n'
        ' "style": "整体风格 / 打光 / 色调(一句话)",\n'
        ' "suggested_prompt": "适合用作下游生图的完整英文 prompt"\n'
        '}\n'
        "要求:objects 列出 3-8 个画面里**可独立提取**的主要元素,extract_prompt 用于后续 image edit 模型。"
    )

    # Only the remote call lives in this try block, so a parsing problem is
    # never reported as a transport failure (and vice versa).
    try:
        resp = llm().chat.completions.create(
            model=VISION_MODEL,
            messages=[{"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
            ]}],
            response_format={"type": "json_object"},
            temperature=0.3,
            max_tokens=1500,
        )
        content = resp.choices[0].message.content
    except Exception as e:
        raise HTTPException(500, f"vision failed: {e}") from e

    try:
        data = _parse_vision_json(content)
    except ValueError as e:
        raise HTTPException(500, f"vision returned invalid JSON: {e}") from e

    # Write the result back onto the job: every frame carrying this index gets
    # the description, and update() receives a fresh list, as before.
    for f in job.frames:
        if f.index == idx:
            f.description = data
    update(job, frames=list(job.frames), message=f"识别完成 · 分镜 {idx + 1}")
    return job
interface Props { onClose: () => void onChange: (idx: number) => void onToggleSelect: (idx: number) => void + onJobUpdate?: (job: Job) => void } -export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, onChange, onToggleSelect }: Props) { +export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, onChange, onToggleSelect, onJobUpdate }: Props) { const [extractPrompt, setExtractPrompt] = useState("") + const [describing, setDescribing] = useState(false) useEffect(() => { if (activeIndex === null) return const onKey = (e: KeyboardEvent) => { + const inField = ["INPUT", "TEXTAREA"].includes((e.target as HTMLElement).tagName) if (e.key === "Escape") onClose() - if (e.key === "ArrowLeft" && activeIndex > 0) onChange(activeIndex - 1) - if (e.key === "ArrowRight" && activeIndex < frames.length - 1) onChange(activeIndex + 1) - if ((e.key === " " || e.key === "Enter") && (e.target as HTMLElement).tagName !== "INPUT" && (e.target as HTMLElement).tagName !== "TEXTAREA") { - e.preventDefault() - onToggleSelect(activeIndex) + if (!inField) { + if (e.key === "ArrowLeft" && activeIndex > 0) onChange(activeIndex - 1) + if (e.key === "ArrowRight" && activeIndex < frames.length - 1) onChange(activeIndex + 1) + if (e.key === " " || e.key === "Enter") { + e.preventDefault() + onToggleSelect(activeIndex) + } } } window.addEventListener("keydown", onKey) @@ -34,22 +40,37 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o if (activeIndex === null || !frames[activeIndex]) return null const f = frames[activeIndex] const isSelected = selected.has(f.index) + const desc = f.description + + const handleDescribe = async () => { + setDescribing(true) + try { + const updated = await describeFrame(jobId, f.index) + onJobUpdate?.(updated) + toast.success(`分镜 ${f.index + 1} 识别完成`) + } catch (e) { + toast.error("识别失败:" + (e instanceof Error ? 
e.message : String(e))) + } finally { + setDescribing(false) + } + } + + const copyText = (text: string) => { + navigator.clipboard.writeText(text).then(() => toast.success("已复制")) + } return (