diff --git a/.memory/worklog.json b/.memory/worklog.json
index f94fc30..8c0751c 100644
--- a/.memory/worklog.json
+++ b/.memory/worklog.json
@@ -468,6 +468,13 @@
       "message": "auto-save 2026-05-12 23:27 (~1)",
       "hash": "df5fa84",
       "files_changed": 1
+    },
+    {
+      "ts": "2026-05-12T23:33:05+08:00",
+      "type": "commit",
+      "message": "auto-save 2026-05-12 23:32 (~1)",
+      "hash": "0c251a2",
+      "files_changed": 1
     }
   ]
 }
diff --git a/api/main.py b/api/main.py
index 1b9d866..4f5051b 100644
--- a/api/main.py
+++ b/api/main.py
@@ -27,6 +27,7 @@ LLM_API_KEY = os.getenv("LLM_API_KEY", "").strip()
 ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1")
 TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
 REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
+VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
 
 # OpenAI 客户端（OpenAI 兼容网关，含 SKG ezlink）
 from openai import OpenAI
@@ -55,6 +56,7 @@ class KeyFrame(BaseModel):
     index: int
     timestamp: float
     url: str
+    description: dict | None = None  # vision 模型识别结果 {scene, objects, style, suggested_prompt}
 
 
 class TranscriptSegment(BaseModel):
@@ -579,3 +581,58 @@ def get_frame(job_id: str, idx: int):
     if not p.exists():
         raise HTTPException(404, "frame not found")
     return FileResponse(p, media_type="image/jpeg")
+
+
+@app.post("/jobs/{job_id}/frames/{idx}/describe", response_model=Job)
+def describe_frame(job_id: str, idx: int) -> Job:
+    """Describe one key frame with the vision model and store the result on the job.
+
+    Raises HTTPException: 404 if the job, frame entry or JPEG file is missing; 500 if the model call fails or its reply is not a JSON object.
+    """
+    job = JOBS.get(job_id)
+    if not job:
+        raise HTTPException(404, "job not found")
+    frame = next((f for f in job.frames if f.index == idx), None)
+    if not frame:
+        raise HTTPException(404, "frame not found")
+    p = job_dir(job_id) / "frames" / f"{idx:03d}.jpg"
+    if not p.exists():
+        raise HTTPException(404, "frame file not found")
+    import base64 as b64lib
+    img_b64 = b64lib.b64encode(p.read_bytes()).decode("ascii")
+
+    prompt = (
+        "请识别这张图，输出严格 JSON（不要 markdown 不要解释）：\n"
+        '{\n'
+        '  "scene": "一句话描述场景",\n'
+        '  "objects": [{"name": "物体名（中文）", "position": "在画面哪里", "color": "颜色", "extract_prompt": "用于提取该元素的英文 prompt"}],\n'
+        '  "style": "整体风格 / 打光 / 色调（一句话）",\n'
+        '  "suggested_prompt": "适合用作下游生图的完整英文 prompt"\n'
+        '}\n'
+        "要求：objects 列出 3-8 个画面里**可独立提取**的主要元素，extract_prompt 用于后续 image edit 模型。"
+    )
+
+    try:
+        resp = llm().chat.completions.create(
+            model=VISION_MODEL,
+            messages=[{"role": "user", "content": [
+                {"type": "text", "text": prompt},
+                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
+            ]}],
+            response_format={"type": "json_object"},
+            temperature=0.3,
+            max_tokens=1500,
+        )
+        data = json.loads(resp.choices[0].message.content or "{}")
+    except json.JSONDecodeError as e:
+        raise HTTPException(500, f"vision returned invalid JSON: {e}") from e
+    except Exception as e:
+        raise HTTPException(500, f"vision failed: {e}") from e
+    # json.loads also accepts arrays and scalars; KeyFrame.description is declared as a dict, so reject anything else.
+    if not isinstance(data, dict):
+        raise HTTPException(500, f"vision returned invalid JSON: expected an object, got {type(data).__name__}")
+    for f in job.frames:  # attach the result to every frame entry carrying this index
+        if f.index == idx:
+            f.description = data
+    update(job, frames=list(job.frames), message=f"识别完成 · 分镜 {idx + 1}")
+    return job
diff --git a/web/app/page.tsx b/web/app/page.tsx
index 1c3f471..f796bec 100644
--- a/web/app/page.tsx
+++ b/web/app/page.tsx
@@ -315,6 +315,7 @@ export default function Home() {
             onClose={() => setExpandedFrame(null)}
             onChange={setExpandedFrame}
             onToggleSelect={handleToggleFrame}
+            onJobUpdate={setJob}
           />
         )}
 
diff --git a/web/components/lightbox.tsx b/web/components/lightbox.tsx
index 37f1810..6594b50 100644
--- a/web/components/lightbox.tsx
+++ b/web/components/lightbox.tsx
@@ -1,7 +1,8 @@
 "use client"
 import { useEffect, useState } from "react"
-import { X, ChevronLeft, ChevronRight, Check, Sparkles, Wand2, Loader2, Eye } from "lucide-react"
-import { frameUrl, type KeyFrame } from "@/lib/api"
+import { X, ChevronLeft, ChevronRight, Check, Sparkles, Wand2, Loader2, Eye, RefreshCw, Copy } from "lucide-react"
+import { frameUrl, describeFrame, type KeyFrame, type Job } from "@/lib/api"
+import { toast } from "sonner"
 
 interface Props {
   jobId: string
@@ -11,20 +12,25 @@ interface Props {
   onClose: () => void
   onChange: (idx: number) => void
   onToggleSelect: (idx: number) => void
+  onJobUpdate?: (job: Job) => void
 }
 
-export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, onChange, onToggleSelect }: Props) {
+export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, onChange, onToggleSelect, onJobUpdate }: Props) {
   const [extractPrompt, setExtractPrompt] = useState("")
+  const [describing, setDescribing] = useState(false)
 
   useEffect(() => {
     if (activeIndex === null) return
     const onKey = (e: KeyboardEvent) => {
+      const inField = ["INPUT", "TEXTAREA"].includes((e.target as HTMLElement).tagName)
       if (e.key === "Escape") onClose()
-      if (e.key === "ArrowLeft" && activeIndex > 0) onChange(activeIndex - 1)
-      if (e.key === "ArrowRight" && activeIndex < frames.length - 1) onChange(activeIndex + 1)
-      if ((e.key === " " || e.key === "Enter") && (e.target as HTMLElement).tagName !== "INPUT" && (e.target as HTMLElement).tagName !== "TEXTAREA") {
-        e.preventDefault()
-        onToggleSelect(activeIndex)
+      if (!inField) {
+        if (e.key === "ArrowLeft" && activeIndex > 0) onChange(activeIndex - 1)
+        if (e.key === "ArrowRight" && activeIndex < frames.length - 1) onChange(activeIndex + 1)
+        if (e.key === " " || e.key === "Enter") {
+          e.preventDefault()
+          onToggleSelect(activeIndex)
+        }
       }
     }
     window.addEventListener("keydown", onKey)
@@ -34,22 +40,37 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
   if (activeIndex === null || !frames[activeIndex]) return null
   const f = frames[activeIndex]
   const isSelected = selected.has(f.index)
+  const desc = f.description
+  // Describe the active frame via the API, pass the refreshed job to the parent, and toast the outcome.
+  const handleDescribe = async () => {
+    setDescribing(true)
+    try {
+      const refreshed = await describeFrame(jobId, f.index)
+      if (onJobUpdate) onJobUpdate(refreshed)
+      toast.success(`分镜 ${f.index + 1} 识别完成`)
+    } catch (err) {
+      toast.error(`识别失败：${err instanceof Error ? err.message : String(err)}`)
+    } finally {
+      setDescribing(false)
+    }
+  }
+  // Copy text and toast the result; a missing clipboard API (insecure origin) or a rejected write shows an error instead of failing silently.
+  const copyText = async (text: string) => {
+    try { await navigator.clipboard.writeText(text); toast.success("已复制") } catch { toast.error("复制失败") }
+  }
 
   return (
     <div
       className="fixed inset-0 z-[100] bg-black/85 backdrop-blur-sm flex items-center justify-center"
       onClick={onClose}
     >
-      {/* 关闭 */}
       <button
         onClick={(e) => { e.stopPropagation(); onClose() }}
         className="absolute top-5 right-5 h-10 w-10 rounded-full bg-white/10 hover:bg-white/20 text-white flex items-center justify-center z-10"
-        aria-label="关闭"
       >
         <X className="h-5 w-5" />
       </button>
 
-      {/* 左右切换 */}
       {activeIndex > 0 && (
         <button
           onClick={(e) => { e.stopPropagation(); onChange(activeIndex - 1) }}
@@ -67,12 +88,8 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
         </button>
       )}
 
-      {/* 主体：左大图 + 右识别面板 */}
-      <div
-        onClick={(e) => e.stopPropagation()}
-        className="flex gap-4 max-w-[92vw] max-h-[92vh] items-start"
-      >
-        {/* 左侧：大图 + 底部 meta */}
+      <div onClick={(e) => e.stopPropagation()} className="flex gap-4 max-w-[92vw] max-h-[92vh] items-start">
+        {/* 左侧大图 */}
         <div className="flex flex-col items-center gap-3 flex-shrink-0">
           <img
             src={frameUrl(jobId, f.index)}
@@ -98,13 +115,12 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
               {isSelected ? "已选用" : "选用此帧"}
             </button>
           </div>
-          <div className="text-[10.5px] text-white/40 font-mono">←/→ 切换 · Space 选用 · ESC 关闭</div>
         </div>
 
-        {/* 右侧：识别 + 提取面板 */}
+        {/* 右侧识别面板 */}
         <div
           className="flex flex-col gap-3 overflow-y-auto rounded-2xl border border-white/15 bg-black/40 backdrop-blur-xl p-4"
-          style={{ width: 340, maxHeight: "80vh" }}
+          style={{ width: 360, maxHeight: "80vh" }}
         >
           {/* 识别到的元素 */}
           <section>
@@ -112,30 +128,84 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
               <div className="flex items-center gap-1.5 text-white text-[12.5px] font-semibold">
                 <Eye className="h-3.5 w-3.5" />
                 识别到的元素
+                {desc && <span className="text-[10px] text-emerald-400 font-mono ml-1">已识别</span>}
               </div>
               <button
-                disabled
-                className="text-[10.5px] text-white/40 px-2 py-0.5 rounded border border-white/10 cursor-not-allowed"
-                title="待 vision 模型接入"
+                onClick={handleDescribe}
+                disabled={describing}
+                className="text-[10.5px] text-white/70 hover:text-white px-2 py-0.5 rounded border border-white/20 hover:border-white/40 disabled:opacity-50 inline-flex items-center gap-1"
+                title="调 Gemini Vision 识别"
               >
-                ↻ 刷新
+                {describing ? <Loader2 className="h-2.5 w-2.5 animate-spin" /> : <RefreshCw className="h-2.5 w-2.5" />}
+                {desc ? "重新识别" : "识别"}
               </button>
             </div>
-            <div className="rounded-lg border border-dashed border-white/15 bg-white/[0.03] p-3 text-[11.5px] text-white/50 leading-relaxed">
-              <div className="flex items-center gap-1.5 text-white/40 mb-1.5">
-                <Loader2 className="h-3 w-3 animate-spin" />
-                等待 Vision 模型
+
+            {!desc ? (
+              <div className="rounded-lg border border-dashed border-white/15 bg-white/[0.03] p-3 text-[11.5px] text-white/50 leading-relaxed">
+                {describing ? (
+                  <div className="flex items-center gap-1.5 text-white/60">
+                    <Loader2 className="h-3 w-3 animate-spin" />
+                    Gemini Vision 识别中…
+                  </div>
+                ) : (
+                  <>点击右上角「识别」按钮，Gemini 2.5 看图给出场景描述 / 物体列表 / 风格 / 适合下游的 prompt。</>
+                )}
               </div>
-              点击「↻ 刷新」识别图中元素后，这里会列出：
-              <ul className="mt-1.5 ml-3 space-y-0.5 text-white/45">
-                <li>• 主体物（人 / 产品 / 道具）</li>
-                <li>• 场景 / 背景描述</li>
-                <li>• 风格 / 打光 / 色调</li>
-              </ul>
-              <div className="mt-2 text-[10.5px] text-white/30 font-mono">
-                依赖：Gemini Vision · SKG 网关 image 渠道待开通
+            ) : (
+              <div className="space-y-2.5 text-[11.5px]">
+                {desc.scene && (
+                  <div className="rounded-md bg-violet-500/10 border border-violet-400/25 px-2.5 py-2">
+                    <div className="text-[9.5px] uppercase tracking-widest text-violet-300 mb-1">场景</div>
+                    <div className="text-white leading-relaxed">{desc.scene}</div>
+                  </div>
+                )}
+                {desc.style && (
+                  <div className="rounded-md bg-amber-500/10 border border-amber-400/25 px-2.5 py-2">
+                    <div className="text-[9.5px] uppercase tracking-widest text-amber-300 mb-1">风格</div>
+                    <div className="text-white leading-relaxed">{desc.style}</div>
+                  </div>
+                )}
+                {desc.objects && desc.objects.length > 0 && (
+                  <div className="rounded-md bg-pink-500/10 border border-pink-400/25 px-2.5 py-2">
+                    <div className="text-[9.5px] uppercase tracking-widest text-pink-300 mb-1.5">
+                      物体（点击 → 自动填入提取框）
+                    </div>
+                    <div className="space-y-1.5">
+                      {desc.objects.map((o, i) => (
+                        <button
+                          key={i}
+                          onClick={() => setExtractPrompt(o.extract_prompt || o.name)}
+                          className="w-full text-left rounded bg-white/[0.04] hover:bg-white/[0.08] border border-white/10 hover:border-pink-300/40 px-2 py-1.5 transition"
+                        >
+                          <div className="text-white text-[11.5px] font-medium">
+                            {o.name}
+                            {o.position && <span className="text-white/40 ml-1.5 text-[10px]">· {o.position}</span>}
+                          </div>
+                          {o.extract_prompt && (
+                            <div className="text-[10px] text-white/40 mt-0.5 truncate font-mono">{o.extract_prompt}</div>
+                          )}
+                        </button>
+                      ))}
+                    </div>
+                  </div>
+                )}
+                {desc.suggested_prompt && (
+                  <div className="rounded-md bg-emerald-500/10 border border-emerald-400/25 px-2.5 py-2">
+                    <div className="flex items-center justify-between mb-1">
+                      <div className="text-[9.5px] uppercase tracking-widest text-emerald-300">建议 Prompt</div>
+                      <button
+                        onClick={() => copyText(desc.suggested_prompt!)}
+                        className="text-[9.5px] text-white/60 hover:text-white inline-flex items-center gap-0.5"
+                      >
+                        <Copy className="h-2.5 w-2.5" /> 复制
+                      </button>
+                    </div>
+                    <div className="text-white text-[10.5px] font-mono leading-relaxed">{desc.suggested_prompt}</div>
+                  </div>
+                )}
               </div>
-            </div>
+            )}
           </section>
 
           {/* 自定义提取 */}
@@ -147,21 +217,19 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
             <textarea
               value={extractPrompt}
               onChange={(e) => setExtractPrompt(e.target.value)}
-              placeholder="比如：最右边那个白瓶子 / 中间的胶囊"
+              placeholder="比如：rightmost white bottle"
               rows={2}
-              disabled
-              className="w-full text-[12px] px-2.5 py-1.5 rounded-md bg-black/40 border border-white/15 outline-none text-white placeholder:text-white/30 resize-none disabled:opacity-50 focus:ring-2 focus:ring-violet-400/50"
+              className="w-full text-[12px] px-2.5 py-1.5 rounded-md bg-black/40 border border-white/15 outline-none text-white placeholder:text-white/30 resize-none focus:ring-2 focus:ring-violet-400/50"
             />
             <button
               disabled
               className="mt-2 w-full text-[12px] py-1.5 rounded-md bg-violet-500/60 text-white inline-flex items-center justify-center gap-1.5 cursor-not-allowed disabled:opacity-50"
-              title="待 image edit 接入"
             >
               <Wand2 className="h-3.5 w-3.5" />
-              ⚡ 快速提取
+              ⚡ 快速提取（下一步实现）
             </button>
             <div className="mt-1.5 text-[10px] text-white/30 font-mono">
-              依赖：nano-banana-pro image edit
+              下一步：调 nano-banana-pro image edit
             </div>
           </section>
 
@@ -172,11 +240,15 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o
               已提取的元素
             </div>
             <div className="rounded-lg border border-dashed border-white/10 bg-white/[0.02] p-3 text-[11px] text-white/40 text-center">
-              暂无 · 提取后会出现在这里，可点击传入「生图」节点
+              暂无 · 提取后会出现在这里
             </div>
           </section>
         </div>
       </div>
+
+      <div className="absolute bottom-3 left-1/2 -translate-x-1/2 text-[10.5px] text-white/40 font-mono">
+        ←/→ 切换 · Space 选用 · ESC 关闭
+      </div>
     </div>
   )
 }
diff --git a/web/lib/api.ts b/web/lib/api.ts
index 0817c74..dd79ef8 100644
--- a/web/lib/api.ts
+++ b/web/lib/api.ts
@@ -10,10 +10,25 @@ export type JobStatus =
   | "transcribed"
   | "failed"
 
+export interface FrameObject {  // one extractable element reported by the vision model for a frame
+  name: string  // object name; the backend prompt asks for it in Chinese
+  position?: string  // where the object sits in the picture
+  color?: string  // object colour as described by the model
+  extract_prompt?: string  // English prompt for extracting this element; the lightbox falls back to `name` when it is empty
+}
+/** Vision-model description of a frame. All fields are optional; the backend stores the model's parsed JSON without checking individual fields. */
+export interface FrameDescription {
+  scene?: string  // one-sentence scene description
+  objects?: FrameObject[]  // the backend prompt asks for 3-8 independently extractable elements
+  style?: string  // overall style / lighting / colour tone, in one sentence
+  suggested_prompt?: string  // full English prompt intended for downstream image generation
+}
+/** A key frame of a job; field names mirror the backend KeyFrame model. */
 export interface KeyFrame {
   index: number
   timestamp: number
   url: string
+  description?: FrameDescription | null  // vision result; absent or null until the describe endpoint has filled it in
 }
 
 export interface TranscriptSegment {
@@ -93,6 +108,15 @@ export async function addManualFrame(id: string, t: number): Promise<Job> {
   return res.json()
 }
 
+/** POSTs to the frame's `describe` endpoint and resolves with the parsed JSON body, typed as a Job. */
+export async function describeFrame(jobId: string, frameIdx: number): Promise<Job> {
+  const response = await fetch(`${API_BASE}/jobs/${jobId}/frames/${frameIdx}/describe`, { method: "POST" })
+  if (response.ok) return response.json()
+  // Failure: surface the status plus at most 200 characters of the body; an unreadable body degrades to "".
+  const detail = await response.text().catch(() => "")
+  throw new Error(`describe ${response.status} ${detail.slice(0, 200)}`)
+}
+
 export function frameUrl(jobId: string, frameIndex: number): string {
   return `${API_BASE}/jobs/${jobId}/frames/${frameIndex}.jpg`
 }