From 494d9905770011f2e7117105ae50cce94268dcd1 Mon Sep 17 00:00:00 2001
From: kang <wankang2050@gmail.com>
Date: Tue, 12 May 2026 23:44:18 +0800
Subject: [PATCH] auto-save 2026-05-12 23:44 (~2)

---
 .memory/worklog.json |  7 ++++++
 api/main.py          | 58 ++++++++++++++++++++++++++++++--------------
 2 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/.memory/worklog.json b/.memory/worklog.json
index 8c0751c..c880a01 100644
--- a/.memory/worklog.json
+++ b/.memory/worklog.json
@@ -475,6 +475,13 @@
       "message": "auto-save 2026-05-12 23:32 (~1)",
       "hash": "0c251a2",
       "files_changed": 1
+    },
+    {
+      "ts": "2026-05-12T23:38:47+08:00",
+      "type": "commit",
+      "message": "auto-save 2026-05-12 23:38 (~5)",
+      "hash": "447f116",
+      "files_changed": 5
     }
   ]
 }
diff --git a/api/main.py b/api/main.py
index 4f5051b..f194dcd 100644
--- a/api/main.py
+++ b/api/main.py
@@ -597,10 +597,11 @@ def describe_frame(job_id: str, idx: int) -> Job:
         raise HTTPException(404, "frame file not found")
 
     import base64 as b64lib
+    import re as _re
     img_b64 = b64lib.b64encode(p.read_bytes()).decode("ascii")
 
     prompt = (
-        "请识别这张图，输出严格 JSON（不要 markdown 不要解释）：\n"
+        "请识别这张图，输出严格 JSON（不要 markdown 不要解释，不要思考）：\n"
         '{\n'
         '  "scene": "一句话描述场景",\n'
         '  "objects": [{"name": "物体名（中文）", "position": "在画面哪里", "color": "颜色", "extract_prompt": "用于提取该元素的英文 prompt"}],\n'
@@ -610,23 +611,44 @@ def describe_frame(job_id: str, idx: int) -> Job:
         "要求：objects 列出 3-8 个画面里**可独立提取**的主要元素，extract_prompt 用于后续 image edit 模型。"
     )
 
-    try:
-        resp = llm().chat.completions.create(
-            model=VISION_MODEL,
-            messages=[{"role": "user", "content": [
-                {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
-            ]}],
-            response_format={"type": "json_object"},
-            temperature=0.3,
-            max_tokens=1500,
-        )
-        content = resp.choices[0].message.content or "{}"
-        data = json.loads(content)
-    except json.JSONDecodeError as e:
-        raise HTTPException(500, f"vision returned invalid JSON: {e}")
-    except Exception as e:
-        raise HTTPException(500, f"vision failed: {e}")
+    last_err = ""
+    data = None
+    for attempt in range(3):
+        try:
+            resp = llm().chat.completions.create(
+                model=VISION_MODEL,
+                messages=[{"role": "user", "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
+                ]}],
+                response_format={"type": "json_object"},
+                temperature=0.3,
+                max_tokens=3000,
+            )
+            content = (resp.choices[0].message.content or "").strip()
+            if not content:
+                # thinking 模型可能 content 空；尝试取 reasoning_content 里挖 JSON
+                rc = getattr(resp.choices[0].message, "reasoning_content", "") or ""
+                m = _re.search(r"\{[\s\S]*\}", rc)
+                content = m.group(0) if m else ""
+            # 剥掉 ```json ... ``` 包装
+            content = _re.sub(r"^```(?:json)?\s*|\s*```$", "", content).strip()
+            if not content:
+                last_err = f"empty content (attempt {attempt + 1})"
+                continue
+            data = json.loads(content)
+            break
+        except json.JSONDecodeError as e:
+            last_err = f"json decode (attempt {attempt + 1}): {e} · raw[:200]={content[:200]}"
+            print(f"[vision retry] {last_err}", flush=True)
+            continue
+        except Exception as e:
+            last_err = f"vision call (attempt {attempt + 1}): {e}"
+            print(f"[vision retry] {last_err}", flush=True)
+            continue
+
+    if data is None:
+        raise HTTPException(500, last_err or "vision failed after 3 retries")
 
     # 写回 job
     new_frames = []