auto-save 2026-05-12 23:44 (~2)

2026-05-12 23:44:18 +08:00
parent 447f116e66
commit 494d990577
2 changed files with 47 additions and 18 deletions
--- a/.memory/worklog.json
+++ b/.memory/worklog.json
@@ -475,6 +475,13 @@
      "message": "auto-save 2026-05-12 23:32 (~1)",
      "hash": "0c251a2",
      "files_changed": 1
+    },
+    {
+      "ts": "2026-05-12T23:38:47+08:00",
+      "type": "commit",
+      "message": "auto-save 2026-05-12 23:38 (~5)",
+      "hash": "447f116",
+      "files_changed": 5
    }
  ]
 }
--- a/api/main.py
+++ b/api/main.py
@@ -597,10 +597,11 @@ def describe_frame(job_id: str, idx: int) -> Job:
        raise HTTPException(404, "frame file not found")

    import base64 as b64lib
+    import re as _re
    img_b64 = b64lib.b64encode(p.read_bytes()).decode("ascii")

    prompt = (
-        "请识别这张图，输出严格 JSON（不要 markdown 不要解释）：\n"
+        "请识别这张图，输出严格 JSON（不要 markdown 不要解释，不要思考）：\n"
        '{\n'
        '  "scene": "一句话描述场景",\n'
        '  "objects": [{"name": "物体名（中文）", "position": "在画面哪里", "color": "颜色", "extract_prompt": "用于提取该元素的英文 prompt"}],\n'
@@ -610,23 +611,44 @@ def describe_frame(job_id: str, idx: int) -> Job:
        "要求：objects 列出 3-8 个画面里**可独立提取**的主要元素，extract_prompt 用于后续 image edit 模型。"
    )

-    try:
-        resp = llm().chat.completions.create(
-            model=VISION_MODEL,
-            messages=[{"role": "user", "content": [
-                {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
-            ]}],
-            response_format={"type": "json_object"},
-            temperature=0.3,
-            max_tokens=1500,
-        )
-        content = resp.choices[0].message.content or "{}"
-        data = json.loads(content)
-    except json.JSONDecodeError as e:
-        raise HTTPException(500, f"vision returned invalid JSON: {e}")
-    except Exception as e:
-        raise HTTPException(500, f"vision failed: {e}")
+    last_err = ""
+    data = None
+    for attempt in range(3):
+        try:
+            resp = llm().chat.completions.create(
+                model=VISION_MODEL,
+                messages=[{"role": "user", "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
+                ]}],
+                response_format={"type": "json_object"},
+                temperature=0.3,
+                max_tokens=3000,
+            )
+            content = (resp.choices[0].message.content or "").strip()
+            if not content:
+                # thinking 模型可能 content 空；尝试取 reasoning_content 里挖 JSON
+                rc = getattr(resp.choices[0].message, "reasoning_content", "") or ""
+                m = _re.search(r"\{[\s\S]*\}", rc)
+                content = m.group(0) if m else ""
+            # 剥掉 ```json ... ``` 包装
+            content = _re.sub(r"^```(?:json)?\s*|\s*```$", "", content).strip()
+            if not content:
+                last_err = f"empty content (attempt {attempt + 1})"
+                continue
+            data = json.loads(content)
+            break
+        except json.JSONDecodeError as e:
+            last_err = f"json decode (attempt {attempt + 1}): {e} · raw[:200]={content[:200]}"
+            print(f"[vision retry] {last_err}", flush=True)
+            continue
+        except Exception as e:
+            last_err = f"vision call (attempt {attempt + 1}): {e}"
+            print(f"[vision retry] {last_err}", flush=True)
+            continue
+
+    if data is None:
+        raise HTTPException(500, last_err or "vision failed after 3 retries")

    # 写回 job
    new_frames = []