From 494d9905770011f2e7117105ae50cce94268dcd1 Mon Sep 17 00:00:00 2001 From: kang Date: Tue, 12 May 2026 23:44:18 +0800 Subject: [PATCH] auto-save 2026-05-12 23:44 (~2) --- .memory/worklog.json | 7 ++++++ api/main.py | 58 ++++++++++++++++++++++++++++++-------------- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/.memory/worklog.json b/.memory/worklog.json index 8c0751c..c880a01 100644 --- a/.memory/worklog.json +++ b/.memory/worklog.json @@ -475,6 +475,13 @@ "message": "auto-save 2026-05-12 23:32 (~1)", "hash": "0c251a2", "files_changed": 1 + }, + { + "ts": "2026-05-12T23:38:47+08:00", + "type": "commit", + "message": "auto-save 2026-05-12 23:38 (~5)", + "hash": "447f116", + "files_changed": 5 } ] } diff --git a/api/main.py b/api/main.py index 4f5051b..f194dcd 100644 --- a/api/main.py +++ b/api/main.py @@ -597,10 +597,11 @@ def describe_frame(job_id: str, idx: int) -> Job: raise HTTPException(404, "frame file not found") import base64 as b64lib + import re as _re img_b64 = b64lib.b64encode(p.read_bytes()).decode("ascii") prompt = ( - "请识别这张图,输出严格 JSON(不要 markdown 不要解释):\n" + "请识别这张图,输出严格 JSON(不要 markdown 不要解释,不要思考):\n" '{\n' ' "scene": "一句话描述场景",\n' ' "objects": [{"name": "物体名(中文)", "position": "在画面哪里", "color": "颜色", "extract_prompt": "用于提取该元素的英文 prompt"}],\n' @@ -610,23 +611,44 @@ def describe_frame(job_id: str, idx: int) -> Job: "要求:objects 列出 3-8 个画面里**可独立提取**的主要元素,extract_prompt 用于后续 image edit 模型。" ) - try: - resp = llm().chat.completions.create( - model=VISION_MODEL, - messages=[{"role": "user", "content": [ - {"type": "text", "text": prompt}, - {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}, - ]}], - response_format={"type": "json_object"}, - temperature=0.3, - max_tokens=1500, - ) - content = resp.choices[0].message.content or "{}" - data = json.loads(content) - except json.JSONDecodeError as e: - raise HTTPException(500, f"vision returned invalid JSON: {e}") - except Exception as e: - raise HTTPException(500, f"vision failed: {e}") + last_err = "" + data = None + for attempt in range(3): + try: + resp = llm().chat.completions.create( + model=VISION_MODEL, + messages=[{"role": "user", "content": [ + {"type": "text", "text": prompt}, + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}, + ]}], + response_format={"type": "json_object"}, + temperature=0.3, + max_tokens=3000, + ) + content = (resp.choices[0].message.content or "").strip() + if not content: + # thinking 模型可能 content 空;尝试取 reasoning_content 里挖 JSON + rc = getattr(resp.choices[0].message, "reasoning_content", "") or "" + m = _re.search(r"\{[\s\S]*\}", rc) + content = m.group(0) if m else "" + # 剥掉 ```json ... ``` 包装 + content = _re.sub(r"^```(?:json)?\s*|\s*```$", "", content).strip() + if not content: + last_err = f"empty content (attempt {attempt + 1})" + continue + data = json.loads(content) + break + except json.JSONDecodeError as e: + last_err = f"json decode (attempt {attempt + 1}): {e} · raw[:200]={content[:200]}" + print(f"[vision retry] {last_err}", flush=True) + continue + except Exception as e: + last_err = f"vision call (attempt {attempt + 1}): {e}" + print(f"[vision retry] {last_err}", flush=True) + continue + + if data is None: + raise HTTPException(500, last_err or "vision failed after 3 retries") # 写回 job new_frames = []