auto-save 2026-05-12 23:44 (~2)

This commit is contained in:
2026-05-12 23:44:18 +08:00
parent 447f116e66
commit 494d990577
2 changed files with 47 additions and 18 deletions

View File

@@ -475,6 +475,13 @@
"message": "auto-save 2026-05-12 23:32 (~1)",
"hash": "0c251a2",
"files_changed": 1
},
{
"ts": "2026-05-12T23:38:47+08:00",
"type": "commit",
"message": "auto-save 2026-05-12 23:38 (~5)",
"hash": "447f116",
"files_changed": 5
}
]
}

View File

@@ -597,10 +597,11 @@ def describe_frame(job_id: str, idx: int) -> Job:
raise HTTPException(404, "frame file not found")
import base64 as b64lib
import re as _re
img_b64 = b64lib.b64encode(p.read_bytes()).decode("ascii")
prompt = (
"请识别这张图,输出严格 JSON不要 markdown 不要解释):\n"
"请识别这张图,输出严格 JSON不要 markdown 不要解释,不要思考\n"
'{\n'
' "scene": "一句话描述场景",\n'
' "objects": [{"name": "物体名(中文)", "position": "在画面哪里", "color": "颜色", "extract_prompt": "用于提取该元素的英文 prompt"}],\n'
@@ -610,23 +611,44 @@ def describe_frame(job_id: str, idx: int) -> Job:
"要求objects 列出 3-8 个画面里**可独立提取**的主要元素extract_prompt 用于后续 image edit 模型。"
)
try:
resp = llm().chat.completions.create(
model=VISION_MODEL,
messages=[{"role": "user", "content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
]}],
response_format={"type": "json_object"},
temperature=0.3,
max_tokens=1500,
)
content = resp.choices[0].message.content or "{}"
data = json.loads(content)
except json.JSONDecodeError as e:
raise HTTPException(500, f"vision returned invalid JSON: {e}")
except Exception as e:
raise HTTPException(500, f"vision failed: {e}")
last_err = ""
data = None
for attempt in range(3):
try:
resp = llm().chat.completions.create(
model=VISION_MODEL,
messages=[{"role": "user", "content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
]}],
response_format={"type": "json_object"},
temperature=0.3,
max_tokens=3000,
)
content = (resp.choices[0].message.content or "").strip()
if not content:
# thinking 模型可能 content 空;尝试取 reasoning_content 里挖 JSON
rc = getattr(resp.choices[0].message, "reasoning_content", "") or ""
m = _re.search(r"\{[\s\S]*\}", rc)
content = m.group(0) if m else ""
# 剥掉 ```json ... ``` 包装
content = _re.sub(r"^```(?:json)?\s*|\s*```$", "", content).strip()
if not content:
last_err = f"empty content (attempt {attempt + 1})"
continue
data = json.loads(content)
break
except json.JSONDecodeError as e:
last_err = f"json decode (attempt {attempt + 1}): {e} · raw[:200]={content[:200]}"
print(f"[vision retry] {last_err}", flush=True)
continue
except Exception as e:
last_err = f"vision call (attempt {attempt + 1}): {e}"
print(f"[vision retry] {last_err}", flush=True)
continue
if data is None:
raise HTTPException(500, last_err or "vision failed after 3 retries")
# 写回 job
new_frames = []