auto-save 2026-05-12 23:44 (~2)
This commit is contained in:
@@ -475,6 +475,13 @@
|
||||
"message": "auto-save 2026-05-12 23:32 (~1)",
|
||||
"hash": "0c251a2",
|
||||
"files_changed": 1
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-12T23:38:47+08:00",
|
||||
"type": "commit",
|
||||
"message": "auto-save 2026-05-12 23:38 (~5)",
|
||||
"hash": "447f116",
|
||||
"files_changed": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
58
api/main.py
58
api/main.py
@@ -597,10 +597,11 @@ def describe_frame(job_id: str, idx: int) -> Job:
|
||||
raise HTTPException(404, "frame file not found")
|
||||
|
||||
import base64 as b64lib
|
||||
import re as _re
|
||||
img_b64 = b64lib.b64encode(p.read_bytes()).decode("ascii")
|
||||
|
||||
prompt = (
|
||||
"请识别这张图,输出严格 JSON(不要 markdown 不要解释):\n"
|
||||
"请识别这张图,输出严格 JSON(不要 markdown 不要解释,不要思考):\n"
|
||||
'{\n'
|
||||
' "scene": "一句话描述场景",\n'
|
||||
' "objects": [{"name": "物体名(中文)", "position": "在画面哪里", "color": "颜色", "extract_prompt": "用于提取该元素的英文 prompt"}],\n'
|
||||
@@ -610,23 +611,44 @@ def describe_frame(job_id: str, idx: int) -> Job:
|
||||
"要求:objects 列出 3-8 个画面里**可独立提取**的主要元素,extract_prompt 用于后续 image edit 模型。"
|
||||
)
|
||||
|
||||
try:
|
||||
resp = llm().chat.completions.create(
|
||||
model=VISION_MODEL,
|
||||
messages=[{"role": "user", "content": [
|
||||
{"type": "text", "text": prompt},
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
|
||||
]}],
|
||||
response_format={"type": "json_object"},
|
||||
temperature=0.3,
|
||||
max_tokens=1500,
|
||||
)
|
||||
content = resp.choices[0].message.content or "{}"
|
||||
data = json.loads(content)
|
||||
except json.JSONDecodeError as e:
|
||||
raise HTTPException(500, f"vision returned invalid JSON: {e}")
|
||||
except Exception as e:
|
||||
raise HTTPException(500, f"vision failed: {e}")
|
||||
last_err = ""
|
||||
data = None
|
||||
for attempt in range(3):
|
||||
try:
|
||||
resp = llm().chat.completions.create(
|
||||
model=VISION_MODEL,
|
||||
messages=[{"role": "user", "content": [
|
||||
{"type": "text", "text": prompt},
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
|
||||
]}],
|
||||
response_format={"type": "json_object"},
|
||||
temperature=0.3,
|
||||
max_tokens=3000,
|
||||
)
|
||||
content = (resp.choices[0].message.content or "").strip()
|
||||
if not content:
|
||||
# thinking 模型可能 content 空;尝试取 reasoning_content 里挖 JSON
|
||||
rc = getattr(resp.choices[0].message, "reasoning_content", "") or ""
|
||||
m = _re.search(r"\{[\s\S]*\}", rc)
|
||||
content = m.group(0) if m else ""
|
||||
# 剥掉 ```json ... ``` 包装
|
||||
content = _re.sub(r"^```(?:json)?\s*|\s*```$", "", content).strip()
|
||||
if not content:
|
||||
last_err = f"empty content (attempt {attempt + 1})"
|
||||
continue
|
||||
data = json.loads(content)
|
||||
break
|
||||
except json.JSONDecodeError as e:
|
||||
last_err = f"json decode (attempt {attempt + 1}): {e} · raw[:200]={content[:200]}"
|
||||
print(f"[vision retry] {last_err}", flush=True)
|
||||
continue
|
||||
except Exception as e:
|
||||
last_err = f"vision call (attempt {attempt + 1}): {e}"
|
||||
print(f"[vision retry] {last_err}", flush=True)
|
||||
continue
|
||||
|
||||
if data is None:
|
||||
raise HTTPException(500, last_err or "vision failed after 3 retries")
|
||||
|
||||
# 写回 job
|
||||
new_frames = []
|
||||
|
||||
Reference in New Issue
Block a user