diff --git a/.memory/worklog.json b/.memory/worklog.json index 6175d02..844e7e3 100644 --- a/.memory/worklog.json +++ b/.memory/worklog.json @@ -1290,6 +1290,13 @@ "type": "session-heartbeat", "message": "Claude 会话活跃 · 最近命令:claude · 2 项未提交变更 · 最近提交:auto-save 2026-05-13 10:55 (~4)", "files_changed": 2 + }, + { + "ts": "2026-05-13T11:01:06+08:00", + "type": "commit", + "message": "auto-save 2026-05-13 11:00 (~2)", + "hash": "08d7cb4", + "files_changed": 2 } ] } diff --git a/api/main.py b/api/main.py index 1ac1756..200f0de 100644 --- a/api/main.py +++ b/api/main.py @@ -69,7 +69,8 @@ class KeyElement(BaseModel): name_zh: str name_en: str = "" position: str = "" # 在画面中的位置描述(vision 给的) - source: Literal["auto", "manual"] = "manual" # auto=vision 识别 / manual=用户加 + source: Literal["auto", "manual", "region"] = "manual" # auto=vision / manual=用户加 / region=画框 + region: dict | None = None # 用户画框的相对坐标 {x,y,w,h}(用于精准抠图) cutout_id: str | None = None # 已抠图 → /jobs/{id}/frames/{idx}/elements/{element_id}/cutout.png created_at: float = 0.0 @@ -476,17 +477,33 @@ def _image_edit_call( model: str | None = None, fallback_text: bool = False, max_attempts: int = 3, + max_side: int = 1024, ) -> tuple[bytes, str]: """通用 image edit 调用 · 失败重试 + 可选 text fallback。 返回 (image_bytes, effective_mode) where effective_mode in {"edit","text"}。 - 失败 raise RuntimeError。""" + 失败 raise RuntimeError。 + 输入图自动 resize 到 max_side(默认 1024)边长后再 base64,避免大图把 Gemini + function call 输入挤超阈值导致 incomplete_generation。""" import base64 as b64lib + import io as _io import time as _time import httpx + from PIL import Image as _PILImage if not LLM_API_KEY: raise RuntimeError("LLM_API_KEY 未配置") model = model or IMAGE_MODEL - img_b64 = b64lib.b64encode(image_path.read_bytes()).decode("ascii") + # 缩到 max_side 内 + try: + im = _PILImage.open(image_path) + if max(im.size) > max_side: + im.thumbnail((max_side, max_side), _PILImage.LANCZOS) + buf = _io.BytesIO() + im.convert("RGB").save(buf, format="JPEG", quality=88) + img_bytes_in = buf.getvalue() + except Exception: + # PIL 失败兜底走原文件 + img_bytes_in = image_path.read_bytes() + img_b64 = b64lib.b64encode(img_bytes_in).decode("ascii") data_uri = f"data:image/jpeg;base64,{img_b64}" plan: list[str] = ["edit"] * max_attempts @@ -985,7 +1002,7 @@ class CleanupReq(BaseModel): def _region_to_phrase(r: dict) -> str: - """把相对坐标矩形转成方位描述给 prompt 用""" + """把相对坐标矩形转成简短方位描述给 prompt 用(避免百分号 / 括号触发模型异常)""" x = max(0.0, min(1.0, float(r.get("x", 0)))) y = max(0.0, min(1.0, float(r.get("y", 0)))) w = max(0.0, min(1.0 - x, float(r.get("w", 0)))) @@ -993,15 +1010,15 @@ def _region_to_phrase(r: dict) -> str: if w <= 0 or h <= 0: return "" cx, cy = x + w / 2, y + h / 2 - hpos = "left" if cx < 0.4 else "right" if cx > 0.6 else "center" + hpos = "left" if cx < 0.4 else "right" if cx > 0.6 else "middle" vpos = "top" if cy < 0.4 else "bottom" if cy > 0.6 else "middle" - quadrant = f"{vpos}-{hpos}" if hpos != "center" else vpos - x_pct = (int(x * 100), int((x + w) * 100)) - y_pct = (int(y * 100), int((y + h) * 100)) - return ( - f"the {quadrant} area of the image " - f"(roughly horizontal {x_pct[0]}%-{x_pct[1]}%, vertical {y_pct[0]}%-{y_pct[1]}%)" - ) + if hpos == "middle" and vpos == "middle": + return "center" + if hpos == "middle": + return vpos + if vpos == "middle": + return hpos + return f"{vpos} {hpos}" @app.post("/jobs/{job_id}/frames/{idx}/cleanup", response_model=Job) @@ -1023,14 +1040,11 @@ def cleanup_frame(job_id: str, idx: int, req: CleanupReq | None = None) -> Job: region_phrase = _region_to_phrase(req.region) if (req and req.region) else "" if region_phrase: prompt = ( - f"Remove text overlays only within {region_phrase}: watermarks, usernames, captions, hashtags, " - "platform logos. Keep every other part of the image exactly unchanged." + f"Erase the text and graphics in the {region_phrase} part of the image. " + "Keep all other parts unchanged." ) else: - prompt = ( - "Remove all text overlays from this image: watermarks, usernames, captions, hashtags, " - "platform logos. Keep the rest of the scene intact and natural." - ) + prompt = "Erase all watermarks and text overlays. Keep the scene natural." try: img_bytes, _mode = _image_edit_call(frame_path, prompt, fallback_text=False, max_attempts=3) except RuntimeError as e: @@ -1105,7 +1119,8 @@ class AddElementReq(BaseModel): name_zh: str name_en: str = "" position: str = "" - source: Literal["auto", "manual"] = "manual" + source: Literal["auto", "manual", "region"] = "manual" + region: dict | None = None @app.post("/jobs/{job_id}/frames/{idx}/elements", response_model=Job) @@ -1152,6 +1167,7 @@ def add_element(job_id: str, idx: int, req: AddElementReq) -> Job: name_en=name_en, position=req.position.strip(), source=req.source, + region=req.region, created_at=_time.time(), ) new_frames = [] @@ -1211,13 +1227,18 @@ def cutout_element(job_id: str, idx: int, element_id: str) -> Job: raise HTTPException(404, "source frame file missing") target = (el.name_en or el.name_zh).strip() - position_hint = f" Located {el.position}." if el.position else "" - prompt = ( - f"Extract the element '{target}' from this image as a standalone asset.{position_hint} " - "Output: the element on a fully transparent background (alpha channel), " - "isolated cleanly with no surrounding scene, no other objects, no shadows from the original scene. " - "Preserve the element's original colors, lighting, shape and proportions." - ) + region_phrase = _region_to_phrase(el.region) if el.region else "" + if region_phrase: + prompt = ( + f"Extract whatever is in the {region_phrase} part of the image as a standalone asset. " + "Output on transparent background, isolated, no other objects." + ) + else: + position_hint = f" Located in the {el.position} area." if el.position else "" + prompt = ( + f"Extract the {target} from this image as a standalone asset.{position_hint} " + "Output on transparent background, isolated, no other objects." + ) try: img_bytes, _mode = _image_edit_call(src, prompt, fallback_text=False, max_attempts=3) except RuntimeError as e: diff --git a/web/components/lightbox.tsx b/web/components/lightbox.tsx index 901140a..91d0b35 100644 --- a/web/components/lightbox.tsx +++ b/web/components/lightbox.tsx @@ -34,6 +34,9 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o const [cropMode, setCropMode] = useState(false) const [region, setRegion] = useState<{ x: number; y: number; w: number; h: number } | null>(null) const [dragStart, setDragStart] = useState<{ x: number; y: number } | null>(null) + const [extractNamePrompt, setExtractNamePrompt] = useState(false) // 提取模式:要用户填名字 + const [extractName, setExtractName] = useState("") + const [extracting, setExtracting] = useState(false) const imgWrapRef = useRef(null) useEffect(() => setMounted(true), []) @@ -42,6 +45,8 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o setCropMode(false) setRegion(null) setDragStart(null) + setExtractNamePrompt(false) + setExtractName("") }, [activeIndex]) useEffect(() => { @@ -99,6 +104,35 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o } } + const handleExtractRegion = async () => { + if (!region || !extractName.trim()) return + setExtracting(true) + try { + // 先加 element 拿到 id + const added = await addElement(jobId, f.index, { + name_zh: extractName.trim(), + source: "region", + region, + }) + onJobUpdate?.(added) + // 找到新加的 element id(按 created_at desc 取最新一条) + const fr = added.frames.find((x) => x.index === f.index) + const newEl = fr?.elements?.sort((a, b) => (b.created_at ?? 0) - (a.created_at ?? 0))[0] + if (newEl) { + const cut = await cutoutElement(jobId, f.index, newEl.id) + onJobUpdate?.(cut) + toast.success(`「${extractName.trim()}」已提取并加入元素清单`) + } else { + toast.success(`「${extractName.trim()}」已加入元素清单 · 但抠图未触发`) + } + setCropMode(false); setRegion(null); setExtractNamePrompt(false); setExtractName("") + } catch (e) { + toast.error("提取失败:" + (e instanceof Error ? e.message : String(e))) + } finally { + setExtracting(false) + } + } + // 画框 mouse handlers — 坐标基于 img wrapper 相对位置 const getRelXY = (clientX: number, clientY: number) => { const el = imgWrapRef.current @@ -294,32 +328,79 @@ export function FrameLightbox({ jobId, frames, activeIndex, selected, onClose, o {/* 画框工具栏 */} {cropMode ? ( -
- - -
+ extractNamePrompt ? ( + // 提取模式:要用户填名字 +
+ setExtractName(e.target.value)} + onKeyDown={(e) => { + if (e.key === "Enter" && !e.nativeEvent.isComposing && extractName.trim()) { + e.preventDefault() + handleExtractRegion() + } + if (e.key === "Escape") { setExtractNamePrompt(false); setExtractName("") } + }} + placeholder="给这个元素起个中文名(如:左下角药瓶)" + className="w-full text-[11.5px] px-2 py-1.5 rounded-md bg-black/40 border border-violet-300/50 outline-none text-white placeholder:text-white/30 focus:ring-2 focus:ring-violet-400/40" + /> +
+ + +
+
+ ) : ( + // 画框完成 → 选操作 +
+ + + +
+ ) ) : ( )} diff --git a/web/lib/api.ts b/web/lib/api.ts index b88e477..47dc0b5 100644 --- a/web/lib/api.ts +++ b/web/lib/api.ts @@ -39,7 +39,8 @@ export interface KeyElement { name_zh: string name_en: string position?: string - source: "auto" | "manual" + source: "auto" | "manual" | "region" + region?: { x: number; y: number; w: number; h: number } | null cutout_id?: string | null created_at?: number } @@ -237,7 +238,13 @@ export async function applyCleanedFrame(jobId: string, frameIdx: number): Promis export async function addElement( jobId: string, frameIdx: number, - body: { name_zh: string; name_en?: string; position?: string; source?: "auto" | "manual" }, + body: { + name_zh: string + name_en?: string + position?: string + source?: "auto" | "manual" | "region" + region?: { x: number; y: number; w: number; h: number } | null + }, ): Promise { const res = await fetch(`${API_BASE}/jobs/${jobId}/frames/${frameIdx}/elements`, { method: "POST",