auto-save 2026-05-14 06:33 (~5)

2026-05-14 06:33:37 +08:00
parent 6480d69c63
commit 0d86b4cff2
5 changed files with 271 additions and 96 deletions
--- a/api/main.py
+++ b/api/main.py
@@ -1957,6 +1957,8 @@ class GenerateSceneAssetReq(BaseModel):
    size: AssetSize = "source"
    scene_mode: SceneMode = "remove_subject"
    scene_style: SceneStyle = "source"
+    prompt: str = ""
+    source_frame_indices: list[int] | None = None


 class GenerateSubjectAssetsReq(BaseModel):
@@ -2096,6 +2098,18 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
    if not src.exists():
        raise HTTPException(404, "source frame file missing")

+    source_indices = [int(x) for x in (req.source_frame_indices or [idx]) if isinstance(x, int) or str(x).isdigit()]
+    if not source_indices:
+        source_indices = [idx]
+    source_indices = list(dict.fromkeys(source_indices))[:8]
+    model_src = src
+    sheet_tmp: Path | None = None
+    if len(source_indices) > 1:
+        sheet_tmp = job_dir(job_id) / "tmp" / f"scene_refs_{idx:03d}_{uuid.uuid4().hex[:6]}.jpg"
+        sheet = _make_reference_contact_sheet(job_id, source_indices, sheet_tmp)
+        if sheet:
+            model_src = sheet
+
    confirmed_subjects = [
        (e.name_en or e.name_zh).strip()
        for ref_frame in job.frames
@@ -2136,10 +2150,23 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
        "warm_lifestyle": "Use a warm lifestyle style: realistic lived-in details, soft natural light, approachable atmosphere.",
        "cinematic": "Use a cinematic style: dramatic but natural lighting, richer depth, filmic contrast, not fantasy.",
    }[req.scene_style]
+    user_prompt = req.prompt.strip()
+    user_prompt_clause = (
+        "User scene direction: " + user_prompt[:1200] + " "
+        if user_prompt
+        else ""
+    )
+    reference_clause = (
+        f"Use the selected reference frame contact sheet as visual evidence for location, composition, lighting, materials, and atmosphere. Reference frame indices: {', '.join(str(i + 1) for i in source_indices)}. "
+        if len(source_indices) > 1
+        else "Use the provided frame as the primary visual reference. "
+    )
    prompt = (
        "Create one clean high-definition scene/background reference image from this frame. "
        + subject_clause
        + "Do not include the removed subject, duplicate people, animals, products, text, watermark, platform UI, captions, usernames, hashtags, logos, or overlay graphics. "
+        + reference_clause
+        + user_prompt_clause
        + mode_clause + " "
        + style_clause + " "
        + "Enhance clarity and texture while avoiding over-smoothing, warped geometry, or changing important perspective details. "
@@ -2147,9 +2174,13 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
    )
    models = [IMAGE_MODEL, "gemini-3.1-flash-image-preview", "gemini-2.5-flash-image"]
    try:
-        img_bytes, _mode = _image_edit_call(src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
+        img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
    except RuntimeError as e:
        raise HTTPException(500, f"scene asset failed: {e}")
+    finally:
+        if sheet_tmp and sheet_tmp.exists():
+            try: sheet_tmp.unlink()
+            except OSError: pass

    asset_id = f"scene_{idx:03d}_{uuid.uuid4().hex[:8]}"
    out_path = job_dir(job_id) / "assets" / f"{asset_id}.jpg"
@@ -2306,23 +2337,27 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
    generated: list[SubjectAsset] = []
    try:
        for view, view_label in _subject_view_labels(req.subject_kind, req.views):
-            if view == "side_walk":
-                view_prompt = "side view in a natural walking pose, same identity and proportions"
-            elif view.startswith("expression_"):
-                emotion = view_label.replace("表情", "")
-                view_prompt = f"clear {emotion} facial expression reference, frontal or three-quarter standing pose, preserving the same identity"
-            elif view.startswith("action_"):
-                view_prompt = f"{view_label} reference pose, same identity and proportions"
+            if req.subject_kind == "living":
+                if view.startswith("expression_"):
+                    emotion = view_label.replace("表情", "")
+                    view_prompt = f"full-body upright standing character reference with a clear {emotion} facial expression"
+                elif view.startswith("action_") or view == "side_walk":
+                    view_prompt = f"full-body upright standing character reference, {view_label}, same identity and proportions"
+                else:
+                    view_prompt = f"full-body upright standing character reference, {view_label}"
            else:
-                view_prompt = f"{view_label} view"
+                view_prompt = f"complete object/product reference, {view_label} view"
            prompt = (
-                f"Use the reference image(s) to generate a single {view_prompt} of the same {target}. "
+                f"Use the reference image(s) only as visual evidence to redraw the same {target}; do not crop, cut out, paste, or extract pixels from the source. "
+                f"Generate one newly rendered {view_prompt} of the same subject. "
                f"The subject is a {kind_phrase}. If multiple frames are shown, treat them as evidence of one same subject, not multiple subjects. "
                "Preserve identity, proportions, silhouette, material, colors, styling, and distinctive details across all generated views. "
-                f"Create a high-definition standalone asset on a {bg_phrase} background. "
-                "No extra objects, no original scene fragments, no text, no watermark, no UI. "
-                "If the source is incomplete or occluded, intelligently complete missing parts while staying consistent with the reference. "
-                "For living subjects, keep the body standing and readable; do not create medical, horror, or distorted anatomy."
+                "The subject must be complete, centered, full body or full object, head-to-feet visible when applicable, not cropped by the canvas. "
+                "Make the subject large and readable: it should occupy about 85-95% of the image height with only small margins. "
+                f"Create a high-definition standalone asset on a solid {bg_phrase} background. "
+                "No extra objects, no props, no additional products, no background elements, no original scene fragments, no shadows from the original scene, no text, no watermark, no UI. "
+                "If the source is incomplete, partially visible, occluded, or low resolution, reconstruct the missing parts by redrawing a clean complete subject while staying consistent with the reference. "
+                "For living subjects, keep a normal upright standing pose for the standard views; do not create sitting, walking, medical, horror, or distorted anatomy unless explicitly requested by the view label."
            )
            try:
                img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
@@ -2331,7 +2366,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat

            asset_id = f"subject_{idx:03d}_{element_id}_{view}_{uuid.uuid4().hex[:8]}"
            out_path = job_dir(job_id) / "assets" / f"{asset_id}.jpg"
-            width, height = _normalize_asset_image(img_bytes, out_path, _source_frame_path(job_id, idx), req.size, req.background, square=False)
+            width, height = _normalize_asset_image(img_bytes, out_path, _source_frame_path(job_id, idx), req.size, req.background, square=False, fill_subject=True)
            generated.append(SubjectAsset(
                id=asset_id,
                view=view,