fix: align generation size and duration options

2026-05-25 14:23:09 +08:00
parent fa64f95911
commit e77e77fada
5 changed files with 239 additions and 23 deletions
--- a/api/main.py
+++ b/api/main.py
@@ -124,6 +124,58 @@ IMAGE_FALLBACK_ENABLED = os.getenv("IMAGE_FALLBACK_ENABLED", "true").strip().low
 IMAGE_MODEL = GPT_IMAGE_MODEL
 PRODUCT_VIEW_MODEL = GPT_IMAGE_MODEL
 SUBJECT_ASSET_IMAGE_MODEL = GPT_IMAGE_MODEL
+IMAGE_SIZE_CHOICES = [  # returned by image_size_options(); the "value" fields are what _normalize_image_size() allows
+    {
+        "id": "auto",
+        "label": "自动",
+        "value": "auto",  # _image_size_payload() sends no "size" field for this choice
+        "description": "由图片模型自行决定输出尺寸",
+    },
+    {
+        "id": "1024x1536",
+        "label": "竖图 2:3",
+        "value": "1024x1536",
+        "description": "适合信息流营销图、人物和产品竖版构图",
+    },
+    {
+        "id": "1024x1024",
+        "label": "方图 1:1",
+        "value": "1024x1024",
+        "description": "适合头像、方形素材和电商图",
+    },
+    {
+        "id": "1536x1024",
+        "label": "横图 3:2",
+        "value": "1536x1024",
+        "description": "适合横版封面和详情页配图",
+    },
+]
+VIDEO_SIZE_CHOICES = [  # returned by video_size_options(); the "value" fields are what _normalize_video_size() allows
+    {
+        "id": "720x1280",
+        "label": "竖屏 9:16",
+        "value": "720x1280",  # _normalize_video_size() falls back to this value when no size is given
+        "description": "适合抖音、短视频和飞书内预览",
+    },
+    {
+        "id": "1280x720",
+        "label": "横屏 16:9",
+        "value": "1280x720",
+        "description": "适合横版展示和网页视频",
+    },
+    {
+        "id": "1024x1024",
+        "label": "方形 1:1",
+        "value": "1024x1024",
+        "description": "适合方形广告位",
+    },
+    {
+        "id": "960x1280",
+        "label": "竖屏 3:4",
+        "value": "960x1280",
+        "description": "适合更接近图文卡片的竖版素材",
+    },
+]
 SubjectModelBundle = Literal["gpt", "gemini"]
 SubjectAgentMode = Literal["realistic", "cartoon", "elements", "custom"]
 SUBJECT_AGENT_GPT_MODEL = gpt_model_env("SUBJECT_AGENT_GPT_MODEL", VISION_MODEL)
@@ -4134,6 +4186,67 @@ def image_model_options() -> list[dict]:
    return options


+def image_size_options() -> list[dict]:  # fresh copies, so callers cannot mutate IMAGE_SIZE_CHOICES
+    return [dict(choice) for choice in IMAGE_SIZE_CHOICES]
+
+
+def _normalize_image_size(raw: str | None) -> str:
+    """Resolve a size or alias to an IMAGE_SIZE_CHOICES value ("auto" if blank); HTTP 400 if unknown."""
+    value = (raw or "").strip().lower().replace(" ", "") or "auto"  # whitespace-only input also means "auto"
+    aliases = {
+        "vertical": "1024x1536",
+        "portrait": "1024x1536",
+        "竖图": "1024x1536",
+        "square": "1024x1024",
+        "方图": "1024x1024",
+        "horizontal": "1536x1024",
+        "landscape": "1536x1024",
+        "横图": "1536x1024",
+    }
+    value = aliases.get(value, value)
+    if value not in {str(item["value"]) for item in IMAGE_SIZE_CHOICES}:
+        raise HTTPException(400, f"unsupported image size: {raw}")
+    return value
+
+
+def _image_size_payload(raw: str | None) -> dict:
+    """Build the optional "size" request field; "auto" yields {} so the key is left out."""
+    return {"size": size} if (size := _normalize_image_size(raw)) != "auto" else {}
+
+
+def video_duration_options() -> list[int]:
+    """Return the selectable clip lengths in seconds; the list depends on video_uses_ark()."""
+    ark_backend = video_uses_ark()
+    return [5, 8, 10, 12, 15] if ark_backend else [4, 8, 12]
+
+
+def video_size_options() -> list[dict]:  # fresh copies, so callers cannot mutate VIDEO_SIZE_CHOICES
+    return [dict(choice) for choice in VIDEO_SIZE_CHOICES]
+
+
+def _normalize_video_size(raw: str | None) -> str:
+    """Resolve a size or alias to a VIDEO_SIZE_CHOICES value (720x1280 if blank); HTTP 400 if unknown."""
+    value = (raw or "").strip().lower().replace(" ", "") or "720x1280"  # whitespace-only input also gets the default
+    aliases = {
+        "vertical": "720x1280",
+        "portrait": "720x1280",
+        "9:16": "720x1280",
+        "竖屏": "720x1280",
+        "horizontal": "1280x720",
+        "landscape": "1280x720",
+        "16:9": "1280x720",
+        "横屏": "1280x720",
+        "square": "1024x1024",
+        "1:1": "1024x1024",
+        "方形": "1024x1024",
+        "3:4": "960x1280",
+    }
+    value = aliases.get(value, value)
+    if value not in {str(item["value"]) for item in VIDEO_SIZE_CHOICES}:
+        raise HTTPException(400, f"unsupported video size: {raw}")
+    return value
+
+
 def video_model_options() -> list[dict]:
    label_map = {
        "seedance": "Seedance",
@@ -4156,7 +4269,10 @@ def video_model_options() -> list[dict]:
            "id": key,
            "label": label_map.get(key, key),
            "model": model,
-            "description": "当前视频网关可选模型",
+            "description": f"当前视频网关可选模型；单次时长最高 {max(video_duration_options())} 秒",
+            "duration_options": video_duration_options(),
+            "size_options": video_size_options(),
+            "max_duration_seconds": max(video_duration_options()),
            "available": bool(video_api_key()),
        })
    default_model = resolve_video_model(VIDEO_MODEL)
@@ -4166,6 +4282,9 @@ def video_model_options() -> list[dict]:
            "label": label_map.get(VIDEO_MODEL, VIDEO_MODEL),
            "model": default_model,
            "description": "默认视频模型",
+            "duration_options": video_duration_options(),
+            "size_options": video_size_options(),
+            "max_duration_seconds": max(video_duration_options()),
            "available": bool(video_api_key()),
        })
    return options
@@ -4252,12 +4371,12 @@ def _image_endpoint(path: str) -> str:
    return f"{base}/{path.lstrip('/')}"


-def _image_generation_response(prompt: str, model: str) -> dict:
+def _image_generation_response(prompt: str, model: str, size: str | None = "auto") -> dict:
    with ai_http_client(timeout=IMAGE_REQUEST_TIMEOUT_SECONDS) as client:
        r = client.post(
            _image_endpoint("/images/generations"),
            headers={"Authorization": f"Bearer {IMAGE_API_KEY}"},
-            json={"model": model, "prompt": prompt, "n": 1},
+            json={"model": model, "prompt": prompt, "n": 1, **_image_size_payload(size)},
        )
        r.raise_for_status()
        return r.json()
@@ -5198,6 +5317,7 @@ def health() -> dict:
            "image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default",
            "image_request_timeout_seconds": IMAGE_REQUEST_TIMEOUT_SECONDS,
            "image_options": image_model_options(),
+            "image_size_options": image_size_options(),
            "ai_proxy_configured": bool(AI_HTTP_PROXY),
            "image_fallbacks": _image_fallback_models(),
            "image_circuit": _image_circuit_snapshot(),
@@ -5213,6 +5333,9 @@ def health() -> dict:
            "video": VIDEO_MODEL,
            "video_aliases": VIDEO_MODEL_ALIASES,
            "video_options": video_model_options(),
+            "video_duration_options": video_duration_options(),
+            "video_max_duration_seconds": max(video_duration_options()),
+            "video_size_options": video_size_options(),
            "video_provider": video_provider_name(),
            "video_base_url": video_api_base(),
            "video_configured": bool(video_api_key()),
@@ -5666,6 +5789,7 @@ class GenerateReq(BaseModel):
    extra_prompt: str = ""        # ✓ 需要的元素（正向）
    negative_prompt: str = ""     # ✗ 不需要的元素（负向）
    model: str = "auto"  # auto / gpt-image-2 / gemini-3-pro-image-preview
+    size: str = "auto"  # auto / 1024x1536 / 1024x1024 / 1536x1024
    mode: str = "edit"  # "edit" 带参考图，"text" 纯文字
    from_selected: bool = False   # True 时优先用 frame.selected 的生成图作 reference（迭代），否则原关键帧

@@ -5702,6 +5826,7 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job:
    if not raw_prompt:
        raise HTTPException(400, "prompt required")
    full_prompt = _ensure_english(raw_prompt)
+    image_size = _normalize_image_size(req.size)
    if not IMAGE_API_KEY:
        raise HTTPException(503, "IMAGE_API_KEY 或 LLM_API_KEY 未配置")

@@ -5742,14 +5867,14 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job:
                        headers={
                            "Authorization": f"Bearer {IMAGE_API_KEY}",
                        },
-                        data={"model": current_model, "prompt": full_prompt, "n": "1"},
+                        data={"model": current_model, "prompt": full_prompt, "n": "1", **_image_size_payload(image_size)},
                        files={"image": ("reference.jpg", img_bytes_in, "image/jpeg")},
                    )
                    r.raise_for_status()
                    resp_data = r.json()
            else:
                # text-only
-                resp_data = _image_generation_response(full_prompt, current_model)
+                resp_data = _image_generation_response(full_prompt, current_model, image_size)

            if resp_data.get("data"):
                effective_mode = f"{current_mode}:{current_model}"
@@ -7870,6 +7995,7 @@ def _enqueue_storyboard_videos(job: Job, frame: KeyFrame, req: GenerateStoryboar

    model = resolve_video_model(req.model)
    seconds = video_seconds(float(req.duration or 4))
+    video_size = _normalize_video_size(req.size)
    source_ref = req.source_ref
    if source_ref and source_ref.kind == "source_video" and not source_ref.url:
        source_ref = None
@@ -7894,7 +8020,7 @@ def _enqueue_storyboard_videos(job: Job, frame: KeyFrame, req: GenerateStoryboar
            progress=0,
            created_at=time.time(),
        ))
-        task_args = (job.id, local_id, "", ref_path, variant_prompt, model, seconds, req.size, source_ref, last_ref_path, reference_ref_paths, primary_role)
+        task_args = (job.id, local_id, "", ref_path, variant_prompt, model, seconds, video_size, source_ref, last_ref_path, reference_ref_paths, primary_role)
        if bg is not None:
            bg.add_task(render_storyboard_video, *task_args)
        else: