fix: align generation size and duration options

2026-05-25 14:23:09 +08:00
parent fa64f95911
commit e77e77fada
5 changed files with 239 additions and 23 deletions
--- a/RULES.md
+++ b/RULES.md
@@ -11,7 +11,7 @@
 - 详见 `CLAUDE.md` 立项决策段 + `.memory/plan.md` 七步管线拆解
 - 风格：`04-Dark-Gallery-Ambient`（路径：`~/Projects/research/20260305-网页风格库/04-Dark-Gallery-Ambient.md`）
 - 第一冲刺：步骤 1-4（下载 / 拆轨 / 关键帧 / ASR+翻译）
- 当前产品方向（2026-05-25 单对话框版）：默认首页彻底从“信息流广告复刻管线”切换为多人通用的 SKG 营销内容生成入口，服务约 6 名公司成员同时使用。首页默认只保留一个中央对话框，不再显示侧栏、灵感区、任务列表或大结果面板；用户先选择四种生成方式之一：文生视频、文生图、首帧生视频、首尾帧生视频，然后手写提示词并点击生成。首帧 / 首尾帧模式只露必要图片上传位，视频模式只保留时长选择。后端 `/health` 向前端返回可选图片 / 视频模型，首页允许用户选择图片模型（自动、GPT Image 2、Gemini 图片兜底）和视频模型（Seedance、Kling、Veo 3 等别名；实际可用模型以环境变量映射为准）。用户登录后仍只看到自己的任务、结果和详情页，继续沿用后端 owner 隔离；结果生成后从对话框下方进入 `/detail/?job=<id>` 沉淀参考图、生成图、视频候选和提示词。旧 TK 复刻工作台、Agent Cut 一键出片和营销图文方案保留为高级/详情页能力，不再作为默认首页入口或默认理解框架。
+- 当前产品方向（2026-05-25 单对话框版）：默认首页彻底从“信息流广告复刻管线”切换为多人通用的 SKG 营销内容生成入口，服务约 6 名公司成员同时使用。首页默认只保留一个中央对话框，不再显示侧栏、灵感区、任务列表或大结果面板；用户先选择四种生成方式之一：文生视频、文生图、首帧生视频、首尾帧生视频，然后手写提示词并点击生成。首帧 / 首尾帧模式只露必要图片上传位，图片模式显示尺寸选择，视频模式显示画幅和真实可用时长选择。后端 `/health` 向前端返回可选图片 / 视频模型、图片尺寸、视频画幅和视频时长，首页允许用户选择图片模型（自动、GPT Image 2、Gemini 图片兜底）和视频模型（Seedance、Kling、Veo 3 等别名；实际可用模型以环境变量映射为准）。当前 Doubao / Seedance 生产链路单条视频最长按 15 秒暴露，不在 UI 显示 30 秒；如后续要 30 秒，需要改成多段生成后合成。用户登录后仍只看到自己的任务、结果和详情页，继续沿用后端 owner 隔离；结果生成后从对话框下方进入 `/detail/?job=<id>` 沉淀参考图、生成图、视频候选和提示词。旧 TK 复刻工作台、Agent Cut 一键出片和营销图文方案保留为高级/详情页能力，不再作为默认首页入口或默认理解框架。
 ## 部署事实
 - 平台：VPS `76.13.31.179`（Ubuntu 24.04 / Docker Compose / Coolify Traefik）
--- a/api/main.py
+++ b/api/main.py
@@ -124,6 +124,58 @@ IMAGE_FALLBACK_ENABLED = os.getenv("IMAGE_FALLBACK_ENABLED", "true").strip().low
 IMAGE_MODEL = GPT_IMAGE_MODEL
 PRODUCT_VIEW_MODEL = GPT_IMAGE_MODEL
 SUBJECT_ASSET_IMAGE_MODEL = GPT_IMAGE_MODEL
 # Image output sizes offered to clients. image_size_options() serves this
 # list, and _normalize_image_size() uses the "value" fields as its allow-list.
 # The "auto" entry makes _image_size_payload() produce no "size" field at all.
 IMAGE_SIZE_CHOICES = [
    {
        "id": "auto",
        "label": "自动",
        "value": "auto",
        "description": "由图片模型自行决定输出尺寸",
    },
    {
        "id": "1024x1536",
        "label": "竖图 2:3",
        "value": "1024x1536",
        "description": "适合信息流营销图、人物和产品竖版构图",
    },
    {
        "id": "1024x1024",
        "label": "方图 1:1",
        "value": "1024x1024",
        "description": "适合头像、方形素材和电商图",
    },
    {
        "id": "1536x1024",
        "label": "横图 3:2",
        "value": "1536x1024",
        "description": "适合横版封面和详情页配图",
    },
 ]
 # Video frame sizes offered to clients. video_size_options() serves this
 # list, and _normalize_video_size() uses the "value" fields as its allow-list
 # (it defaults to "720x1280" when no size is supplied).
 VIDEO_SIZE_CHOICES = [
    {
        "id": "720x1280",
        "label": "竖屏 9:16",
        "value": "720x1280",
        "description": "适合抖音、短视频和飞书内预览",
    },
    {
        "id": "1280x720",
        "label": "横屏 16:9",
        "value": "1280x720",
        "description": "适合横版展示和网页视频",
    },
    {
        "id": "1024x1024",
        "label": "方形 1:1",
        "value": "1024x1024",
        "description": "适合方形广告位",
    },
    {
        "id": "960x1280",
        "label": "竖屏 3:4",
        "value": "960x1280",
        "description": "适合更接近图文卡片的竖版素材",
    },
 ]
 SubjectModelBundle = Literal["gpt", "gemini"]
 SubjectAgentMode = Literal["realistic", "cartoon", "elements", "custom"]
 SUBJECT_AGENT_GPT_MODEL = gpt_model_env("SUBJECT_AGENT_GPT_MODEL", VISION_MODEL)
@@ -4134,6 +4186,67 @@ def image_model_options() -> list[dict]:
    return options
 def image_size_options() -> list[dict]:
    """Return the selectable image sizes.

    Every entry is a shallow copy, so a caller that annotates or mutates the
    result cannot alter the module-level ``IMAGE_SIZE_CHOICES`` that
    ``_normalize_image_size`` validates against.
    """
    return [dict(choice) for choice in IMAGE_SIZE_CHOICES]
 def _normalize_image_size(raw: str | None) -> str:
    """Map a client-supplied image size onto one of the supported values.

    ``None``, empty and whitespace-only input fall back to ``"auto"``.
    Matching is case-insensitive and ignores all whitespace. Orientation
    words and aspect ratios are accepted as aliases for the pixel sizes,
    mirroring what ``_normalize_video_size`` accepts for video.

    Raises:
        HTTPException: 400 when the value is neither an alias nor one of the
            ``value`` entries of ``IMAGE_SIZE_CHOICES``.
    """
    # Remove whitespace *before* defaulting so a blank string behaves like a
    # missing value instead of being rejected as an unsupported size.
    value = "".join((raw or "").split()).lower()
    # Tolerate a full-width colon and a multiplication sign, e.g. "2：3", "1024×1536".
    value = value.replace("：", ":").replace("×", "x") or "auto"
    aliases = {
        "vertical": "1024x1536",
        "portrait": "1024x1536",
        "2:3": "1024x1536",
        "竖图": "1024x1536",
        "square": "1024x1024",
        "1:1": "1024x1024",
        "方图": "1024x1024",
        "horizontal": "1536x1024",
        "landscape": "1536x1024",
        "3:2": "1536x1024",
        "横图": "1536x1024",
    }
    value = aliases.get(value, value)
    allowed = {str(item["value"]) for item in IMAGE_SIZE_CHOICES}
    if value not in allowed:
        raise HTTPException(400, f"unsupported image size: {raw}")
    return value
 def _image_size_payload(raw: str | None) -> dict:
    """Build the optional ``size`` field for an image request body.

    The input is validated through ``_normalize_image_size``. ``"auto"``
    yields an empty dict, so unpacking the result into a request adds no
    ``size`` key; any other size yields ``{"size": <value>}``.
    """
    normalized = _normalize_image_size(raw)
    if normalized == "auto":
        return {}
    return {"size": normalized}
 def video_duration_options() -> list[int]:
    """Return the clip lengths, in seconds, offered for video generation.

    The choice depends on ``video_uses_ark()``: 5/8/10/12/15 when it is
    truthy, otherwise 4/8/12. A new list is built on every call.
    """
    return [5, 8, 10, 12, 15] if video_uses_ark() else [4, 8, 12]
 def video_size_options() -> list[dict]:
    """Return the selectable video frame sizes.

    Every entry is a shallow copy, so a caller that embeds or mutates the
    result cannot alter the module-level ``VIDEO_SIZE_CHOICES`` that
    ``_normalize_video_size`` validates against.
    """
    return [dict(choice) for choice in VIDEO_SIZE_CHOICES]
 def _normalize_video_size(raw: str | None) -> str:
    """Map a client-supplied video size onto one of the supported values.

    ``None``, empty and whitespace-only input fall back to ``"720x1280"``.
    Matching is case-insensitive and ignores all whitespace. Orientation
    words and aspect ratios are accepted as aliases for the pixel sizes.

    Raises:
        HTTPException: 400 when the value is neither an alias nor one of the
            ``value`` entries of ``VIDEO_SIZE_CHOICES``.
    """
    # Remove whitespace *before* defaulting so a blank string behaves like a
    # missing value instead of being rejected as an unsupported size.
    value = "".join((raw or "").split()).lower()
    # Tolerate a full-width colon and a multiplication sign, e.g. "9：16", "720×1280".
    value = value.replace("：", ":").replace("×", "x") or "720x1280"
    aliases = {
        "vertical": "720x1280",
        "portrait": "720x1280",
        "9:16": "720x1280",
        "竖屏": "720x1280",
        "horizontal": "1280x720",
        "landscape": "1280x720",
        "16:9": "1280x720",
        "横屏": "1280x720",
        "square": "1024x1024",
        "1:1": "1024x1024",
        "方形": "1024x1024",
        "3:4": "960x1280",
    }
    value = aliases.get(value, value)
    allowed = {str(item["value"]) for item in VIDEO_SIZE_CHOICES}
    if value not in allowed:
        raise HTTPException(400, f"unsupported video size: {raw}")
    return value
 def video_model_options() -> list[dict]:
    label_map = {
        "seedance": "Seedance",
@@ -4156,7 +4269,10 @@ def video_model_options() -> list[dict]:
            "id": key,
            "label": label_map.get(key, key),
            "model": model,
-            "description": "当前视频网关可选模型",
+            "description": f"当前视频网关可选模型；单次时长最高 {max(video_duration_options())} 秒",
            "duration_options": video_duration_options(),
            "size_options": video_size_options(),
            "max_duration_seconds": max(video_duration_options()),
            "available": bool(video_api_key()),
        })
    default_model = resolve_video_model(VIDEO_MODEL)
@@ -4166,6 +4282,9 @@ def video_model_options() -> list[dict]:
            "label": label_map.get(VIDEO_MODEL, VIDEO_MODEL),
            "model": default_model,
            "description": "默认视频模型",
            "duration_options": video_duration_options(),
            "size_options": video_size_options(),
            "max_duration_seconds": max(video_duration_options()),
            "available": bool(video_api_key()),
        })
    return options
@@ -4252,12 +4371,12 @@ def _image_endpoint(path: str) -> str:
    return f"{base}/{path.lstrip('/')}"
-def _image_generation_response(prompt: str, model: str) -> dict:
+def _image_generation_response(prompt: str, model: str, size: str | None = "auto") -> dict:
    with ai_http_client(timeout=IMAGE_REQUEST_TIMEOUT_SECONDS) as client:
        r = client.post(
            _image_endpoint("/images/generations"),
            headers={"Authorization": f"Bearer {IMAGE_API_KEY}"},
-            json={"model": model, "prompt": prompt, "n": 1},
+            json={"model": model, "prompt": prompt, "n": 1, **_image_size_payload(size)},
        )
        r.raise_for_status()
        return r.json()
@@ -5198,6 +5317,7 @@ def health() -> dict:
            "image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default",
            "image_request_timeout_seconds": IMAGE_REQUEST_TIMEOUT_SECONDS,
            "image_options": image_model_options(),
            "image_size_options": image_size_options(),
            "ai_proxy_configured": bool(AI_HTTP_PROXY),
            "image_fallbacks": _image_fallback_models(),
            "image_circuit": _image_circuit_snapshot(),
@@ -5213,6 +5333,9 @@ def health() -> dict:
            "video": VIDEO_MODEL,
            "video_aliases": VIDEO_MODEL_ALIASES,
            "video_options": video_model_options(),
            "video_duration_options": video_duration_options(),
            "video_max_duration_seconds": max(video_duration_options()),
            "video_size_options": video_size_options(),
            "video_provider": video_provider_name(),
            "video_base_url": video_api_base(),
            "video_configured": bool(video_api_key()),
@@ -5666,6 +5789,7 @@ class GenerateReq(BaseModel):
    extra_prompt: str = ""        # ✓ 需要的元素（正向）
    negative_prompt: str = ""     # ✗ 不需要的元素（负向）
    model: str = "auto"  # auto / gpt-image-2 / gemini-3-pro-image-preview
    size: str = "auto"  # auto / 1024x1536 / 1024x1024 / 1536x1024
    mode: str = "edit"  # "edit" 带参考图，"text" 纯文字
    from_selected: bool = False   # True 时优先用 frame.selected 的生成图作 reference（迭代），否则原关键帧
@@ -5702,6 +5826,7 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job:
    if not raw_prompt:
        raise HTTPException(400, "prompt required")
    full_prompt = _ensure_english(raw_prompt)
    image_size = _normalize_image_size(req.size)
    if not IMAGE_API_KEY:
        raise HTTPException(503, "IMAGE_API_KEY 或 LLM_API_KEY 未配置")
@@ -5742,14 +5867,14 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job:
                        headers={
                            "Authorization": f"Bearer {IMAGE_API_KEY}",
                        },
-                        data={"model": current_model, "prompt": full_prompt, "n": "1"},
+                        data={"model": current_model, "prompt": full_prompt, "n": "1", **_image_size_payload(image_size)},
                        files={"image": ("reference.jpg", img_bytes_in, "image/jpeg")},
                    )
                    r.raise_for_status()
                    resp_data = r.json()
            else:
                # text-only
-                resp_data = _image_generation_response(full_prompt, current_model)
+                resp_data = _image_generation_response(full_prompt, current_model, image_size)
            if resp_data.get("data"):
                effective_mode = f"{current_mode}:{current_model}"
@@ -7870,6 +7995,7 @@ def _enqueue_storyboard_videos(job: Job, frame: KeyFrame, req: GenerateStoryboar
    model = resolve_video_model(req.model)
    seconds = video_seconds(float(req.duration or 4))
    video_size = _normalize_video_size(req.size)
    source_ref = req.source_ref
    if source_ref and source_ref.kind == "source_video" and not source_ref.url:
        source_ref = None
@@ -7894,7 +8020,7 @@ def _enqueue_storyboard_videos(job: Job, frame: KeyFrame, req: GenerateStoryboar
            progress=0,
            created_at=time.time(),
        ))
-        task_args = (job.id, local_id, "", ref_path, variant_prompt, model, seconds, req.size, source_ref, last_ref_path, reference_ref_paths, primary_role)
+        task_args = (job.id, local_id, "", ref_path, variant_prompt, model, seconds, video_size, source_ref, last_ref_path, reference_ref_paths, primary_role)
        if bg is not None:
            bg.add_task(render_storyboard_video, *task_args)
        else:
--- a/docs/source-analysis.html
+++ b/docs/source-analysis.html
--- a/web/app/page.tsx
+++ b/web/app/page.tsx
@@ -29,6 +29,7 @@ import {
  type GeneratedVideo,
  type Job,
  type RuntimeModelOption,
  type RuntimeSizeOption,
 } from "@/lib/api"
 type CreationMode = "text-video" | "text-image" | "first-frame-video" | "first-last-frame-video"
@@ -99,6 +100,19 @@ function isVideoMode(mode: CreationMode) {
  return mode !== "text-image"
 }
 // Image size choices used as the initial state and as the fallback when the
 // runtime model info carries no image_size_options or cannot be loaded.
 const DEFAULT_IMAGE_SIZE_OPTIONS: RuntimeSizeOption[] = [
  { id: "1024x1536", label: "竖图 2:3", value: "1024x1536" },
  { id: "1024x1024", label: "方图 1:1", value: "1024x1024" },
  { id: "1536x1024", label: "横图 3:2", value: "1536x1024" },
  { id: "auto", label: "自动", value: "auto" },
 ]
 // Video frame choices used as the initial state and as the fallback when the
 // runtime model info carries no video_size_options or cannot be loaded.
 // Kept in step with the sizes the backend accepts, including 3:4.
 const DEFAULT_VIDEO_SIZE_OPTIONS: RuntimeSizeOption[] = [
  { id: "720x1280", label: "竖屏 9:16", value: "720x1280" },
  { id: "1280x720", label: "横屏 16:9", value: "1280x720" },
  { id: "1024x1024", label: "方形 1:1", value: "1024x1024" },
  { id: "960x1280", label: "竖屏 3:4", value: "960x1280" },
 ]
 export default function Home() {
  const [mode, setMode] = useState<CreationMode>("text-video")
  const [prompt, setPrompt] = useState("")
@@ -109,12 +123,17 @@ export default function Home() {
  const [lastFramePreview, setLastFramePreview] = useState("")
  const [imageModel, setImageModel] = useState("auto")
  const [videoModel, setVideoModel] = useState("seedance")
  const [imageSize, setImageSize] = useState("1024x1536")
  const [videoSize, setVideoSize] = useState("720x1280")
  const [videoDurationOptions, setVideoDurationOptions] = useState<number[]>([5, 8, 10, 12, 15])
  const [imageOptions, setImageOptions] = useState<RuntimeModelOption[]>([
    { id: "auto", label: "自动", model: "gpt-image-2", available: true },
  ])
  const [videoOptions, setVideoOptions] = useState<RuntimeModelOption[]>([
    { id: "seedance", label: "Seedance", model: "seedance", available: true },
  ])
  const [imageSizeOptions, setImageSizeOptions] = useState<RuntimeSizeOption[]>(DEFAULT_IMAGE_SIZE_OPTIONS)
  const [videoSizeOptions, setVideoSizeOptions] = useState<RuntimeSizeOption[]>(DEFAULT_VIDEO_SIZE_OPTIONS)
  const [job, setJob] = useState<Job | null>(null)
  const [busy, setBusy] = useState<BusyTask>(null)
  const [error, setError] = useState("")
@@ -140,14 +159,26 @@ export default function Home() {
        const nextVideoOptions = models?.video_options?.length
          ? models.video_options
          : [{ id: models?.video || "seedance", label: "Seedance", model: models?.video || "seedance", available: !!models?.video_configured }]
        const nextImageSizeOptions = models?.image_size_options?.length ? models.image_size_options : DEFAULT_IMAGE_SIZE_OPTIONS
        const nextVideoSizeOptions = models?.video_size_options?.length ? models.video_size_options : DEFAULT_VIDEO_SIZE_OPTIONS
        const nextDurationOptions = models?.video_duration_options?.length ? models.video_duration_options : [5, 8, 10, 12, 15]
        setImageOptions(nextImageOptions)
        setVideoOptions(nextVideoOptions)
        setImageSizeOptions(nextImageSizeOptions)
        setVideoSizeOptions(nextVideoSizeOptions)
        setVideoDurationOptions(nextDurationOptions)
        if (!nextImageOptions.some((item) => item.id === imageModel)) setImageModel(nextImageOptions[0]?.id || "auto")
        if (!nextVideoOptions.some((item) => item.id === videoModel)) setVideoModel(nextVideoOptions[0]?.id || "seedance")
        if (!nextImageSizeOptions.some((item) => item.value === imageSize)) setImageSize(nextImageSizeOptions[0]?.value || "1024x1536")
        if (!nextVideoSizeOptions.some((item) => item.value === videoSize)) setVideoSize(nextVideoSizeOptions[0]?.value || "720x1280")
        if (!nextDurationOptions.includes(seconds)) setSeconds(nextDurationOptions.includes(12) ? 12 : (nextDurationOptions[0] ?? 5))
      })
      .catch(() => {
        setImageOptions([{ id: "auto", label: "自动", model: "gpt-image-2", available: true }])
        setVideoOptions([{ id: "seedance", label: "Seedance", model: "seedance", available: true }])
        setImageSizeOptions(DEFAULT_IMAGE_SIZE_OPTIONS)
        setVideoSizeOptions(DEFAULT_VIDEO_SIZE_OPTIONS)
        setVideoDurationOptions([5, 8, 10, 12, 15])
      })
  }, [])
@@ -246,6 +277,7 @@ export default function Home() {
        prompt: promptWithGuardrails(),
        mode: "text",
        model: imageModel,
        size: imageSize,
      })
      setJob(updated)
      toast.success("图片已生成")
@@ -271,7 +303,7 @@ export default function Home() {
        count: 1,
        first_image: activeMode.needsFirstFrame ? { kind: "keyframe", frame_idx: 0 } : null,
        last_image: activeMode.needsLastFrame && lastFrame ? { kind: "keyframe", frame_idx: lastFrame.index } : null,
-        size: "720x1280",
+        size: videoSize,
        model: videoModel,
      })
      setJob(updated)
@@ -423,17 +455,48 @@ export default function Home() {
                    </select>
                  </label>
                  {isVideoMode(mode) ? (
                    <>
                      <label className="inline-flex h-9 items-center gap-2 rounded-xl border border-white/7 bg-black/14 px-3">
                        画幅
                        <select
                          value={videoSize}
                          onChange={(event) => setVideoSize(event.target.value)}
                          className="max-w-32 bg-transparent text-white/76 outline-none"
                        >
                          {videoSizeOptions.map((item) => (
                            <option key={item.value} value={item.value}>
                              {item.label}
                            </option>
                          ))}
                        </select>
                      </label>
                      <label className="inline-flex h-9 items-center gap-2 rounded-xl border border-white/7 bg-black/14 px-3">
                        时长
                        <select
                          value={seconds}
                          onChange={(event) => setSeconds(Number(event.target.value))}
                          className="bg-transparent text-white/76 outline-none"
                        >
                          {videoDurationOptions.map((value) => <option key={value} value={value}>{value}s</option>)}
                        </select>
                      </label>
                    </>
                  ) : (
                    <label className="inline-flex h-9 items-center gap-2 rounded-xl border border-white/7 bg-black/14 px-3">
-                      时长
+                      尺寸
                      <select
-                        value={seconds}
+                        value={imageSize}
-                        onChange={(event) => setSeconds(Number(event.target.value))}
+                        onChange={(event) => setImageSize(event.target.value)}
-                        className="bg-transparent text-white/76 outline-none"
+                        className="max-w-32 bg-transparent text-white/76 outline-none"
                      >
-                        {[5, 8, 12, 15, 20, 30].map((value) => <option key={value} value={value}>{value}s</option>)}
+                        {imageSizeOptions.map((item) => (
                          <option key={item.value} value={item.value}>
                            {item.label}
                          </option>
                        ))}
                      </select>
                    </label>
-                  ) : null}
+                  )}
                  <span>{activeMode.needsFirstFrame ? "图片作为参考帧" : "只根据文字生成"}</span>
                </div>
--- a/web/lib/api.ts
+++ b/web/lib/api.ts
@@ -260,6 +260,16 @@ export interface RuntimeModelOption {
  model: string
  description?: string
  available?: boolean
  duration_options?: number[]
  max_duration_seconds?: number
  size_options?: RuntimeSizeOption[]
 }
 /** One selectable output size for image or video generation. */
 export interface RuntimeSizeOption {
  /** Identifier of the option. */
  id: string
  /** Text shown for the option in the size selector. */
  label: string
  /** Size string submitted as `size` in generation requests, e.g. "720x1280". */
  value: string
  /** Optional explanatory text for the option. */
  description?: string
 }
 export interface RuntimeModels {
@@ -280,6 +290,7 @@ export interface RuntimeModels {
  image?: string
  image_base_url?: string
  image_options?: RuntimeModelOption[]
  image_size_options?: RuntimeSizeOption[]
  image_fallbacks?: string[]
  image_circuit?: {
    primary?: string
@@ -303,6 +314,9 @@ export interface RuntimeModels {
  video?: string
  video_aliases?: Record<string, string>
  video_options?: RuntimeModelOption[]
  video_duration_options?: number[]
  video_max_duration_seconds?: number
  video_size_options?: RuntimeSizeOption[]
  video_provider?: string
  video_base_url?: string
  video_configured?: boolean
@@ -1231,7 +1245,7 @@ export async function translateText(text: string, target: "en" | "zh" = "en"): P
 export async function generateImage(
  jobId: string,
  frameIdx: number,
-  body: { prompt: string; extra_prompt?: string; negative_prompt?: string; model?: string; mode?: "edit" | "text"; from_selected?: boolean },
+  body: { prompt: string; extra_prompt?: string; negative_prompt?: string; model?: string; size?: string; mode?: "edit" | "text"; from_selected?: boolean },
 ): Promise<Job> {
  const res = await fetch(`${API_BASE}/jobs/${jobId}/frames/${frameIdx}/generate`, {
    method: "POST",