feat: expose generation model choices

2026-05-25 11:02:13 +08:00
parent 6ba84a7603
commit dcc8abc812
5 changed files with 159 additions and 15 deletions
--- a/RULES.md
+++ b/RULES.md
@@ -11,7 +11,7 @@
 - 详见 `CLAUDE.md` 立项决策段 + `.memory/plan.md` 七步管线拆解
 - 风格：`04-Dark-Gallery-Ambient`（路径：`~/Projects/research/20260305-网页风格库/04-Dark-Gallery-Ambient.md`）
 - 第一冲刺：步骤 1-4（下载 / 拆轨 / 关键帧 / ASR+翻译）
- 当前产品方向（2026-05-25 单对话框版）：默认首页彻底从“信息流广告复刻管线”切换为多人通用的 SKG 营销内容生成入口，服务约 6 名公司成员同时使用。首页默认只保留一个中央对话框，不再显示侧栏、灵感区、任务列表或大结果面板；用户先选择四种生成方式之一：文生视频、文生图、首帧生视频、首尾帧生视频，然后手写提示词并点击生成。首帧 / 首尾帧模式只露必要图片上传位，视频模式只保留时长选择。用户登录后仍只看到自己的任务、结果和详情页，继续沿用后端 owner 隔离；结果生成后从对话框下方进入 `/detail/?job=<id>` 沉淀参考图、生成图、视频候选和提示词。旧 TK 复刻工作台、Agent Cut 一键出片和营销图文方案保留为高级/详情页能力，不再作为默认首页入口或默认理解框架。
+- 当前产品方向（2026-05-25 单对话框版）：默认首页彻底从“信息流广告复刻管线”切换为多人通用的 SKG 营销内容生成入口，服务约 6 名公司成员同时使用。首页默认只保留一个中央对话框，不再显示侧栏、灵感区、任务列表或大结果面板；用户先选择四种生成方式之一：文生视频、文生图、首帧生视频、首尾帧生视频，然后手写提示词并点击生成。首帧 / 首尾帧模式只露必要图片上传位，视频模式只保留时长选择。后端 `/health` 向前端返回可选图片 / 视频模型，首页允许用户选择图片模型（自动、GPT Image 2、Gemini 图片兜底）和视频模型（Seedance、Kling、Veo 3 等别名；实际可用模型以环境变量映射为准）。用户登录后仍只看到自己的任务、结果和详情页，继续沿用后端 owner 隔离；结果生成后从对话框下方进入 `/detail/?job=<id>` 沉淀参考图、生成图、视频候选和提示词。旧 TK 复刻工作台、Agent Cut 一键出片和营销图文方案保留为高级/详情页能力，不再作为默认首页入口或默认理解框架。

 ## 部署事实
 - 平台：VPS `76.13.31.179`（Ubuntu 24.04 / Docker Compose / Coolify Traefik）
--- a/api/main.py
+++ b/api/main.py
@@ -4106,6 +4106,71 @@ def _image_model_candidates(force_fallback: bool = False, preference: str | None
    return [GPT_IMAGE_MODEL, *fallbacks]


+def image_model_options() -> list[dict]:
+    """Build the image model choices exposed to the frontend.
+
+    The list always starts with the ``auto`` entry followed by the primary
+    model (``GPT_IMAGE_MODEL``). The fallback model is appended only when
+    fallback is enabled, a fallback model is set, and it differs from the
+    primary model.
+
+    Returns:
+        A list of dicts with keys ``id``, ``label``, ``model``,
+        ``description`` and ``available``; ``available`` is true for every
+        entry exactly when ``IMAGE_API_KEY`` is truthy.
+    """
+    has_key = bool(IMAGE_API_KEY)
+
+    def _entry(option_id: str, label: str, model: str, description: str) -> dict:
+        # Every entry shares the same availability flag.
+        return {
+            "id": option_id,
+            "label": label,
+            "model": model,
+            "description": description,
+            "available": has_key,
+        }
+
+    choices = [
+        _entry(
+            "auto",
+            "自动",
+            GPT_IMAGE_MODEL,
+            "优先 GPT Image 2，必要时按后端熔断和兜底策略切到备用图片模型",
+        ),
+        _entry(
+            GPT_IMAGE_MODEL,
+            "GPT Image 2",
+            GPT_IMAGE_MODEL,
+            "主生图模型，适合营销图和参考图重绘",
+        ),
+    ]
+
+    fallback_is_distinct = (
+        IMAGE_FALLBACK_ENABLED
+        and IMAGE_FALLBACK_MODEL
+        and IMAGE_FALLBACK_MODEL != GPT_IMAGE_MODEL
+    )
+    if fallback_is_distinct:
+        choices.append(
+            _entry(
+                IMAGE_FALLBACK_MODEL,
+                "Gemini 图片",
+                IMAGE_FALLBACK_MODEL,
+                "备用图片模型，适合主模型慢或失败时手动选择",
+            )
+        )
+    return choices
+
+
+def video_model_options() -> list[dict]:
+    """Build the video model choices exposed to the frontend.
+
+    The known aliases ``seedance``, ``kling``, ``veo3`` and ``veo`` are
+    listed in that order when present in ``VIDEO_MODEL_ALIASES``. An alias
+    whose resolved model is already listed is skipped, so two aliases that
+    point at the same model (for example ``veo3`` and ``veo``) yield a
+    single entry. If the configured default ``VIDEO_MODEL`` is not covered
+    by any entry, either by id or by resolved model, it is inserted first.
+
+    Returns:
+        A list of dicts with keys ``id``, ``label``, ``model``,
+        ``description`` and ``available``; ``available`` is true for every
+        entry exactly when ``video_api_key()`` returns a truthy value.
+    """
+    label_map = {
+        "seedance": "Seedance",
+        "kling": "Kling",
+        "veo3": "Veo 3",
+        "veo": "Veo",
+        "voe": "Veo",
+    }
+    # The key state is the same for every option, so look it up once.
+    available = bool(video_api_key())
+    # Dedupe on the resolved model. The alias keys below are distinct
+    # literals, so a dedup key that includes the alias can never repeat.
+    seen_models: set[str] = set()
+    options: list[dict] = []
+    for key in ("seedance", "kling", "veo3", "veo"):
+        if key not in VIDEO_MODEL_ALIASES:
+            continue
+        model = VIDEO_MODEL_ALIASES[key]
+        if model in seen_models:
+            continue
+        seen_models.add(model)
+        options.append({
+            "id": key,
+            "label": label_map.get(key, key),
+            "model": model,
+            "description": "当前视频网关可选模型",
+            "available": available,
+        })
+    default_model = resolve_video_model(VIDEO_MODEL)
+    default_listed = any(
+        item["id"] == VIDEO_MODEL or item["model"] == default_model
+        for item in options
+    )
+    if not default_listed:
+        # Put the configured default first so it is the frontend's first choice.
+        options.insert(0, {
+            "id": VIDEO_MODEL,
+            "label": label_map.get(VIDEO_MODEL, VIDEO_MODEL),
+            "model": default_model,
+            "description": "默认视频模型",
+            "available": available,
+        })
+    return options
+
+
 def _image_failure_can_fallback(status_code: int, body: str, last_err: str) -> bool:
    if status_code in (400, 401, 403, 404):
        return False
@@ -5132,6 +5197,7 @@ def health() -> dict:
            "image": IMAGE_MODEL,
            "image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default",
            "image_request_timeout_seconds": IMAGE_REQUEST_TIMEOUT_SECONDS,
+            "image_options": image_model_options(),
            "ai_proxy_configured": bool(AI_HTTP_PROXY),
            "image_fallbacks": _image_fallback_models(),
            "image_circuit": _image_circuit_snapshot(),
@@ -5146,6 +5212,7 @@ def health() -> dict:
            "voice_configured": bool(AZURE_OPENAI_API_KEY),
            "video": VIDEO_MODEL,
            "video_aliases": VIDEO_MODEL_ALIASES,
+            "video_options": video_model_options(),
            "video_provider": video_provider_name(),
            "video_base_url": video_api_base(),
            "video_configured": bool(video_api_key()),
@@ -5598,7 +5665,7 @@ class GenerateReq(BaseModel):
    prompt: str
    extra_prompt: str = ""        # ✓ 需要的元素（正向）
    negative_prompt: str = ""     # ✗ 不需要的元素（负向）
-    model: str = ""  # 兼容旧前端字段；服务端强制使用 gpt-image-2
+    model: str = "auto"  # auto / gpt-image-2 / gemini-3-pro-image-preview
    mode: str = "edit"  # "edit" 带参考图，"text" 纯文字
    from_selected: bool = False   # True 时优先用 frame.selected 的生成图作 reference（迭代），否则原关键帧

@@ -5649,8 +5716,8 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job:
    if req.mode == "edit":
        img_bytes_in = reference_path.read_bytes()

-    # 尝试 i2i；主模型上游异常时允许 Gemini 兜底。无兜底时保留旧的多次重试。
-    model_candidates = _image_model_candidates()
+    # 尝试 i2i；auto 允许按熔断策略兜底，显式模型只走用户所选模型。
+    model_candidates = _image_model_candidates(preference=req.model)
    plan: list[str] = ([req.mode] if model_candidates != [GPT_IMAGE_MODEL] else [req.mode] * 3) if req.mode == "edit" else [req.mode]
    if req.mode == "edit":
        plan.append("text")  # i2i 都失败时自动降级
--- a/docs/source-analysis.html
+++ b/docs/source-analysis.html
--- a/web/app/page.tsx
+++ b/web/app/page.tsx
@@ -22,11 +22,13 @@ import {
  deleteGeneratedVideo,
  generateImage,
  generateStoryboardVideo,
+  getRuntimeHealth,
  getJob,
  uploadReferenceFrame,
  type GeneratedImage,
  type GeneratedVideo,
  type Job,
+  type RuntimeModelOption,
 } from "@/lib/api"

 type CreationMode = "text-video" | "text-image" | "first-frame-video" | "first-last-frame-video"
@@ -105,6 +107,14 @@ export default function Home() {
  const [lastFrameFile, setLastFrameFile] = useState<File | null>(null)
  const [firstFramePreview, setFirstFramePreview] = useState("")
  const [lastFramePreview, setLastFramePreview] = useState("")
+  const [imageModel, setImageModel] = useState("auto")
+  const [videoModel, setVideoModel] = useState("seedance")
+  const [imageOptions, setImageOptions] = useState<RuntimeModelOption[]>([
+    { id: "auto", label: "自动", model: "gpt-image-2", available: true },
+  ])
+  const [videoOptions, setVideoOptions] = useState<RuntimeModelOption[]>([
+    { id: "seedance", label: "Seedance", model: "seedance", available: true },
+  ])
  const [job, setJob] = useState<Job | null>(null)
  const [busy, setBusy] = useState<BusyTask>(null)
  const [error, setError] = useState("")
@@ -117,6 +127,30 @@ export default function Home() {
  const runningVideo = (job?.generated_videos ?? []).some((item) => item.status === "queued" || item.status === "in_progress")
  const submitting = busy === mode || busy === "job"

+  useEffect(() => {
+    getRuntimeHealth()
+      .then((health) => {
+        const models = health.models
+        const nextImageOptions = models?.image_options?.length
+          ? models.image_options
+          : [
+              { id: "auto", label: "自动", model: models?.image || "gpt-image-2", available: true },
+              { id: models?.image || "gpt-image-2", label: "GPT Image 2", model: models?.image || "gpt-image-2", available: true },
+            ]
+        const nextVideoOptions = models?.video_options?.length
+          ? models.video_options
+          : [{ id: models?.video || "seedance", label: "Seedance", model: models?.video || "seedance", available: !!models?.video_configured }]
+        setImageOptions(nextImageOptions)
+        setVideoOptions(nextVideoOptions)
+        if (!nextImageOptions.some((item) => item.id === imageModel)) setImageModel(nextImageOptions[0]?.id || "auto")
+        if (!nextVideoOptions.some((item) => item.id === videoModel)) setVideoModel(nextVideoOptions[0]?.id || "seedance")
+      })
+      .catch(() => {
+        setImageOptions([{ id: "auto", label: "自动", model: "gpt-image-2", available: true }])
+        setVideoOptions([{ id: "seedance", label: "Seedance", model: "seedance", available: true }])
+      })
+  }, [])
+
  useEffect(() => {
    if (!firstFrameFile) {
      setFirstFramePreview("")
@@ -211,6 +245,7 @@ export default function Home() {
      const updated = await generateImage(target.id, 0, {
        prompt: promptWithGuardrails(),
        mode: "text",
+        model: imageModel,
      })
      setJob(updated)
      toast.success("图片已生成")
@@ -237,6 +272,7 @@ export default function Home() {
        first_image: activeMode.needsFirstFrame ? { kind: "keyframe", frame_idx: 0 } : null,
        last_image: activeMode.needsLastFrame && lastFrame ? { kind: "keyframe", frame_idx: lastFrame.index } : null,
        size: "720x1280",
+        model: videoModel,
      })
      setJob(updated)
      toast.success("视频已提交")
@@ -368,7 +404,24 @@ export default function Home() {
              />

              <div className="mt-3 flex flex-wrap items-center justify-between gap-3">
-                <div className="flex items-center gap-2 text-xs text-white/38">
+                <div className="flex flex-wrap items-center gap-2 text-xs text-white/38">
+                  <label className="inline-flex h-9 items-center gap-2 rounded-xl border border-white/7 bg-black/14 px-3">
+                    模型
+                    <select
+                      value={isVideoMode(mode) ? videoModel : imageModel}
+                      onChange={(event) => {
+                        if (isVideoMode(mode)) setVideoModel(event.target.value)
+                        else setImageModel(event.target.value)
+                      }}
+                      className="max-w-36 bg-transparent text-white/76 outline-none"
+                    >
+                      {(isVideoMode(mode) ? videoOptions : imageOptions).map((item) => (
+                        <option key={item.id} value={item.id} disabled={item.available === false}>
+                          {item.label}
+                        </option>
+                      ))}
+                    </select>
+                  </label>
                  {isVideoMode(mode) ? (
                    <label className="inline-flex h-9 items-center gap-2 rounded-xl border border-white/7 bg-black/14 px-3">
                      时长
--- a/web/lib/api.ts
+++ b/web/lib/api.ts
@@ -254,6 +254,14 @@ export interface GeneratedVideo {
  created_at: number
 }

+/** One selectable generation model, as carried in `RuntimeModels.image_options` / `video_options`. */
+export interface RuntimeModelOption {
+  /** Option identifier; used as the picker's option value and sent as `model` in generate requests. */
+  id: string
+  /** Text shown for this option in the model picker. */
+  label: string
+  /** Model name this option maps to. */
+  model: string
+  /** Optional explanatory text for the option. */
+  description?: string
+  /** When explicitly `false`, the picker renders the option as disabled. */
+  available?: boolean
+}
+
 export interface RuntimeModels {
  asr?: string
  asr_language?: string
@@ -271,6 +279,7 @@ export interface RuntimeModels {
  product_view?: string
  image?: string
  image_base_url?: string
+  image_options?: RuntimeModelOption[]
  image_fallbacks?: string[]
  image_circuit?: {
    primary?: string
@@ -293,6 +302,7 @@ export interface RuntimeModels {
  voice_tts_paths?: string[]
  video?: string
  video_aliases?: Record<string, string>
+  video_options?: RuntimeModelOption[]
  video_provider?: string
  video_base_url?: string
  video_configured?: boolean