auto-save 2026-05-14 10:20 (~7)

2026-05-14 10:20:16 +08:00
parent ee32d83b6c
commit be1ae80750
7 changed files with 347 additions and 57 deletions
--- a/.memory/worklog.json
+++ b/.memory/worklog.json
@@ -1,40 +1,5 @@
 {
  "entries": [
-    {
-      "files_changed": 1,
-      "hash": "ad36702",
-      "message": "auto-save 2026-05-13 00:33 (~1)",
-      "ts": "2026-05-13T00:34:03+08:00",
-      "type": "commit"
-    },
-    {
-      "files_changed": 1,
-      "hash": "70a88fc",
-      "message": "auto-save 2026-05-13 00:39 (~1)",
-      "ts": "2026-05-13T00:39:38+08:00",
-      "type": "commit"
-    },
-    {
-      "files_changed": 1,
-      "hash": "cd8a082",
-      "message": "auto-save 2026-05-13 00:44 (~1)",
-      "ts": "2026-05-13T00:45:12+08:00",
-      "type": "commit"
-    },
-    {
-      "files_changed": 1,
-      "hash": "2c48980",
-      "message": "auto-save 2026-05-13 00:50 (~1)",
-      "ts": "2026-05-13T00:50:45+08:00",
-      "type": "commit"
-    },
-    {
-      "files_changed": 1,
-      "hash": "bcc4933",
-      "message": "auto-save 2026-05-13 00:56 (~1)",
-      "ts": "2026-05-13T00:56:19+08:00",
-      "type": "commit"
-    },
    {
      "files_changed": 1,
      "hash": "ffba726",
@@ -3319,6 +3284,37 @@
      "type": "session-heartbeat",
      "message": "Claude 会话活跃 · 最近命令：claude · 5 项未提交变更 · 最近提交：auto-save 2026-05-14 10:08 (~4)",
      "files_changed": 5
+    },
+    {
+      "ts": "2026-05-14T10:14:43+08:00",
+      "type": "commit",
+      "message": "auto-save 2026-05-14 10:14 (~7)",
+      "hash": "ee32d83",
+      "files_changed": 7
+    },
+    {
+      "ts": "2026-05-14T02:14:59Z",
+      "type": "session-end",
+      "message": "Claude 会话结束 · 持续 0 秒 · 最近命令：claude · 1 项未提交变更 · 最近提交：auto-save 2026-05-14 10:14 (~7)",
+      "files_changed": 1
+    },
+    {
+      "ts": "2026-05-14T02:14:59Z",
+      "type": "session-end",
+      "message": "Claude 会话结束 · 持续 0 秒 · 最近命令：claude · 1 项未提交变更 · 最近提交：auto-save 2026-05-14 10:14 (~7)",
+      "files_changed": 1
+    },
+    {
+      "ts": "2026-05-14T02:16:09Z",
+      "type": "session-heartbeat",
+      "message": "Codex 会话活跃 · 最近命令：codex · 1 项未提交变更 · 最近提交：auto-save 2026-05-14 10:14 (~7)",
+      "files_changed": 1
+    },
+    {
+      "ts": "2026-05-14T02:18:38Z",
+      "type": "session-heartbeat",
+      "message": "Codex 会话活跃 · 最近命令：codex · 2 项未提交变更 · 最近提交：auto-save 2026-05-14 10:14 (~7)",
+      "files_changed": 2
    }
  ]
 }
--- a/api/.env.example
+++ b/api/.env.example
@@ -12,6 +12,14 @@ VIDEO_MODEL_SEEDANCE=seedance-2-fast
 VIDEO_MODEL_KLING=kling-omni
 VIDEO_MODEL_VEO3=veo-3.1-fast

+# 音频文案改写 + MiniMax 配音
+AUDIO_REWRITE_MODEL=gemini-2.5-pro
+AUDIO_PRODUCT_BRIEF=SKG 智能按摩产品，主打日常肩颈、腰背、眼部、膝盖或足部放松；广告表达要高级、干净、可信，不做医疗疗效承诺。
+MINIMAX_API_KEY=
+MINIMAX_TTS_BASE_URL=https://api.minimax.io
+MINIMAX_TTS_MODEL=speech-2.8-turbo
+MINIMAX_TTS_VOICE_ID=Chinese (Mandarin)_Reliable_Executive
+
 # Poe 视频 API（优先用于 Seedance / Kling / Veo）
 POE_API_BASE_URL=https://api.poe.com/v1
 POE_API_KEY=
--- a/api/README.md
+++ b/api/README.md
@@ -1,6 +1,6 @@
 # SKG TK 二创 API

-FastAPI 后端，跑 yt-dlp + ffmpeg + Gemini ASR/翻译 管线。
+FastAPI 后端，跑 yt-dlp + ffmpeg + ASR/翻译/文案改写 + MiniMax 配音管线。

 ## 启动

@@ -18,16 +18,18 @@ uvicorn main:app --port 4291 --reload
 - `GET  /health` — 健康检查 + 配置状态
 - `POST /jobs` `{url}` — 创建 job，后台跑下载/拆轨/抽帧
 - `GET  /jobs/{id}` — 当前状态 + 产物
- `POST /jobs/{id}/transcribe` — 触发 Gemini ASR + 翻译
+- `POST /jobs/{id}/transcribe` — 触发 ASR + 翻译 + SKG 文案改写；配置 MiniMax 后生成配音
 - `GET  /jobs/{id}/video.mp4` — 原视频
+- `GET  /jobs/{id}/audio-script.mp3` — 改写文案的 MiniMax 配音
 - `GET  /jobs/{id}/frames/{i}.jpg` — 第 i 张关键帧（0-9）

 ## Mock 模式

-未设 `GEMINI_API_KEY` 时，转录走本地 mock，便于 UI 联调。
+未设 `LLM_API_KEY` 时，转录走本地 mock，便于 UI 联调；未设 `MINIMAX_API_KEY` 时只生成改写文案，不生成配音文件。

 ## 依赖

 - `ffmpeg` 系统二进制（拆轨 / 抽帧）
 - `yt-dlp` 系统二进制（也可走 Python 包）
- `google-generativeai` Python（ASR + 翻译）
+- OpenAI 兼容 LLM 网关（ASR / 翻译 / 文案改写）
+- MiniMax T2A HTTP（改写文案配音，使用 `MINIMAX_API_KEY`）
--- a/api/main.py
+++ b/api/main.py
@@ -12,6 +12,7 @@ from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import Literal

+import httpx
 from dotenv import load_dotenv
 from fastapi import BackgroundTasks, FastAPI, File, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
@@ -36,6 +37,18 @@ REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
 VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
 IMAGE_MODEL = os.getenv("IMAGE_MODEL", "gemini-3-pro-image-preview")
 VIDEO_MODEL = os.getenv("VIDEO_MODEL", "seedance").strip() or "seedance"
+AUDIO_PRODUCT_BRIEF = os.getenv(
+    "AUDIO_PRODUCT_BRIEF",
+    "SKG 智能按摩产品，主打日常肩颈、腰背、眼部、膝盖或足部放松；广告表达要高级、干净、可信，不做医疗疗效承诺。",
+).strip()
+AUDIO_REWRITE_MODEL = os.getenv("AUDIO_REWRITE_MODEL", REWRITE_MODEL).strip() or REWRITE_MODEL
+MINIMAX_API_KEY = os.getenv("MINIMAX_API_KEY", "").strip()
+MINIMAX_TTS_BASE_URL = os.getenv("MINIMAX_TTS_BASE_URL", "https://api.minimax.io").strip().rstrip("/")
+MINIMAX_TTS_MODEL = os.getenv("MINIMAX_TTS_MODEL", "speech-2.8-turbo").strip() or "speech-2.8-turbo"
+MINIMAX_TTS_VOICE_ID = os.getenv(
+    "MINIMAX_TTS_VOICE_ID",
+    "Chinese (Mandarin)_Reliable_Executive",
+).strip() or "Chinese (Mandarin)_Reliable_Executive"

 POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
 POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
@@ -337,6 +350,21 @@ class TranscriptSegment(BaseModel):
    zh: str = ""


+class AudioScript(BaseModel):
+    status: Literal["idle", "rewriting", "completed", "failed"] = "idle"
+    source_text: str = ""
+    source_zh: str = ""
+    rewritten_text: str = ""
+    product_brief: str = ""
+    rewrite_model: str = ""
+    voice_provider: str = ""
+    voice_model: str = ""
+    voice_id: str = ""
+    voice_url: str = ""
+    error: str = ""
+    created_at: float = 0.0
+
+
 class Job(BaseModel):
    id: str
    url: str
@@ -349,6 +377,7 @@ class Job(BaseModel):
    height: int = 0
    frames: list[KeyFrame] = Field(default_factory=list)
    transcript: list[TranscriptSegment] = Field(default_factory=list)
+    audio_script: AudioScript = Field(default_factory=AudioScript)
    storyboard_images: list[StoryboardImage] = Field(default_factory=list)
    generated_videos: list[GeneratedVideo] = Field(default_factory=list)
    error: str = ""
@@ -1351,6 +1380,148 @@ def _translate_sync(segments: list[dict]) -> list[str]:
    return [zh_by_idx.get(i, "") for i in range(len(segments))]


+def _transcript_join(segments: list[TranscriptSegment], field: Literal["en", "zh"]) -> str:
+    lines: list[str] = []
+    for s in segments:
+        text = (s.zh if field == "zh" else s.en).strip()
+        if text:
+            lines.append(f"[{s.start:.1f}-{s.end:.1f}s] {text}")
+    return "\n".join(lines)
+
+
+def _fallback_audio_script(segments: list[TranscriptSegment]) -> str:
+    joined = " ".join((s.zh or s.en).strip() for s in segments if (s.zh or s.en).strip())
+    if not joined:
+        return "日常疲惫不用硬扛。戴上 SKG，让肩颈慢慢放松，跟着呼吸找回轻松状态。"
+    return (
+        "把日常紧绷交给 SKG。贴合身体需要放松的位置，热敷与按摩节奏自然陪伴，"
+        "让每一次短暂休息都更轻松、更有质感。"
+    )
+
+
+def _rewrite_audio_script_sync(segments: list[TranscriptSegment]) -> tuple[str, str]:
+    fallback = _fallback_audio_script(segments)
+    if not LLM_API_KEY:
+        return fallback, "LLM_API_KEY 未配置，使用本地 SKG 模板"
+    source_text = _transcript_join(segments, "en")
+    source_zh = _transcript_join(segments, "zh")
+    prompt = (
+        "你是 SKG 短视频口播编导。根据参考视频音频转写，抽取它的表达结构、情绪节奏和可复用卖点，"
+        "改写成适合 SKG 按摩/放松产品二创视频的中文口播文案。\n"
+        "要求：\n"
+        "1. 输出 35-90 个中文字，适合 8-18 秒短视频配音。\n"
+        "2. 口语化、干净、高级，能直接给 TTS 朗读。\n"
+        "3. 不承诺治疗、治愈、医学疗效，不夸大。\n"
+        "4. 不复刻原视频品牌/人物/价格/平台话术，只保留表达结构。\n"
+        "5. 如果参考转写信息不足，按产品信息生成通用 SKG 放松口播。\n"
+        '严格返回 JSON：{"rewritten_text":"..."}。\n\n'
+        f"SKG 产品信息：{AUDIO_PRODUCT_BRIEF}\n\n"
+        f"英文转写：\n{source_text or '无'}\n\n"
+        f"中文翻译：\n{source_zh or '无'}"
+    )
+    try:
+        resp = llm().chat.completions.create(
+            model=AUDIO_REWRITE_MODEL,
+            messages=[
+                {"role": "system", "content": "只输出合法 JSON，不要解释，不要 markdown。"},
+                {"role": "user", "content": prompt},
+            ],
+            response_format={"type": "json_object"},
+            temperature=0.45,
+            max_tokens=600,
+        )
+        raw = (resp.choices[0].message.content or "").strip()
+        if raw.startswith("```"):
+            import re as _re
+            match = _re.search(r"\{[\s\S]*\}", raw)
+            raw = match.group(0) if match else raw
+        data = json.loads(raw)
+        text = str(data.get("rewritten_text", "")).strip()
+        return (text or fallback), ""
+    except Exception as e:
+        return fallback, f"改写失败，使用本地模板：{e}"
+
+
+def _minimax_tts_url() -> str:
+    if MINIMAX_TTS_BASE_URL.endswith("/v1/t2a_v2"):
+        return MINIMAX_TTS_BASE_URL
+    return f"{MINIMAX_TTS_BASE_URL}/v1/t2a_v2"
+
+
+def _minimax_tts_sync(job_id: str, text: str) -> str:
+    if not MINIMAX_API_KEY:
+        raise RuntimeError("MINIMAX_API_KEY 未配置，未生成配音")
+    if not text.strip():
+        raise RuntimeError("改写文案为空，未生成配音")
+    payload = {
+        "model": MINIMAX_TTS_MODEL,
+        "text": text.strip()[:9500],
+        "stream": False,
+        "language_boost": "Chinese",
+        "output_format": "hex",
+        "voice_setting": {
+            "voice_id": MINIMAX_TTS_VOICE_ID,
+            "speed": 1,
+            "vol": 1,
+            "pitch": 0,
+        },
+        "audio_setting": {
+            "sample_rate": 32000,
+            "bitrate": 128000,
+            "format": "mp3",
+            "channel": 1,
+        },
+    }
+    resp = httpx.post(
+        _minimax_tts_url(),
+        headers={"Authorization": f"Bearer {MINIMAX_API_KEY}", "Content-Type": "application/json"},
+        json=payload,
+        timeout=90,
+    )
+    resp.raise_for_status()
+    data = resp.json()
+    base_resp = data.get("base_resp") or {}
+    if int(base_resp.get("status_code", 0) or 0) != 0:
+        raise RuntimeError(base_resp.get("status_msg") or "MiniMax TTS 返回失败")
+    audio_hex = ((data.get("data") or {}).get("audio") or "").strip()
+    if not audio_hex:
+        raise RuntimeError("MiniMax TTS 未返回 audio hex")
+    try:
+        audio_bytes = bytes.fromhex(audio_hex)
+    except ValueError as e:
+        raise RuntimeError(f"MiniMax TTS audio hex 无法解析：{e}") from e
+    out = job_dir(job_id) / "audio_script.mp3"
+    out.write_bytes(audio_bytes)
+    return f"/jobs/{job_id}/audio-script.mp3"
+
+
+def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment]) -> AudioScript:
+    source_text = _transcript_join(segments, "en")
+    source_zh = _transcript_join(segments, "zh")
+    rewritten, rewrite_error = _rewrite_audio_script_sync(segments)
+    voice_url = ""
+    voice_error = ""
+    try:
+        voice_url = _minimax_tts_sync(job_id, rewritten)
+    except Exception as e:
+        voice_error = str(e)
+    errors = "；".join(x for x in [rewrite_error, voice_error] if x)
+    return AudioScript(
+        status="completed",
+        source_text=source_text,
+        source_zh=source_zh,
+        rewritten_text=rewritten,
+        product_brief=AUDIO_PRODUCT_BRIEF,
+        rewrite_model=AUDIO_REWRITE_MODEL,
+        voice_provider="minimax",
+        voice_model=MINIMAX_TTS_MODEL,
+        voice_id=MINIMAX_TTS_VOICE_ID,
+        voice_url=voice_url,
+        error=errors,
+        created_at=time.time(),
+    )
+
+
 async def pipeline_transcribe(job_id: str) -> None:
    job = JOBS[job_id]
    d = job_dir(job_id)
@@ -1371,7 +1542,25 @@ async def pipeline_transcribe(job_id: str) -> None:
                                  en="This device looks really sleek and minimal.",
                                  zh="这个设备看起来非常时尚和简约。"),
            ]
+            update(
+                job,
+                transcript=mock,
+                audio_script=AudioScript(
+                    status="rewriting",
+                    source_text=_transcript_join(mock, "en"),
+                    source_zh=_transcript_join(mock, "zh"),
+                    product_brief=AUDIO_PRODUCT_BRIEF,
+                    rewrite_model=AUDIO_REWRITE_MODEL,
+                    voice_provider="minimax",
+                    voice_model=MINIMAX_TTS_MODEL,
+                    voice_id=MINIMAX_TTS_VOICE_ID,
+                ),
+                message="ASR mock 完成，生成 SKG 改写文案…",
+                progress=92,
+            )
+            audio_script = await asyncio.to_thread(_build_audio_script_sync, job_id, mock)
            update(job, transcript=mock, status="transcribed", progress=100,
+                   audio_script=audio_script,
                   message="转录完成（MOCK · 未设 LLM_API_KEY）")
            return

@@ -1403,11 +1592,35 @@ async def pipeline_transcribe(job_id: str) -> None:
            )
            for i, seg in enumerate(en_only)
        ]
+        update(
+            job,
+            transcript=full,
+            audio_script=AudioScript(
+                status="rewriting",
+                source_text=_transcript_join(full, "en"),
+                source_zh=_transcript_join(full, "zh"),
+                product_brief=AUDIO_PRODUCT_BRIEF,
+                rewrite_model=AUDIO_REWRITE_MODEL,
+                voice_provider="minimax",
+                voice_model=MINIMAX_TTS_MODEL,
+                voice_id=MINIMAX_TTS_VOICE_ID,
+            ),
+            message="翻译完成，生成 SKG 改写文案与 MiniMax 配音…",
+            progress=94,
+        )
+        audio_script = await asyncio.to_thread(_build_audio_script_sync, job_id, full)
        update(job, transcript=full, status="transcribed", progress=100,
+               audio_script=audio_script,
               message=f"转录完成 · {len(full)} 段（{ASR_MODEL} + {TRANSLATE_MODEL}）")

    except Exception as e:
-        update(job, status="failed", error=str(e), message="转录失败")
+        update(
+            job,
+            status="failed",
+            audio_script=AudioScript(status="failed", error=str(e), created_at=time.time()),
+            error=str(e),
+            message="转录失败",
+        )


 def _image_edit_call(
@@ -1566,6 +1779,10 @@ def health() -> dict:
            "asr": ASR_MODEL,
            "translate": TRANSLATE_MODEL,
            "rewrite": REWRITE_MODEL,
+            "audio_rewrite": AUDIO_REWRITE_MODEL,
+            "minimax_tts": MINIMAX_TTS_MODEL,
+            "minimax_voice": MINIMAX_TTS_VOICE_ID,
+            "minimax_configured": bool(MINIMAX_API_KEY),
            "video": VIDEO_MODEL,
            "video_aliases": VIDEO_MODEL_ALIASES,
            "video_provider": "poe" if video_uses_poe() else ("ark" if video_uses_ark() else "custom"),
@@ -1765,6 +1982,14 @@ def get_video(job_id: str):
    return FileResponse(p, media_type="video/mp4")


+@app.get("/jobs/{job_id}/audio-script.mp3")
+def get_audio_script(job_id: str):
+    p = job_dir(job_id) / "audio_script.mp3"
+    if not p.exists():
+        raise HTTPException(404, "audio script not found")
+    return FileResponse(p, media_type="audio/mpeg")
+
+
@app.get("/jobs/{job_id}/frames/{idx}.jpg")
 def get_frame(job_id: str, idx: int):
    p = job_dir(job_id) / "frames" / f"{idx:03d}.jpg"
--- a/web/components/dashboard.tsx
+++ b/web/components/dashboard.tsx
@@ -6,7 +6,7 @@ import {
  Mic, Languages, FileEdit, Sparkles, Film, FileVideo, Loader2, Plus, Check,
  ChevronDown, X, LayoutGrid,
 } from "lucide-react"
-import { type Job, type KeyFrame, frameUrl, effectiveFrameUrl, videoUrl, generateImage, selectGenerated, generatedImageUrl } from "@/lib/api"
+import { type Job, type KeyFrame, frameUrl, effectiveFrameUrl, videoUrl, generateImage, selectGenerated, generatedImageUrl, apiAssetUrl } from "@/lib/api"
 import { type NodeData } from "@/components/nodes"
 import { FrameLightbox } from "@/components/lightbox"
 import { toast } from "sonner"
@@ -154,6 +154,8 @@ export const Dashboard = forwardRef<DashboardHandle, Props>(function Dashboard({
  const hasFrames = (job?.frames.length ?? 0) > 0
  const hasTranscript = (job?.transcript.length ?? 0) > 0
  const hasZh = job?.transcript.some((s) => s.zh) ?? false
+  const hasAudioRewrite = !!job?.audio_script?.rewritten_text?.trim()
+  const isAudioRewriting = job?.audio_script?.status === "rewriting"
  const isFailed = job?.status === "failed"

  const colState: Record<string, ColState> = {
@@ -168,7 +170,7 @@ export const Dashboard = forwardRef<DashboardHandle, Props>(function Dashboard({
    keyframe: !job ? "pending" : (isSplitting && !hasFrames) ? "running" : hasFrames ? "done" : isFailed && job.progress >= 50 && job.progress < 70 ? "failed" : "pending",
    asr: !job ? "pending" : job.status === "transcribing" ? "running" : hasTranscript ? "done" : isFailed && job.progress >= 70 ? "failed" : "pending",
    translate: !job ? "pending" : job.status === "transcribing" ? "running" : hasZh ? "done" : "pending",
-    rewrite: "pending",
+    rewrite: !job ? "pending" : isAudioRewriting ? "running" : hasAudioRewrite ? "done" : "pending",
    imagegen: "pending",
    videogen: "pending",
    compose: "pending",
@@ -180,7 +182,7 @@ export const Dashboard = forwardRef<DashboardHandle, Props>(function Dashboard({
    keyframe: hasFrames ? `${data.selectedFrames.size}/${job!.frames.length} 选用` : "—",
    asr: hasTranscript ? `${job!.transcript.length} 段` : "—",
    translate: hasZh ? `${job!.transcript.filter((s) => s.zh).length} 段` : "—",
-    rewrite: "占位",
+    rewrite: hasAudioRewrite ? "已生成" : isAudioRewriting ? "生成中…" : "待文案",
    imagegen: data.selectedFrames.size > 0 ? `${data.selectedFrames.size} 帧待编排` : "占位",
    videogen: "占位",
    compose: "占位",
@@ -593,16 +595,31 @@ export const Dashboard = forwardRef<DashboardHandle, Props>(function Dashboard({
            {key === "rewrite" && (
              <>
                <KanbanCard tone="green" tags={["产品信息"]} title="SKG 产品卖点">
-                  <textarea
-                    rows={5}
-                    placeholder="粘贴 SKG 产品关键卖点（占位）"
-                    disabled
-                    className="w-full text-[12px] px-2 py-1.5 rounded-md bg-black/30 border border-dashed border-white/10 placeholder:text-[var(--text-faint)] text-[var(--text-strong)] resize-none opacity-70 mt-1"
-                  />
+                  <div className="text-[12px] text-[var(--text-soft)] leading-relaxed">
+                    {job?.audio_script?.product_brief || "等待音频转写完成后，按默认 SKG 放松产品卖点生成口播。"}
+                  </div>
                </KanbanCard>
-                <KanbanCard tone="green" tags={["模型"]} title="gemini-2.5-pro">
-                  <div className="text-[11px] text-[var(--text-soft)]">按英文转录 + 产品信息 → 输出改写中文文案</div>
-                  <div className="kanban-meta">下一冲刺接入</div>
+                <KanbanCard tone="green" tags={["改写"]} title={job?.audio_script?.rewrite_model || "gemini-2.5-pro"}>
+                  {job?.audio_script?.rewritten_text ? (
+                    <div className="text-[13px] text-[var(--text-strong)] leading-relaxed">
+                      {job.audio_script.rewritten_text}
+                    </div>
+                  ) : (
+                    <div className="text-[11px] text-[var(--text-soft)]">
+                      {isAudioRewriting ? "正在生成 SKG 口播文案…" : "转录完成后自动生成 SKG 口播文案"}
+                    </div>
+                  )}
+                  <div className="kanban-meta">ASR + 翻译 + SKG 卖点转化</div>
+                </KanbanCard>
+                <KanbanCard tone="green" tags={["配音"]} title={job?.audio_script?.voice_model || "MiniMax T2A"}>
+                  {job?.audio_script?.voice_url ? (
+                    <audio controls className="h-8 w-full" src={apiAssetUrl(job.audio_script.voice_url)} />
+                  ) : (
+                    <div className="text-[11px] text-[var(--text-soft)]">
+                      {job?.audio_script?.error || "配置 MiniMax 后自动生成配音文件"}
+                    </div>
+                  )}
+                  <div className="kanban-meta">{job?.audio_script?.voice_id || "Chinese (Mandarin)_Reliable_Executive"}</div>
                </KanbanCard>
              </>
            )}
--- a/web/components/nodes/index.tsx
+++ b/web/components/nodes/index.tsx
@@ -2101,12 +2101,16 @@ export function AudioNode({ data, selected }: any) {
  const d: NodeData = data
  const job = d.job
  const transcript = job?.transcript ?? []
+  const audioScript = job?.audio_script
+  const rewrittenText = audioScript?.rewritten_text?.trim() ?? ""
+  const voiceUrl = apiAssetUrl(audioScript?.voice_url)
  const hasASR = transcript.length > 0
+  const isRewriting = audioScript?.status === "rewriting"
  const status: NodeStatus = !job
    ? "pending"
-    : job.status === "transcribing"
+    : job.status === "transcribing" || isRewriting
    ? "running"
-    : hasASR
+    : rewrittenText || hasASR
    ? "done"
    : "pending"
  return (
@@ -2119,9 +2123,27 @@ export function AudioNode({ data, selected }: any) {
      pinned={d.pinnedNodes?.has("audio")}
      onTogglePin={() => d.onToggleNodePin?.("audio")}
    >
-      <div className="text-[11px] text-[var(--text-soft)] leading-snug">
-        音轨 → ASR 转录 → 英中翻译 → 接 SKG 卖点改写文案<br />
-        <span className="text-[var(--text-faint)] font-mono">Gemini 2.5 Flash</span>
+      <div className="space-y-2 text-[11px] text-[var(--text-soft)] leading-snug">
+        <div>
+          音轨 → ASR 转录 → 英中翻译 → SKG 口播改写 → MiniMax 配音<br />
+          <span className="text-[var(--text-faint)] font-mono">
+            {audioScript?.rewrite_model || "Gemini 2.5 Pro"} → {audioScript?.voice_model || "MiniMax T2A"}
+          </span>
+        </div>
+        {rewrittenText && (
+          <div className="rounded-md border border-emerald-400/25 bg-emerald-400/10 px-2.5 py-2 text-[11.5px] leading-relaxed text-[var(--text-strong)] break-words">
+            {rewrittenText}
+          </div>
+        )}
+        {voiceUrl && (
+          <audio controls src={voiceUrl} className="h-7 w-full" />
+        )}
+        {isRewriting && (
+          <div className="text-[10.5px] text-[var(--text-faint)]">正在生成改写文案和配音…</div>
+        )}
+        {audioScript?.error && rewrittenText && !voiceUrl && (
+          <div className="text-[10.5px] text-amber-300/85">配音待生成：{audioScript.error}</div>
+        )}
      </div>
    </NodeShell>
  )
--- a/web/lib/api.ts
+++ b/web/lib/api.ts
@@ -296,6 +296,21 @@ export interface TranscriptSegment {
  zh: string
 }

+export interface AudioScript {
+  status: "idle" | "rewriting" | "completed" | "failed"
+  source_text: string
+  source_zh: string
+  rewritten_text: string
+  product_brief: string
+  rewrite_model: string
+  voice_provider: string
+  voice_model: string
+  voice_id: string
+  voice_url: string
+  error: string
+  created_at: number
+}
+
 export interface StoryboardImage {
  ref_id: string
  kind: "keyframe" | "cutout" | "asset"
@@ -318,6 +333,7 @@ export interface Job {
  height?: number
  frames: KeyFrame[]
  transcript: TranscriptSegment[]
+  audio_script?: AudioScript
  storyboard_images?: StoryboardImage[]
  generated_videos?: GeneratedVideo[]
  error?: string
@@ -331,6 +347,10 @@ export interface BackendHealth {
    asr?: string
    translate?: string
    rewrite?: string
+    audio_rewrite?: string
+    minimax_tts?: string
+    minimax_voice?: string
+    minimax_configured?: boolean
    video?: string
    video_aliases?: Record<string, string>
    video_base_url?: string