auto-save 2026-05-14 11:47 (~7)

2026-05-14 11:47:40 +08:00
parent b474d804c8
commit ba491c0c5a
7 changed files with 187 additions and 70 deletions
--- a/.memory/worklog.json
+++ b/.memory/worklog.json
@@ -1,19 +1,5 @@
 {
  "entries": [
    {
      "files_changed": 1,
      "hash": "c8fd985",
      "message": "auto-save 2026-05-13 03:54 (~1)",
      "ts": "2026-05-13T03:54:21+08:00",
      "type": "commit"
    },
    {
      "files_changed": 1,
      "hash": "ffc7437",
      "message": "auto-save 2026-05-13 04:00 (~1)",
      "ts": "2026-05-13T04:00:13+08:00",
      "type": "commit"
    },
    {
      "files_changed": 1,
      "hash": "7a5b09a",
@@ -3299,6 +3285,19 @@
      "type": "session-heartbeat",
      "message": "Codex 会话活跃 · 最近命令：codex · 1 项未提交变更 · 最近提交：auto-save 2026-05-14 11:36 (~3)",
      "files_changed": 1
    },
    {
      "ts": "2026-05-14T11:42:06+08:00",
      "type": "commit",
      "message": "auto-save 2026-05-14 11:41 (~1)",
      "hash": "b474d80",
      "files_changed": 1
    },
    {
      "ts": "2026-05-14T03:46:10Z",
      "type": "session-heartbeat",
      "message": "Codex 会话活跃 · 最近命令：codex · 6 项未提交变更 · 最近提交：auto-save 2026-05-14 11:41 (~1)",
      "files_changed": 6
    }
  ]
 }
--- a/api/.env.example
+++ b/api/.env.example
@@ -19,7 +19,7 @@ AUDIO_PRODUCT_BRIEF="SKG 智能按摩产品，主打日常肩颈、腰背、眼
 MINIMAX_API_KEY=
 MINIMAX_TTS_BASE_URL=https://api.minimax.io
 MINIMAX_TTS_MODEL=speech-2.8-turbo
-MINIMAX_TTS_VOICE_ID="Chinese (Mandarin)_Reliable_Executive"
+MINIMAX_TTS_VOICE_ID=English_expressive_narrator
 # Poe 视频 API（优先用于 Seedance / Kling / Veo）
 POE_API_BASE_URL=https://api.poe.com/v1
--- a/api/main.py
+++ b/api/main.py
@@ -49,8 +49,8 @@ MINIMAX_TTS_BASE_URL = os.getenv("MINIMAX_TTS_BASE_URL", "https://api.minimax.io
 MINIMAX_TTS_MODEL = os.getenv("MINIMAX_TTS_MODEL", "speech-2.8-turbo").strip() or "speech-2.8-turbo"
 MINIMAX_TTS_VOICE_ID = os.getenv(
    "MINIMAX_TTS_VOICE_ID",
-    "Chinese (Mandarin)_Reliable_Executive",
+    "English_expressive_narrator",
-).strip() or "Chinese (Mandarin)_Reliable_Executive"
+).strip() or "English_expressive_narrator"
 POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
 POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
@@ -379,6 +379,7 @@ class Job(BaseModel):
    duration: float = 0.0
    width: int = 0
    height: int = 0
    source_audio_url: str = ""
    frames: list[KeyFrame] = Field(default_factory=list)
    transcript: list[TranscriptSegment] = Field(default_factory=list)
    audio_script: AudioScript = Field(default_factory=AudioScript)
@@ -400,6 +401,14 @@ def job_dir(job_id: str) -> Path:
    return d
 def source_audio_url_for(job_id: str) -> str:
    return f"/jobs/{job_id}/audio.wav" if (JOBS_DIR / job_id / "audio.wav").exists() else ""
 def job_with_artifacts(job: Job) -> Job:
    return job.model_copy(update={"source_audio_url": source_audio_url_for(job.id)})
 def save_state(job: Job) -> None:
    (job_dir(job.id) / "state.json").write_text(job.model_dump_json(indent=2))
@@ -1224,7 +1233,7 @@ def pipeline_analyze(
        wav = d / "audio.wav"
        if wav.exists():
-            update(job, status="splitting", message="复用音轨 · 准备抽帧…", progress=35)
+            update(job, status="splitting", message="复用音轨 · 准备抽帧…", progress=35, source_audio_url=f"/jobs/{job_id}/audio.wav")
        else:
            update(job, status="splitting", message="ffmpeg 拆分音轨…", progress=35)
            run([
@@ -1232,6 +1241,7 @@ def pipeline_analyze(
                "-vn", "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le",
                str(wav),
            ])
            update(job, source_audio_url=f"/jobs/{job_id}/audio.wav")
        n = max(1, min(int(frame_count), 20))
        target_label = FRAME_TARGET_LABELS.get(target, FRAME_TARGET_LABELS["balanced"])
        duration = max(float(job.duration or 1.0), 0.1)
@@ -1497,12 +1507,12 @@ def _transcript_join(segments: list[TranscriptSegment], field: Literal["en", "zh
 def _fallback_audio_script(segments: list[TranscriptSegment]) -> str:
-    joined = " ".join((s.zh or s.en).strip() for s in segments if (s.zh or s.en).strip())
+    joined = " ".join((s.en or s.zh).strip() for s in segments if (s.en or s.zh).strip())
    if not joined:
-        return "日常疲惫不用硬扛。戴上 SKG，让肩颈慢慢放松，跟着呼吸找回轻松状态。"
+        return "Ease into the moment with SKG. Gentle warmth and rhythmic massage help everyday tension feel lighter, cleaner, and easier to leave behind."
    return (
-        "把日常紧绷交给 SKG。贴合身体需要放松的位置，热敷与按摩节奏自然陪伴，"
+        "Let SKG turn a short break into real relief. With soothing warmth and steady massage rhythm, "
-        "让每一次短暂休息都更轻松、更有质感。"
+        "everyday tension feels lighter, calmer, and easier to leave behind."
    )
@@ -1513,24 +1523,24 @@ def _rewrite_audio_script_sync(segments: list[TranscriptSegment]) -> tuple[str,
    source_text = _transcript_join(segments, "en")
    source_zh = _transcript_join(segments, "zh")
    prompt = (
-        "你是 SKG 短视频口播编导。根据参考视频音频转写，抽取它的表达结构、情绪节奏和可复用卖点，"
+        "You are an English short-video voice-over writer for SKG wellness massagers. "
-        "改写成适合 SKG 按摩/放松产品二创视频的中文口播文案。\n"
+        "Use the source transcript only for structure, pacing, and emotional hook, then rewrite it into a clean English VO for SKG.\n"
-        "要求：\n"
+        "Rules:\n"
-        "1. 输出 35-90 个中文字，适合 8-18 秒短视频配音。\n"
+        "1. Output 28-55 English words, suitable for an 8-18 second TTS voice-over.\n"
-        "2. 口语化、干净、高级，能直接给 TTS 朗读。\n"
+        "2. Make it natural, premium, concise, and ready to read aloud.\n"
-        "3. 不承诺治疗、治愈、医学疗效，不夸大。\n"
+        "3. Do not claim medical treatment, cure, pain elimination, or clinical effects.\n"
-        "4. 不复刻原视频品牌/人物/价格/平台话术，只保留表达结构。\n"
+        "4. Do not copy the original brand, creator, price, platform language, or exact claims.\n"
-        "5. 如果参考转写信息不足，按产品信息生成通用 SKG 放松口播。\n"
+        "5. If the source transcript is too thin, write a general SKG relaxation VO.\n"
-        '严格返回 JSON：{"rewritten_text":"..."}。\n\n'
+        'Return strict JSON only: {"rewritten_text":"..."}.\n\n'
-        f"SKG 产品信息：{AUDIO_PRODUCT_BRIEF}\n\n"
+        f"SKG product context: {AUDIO_PRODUCT_BRIEF}\n\n"
-        f"英文转写：\n{source_text or '无'}\n\n"
+        f"English transcript:\n{source_text or 'None'}\n\n"
-        f"中文翻译：\n{source_zh or '无'}"
+        f"Chinese translation for reference:\n{source_zh or 'None'}"
    )
    try:
        resp = llm().chat.completions.create(
            model=AUDIO_REWRITE_MODEL,
            messages=[
-                {"role": "system", "content": "只输出合法 JSON，不要解释，不要 markdown。"},
+                {"role": "system", "content": "Return valid JSON only. No explanation. No markdown."},
                {"role": "user", "content": prompt},
            ],
            response_format={"type": "json_object"},
@@ -1564,7 +1574,7 @@ def _minimax_tts_sync(job_id: str, text: str) -> str:
        "model": MINIMAX_TTS_MODEL,
        "text": text.strip()[:9500],
        "stream": False,
-        "language_boost": "Chinese",
+        "language_boost": "English",
        "output_format": "hex",
        "voice_setting": {
            "voice_id": MINIMAX_TTS_VOICE_ID,
@@ -1651,6 +1661,7 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
            ])
            if not wav.exists():
                raise RuntimeError("音频提取完成但找不到 audio.wav")
        update(job, source_audio_url=f"/jobs/{job_id}/audio.wav")
        if not LLM_API_KEY:
            # 无 key 模式：mock 数据
@@ -2112,7 +2123,7 @@ def get_job(job_id: str) -> Job:
    job = JOBS.get(job_id)
    if not job:
        raise HTTPException(404, "job not found")
-    return job
+    return job_with_artifacts(job)
@app.delete("/jobs/{job_id}")
@@ -2153,7 +2164,7 @@ async def trigger_transcribe(job_id: str, bg: BackgroundTasks) -> Job:
        update(job, error="", audio_script=audio_payload)
    if not start_audio_processing(job_id, manage_job_status=manage_job_status):
        update(job, message="音频已在处理中")
-    return job
+    return job_with_artifacts(job)
@app.get("/jobs/{job_id}/video.mp4")
--- a/web/app/page.tsx
+++ b/web/app/page.tsx
@@ -100,6 +100,8 @@ export default function Home() {
  const [jobs, setJobs] = useState<Job[]>([])
  const [activeJobId, setActiveJobId] = useState<string | null>(null)
  const job = useMemo(() => jobs.find((j) => j.id === activeJobId) ?? null, [jobs, activeJobId])
  const [audioStripJobId, setAudioStripJobId] = useState<string | null>(null)
  const audioStripJob = useMemo(() => jobs.find((j) => j.id === audioStripJobId) ?? null, [jobs, audioStripJobId])
  const [submitting, setSubmitting] = useState(false)
  const [analyzing, setAnalyzing] = useState(false)
  const [frameTargets, setFrameTargets] = useState<Record<string, FrameExtractTarget>>({})
@@ -159,6 +161,10 @@ export default function Home() {
  const handleSwitchJob = useCallback((id: string) => {
    setActiveJobId(id)
  }, [])
  const handleOpenAudioStrip = useCallback((jobId?: string) => {
    const targetId = jobId ?? activeJobId
    if (targetId) setAudioStripJobId(targetId)
  }, [activeJobId])
  const pollRef = useRef<ReturnType<typeof setInterval> | null>(null)
  const handleSubmit = useCallback(async (url: string) => {
@@ -393,16 +399,13 @@ export default function Home() {
  const handleTranscribeAudio = useCallback(async (jobId?: string, options?: { silent?: boolean }) => {
    const targetId = jobId ?? activeJobId
    if (!targetId) return
    setAudioStripJobId(targetId)
    const target = jobs.find((item) => item.id === targetId)
    if (!target) return
    if (!target.video_url) {
      if (!options?.silent) toast.info("视频导入完成后，可在音频卡片点击提取音频")
      return
    }
    if (target.status === "splitting") {
      if (!options?.silent) toast.info("当前正在抽帧，结束后可重新点击提取音频")
      return
    }
    if (target.status === "transcribing" || target.audio_script?.status === "rewriting") {
      if (!options?.silent) toast.info("音频正在处理中")
      return
@@ -728,9 +731,10 @@ export default function Home() {
    onCopyImage: handleCopyImage,
    onGenerateProductFusionVideo: handleGenerateProductFusionVideo,
    onTranscribeAudio: handleTranscribeAudio,
    onOpenAudioStrip: handleOpenAudioStrip,
    pinnedNodes,
    onToggleNodePin: handleToggleNodePin,
-  }), [job, jobs, activeJobId, submitting, analyzing, frameTargets, frameCounts, frameQualities, selectedFrames, expandedFrame, framePanelScale, framePanelPinned, framePanelDock, videoPanelJobId, videoPanelScale, videoPanelDock, handleSubmit, handleUpload, handleAnalyze, handleAnalyzeJob, handleFrameTargetChange, handleFrameCountChange, handleFrameQualityChange, handleToggleFrame, handleOpenFramePanel, handleFramePanelScaleChange, handleCloseExpandedFrame, handleAddManualFrame, handleAddManualFrameForJob, handleOpenVideoPanel, handleVideoPanelScaleChange, handleSwitchJob, updateJobInList, handleDeleteJob, handleDeleteFrame, handleDeleteFrameForJob, handleDeleteGenerated, handleDeleteVideo, handleDeleteCutout, handleOpenStoryboard, handleOpenWorkbench, clipboard, handleCopyImage, handleGenerateProductFusionVideo, handleTranscribeAudio, pinnedNodes, handleToggleNodePin])
+  }), [job, jobs, activeJobId, submitting, analyzing, frameTargets, frameCounts, frameQualities, selectedFrames, expandedFrame, framePanelScale, framePanelPinned, framePanelDock, videoPanelJobId, videoPanelScale, videoPanelDock, handleSubmit, handleUpload, handleAnalyze, handleAnalyzeJob, handleFrameTargetChange, handleFrameCountChange, handleFrameQualityChange, handleToggleFrame, handleOpenFramePanel, handleFramePanelScaleChange, handleCloseExpandedFrame, handleAddManualFrame, handleAddManualFrameForJob, handleOpenVideoPanel, handleVideoPanelScaleChange, handleSwitchJob, updateJobInList, handleDeleteJob, handleDeleteFrame, handleDeleteFrameForJob, handleDeleteGenerated, handleDeleteVideo, handleDeleteCutout, handleOpenStoryboard, handleOpenWorkbench, clipboard, handleCopyImage, handleGenerateProductFusionVideo, handleTranscribeAudio, handleOpenAudioStrip, pinnedNodes, handleToggleNodePin])
  // 用 useNodesState 让 ReactFlow 自己管位置（避免轮询时重置 drag）
  const savedSizes = useMemo(() => loadNodeSizes(), [])
@@ -1013,7 +1017,7 @@ export default function Home() {
            <div className="h-full w-full" suppressHydrationWarning />
          )}
          </div>
-          {clientReady && <AudioStrip job={job} />}
+          {clientReady && <AudioStrip job={audioStripJob} open={!!audioStripJob} onClose={() => setAudioStripJobId(null)} />}
        </section>
        <Toaster theme="system" position="top-center" />
--- a/web/components/audio-strip.tsx
+++ b/web/components/audio-strip.tsx
@@ -1,7 +1,7 @@
 "use client"
 import { useEffect, useMemo, useRef, useState, type PointerEvent as ReactPointerEvent } from "react"
-import { ChevronDown, ChevronUp, GripHorizontal, Mic2, Volume2 } from "lucide-react"
+import { ChevronDown, ChevronUp, GripHorizontal, Mic2, Volume2, X } from "lucide-react"
 import { apiAssetUrl, sourceAudioUrl, type Job, type TranscriptSegment } from "@/lib/api"
 const STORAGE_KEY = "skg.audio-strip.height"
@@ -58,20 +58,34 @@ function SegmentCard({
  segment,
  peaks,
  duration,
  currentTime,
 }: {
  segment: TranscriptSegment
  peaks: number[]
  duration: number
  currentTime: number
 }) {
  const segDuration = Math.max(1.2, segment.end - segment.start)
  const width = clamp(180 + segDuration * 42, 220, 520)
  const segPeaks = slicePeaks(peaks, segment.start, segment.end, duration)
  const active = currentTime >= segment.start && currentTime <= Math.max(segment.end, segment.start + 0.2)
  const pointerPct = active ? clamp(((currentTime - segment.start) / Math.max(0.2, segment.end - segment.start)) * 100, 0, 100) : 0
  return (
    <article
-      className="shrink-0 rounded-lg border border-white/10 bg-white/[0.045] p-3 shadow-[0_12px_30px_-22px_rgba(0,0,0,0.8)]"
+      className={`relative shrink-0 overflow-hidden rounded-lg border p-3 shadow-[0_12px_30px_-22px_rgba(0,0,0,0.8)] transition ${
        active
          ? "border-emerald-300/55 bg-emerald-300/[0.105]"
          : "border-white/10 bg-white/[0.045]"
      }`}
      style={{ width }}
    >
      {active && (
        <div
          className="pointer-events-none absolute inset-y-0 z-10 w-[2px] bg-emerald-200 shadow-[0_0_18px_rgba(110,231,183,0.9)]"
          style={{ left: `${pointerPct}%` }}
        />
      )}
      <div className="mb-2 flex items-center justify-between gap-3">
        <span className="font-mono text-[10px] text-[var(--text-faint)]">
          {segment.start.toFixed(1)}s to {segment.end.toFixed(1)}s
@@ -93,7 +107,7 @@ function SegmentCard({
            {segment.zh || <span className="text-[var(--text-faint)] italic">翻译中...</span>}
          </p>
        </div>
-        <Waveform peaks={segPeaks} />
+        <Waveform peaks={segPeaks} active={active} />
      </div>
    </article>
  )
@@ -126,19 +140,33 @@ async function decodeWaveform(url: string, targetPeaks = 1800) {
  }
 }
-export function AudioStrip({ job }: { job: Job | null }) {
+export function AudioStrip({ job, open, onClose }: { job: Job | null; open: boolean; onClose?: () => void }) {
  const [collapsed, setCollapsed] = useState(false)
  const [height, setHeight] = useState(DEFAULT_HEIGHT)
  const [peaks, setPeaks] = useState<number[]>([])
  const [sourceReady, setSourceReady] = useState(false)
  const [audioKey, setAudioKey] = useState(0)
  const [currentTime, setCurrentTime] = useState(0)
  const dragRef = useRef<{ startY: number; startHeight: number } | null>(null)
  const audioRef = useRef<HTMLAudioElement>(null)
  const transcript = job?.transcript ?? []
  const audioScript = job?.audio_script
  const voiceUrl = apiAssetUrl(audioScript?.voice_url)
-  const hasAudio = !!job && (transcript.length > 0 || !!audioScript?.rewritten_text || job.status === "transcribing")
+  const sourceUrl = job ? apiAssetUrl(job.source_audio_url || sourceAudioUrl(job.id)) : ""
  const processing = !!job && (job.status === "transcribing" || audioScript?.status === "rewriting")
  const activeSegment = transcript.find((segment) => currentTime >= segment.start && currentTime <= Math.max(segment.end, segment.start + 0.2))
  const duration = useMemo(() => {
    const lastTranscriptEnd = transcript.reduce((max, s) => Math.max(max, s.end || 0), 0)
-    return Math.max(job?.duration ?? 0, lastTranscriptEnd, 1)
+    const audioDuration = audioRef.current?.duration
    return Math.max(
      Number.isFinite(audioDuration) ? Number(audioDuration) : 0,
      job?.duration ?? 0,
      lastTranscriptEnd,
      1,
    )
  }, [job?.duration, transcript])
  const timelinePeaks = useMemo(() => slicePeaks(peaks, 0, duration, duration, 160), [duration, peaks])
  const pointerPct = clamp((currentTime / duration) * 100, 0, 100)
  useEffect(() => {
    if (typeof window === "undefined") return
@@ -148,21 +176,38 @@ export function AudioStrip({ job }: { job: Job | null }) {
  useEffect(() => {
    let cancelled = false
    let timer: ReturnType<typeof setTimeout> | null = null
    let attempts = 0
    setPeaks([])
-    if (!job?.id || !hasAudio) return
+    setSourceReady(false)
-    decodeWaveform(sourceAudioUrl(job.id))
+    setCurrentTime(0)
    if (!job?.id || !open) return
    setPeaks(fallbackPeaks(1800, `${job.id}-loading`))
    const load = () => {
      attempts += 1
      decodeWaveform(sourceUrl)
      .then((next) => {
-        if (!cancelled) setPeaks(next)
+        if (cancelled) return
        setPeaks(next)
        setSourceReady(true)
        setAudioKey((key) => key + 1)
      })
      .catch(() => {
-        if (!cancelled) setPeaks(fallbackPeaks(1800, `${job.id}-${transcript.length}`))
+        if (cancelled) return
        setSourceReady(false)
        if (attempts < (processing ? 45 : 6)) {
          timer = setTimeout(load, 1000)
        }
      })
    }
    load()
    return () => {
      cancelled = true
      if (timer) clearTimeout(timer)
    }
-  }, [job?.id, hasAudio, transcript.length])
+  }, [job?.id, open, processing, sourceUrl, transcript.length])
-  if (!hasAudio || !job) return null
+  if (!open || !job) return null
  const startDrag = (e: ReactPointerEvent<HTMLDivElement>) => {
    e.preventDefault()
@@ -206,7 +251,7 @@ export function AudioStrip({ job }: { job: Job | null }) {
          {voiceUrl && (
            <div className="hidden items-center gap-1.5 text-[10px] text-emerald-200/80 sm:flex">
              <Volume2 className="h-3.5 w-3.5" />
-              MiniMax ready
+              English VO ready
            </div>
          )}
          <button
@@ -217,27 +262,79 @@ export function AudioStrip({ job }: { job: Job | null }) {
          >
            {collapsed ? <ChevronUp className="h-3.5 w-3.5" /> : <ChevronDown className="h-3.5 w-3.5" />}
          </button>
          {onClose && (
            <button
              type="button"
              onClick={onClose}
              className="inline-flex h-6 w-6 items-center justify-center rounded-md border border-white/10 text-white/65 transition hover:bg-white/10 hover:text-white"
              title="关闭音频条"
            >
              <X className="h-3.5 w-3.5" />
            </button>
          )}
        </div>
      </div>
      {!collapsed && (
        <div className="grid h-[calc(100%-48px)] grid-cols-[minmax(0,1fr)_300px] gap-3 p-3 max-lg:grid-cols-1">
-          <div className="min-w-0 overflow-x-auto overflow-y-hidden pb-1">
+          <div className="flex min-w-0 min-h-0 flex-col gap-3 overflow-hidden">
-            {transcript.length > 0 ? (
+            <div className="rounded-lg border border-white/10 bg-black/20 p-2">
-              <div className="flex h-full items-stretch gap-3">
+              <div className="mb-2 flex items-center justify-between gap-3">
-                {transcript.map((segment) => (
+                <div className="min-w-0 text-[10px] uppercase tracking-widest text-white/45">
-                  <SegmentCard key={segment.index} segment={segment} peaks={peaks} duration={duration} />
+                  Source audio playback
-                ))}
+                  {activeSegment ? <span className="ml-2 text-emerald-200/80">#{activeSegment.index + 1}</span> : null}
                </div>
                <div className="shrink-0 font-mono text-[10px] text-white/45">
                  {currentTime.toFixed(1)}s / {duration.toFixed(1)}s
                </div>
              </div>
              {sourceReady ? (
                <audio
                  key={audioKey}
                  ref={audioRef}
                  controls
                  src={sourceUrl}
                  className="h-8 w-full"
                  onTimeUpdate={(event) => setCurrentTime(event.currentTarget.currentTime)}
                  onSeeked={(event) => setCurrentTime(event.currentTarget.currentTime)}
                  onLoadedMetadata={(event) => setCurrentTime(event.currentTarget.currentTime)}
                />
              ) : (
                <div className="flex h-8 items-center rounded-md border border-dashed border-white/12 px-3 text-[11px] text-white/45">
                  {processing ? "正在提取原音频并准备波形..." : "等待原音频波形..."}
                </div>
              )}
              <div className="relative mt-2">
                <Waveform peaks={timelinePeaks} active={sourceReady} />
                <div
                  className="pointer-events-none absolute inset-y-0 w-[2px] bg-emerald-200 shadow-[0_0_18px_rgba(110,231,183,0.9)]"
                  style={{ left: `${pointerPct}%` }}
                />
              </div>
            </div>
            <div className="min-h-0 overflow-x-auto overflow-y-hidden pb-1">
              {transcript.length > 0 ? (
                <div className="flex h-full items-stretch gap-3">
                  {transcript.map((segment) => (
                    <SegmentCard
                      key={segment.index}
                      segment={segment}
                      peaks={peaks}
                      duration={duration}
                      currentTime={currentTime}
                    />
                  ))}
                </div>
            ) : (
              <div className="flex h-full items-center justify-center rounded-lg border border-dashed border-white/12 text-[12px] text-white/45">
-                音频识别完成后，这里会按时间显示英文、中文翻译和对应波形。
+                点击音频卡片后开始解析；完成后这里会按时间显示英文、中文翻译和对应波形。
              </div>
            )}
            </div>
          </div>
          <div className="min-h-0 overflow-y-auto rounded-lg border border-emerald-300/20 bg-emerald-300/[0.07] p-3 max-lg:hidden">
-            <div className="mb-2 text-[10px] uppercase tracking-widest text-emerald-100/70">改后 · SKG 口播</div>
+            <div className="mb-2 text-[10px] uppercase tracking-widest text-emerald-100/70">English VO · SKG rewrite</div>
            <p className="text-[12.5px] leading-relaxed text-white/90">
-              {audioScript?.rewritten_text || "等待转录完成后生成适合 SKG 产品视频的口播文案。"}
+              {audioScript?.rewritten_text || "Waiting for the parsed transcript to become an English SKG voice-over."}
            </p>
            {voiceUrl && (
              <audio controls src={voiceUrl} className="mt-3 h-8 w-full" />
--- a/web/components/nodes/index.tsx
+++ b/web/components/nodes/index.tsx
@@ -76,6 +76,7 @@ export interface NodeData {
  onCopyImage?: (ref: ImageRef) => void  // 复制图片到全局剪贴板（粘贴到分镜头编排插槽）
  onGenerateProductFusionVideo?: (frameIdx: number, shot: ProductFusionShot) => Promise<void> | void
  onTranscribeAudio?: (jobId?: string) => Promise<void> | void
  onOpenAudioStrip?: (jobId?: string) => void
  pinnedNodes?: Set<string>              // 已钉住的节点 id 集合 — 钉住后位置 + 尺寸锁定
  onToggleNodePin?: (id: string) => void
 }
@@ -2141,7 +2142,12 @@ export function AudioNode({ data, selected }: any) {
      pinned={d.pinnedNodes?.has("audio")}
      onTogglePin={() => d.onToggleNodePin?.("audio")}
    >
-      <div className="space-y-2 text-[11px] text-[var(--text-soft)] leading-snug">
+      <div
        className="space-y-2 text-[11px] text-[var(--text-soft)] leading-snug"
        onClick={() => {
          if (job?.video_url) d.onOpenAudioStrip?.(job.id)
        }}
      >
        <div>
          音轨 → ASR 转录 → 英中翻译 → SKG 口播改写 → MiniMax 配音<br />
          <span className="text-[var(--text-faint)] font-mono">
@@ -2154,6 +2160,7 @@ export function AudioNode({ data, selected }: any) {
            disabled={audioButtonDisabled}
            onClick={(e) => {
              e.stopPropagation()
              d.onOpenAudioStrip?.(job.id)
              if (audioButtonDisabled) return
              void d.onTranscribeAudio?.(job.id)
            }}
@@ -2185,9 +2192,7 @@ export function AudioNode({ data, selected }: any) {
            )}
          </div>
        )}
-        {voiceUrl && (
+        {voiceUrl && <div className="text-[10.5px] text-emerald-200/85">MiniMax English voice ready · 底部音频条播放</div>}
          <audio controls src={voiceUrl} className="h-7 w-full" />
        )}
        {isRewriting && (
          <div className="text-[10.5px] text-[var(--text-faint)]">正在生成改写文案和配音…</div>
        )}
--- a/web/lib/api.ts
+++ b/web/lib/api.ts
@@ -331,6 +331,7 @@ export interface Job {
  duration?: number
  width?: number
  height?: number
  source_audio_url?: string
  frames: KeyFrame[]
  transcript: TranscriptSegment[]
  audio_script?: AudioScript