From 3aceb221ac95ec1660fbb9f2d902f2298277bfa7 Mon Sep 17 00:00:00 2001 From: kang Date: Thu, 14 May 2026 10:59:27 +0800 Subject: [PATCH] auto-save 2026-05-14 10:59 (~6) --- .memory/worklog.json | 40 ++++++++++++++++------------------ api/README.md | 2 +- api/main.py | 24 +++++++++++++++----- docs/source-analysis.html | 16 ++++++++++++-- web/app/page.tsx | 26 ++++++++++------------ web/components/nodes/index.tsx | 30 ++++++++++++++++++++----- 6 files changed, 89 insertions(+), 49 deletions(-) diff --git a/.memory/worklog.json b/.memory/worklog.json index d593210..6049b08 100644 --- a/.memory/worklog.json +++ b/.memory/worklog.json @@ -1,26 +1,5 @@ { "entries": [ - { - "files_changed": 1, - "hash": "f4a421b", - "message": "auto-save 2026-05-13 02:07 (~1)", - "ts": "2026-05-13T02:08:10+08:00", - "type": "commit" - }, - { - "files_changed": 1, - "hash": "a63d7c7", - "message": "auto-save 2026-05-13 02:13 (~1)", - "ts": "2026-05-13T02:14:06+08:00", - "type": "commit" - }, - { - "files_changed": 1, - "hash": "1a5f5be", - "message": "auto-save 2026-05-13 02:19 (~1)", - "ts": "2026-05-13T02:20:00+08:00", - "type": "commit" - }, { "files_changed": 1, "hash": "bcc7ce0", @@ -3309,6 +3288,25 @@ "type": "session-heartbeat", "message": "Codex 会话活跃 · 最近命令:codex · 5 项未提交变更 · 最近提交:auto-save 2026-05-14 10:45 (+1, ~5)", "files_changed": 5 + }, + { + "ts": "2026-05-14T10:53:54+08:00", + "type": "commit", + "message": "auto-save 2026-05-14 10:51 (~7)", + "hash": "8bd52f6", + "files_changed": 7 + }, + { + "ts": "2026-05-14T02:56:10Z", + "type": "session-heartbeat", + "message": "Codex 会话活跃 · 最近命令:codex · 3 项未提交变更 · 最近提交:auto-save 2026-05-14 10:51 (~7)", + "files_changed": 3 + }, + { + "ts": "2026-05-14T02:58:38Z", + "type": "session-heartbeat", + "message": "Codex 会话活跃 · 最近命令:codex · 5 项未提交变更 · 最近提交:auto-save 2026-05-14 10:51 (~7)", + "files_changed": 5 } ] } diff --git a/api/README.md b/api/README.md index c4468f8..1b50395 100644 --- a/api/README.md +++ b/api/README.md @@ -18,7 +18,7 
@@ uvicorn main:app --port 4291 --reload - `GET /health` — 健康检查 + 配置状态 - `POST /jobs` `{url}` — 创建 job,后台跑下载/拆轨/抽帧 - `GET /jobs/{id}` — 当前状态 + 产物 -- `POST /jobs/{id}/transcribe` — 触发 ASR + 翻译 + SKG 文案改写;配置 MiniMax 后生成配音 +- `POST /jobs/{id}/transcribe` — 触发音频提取 + ASR + 翻译 + SKG 文案改写;配置 MiniMax 后生成配音。前端 Audio 节点提供“提取音频 / 重新提取音频”按钮,不依赖抽帧完成 - `GET /jobs/{id}/video.mp4` — 原视频 - `GET /jobs/{id}/audio.wav` — 拆轨后的原始音频,供前端底部音频条生成波形 - `GET /jobs/{id}/audio-script.mp3` — 改写文案的 MiniMax 配音 diff --git a/api/main.py b/api/main.py index a308bd4..c0601ce 100644 --- a/api/main.py +++ b/api/main.py @@ -97,7 +97,8 @@ def llm() -> OpenAI: return _llm_client # Pipeline 状态: -# created → downloading → downloaded(停,等用户点解析)→ splitting → frames_extracted +# created → downloading → downloaded(停,等用户点解析/提取音频) +# → splitting → frames_extracted # → transcribing → transcribed | failed JobStatus = Literal[ "created", "downloading", "downloaded", @@ -1563,7 +1564,17 @@ async def pipeline_transcribe(job_id: str) -> None: wav = d / "audio.wav" try: if not wav.exists(): - raise RuntimeError("audio.wav 不存在") + mp4 = d / "source.mp4" + if not mp4.exists(): + raise RuntimeError("source.mp4 不存在,视频导入完成后再提取音频") + update(job, status="transcribing", message="ffmpeg 提取音频轨…", progress=max(job.progress, 45), error="") + run([ + "ffmpeg", "-y", "-i", str(mp4), + "-vn", "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", + str(wav), + ]) + if not wav.exists(): + raise RuntimeError("音频提取完成但找不到 audio.wav") if not LLM_API_KEY: # 无 key 模式:mock 数据 @@ -2004,9 +2015,12 @@ async def trigger_transcribe(job_id: str, bg: BackgroundTasks) -> Job: job = JOBS.get(job_id) if not job: raise HTTPException(404, "job not found") - if job.status not in {"frames_extracted", "transcribed", "failed"}: - raise HTTPException(409, f"status must be frames_extracted/transcribed/failed, got {job.status}") - update(job, status="transcribing", progress=max(job.progress, 72), error="", message="准备音频转写…") + mp4 = job_dir(job_id) / "source.mp4" + if 
job.status in {"created", "downloading"} or not mp4.exists(): + raise HTTPException(409, f"video not ready, got {job.status}") + if job.status in {"splitting", "transcribing"} or job.audio_script.status == "rewriting": + raise HTTPException(409, f"job is busy, got {job.status}") + update(job, status="transcribing", progress=max(job.progress, 45), error="", message="准备提取音频…") bg.add_task(pipeline_transcribe, job_id) return job diff --git a/docs/source-analysis.html b/docs/source-analysis.html index 0f13a5f..31e6f5a 100644 --- a/docs/source-analysis.html +++ b/docs/source-analysis.html @@ -792,7 +792,7 @@ SubjectAsset { 上传视频POST /jobs/uploaduploadJob保存 source.mp4,然后同样进入下载完成状态。 删除输入视频DELETE /jobs/{id}deleteJob从任务队列、URL 和磁盘 jobs/<id> 目录移除整个 job,包括源视频、关键帧、元素提取图和生成视频。 解析视频POST /jobs/{id}/analyze?frames=&target=&mode=&quality=analyzeJob拆轨 + 目标化抽关键帧。默认 frames=12target 支持透明骨架人、综合、清晰主体、转场变化、表情瞬间、动作峰值;当前 UI 默认 transparent_human。透明骨架人目标会先扩大本地候选池,再调用 Vision 按 6 个分数验收;不合格候选自动丢弃并抽下一候选。mode=append 追加新关键帧;quality=auto 根据本机算力和视频时长自动选择快速、精细或极准。多个抽帧请求进入后端队列顺序处理。 - 音频文案轨POST /jobs/{id}/transcribetriggerTranscribe读取拆轨得到的 audio.wav,先 ASR 得到英文时间戳段落,再翻译中文,随后按 AUDIO_PRODUCT_BRIEF 生成 audio_script.rewritten_text;配置 MINIMAX_API_KEY 后调用 MiniMax T2A 生成 audio_script.voice_url。 + 音频文案轨POST /jobs/{id}/transcribetriggerTranscribe读取拆轨得到的 audio.wav,先 ASR 得到英文时间戳段落,再翻译中文,随后按 AUDIO_PRODUCT_BRIEF 生成 audio_script.rewritten_text;配置 MINIMAX_API_KEY 后调用 MiniMax T2A 生成 audio_script.voice_url。前端在抽帧完成且尚无 transcript 时会自动触发一次;Audio 节点也提供“开始/重新处理音频”按钮。 原始音频文件GET /jobs/{id}/audio.wavsourceAudioUrl返回拆轨得到的 wav;底部 AudioStrip 拉取该文件,用 Web Audio API 解码并计算波形峰值,只读展示,不参与改写。 改写配音文件GET /jobs/{id}/audio-script.mp3apiAssetUrl(job.audio_script.voice_url)返回 MiniMax T2A 生成的 mp3。没有配置 MiniMax 或生成失败时该文件不存在,但改写文案仍会保存在 audio_script.rewritten_text。 手动加帧POST /jobs/{id}/frames?t=addManualFrame按视频时间戳抽一帧,index 递增但 frames 按 timestamp 排序。 @@ -841,7 +841,7 @@ SubjectAsset { Audio / ASR / Rewrite - 独立声音文案轨:从 audio.wav 提取原始口播、翻译中文、改写成 SKG 
产品语境口播;MiniMax T2A 配置后生成配音 mp3。主画布的 AudioNode 用“改前 · 原音频 / 改后 · SKG 口播”摘要展示;底部 AudioStrip 吸附屏幕底端,可拖拽调整高度,按时间段展示英文、中文翻译和波形;侧栏 Rewrite 展开后显示完整审核视图。 + 独立声音文案轨:从 audio.wav 提取原始口播、翻译中文、改写成 SKG 产品语境口播;MiniMax T2A 配置后生成配音 mp3。抽帧完成后自动触发一次,也可在主画布 AudioNode 手动开始/重新处理。AudioNode 用“改前 · 原音频 / 改后 · SKG 口播”摘要展示;底部 AudioStrip 吸附屏幕底端,可拖拽调整高度,按时间段展示英文、中文翻译和波形;侧栏 Rewrite 展开后显示完整审核视图。 不要阻断视觉素材管线。 AudioNodeAudioStripASRNodeTranslateNodeRewriteNodepipeline_transcribeAudioScript @@ -918,6 +918,18 @@ SubjectAsset {

变更记录

这个记录不是 git log 的替代品。它记录“产品理解发生了什么变化、影响了哪些源码、你以后描述需求时该怎么说”。后续每次改功能都要补一条。

+
+
+

2026-05-14 · 音频提取改为手动触发,不再依赖抽帧完成

+ Audio + Workflow +
+
+

问题:后端已有 /transcribe 接口,但前端没有入口调用,用户不知道什么时候音频开始工作。

+

改动:AudioNode 增加“提取音频 / 重新提取音频”按钮,由用户手动调用 triggerTranscribe;前端不再在 job 进入 frames_extracted 后自动触发。后端触发接口只要求 source.mp4 已就绪且 job 不处于 splitting / transcribing / rewriting,缺少 audio.wav 时会先用 ffmpeg 从 source.mp4 提取音轨;接口会立即把 job 状态置为 transcribing,让轮询、节点状态和底部音频条能立刻进入运行态。

+

影响:web/app/page.tsx、web/components/nodes/index.tsx、api/main.py、api/README.md、docs/source-analysis.html。

+
+

2026-05-14 · 新增底部可伸缩音频条

diff --git a/web/app/page.tsx b/web/app/page.tsx index a4ac1b3..3a714ba 100644 --- a/web/app/page.tsx +++ b/web/app/page.tsx @@ -390,31 +390,27 @@ export default function Home() { if (!targetId) return const target = jobs.find((item) => item.id === targetId) if (!target) return - if (!["frames_extracted", "transcribed", "failed"].includes(target.status)) { - if (!options?.silent) toast.info("先完成抽帧,音频轨会自动开始处理") + if (!target.video_url) { + if (!options?.silent) toast.info("视频导入完成后,可在音频卡片点击提取音频") + return + } + if (target.status === "splitting") { + if (!options?.silent) toast.info("当前正在抽帧,结束后可重新点击提取音频") + return + } + if (target.status === "transcribing" || target.audio_script?.status === "rewriting") { + if (!options?.silent) toast.info("音频正在处理中") return } try { const updated = await triggerTranscribe(targetId) updateJobInList(updated) - if (!options?.silent) toast.success("音频处理已开始") + if (!options?.silent) toast.success("已开始提取音频") } catch (e) { if (!options?.silent) toast.error("音频处理启动失败:" + (e instanceof Error ? e.message : String(e))) } }, [activeJobId, jobs, updateJobInList]) - const autoAudioStartedRef = useRef>(new Set()) - useEffect(() => { - for (const item of jobs) { - const audioStatus = item.audio_script?.status ?? 
"idle" - const hasAudioOutput = item.transcript.length > 0 || !!item.audio_script?.rewritten_text - const ready = item.status === "frames_extracted" && !hasAudioOutput && audioStatus !== "rewriting" - if (!ready || autoAudioStartedRef.current.has(item.id)) continue - autoAudioStartedRef.current.add(item.id) - void handleTranscribeAudio(item.id, { silent: true }) - } - }, [jobs, handleTranscribeAudio]) - const handleQuickGenerateVideo = useCallback(async (frameIdx: number, scene: StoryboardScene, model: string) => { if (!job) return const frame = job.frames.find((f) => f.index === frameIdx) diff --git a/web/components/nodes/index.tsx b/web/components/nodes/index.tsx index 579075b..25f297b 100644 --- a/web/components/nodes/index.tsx +++ b/web/components/nodes/index.tsx @@ -2109,7 +2109,19 @@ export function AudioNode({ data, selected }: any) { const voiceUrl = apiAssetUrl(audioScript?.voice_url) const hasASR = transcript.length > 0 const isRewriting = audioScript?.status === "rewriting" - const canTriggerAudio = !!job && ["frames_extracted", "transcribed", "failed"].includes(job.status) && !isRewriting && job.status !== "transcribing" + const hasVideo = !!job?.video_url + const isAudioBusy = !!job && (job.status === "transcribing" || isRewriting) + const isVisualBusy = !!job && job.status === "splitting" + const audioButtonDisabled = !job || !hasVideo || isAudioBusy || isVisualBusy + const audioButtonLabel = !hasVideo + ? "等待视频就绪" + : isAudioBusy + ? "正在提取音频" + : isVisualBusy + ? "抽帧中,稍后提取" + : hasASR || rewrittenText + ? "重新提取音频" + : "提取音频" const originalPreview = transcript .slice(0, 2) .map((s) => (s.zh || s.en).trim()) @@ -2139,17 +2151,25 @@ export function AudioNode({ data, selected }: any) { {audioScript?.rewrite_model || "AUDIO_REWRITE_MODEL"} → {audioScript?.voice_model || "MiniMax T2A"}
- {canTriggerAudio && ( + {job && ( )} {(originalPreview || rewrittenText) && (