diff --git a/.memory/worklog.json b/.memory/worklog.json index 9e173d9..cf164ed 100644 --- a/.memory/worklog.json +++ b/.memory/worklog.json @@ -1,19 +1,5 @@ { "entries": [ - { - "files_changed": 1, - "hash": "d6b86fc", - "message": "auto-save 2026-05-13 02:49 (~1)", - "ts": "2026-05-13T02:49:30+08:00", - "type": "commit" - }, - { - "files_changed": 1, - "hash": "8cbb1a9", - "message": "auto-save 2026-05-13 02:55 (~1)", - "ts": "2026-05-13T02:55:25+08:00", - "type": "commit" - }, { "files_changed": 1, "hash": "989728d", @@ -3305,6 +3291,19 @@ "type": "session-heartbeat", "message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 11:04 (~6)", "files_changed": 1 + }, + { + "ts": "2026-05-14T11:10:38+08:00", + "type": "commit", + "message": "auto-save 2026-05-14 11:10 (~1)", + "hash": "7270222", + "files_changed": 1 + }, + { + "ts": "2026-05-14T03:16:10Z", + "type": "session-heartbeat", + "message": "Codex 会话活跃 · 最近命令:codex · 5 项未提交变更 · 最近提交:auto-save 2026-05-14 11:10 (~1)", + "files_changed": 5 } ] } diff --git a/api/README.md b/api/README.md index 28ba9e3..763b0da 100644 --- a/api/README.md +++ b/api/README.md @@ -20,7 +20,7 @@ uvicorn main:app --host 127.0.0.1 --port 4291 - `GET /health` — 健康检查 + 配置状态 - `POST /jobs` `{url}` — 创建 job,后台下载源视频,视频就绪后可手动解析或提取音频 - `GET /jobs/{id}` — 当前状态 + 产物 -- `POST /jobs/{id}/transcribe` — 触发音频提取 + ASR + 翻译 + SKG 文案改写;配置 MiniMax 后生成配音。前端 Audio 节点提供“提取音频 / 重新提取音频”按钮,不依赖抽帧完成 +- `POST /jobs/{id}/transcribe` — 触发音频提取 + ASR + 翻译 + SKG 文案改写;配置 MiniMax 后生成配音。前端 Audio 节点提供“提取音频 / 重新提取音频”按钮,可与抽帧并行,不自动触发 - `GET /jobs/{id}/video.mp4` — 原视频 - `GET /jobs/{id}/audio.wav` — 拆轨后的原始音频,供前端底部音频条生成波形 - `GET /jobs/{id}/audio-script.mp3` — 改写文案的 MiniMax 配音 diff --git a/api/main.py b/api/main.py index 358e62a..d07c48b 100644 --- a/api/main.py +++ b/api/main.py @@ -6,6 +6,7 @@ import json import os import shutil import subprocess +import threading import time import uuid from contextlib import asynccontextmanager @@ -388,6 +389,8 @@ class Job(BaseModel): JOBS: dict[str, Job] = {} ANALYZE_QUEUE: list[AnalyzeTask] = [] ANALYZE_WORKER_RUNNING = False +AUDIO_WORKERS_RUNNING: set[str] = set() +AUDIO_WORKERS_LOCK = threading.Lock() def job_dir(job_id: str) -> Path: @@ -974,8 +977,8 @@ def _target_score(item: dict, target: FrameExtractTarget) -> float: motion = float(item.get("motion_n", 0.0)) if target == "transparent_human": - # 透明骨架人仍先依赖本地清晰度 / 中心主体 / 对比度筛候选, - # 后续再交给 Vision 逐张语义验收。 + # 当前抽帧阶段走本地算力:优先清晰中心主体、高对比、适度色彩和时间覆盖。 + # 透明骨架人的语义判断留给后续审核/识别,不在抽帧阶段逐帧调用 Vision。 score = center * 0.45 + sharp * 0.30 + contrast * 0.15 + color * 0.10 elif target == "subject": score = center * 0.48 + sharp * 0.25 + contrast * 0.17 + color * 0.10 @@ -1217,7 +1220,6 @@ def pipeline_analyze( "-vn", "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", str(wav), ]) - n = max(1, min(int(frame_count), 20)) target_label = FRAME_TARGET_LABELS.get(target, FRAME_TARGET_LABELS["balanced"]) duration = max(float(job.duration or 1.0), 0.1) @@ -1260,16 +1262,11 @@ def pipeline_analyze( raise RuntimeError("候选帧评分失败") # 2) 目标化筛选:pHash 去重 + 清晰度 / 中心细节 / 转场变化 / 动作强度。 - # 透明骨架人目标会先扩大候选池,再用 Vision 逐张验收;不合格自动换下一帧。 - semantic_transparent = target == "transparent_human" - if semantic_transparent: - selection_count = min(len(candidates), min(max(n * 10, 24), 48)) - update(job, message=f"{quality_label}筛选透明骨架人候选 · 本地 {selection_count} / {len(candidates)} 张…", progress=58) - chosen = _rank_keyframe_candidates(candidates, target, selection_count) - else: - selection_count = n if replacing else min(len(candidates), max(n * 4, n + len(existing_frames) + 2)) - update(job, message=f"{quality_label}筛选 · {target_label} · {n} / {len(candidates)} 张…", progress=60) - chosen = _select_keyframes(candidates, selection_count, target) + # 抽帧阶段只走本机算力,不逐帧调用 Vision;语义审核留到后续素材准备。 + semantic_transparent = False + selection_count = n if replacing else min(len(candidates), max(n * 4, n + len(existing_frames) + 2)) + update(job, message=f"{quality_label}本地筛选 · {target_label} · {n} / {len(candidates)} 张…", progress=60) + chosen = _select_keyframes(candidates, selection_count, target) # 3) 只对最终选中的时间点,从原视频抽高质量关键帧。 renamed: list[KeyFrame] = [] @@ -1558,16 +1555,20 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment]) -> ) -async def pipeline_transcribe(job_id: str) -> None: +def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None: job = JOBS[job_id] d = job_dir(job_id) wav = d / "audio.wav" + def progress(message: str, value: int) -> None: + if manage_job_status: + update(job, status="transcribing", message=message, progress=value, error="") + try: if not wav.exists(): mp4 = d / "source.mp4" if not mp4.exists(): raise RuntimeError("source.mp4 不存在,视频导入完成后再提取音频") - update(job, status="transcribing", message="ffmpeg 提取音频轨…", progress=max(45, min(job.progress, 70)), error="") + progress("ffmpeg 提取音频轨…", max(45, min(job.progress, 70))) run([ "ffmpeg", "-y", "-i", str(mp4), "-vn", "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", @@ -1578,8 +1579,8 @@ async def pipeline_transcribe(job_id: str) -> None: if not LLM_API_KEY: # 无 key 模式:mock 数据 - update(job, status="transcribing", message="ASR (mock) …", progress=75) - await asyncio.sleep(1.0) + progress("ASR (mock) …", 75) + time.sleep(1.0) mock = [ TranscriptSegment(index=0, start=0.0, end=3.5, en="Welcome back, today we're testing something new.", @@ -1588,10 +1589,9 @@ async def pipeline_transcribe(job_id: str) -> None: en="This device looks really sleek and minimal.", zh="这个设备看起来非常时尚和简约。"), ] - update( - job, - transcript=mock, - audio_script=AudioScript( + update_kwargs = { + "transcript": mock, + "audio_script": AudioScript( status="rewriting", source_text=_transcript_join(mock, "en"), source_zh=_transcript_join(mock, "zh"), @@ -1601,18 +1601,22 @@ async def pipeline_transcribe(job_id: str) -> None: voice_model=MINIMAX_TTS_MODEL, voice_id=MINIMAX_TTS_VOICE_ID, ), - message="ASR mock 完成,生成 SKG 改写文案…", - progress=92, - ) - audio_script = await asyncio.to_thread(_build_audio_script_sync, job_id, mock) - update(job, transcript=mock, status="transcribed", progress=100, - audio_script=audio_script, - message="转录完成(MOCK · 未设 LLM_API_KEY)") + } + if manage_job_status: + update_kwargs.update(message="ASR mock 完成,生成 SKG 改写文案…", progress=92) + update(job, **update_kwargs) + audio_script = _build_audio_script_sync(job_id, mock) + if manage_job_status: + update(job, transcript=mock, status="transcribed", progress=100, + audio_script=audio_script, + message="转录完成(MOCK · 未设 LLM_API_KEY)") + else: + update(job, transcript=mock, audio_script=audio_script) return # 1) whisper ASR - update(job, status="transcribing", message=f"{ASR_MODEL} 转录中…", progress=78) - segments = await asyncio.to_thread(_transcribe_sync, wav) + progress(f"{ASR_MODEL} 转录中…", 78) + segments = _transcribe_sync(wav) if not segments: raise RuntimeError("ASR 返回 0 段(可能无人声 / 格式问题)") @@ -1627,10 +1631,13 @@ async def pipeline_transcribe(job_id: str) -> None: ) for i, s in enumerate(segments) ] - update(job, transcript=en_only, message=f"ASR 完成 · {len(en_only)} 段,开始翻译…", progress=88) + if manage_job_status: + update(job, transcript=en_only, message=f"ASR 完成 · {len(en_only)} 段,开始翻译…", progress=88) + else: + update(job, transcript=en_only) # 2) Gemini 翻译 - zh_list = await asyncio.to_thread(_translate_sync, segments) + zh_list = _translate_sync(segments) full = [ TranscriptSegment( index=seg.index, start=seg.start, end=seg.end, en=seg.en, @@ -1638,10 +1645,9 @@ async def pipeline_transcribe(job_id: str) -> None: ) for i, seg in enumerate(en_only) ] - update( - job, - transcript=full, - audio_script=AudioScript( + update_kwargs = { + "transcript": full, + "audio_script": AudioScript( status="rewriting", source_text=_transcript_join(full, "en"), source_zh=_transcript_join(full, "zh"), @@ -1651,22 +1657,58 @@ async def pipeline_transcribe(job_id: str) -> None: voice_model=MINIMAX_TTS_MODEL, voice_id=MINIMAX_TTS_VOICE_ID, ), - message="翻译完成,生成 SKG 改写文案与 MiniMax 配音…", - progress=94, - ) - audio_script = await asyncio.to_thread(_build_audio_script_sync, job_id, full) - update(job, transcript=full, status="transcribed", progress=100, - audio_script=audio_script, - message=f"转录完成 · {len(full)} 段({ASR_MODEL} + {TRANSLATE_MODEL})") + } + if manage_job_status: + update_kwargs.update(message="翻译完成,生成 SKG 改写文案与 MiniMax 配音…", progress=94) + update(job, **update_kwargs) + audio_script = _build_audio_script_sync(job_id, full) + if manage_job_status: + update(job, transcript=full, status="transcribed", progress=100, + audio_script=audio_script, + message=f"转录完成 · {len(full)} 段({ASR_MODEL} + {TRANSLATE_MODEL})") + else: + update(job, transcript=full, audio_script=audio_script) except Exception as e: - update( - job, - status="failed", - audio_script=AudioScript(status="failed", error=str(e), created_at=time.time()), - error=str(e), - message="转录失败", - ) + if manage_job_status: + update( + job, + status="failed", + audio_script=AudioScript(status="failed", error=str(e), created_at=time.time()), + error=str(e), + message="转录失败", + ) + else: + update(job, audio_script=AudioScript(status="failed", error=str(e), created_at=time.time())) + + +def _audio_processing_worker(job_id: str, manage_job_status: bool) -> None: + try: + pipeline_transcribe(job_id, manage_job_status=manage_job_status) + finally: + with AUDIO_WORKERS_LOCK: + AUDIO_WORKERS_RUNNING.discard(job_id) + + +def start_audio_processing(job_id: str, manage_job_status: bool = True) -> bool: + job = JOBS.get(job_id) + if not job: + return False + if not manage_job_status: + has_audio_output = bool(job.transcript) or bool(job.audio_script.rewritten_text) + if has_audio_output or job.audio_script.status == "rewriting": + return False + with AUDIO_WORKERS_LOCK: + if job_id in AUDIO_WORKERS_RUNNING: + return False + AUDIO_WORKERS_RUNNING.add(job_id) + threading.Thread( + target=_audio_processing_worker, + args=(job_id, manage_job_status), + daemon=True, + name=f"audio-{job_id}", + ).start() + return True def _image_edit_call( @@ -2018,10 +2060,23 @@ async def trigger_transcribe(job_id: str, bg: BackgroundTasks) -> Job: mp4 = job_dir(job_id) / "source.mp4" if job.status in {"created", "downloading"} or not mp4.exists(): raise HTTPException(409, f"video not ready, got {job.status}") - if job.status in {"splitting", "transcribing"} or job.audio_script.status == "rewriting": + if job.status == "transcribing" or job.audio_script.status == "rewriting" or job_id in AUDIO_WORKERS_RUNNING: raise HTTPException(409, f"job is busy, got {job.status}") - update(job, status="transcribing", progress=max(45, min(job.progress, 70)), error="", message="准备提取音频…") - bg.add_task(pipeline_transcribe, job_id) + manage_job_status = job.status != "splitting" + audio_payload = AudioScript( + status="rewriting", + product_brief=AUDIO_PRODUCT_BRIEF, + rewrite_model=AUDIO_REWRITE_MODEL, + voice_provider="minimax", + voice_model=MINIMAX_TTS_MODEL, + voice_id=MINIMAX_TTS_VOICE_ID, + ) + if manage_job_status: + update(job, status="transcribing", progress=max(45, min(job.progress, 70)), error="", message="准备提取音频…", audio_script=audio_payload) + else: + update(job, error="", audio_script=audio_payload) + if not start_audio_processing(job_id, manage_job_status=manage_job_status): + update(job, message="音频已在处理中") return job diff --git a/docs/source-analysis.html b/docs/source-analysis.html index 2675ea2..a120e20 100644 --- a/docs/source-analysis.html +++ b/docs/source-analysis.html @@ -552,7 +552,7 @@
当前产品不是“复制别人的视频”,而是拆解参考视频,提取可借鉴的镜头元素,再改造成 SKG 产品语境的视频素材。
TK 链接或本地上传,后端下载/保存源视频。
拆轨、抽关键帧、手动加帧,形成参考分镜池。当前主题默认直接抽 12 帧,并使用“透明骨架人”抽帧目标:本地先扫候选,Vision 再按透明身体、白色骨架、人物占比、清晰度、广告感和产品可用性打分验收;不合格候选会自动换下一帧。
拆轨、抽关键帧、手动加帧,形成参考分镜池。当前主题默认直接抽 12 帧,并使用“透明骨架人”抽帧目标:抽帧阶段只走本机算力扫描、评分、去重和时间覆盖;透明骨架人的语义判断放到后续审核/识别,不在抽帧阶段逐帧调用 Vision。
对关键帧做全图或区域清洗,清洗版先进入待审核状态;确认后可单张替换,也可一键替换全部待应用清洗版。
识别场景和主体候选,只是候选,不应锁死。
清洗关键帧,把多张关键帧作为同一主体的参考,先重绘六张标准站立主体资产图,再按关键帧生成多个去主体、相似或换风格场景图。
POST /jobscreateJobPOST /jobs/uploaduploadJobDELETE /jobs/{id}deleteJobjobs/<id> 目录移除整个 job,包括源视频、关键帧、元素提取图和生成视频。POST /jobs/{id}/analyze?frames=&target=&mode=&quality=analyzeJobframes=12;target 支持透明骨架人、综合、清晰主体、转场变化、表情瞬间、动作峰值;当前 UI 默认 transparent_human。透明骨架人目标会先扩大本地候选池,再调用 Vision 按 6 个分数验收;不合格候选自动丢弃并抽下一候选。mode=append 追加新关键帧;quality=auto 根据本机算力和视频时长自动选择快速、精细或极准。多个抽帧请求进入后端队列顺序处理。POST /jobs/{id}/transcribetriggerTranscribesource.mp4 提取 audio.wav;随后 ASR 得到英文时间戳段落,再翻译中文,并按 AUDIO_PRODUCT_BRIEF 生成 audio_script.rewritten_text;配置 MINIMAX_API_KEY 后调用 MiniMax T2A 生成 audio_script.voice_url。前端不自动触发,用户在 Audio 节点点击“提取音频 / 重新提取音频”即可启动,不依赖抽帧完成。POST /jobs/{id}/analyze?frames=&target=&mode=&quality=analyzeJobframes=12;target 支持透明骨架人、综合、清晰主体、转场变化、表情瞬间、动作峰值;当前 UI 默认 transparent_human。透明骨架人目标现在只走本地清晰度、中心主体、对比度、画面变化和 pHash 去重,不在抽帧阶段逐帧调用 Vision;mode=append 追加新关键帧;quality=auto 根据本机算力和视频时长自动选择快速、精细或极准。抽帧开始时同步拆出 audio.wav 并启动音频处理线程。多个抽帧请求进入后端队列顺序处理。POST /jobs/{id}/transcribetriggerTranscribesource.mp4 提取 audio.wav;随后 ASR 得到英文时间戳段落,再翻译中文,并按 AUDIO_PRODUCT_BRIEF 生成 audio_script.rewritten_text;配置 MINIMAX_API_KEY 后调用 MiniMax T2A 生成 audio_script.voice_url。前端不自动触发,用户在 Audio 节点点击“提取音频 / 重新提取音频”即可启动;抽帧中也允许并行触发,忙碌态由 audio_script.status 管理。GET /jobs/{id}/audio.wavsourceAudioUrlAudioStrip 拉取该文件,用 Web Audio API 解码并计算波形峰值,只读展示,不参与改写。GET /jobs/{id}/audio-script.mp3apiAssetUrl(job.audio_script.voice_url)audio_script.rewritten_text。POST /jobs/{id}/frames?t=addManualFramesource.mp4 直接提取 audio.wav,再提取原始口播、翻译中文、改写成 SKG 产品语境口播;MiniMax T2A 配置后生成配音 mp3。不再等待抽帧完成,用户在主画布 AudioNode 手动点击“提取音频 / 重新提取音频”启动。AudioNode 用“改前 · 原音频 / 改后 · SKG 口播”摘要展示;底部 AudioStrip 吸附屏幕底端,可拖拽调整高度,按时间段展示英文、中文翻译和波形;侧栏 Rewrite 展开后显示完整审核视图。source.mp4 直接提取 audio.wav,再提取原始口播、翻译中文、改写成 SKG 产品语境口播;MiniMax T2A 配置后生成配音 mp3。不再等待抽帧完成,用户在主画布 AudioNode 手动点击“提取音频 / 重新提取音频”启动;即使视觉抽帧正在进行,也通过 audio_script.status 并行管理音频忙碌态。AudioNode 用“改前 · 原音频 / 改后 · SKG 口播”摘要展示;底部 AudioStrip 吸附屏幕底端,可拖拽调整高度,按时间段展示英文、中文翻译和波形;侧栏 Rewrite 展开后显示完整审核视图。AudioNode、AudioStrip、ASRNode、TranslateNode、RewriteNode、pipeline_transcribe、AudioScript问题:等待抽帧完成后自动启动音频,不符合“先把声音文案拿出来审核”的工作流;用户需要在音频卡片上直接触发。
-改动:移除前端抽帧完成后的自动转写逻辑;AudioNode 保留并固定显示“提取音频 / 重新提取音频”按钮。后端 /transcribe 不再要求 frames_extracted,视频就绪后可直接从 source.mp4 拆出 audio.wav,并继续 ASR、翻译、SKG 改写和 MiniMax 配音。
改动:移除前端抽帧完成后的自动转写逻辑;AudioNode 保留并固定显示“提取音频 / 重新提取音频”按钮。后端 /transcribe 不再要求 frames_extracted,视频就绪后可直接从 source.mp4 拆出 audio.wav,并继续 ASR、翻译、SKG 改写和 MiniMax 配音;抽帧中触发时不抢主状态,而是用 audio_script.status 表示音频处理中。
影响:web/app/page.tsx、web/components/nodes/index.tsx、api/main.py、docs/source-analysis.html。