diff --git a/.memory/worklog.json b/.memory/worklog.json index cf164ed..e5546ae 100644 --- a/.memory/worklog.json +++ b/.memory/worklog.json @@ -1,19 +1,5 @@ { "entries": [ - { - "files_changed": 1, - "hash": "989728d", - "message": "auto-save 2026-05-13 03:01 (~1)", - "ts": "2026-05-13T03:01:19+08:00", - "type": "commit" - }, - { - "files_changed": 1, - "hash": "4ae9105", - "message": "auto-save 2026-05-13 03:07 (~1)", - "ts": "2026-05-13T03:07:14+08:00", - "type": "commit" - }, { "files_changed": 1, "hash": "06186cb", @@ -3304,6 +3290,19 @@ "type": "session-heartbeat", "message": "Codex 会话活跃 · 最近命令:codex · 5 项未提交变更 · 最近提交:auto-save 2026-05-14 11:10 (~1)", "files_changed": 5 + }, + { + "ts": "2026-05-14T11:16:12+08:00", + "type": "commit", + "message": "auto-save 2026-05-14 11:15 (~5)", + "hash": "4127adc", + "files_changed": 5 + }, + { + "ts": "2026-05-14T03:18:38Z", + "type": "session-heartbeat", + "message": "Codex 会话活跃 · 最近命令:codex · 4 项未提交变更 · 最近提交:auto-save 2026-05-14 11:15 (~5)", + "files_changed": 4 } ] } diff --git a/RULES.md b/RULES.md index 04244cf..4369c39 100644 --- a/RULES.md +++ b/RULES.md @@ -33,7 +33,8 @@ ## 环境变量 - `LLM_BASE_URL` / `LLM_API_KEY`:OpenAI 兼容网关,用于 ASR、翻译、文案改写、图像等模型调用 -- `ASR_MODEL`:音频转写模型,默认 `whisper-1` +- `ASR_MODEL`:OpenAI Audio Transcriptions 音频转写模型,默认 `whisper-1` +- `ASR_FALLBACK_MODEL`:当当前网关没有 `/audio/transcriptions` 时,用 Gemini 多模态 chat 直接识别 wav,默认 `gemini-2.5-flash` - `TRANSLATE_MODEL`:字幕翻译模型,默认 `gemini-2.5-flash` - `REWRITE_MODEL`:通用改写/分镜描述模型,默认 `gemini-2.5-pro` - `AUDIO_REWRITE_MODEL`:音频口播改写模型,默认跟随 `REWRITE_MODEL` diff --git a/api/.env.example b/api/.env.example index 648db3a..5b6a159 100644 --- a/api/.env.example +++ b/api/.env.example @@ -4,6 +4,7 @@ LLM_API_KEY= # 模型分工 ASR_MODEL=whisper-1 +ASR_FALLBACK_MODEL=gemini-2.5-flash TRANSLATE_MODEL=gemini-2.5-flash REWRITE_MODEL=gemini-2.5-pro IMAGE_MODEL=gemini-3-pro-image-preview diff --git a/api/README.md b/api/README.md index 763b0da..b5ac919 100644 --- a/api/README.md +++ b/api/README.md @@ -34,5 +34,5 @@ uvicorn main:app --host 127.0.0.1 --port 4291 - `ffmpeg` 系统二进制(拆轨 / 抽帧) - `yt-dlp` 系统二进制(也可走 Python 包) -- OpenAI 兼容 LLM 网关(ASR / 翻译 / 文案改写) +- OpenAI 兼容 LLM 网关(ASR / 翻译 / 文案改写);如果 `/audio/transcriptions` 不可用,会用 `ASR_FALLBACK_MODEL` 走 Gemini 多模态音频识别 - MiniMax T2A HTTP(改写文案配音,使用 `MINIMAX_API_KEY`) diff --git a/api/main.py b/api/main.py index d07c48b..c9b859b 100644 --- a/api/main.py +++ b/api/main.py @@ -33,6 +33,7 @@ PRODUCT_LIBRARY_MANIFEST = PRODUCT_LIBRARY_DIR / "manifest.json" LLM_BASE_URL = os.getenv("LLM_BASE_URL", "").strip() LLM_API_KEY = os.getenv("LLM_API_KEY", "").strip() ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1") +ASR_FALLBACK_MODEL = os.getenv("ASR_FALLBACK_MODEL", "gemini-2.5-flash").strip() or "gemini-2.5-flash" TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash") REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro") VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash") @@ -687,8 +688,8 @@ def _resolve_frame_quality(duration: float, quality: FrameExtractQuality) -> Fra cores = os.cpu_count() or 4 memory_gb = _physical_memory_gb() strong_machine = cores >= 10 and (memory_gb == 0.0 or memory_gb >= 32) - if strong_machine and duration <= 180: - return "ultra" + # 展示/演示时不能把本机资源打满:auto 最高只到 accurate。 + # ultra 保留为手动选择项,不再由 auto 自动命中。 if strong_machine and duration <= 600: return "accurate" if cores >= 8 and duration <= 240: @@ -1157,6 +1158,16 @@ def ffprobe_meta(mp4: Path) -> dict: return json.loads(out) +def media_duration(path: Path) -> float: + try: + out = run([ + "ffprobe", "-v", "error", "-print_format", "json", "-show_format", str(path), + ]) + return float(json.loads(out).get("format", {}).get("duration") or 0) + except Exception: + return 0.0 + + def pipeline_download(job_id: str) -> None: """阶段 1:仅下载(或上传跳过),落 source.mp4,停在 downloaded 等用户点解析/提取音频。""" job = JOBS[job_id] @@ -1362,21 +1373,83 @@ def analyze_queue_worker() -> None: # ---------- 音频转写 + 翻译 + SKG 改写 + MiniMax 配音 ---------- +def _parse_asr_segments(content: str, duration: float) -> list[dict]: + raw = (content or "").strip() + if raw.startswith("```"): + import re as _re + match = _re.search(r"(\[[\s\S]*\]|\{[\s\S]*\})", raw) + raw = match.group(0) if match else raw + try: + data = json.loads(raw) + except json.JSONDecodeError: + text = raw.strip() + return [{"start": 0.0, "end": duration, "text": text}] if text else [] + if isinstance(data, dict): + for key in ("segments", "data", "items", "result"): + if isinstance(data.get(key), list): + data = data[key] + break + else: + text = str(data.get("text") or data.get("transcript") or "").strip() + return [{"start": 0.0, "end": duration, "text": text}] if text else [] + if not isinstance(data, list): + return [] + segments: list[dict] = [] + for i, item in enumerate(data): + if isinstance(item, str): + text = item.strip() + start = 0.0 if len(data) == 1 else duration * i / max(1, len(data)) + end = duration if len(data) == 1 else duration * (i + 1) / max(1, len(data)) + elif isinstance(item, dict): + text = str(item.get("text") or item.get("en") or item.get("transcript") or "").strip() + start = float(item.get("start") or item.get("start_time") or 0) + end = float(item.get("end") or item.get("end_time") or duration) + else: + continue + if text: + segments.append({"start": max(0.0, start), "end": max(start, end), "text": text}) + return segments + + +def _transcribe_gemini_sync(wav: Path) -> list[dict]: + duration = media_duration(wav) + audio_b64 = base64.b64encode(wav.read_bytes()).decode("ascii") + prompt = ( + "Transcribe the attached audio. Return strict JSON only, no markdown. " + "Schema: [{\"start\": 0.0, \"end\": 1.2, \"text\": \"English transcript\"}]. " + "Use English for the transcript. If exact timestamps are uncertain, return one segment " + f"from 0 to {duration:.2f} seconds." + ) + resp = llm().chat.completions.create( + model=ASR_FALLBACK_MODEL, + messages=[{"role": "user", "content": [ + {"type": "text", "text": prompt}, + {"type": "input_audio", "input_audio": {"data": audio_b64, "format": "wav"}}, + ]}], + temperature=0, + ) + content = (resp.choices[0].message.content or "").strip() + return _parse_asr_segments(content, duration) + + def _transcribe_sync(wav: Path) -> list[dict]: """whisper-1 verbose_json → segments[{start, end, text}]""" - with wav.open("rb") as f: - resp = llm().audio.transcriptions.create( - file=(wav.name, f, "audio/wav"), - model=ASR_MODEL, - response_format="verbose_json", - timestamp_granularities=["segment"], - ) - raw = resp.model_dump() if hasattr(resp, "model_dump") else resp - segments = raw.get("segments") or [] - # 兜底:网关如果不返回 segments,把全文当一段 - if not segments and raw.get("text"): - segments = [{"start": 0.0, "end": float(raw.get("duration", 0) or 0), "text": raw["text"]}] - return segments + try: + with wav.open("rb") as f: + resp = llm().audio.transcriptions.create( + file=(wav.name, f, "audio/wav"), + model=ASR_MODEL, + response_format="verbose_json", + timestamp_granularities=["segment"], + ) + raw = resp.model_dump() if hasattr(resp, "model_dump") else resp + segments = raw.get("segments") or [] + # 兜底:网关如果不返回 segments,把全文当一段 + if not segments and raw.get("text"): + segments = [{"start": 0.0, "end": float(raw.get("duration", 0) or 0), "text": raw["text"]}] + return segments + except Exception: + return _transcribe_gemini_sync(wav) def _translate_sync(segments: list[dict]) -> list[str]: @@ -1865,6 +1938,7 @@ def health() -> dict: "base_url": LLM_BASE_URL or "openai-default", "models": { "asr": ASR_MODEL, + "asr_fallback": ASR_FALLBACK_MODEL, "translate": TRANSLATE_MODEL, "rewrite": REWRITE_MODEL, "audio_rewrite": AUDIO_REWRITE_MODEL, diff --git a/docs/source-analysis.html b/docs/source-analysis.html index a120e20..c8e4027 100644 --- a/docs/source-analysis.html +++ b/docs/source-analysis.html @@ -791,7 +791,7 @@ SubjectAsset { 创建任务POST /jobscreateJob提交 TK 链接,后台开始下载,停在 downloaded 等用户点解析。 上传视频POST /jobs/uploaduploadJob保存 source.mp4,然后同样进入下载完成状态。 删除输入视频DELETE /jobs/{id}deleteJob从任务队列、URL 和磁盘 jobs/<id> 目录移除整个 job,包括源视频、关键帧、元素提取图和生成视频。 - 解析视频POST /jobs/{id}/analyze?frames=&target=&mode=&quality=analyzeJob拆轨 + 目标化抽关键帧。默认 frames=12target 支持透明骨架人、综合、清晰主体、转场变化、表情瞬间、动作峰值;当前 UI 默认 transparent_human。透明骨架人目标现在只走本地清晰度、中心主体、对比度、画面变化和 pHash 去重,不在抽帧阶段逐帧调用 Vision;mode=append 追加新关键帧;quality=auto 根据本机算力和视频时长自动选择快速、精细或极准。抽帧开始时同步拆出 audio.wav 并启动音频处理线程。多个抽帧请求进入后端队列顺序处理。 + 解析视频POST /jobs/{id}/analyze?frames=&target=&mode=&quality=analyzeJob拆轨 + 目标化抽关键帧。默认 frames=12target 支持透明骨架人、综合、清晰主体、转场变化、表情瞬间、动作峰值;当前 UI 默认 transparent_human。透明骨架人目标现在只走本地清晰度、中心主体、对比度、画面变化和 pHash 去重,不在抽帧阶段逐帧调用 Vision;mode=append 追加新关键帧;quality=auto 为展示友好档,最高只自动选择精细,不会自动上极准;极准保留为手动选择。抽帧开始时同步拆出 audio.wav 并启动音频处理线程。多个抽帧请求进入后端队列顺序处理。 音频文案轨POST /jobs/{id}/transcribetriggerTranscribe若尚未拆轨,先从 source.mp4 提取 audio.wav;随后 ASR 得到英文时间戳段落,再翻译中文,并按 AUDIO_PRODUCT_BRIEF 生成 audio_script.rewritten_text;配置 MINIMAX_API_KEY 后调用 MiniMax T2A 生成 audio_script.voice_url。前端不自动触发,用户在 Audio 节点点击“提取音频 / 重新提取音频”即可启动;抽帧中也允许并行触发,忙碌态由 audio_script.status 管理。 原始音频文件GET /jobs/{id}/audio.wavsourceAudioUrl返回拆轨得到的 wav;底部 AudioStrip 拉取该文件,用 Web Audio API 解码并计算波形峰值,只读展示,不参与改写。 改写配音文件GET /jobs/{id}/audio-script.mp3apiAssetUrl(job.audio_script.voice_url)返回 MiniMax T2A 生成的 mp3。没有配置 MiniMax 或生成失败时该文件不存在,但改写文案仍会保存在 audio_script.rewritten_text。 @@ -875,7 +875,7 @@ SubjectAsset {

阻塞 / 占位