diff --git a/.memory/worklog.json b/.memory/worklog.json index ab86aeb..a9e32a3 100644 --- a/.memory/worklog.json +++ b/.memory/worklog.json @@ -1,31 +1,5 @@ { "entries": [ - { - "files_changed": 1, - "message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 18:05 (~1)", - "ts": "2026-05-14T10:08:43Z", - "type": "session-heartbeat" - }, - { - "files_changed": 1, - "hash": "8f0c92c", - "message": "auto-save 2026-05-14 18:10 (~1)", - "ts": "2026-05-14T18:10:54+08:00", - "type": "commit" - }, - { - "files_changed": 1, - "message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 18:10 (~1)", - "ts": "2026-05-14T10:16:15Z", - "type": "session-heartbeat" - }, - { - "files_changed": 1, - "hash": "49cfc2b", - "message": "auto-save 2026-05-14 18:16 (~1)", - "ts": "2026-05-14T18:16:26+08:00", - "type": "commit" - }, { "files_changed": 1, "message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 18:16 (~1)", @@ -3270,6 +3244,31 @@ "message": "auto-save 2026-05-17 13:23 (~2)", "hash": "6d684e0", "files_changed": 2 + }, + { + "ts": "2026-05-17T13:27:36+08:00", + "type": "commit", + "message": "fix: recover media intake and remove audio strip", + "hash": "126f1dd", + "files_changed": 3 + }, + { + "ts": "2026-05-17T05:28:24Z", + "type": "session-heartbeat", + "message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:fix: recover media intake and remove audio strip", + "files_changed": 1 + }, + { + "ts": "2026-05-17T05:38:24Z", + "type": "session-heartbeat", + "message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:fix: recover media intake and remove audio strip", + "files_changed": 1 + }, + { + "ts": "2026-05-17T05:48:24Z", + "type": "session-heartbeat", + "message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 2 项未提交变更 · 最近提交:fix: recover media intake and remove audio strip", + "files_changed": 2 } ] } diff --git a/api/main.py b/api/main.py index a9b4ad3..93e9300 100644 --- a/api/main.py +++ b/api/main.py @@ -44,6 +44,9 @@ LLM_API_KEY = os.getenv("LLM_API_KEY", "").strip() ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1") ASR_FALLBACK_MODEL = os.getenv("ASR_FALLBACK_MODEL", "gemini-2.5-flash").strip() or "gemini-2.5-flash" ASR_TIMEOUT_SECONDS = max(15, int(os.getenv("ASR_TIMEOUT_SECONDS", "45"))) +LOCAL_ASR_BIN = os.getenv("LOCAL_ASR_BIN", "").strip() +LOCAL_ASR_MODEL = os.getenv("LOCAL_ASR_MODEL", "mlx-community/whisper-tiny").strip() or "mlx-community/whisper-tiny" +LOCAL_ASR_TIMEOUT_SECONDS = max(30, int(os.getenv("LOCAL_ASR_TIMEOUT_SECONDS", "180"))) TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash") REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro") VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash") @@ -1696,6 +1699,10 @@ def analyze_queue_worker() -> None: # ---------- 音频转写 + 翻译 + SKG 改写 + MiniMax 配音 ---------- +class TranscriptionUnavailable(RuntimeError): + pass + + def _parse_asr_segments(content: str, duration: float) -> list[dict]: raw = (content or "").strip() if raw.startswith("```"): @@ -1708,6 +1715,8 @@ def _parse_asr_segments(content: str, duration: float) -> list[dict]: text = raw.strip() return [{"start": 0.0, "end": duration, "text": text}] if text else [] if isinstance(data, dict): + if data.get("can_hear") is False: + raise TranscriptionUnavailable("fallback ASR could not hear the audio") for key in ("segments", "data", "items", "result"): if isinstance(data.get(key), list): data = data[key] @@ -1734,14 +1743,126 @@ def _parse_asr_segments(content: str, duration: float) -> list[dict]: return segments +def _clean_asr_segments(segments: list[dict], duration: float) -> list[dict]: + clean: list[dict] = [] + cursor = 0.0 + for item in segments: + text = str(item.get("text") or item.get("en") or item.get("transcript") or "").strip() + if not text: + continue + try: + start = float(item.get("start") if item.get("start") is not None else item.get("start_time") or 0) + end = float(item.get("end") if item.get("end") is not None else item.get("end_time") or 0) + except (TypeError, ValueError): + continue + if end <= 0 and duration > 0: + end = duration + start = max(0.0, min(start, duration if duration > 0 else start)) + end = max(start + 0.05, min(end, duration if duration > 0 else end)) + # Keep the timeline monotonic. Real ASR can overlap slightly, but the UI table should not jump back. + if start < cursor - 0.25: + start = cursor + end = max(end, start + 0.05) + cursor = max(cursor, end) + clean.append({"start": round(start, 2), "end": round(end, 2), "text": text}) + return clean + + +def _segment_text_key(text: str) -> str: + return re.sub(r"[^a-z0-9]+", " ", text.lower()).strip() + + +def _validate_asr_segments(segments: list[dict], duration: float, source: str) -> list[dict]: + clean = _clean_asr_segments(segments, duration) + if not clean: + raise TranscriptionUnavailable(f"{source} did not return transcript segments") + keyed = [_segment_text_key(str(s.get("text") or "")) for s in clean if _segment_text_key(str(s.get("text") or ""))] + unique_ratio = len(set(keyed)) / max(1, len(keyed)) + one_secondish = [ + s for s in clean + if 0.75 <= (float(s["end"]) - float(s["start"])) <= 1.25 + ] + if len(clean) >= 12 and unique_ratio < 0.35: + raise TranscriptionUnavailable(f"{source} returned repetitive transcript segments") + if len(clean) >= 20 and len(one_secondish) / len(clean) > 0.75 and unique_ratio < 0.65: + raise TranscriptionUnavailable(f"{source} returned synthetic one-second timeline") + if duration > 0: + last_end = max(float(s["end"]) for s in clean) + words = sum(len(str(s.get("text") or "").split()) for s in clean) + if len(clean) > 1 and last_end > duration + 3: + raise TranscriptionUnavailable(f"{source} returned timestamps outside audio duration") + if duration > 10 and last_end < duration * 0.45 and words < 20: + raise TranscriptionUnavailable(f"{source} returned too little transcript coverage") + return clean + + +def _local_asr_binary() -> str: + candidates = [ + LOCAL_ASR_BIN, + shutil.which("mlx_whisper") or "", + "/opt/homebrew/bin/mlx_whisper", + ] + for candidate in candidates: + if candidate and Path(candidate).exists() and os.access(candidate, os.X_OK): + return candidate + raise TranscriptionUnavailable("本机未找到可用 mlx_whisper") + + +def _transcribe_mlx_sync(wav: Path) -> list[dict]: + wav = wav.resolve() + duration = media_duration(wav) + binary = _local_asr_binary() + output_name = "asr-local" + output_path = wav.parent / f"{output_name}.json" + if output_path.exists(): + output_path.unlink() + env = os.environ.copy() + try: + ffmpeg_path = Path(media_binary("ffmpeg")) + env["PATH"] = f"{ffmpeg_path.parent}{os.pathsep}{env.get('PATH', '')}" + except Exception: + pass + cmd = [ + binary, + str(wav), + "--model", LOCAL_ASR_MODEL, + "--output-dir", str(wav.parent), + "--output-name", output_name, + "--output-format", "json", + "--verbose", "False", + "--condition-on-previous-text", "False", + "--word-timestamps", "True", + ] + try: + result = subprocess.run( + cmd, + cwd=str(wav.parent), + env=env, + capture_output=True, + text=True, + timeout=LOCAL_ASR_TIMEOUT_SECONDS, + ) + except subprocess.TimeoutExpired as e: + raise TranscriptionUnavailable(f"本机 ASR 超时:{LOCAL_ASR_TIMEOUT_SECONDS}s") from e + if result.returncode != 0: + detail = (result.stderr or result.stdout or "").strip().splitlines()[-1:] or ["本机 ASR 执行失败"] + raise TranscriptionUnavailable(detail[0][:500]) + if not output_path.exists(): + raise TranscriptionUnavailable("本机 ASR 未生成 json 结果") + data = json.loads(output_path.read_text(encoding="utf-8")) + segments = data.get("segments") or [] + return _validate_asr_segments(segments, duration, "mlx_whisper") + + def _transcribe_gemini_sync(wav: Path) -> list[dict]: duration = media_duration(wav) audio_b64 = base64.b64encode(wav.read_bytes()).decode("ascii") prompt = ( "Transcribe the attached audio. Return strict JSON only, no markdown. " - "Schema: [{\"start\": 0.0, \"end\": 1.2, \"text\": \"English transcript\"}]. " - "Use English for the transcript. If exact timestamps are uncertain, return one segment " - f"from 0 to {duration:.2f} seconds." + "If you cannot truly hear the audio, return {\"can_hear\": false}. Do not guess. " + "If you can hear it, return {\"can_hear\": true, \"segments\": " + "[{\"start\": 0.0, \"end\": 1.2, \"text\": \"English transcript\"}]}. " + "Use English for the transcript. Only include timestamps you can infer from the audio." ) last_error: Exception | None = None for attempt in range(3): @@ -1756,7 +1877,7 @@ def _transcribe_gemini_sync(wav: Path) -> list[dict]: timeout=ASR_TIMEOUT_SECONDS, ) content = (resp.choices[0].message.content or "").strip() - return _parse_asr_segments(content, duration) + return _validate_asr_segments(_parse_asr_segments(content, duration), duration, "gemini audio fallback") except Exception as e: last_error = e if attempt < 2: @@ -1765,7 +1886,9 @@ def _transcribe_gemini_sync(wav: Path) -> list[dict]: def _transcribe_sync(wav: Path) -> list[dict]: - """whisper-1 verbose_json → segments[{start, end, text}]""" + """Remote ASR first, local mlx_whisper second. Gemini fallback is guarded against fake timelines.""" + errors: list[str] = [] + duration = media_duration(wav) try: with wav.open("rb") as f: resp = llm().audio.transcriptions.create( @@ -1780,9 +1903,18 @@ def _transcribe_sync(wav: Path) -> list[dict]: # 兜底:网关如果不返回 segments,把全文当一段 if not segments and raw.get("text"): segments = [{"start": 0.0, "end": float(raw.get("duration", 0) or 0), "text": raw["text"]}] - return segments - except Exception: + return _validate_asr_segments(segments, duration, ASR_MODEL) + except Exception as e: + errors.append(f"{ASR_MODEL}: {e}") + try: + return _transcribe_mlx_sync(wav) + except Exception as e: + errors.append(f"mlx_whisper: {e}") + try: return _transcribe_gemini_sync(wav) + except Exception as e: + errors.append(f"{ASR_FALLBACK_MODEL}: {e}") + raise TranscriptionUnavailable(";".join(errors)) def _translate_sync(segments: list[dict]) -> list[str]: @@ -2187,19 +2319,9 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None: # 1) whisper ASR progress(f"{ASR_MODEL} 转录中…", 78) - try: - segments = _transcribe_sync(wav) - except Exception: - if job.transcript: - segments = [ - {"start": seg.start, "end": seg.end, "text": seg.en} - for seg in job.transcript - if seg.en.strip() - ] - else: - segments = [{"start": 0.0, "end": target_duration, "text": "Source audio timing reference."}] + segments = _transcribe_sync(wav) if not segments: - segments = [{"start": 0.0, "end": target_duration, "text": "Source audio timing reference."}] + raise TranscriptionUnavailable("ASR 未返回可用字幕段") # 先把英文段落落到 job 上(让 UI 提前看到,翻译再补 zh) en_only = [