auto-save 2026-05-17 13:50 (~2)
This commit is contained in:
@@ -1,31 +1,5 @@
|
|||||||
{
|
{
|
||||||
"entries": [
|
"entries": [
|
||||||
{
|
|
||||||
"files_changed": 1,
|
|
||||||
"message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 18:05 (~1)",
|
|
||||||
"ts": "2026-05-14T10:08:43Z",
|
|
||||||
"type": "session-heartbeat"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"files_changed": 1,
|
|
||||||
"hash": "8f0c92c",
|
|
||||||
"message": "auto-save 2026-05-14 18:10 (~1)",
|
|
||||||
"ts": "2026-05-14T18:10:54+08:00",
|
|
||||||
"type": "commit"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"files_changed": 1,
|
|
||||||
"message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 18:10 (~1)",
|
|
||||||
"ts": "2026-05-14T10:16:15Z",
|
|
||||||
"type": "session-heartbeat"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"files_changed": 1,
|
|
||||||
"hash": "49cfc2b",
|
|
||||||
"message": "auto-save 2026-05-14 18:16 (~1)",
|
|
||||||
"ts": "2026-05-14T18:16:26+08:00",
|
|
||||||
"type": "commit"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"files_changed": 1,
|
"files_changed": 1,
|
||||||
"message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 18:16 (~1)",
|
"message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-14 18:16 (~1)",
|
||||||
@@ -3270,6 +3244,31 @@
|
|||||||
"message": "auto-save 2026-05-17 13:23 (~2)",
|
"message": "auto-save 2026-05-17 13:23 (~2)",
|
||||||
"hash": "6d684e0",
|
"hash": "6d684e0",
|
||||||
"files_changed": 2
|
"files_changed": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ts": "2026-05-17T13:27:36+08:00",
|
||||||
|
"type": "commit",
|
||||||
|
"message": "fix: recover media intake and remove audio strip",
|
||||||
|
"hash": "126f1dd",
|
||||||
|
"files_changed": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ts": "2026-05-17T05:28:24Z",
|
||||||
|
"type": "session-heartbeat",
|
||||||
|
"message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:fix: recover media intake and remove audio strip",
|
||||||
|
"files_changed": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ts": "2026-05-17T05:38:24Z",
|
||||||
|
"type": "session-heartbeat",
|
||||||
|
"message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:fix: recover media intake and remove audio strip",
|
||||||
|
"files_changed": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ts": "2026-05-17T05:48:24Z",
|
||||||
|
"type": "session-heartbeat",
|
||||||
|
"message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 2 项未提交变更 · 最近提交:fix: recover media intake and remove audio strip",
|
||||||
|
"files_changed": 2
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
160
api/main.py
160
api/main.py
@@ -44,6 +44,9 @@ LLM_API_KEY = os.getenv("LLM_API_KEY", "").strip()
|
|||||||
ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1")
|
ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1")
|
||||||
ASR_FALLBACK_MODEL = os.getenv("ASR_FALLBACK_MODEL", "gemini-2.5-flash").strip() or "gemini-2.5-flash"
|
ASR_FALLBACK_MODEL = os.getenv("ASR_FALLBACK_MODEL", "gemini-2.5-flash").strip() or "gemini-2.5-flash"
|
||||||
ASR_TIMEOUT_SECONDS = max(15, int(os.getenv("ASR_TIMEOUT_SECONDS", "45")))
|
ASR_TIMEOUT_SECONDS = max(15, int(os.getenv("ASR_TIMEOUT_SECONDS", "45")))
|
||||||
|
LOCAL_ASR_BIN = os.getenv("LOCAL_ASR_BIN", "").strip()
|
||||||
|
LOCAL_ASR_MODEL = os.getenv("LOCAL_ASR_MODEL", "mlx-community/whisper-tiny").strip() or "mlx-community/whisper-tiny"
|
||||||
|
LOCAL_ASR_TIMEOUT_SECONDS = max(30, int(os.getenv("LOCAL_ASR_TIMEOUT_SECONDS", "180")))
|
||||||
TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
|
TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
|
||||||
REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
|
REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
|
||||||
VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
|
VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
|
||||||
@@ -1696,6 +1699,10 @@ def analyze_queue_worker() -> None:
|
|||||||
|
|
||||||
# ---------- 音频转写 + 翻译 + SKG 改写 + MiniMax 配音 ----------
|
# ---------- 音频转写 + 翻译 + SKG 改写 + MiniMax 配音 ----------
|
||||||
|
|
||||||
|
class TranscriptionUnavailable(RuntimeError):
    """Raised when an ASR backend cannot deliver a usable transcript."""
|
||||||
|
|
||||||
|
|
||||||
def _parse_asr_segments(content: str, duration: float) -> list[dict]:
|
def _parse_asr_segments(content: str, duration: float) -> list[dict]:
|
||||||
raw = (content or "").strip()
|
raw = (content or "").strip()
|
||||||
if raw.startswith("```"):
|
if raw.startswith("```"):
|
||||||
@@ -1708,6 +1715,8 @@ def _parse_asr_segments(content: str, duration: float) -> list[dict]:
|
|||||||
text = raw.strip()
|
text = raw.strip()
|
||||||
return [{"start": 0.0, "end": duration, "text": text}] if text else []
|
return [{"start": 0.0, "end": duration, "text": text}] if text else []
|
||||||
if isinstance(data, dict):
|
if isinstance(data, dict):
|
||||||
|
if data.get("can_hear") is False:
|
||||||
|
raise TranscriptionUnavailable("fallback ASR could not hear the audio")
|
||||||
for key in ("segments", "data", "items", "result"):
|
for key in ("segments", "data", "items", "result"):
|
||||||
if isinstance(data.get(key), list):
|
if isinstance(data.get(key), list):
|
||||||
data = data[key]
|
data = data[key]
|
||||||
@@ -1734,14 +1743,126 @@ def _parse_asr_segments(content: str, duration: float) -> list[dict]:
|
|||||||
return segments
|
return segments
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_asr_segments(segments: list[dict], duration: float) -> list[dict]:
|
||||||
|
clean: list[dict] = []
|
||||||
|
cursor = 0.0
|
||||||
|
for item in segments:
|
||||||
|
text = str(item.get("text") or item.get("en") or item.get("transcript") or "").strip()
|
||||||
|
if not text:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
start = float(item.get("start") if item.get("start") is not None else item.get("start_time") or 0)
|
||||||
|
end = float(item.get("end") if item.get("end") is not None else item.get("end_time") or 0)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
continue
|
||||||
|
if end <= 0 and duration > 0:
|
||||||
|
end = duration
|
||||||
|
start = max(0.0, min(start, duration if duration > 0 else start))
|
||||||
|
end = max(start + 0.05, min(end, duration if duration > 0 else end))
|
||||||
|
# Keep the timeline monotonic. Real ASR can overlap slightly, but the UI table should not jump back.
|
||||||
|
if start < cursor - 0.25:
|
||||||
|
start = cursor
|
||||||
|
end = max(end, start + 0.05)
|
||||||
|
cursor = max(cursor, end)
|
||||||
|
clean.append({"start": round(start, 2), "end": round(end, 2), "text": text})
|
||||||
|
return clean
|
||||||
|
|
||||||
|
|
||||||
|
def _segment_text_key(text: str) -> str:
|
||||||
|
return re.sub(r"[^a-z0-9]+", " ", text.lower()).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_asr_segments(segments: list[dict], duration: float, source: str) -> list[dict]:
    """Clean ``segments`` and reject transcripts that look fabricated.

    Args:
        segments: Raw segments from an ASR backend.
        duration: Audio length in seconds; ``<= 0`` skips the coverage checks.
        source: Backend label used as the prefix of error messages.

    Returns:
        The cleaned segments from ``_clean_asr_segments``.

    Raises:
        TranscriptionUnavailable: if nothing usable remains, the text is
            highly repetitive, the timeline is a run of ~1 s slots with
            little variety, timestamps run past the audio, or the transcript
            covers too little of the audio.
    """
    clean = _clean_asr_segments(segments, duration)
    if not clean:
        raise TranscriptionUnavailable(f"{source} did not return transcript segments")

    keys = [
        key
        for key in (_segment_text_key(str(s.get("text") or "")) for s in clean)
        if key
    ]
    # With no comparable key at all (every text normalizes to "") repetition cannot be
    # judged; treat that as fully unique rather than 0% unique, which would reject any
    # long transcript as "repetitive".
    unique_ratio = len(set(keys)) / len(keys) if keys else 1.0
    one_secondish = sum(
        1 for s in clean if 0.75 <= (float(s["end"]) - float(s["start"])) <= 1.25
    )

    if len(clean) >= 12 and unique_ratio < 0.35:
        raise TranscriptionUnavailable(f"{source} returned repetitive transcript segments")
    if len(clean) >= 20 and one_secondish / len(clean) > 0.75 and unique_ratio < 0.65:
        raise TranscriptionUnavailable(f"{source} returned synthetic one-second timeline")

    if duration > 0:
        last_end = max(float(s["end"]) for s in clean)
        words = sum(len(str(s.get("text") or "").split()) for s in clean)
        # NOTE(review): this runs on already-cleaned timestamps; if cleaning clamps ends to
        # the audio duration this check can rarely fire — confirm that is intended.
        if len(clean) > 1 and last_end > duration + 3:
            raise TranscriptionUnavailable(f"{source} returned timestamps outside audio duration")
        if duration > 10 and last_end < duration * 0.45 and words < 20:
            raise TranscriptionUnavailable(f"{source} returned too little transcript coverage")
    return clean
|
||||||
|
|
||||||
|
|
||||||
|
def _local_asr_binary() -> str:
    """Return an absolute path to a runnable ``mlx_whisper`` executable.

    Candidates are tried in order: the ``LOCAL_ASR_BIN`` setting (an explicit
    path or a bare command name), ``mlx_whisper`` on ``PATH``, then the
    Homebrew default location.

    Raises:
        TranscriptionUnavailable: if no candidate resolves to an executable file.
    """
    candidates = (
        os.path.expanduser(LOCAL_ASR_BIN) if LOCAL_ASR_BIN else "",
        "mlx_whisper",
        "/opt/homebrew/bin/mlx_whisper",
    )
    for candidate in candidates:
        if not candidate:
            continue
        # shutil.which handles bare names and explicit paths alike and only accepts
        # executable regular files — a directory would pass a plain exists()+X_OK test.
        resolved = shutil.which(candidate)
        if resolved:
            # Absolute, because the binary may be launched from another working directory.
            return os.path.abspath(resolved)
    raise TranscriptionUnavailable("本机未找到可用 mlx_whisper")
|
||||||
|
|
||||||
|
|
||||||
|
def _transcribe_mlx_sync(wav: Path) -> list[dict]:
    """Transcribe ``wav`` with the local ``mlx_whisper`` command-line tool.

    The tool is asked to write ``asr-local.json`` next to the audio file; the
    ``segments`` array of that file is passed through
    ``_validate_asr_segments`` and returned.

    Args:
        wav: Audio file to transcribe.

    Returns:
        Validated transcript segments.

    Raises:
        TranscriptionUnavailable: if the binary is missing or cannot be
            started, the run times out or exits non-zero, no readable JSON
            result is produced, or validation rejects the transcript.
    """
    wav = wav.resolve()
    duration = media_duration(wav)
    binary = _local_asr_binary()
    output_name = "asr-local"
    output_path = wav.parent / f"{output_name}.json"
    # Remove any stale result so a failed run is never mistaken for a fresh one.
    output_path.unlink(missing_ok=True)

    env = os.environ.copy()
    try:
        ffmpeg_path = Path(media_binary("ffmpeg"))
    except Exception:
        # Best effort: presumably media_binary locates the project's ffmpeg; if that
        # fails, fall back to whatever ffmpeg is already reachable on PATH.
        pass
    else:
        env["PATH"] = f"{ffmpeg_path.parent}{os.pathsep}{env.get('PATH', '')}"

    cmd = [
        binary,
        str(wav),
        "--model", LOCAL_ASR_MODEL,
        "--output-dir", str(wav.parent),
        "--output-name", output_name,
        "--output-format", "json",
        "--verbose", "False",
        "--condition-on-previous-text", "False",
        "--word-timestamps", "True",
    ]
    try:
        result = subprocess.run(
            cmd,
            cwd=str(wav.parent),
            env=env,
            capture_output=True,
            text=True,
            timeout=LOCAL_ASR_TIMEOUT_SECONDS,
        )
    except subprocess.TimeoutExpired as e:
        raise TranscriptionUnavailable(f"本机 ASR 超时:{LOCAL_ASR_TIMEOUT_SECONDS}s") from e
    except OSError as e:
        # The binary vanished or is not runnable after all; report it like any other ASR failure.
        raise TranscriptionUnavailable(f"本机 ASR 无法启动:{e}") from e

    if result.returncode != 0:
        # Surface only the last output line, capped, as the failure reason.
        detail = (result.stderr or result.stdout or "").strip().splitlines()[-1:] or ["本机 ASR 执行失败"]
        raise TranscriptionUnavailable(detail[0][:500])
    if not output_path.exists():
        raise TranscriptionUnavailable("本机 ASR 未生成 json 结果")

    try:
        data = json.loads(output_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        raise TranscriptionUnavailable(f"本机 ASR 结果无法解析:{e}") from e
    segments = data.get("segments") if isinstance(data, dict) else None
    return _validate_asr_segments(segments or [], duration, "mlx_whisper")
|
||||||
|
|
||||||
|
|
||||||
def _transcribe_gemini_sync(wav: Path) -> list[dict]:
|
def _transcribe_gemini_sync(wav: Path) -> list[dict]:
|
||||||
duration = media_duration(wav)
|
duration = media_duration(wav)
|
||||||
audio_b64 = base64.b64encode(wav.read_bytes()).decode("ascii")
|
audio_b64 = base64.b64encode(wav.read_bytes()).decode("ascii")
|
||||||
prompt = (
|
prompt = (
|
||||||
"Transcribe the attached audio. Return strict JSON only, no markdown. "
|
"Transcribe the attached audio. Return strict JSON only, no markdown. "
|
||||||
"Schema: [{\"start\": 0.0, \"end\": 1.2, \"text\": \"English transcript\"}]. "
|
"If you cannot truly hear the audio, return {\"can_hear\": false}. Do not guess. "
|
||||||
"Use English for the transcript. If exact timestamps are uncertain, return one segment "
|
"If you can hear it, return {\"can_hear\": true, \"segments\": "
|
||||||
f"from 0 to {duration:.2f} seconds."
|
"[{\"start\": 0.0, \"end\": 1.2, \"text\": \"English transcript\"}]}. "
|
||||||
|
"Use English for the transcript. Only include timestamps you can infer from the audio."
|
||||||
)
|
)
|
||||||
last_error: Exception | None = None
|
last_error: Exception | None = None
|
||||||
for attempt in range(3):
|
for attempt in range(3):
|
||||||
@@ -1756,7 +1877,7 @@ def _transcribe_gemini_sync(wav: Path) -> list[dict]:
|
|||||||
timeout=ASR_TIMEOUT_SECONDS,
|
timeout=ASR_TIMEOUT_SECONDS,
|
||||||
)
|
)
|
||||||
content = (resp.choices[0].message.content or "").strip()
|
content = (resp.choices[0].message.content or "").strip()
|
||||||
return _parse_asr_segments(content, duration)
|
return _validate_asr_segments(_parse_asr_segments(content, duration), duration, "gemini audio fallback")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
last_error = e
|
last_error = e
|
||||||
if attempt < 2:
|
if attempt < 2:
|
||||||
@@ -1765,7 +1886,9 @@ def _transcribe_gemini_sync(wav: Path) -> list[dict]:
|
|||||||
|
|
||||||
|
|
||||||
def _transcribe_sync(wav: Path) -> list[dict]:
|
def _transcribe_sync(wav: Path) -> list[dict]:
|
||||||
"""whisper-1 verbose_json → segments[{start, end, text}]"""
|
"""Remote ASR first, local mlx_whisper second. Gemini fallback is guarded against fake timelines."""
|
||||||
|
errors: list[str] = []
|
||||||
|
duration = media_duration(wav)
|
||||||
try:
|
try:
|
||||||
with wav.open("rb") as f:
|
with wav.open("rb") as f:
|
||||||
resp = llm().audio.transcriptions.create(
|
resp = llm().audio.transcriptions.create(
|
||||||
@@ -1780,9 +1903,18 @@ def _transcribe_sync(wav: Path) -> list[dict]:
|
|||||||
# 兜底:网关如果不返回 segments,把全文当一段
|
# 兜底:网关如果不返回 segments,把全文当一段
|
||||||
if not segments and raw.get("text"):
|
if not segments and raw.get("text"):
|
||||||
segments = [{"start": 0.0, "end": float(raw.get("duration", 0) or 0), "text": raw["text"]}]
|
segments = [{"start": 0.0, "end": float(raw.get("duration", 0) or 0), "text": raw["text"]}]
|
||||||
return segments
|
return _validate_asr_segments(segments, duration, ASR_MODEL)
|
||||||
except Exception:
|
except Exception as e:
|
||||||
|
errors.append(f"{ASR_MODEL}: {e}")
|
||||||
|
try:
|
||||||
|
return _transcribe_mlx_sync(wav)
|
||||||
|
except Exception as e:
|
||||||
|
errors.append(f"mlx_whisper: {e}")
|
||||||
|
try:
|
||||||
return _transcribe_gemini_sync(wav)
|
return _transcribe_gemini_sync(wav)
|
||||||
|
except Exception as e:
|
||||||
|
errors.append(f"{ASR_FALLBACK_MODEL}: {e}")
|
||||||
|
raise TranscriptionUnavailable(";".join(errors))
|
||||||
|
|
||||||
|
|
||||||
def _translate_sync(segments: list[dict]) -> list[str]:
|
def _translate_sync(segments: list[dict]) -> list[str]:
|
||||||
@@ -2187,19 +2319,9 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
|
|||||||
|
|
||||||
# 1) whisper ASR
|
# 1) whisper ASR
|
||||||
progress(f"{ASR_MODEL} 转录中…", 78)
|
progress(f"{ASR_MODEL} 转录中…", 78)
|
||||||
try:
|
segments = _transcribe_sync(wav)
|
||||||
segments = _transcribe_sync(wav)
|
|
||||||
except Exception:
|
|
||||||
if job.transcript:
|
|
||||||
segments = [
|
|
||||||
{"start": seg.start, "end": seg.end, "text": seg.en}
|
|
||||||
for seg in job.transcript
|
|
||||||
if seg.en.strip()
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
segments = [{"start": 0.0, "end": target_duration, "text": "Source audio timing reference."}]
|
|
||||||
if not segments:
|
if not segments:
|
||||||
segments = [{"start": 0.0, "end": target_duration, "text": "Source audio timing reference."}]
|
raise TranscriptionUnavailable("ASR 未返回可用字幕段")
|
||||||
|
|
||||||
# 先把英文段落落到 job 上(让 UI 提前看到,翻译再补 zh)
|
# 先把英文段落落到 job 上(让 UI 提前看到,翻译再补 zh)
|
||||||
en_only = [
|
en_only = [
|
||||||
|
|||||||
Reference in New Issue
Block a user