auto-save 2026-05-14 11:21 (~7)

This commit is contained in:
2026-05-14 11:25:23 +08:00
parent 4127adc5e7
commit 12dec58056
7 changed files with 123 additions and 36 deletions

View File

@@ -33,6 +33,7 @@ PRODUCT_LIBRARY_MANIFEST = PRODUCT_LIBRARY_DIR / "manifest.json"
# Endpoint and credentials for the LLM gateway, read from the environment.
# Surrounding whitespace is stripped; both default to the empty string.
LLM_BASE_URL = os.getenv("LLM_BASE_URL", "").strip()
LLM_API_KEY = os.getenv("LLM_API_KEY", "").strip()
# Model names, each overridable through the environment variable of the same name.
ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1")
# Unlike the other model settings, a blank or whitespace-only value here
# falls back to the default instead of being used as-is.
ASR_FALLBACK_MODEL = os.getenv("ASR_FALLBACK_MODEL", "gemini-2.5-flash").strip() or "gemini-2.5-flash"
TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
@@ -687,8 +688,8 @@ def _resolve_frame_quality(duration: float, quality: FrameExtractQuality) -> Fra
cores = os.cpu_count() or 4
memory_gb = _physical_memory_gb()
strong_machine = cores >= 10 and (memory_gb == 0.0 or memory_gb >= 32)
if strong_machine and duration <= 180:
return "ultra"
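# During showcases/demos the local machine must not be saturated, so auto tops out at "accurate".
# "ultra" remains available as a manual choice; auto no longer selects it.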
if strong_machine and duration <= 600:
return "accurate"
if cores >= 8 and duration <= 240:
@@ -1157,6 +1158,16 @@ def ffprobe_meta(mp4: Path) -> dict:
return json.loads(out)
def media_duration(path: Path) -> float:
    """Return the duration of a media file in seconds, or 0.0 if it cannot be read.

    Builds an ``ffprobe`` command line asking for the container format as JSON,
    hands it to ``run``, and reads ``format.duration`` from the parsed output.
    Any exception along the way (probe failure, malformed JSON, missing or
    non-numeric duration) is swallowed and reported as 0.0.
    """
    try:
        probe_cmd = [
            "ffprobe", "-v", "error", "-print_format", "json", "-show_format", str(path),
        ]
        probe = json.loads(run(probe_cmd))
        format_info = probe.get("format", {})
        # A missing, null or empty duration counts as zero.
        seconds = format_info.get("duration") or 0
        return float(seconds)
    except Exception:
        return 0.0
def pipeline_download(job_id: str) -> None:
"""阶段 1仅下载或上传跳过落 source.mp4停在 downloaded 等用户点解析/提取音频。"""
job = JOBS[job_id]
@@ -1362,21 +1373,83 @@ def analyze_queue_worker() -> None:
# ---------- 音频转写 + 翻译 + SKG 改写 + MiniMax 配音 ----------
def _parse_asr_segments(content: str, duration: float) -> list[dict]:
raw = (content or "").strip()
if raw.startswith("```"):
import re as _re
match = _re.search(r"(\[[\s\S]*\]|\{[\s\S]*\})", raw)
raw = match.group(0) if match else raw
try:
data = json.loads(raw)
except json.JSONDecodeError:
text = raw.strip()
return [{"start": 0.0, "end": duration, "text": text}] if text else []
if isinstance(data, dict):
for key in ("segments", "data", "items", "result"):
if isinstance(data.get(key), list):
data = data[key]
break
else:
text = str(data.get("text") or data.get("transcript") or "").strip()
return [{"start": 0.0, "end": duration, "text": text}] if text else []
if not isinstance(data, list):
return []
segments: list[dict] = []
for i, item in enumerate(data):
if isinstance(item, str):
text = item.strip()
start = 0.0 if len(data) == 1 else duration * i / max(1, len(data))
end = duration if len(data) == 1 else duration * (i + 1) / max(1, len(data))
elif isinstance(item, dict):
text = str(item.get("text") or item.get("en") or item.get("transcript") or "").strip()
start = float(item.get("start") or item.get("start_time") or 0)
end = float(item.get("end") or item.get("end_time") or duration)
else:
continue
if text:
segments.append({"start": max(0.0, start), "end": max(start, end), "text": text})
return segments
def _transcribe_gemini_sync(wav: Path) -> list[dict]:
    """Transcribe a WAV file via the chat-completions endpoint using ASR_FALLBACK_MODEL.

    The whole file is read, base64-encoded and sent inline as an
    ``input_audio`` part next to a prompt requesting a JSON list of
    ``{start, end, text}`` segments. The reply text is handed to
    ``_parse_asr_segments`` together with the clip duration.
    """
    duration = media_duration(wav)
    encoded_audio = base64.b64encode(wav.read_bytes()).decode("ascii")
    instructions = (
        "Transcribe the attached audio. Return strict JSON only, no markdown. "
        "Schema: [{\"start\": 0.0, \"end\": 1.2, \"text\": \"English transcript\"}]. "
        "Use English for the transcript. If exact timestamps are uncertain, return one segment "
        f"from 0 to {duration:.2f} seconds."
    )
    user_parts = [
        {"type": "text", "text": instructions},
        {"type": "input_audio", "input_audio": {"data": encoded_audio, "format": "wav"}},
    ]
    completion = llm().chat.completions.create(
        model=ASR_FALLBACK_MODEL,
        messages=[{"role": "user", "content": user_parts}],
        temperature=0,
    )
    reply = completion.choices[0].message.content or ""
    return _parse_asr_segments(reply.strip(), duration)
def _transcribe_sync(wav: Path) -> list[dict]:
    """Transcribe a WAV file into ``[{start, end, text}]`` segments.

    Primary path: the audio transcription endpoint with ``ASR_MODEL``,
    requesting ``verbose_json`` with segment-level timestamps. If the reply
    has no ``segments`` but does have ``text``, the full text is returned as
    one segment ending at the reported ``duration`` (0 when absent).

    Fallback path: if anything in the primary path raises, the failure is
    logged with its traceback and the work is delegated to
    ``_transcribe_gemini_sync``. Exceptions from the fallback propagate.
    """
    import logging

    try:
        with wav.open("rb") as f:
            resp = llm().audio.transcriptions.create(
                file=(wav.name, f, "audio/wav"),
                model=ASR_MODEL,
                response_format="verbose_json",
                timestamp_granularities=["segment"],
            )
        raw = resp.model_dump() if hasattr(resp, "model_dump") else resp
        segments = raw.get("segments") or []
        # Safety net: if the gateway returns no segments, treat the full text as one segment.
        if not segments and raw.get("text"):
            segments = [{"start": 0.0, "end": float(raw.get("duration", 0) or 0), "text": raw["text"]}]
        return segments
    except Exception:
        # Best-effort by design, but leave a trace so a failing primary model is visible.
        logging.getLogger(__name__).warning(
            "ASR model %s failed; falling back to %s",
            ASR_MODEL,
            ASR_FALLBACK_MODEL,
            exc_info=True,
        )
        return _transcribe_gemini_sync(wav)
def _translate_sync(segments: list[dict]) -> list[str]:
@@ -1865,6 +1938,7 @@ def health() -> dict:
"base_url": LLM_BASE_URL or "openai-default",
"models": {
"asr": ASR_MODEL,
"asr_fallback": ASR_FALLBACK_MODEL,
"translate": TRANSLATE_MODEL,
"rewrite": REWRITE_MODEL,
"audio_rewrite": AUDIO_REWRITE_MODEL,