auto-save 2026-05-12 16:16 (~4)

This commit is contained in:
2026-05-12 16:16:52 +08:00
parent cc31bfeba2
commit 35b327815a
4 changed files with 130 additions and 50 deletions

View File

@@ -41,6 +41,13 @@
"message": "auto-save 2026-05-12 16:02 (+2, ~6)",
"hash": "b0ffd03",
"files_changed": 9
},
{
"ts": "2026-05-12T16:11:20+08:00",
"type": "commit",
"message": "auto-save 2026-05-12 16:11 (~1)",
"hash": "cc31bfe",
"files_changed": 1
}
]
}

View File

@@ -1,9 +1,13 @@
# Gemini API优先用 Poe 中转,按用户偏好
# Poe 网关示例GEMINI_API_BASE=https://api.poe.com/v1 + key
# Google 直连示例:留空 GEMINI_API_BASE用 google-generativeai SDK
GEMINI_API_KEY=
GEMINI_API_BASE=
GEMINI_MODEL=gemini-2.5-flash
# SKG AI 网关（OpenAI 兼容）
LLM_BASE_URL=https://ai.skg.com/ezlink/v1
LLM_API_KEY=
# 模型分工
ASR_MODEL=whisper-1
TRANSLATE_MODEL=gemini-2.5-flash
REWRITE_MODEL=gemini-2.5-pro
IMAGE_MODEL=gemini-3-pro-image-preview
VIDEO_MODEL=sora-2
# 工作目录
JOBS_DIR=./jobs
@@ -11,5 +15,4 @@ JOBS_DIR=./jobs
# CORS
CORS_ORIGINS=http://localhost:4290
# 端口(启动用 uvicorn --port 4291 覆盖)
API_PORT=4291

View File

@@ -21,8 +21,23 @@ load_dotenv()
JOBS_DIR = Path(os.getenv("JOBS_DIR", "./jobs")).resolve()
JOBS_DIR.mkdir(parents=True, exist_ok=True)
CORS_ORIGINS = [o.strip() for o in os.getenv("CORS_ORIGINS", "http://localhost:4290").split(",") if o.strip()]
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "").strip()
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
LLM_BASE_URL = os.getenv("LLM_BASE_URL", "").strip()
LLM_API_KEY = os.getenv("LLM_API_KEY", "").strip()
ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1")
TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
# OpenAI 客户端（OpenAI 兼容网关，含 SKG ezlink）
from openai import OpenAI
_llm_client: OpenAI | None = None
def llm() -> OpenAI:
    """Return the shared OpenAI-compatible client, creating it on first use.

    The client is cached in the module-level ``_llm_client`` so later calls
    reuse the same instance.

    Raises:
        RuntimeError: If ``LLM_API_KEY`` is empty when the client has to be
            created.
    """
    global _llm_client
    # Fast path: the client was already built by an earlier call.
    if _llm_client is not None:
        return _llm_client
    if not LLM_API_KEY:
        raise RuntimeError("LLM_API_KEY 未配置")
    # An empty LLM_BASE_URL is passed as None so the SDK uses its own default.
    _llm_client = OpenAI(api_key=LLM_API_KEY, base_url=LLM_BASE_URL or None)
    return _llm_client
# Pipeline 状态：created → downloading → splitting → frames_extracted → transcribing → transcribed | failed
JobStatus = Literal[
@@ -225,6 +240,57 @@ async def pipeline_download_split_frames(job_id: str) -> None:
# ---------- ASR（whisper）+ 翻译（Gemini）----------
def _transcribe_sync(wav: Path) -> list[dict]:
    """Transcribe an audio file through the LLM gateway and return its segments.

    The file is sent to ``ASR_MODEL`` with ``verbose_json`` output and
    segment-level timestamps requested.

    Args:
        wav: Path of the audio file to upload (sent as ``audio/wav``).

    Returns:
        The ``segments`` list of the response. When the response carries no
        segments but does carry ``text``, a single synthetic segment spanning
        ``0.0`` to the reported ``duration`` (``0`` if absent) is returned.
        Otherwise an empty list.
    """
    with wav.open("rb") as audio:
        response = llm().audio.transcriptions.create(
            file=(wav.name, audio, "audio/wav"),
            model=ASR_MODEL,
            response_format="verbose_json",
            timestamp_granularities=["segment"],
        )

    # SDK response objects expose model_dump(); anything else is used as-is.
    if hasattr(response, "model_dump"):
        payload = response.model_dump()
    else:
        payload = response

    found = payload.get("segments") or []
    if found:
        return found
    if not payload.get("text"):
        return found

    # Fallback: the gateway returned no segments, so treat the whole text as
    # one segment.
    duration = float(payload.get("duration", 0) or 0)
    return [{"start": 0.0, "end": duration, "text": payload["text"]}]
def _parse_translations(content: str, count: int) -> list[str]:
    """Map a model reply onto ``count`` translation slots.

    Accepts a bare JSON array of ``{"i": int, "zh": str}`` items, an object
    wrapping such an array, or a single bare item object. Anything that
    cannot be parsed leaves the affected slots as ``""``.
    """
    text = content.strip()
    if text.startswith("```"):
        # Some models wrap JSON in a markdown fence despite the instructions.
        text = text.strip("`").strip().removeprefix("json").strip()
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return [""] * count

    if isinstance(data, dict):
        if "i" in data:
            # A single item returned without the surrounding list.
            data = [data]
        else:
            # Prefer the well-known wrapper keys, then any list-valued field.
            known = ("data", "items", "result", "translations")
            candidates = [data.get(k) for k in known] + list(data.values())
            data = next((v for v in candidates if isinstance(v, list)), [])
    if not isinstance(data, list):
        return [""] * count

    slots = [""] * count
    for item in data:
        if not isinstance(item, dict):
            continue
        try:
            idx = int(item["i"])
        except (KeyError, TypeError, ValueError, OverflowError):
            # One malformed index must not discard every other translation.
            continue
        if 0 <= idx < count:
            zh = item.get("zh")
            # A null translation becomes "" rather than the literal "None".
            slots[idx] = "" if zh is None else str(zh)
    return slots


def _translate_sync(segments: list[dict]) -> list[str]:
    """Translate ASR segments into Simplified Chinese with one chat call.

    Args:
        segments: Segment dicts; only the ``"text"`` key is read.

    Returns:
        One string per input segment, in input order. A segment whose
        translation is missing from, or unparseable in, the model reply maps
        to ``""``. An empty input returns ``[]`` without calling the model.
    """
    if not segments:
        return []

    payload = [
        {"i": i, "en": str(s.get("text") or "").strip()}
        for i, s in enumerate(segments)
    ]
    # JSON mode (response_format=json_object) yields a JSON *object*, so the
    # prompt asks for an object wrapping the list rather than a bare array.
    prompt = (
        "你是字幕翻译。把下列英文字幕段翻译为简体中文，保持原意、口语化、自然流畅。"
        "严格返回一个 JSON 对象，不要任何 markdown 或多余文字，schema: "
        '{"translations": [{"i": 0, "zh": "..."}, ...]}\n\n输入:\n'
        + json.dumps(payload, ensure_ascii=False)
    )
    resp = llm().chat.completions.create(
        model=TRANSLATE_MODEL,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0.2,
    )
    content = resp.choices[0].message.content or ""
    return _parse_translations(content, len(segments))
async def pipeline_transcribe(job_id: str) -> None:
job = JOBS[job_id]
d = job_dir(job_id)
@@ -233,57 +299,52 @@ async def pipeline_transcribe(job_id: str) -> None:
if not wav.exists():
raise RuntimeError("audio.wav 不存在")
update(job, status="transcribing", message="Gemini ASR 处理中…", progress=75)
if not GEMINI_API_KEY:
# 无 key 模式mock 数据,方便 UI 联调
await asyncio.sleep(1.2)
mock_segments = [
if not LLM_API_KEY:
# 无 key 模式mock 数据
update(job, status="transcribing", message="ASR (mock) …", progress=75)
await asyncio.sleep(1.0)
mock = [
TranscriptSegment(index=0, start=0.0, end=3.5,
en="Welcome back to my channel, today we're testing something new.",
zh="欢迎回来我的频道,今天我们要测试一些新东西。"),
en="Welcome back, today we're testing something new.",
zh="欢迎回来,今天我们要测试一些新东西。"),
TranscriptSegment(index=1, start=3.5, end=7.2,
en="This device looks really sleek and the design is quite minimal.",
zh="这个设备看起来非常时尚,设计也相当简约。"),
TranscriptSegment(index=2, start=7.2, end=11.0,
en="Let me show you how it works in real life situations.",
zh="让我向你展示它在实际场景中如何工作。"),
en="This device looks really sleek and minimal.",
zh="这个设备看起来非常时尚简约。"),
]
update(job, transcript=mock_segments, status="transcribed", progress=100,
message="转录完成MOCK 模式 · 未设 GEMINI_API_KEY")
update(job, transcript=mock, status="transcribed", progress=100,
message="转录完成MOCK · 未设 LLM_API_KEY")
return
# 真模式:调 Gemini
import google.generativeai as genai
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(GEMINI_MODEL)
# 1) whisper ASR
update(job, status="transcribing", message=f"{ASR_MODEL} 转录中…", progress=78)
segments = await asyncio.to_thread(_transcribe_sync, wav)
if not segments:
raise RuntimeError("ASR 返回 0 段(可能无人声 / 格式问题)")
audio_file = genai.upload_file(str(wav), mime_type="audio/wav")
prompt = (
"Transcribe the English audio with sentence-level timestamps. "
"Then provide a Chinese translation for each segment. "
"Return strictly as JSON array, no prose, schema: "
'[{"start": float_seconds, "end": float_seconds, "en": "...", "zh": "..."}]'
)
resp = await asyncio.to_thread(
model.generate_content,
[audio_file, prompt],
generation_config={"response_mime_type": "application/json"},
)
raw = resp.text or "[]"
data = json.loads(raw)
segs = [
# 先把英文段落落到 job 上(让 UI 提前看到,翻译再补 zh
en_only = [
TranscriptSegment(
index=i,
start=float(s.get("start", 0)),
end=float(s.get("end", 0)),
en=str(s.get("en", "")),
zh=str(s.get("zh", "")),
en=str(s.get("text", "")).strip(),
zh="",
)
for i, s in enumerate(data)
for i, s in enumerate(segments)
]
update(job, transcript=segs, status="transcribed", progress=100,
message=f"转录完成 · {len(segs)}")
update(job, transcript=en_only, message=f"ASR 完成 · {len(en_only)} 段,开始翻译…", progress=88)
# 2) Gemini 翻译
zh_list = await asyncio.to_thread(_translate_sync, segments)
full = [
TranscriptSegment(
index=seg.index, start=seg.start, end=seg.end, en=seg.en,
zh=zh_list[i] if i < len(zh_list) else "",
)
for i, seg in enumerate(en_only)
]
update(job, transcript=full, status="transcribed", progress=100,
message=f"转录完成 · {len(full)} 段({ASR_MODEL} + {TRANSLATE_MODEL}")
except Exception as e:
update(job, status="failed", error=str(e), message="转录失败")
@@ -297,7 +358,16 @@ class CreateJobReq(BaseModel):
@app.get("/health")
def health() -> dict:
    """Report liveness and the LLM gateway configuration currently in effect.

    Returns:
        A dict with ``ok`` (always ``True``), ``llm_configured`` (whether
        ``LLM_API_KEY`` is non-empty), ``base_url`` (``LLM_BASE_URL``, or
        ``"openai-default"`` when unset) and ``models`` (the configured ASR,
        translate and rewrite model names).
    """
    # The former Gemini-only payload was dropped: it referenced the replaced
    # GEMINI_* settings and returned before the payload below could be built.
    return {
        "ok": True,
        "llm_configured": bool(LLM_API_KEY),
        "base_url": LLM_BASE_URL or "openai-default",
        "models": {
            "asr": ASR_MODEL,
            "translate": TRANSLATE_MODEL,
            "rewrite": REWRITE_MODEL,
        },
    }
@app.post("/jobs", response_model=Job)

View File

@@ -4,5 +4,5 @@ pydantic==2.9.2
python-multipart==0.0.12
python-dotenv==1.0.1
yt-dlp==2026.3.17
google-generativeai==0.8.3
openai==1.55.3
httpx==0.27.2