auto-save 2026-05-12 16:16 (~4)
This commit is contained in:
@@ -41,6 +41,13 @@
|
||||
"message": "auto-save 2026-05-12 16:02 (+2, ~6)",
|
||||
"hash": "b0ffd03",
|
||||
"files_changed": 9
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-12T16:11:20+08:00",
|
||||
"type": "commit",
|
||||
"message": "auto-save 2026-05-12 16:11 (~1)",
|
||||
"hash": "cc31bfe",
|
||||
"files_changed": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
# Gemini API(优先用 Poe 中转,按用户偏好)
|
||||
# Poe 网关示例:GEMINI_API_BASE=https://api.poe.com/v1 + key
|
||||
# Google 直连示例:留空 GEMINI_API_BASE,用 google-generativeai SDK
|
||||
GEMINI_API_KEY=
|
||||
GEMINI_API_BASE=
|
||||
GEMINI_MODEL=gemini-2.5-flash
|
||||
# SKG AI 网关(OpenAI 兼容)
|
||||
LLM_BASE_URL=https://ai.skg.com/ezlink/v1
|
||||
LLM_API_KEY=
|
||||
|
||||
# 模型分工
|
||||
ASR_MODEL=whisper-1
|
||||
TRANSLATE_MODEL=gemini-2.5-flash
|
||||
REWRITE_MODEL=gemini-2.5-pro
|
||||
IMAGE_MODEL=gemini-3-pro-image-preview
|
||||
VIDEO_MODEL=sora-2
|
||||
|
||||
# 工作目录
|
||||
JOBS_DIR=./jobs
|
||||
@@ -11,5 +15,4 @@ JOBS_DIR=./jobs
|
||||
# CORS
|
||||
CORS_ORIGINS=http://localhost:4290
|
||||
|
||||
# 端口(启动用 uvicorn --port 4291 覆盖)
|
||||
API_PORT=4291
|
||||
|
||||
154
api/main.py
154
api/main.py
@@ -21,8 +21,23 @@ load_dotenv()
|
||||
JOBS_DIR = Path(os.getenv("JOBS_DIR", "./jobs")).resolve()
|
||||
JOBS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
CORS_ORIGINS = [o.strip() for o in os.getenv("CORS_ORIGINS", "http://localhost:4290").split(",") if o.strip()]
|
||||
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "").strip()
|
||||
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
|
||||
|
||||
LLM_BASE_URL = os.getenv("LLM_BASE_URL", "").strip()
|
||||
LLM_API_KEY = os.getenv("LLM_API_KEY", "").strip()
|
||||
ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1")
|
||||
TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
|
||||
REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
|
||||
|
||||
# OpenAI 客户端(OpenAI 兼容网关,含 SKG ezlink)
|
||||
from openai import OpenAI
|
||||
_llm_client: OpenAI | None = None
|
||||
def llm() -> OpenAI:
    """Return the shared OpenAI-compatible client, building it on first use.

    The client is cached in the module-level ``_llm_client`` so later calls
    reuse the same instance.

    Raises:
        RuntimeError: if ``LLM_API_KEY`` is empty when the client must be built.
    """
    global _llm_client
    if _llm_client is not None:
        return _llm_client
    if not LLM_API_KEY:
        raise RuntimeError("LLM_API_KEY 未配置")
    # An empty LLM_BASE_URL is forwarded as None rather than as "".
    endpoint = LLM_BASE_URL or None
    _llm_client = OpenAI(base_url=endpoint, api_key=LLM_API_KEY)
    return _llm_client
|
||||
|
||||
# Pipeline 状态:created → downloading → splitting → frames_extracted → transcribing → transcribed | failed
|
||||
JobStatus = Literal[
|
||||
@@ -225,6 +240,57 @@ async def pipeline_download_split_frames(job_id: str) -> None:
|
||||
|
||||
# ---------- ASR + translation (via the OpenAI-compatible LLM gateway) ----------
|
||||
|
||||
def _transcribe_sync(wav: Path) -> list[dict]:
    """Transcribe ``wav`` with ``ASR_MODEL`` and return its segments.

    Blocking call; intended to be run in a worker thread.

    Args:
        wav: Path of the audio file to upload (sent as ``audio/wav``).

    Returns:
        The ``segments`` list of the ``verbose_json`` response. When the
        gateway returns no segments but does return text, a single segment
        covering ``0.0`` to the reported ``duration`` (``0.0`` if absent) is
        returned instead. An empty list means nothing was transcribed.

    Raises:
        RuntimeError: if the response is neither a model, a mapping nor a string.
    """
    with wav.open("rb") as f:
        resp = llm().audio.transcriptions.create(
            file=(wav.name, f, "audio/wav"),
            model=ASR_MODEL,
            response_format="verbose_json",
            timestamp_granularities=["segment"],
        )

    if isinstance(resp, str):
        # The SDK hands back the raw body as text when the gateway does not
        # answer with JSON; treat it as the full transcript.
        raw = {"text": resp.strip()}
    elif hasattr(resp, "model_dump"):
        raw = resp.model_dump()
    else:
        raw = resp
    if not isinstance(raw, dict):
        raise RuntimeError(f"ASR 返回了无法解析的响应类型: {type(resp).__name__}")

    segments = raw.get("segments") or []
    # Fallback: the gateway returned only the full text, so use one segment.
    if not segments and raw.get("text"):
        segments = [
            {
                "start": 0.0,
                "end": float(raw.get("duration", 0) or 0),
                "text": raw["text"],
            }
        ]
    return segments
|
||||
|
||||
|
||||
def _extract_translation_items(content: str) -> list:
    """Parse the model reply into a list of candidate ``{"i", "zh"}`` items.

    Returns ``[]`` when the reply is not valid JSON or contains no list.
    """
    text = content.strip()
    if text.startswith("```"):
        # Some models wrap JSON in a markdown fence despite the prompt.
        text = text.strip("`").removeprefix("json").strip()
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return []
    if isinstance(data, list):
        return data
    if not isinstance(data, dict):
        return []
    # JSON-object mode forces a wrapper object; try the usual keys first,
    # then accept whichever value is a list so an unexpected key name does
    # not silently drop every translation.
    for key in ("data", "items", "result", "translations"):
        if isinstance(data.get(key), list):
            return data[key]
    for value in data.values():
        if isinstance(value, list):
            return value
    # A single bare item, e.g. {"i": 0, "zh": "..."}.
    return [data] if "i" in data else []


def _translate_sync(segments: list[dict]) -> list[str]:
    """Translate segment texts to Simplified Chinese with ``TRANSLATE_MODEL``.

    Blocking call; intended to be run in a worker thread.

    Args:
        segments: Segment dicts; only the ``"text"`` key is read.

    Returns:
        One string per input segment, in input order. A segment whose
        translation is missing or unusable in the reply maps to ``""``.
        An empty input returns ``[]`` without calling the model.
    """
    if not segments:
        return []

    payload = [
        {"i": i, "en": str(s.get("text") or "").strip()}
        for i, s in enumerate(segments)
    ]
    prompt = (
        "你是字幕翻译。把下列英文字幕段翻译为简体中文,保持原意、口语化、自然流畅。"
        "严格返回 JSON 数组,不要任何 markdown 或多余文字,schema: "
        '[{"i": 0, "zh": "..."}, ...]\n\n输入:\n'
        + json.dumps(payload, ensure_ascii=False)
    )
    resp = llm().chat.completions.create(
        model=TRANSLATE_MODEL,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0.2,
    )
    content = resp.choices[0].message.content or "[]"

    zh_by_idx: dict[int, str] = {}
    for item in _extract_translation_items(content):
        if not isinstance(item, dict):
            continue
        try:
            idx = int(item["i"])
        except (KeyError, TypeError, ValueError):
            # One malformed item must not discard the rest of the batch.
            continue
        zh = item.get("zh")
        # A JSON null must not become the literal subtitle "None".
        zh_by_idx[idx] = "" if zh is None else str(zh)
    return [zh_by_idx.get(i, "") for i in range(len(segments))]
|
||||
|
||||
|
||||
async def pipeline_transcribe(job_id: str) -> None:
|
||||
job = JOBS[job_id]
|
||||
d = job_dir(job_id)
|
||||
@@ -233,57 +299,52 @@ async def pipeline_transcribe(job_id: str) -> None:
|
||||
if not wav.exists():
|
||||
raise RuntimeError("audio.wav 不存在")
|
||||
|
||||
update(job, status="transcribing", message="Gemini ASR 处理中…", progress=75)
|
||||
|
||||
if not GEMINI_API_KEY:
|
||||
# 无 key 模式:mock 数据,方便 UI 联调
|
||||
await asyncio.sleep(1.2)
|
||||
mock_segments = [
|
||||
if not LLM_API_KEY:
|
||||
# 无 key 模式:mock 数据
|
||||
update(job, status="transcribing", message="ASR (mock) …", progress=75)
|
||||
await asyncio.sleep(1.0)
|
||||
mock = [
|
||||
TranscriptSegment(index=0, start=0.0, end=3.5,
|
||||
en="Welcome back to my channel, today we're testing something new.",
|
||||
zh="欢迎回来我的频道,今天我们要测试一些新东西。"),
|
||||
en="Welcome back, today we're testing something new.",
|
||||
zh="欢迎回来,今天我们要测试一些新东西。"),
|
||||
TranscriptSegment(index=1, start=3.5, end=7.2,
|
||||
en="This device looks really sleek and the design is quite minimal.",
|
||||
zh="这个设备看起来非常时尚,设计也相当简约。"),
|
||||
TranscriptSegment(index=2, start=7.2, end=11.0,
|
||||
en="Let me show you how it works in real life situations.",
|
||||
zh="让我向你展示它在实际场景中如何工作。"),
|
||||
en="This device looks really sleek and minimal.",
|
||||
zh="这个设备看起来非常时尚和简约。"),
|
||||
]
|
||||
update(job, transcript=mock_segments, status="transcribed", progress=100,
|
||||
message="转录完成(MOCK 模式 · 未设 GEMINI_API_KEY)")
|
||||
update(job, transcript=mock, status="transcribed", progress=100,
|
||||
message="转录完成(MOCK · 未设 LLM_API_KEY)")
|
||||
return
|
||||
|
||||
# 真模式:调 Gemini
|
||||
import google.generativeai as genai
|
||||
genai.configure(api_key=GEMINI_API_KEY)
|
||||
model = genai.GenerativeModel(GEMINI_MODEL)
|
||||
# 1) whisper ASR
|
||||
update(job, status="transcribing", message=f"{ASR_MODEL} 转录中…", progress=78)
|
||||
segments = await asyncio.to_thread(_transcribe_sync, wav)
|
||||
if not segments:
|
||||
raise RuntimeError("ASR 返回 0 段(可能无人声 / 格式问题)")
|
||||
|
||||
audio_file = genai.upload_file(str(wav), mime_type="audio/wav")
|
||||
prompt = (
|
||||
"Transcribe the English audio with sentence-level timestamps. "
|
||||
"Then provide a Chinese translation for each segment. "
|
||||
"Return strictly as JSON array, no prose, schema: "
|
||||
'[{"start": float_seconds, "end": float_seconds, "en": "...", "zh": "..."}]'
|
||||
)
|
||||
resp = await asyncio.to_thread(
|
||||
model.generate_content,
|
||||
[audio_file, prompt],
|
||||
generation_config={"response_mime_type": "application/json"},
|
||||
)
|
||||
raw = resp.text or "[]"
|
||||
data = json.loads(raw)
|
||||
segs = [
|
||||
# 先把英文段落落到 job 上(让 UI 提前看到,翻译再补 zh)
|
||||
en_only = [
|
||||
TranscriptSegment(
|
||||
index=i,
|
||||
start=float(s.get("start", 0)),
|
||||
end=float(s.get("end", 0)),
|
||||
en=str(s.get("en", "")),
|
||||
zh=str(s.get("zh", "")),
|
||||
en=str(s.get("text", "")).strip(),
|
||||
zh="",
|
||||
)
|
||||
for i, s in enumerate(data)
|
||||
for i, s in enumerate(segments)
|
||||
]
|
||||
update(job, transcript=segs, status="transcribed", progress=100,
|
||||
message=f"转录完成 · {len(segs)} 段")
|
||||
update(job, transcript=en_only, message=f"ASR 完成 · {len(en_only)} 段,开始翻译…", progress=88)
|
||||
|
||||
# 2) Gemini 翻译
|
||||
zh_list = await asyncio.to_thread(_translate_sync, segments)
|
||||
full = [
|
||||
TranscriptSegment(
|
||||
index=seg.index, start=seg.start, end=seg.end, en=seg.en,
|
||||
zh=zh_list[i] if i < len(zh_list) else "",
|
||||
)
|
||||
for i, seg in enumerate(en_only)
|
||||
]
|
||||
update(job, transcript=full, status="transcribed", progress=100,
|
||||
message=f"转录完成 · {len(full)} 段({ASR_MODEL} + {TRANSLATE_MODEL})")
|
||||
|
||||
except Exception as e:
|
||||
update(job, status="failed", error=str(e), message="转录失败")
|
||||
@@ -297,7 +358,16 @@ class CreateJobReq(BaseModel):
|
||||
|
||||
@app.get("/health")
def health() -> dict:
    """Report liveness and the active LLM gateway configuration.

    Returns:
        A dict with ``ok``, whether ``LLM_API_KEY`` is set, the configured
        base URL (``"openai-default"`` when ``LLM_BASE_URL`` is empty) and
        the model names used for ASR, translation and rewriting.
    """
    # Only the gateway-based payload is returned; an earlier Gemini-era
    # return statement ahead of it would make this one unreachable.
    return {
        "ok": True,
        "llm_configured": bool(LLM_API_KEY),
        "base_url": LLM_BASE_URL or "openai-default",
        "models": {
            "asr": ASR_MODEL,
            "translate": TRANSLATE_MODEL,
            "rewrite": REWRITE_MODEL,
        },
    }
|
||||
|
||||
|
||||
@app.post("/jobs", response_model=Job)
|
||||
|
||||
@@ -4,5 +4,5 @@ pydantic==2.9.2
|
||||
python-multipart==0.0.12
|
||||
python-dotenv==1.0.1
|
||||
yt-dlp==2026.3.17
|
||||
google-generativeai==0.8.3
|
||||
openai==1.55.3
|
||||
httpx==0.27.2
|
||||
|
||||
Reference in New Issue
Block a user