From 35b327815a452c24900cc6c6384606b0a407f276 Mon Sep 17 00:00:00 2001 From: kang Date: Tue, 12 May 2026 16:16:52 +0800 Subject: [PATCH] auto-save 2026-05-12 16:16 (~4) --- .memory/worklog.json | 7 ++ api/.env.example | 17 +++-- api/main.py | 154 +++++++++++++++++++++++++++++++------------ api/requirements.txt | 2 +- 4 files changed, 130 insertions(+), 50 deletions(-) diff --git a/.memory/worklog.json b/.memory/worklog.json index a72ef2a..c24b46e 100644 --- a/.memory/worklog.json +++ b/.memory/worklog.json @@ -41,6 +41,13 @@ "message": "auto-save 2026-05-12 16:02 (+2, ~6)", "hash": "b0ffd03", "files_changed": 9 + }, + { + "ts": "2026-05-12T16:11:20+08:00", + "type": "commit", + "message": "auto-save 2026-05-12 16:11 (~1)", + "hash": "cc31bfe", + "files_changed": 1 } ] } diff --git a/api/.env.example b/api/.env.example index 9e2b44f..a8d4e28 100644 --- a/api/.env.example +++ b/api/.env.example @@ -1,9 +1,13 @@ -# Gemini API(优先用 Poe 中转,按用户偏好) -# Poe 网关示例:GEMINI_API_BASE=https://api.poe.com/v1 + key -# Google 直连示例:留空 GEMINI_API_BASE,用 google-generativeai SDK -GEMINI_API_KEY= -GEMINI_API_BASE= -GEMINI_MODEL=gemini-2.5-flash +# SKG AI 网关(OpenAI 兼容) +LLM_BASE_URL=https://ai.skg.com/ezlink/v1 +LLM_API_KEY= + +# 模型分工 +ASR_MODEL=whisper-1 +TRANSLATE_MODEL=gemini-2.5-flash +REWRITE_MODEL=gemini-2.5-pro +IMAGE_MODEL=gemini-3-pro-image-preview +VIDEO_MODEL=sora-2 # 工作目录 JOBS_DIR=./jobs @@ -11,5 +15,4 @@ JOBS_DIR=./jobs # CORS CORS_ORIGINS=http://localhost:4290 -# 端口(启动用 uvicorn --port 4291 覆盖) API_PORT=4291 diff --git a/api/main.py b/api/main.py index b3a0346..3409987 100644 --- a/api/main.py +++ b/api/main.py @@ -21,8 +21,23 @@ load_dotenv() JOBS_DIR = Path(os.getenv("JOBS_DIR", "./jobs")).resolve() JOBS_DIR.mkdir(parents=True, exist_ok=True) CORS_ORIGINS = [o.strip() for o in os.getenv("CORS_ORIGINS", "http://localhost:4290").split(",") if o.strip()] -GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "").strip() -GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.5-flash") + +LLM_BASE_URL = os.getenv("LLM_BASE_URL", "").strip() +LLM_API_KEY = os.getenv("LLM_API_KEY", "").strip() +ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1") +TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash") +REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro") + +# OpenAI 客户端(OpenAI 兼容网关,含 SKG ezlink) +from openai import OpenAI +_llm_client: OpenAI | None = None +def llm() -> OpenAI: + global _llm_client + if _llm_client is None: + if not LLM_API_KEY: + raise RuntimeError("LLM_API_KEY 未配置") + _llm_client = OpenAI(base_url=LLM_BASE_URL or None, api_key=LLM_API_KEY) + return _llm_client # Pipeline 状态:created → downloading → splitting → frames_extracted → transcribing → transcribed | failed JobStatus = Literal[ @@ -225,6 +240,57 @@ async def pipeline_download_split_frames(job_id: str) -> None: # ---------- Gemini ASR + 翻译 ---------- +def _transcribe_sync(wav: Path) -> list[dict]: + """whisper-1 verbose_json → segments[{start, end, text}]""" + with wav.open("rb") as f: + resp = llm().audio.transcriptions.create( + file=(wav.name, f, "audio/wav"), + model=ASR_MODEL, + response_format="verbose_json", + timestamp_granularities=["segment"], + ) + raw = resp.model_dump() if hasattr(resp, "model_dump") else resp + segments = raw.get("segments") or [] + # 兜底:网关如果不返回 segments,把全文当一段 + if not segments and raw.get("text"): + segments = [{"start": 0.0, "end": float(raw.get("duration", 0) or 0), "text": raw["text"]}] + return segments + + +def _translate_sync(segments: list[dict]) -> list[str]: + """gemini-2.5-flash 批量翻译为中文,按段返回""" + payload = [{"i": i, "en": s.get("text", "").strip()} for i, s in enumerate(segments)] + prompt = ( + "你是字幕翻译。把下列英文字幕段翻译为简体中文,保持原意、口语化、自然流畅。" + "严格返回 JSON 数组,不要任何 markdown 或多余文字,schema: " + '[{"i": 0, "zh": "..."}, ...]\n\n输入:\n' + + json.dumps(payload, ensure_ascii=False) + ) + resp = llm().chat.completions.create( + model=TRANSLATE_MODEL, + messages=[{"role": "user", "content": prompt}], + response_format={"type": "json_object"}, + temperature=0.2, + ) + content = resp.choices[0].message.content or "[]" + try: + data = json.loads(content) + if isinstance(data, dict): + for k in ("data", "items", "result", "translations"): + if k in data and isinstance(data[k], list): + data = data[k] + break + if not isinstance(data, list): + data = [] + except json.JSONDecodeError: + data = [] + zh_by_idx: dict[int, str] = {} + for it in data: + if isinstance(it, dict) and "i" in it: + zh_by_idx[int(it["i"])] = str(it.get("zh", "")) + return [zh_by_idx.get(i, "") for i in range(len(segments))] + + async def pipeline_transcribe(job_id: str) -> None: job = JOBS[job_id] d = job_dir(job_id) @@ -233,57 +299,52 @@ async def pipeline_transcribe(job_id: str) -> None: if not wav.exists(): raise RuntimeError("audio.wav 不存在") - update(job, status="transcribing", message="Gemini ASR 处理中…", progress=75) - - if not GEMINI_API_KEY: - # 无 key 模式:mock 数据,方便 UI 联调 - await asyncio.sleep(1.2) - mock_segments = [ + if not LLM_API_KEY: + # 无 key 模式:mock 数据 + update(job, status="transcribing", message="ASR (mock) …", progress=75) + await asyncio.sleep(1.0) + mock = [ TranscriptSegment(index=0, start=0.0, end=3.5, - en="Welcome back to my channel, today we're testing something new.", - zh="欢迎回来我的频道,今天我们要测试一些新东西。"), + en="Welcome back, today we're testing something new.", + zh="欢迎回来,今天我们要测试一些新东西。"), TranscriptSegment(index=1, start=3.5, end=7.2, - en="This device looks really sleek and the design is quite minimal.", - zh="这个设备看起来非常时尚,设计也相当简约。"), - TranscriptSegment(index=2, start=7.2, end=11.0, - en="Let me show you how it works in real life situations.", - zh="让我向你展示它在实际场景中如何工作。"), + en="This device looks really sleek and minimal.", + zh="这个设备看起来非常时尚和简约。"), ] - update(job, transcript=mock_segments, status="transcribed", progress=100, - message="转录完成(MOCK 模式 · 未设 GEMINI_API_KEY)") + update(job, transcript=mock, status="transcribed", progress=100, + message="转录完成(MOCK · 未设 LLM_API_KEY)") return - # 真模式:调 Gemini - import google.generativeai as genai - genai.configure(api_key=GEMINI_API_KEY) - model = genai.GenerativeModel(GEMINI_MODEL) + # 1) whisper ASR + update(job, status="transcribing", message=f"{ASR_MODEL} 转录中…", progress=78) + segments = await asyncio.to_thread(_transcribe_sync, wav) + if not segments: + raise RuntimeError("ASR 返回 0 段(可能无人声 / 格式问题)") - audio_file = genai.upload_file(str(wav), mime_type="audio/wav") - prompt = ( - "Transcribe the English audio with sentence-level timestamps. " - "Then provide a Chinese translation for each segment. " - "Return strictly as JSON array, no prose, schema: " - '[{"start": float_seconds, "end": float_seconds, "en": "...", "zh": "..."}]' - ) - resp = await asyncio.to_thread( - model.generate_content, - [audio_file, prompt], - generation_config={"response_mime_type": "application/json"}, - ) - raw = resp.text or "[]" - data = json.loads(raw) - segs = [ + # 先把英文段落落到 job 上(让 UI 提前看到,翻译再补 zh) + en_only = [ TranscriptSegment( index=i, start=float(s.get("start", 0)), end=float(s.get("end", 0)), - en=str(s.get("en", "")), - zh=str(s.get("zh", "")), + en=str(s.get("text", "")).strip(), + zh="", ) - for i, s in enumerate(data) + for i, s in enumerate(segments) ] - update(job, transcript=segs, status="transcribed", progress=100, - message=f"转录完成 · {len(segs)} 段") + update(job, transcript=en_only, message=f"ASR 完成 · {len(en_only)} 段,开始翻译…", progress=88) + + # 2) Gemini 翻译 + zh_list = await asyncio.to_thread(_translate_sync, segments) + full = [ + TranscriptSegment( + index=seg.index, start=seg.start, end=seg.end, en=seg.en, + zh=zh_list[i] if i < len(zh_list) else "", + ) + for i, seg in enumerate(en_only) + ] + update(job, transcript=full, status="transcribed", progress=100, + message=f"转录完成 · {len(full)} 段({ASR_MODEL} + {TRANSLATE_MODEL})") except Exception as e: update(job, status="failed", error=str(e), message="转录失败") @@ -297,7 +358,16 @@ class CreateJobReq(BaseModel): @app.get("/health") def health() -> dict: - return {"ok": True, "gemini_configured": bool(GEMINI_API_KEY), "model": GEMINI_MODEL} + return { + "ok": True, + "llm_configured": bool(LLM_API_KEY), + "base_url": LLM_BASE_URL or "openai-default", + "models": { + "asr": ASR_MODEL, + "translate": TRANSLATE_MODEL, + "rewrite": REWRITE_MODEL, + }, + } @app.post("/jobs", response_model=Job) diff --git a/api/requirements.txt b/api/requirements.txt index 1304c47..8cde924 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -4,5 +4,5 @@ pydantic==2.9.2 python-multipart==0.0.12 python-dotenv==1.0.1 yt-dlp==2026.3.17 -google-generativeai==0.8.3 +openai==1.55.3 httpx==0.27.2