From a7b131534a0a0f172bb5ed3120fa798577d61141 Mon Sep 17 00:00:00 2001 From: kang Date: Mon, 18 May 2026 00:23:43 +0800 Subject: [PATCH] auto-save 2026-05-18 00:23 (~2) --- .memory/worklog.json | 40 ++++++------- api/main.py | 134 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 137 insertions(+), 37 deletions(-) diff --git a/.memory/worklog.json b/.memory/worklog.json index e97aa39..b1ded1b 100644 --- a/.memory/worklog.json +++ b/.memory/worklog.json @@ -1,25 +1,5 @@ { "entries": [ - { - "files_changed": 10, - "hash": "f7cc49a", - "message": "auto-save 2026-05-15 15:21 (+1, ~9)", - "ts": "2026-05-15T15:21:20+08:00", - "type": "commit" - }, - { - "files_changed": 2, - "message": "Codex 会话活跃 · 最近命令:codex · 2 项未提交变更 · 最近提交:auto-save 2026-05-15 15:21 (+1, ~9)", - "ts": "2026-05-15T07:24:47Z", - "type": "session-heartbeat" - }, - { - "files_changed": 3, - "hash": "caa28e2", - "message": "auto-save 2026-05-15 15:26 (~3)", - "ts": "2026-05-15T15:26:51+08:00", - "type": "commit" - }, { "files_changed": 1, "hash": "45e7401", @@ -3258,6 +3238,26 @@ "type": "session-heartbeat", "message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 2 项未提交变更 · 最近提交:auto-save 2026-05-18 00:07 (~3)", "files_changed": 2 + }, + { + "ts": "2026-05-18T00:12:58+08:00", + "type": "commit", + "message": "auto-save 2026-05-18 00:12 (~3)", + "hash": "ba202e4", + "files_changed": 3 + }, + { + "ts": "2026-05-18T00:16:10+08:00", + "type": "commit", + "message": "fix: show generated subject views", + "hash": "eeff64c", + "files_changed": 1 + }, + { + "ts": "2026-05-17T16:18:31Z", + "type": "session-heartbeat", + "message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:fix: show generated subject views", + "files_changed": 1 } ] } diff --git a/api/main.py b/api/main.py index 9a3b6f3..e285d68 100644 --- a/api/main.py +++ b/api/main.py @@ -50,8 +50,10 @@ LOCAL_ASR_TIMEOUT_SECONDS = max(30, int(os.getenv("LOCAL_ASR_TIMEOUT_SECONDS", " TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash") REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro") VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash") -IMAGE_MODEL = os.getenv("IMAGE_MODEL", "gemini-3-pro-image-preview") GPT_IMAGE_MODEL = os.getenv("GPT_IMAGE_MODEL", "gpt-image-2").strip() or "gpt-image-2" +IMAGE_BASE_URL = os.getenv("IMAGE_BASE_URL", LLM_BASE_URL).strip() +IMAGE_API_KEY = os.getenv("IMAGE_API_KEY", LLM_API_KEY).strip() +IMAGE_MODEL = os.getenv("IMAGE_MODEL", GPT_IMAGE_MODEL).strip() or GPT_IMAGE_MODEL SUBJECT_ASSET_IMAGE_MODEL = os.getenv("SUBJECT_ASSET_IMAGE_MODEL", GPT_IMAGE_MODEL).strip() or GPT_IMAGE_MODEL SUBJECT_ASSET_IMAGE_MODELS = [ m.strip() @@ -87,6 +89,18 @@ MINIMAX_TTS_VOICE_POOL = [ for v in os.getenv("MINIMAX_TTS_VOICE_POOL", ",".join(DEFAULT_MINIMAX_TTS_VOICE_POOL)).split(",") if v.strip() ] +VOICE_PROVIDER = os.getenv("VOICE_PROVIDER", "azure_openai").strip().lower() or "azure_openai" +AZURE_OPENAI_BASE_URL = os.getenv("AZURE_OPENAI_BASE_URL", "https://ai.skg.com/azure").strip().rstrip("/") +AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", LLM_API_KEY).strip() +AZURE_TTS_MODEL = os.getenv("AZURE_TTS_MODEL", "gpt-4o-mini-tts").strip() or "gpt-4o-mini-tts" +AZURE_TTS_VOICE_ID = os.getenv("AZURE_TTS_VOICE_ID", "alloy").strip() or "alloy" +DEFAULT_AZURE_TTS_VOICE_POOL = ["alloy", "verse", "shimmer"] +AZURE_TTS_VOICE_POOL = [ + v.strip() + for v in os.getenv("AZURE_TTS_VOICE_POOL", ",".join(DEFAULT_AZURE_TTS_VOICE_POOL)).split(",") + if v.strip() +] +AZURE_TTS_PATH = os.getenv("AZURE_TTS_PATH", "/audio/speech").strip() or "/audio/speech" POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1" POE_API_KEY = os.getenv("POE_API_KEY", "").strip() @@ -163,6 +177,7 @@ _MEDIA_BIN_CACHE: dict[str, str] = {} # OpenAI 客户端(OpenAI 兼容网关,含 SKG ezlink) from openai import OpenAI _llm_client: OpenAI | None = None +_image_client: OpenAI | None = None def llm() -> OpenAI: global _llm_client if _llm_client is None: @@ -171,6 +186,14 @@ def llm() -> OpenAI: _llm_client = OpenAI(base_url=LLM_BASE_URL or None, api_key=LLM_API_KEY) return _llm_client +def image_llm() -> OpenAI: + global _image_client + if _image_client is None: + if not IMAGE_API_KEY: + raise RuntimeError("IMAGE_API_KEY 或 LLM_API_KEY 未配置") + _image_client = OpenAI(base_url=IMAGE_BASE_URL or None, api_key=IMAGE_API_KEY) + return _image_client + # Pipeline 状态: # created → downloading → downloaded(前端“开始”会继续触发音频解析) # → splitting → frames_extracted @@ -2180,6 +2203,18 @@ def _choose_minimax_voice_id() -> str: return MINIMAX_TTS_VOICE_ID +def _choose_azure_voice_id() -> str: + if AZURE_TTS_VOICE_POOL: + return random.choice(AZURE_TTS_VOICE_POOL) + return AZURE_TTS_VOICE_ID + + +def _choose_tts_voice_id() -> str: + if VOICE_PROVIDER == "azure_openai": + return _choose_azure_voice_id() + return _choose_minimax_voice_id() + + def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float: words = len([w for w in text.replace("\n", " ").split(" ") if w.strip()]) estimated_seconds = words / 2.35 if words else target_seconds @@ -2241,17 +2276,71 @@ def _minimax_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: flo return f"/jobs/{job_id}/audio-script.mp3" +def _azure_tts_url() -> str: + path = AZURE_TTS_PATH if AZURE_TTS_PATH.startswith("/") else f"/{AZURE_TTS_PATH}" + if AZURE_OPENAI_BASE_URL.endswith(path): + return AZURE_OPENAI_BASE_URL + return f"{AZURE_OPENAI_BASE_URL}{path}" + + +def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str: + if not AZURE_OPENAI_API_KEY: + raise RuntimeError("AZURE_OPENAI_API_KEY 或 LLM_API_KEY 未配置,未生成配音") + if not text.strip(): + raise RuntimeError("改写文案为空,未生成配音") + payload = { + "model": AZURE_TTS_MODEL, + "voice": voice_id, + "input": text.strip()[:9500], + "response_format": "mp3", + "speed": _voice_speed_for(voice_id, target_seconds, text), + } + resp = httpx.post( + _azure_tts_url(), + headers={ + "Authorization": f"Bearer {AZURE_OPENAI_API_KEY}", + "api-key": AZURE_OPENAI_API_KEY, + "Content-Type": "application/json", + }, + json=payload, + timeout=120, + ) + if resp.status_code >= 400: + raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {resp.text[:300]}") + audio_bytes = resp.content + if not audio_bytes: + raise RuntimeError("Azure OpenAI TTS 未返回音频内容") + content_type = resp.headers.get("content-type", "") + if "application/json" in content_type.lower(): + try: + data = resp.json() + except Exception: + data = {"error": resp.text[:300]} + raise RuntimeError(f"Azure OpenAI TTS 返回 JSON 而不是音频:{str(data)[:300]}") + out = job_dir(job_id) / "audio_script.mp3" + out.write_bytes(audio_bytes) + return f"/jobs/{job_id}/audio-script.mp3" + + +def _tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> tuple[str, str, str]: + if VOICE_PROVIDER == "azure_openai": + return _azure_openai_tts_sync(job_id, text, voice_id, target_seconds), "azure_openai", AZURE_TTS_MODEL + return _minimax_tts_sync(job_id, text, voice_id, target_seconds), "minimax", MINIMAX_TTS_MODEL + + def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], target_seconds: float = 12.0) -> AudioScript: source_text = _transcript_join(segments, "en") source_zh = _transcript_join(segments, "zh") duration = max(float(target_seconds or 0), _segment_duration(segments), 4.0) rewritten, rewrite_error = _rewrite_audio_script_sync(segments, duration) - selected_voice_id = _choose_minimax_voice_id() + selected_voice_id = _choose_tts_voice_id() speaker_profile, rhythm_profile = _audio_delivery_profile(segments, duration, selected_voice_id) voice_url = "" voice_error = "" + voice_provider = "azure_openai" if VOICE_PROVIDER == "azure_openai" else "minimax" + voice_model = AZURE_TTS_MODEL if voice_provider == "azure_openai" else MINIMAX_TTS_MODEL try: - voice_url = _minimax_tts_sync(job_id, rewritten, selected_voice_id, duration) + voice_url, voice_provider, voice_model = _tts_sync(job_id, rewritten, selected_voice_id, duration) except Exception as e: voice_error = str(e) # 改写失败时已有本地 SKG 模板兜底,不把它标成用户可见错误;配音失败才需要提示。 @@ -2265,8 +2354,8 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], tar rhythm_profile=rhythm_profile, product_brief=AUDIO_PRODUCT_BRIEF, rewrite_model=AUDIO_REWRITE_MODEL, - voice_provider="minimax", - voice_model=MINIMAX_TTS_MODEL, + voice_provider=voice_provider, + voice_model=voice_model, voice_id=selected_voice_id, voice_url=voice_url, error=errors, @@ -2453,8 +2542,8 @@ def _image_edit_call( import time as _time import httpx from PIL import Image as _PILImage - if not LLM_API_KEY: - raise RuntimeError("LLM_API_KEY 未配置") + if not IMAGE_API_KEY: + raise RuntimeError("IMAGE_API_KEY 或 LLM_API_KEY 未配置") # model 优先级:models 列表 > 单个 model 参数 > IMAGE_MODEL if models and len(models) > 0: models_cycle = list(models) @@ -2489,9 +2578,9 @@ def _image_edit_call( if current_mode == "edit": with httpx.Client(timeout=120) as client: r = client.post( - f"{LLM_BASE_URL}/images/generations", + f"{IMAGE_BASE_URL}/images/generations", headers={ - "Authorization": f"Bearer {LLM_API_KEY}", + "Authorization": f"Bearer {IMAGE_API_KEY}", "Content-Type": "application/json", }, json={"model": current_model, "prompt": prompt, "image": data_uri, "n": 1}, @@ -2499,7 +2588,7 @@ def _image_edit_call( r.raise_for_status() resp_data = r.json() else: - resp = llm().images.generate(model=current_model, prompt=prompt, n=1) + resp = image_llm().images.generate(model=current_model, prompt=prompt, n=1) resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]} if resp_data.get("data"): effective_mode = current_mode @@ -2542,15 +2631,15 @@ def _image_text_call( """Text-only image generation with light model rotation.""" import base64 as b64lib import time as _time - if not LLM_API_KEY: - raise RuntimeError("LLM_API_KEY 未配置") + if not IMAGE_API_KEY: + raise RuntimeError("IMAGE_API_KEY 或 LLM_API_KEY 未配置") models_cycle = list(models) if models else [model or IMAGE_MODEL] last_err = "" resp_data: dict = {} for attempt in range(max_attempts): current_model = models_cycle[min(attempt, len(models_cycle) - 1)] try: - resp = llm().images.generate(model=current_model, prompt=prompt, n=1) + resp = image_llm().images.generate(model=current_model, prompt=prompt, n=1) resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]} if resp_data.get("data"): b64 = resp_data["data"][0].get("b64_json") @@ -2752,6 +2841,8 @@ def health() -> dict: "llm_configured": bool(LLM_API_KEY), "auth_configured": WEB_AUTH_CONFIGURED, "base_url": LLM_BASE_URL or "openai-default", + "image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default", + "voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL, "models": { "asr": ASR_MODEL, "local_asr": LOCAL_ASR_MODEL, @@ -2761,9 +2852,16 @@ def health() -> dict: "audio_rewrite": AUDIO_REWRITE_MODEL, "vision": VISION_MODEL, "image": IMAGE_MODEL, - "image_fallbacks": [IMAGE_MODEL, "gemini-3.1-flash-image-preview", "gemini-2.5-flash-image"], + "image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default", + "image_fallbacks": [IMAGE_MODEL, GPT_IMAGE_MODEL, "gpt-image-1.5"], "subject_image": SUBJECT_ASSET_IMAGE_MODEL, "subject_image_fallbacks": SUBJECT_ASSET_IMAGE_MODELS, + "voice_provider": VOICE_PROVIDER, + "voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL, + "voice_tts": AZURE_TTS_MODEL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_MODEL, + "voice_id": AZURE_TTS_VOICE_ID if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_VOICE_ID, + "voice_pool": AZURE_TTS_VOICE_POOL if VOICE_PROVIDER == "azure_openai" else (MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]), + "voice_configured": bool(AZURE_OPENAI_API_KEY) if VOICE_PROVIDER == "azure_openai" else bool(MINIMAX_API_KEY), "minimax_tts": MINIMAX_TTS_MODEL, "minimax_voice": MINIMAX_TTS_VOICE_ID, "minimax_voice_pool": MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID], @@ -3049,6 +3147,8 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job: full_prompt = f"{full_prompt}. Avoid: {req.negative_prompt.strip()}" if not full_prompt: raise HTTPException(400, "prompt required") + if not IMAGE_API_KEY: + raise HTTPException(503, "IMAGE_API_KEY 或 LLM_API_KEY 未配置") model = req.model or IMAGE_MODEL gen_id = uuid.uuid4().hex[:12] @@ -3075,9 +3175,9 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job: # OpenAI SDK 不直接支持 image 参数,用底层 httpx with httpx.Client(timeout=120) as client: r = client.post( - f"{LLM_BASE_URL}/images/generations", + f"{IMAGE_BASE_URL}/images/generations", headers={ - "Authorization": f"Bearer {LLM_API_KEY}", + "Authorization": f"Bearer {IMAGE_API_KEY}", "Content-Type": "application/json", }, json={ @@ -3091,7 +3191,7 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job: resp_data = r.json() else: # text-only - resp = llm().images.generate(model=model, prompt=full_prompt, n=1) + resp = image_llm().images.generate(model=model, prompt=full_prompt, n=1) resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]} if resp_data.get("data"):