diff --git a/RULES.md b/RULES.md index f02bd07..1925c7e 100644 --- a/RULES.md +++ b/RULES.md @@ -65,12 +65,10 @@ - `IMAGE_BASE_URL` / `IMAGE_API_KEY` / `IMAGE_MODEL`:OpenAI 兼容生图网关;当前所有生图入口一律强制使用 `gpt-image-2`,不做其他图片模型 fallback - `GPT_IMAGE_MODEL` / `SUBJECT_ASSET_IMAGE_MODEL` / `SUBJECT_ASSET_IMAGE_MODELS`:保留兼容旧环境变量名,但服务端会强制主体 6 视图和所有其他生图入口都只使用 `gpt-image-2` - `AI_HTTP_PROXY` / `IMAGE_HTTP_PROXY`:可选的 AI 网关出站代理;本地 launchd 后台进程不一定继承 shell 的 `http_proxy/https_proxy`,如生图报 DNS / ConnectError,可在本地 `api/.env` 配置后重启后端。`/health` 只回传是否配置代理,不回传代理地址。 -- `VOICE_PROVIDER`:配音通道,当前固定使用 `azure_openai` +- `VOICE_PROVIDER`:配音通道,服务端固定使用 `azure_openai`;旧环境若写 `minimax` 会被忽略 - `AZURE_OPENAI_BASE_URL` / `AZURE_OPENAI_API_KEY`:微软 Azure OpenAI 协议配音网关;本地未单独配置 Key 时回退复用 `LLM_API_KEY` -- `AZURE_TTS_MODEL` / `AZURE_TTS_VOICE_ID` / `AZURE_TTS_VOICE_POOL` / `AZURE_TTS_PATH`:Azure OpenAI TTS 模型、默认音色、音色池和 OpenAI 协议语音路径 -- `MINIMAX_API_KEY`:MiniMax T2A 配音 Key,只能放本地 `api/.env`,不能入库;当前第一步暂不默认调用 -- `MINIMAX_TTS_BASE_URL` / `MINIMAX_TTS_MODEL` / `MINIMAX_TTS_VOICE_ID`:MiniMax 旧配音端点、模型和兜底音色配置,仅作为保留兼容;当前不作为默认语音通道 -- `MINIMAX_TTS_VOICE_POOL`:MiniMax 英文随机音色池;当前默认男声 `English_magnetic_voiced_man`、女声 `English_Upbeat_Woman`、成熟声 `English_MaturePartner`,供后续新配音阶段使用 +- `AZURE_TTS_MODEL` / `AZURE_TTS_VOICE_ID` / `AZURE_TTS_VOICE_POOL` / `AZURE_TTS_PATH` / `AZURE_TTS_PATHS`:Azure OpenAI TTS 模型、默认音色、音色池和 OpenAI 协议语音路径;后端会按 `AZURE_TTS_PATHS` 依次尝试,便于区分路径不对和整条语音服务不可用 +- MiniMax TTS 不再作为语音 fallback;不要新增或依赖 `MINIMAX_*` 配置 - `POE_API_KEY` / `VIDEO_API_KEY`:视频生成通道 Key,只能放本地环境变量 - `WEB_AUTH_USERNAME` / `WEB_AUTH_PASSWORD` / `WEB_AUTH_SESSION_SECRET`:生产网页登录和会话签名配置;密码和 session secret 只放服务器环境变量,不入库 - `FFMPEG_BIN` / `FFPROBE_BIN`:可选本地媒体二进制路径;本机 Homebrew ffmpeg 动态库损坏时,后端会自动跳过不可用的 PATH 版本并尝试本机静态 ffmpeg 备选,生产仍建议使用系统 ffmpeg/ffprobe diff --git a/api/.env.example b/api/.env.example index dbc7e1e..6c0908c 100644 --- a/api/.env.example +++ b/api/.env.example @@ -37,6 +37,7 @@ VIDEO_MODEL_VEO3=veo-3.1-fast # 音频文案改写 + Azure OpenAI 配音 AUDIO_REWRITE_MODEL=gpt-4o AUDIO_PRODUCT_BRIEF="SKG 智能按摩产品,主打日常肩颈、腰背、眼部、膝盖或足部放松;广告表达要高级、干净、可信,不做医疗疗效承诺。" +# 语音通道服务端固定为 azure_openai;旧 VOICE_PROVIDER=minimax 会被忽略。 VOICE_PROVIDER=azure_openai AZURE_OPENAI_BASE_URL=https://ai.skg.com/azure AZURE_OPENAI_API_KEY= @@ -44,13 +45,7 @@ AZURE_TTS_MODEL=gpt-4o-mini-tts AZURE_TTS_VOICE_ID=alloy AZURE_TTS_VOICE_POOL=alloy,verse,shimmer AZURE_TTS_PATH=/audio/speech - -# MiniMax 旧配音通道,保留兼容;默认不走 -MINIMAX_API_KEY= -MINIMAX_TTS_BASE_URL=https://api.minimax.io -MINIMAX_TTS_MODEL=speech-2.8-turbo -MINIMAX_TTS_VOICE_ID=English_expressive_narrator -MINIMAX_TTS_VOICE_POOL=English_magnetic_voiced_man,English_Upbeat_Woman,English_MaturePartner +AZURE_TTS_PATHS=/audio/speech,/v1/audio/speech # Poe 视频 API(优先用于 Seedance / Kling / Veo) POE_API_BASE_URL=https://api.poe.com/v1 diff --git a/api/main.py b/api/main.py index ee2d75a..62f7fce 100644 --- a/api/main.py +++ b/api/main.py @@ -105,7 +105,9 @@ MINIMAX_TTS_VOICE_POOL = [ for v in os.getenv("MINIMAX_TTS_VOICE_POOL", ",".join(DEFAULT_MINIMAX_TTS_VOICE_POOL)).split(",") if v.strip() ] -VOICE_PROVIDER = os.getenv("VOICE_PROVIDER", "azure_openai").strip().lower() or "azure_openai" +# Voice is intentionally fixed to Azure OpenAI. Older envs may still contain +# VOICE_PROVIDER=minimax, but the runtime must not fall back to MiniMax. +VOICE_PROVIDER = "azure_openai" AZURE_OPENAI_BASE_URL = os.getenv("AZURE_OPENAI_BASE_URL", "https://ai.skg.com/azure").strip().rstrip("/") AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", LLM_API_KEY).strip() AZURE_TTS_MODEL = os.getenv("AZURE_TTS_MODEL", "gpt-4o-mini-tts").strip() or "gpt-4o-mini-tts" @@ -117,6 +119,11 @@ AZURE_TTS_VOICE_POOL = [ if v.strip() ] AZURE_TTS_PATH = os.getenv("AZURE_TTS_PATH", "/audio/speech").strip() or "/audio/speech" +AZURE_TTS_PATHS = [ + p.strip() + for p in os.getenv("AZURE_TTS_PATHS", f"{AZURE_TTS_PATH},/audio/speech,/v1/audio/speech").split(",") + if p.strip() +] POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1" POE_API_KEY = os.getenv("POE_API_KEY", "").strip() @@ -2334,9 +2341,7 @@ def _choose_azure_voice_id() -> str: def _choose_tts_voice_id() -> str: - if VOICE_PROVIDER == "azure_openai": - return _choose_azure_voice_id() - return _choose_minimax_voice_id() + return _choose_azure_voice_id() def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float: @@ -2400,13 +2405,22 @@ def _minimax_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: flo return f"/jobs/{job_id}/audio-script.mp3" -def _azure_tts_url() -> str: - path = AZURE_TTS_PATH if AZURE_TTS_PATH.startswith("/") else f"/{AZURE_TTS_PATH}" +def _azure_tts_url_for(path_value: str) -> str: + path = path_value if path_value.startswith("/") else f"/{path_value}" if AZURE_OPENAI_BASE_URL.endswith(path): return AZURE_OPENAI_BASE_URL return f"{AZURE_OPENAI_BASE_URL}{path}" +def _azure_tts_urls() -> list[str]: + urls: list[str] = [] + for path in AZURE_TTS_PATHS or [AZURE_TTS_PATH]: + url = _azure_tts_url_for(path) + if url not in urls: + urls.append(url) + return urls + + def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str: if not AZURE_OPENAI_API_KEY: raise RuntimeError("AZURE_OPENAI_API_KEY 或 LLM_API_KEY 未配置,未生成配音") @@ -2419,18 +2433,32 @@ def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds "response_format": "mp3", "speed": _voice_speed_for(voice_id, target_seconds, text), } - resp = httpx.post( - _azure_tts_url(), - headers={ - "Authorization": f"Bearer {AZURE_OPENAI_API_KEY}", - "api-key": AZURE_OPENAI_API_KEY, - "Content-Type": "application/json", - }, - json=payload, - timeout=120, - ) + headers = { + "Authorization": f"Bearer {AZURE_OPENAI_API_KEY}", + "api-key": AZURE_OPENAI_API_KEY, + "Content-Type": "application/json", + } + resp: httpx.Response | None = None + errors: list[str] = [] + with ai_http_client(timeout=120) as client: + for url in _azure_tts_urls(): + try: + current = client.post(url, headers=headers, json=payload) + except Exception as e: + errors.append(f"{url}: {type(e).__name__}: {e}") + continue + if current.status_code < 400: + resp = current + break + errors.append(f"{url}: HTTP {current.status_code}: {current.text[:180]}") + if current.status_code not in {404, 405}: + resp = current + break + if resp is None: + raise RuntimeError("Azure OpenAI TTS 不可用;已尝试 " + " | ".join(errors)) if resp.status_code >= 400: - raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {resp.text[:300]}") + detail = " | ".join(errors) or resp.text[:300] + raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {detail[:600]}") audio_bytes = resp.content if not audio_bytes: raise RuntimeError("Azure OpenAI TTS 未返回音频内容") @@ -2447,9 +2475,7 @@ def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds def _tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> tuple[str, str, str]: - if VOICE_PROVIDER == "azure_openai": - return _azure_openai_tts_sync(job_id, text, voice_id, target_seconds), "azure_openai", AZURE_TTS_MODEL - return _minimax_tts_sync(job_id, text, voice_id, target_seconds), "minimax", MINIMAX_TTS_MODEL + return _azure_openai_tts_sync(job_id, text, voice_id, target_seconds), "azure_openai", AZURE_TTS_MODEL def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], target_seconds: float = 12.0) -> AudioScript: @@ -2461,8 +2487,8 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], tar speaker_profile, rhythm_profile = _audio_delivery_profile(segments, duration, selected_voice_id) voice_url = "" voice_error = "" - voice_provider = "azure_openai" if VOICE_PROVIDER == "azure_openai" else "minimax" - voice_model = AZURE_TTS_MODEL if voice_provider == "azure_openai" else MINIMAX_TTS_MODEL + voice_provider = "azure_openai" + voice_model = AZURE_TTS_MODEL try: voice_url, voice_provider, voice_model = _tts_sync(job_id, rewritten, selected_voice_id, duration) except Exception as e: @@ -3060,7 +3086,7 @@ def health() -> dict: "auth_configured": WEB_AUTH_CONFIGURED, "base_url": LLM_BASE_URL or "openai-default", "image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default", - "voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL, + "voice_base_url": AZURE_OPENAI_BASE_URL, "models": { "asr": ASR_MODEL, "local_asr": LOCAL_ASR_MODEL, @@ -3077,15 +3103,13 @@ def health() -> dict: "subject_image": SUBJECT_ASSET_IMAGE_MODEL, "subject_image_fallbacks": SUBJECT_ASSET_IMAGE_MODELS, "voice_provider": VOICE_PROVIDER, - "voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL, - "voice_tts": AZURE_TTS_MODEL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_MODEL, - "voice_id": AZURE_TTS_VOICE_ID if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_VOICE_ID, - "voice_pool": AZURE_TTS_VOICE_POOL if VOICE_PROVIDER == "azure_openai" else (MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]), - "voice_configured": bool(AZURE_OPENAI_API_KEY) if VOICE_PROVIDER == "azure_openai" else bool(MINIMAX_API_KEY), - "minimax_tts": MINIMAX_TTS_MODEL, - "minimax_voice": MINIMAX_TTS_VOICE_ID, - "minimax_voice_pool": MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID], - "minimax_configured": bool(MINIMAX_API_KEY), + "voice_base_url": AZURE_OPENAI_BASE_URL, + "voice_tts": AZURE_TTS_MODEL, + "voice_tts_paths": AZURE_TTS_PATHS, + "voice_id": AZURE_TTS_VOICE_ID, + "voice_pool": AZURE_TTS_VOICE_POOL, + "voice_configured": bool(AZURE_OPENAI_API_KEY), + "minimax_disabled": True, "video": VIDEO_MODEL, "video_aliases": VIDEO_MODEL_ALIASES, "video_provider": video_provider_name(), diff --git a/deploy/.env.production.example b/deploy/.env.production.example index ac8bfda..75200b0 100644 --- a/deploy/.env.production.example +++ b/deploy/.env.production.example @@ -38,6 +38,7 @@ AI_HTTP_PROXY= # Audio rewrite and Azure OpenAI TTS AUDIO_REWRITE_MODEL=gpt-4o AUDIO_PRODUCT_BRIEF="SKG smart massage products for daily neck, shoulder, back, eye, knee, and foot relaxation. Keep claims premium, clean, credible, and non-medical." +# Voice is fixed to Azure OpenAI in the backend; legacy VOICE_PROVIDER=minimax is ignored. VOICE_PROVIDER=azure_openai AZURE_OPENAI_BASE_URL=https://ai.skg.com/azure AZURE_OPENAI_API_KEY= @@ -45,13 +46,7 @@ AZURE_TTS_MODEL=gpt-4o-mini-tts AZURE_TTS_VOICE_ID=alloy AZURE_TTS_VOICE_POOL=alloy,verse,shimmer AZURE_TTS_PATH=/audio/speech - -# Legacy MiniMax TTS fallback; not the default voice provider. -MINIMAX_API_KEY= -MINIMAX_TTS_BASE_URL=https://api.minimax.io -MINIMAX_TTS_MODEL=speech-2.8-turbo -MINIMAX_TTS_VOICE_ID=English_expressive_narrator -MINIMAX_TTS_VOICE_POOL=English_magnetic_voiced_man,English_Upbeat_Woman,English_MaturePartner +AZURE_TTS_PATHS=/audio/speech,/v1/audio/speech # Video generation. Use SKG Doubao / Seedance gateway in production. POE_API_BASE_URL=https://api.poe.com/v1 diff --git a/web/components/dashboard.tsx b/web/components/dashboard.tsx index ea6d0ff..792f4d1 100644 --- a/web/components/dashboard.tsx +++ b/web/components/dashboard.tsx @@ -641,15 +641,15 @@ export const Dashboard = forwardRef(function Dashboard({ - + {job?.audio_script?.voice_url ? ( )} diff --git a/web/components/nodes/index.tsx b/web/components/nodes/index.tsx index 9245cea..295db24 100644 --- a/web/components/nodes/index.tsx +++ b/web/components/nodes/index.tsx @@ -2102,7 +2102,7 @@ export function RewriteNode({ data, selected }: any) { } /* ============================================================ - 5b. AudioNode — 合并 ASR + 翻译 + 改写 + MiniMax 配音 + 5b. AudioNode — 合并 ASR + 翻译 + 改写 + Azure OpenAI 配音 ============================================================ */ export function AudioNode({ data, selected }: any) { const d: NodeData = data @@ -2152,9 +2152,9 @@ export function AudioNode({ data, selected }: any) { }} >
- 音轨 → 取时长/节奏 → SKG 英文产品口播 → MiniMax 随机英文配音
+ 音轨 → 取时长/节奏 → SKG 英文产品口播 → Azure OpenAI 英文配音
- {audioScript?.rewrite_model || "AUDIO_REWRITE_MODEL"} → {audioScript?.voice_model || "MiniMax T2A"} + {audioScript?.rewrite_model || "AUDIO_REWRITE_MODEL"} → {audioScript?.voice_model || "Azure OpenAI TTS"}
{job && ( @@ -2195,7 +2195,7 @@ export function AudioNode({ data, selected }: any) { )} )} - {voiceUrl &&
MiniMax natural English voice ready · 底部音频条播放
} + {voiceUrl &&
Azure OpenAI English voice ready · 底部音频条播放
} {isRewriting && (
正在按原音频时长生成英文产品口播和配音…
)} diff --git a/web/lib/api.ts b/web/lib/api.ts index bfd111b..b320155 100644 --- a/web/lib/api.ts +++ b/web/lib/api.ts @@ -172,10 +172,8 @@ export interface RuntimeModels { voice_id?: string voice_pool?: string[] voice_configured?: boolean - minimax_tts?: string - minimax_voice?: string - minimax_voice_pool?: string[] - minimax_configured?: boolean + voice_tts_paths?: string[] + minimax_disabled?: boolean video?: string video_aliases?: Record video_provider?: string @@ -601,9 +599,8 @@ export interface BackendHealth { translate?: string rewrite?: string audio_rewrite?: string - minimax_tts?: string - minimax_voice?: string - minimax_configured?: boolean + voice_tts_paths?: string[] + minimax_disabled?: boolean video?: string video_aliases?: Record video_base_url?: string