auto-save 2026-05-18 00:23 (~2)
This commit is contained in:
@@ -1,25 +1,5 @@
|
||||
{
|
||||
"entries": [
|
||||
{
|
||||
"files_changed": 10,
|
||||
"hash": "f7cc49a",
|
||||
"message": "auto-save 2026-05-15 15:21 (+1, ~9)",
|
||||
"ts": "2026-05-15T15:21:20+08:00",
|
||||
"type": "commit"
|
||||
},
|
||||
{
|
||||
"files_changed": 2,
|
||||
"message": "Codex 会话活跃 · 最近命令:codex · 2 项未提交变更 · 最近提交:auto-save 2026-05-15 15:21 (+1, ~9)",
|
||||
"ts": "2026-05-15T07:24:47Z",
|
||||
"type": "session-heartbeat"
|
||||
},
|
||||
{
|
||||
"files_changed": 3,
|
||||
"hash": "caa28e2",
|
||||
"message": "auto-save 2026-05-15 15:26 (~3)",
|
||||
"ts": "2026-05-15T15:26:51+08:00",
|
||||
"type": "commit"
|
||||
},
|
||||
{
|
||||
"files_changed": 1,
|
||||
"hash": "45e7401",
|
||||
@@ -3258,6 +3238,26 @@
|
||||
"type": "session-heartbeat",
|
||||
"message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 2 项未提交变更 · 最近提交:auto-save 2026-05-18 00:07 (~3)",
|
||||
"files_changed": 2
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-18T00:12:58+08:00",
|
||||
"type": "commit",
|
||||
"message": "auto-save 2026-05-18 00:12 (~3)",
|
||||
"hash": "ba202e4",
|
||||
"files_changed": 3
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-18T00:16:10+08:00",
|
||||
"type": "commit",
|
||||
"message": "fix: show generated subject views",
|
||||
"hash": "eeff64c",
|
||||
"files_changed": 1
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-17T16:18:31Z",
|
||||
"type": "session-heartbeat",
|
||||
"message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:fix: show generated subject views",
|
||||
"files_changed": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
134
api/main.py
134
api/main.py
@@ -50,8 +50,10 @@ LOCAL_ASR_TIMEOUT_SECONDS = max(30, int(os.getenv("LOCAL_ASR_TIMEOUT_SECONDS", "
|
||||
TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
|
||||
REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
|
||||
VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
|
||||
IMAGE_MODEL = os.getenv("IMAGE_MODEL", "gemini-3-pro-image-preview")
|
||||
GPT_IMAGE_MODEL = os.getenv("GPT_IMAGE_MODEL", "gpt-image-2").strip() or "gpt-image-2"
|
||||
IMAGE_BASE_URL = os.getenv("IMAGE_BASE_URL", LLM_BASE_URL).strip()
|
||||
IMAGE_API_KEY = os.getenv("IMAGE_API_KEY", LLM_API_KEY).strip()
|
||||
IMAGE_MODEL = os.getenv("IMAGE_MODEL", GPT_IMAGE_MODEL).strip() or GPT_IMAGE_MODEL
|
||||
SUBJECT_ASSET_IMAGE_MODEL = os.getenv("SUBJECT_ASSET_IMAGE_MODEL", GPT_IMAGE_MODEL).strip() or GPT_IMAGE_MODEL
|
||||
SUBJECT_ASSET_IMAGE_MODELS = [
|
||||
m.strip()
|
||||
@@ -87,6 +89,18 @@ MINIMAX_TTS_VOICE_POOL = [
|
||||
for v in os.getenv("MINIMAX_TTS_VOICE_POOL", ",".join(DEFAULT_MINIMAX_TTS_VOICE_POOL)).split(",")
|
||||
if v.strip()
|
||||
]
|
||||
VOICE_PROVIDER = os.getenv("VOICE_PROVIDER", "azure_openai").strip().lower() or "azure_openai"
|
||||
AZURE_OPENAI_BASE_URL = os.getenv("AZURE_OPENAI_BASE_URL", "https://ai.skg.com/azure").strip().rstrip("/")
|
||||
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", LLM_API_KEY).strip()
|
||||
AZURE_TTS_MODEL = os.getenv("AZURE_TTS_MODEL", "gpt-4o-mini-tts").strip() or "gpt-4o-mini-tts"
|
||||
AZURE_TTS_VOICE_ID = os.getenv("AZURE_TTS_VOICE_ID", "alloy").strip() or "alloy"
|
||||
DEFAULT_AZURE_TTS_VOICE_POOL = ["alloy", "verse", "shimmer"]
|
||||
AZURE_TTS_VOICE_POOL = [
|
||||
v.strip()
|
||||
for v in os.getenv("AZURE_TTS_VOICE_POOL", ",".join(DEFAULT_AZURE_TTS_VOICE_POOL)).split(",")
|
||||
if v.strip()
|
||||
]
|
||||
AZURE_TTS_PATH = os.getenv("AZURE_TTS_PATH", "/audio/speech").strip() or "/audio/speech"
|
||||
|
||||
POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
|
||||
POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
|
||||
@@ -163,6 +177,7 @@ _MEDIA_BIN_CACHE: dict[str, str] = {}
|
||||
# OpenAI 客户端(OpenAI 兼容网关,含 SKG ezlink)
|
||||
from openai import OpenAI
|
||||
_llm_client: OpenAI | None = None
|
||||
_image_client: OpenAI | None = None
|
||||
def llm() -> OpenAI:
|
||||
global _llm_client
|
||||
if _llm_client is None:
|
||||
@@ -171,6 +186,14 @@ def llm() -> OpenAI:
|
||||
_llm_client = OpenAI(base_url=LLM_BASE_URL or None, api_key=LLM_API_KEY)
|
||||
return _llm_client
|
||||
|
||||
def image_llm() -> OpenAI:
    """Return the shared OpenAI client used for image requests.

    The client is built on the first call from IMAGE_BASE_URL and
    IMAGE_API_KEY, stored in the module-level `_image_client`, and reused
    by every later call.

    Raises:
        RuntimeError: if IMAGE_API_KEY is empty when the client has to be built.
    """
    global _image_client
    if _image_client is not None:
        return _image_client
    if not IMAGE_API_KEY:
        raise RuntimeError("IMAGE_API_KEY 或 LLM_API_KEY 未配置")
    # An empty base URL is handed to the SDK as None rather than as "".
    endpoint = IMAGE_BASE_URL or None
    _image_client = OpenAI(base_url=endpoint, api_key=IMAGE_API_KEY)
    return _image_client
|
||||
|
||||
# Pipeline 状态:
|
||||
# created → downloading → downloaded(前端“开始”会继续触发音频解析)
|
||||
# → splitting → frames_extracted
|
||||
@@ -2180,6 +2203,18 @@ def _choose_minimax_voice_id() -> str:
|
||||
return MINIMAX_TTS_VOICE_ID
|
||||
|
||||
|
||||
def _choose_azure_voice_id() -> str:
    """Pick the voice id for an Azure OpenAI TTS request.

    Returns a random entry of AZURE_TTS_VOICE_POOL, or AZURE_TTS_VOICE_ID
    when the pool is empty.
    """
    if not AZURE_TTS_VOICE_POOL:
        return AZURE_TTS_VOICE_ID
    return random.choice(AZURE_TTS_VOICE_POOL)
|
||||
|
||||
|
||||
def _choose_tts_voice_id() -> str:
    """Pick a voice id using the chooser that matches VOICE_PROVIDER.

    "azure_openai" selects the Azure chooser; any other provider value
    falls through to the MiniMax chooser.
    """
    use_azure = VOICE_PROVIDER == "azure_openai"
    # Only the selected chooser name is looked up, exactly as in a plain if/return.
    chooser = _choose_azure_voice_id if use_azure else _choose_minimax_voice_id
    return chooser()
|
||||
|
||||
|
||||
def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
|
||||
words = len([w for w in text.replace("\n", " ").split(" ") if w.strip()])
|
||||
estimated_seconds = words / 2.35 if words else target_seconds
|
||||
@@ -2241,17 +2276,71 @@ def _minimax_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: flo
|
||||
return f"/jobs/{job_id}/audio-script.mp3"
|
||||
|
||||
|
||||
def _azure_tts_url() -> str:
    """Build the TTS endpoint URL from AZURE_OPENAI_BASE_URL and AZURE_TTS_PATH.

    The path is given a leading "/" when it lacks one. If the base URL
    already ends with that path it is returned unchanged, so the path is
    never appended twice.
    """
    suffix = AZURE_TTS_PATH
    if not suffix.startswith("/"):
        suffix = f"/{suffix}"
    base = AZURE_OPENAI_BASE_URL
    if base.endswith(suffix):
        return base
    return f"{base}{suffix}"
|
||||
|
||||
|
||||
def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
    """Synthesize `text` via the Azure OpenAI TTS endpoint and save the MP3 for a job.

    Args:
        job_id: Job whose directory receives `audio_script.mp3`.
        text: Script to speak; it is stripped and cut to 9500 characters
            before being sent.
        voice_id: Voice name placed in the request's "voice" field.
        target_seconds: Target duration passed to `_voice_speed_for` to
            derive the "speed" field.

    Returns:
        The URL path "/jobs/{job_id}/audio-script.mp3".

    Raises:
        RuntimeError: if the API key is missing, the text is blank, the
            endpoint answers with HTTP >= 400, the body is empty, or the
            response is JSON instead of audio.
    """
    if not AZURE_OPENAI_API_KEY:
        raise RuntimeError("AZURE_OPENAI_API_KEY 或 LLM_API_KEY 未配置,未生成配音")
    script = text.strip()
    if not script:
        raise RuntimeError("改写文案为空,未生成配音")

    request_body = {
        "model": AZURE_TTS_MODEL,
        "voice": voice_id,
        "input": script[:9500],
        "response_format": "mp3",
        # Speed is derived from the original (unstripped) text, as before.
        "speed": _voice_speed_for(voice_id, target_seconds, text),
    }
    endpoint = _azure_tts_url()
    # The key is sent both as a Bearer token and as an "api-key" header —
    # presumably so either scheme accepted by the gateway works; confirm.
    auth_headers = {
        "Authorization": f"Bearer {AZURE_OPENAI_API_KEY}",
        "api-key": AZURE_OPENAI_API_KEY,
        "Content-Type": "application/json",
    }
    reply = httpx.post(endpoint, headers=auth_headers, json=request_body, timeout=120)

    if reply.status_code >= 400:
        raise RuntimeError(f"Azure OpenAI TTS HTTP {reply.status_code}: {reply.text[:300]}")
    mp3_bytes = reply.content
    if not mp3_bytes:
        raise RuntimeError("Azure OpenAI TTS 未返回音频内容")

    # A JSON body on a non-error status is treated as a failure, not as audio.
    if "application/json" in reply.headers.get("content-type", "").lower():
        try:
            parsed = reply.json()
        except Exception:
            parsed = {"error": reply.text[:300]}
        raise RuntimeError(f"Azure OpenAI TTS 返回 JSON 而不是音频:{str(parsed)[:300]}")

    target = job_dir(job_id) / "audio_script.mp3"
    target.write_bytes(mp3_bytes)
    return f"/jobs/{job_id}/audio-script.mp3"
|
||||
|
||||
|
||||
def _tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> tuple[str, str, str]:
    """Generate the voice-over with the provider selected by VOICE_PROVIDER.

    Returns:
        A `(voice_url, provider_name, model_name)` tuple. The provider is
        "azure_openai" with AZURE_TTS_MODEL when VOICE_PROVIDER is
        "azure_openai"; otherwise "minimax" with MINIMAX_TTS_MODEL.
    """
    if VOICE_PROVIDER != "azure_openai":
        voice_url = _minimax_tts_sync(job_id, text, voice_id, target_seconds)
        return voice_url, "minimax", MINIMAX_TTS_MODEL
    voice_url = _azure_openai_tts_sync(job_id, text, voice_id, target_seconds)
    return voice_url, "azure_openai", AZURE_TTS_MODEL
|
||||
|
||||
|
||||
def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], target_seconds: float = 12.0) -> AudioScript:
|
||||
source_text = _transcript_join(segments, "en")
|
||||
source_zh = _transcript_join(segments, "zh")
|
||||
duration = max(float(target_seconds or 0), _segment_duration(segments), 4.0)
|
||||
rewritten, rewrite_error = _rewrite_audio_script_sync(segments, duration)
|
||||
selected_voice_id = _choose_minimax_voice_id()
|
||||
selected_voice_id = _choose_tts_voice_id()
|
||||
speaker_profile, rhythm_profile = _audio_delivery_profile(segments, duration, selected_voice_id)
|
||||
voice_url = ""
|
||||
voice_error = ""
|
||||
voice_provider = "azure_openai" if VOICE_PROVIDER == "azure_openai" else "minimax"
|
||||
voice_model = AZURE_TTS_MODEL if voice_provider == "azure_openai" else MINIMAX_TTS_MODEL
|
||||
try:
|
||||
voice_url = _minimax_tts_sync(job_id, rewritten, selected_voice_id, duration)
|
||||
voice_url, voice_provider, voice_model = _tts_sync(job_id, rewritten, selected_voice_id, duration)
|
||||
except Exception as e:
|
||||
voice_error = str(e)
|
||||
# 改写失败时已有本地 SKG 模板兜底,不把它标成用户可见错误;配音失败才需要提示。
|
||||
@@ -2265,8 +2354,8 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], tar
|
||||
rhythm_profile=rhythm_profile,
|
||||
product_brief=AUDIO_PRODUCT_BRIEF,
|
||||
rewrite_model=AUDIO_REWRITE_MODEL,
|
||||
voice_provider="minimax",
|
||||
voice_model=MINIMAX_TTS_MODEL,
|
||||
voice_provider=voice_provider,
|
||||
voice_model=voice_model,
|
||||
voice_id=selected_voice_id,
|
||||
voice_url=voice_url,
|
||||
error=errors,
|
||||
@@ -2453,8 +2542,8 @@ def _image_edit_call(
|
||||
import time as _time
|
||||
import httpx
|
||||
from PIL import Image as _PILImage
|
||||
if not LLM_API_KEY:
|
||||
raise RuntimeError("LLM_API_KEY 未配置")
|
||||
if not IMAGE_API_KEY:
|
||||
raise RuntimeError("IMAGE_API_KEY 或 LLM_API_KEY 未配置")
|
||||
# model 优先级:models 列表 > 单个 model 参数 > IMAGE_MODEL
|
||||
if models and len(models) > 0:
|
||||
models_cycle = list(models)
|
||||
@@ -2489,9 +2578,9 @@ def _image_edit_call(
|
||||
if current_mode == "edit":
|
||||
with httpx.Client(timeout=120) as client:
|
||||
r = client.post(
|
||||
f"{LLM_BASE_URL}/images/generations",
|
||||
f"{IMAGE_BASE_URL}/images/generations",
|
||||
headers={
|
||||
"Authorization": f"Bearer {LLM_API_KEY}",
|
||||
"Authorization": f"Bearer {IMAGE_API_KEY}",
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
json={"model": current_model, "prompt": prompt, "image": data_uri, "n": 1},
|
||||
@@ -2499,7 +2588,7 @@ def _image_edit_call(
|
||||
r.raise_for_status()
|
||||
resp_data = r.json()
|
||||
else:
|
||||
resp = llm().images.generate(model=current_model, prompt=prompt, n=1)
|
||||
resp = image_llm().images.generate(model=current_model, prompt=prompt, n=1)
|
||||
resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]}
|
||||
if resp_data.get("data"):
|
||||
effective_mode = current_mode
|
||||
@@ -2542,15 +2631,15 @@ def _image_text_call(
|
||||
"""Text-only image generation with light model rotation."""
|
||||
import base64 as b64lib
|
||||
import time as _time
|
||||
if not LLM_API_KEY:
|
||||
raise RuntimeError("LLM_API_KEY 未配置")
|
||||
if not IMAGE_API_KEY:
|
||||
raise RuntimeError("IMAGE_API_KEY 或 LLM_API_KEY 未配置")
|
||||
models_cycle = list(models) if models else [model or IMAGE_MODEL]
|
||||
last_err = ""
|
||||
resp_data: dict = {}
|
||||
for attempt in range(max_attempts):
|
||||
current_model = models_cycle[min(attempt, len(models_cycle) - 1)]
|
||||
try:
|
||||
resp = llm().images.generate(model=current_model, prompt=prompt, n=1)
|
||||
resp = image_llm().images.generate(model=current_model, prompt=prompt, n=1)
|
||||
resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]}
|
||||
if resp_data.get("data"):
|
||||
b64 = resp_data["data"][0].get("b64_json")
|
||||
@@ -2752,6 +2841,8 @@ def health() -> dict:
|
||||
"llm_configured": bool(LLM_API_KEY),
|
||||
"auth_configured": WEB_AUTH_CONFIGURED,
|
||||
"base_url": LLM_BASE_URL or "openai-default",
|
||||
"image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default",
|
||||
"voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
|
||||
"models": {
|
||||
"asr": ASR_MODEL,
|
||||
"local_asr": LOCAL_ASR_MODEL,
|
||||
@@ -2761,9 +2852,16 @@ def health() -> dict:
|
||||
"audio_rewrite": AUDIO_REWRITE_MODEL,
|
||||
"vision": VISION_MODEL,
|
||||
"image": IMAGE_MODEL,
|
||||
"image_fallbacks": [IMAGE_MODEL, "gemini-3.1-flash-image-preview", "gemini-2.5-flash-image"],
|
||||
"image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default",
|
||||
"image_fallbacks": [IMAGE_MODEL, GPT_IMAGE_MODEL, "gpt-image-1.5"],
|
||||
"subject_image": SUBJECT_ASSET_IMAGE_MODEL,
|
||||
"subject_image_fallbacks": SUBJECT_ASSET_IMAGE_MODELS,
|
||||
"voice_provider": VOICE_PROVIDER,
|
||||
"voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
|
||||
"voice_tts": AZURE_TTS_MODEL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_MODEL,
|
||||
"voice_id": AZURE_TTS_VOICE_ID if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_VOICE_ID,
|
||||
"voice_pool": AZURE_TTS_VOICE_POOL if VOICE_PROVIDER == "azure_openai" else (MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
|
||||
"voice_configured": bool(AZURE_OPENAI_API_KEY) if VOICE_PROVIDER == "azure_openai" else bool(MINIMAX_API_KEY),
|
||||
"minimax_tts": MINIMAX_TTS_MODEL,
|
||||
"minimax_voice": MINIMAX_TTS_VOICE_ID,
|
||||
"minimax_voice_pool": MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID],
|
||||
@@ -3049,6 +3147,8 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job:
|
||||
full_prompt = f"{full_prompt}. Avoid: {req.negative_prompt.strip()}"
|
||||
if not full_prompt:
|
||||
raise HTTPException(400, "prompt required")
|
||||
if not IMAGE_API_KEY:
|
||||
raise HTTPException(503, "IMAGE_API_KEY 或 LLM_API_KEY 未配置")
|
||||
|
||||
model = req.model or IMAGE_MODEL
|
||||
gen_id = uuid.uuid4().hex[:12]
|
||||
@@ -3075,9 +3175,9 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job:
|
||||
# OpenAI SDK 不直接支持 image 参数,用底层 httpx
|
||||
with httpx.Client(timeout=120) as client:
|
||||
r = client.post(
|
||||
f"{LLM_BASE_URL}/images/generations",
|
||||
f"{IMAGE_BASE_URL}/images/generations",
|
||||
headers={
|
||||
"Authorization": f"Bearer {LLM_API_KEY}",
|
||||
"Authorization": f"Bearer {IMAGE_API_KEY}",
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
json={
|
||||
@@ -3091,7 +3191,7 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job:
|
||||
resp_data = r.json()
|
||||
else:
|
||||
# text-only
|
||||
resp = llm().images.generate(model=model, prompt=full_prompt, n=1)
|
||||
resp = image_llm().images.generate(model=model, prompt=full_prompt, n=1)
|
||||
resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]}
|
||||
|
||||
if resp_data.get("data"):
|
||||
|
||||
Reference in New Issue
Block a user