auto-save 2026-05-18 00:23 (~2)

This commit is contained in:
2026-05-18 00:23:43 +08:00
parent eeff64c6e5
commit a7b131534a
2 changed files with 137 additions and 37 deletions

View File

@@ -1,25 +1,5 @@
{ {
"entries": [ "entries": [
{
"files_changed": 10,
"hash": "f7cc49a",
"message": "auto-save 2026-05-15 15:21 (+1, ~9)",
"ts": "2026-05-15T15:21:20+08:00",
"type": "commit"
},
{
"files_changed": 2,
"message": "Codex 会话活跃 · 最近命令codex · 2 项未提交变更 · 最近提交auto-save 2026-05-15 15:21 (+1, ~9)",
"ts": "2026-05-15T07:24:47Z",
"type": "session-heartbeat"
},
{
"files_changed": 3,
"hash": "caa28e2",
"message": "auto-save 2026-05-15 15:26 (~3)",
"ts": "2026-05-15T15:26:51+08:00",
"type": "commit"
},
{ {
"files_changed": 1, "files_changed": 1,
"hash": "45e7401", "hash": "45e7401",
@@ -3258,6 +3238,26 @@
"type": "session-heartbeat", "type": "session-heartbeat",
"message": "Codex 会话活跃 · 最近命令codex · 分支 main · 2 项未提交变更 · 最近提交auto-save 2026-05-18 00:07 (~3)", "message": "Codex 会话活跃 · 最近命令codex · 分支 main · 2 项未提交变更 · 最近提交auto-save 2026-05-18 00:07 (~3)",
"files_changed": 2 "files_changed": 2
},
{
"ts": "2026-05-18T00:12:58+08:00",
"type": "commit",
"message": "auto-save 2026-05-18 00:12 (~3)",
"hash": "ba202e4",
"files_changed": 3
},
{
"ts": "2026-05-18T00:16:10+08:00",
"type": "commit",
"message": "fix: show generated subject views",
"hash": "eeff64c",
"files_changed": 1
},
{
"ts": "2026-05-17T16:18:31Z",
"type": "session-heartbeat",
"message": "Codex 会话活跃 · 最近命令codex · 分支 main · 1 项未提交变更 · 最近提交fix: show generated subject views",
"files_changed": 1
} }
] ]
} }

View File

@@ -50,8 +50,10 @@ LOCAL_ASR_TIMEOUT_SECONDS = max(30, int(os.getenv("LOCAL_ASR_TIMEOUT_SECONDS", "
TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash") TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro") REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash") VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
IMAGE_MODEL = os.getenv("IMAGE_MODEL", "gemini-3-pro-image-preview")
GPT_IMAGE_MODEL = os.getenv("GPT_IMAGE_MODEL", "gpt-image-2").strip() or "gpt-image-2" GPT_IMAGE_MODEL = os.getenv("GPT_IMAGE_MODEL", "gpt-image-2").strip() or "gpt-image-2"
IMAGE_BASE_URL = os.getenv("IMAGE_BASE_URL", LLM_BASE_URL).strip()
IMAGE_API_KEY = os.getenv("IMAGE_API_KEY", LLM_API_KEY).strip()
IMAGE_MODEL = os.getenv("IMAGE_MODEL", GPT_IMAGE_MODEL).strip() or GPT_IMAGE_MODEL
SUBJECT_ASSET_IMAGE_MODEL = os.getenv("SUBJECT_ASSET_IMAGE_MODEL", GPT_IMAGE_MODEL).strip() or GPT_IMAGE_MODEL SUBJECT_ASSET_IMAGE_MODEL = os.getenv("SUBJECT_ASSET_IMAGE_MODEL", GPT_IMAGE_MODEL).strip() or GPT_IMAGE_MODEL
SUBJECT_ASSET_IMAGE_MODELS = [ SUBJECT_ASSET_IMAGE_MODELS = [
m.strip() m.strip()
@@ -87,6 +89,18 @@ MINIMAX_TTS_VOICE_POOL = [
for v in os.getenv("MINIMAX_TTS_VOICE_POOL", ",".join(DEFAULT_MINIMAX_TTS_VOICE_POOL)).split(",") for v in os.getenv("MINIMAX_TTS_VOICE_POOL", ",".join(DEFAULT_MINIMAX_TTS_VOICE_POOL)).split(",")
if v.strip() if v.strip()
] ]
# --- Text-to-speech provider configuration (all values read from the environment) ---
# Backend selector, normalised to lower case. The voice helpers in this file
# use the Azure/OpenAI path only when this equals "azure_openai"; any other
# value routes to the MiniMax implementation.
VOICE_PROVIDER = os.getenv("VOICE_PROVIDER", "azure_openai").strip().lower() or "azure_openai"
# Gateway base URL. Trailing slashes are stripped so that AZURE_TTS_PATH can be
# appended directly when the request URL is built.
AZURE_OPENAI_BASE_URL = os.getenv("AZURE_OPENAI_BASE_URL", "https://ai.skg.com/azure").strip().rstrip("/")
# Credential for the speech endpoint; falls back to the general LLM_API_KEY
# when no dedicated key is provided.
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", LLM_API_KEY).strip()
# Speech model name; an empty or whitespace-only override falls back to the default.
AZURE_TTS_MODEL = os.getenv("AZURE_TTS_MODEL", "gpt-4o-mini-tts").strip() or "gpt-4o-mini-tts"
# Single voice used when the voice pool below is empty.
AZURE_TTS_VOICE_ID = os.getenv("AZURE_TTS_VOICE_ID", "alloy").strip() or "alloy"
# Built-in pool used when AZURE_TTS_VOICE_POOL is not set in the environment.
DEFAULT_AZURE_TTS_VOICE_POOL = ["alloy", "verse", "shimmer"]
# Comma-separated list of voices; blank entries are dropped. May end up empty
# if the variable is set to an empty string or only separators.
AZURE_TTS_VOICE_POOL = [
v.strip()
for v in os.getenv("AZURE_TTS_VOICE_POOL", ",".join(DEFAULT_AZURE_TTS_VOICE_POOL)).split(",")
if v.strip()
]
# Path of the speech endpoint relative to AZURE_OPENAI_BASE_URL.
AZURE_TTS_PATH = os.getenv("AZURE_TTS_PATH", "/audio/speech").strip() or "/audio/speech"
POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1" POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
POE_API_KEY = os.getenv("POE_API_KEY", "").strip() POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
@@ -163,6 +177,7 @@ _MEDIA_BIN_CACHE: dict[str, str] = {}
# OpenAI 客户端OpenAI 兼容网关,含 SKG ezlink # OpenAI 客户端OpenAI 兼容网关,含 SKG ezlink
from openai import OpenAI from openai import OpenAI
_llm_client: OpenAI | None = None _llm_client: OpenAI | None = None
_image_client: OpenAI | None = None
def llm() -> OpenAI: def llm() -> OpenAI:
global _llm_client global _llm_client
if _llm_client is None: if _llm_client is None:
@@ -171,6 +186,14 @@ def llm() -> OpenAI:
_llm_client = OpenAI(base_url=LLM_BASE_URL or None, api_key=LLM_API_KEY) _llm_client = OpenAI(base_url=LLM_BASE_URL or None, api_key=LLM_API_KEY)
return _llm_client return _llm_client
def image_llm() -> OpenAI:
    """Return the process-wide OpenAI-compatible client used for image generation.

    The client is created on first use from ``IMAGE_BASE_URL`` and
    ``IMAGE_API_KEY`` and then cached in ``_image_client`` for later calls.

    Raises:
        RuntimeError: if no image API key is configured.
    """
    global _image_client
    # Fast path: reuse the client built by an earlier call.
    if _image_client is not None:
        return _image_client
    if not IMAGE_API_KEY:
        raise RuntimeError("IMAGE_API_KEY 或 LLM_API_KEY 未配置")
    # An empty base URL is passed as None so the SDK uses its own default.
    _image_client = OpenAI(base_url=IMAGE_BASE_URL or None, api_key=IMAGE_API_KEY)
    return _image_client
# Pipeline 状态: # Pipeline 状态:
# created → downloading → downloaded前端“开始”会继续触发音频解析 # created → downloading → downloaded前端“开始”会继续触发音频解析
# → splitting → frames_extracted # → splitting → frames_extracted
@@ -2180,6 +2203,18 @@ def _choose_minimax_voice_id() -> str:
return MINIMAX_TTS_VOICE_ID return MINIMAX_TTS_VOICE_ID
def _choose_azure_voice_id() -> str:
    """Pick the Azure/OpenAI TTS voice for one synthesis run.

    Returns a random member of ``AZURE_TTS_VOICE_POOL`` when the pool is
    non-empty, otherwise the single configured ``AZURE_TTS_VOICE_ID``.
    """
    if not AZURE_TTS_VOICE_POOL:
        return AZURE_TTS_VOICE_ID
    return random.choice(AZURE_TTS_VOICE_POOL)
def _choose_tts_voice_id() -> str:
    """Pick a voice id from whichever TTS provider is currently configured.

    Delegates to the Azure/OpenAI chooser when ``VOICE_PROVIDER`` is
    ``"azure_openai"`` and to the MiniMax chooser for every other value.
    """
    # Only the selected branch is evaluated, so just one chooser is looked up.
    chooser = _choose_azure_voice_id if VOICE_PROVIDER == "azure_openai" else _choose_minimax_voice_id
    return chooser()
def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float: def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
words = len([w for w in text.replace("\n", " ").split(" ") if w.strip()]) words = len([w for w in text.replace("\n", " ").split(" ") if w.strip()])
estimated_seconds = words / 2.35 if words else target_seconds estimated_seconds = words / 2.35 if words else target_seconds
@@ -2241,17 +2276,71 @@ def _minimax_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: flo
return f"/jobs/{job_id}/audio-script.mp3" return f"/jobs/{job_id}/audio-script.mp3"
def _azure_tts_url() -> str:
    """Build the full URL of the speech endpoint.

    ``AZURE_TTS_PATH`` is given a leading slash when it lacks one. If
    ``AZURE_OPENAI_BASE_URL`` already ends with that path it is returned
    unchanged; otherwise the path is appended to it.
    """
    suffix = AZURE_TTS_PATH
    if not suffix.startswith("/"):
        suffix = f"/{suffix}"
    base = AZURE_OPENAI_BASE_URL
    # Avoid doubling the path when the base URL was configured with it included.
    return base if base.endswith(suffix) else f"{base}{suffix}"
def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
    """Synthesize ``text`` via the OpenAI-compatible speech endpoint and store it as MP3.

    The request is a blocking HTTP POST to ``_azure_tts_url()``. The returned
    bytes are written to ``audio_script.mp3`` inside the job directory.

    Args:
        job_id: Job whose directory receives the audio file.
        text: Script to speak. It is stripped and truncated to the endpoint's
            documented maximum input length before being sent.
        voice_id: Voice name sent as the ``voice`` field.
        target_seconds: Desired clip length, forwarded to ``_voice_speed_for``
            to derive the ``speed`` field.

    Returns:
        The URL path under which the generated audio is exposed.

    Raises:
        RuntimeError: if the API key is missing, the text is blank, the
            endpoint answers with an HTTP error status, the body is empty, or
            the body is JSON instead of audio.
    """
    # The speech endpoint documents a 4096-character maximum for ``input``;
    # sending more is rejected, so clamp to that limit rather than a larger one.
    max_input_chars = 4096

    if not AZURE_OPENAI_API_KEY:
        raise RuntimeError("AZURE_OPENAI_API_KEY 或 LLM_API_KEY 未配置,未生成配音")
    if not text.strip():
        raise RuntimeError("改写文案为空,未生成配音")
    payload = {
        "model": AZURE_TTS_MODEL,
        "voice": voice_id,
        "input": text.strip()[:max_input_chars],
        "response_format": "mp3",
        "speed": _voice_speed_for(voice_id, target_seconds, text),
    }
    resp = httpx.post(
        _azure_tts_url(),
        headers={
            # Both header styles are sent: bearer token and Azure-style api-key.
            "Authorization": f"Bearer {AZURE_OPENAI_API_KEY}",
            "api-key": AZURE_OPENAI_API_KEY,
            "Content-Type": "application/json",
        },
        json=payload,
        timeout=120,
    )
    if resp.status_code >= 400:
        raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {resp.text[:300]}")
    audio_bytes = resp.content
    if not audio_bytes:
        raise RuntimeError("Azure OpenAI TTS 未返回音频内容")
    content_type = resp.headers.get("content-type", "")
    if "application/json" in content_type.lower():
        # A success status with a JSON body is an error payload, not audio;
        # surface it instead of saving it as an .mp3 file.
        try:
            data = resp.json()
        except Exception:
            data = {"error": resp.text[:300]}
        raise RuntimeError(f"Azure OpenAI TTS 返回 JSON 而不是音频:{str(data)[:300]}")
    out = job_dir(job_id) / "audio_script.mp3"
    out.write_bytes(audio_bytes)
    return f"/jobs/{job_id}/audio-script.mp3"
def _tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> tuple[str, str, str]:
    """Run text-to-speech with the configured provider.

    Returns:
        A ``(voice_url, provider_name, model_name)`` tuple, where the provider
        name is ``"azure_openai"`` or ``"minimax"``.
    """
    if VOICE_PROVIDER != "azure_openai":
        # Every value other than "azure_openai" is served by MiniMax.
        voice_url = _minimax_tts_sync(job_id, text, voice_id, target_seconds)
        return voice_url, "minimax", MINIMAX_TTS_MODEL
    voice_url = _azure_openai_tts_sync(job_id, text, voice_id, target_seconds)
    return voice_url, "azure_openai", AZURE_TTS_MODEL
def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], target_seconds: float = 12.0) -> AudioScript: def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], target_seconds: float = 12.0) -> AudioScript:
source_text = _transcript_join(segments, "en") source_text = _transcript_join(segments, "en")
source_zh = _transcript_join(segments, "zh") source_zh = _transcript_join(segments, "zh")
duration = max(float(target_seconds or 0), _segment_duration(segments), 4.0) duration = max(float(target_seconds or 0), _segment_duration(segments), 4.0)
rewritten, rewrite_error = _rewrite_audio_script_sync(segments, duration) rewritten, rewrite_error = _rewrite_audio_script_sync(segments, duration)
selected_voice_id = _choose_minimax_voice_id() selected_voice_id = _choose_tts_voice_id()
speaker_profile, rhythm_profile = _audio_delivery_profile(segments, duration, selected_voice_id) speaker_profile, rhythm_profile = _audio_delivery_profile(segments, duration, selected_voice_id)
voice_url = "" voice_url = ""
voice_error = "" voice_error = ""
voice_provider = "azure_openai" if VOICE_PROVIDER == "azure_openai" else "minimax"
voice_model = AZURE_TTS_MODEL if voice_provider == "azure_openai" else MINIMAX_TTS_MODEL
try: try:
voice_url = _minimax_tts_sync(job_id, rewritten, selected_voice_id, duration) voice_url, voice_provider, voice_model = _tts_sync(job_id, rewritten, selected_voice_id, duration)
except Exception as e: except Exception as e:
voice_error = str(e) voice_error = str(e)
# 改写失败时已有本地 SKG 模板兜底,不把它标成用户可见错误;配音失败才需要提示。 # 改写失败时已有本地 SKG 模板兜底,不把它标成用户可见错误;配音失败才需要提示。
@@ -2265,8 +2354,8 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], tar
rhythm_profile=rhythm_profile, rhythm_profile=rhythm_profile,
product_brief=AUDIO_PRODUCT_BRIEF, product_brief=AUDIO_PRODUCT_BRIEF,
rewrite_model=AUDIO_REWRITE_MODEL, rewrite_model=AUDIO_REWRITE_MODEL,
voice_provider="minimax", voice_provider=voice_provider,
voice_model=MINIMAX_TTS_MODEL, voice_model=voice_model,
voice_id=selected_voice_id, voice_id=selected_voice_id,
voice_url=voice_url, voice_url=voice_url,
error=errors, error=errors,
@@ -2453,8 +2542,8 @@ def _image_edit_call(
import time as _time import time as _time
import httpx import httpx
from PIL import Image as _PILImage from PIL import Image as _PILImage
if not LLM_API_KEY: if not IMAGE_API_KEY:
raise RuntimeError("LLM_API_KEY 未配置") raise RuntimeError("IMAGE_API_KEY 或 LLM_API_KEY 未配置")
# model 优先级models 列表 > 单个 model 参数 > IMAGE_MODEL # model 优先级models 列表 > 单个 model 参数 > IMAGE_MODEL
if models and len(models) > 0: if models and len(models) > 0:
models_cycle = list(models) models_cycle = list(models)
@@ -2489,9 +2578,9 @@ def _image_edit_call(
if current_mode == "edit": if current_mode == "edit":
with httpx.Client(timeout=120) as client: with httpx.Client(timeout=120) as client:
r = client.post( r = client.post(
f"{LLM_BASE_URL}/images/generations", f"{IMAGE_BASE_URL}/images/generations",
headers={ headers={
"Authorization": f"Bearer {LLM_API_KEY}", "Authorization": f"Bearer {IMAGE_API_KEY}",
"Content-Type": "application/json", "Content-Type": "application/json",
}, },
json={"model": current_model, "prompt": prompt, "image": data_uri, "n": 1}, json={"model": current_model, "prompt": prompt, "image": data_uri, "n": 1},
@@ -2499,7 +2588,7 @@ def _image_edit_call(
r.raise_for_status() r.raise_for_status()
resp_data = r.json() resp_data = r.json()
else: else:
resp = llm().images.generate(model=current_model, prompt=prompt, n=1) resp = image_llm().images.generate(model=current_model, prompt=prompt, n=1)
resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]} resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]}
if resp_data.get("data"): if resp_data.get("data"):
effective_mode = current_mode effective_mode = current_mode
@@ -2542,15 +2631,15 @@ def _image_text_call(
"""Text-only image generation with light model rotation.""" """Text-only image generation with light model rotation."""
import base64 as b64lib import base64 as b64lib
import time as _time import time as _time
if not LLM_API_KEY: if not IMAGE_API_KEY:
raise RuntimeError("LLM_API_KEY 未配置") raise RuntimeError("IMAGE_API_KEY 或 LLM_API_KEY 未配置")
models_cycle = list(models) if models else [model or IMAGE_MODEL] models_cycle = list(models) if models else [model or IMAGE_MODEL]
last_err = "" last_err = ""
resp_data: dict = {} resp_data: dict = {}
for attempt in range(max_attempts): for attempt in range(max_attempts):
current_model = models_cycle[min(attempt, len(models_cycle) - 1)] current_model = models_cycle[min(attempt, len(models_cycle) - 1)]
try: try:
resp = llm().images.generate(model=current_model, prompt=prompt, n=1) resp = image_llm().images.generate(model=current_model, prompt=prompt, n=1)
resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]} resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]}
if resp_data.get("data"): if resp_data.get("data"):
b64 = resp_data["data"][0].get("b64_json") b64 = resp_data["data"][0].get("b64_json")
@@ -2752,6 +2841,8 @@ def health() -> dict:
"llm_configured": bool(LLM_API_KEY), "llm_configured": bool(LLM_API_KEY),
"auth_configured": WEB_AUTH_CONFIGURED, "auth_configured": WEB_AUTH_CONFIGURED,
"base_url": LLM_BASE_URL or "openai-default", "base_url": LLM_BASE_URL or "openai-default",
"image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default",
"voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
"models": { "models": {
"asr": ASR_MODEL, "asr": ASR_MODEL,
"local_asr": LOCAL_ASR_MODEL, "local_asr": LOCAL_ASR_MODEL,
@@ -2761,9 +2852,16 @@ def health() -> dict:
"audio_rewrite": AUDIO_REWRITE_MODEL, "audio_rewrite": AUDIO_REWRITE_MODEL,
"vision": VISION_MODEL, "vision": VISION_MODEL,
"image": IMAGE_MODEL, "image": IMAGE_MODEL,
"image_fallbacks": [IMAGE_MODEL, "gemini-3.1-flash-image-preview", "gemini-2.5-flash-image"], "image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default",
"image_fallbacks": [IMAGE_MODEL, GPT_IMAGE_MODEL, "gpt-image-1.5"],
"subject_image": SUBJECT_ASSET_IMAGE_MODEL, "subject_image": SUBJECT_ASSET_IMAGE_MODEL,
"subject_image_fallbacks": SUBJECT_ASSET_IMAGE_MODELS, "subject_image_fallbacks": SUBJECT_ASSET_IMAGE_MODELS,
"voice_provider": VOICE_PROVIDER,
"voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
"voice_tts": AZURE_TTS_MODEL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_MODEL,
"voice_id": AZURE_TTS_VOICE_ID if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_VOICE_ID,
"voice_pool": AZURE_TTS_VOICE_POOL if VOICE_PROVIDER == "azure_openai" else (MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
"voice_configured": bool(AZURE_OPENAI_API_KEY) if VOICE_PROVIDER == "azure_openai" else bool(MINIMAX_API_KEY),
"minimax_tts": MINIMAX_TTS_MODEL, "minimax_tts": MINIMAX_TTS_MODEL,
"minimax_voice": MINIMAX_TTS_VOICE_ID, "minimax_voice": MINIMAX_TTS_VOICE_ID,
"minimax_voice_pool": MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID], "minimax_voice_pool": MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID],
@@ -3049,6 +3147,8 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job:
full_prompt = f"{full_prompt}. Avoid: {req.negative_prompt.strip()}" full_prompt = f"{full_prompt}. Avoid: {req.negative_prompt.strip()}"
if not full_prompt: if not full_prompt:
raise HTTPException(400, "prompt required") raise HTTPException(400, "prompt required")
if not IMAGE_API_KEY:
raise HTTPException(503, "IMAGE_API_KEY 或 LLM_API_KEY 未配置")
model = req.model or IMAGE_MODEL model = req.model or IMAGE_MODEL
gen_id = uuid.uuid4().hex[:12] gen_id = uuid.uuid4().hex[:12]
@@ -3075,9 +3175,9 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job:
# OpenAI SDK 不直接支持 image 参数,用底层 httpx # OpenAI SDK 不直接支持 image 参数,用底层 httpx
with httpx.Client(timeout=120) as client: with httpx.Client(timeout=120) as client:
r = client.post( r = client.post(
f"{LLM_BASE_URL}/images/generations", f"{IMAGE_BASE_URL}/images/generations",
headers={ headers={
"Authorization": f"Bearer {LLM_API_KEY}", "Authorization": f"Bearer {IMAGE_API_KEY}",
"Content-Type": "application/json", "Content-Type": "application/json",
}, },
json={ json={
@@ -3091,7 +3191,7 @@ def generate_image(job_id: str, idx: int, req: GenerateReq) -> Job:
resp_data = r.json() resp_data = r.json()
else: else:
# text-only # text-only
resp = llm().images.generate(model=model, prompt=full_prompt, n=1) resp = image_llm().images.generate(model=model, prompt=full_prompt, n=1)
resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]} resp_data = resp.model_dump() if hasattr(resp, "model_dump") else {"data": [{"b64_json": resp.data[0].b64_json}]}
if resp_data.get("data"): if resp_data.get("data"):