feat: improve subject generation workflow
This commit is contained in:
417
api/main.py
417
api/main.py
@@ -52,8 +52,18 @@ LOCAL_ASR_BIN = os.getenv("LOCAL_ASR_BIN", "").strip()
|
||||
LOCAL_ASR_MODEL = os.getenv("LOCAL_ASR_MODEL", "mlx-community/whisper-tiny").strip() or "mlx-community/whisper-tiny"
|
||||
LOCAL_ASR_TIMEOUT_SECONDS = max(30, int(os.getenv("LOCAL_ASR_TIMEOUT_SECONDS", "180")))
|
||||
TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
|
||||
REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
|
||||
VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
|
||||
DEFAULT_GPT_TEXT_MODEL = os.getenv("GPT_TEXT_MODEL", "gpt-4o").strip() or "gpt-4o"
|
||||
|
||||
|
||||
def gpt_model_env(name: str, default: str | None = None) -> str:
|
||||
value = os.getenv(name, default or DEFAULT_GPT_TEXT_MODEL).strip()
|
||||
if not value or value.lower().startswith("gemini-"):
|
||||
return default or DEFAULT_GPT_TEXT_MODEL
|
||||
return value
|
||||
|
||||
|
||||
REWRITE_MODEL = gpt_model_env("REWRITE_MODEL")
|
||||
VISION_MODEL = gpt_model_env("VISION_MODEL")
|
||||
IMAGE_BASE_URL = os.getenv("IMAGE_BASE_URL", LLM_BASE_URL).strip()
|
||||
IMAGE_API_KEY = os.getenv("IMAGE_API_KEY", LLM_API_KEY).strip()
|
||||
AI_HTTP_PROXY = (
|
||||
@@ -77,29 +87,14 @@ PRODUCT_ASSET_MIN_LONG_SIDE = max(512, int(os.getenv("PRODUCT_ASSET_MIN_LONG_SID
|
||||
PRODUCT_ASSET_MIN_SHORT_SIDE = max(320, int(os.getenv("PRODUCT_ASSET_MIN_SHORT_SIDE", "600")))
|
||||
PRODUCT_ASSET_JPEG_QUALITY = max(80, min(95, int(os.getenv("PRODUCT_ASSET_JPEG_QUALITY", "92"))))
|
||||
VIDEO_MODEL = os.getenv("VIDEO_MODEL", "seedance").strip() or "seedance"
|
||||
YTDLP_COOKIES_FILE = os.getenv("YTDLP_COOKIES_FILE", "").strip()
|
||||
YTDLP_COOKIES_FROM_BROWSER = os.getenv("YTDLP_COOKIES_FROM_BROWSER", "").strip()
|
||||
AUDIO_PRODUCT_BRIEF = os.getenv(
|
||||
"AUDIO_PRODUCT_BRIEF",
|
||||
"SKG 智能按摩产品,主打日常肩颈、腰背、眼部、膝盖或足部放松;广告表达要高级、干净、可信,不做医疗疗效承诺。",
|
||||
).strip()
|
||||
AUDIO_REWRITE_MODEL = os.getenv("AUDIO_REWRITE_MODEL", REWRITE_MODEL).strip() or REWRITE_MODEL
|
||||
MINIMAX_API_KEY = os.getenv("MINIMAX_API_KEY", "").strip()
|
||||
MINIMAX_TTS_BASE_URL = os.getenv("MINIMAX_TTS_BASE_URL", "https://api.minimax.io").strip().rstrip("/")
|
||||
MINIMAX_TTS_MODEL = os.getenv("MINIMAX_TTS_MODEL", "speech-2.8-turbo").strip() or "speech-2.8-turbo"
|
||||
MINIMAX_TTS_VOICE_ID = os.getenv(
|
||||
"MINIMAX_TTS_VOICE_ID",
|
||||
"English_expressive_narrator",
|
||||
).strip() or "English_expressive_narrator"
|
||||
DEFAULT_MINIMAX_TTS_VOICE_POOL = [
|
||||
"English_magnetic_voiced_man",
|
||||
"English_Upbeat_Woman",
|
||||
"English_MaturePartner",
|
||||
]
|
||||
MINIMAX_TTS_VOICE_POOL = [
|
||||
v.strip()
|
||||
for v in os.getenv("MINIMAX_TTS_VOICE_POOL", ",".join(DEFAULT_MINIMAX_TTS_VOICE_POOL)).split(",")
|
||||
if v.strip()
|
||||
]
|
||||
VOICE_PROVIDER = os.getenv("VOICE_PROVIDER", "azure_openai").strip().lower() or "azure_openai"
|
||||
AUDIO_REWRITE_MODEL = gpt_model_env("AUDIO_REWRITE_MODEL", REWRITE_MODEL)
|
||||
VOICE_PROVIDER = "azure_openai"
|
||||
AZURE_OPENAI_BASE_URL = os.getenv("AZURE_OPENAI_BASE_URL", "https://ai.skg.com/azure").strip().rstrip("/")
|
||||
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", LLM_API_KEY).strip()
|
||||
AZURE_TTS_MODEL = os.getenv("AZURE_TTS_MODEL", "gpt-4o-mini-tts").strip() or "gpt-4o-mini-tts"
|
||||
@@ -111,6 +106,11 @@ AZURE_TTS_VOICE_POOL = [
|
||||
if v.strip()
|
||||
]
|
||||
AZURE_TTS_PATH = os.getenv("AZURE_TTS_PATH", "/audio/speech").strip() or "/audio/speech"
|
||||
AZURE_TTS_PATHS = [
|
||||
p.strip()
|
||||
for p in os.getenv("AZURE_TTS_PATHS", f"{AZURE_TTS_PATH},/audio/speech,/v1/audio/speech").split(",")
|
||||
if p.strip()
|
||||
]
|
||||
|
||||
POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
|
||||
POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
|
||||
@@ -452,6 +452,7 @@ class CharacterLibraryItem(BaseModel):
|
||||
name: str
|
||||
folder: str = ""
|
||||
description: str = ""
|
||||
prompt_brief: str = ""
|
||||
primary_image: str = ""
|
||||
images: list[CharacterLibraryImage] = Field(default_factory=list)
|
||||
|
||||
@@ -477,6 +478,7 @@ class SubjectTemplateItem(BaseModel):
|
||||
name: str
|
||||
description: str = ""
|
||||
note: str = ""
|
||||
prompt_brief: str = ""
|
||||
source: Literal["database"] = "database"
|
||||
source_job_id: str = ""
|
||||
source_frame_idx: int = -1
|
||||
@@ -1075,6 +1077,35 @@ def run(cmd: list[str], cwd: Path | None = None) -> str:
|
||||
return res.stdout
|
||||
|
||||
|
||||
def ytdlp_cookie_args() -> list[str]:
    """Build the cookie-related command-line arguments for yt-dlp.

    ``YTDLP_COOKIES_FILE`` takes precedence over
    ``YTDLP_COOKIES_FROM_BROWSER``.

    Returns:
        ``["--cookies", <path>]``, ``["--cookies-from-browser", <browser>]``,
        or an empty list when neither setting is configured.

    Raises:
        RuntimeError: If ``YTDLP_COOKIES_FILE`` is set but does not point to
            a regular file.
    """
    if YTDLP_COOKIES_FILE:
        cookies = Path(YTDLP_COOKIES_FILE).expanduser()
        # is_file() rather than exists(): a directory at this path would pass
        # an existence check and only fail later inside yt-dlp.
        if not cookies.is_file():
            raise RuntimeError("TikTok cookies 文件不可用,请检查 YTDLP_COOKIES_FILE 配置。")
        return ["--cookies", str(cookies)]
    if YTDLP_COOKIES_FROM_BROWSER:
        return ["--cookies-from-browser", YTDLP_COOKIES_FROM_BROWSER]
    return []
|
||||
|
||||
|
||||
def normalize_download_error(error: Exception) -> str:
    """Turn a download exception into the error text stored on the job.

    Messages that look like a login/cookie requirement are wrapped in a hint
    telling the user to upload the file or configure yt-dlp cookies; the raw
    message is appended to the hint. Any other message is returned unchanged.

    Args:
        error: The exception raised while downloading.

    Returns:
        The hint plus raw message for auth failures, otherwise the raw
        message. When the exception has no message, its class name is used
        so the reported error is never blank.
    """
    raw = str(error) or type(error).__name__
    lower = raw.lower()
    # Parentheses make the intended and-before-or grouping explicit.
    auth_required = (
        "log in for access" in lower
        or ("login" in lower and "cookies" in lower)
        or "cookies-from-browser" in lower
        or ("sign in" in lower and "tiktok" in lower)
    )
    if auth_required:
        return (
            "TikTok 下载需要登录态。请上传视频文件,或在后端配置 "
            "YTDLP_COOKIES_FILE / YTDLP_COOKIES_FROM_BROWSER 后重试。"
            f"原始错误:{raw}"
        )
    return raw
|
||||
|
||||
|
||||
# ---- 启发式选帧工具 ----
|
||||
import imagehash
|
||||
import numpy as np
|
||||
@@ -1728,13 +1759,15 @@ def pipeline_download(job_id: str) -> None:
|
||||
update(job, status="downloading", message="本地上传 · 跳过下载", progress=15)
|
||||
else:
|
||||
update(job, status="downloading", message="yt-dlp 下载中…", progress=5)
|
||||
run([
|
||||
cmd = [
|
||||
"yt-dlp", "-f", "best[ext=mp4]/best",
|
||||
"-o", str(mp4),
|
||||
"--no-warnings", "--no-playlist",
|
||||
"--retries", "3",
|
||||
*ytdlp_cookie_args(),
|
||||
job.url,
|
||||
])
|
||||
]
|
||||
run(cmd)
|
||||
if not mp4.exists():
|
||||
raise RuntimeError("下载完成但找不到 source.mp4")
|
||||
|
||||
@@ -1757,7 +1790,7 @@ def pipeline_download(job_id: str) -> None:
|
||||
)
|
||||
except Exception as e:
|
||||
message = "视频元数据解析失败" if stage == "metadata" else "下载失败"
|
||||
update(job, status="failed", error=str(e), message=message)
|
||||
update(job, status="failed", error=normalize_download_error(e), message=message)
|
||||
|
||||
|
||||
def pipeline_analyze(
|
||||
@@ -1929,7 +1962,7 @@ def analyze_queue_worker() -> None:
|
||||
ANALYZE_WORKER_RUNNING = False
|
||||
|
||||
|
||||
# ---------- 音频转写 + 翻译 + SKG 改写 + MiniMax 配音 ----------
|
||||
# ---------- 音频转写 + 翻译 + SKG 改写 + Azure OpenAI 配音 ----------
|
||||
|
||||
class TranscriptionUnavailable(RuntimeError):
|
||||
pass
|
||||
@@ -2385,18 +2418,6 @@ def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds
|
||||
return fallback, f"改写失败,使用本地模板:{e}"
|
||||
|
||||
|
||||
def _minimax_tts_url() -> str:
|
||||
if MINIMAX_TTS_BASE_URL.endswith("/v1/t2a_v2"):
|
||||
return MINIMAX_TTS_BASE_URL
|
||||
return f"{MINIMAX_TTS_BASE_URL}/v1/t2a_v2"
|
||||
|
||||
|
||||
def _choose_minimax_voice_id() -> str:
|
||||
if MINIMAX_TTS_VOICE_POOL:
|
||||
return random.choice(MINIMAX_TTS_VOICE_POOL)
|
||||
return MINIMAX_TTS_VOICE_ID
|
||||
|
||||
|
||||
def _choose_azure_voice_id() -> str:
|
||||
if AZURE_TTS_VOICE_POOL:
|
||||
return random.choice(AZURE_TTS_VOICE_POOL)
|
||||
@@ -2404,9 +2425,7 @@ def _choose_azure_voice_id() -> str:
|
||||
|
||||
|
||||
def _choose_tts_voice_id() -> str:
    """Pick the voice id for the next text-to-speech request.

    Azure OpenAI is the only voice provider (``VOICE_PROVIDER`` is pinned to
    ``"azure_openai"``), so this delegates directly to the Azure voice picker
    with no provider branch.
    """
    return _choose_azure_voice_id()
|
||||
|
||||
|
||||
def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
|
||||
@@ -2423,60 +2442,22 @@ def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
|
||||
return 0.99
|
||||
|
||||
|
||||
def _minimax_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
|
||||
if not MINIMAX_API_KEY:
|
||||
raise RuntimeError("MINIMAX_API_KEY 未配置,未生成配音")
|
||||
if not text.strip():
|
||||
raise RuntimeError("改写文案为空,未生成配音")
|
||||
payload = {
|
||||
"model": MINIMAX_TTS_MODEL,
|
||||
"text": text.strip()[:9500],
|
||||
"stream": False,
|
||||
"language_boost": "English",
|
||||
"output_format": "hex",
|
||||
"voice_setting": {
|
||||
"voice_id": voice_id,
|
||||
"speed": _voice_speed_for(voice_id, target_seconds, text),
|
||||
"vol": 1,
|
||||
"pitch": 0,
|
||||
},
|
||||
"audio_setting": {
|
||||
"sample_rate": 32000,
|
||||
"bitrate": 128000,
|
||||
"format": "mp3",
|
||||
"channel": 1,
|
||||
},
|
||||
}
|
||||
resp = httpx.post(
|
||||
_minimax_tts_url(),
|
||||
headers={"Authorization": f"Bearer {MINIMAX_API_KEY}", "Content-Type": "application/json"},
|
||||
json=payload,
|
||||
timeout=90,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
base_resp = data.get("base_resp") or {}
|
||||
if int(base_resp.get("status_code", 0) or 0) != 0:
|
||||
raise RuntimeError(base_resp.get("status_msg") or "MiniMax TTS 返回失败")
|
||||
audio_hex = ((data.get("data") or {}).get("audio") or "").strip()
|
||||
if not audio_hex:
|
||||
raise RuntimeError("MiniMax TTS 未返回 audio hex")
|
||||
try:
|
||||
audio_bytes = bytes.fromhex(audio_hex)
|
||||
except ValueError as e:
|
||||
raise RuntimeError(f"MiniMax TTS audio hex 无法解析:{e}") from e
|
||||
out = job_dir(job_id) / "audio_script.mp3"
|
||||
out.write_bytes(audio_bytes)
|
||||
return f"/jobs/{job_id}/audio-script.mp3"
|
||||
|
||||
|
||||
def _azure_tts_url() -> str:
|
||||
path = AZURE_TTS_PATH if AZURE_TTS_PATH.startswith("/") else f"/{AZURE_TTS_PATH}"
|
||||
def _azure_tts_url_for(path_value: str) -> str:
    """Join ``AZURE_OPENAI_BASE_URL`` with one TTS endpoint path.

    A leading slash is added to ``path_value`` when missing. If the base URL
    already ends with that path, the base URL is returned as is so the path
    is not appended twice.
    """
    suffix = path_value
    if not suffix.startswith("/"):
        suffix = f"/{suffix}"
    already_targeted = AZURE_OPENAI_BASE_URL.endswith(suffix)
    return AZURE_OPENAI_BASE_URL if already_targeted else f"{AZURE_OPENAI_BASE_URL}{suffix}"
|
||||
|
||||
|
||||
def _azure_tts_urls() -> list[str]:
    """Return the candidate TTS endpoint URLs, de-duplicated, in configured order.

    Uses ``AZURE_TTS_PATHS`` when it is non-empty and falls back to the single
    ``AZURE_TTS_PATH`` otherwise.
    """
    candidate_paths = AZURE_TTS_PATHS or [AZURE_TTS_PATH]
    # dict.fromkeys keeps the first occurrence of each URL and preserves order.
    return list(dict.fromkeys(_azure_tts_url_for(path) for path in candidate_paths))
|
||||
|
||||
|
||||
def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
|
||||
if not AZURE_OPENAI_API_KEY:
|
||||
raise RuntimeError("AZURE_OPENAI_API_KEY 或 LLM_API_KEY 未配置,未生成配音")
|
||||
@@ -2489,18 +2470,32 @@ def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds
|
||||
"response_format": "mp3",
|
||||
"speed": _voice_speed_for(voice_id, target_seconds, text),
|
||||
}
|
||||
resp = httpx.post(
|
||||
_azure_tts_url(),
|
||||
headers={
|
||||
"Authorization": f"Bearer {AZURE_OPENAI_API_KEY}",
|
||||
"api-key": AZURE_OPENAI_API_KEY,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
json=payload,
|
||||
timeout=120,
|
||||
)
|
||||
headers = {
|
||||
"Authorization": f"Bearer {AZURE_OPENAI_API_KEY}",
|
||||
"api-key": AZURE_OPENAI_API_KEY,
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
resp: httpx.Response | None = None
|
||||
errors: list[str] = []
|
||||
with ai_http_client(timeout=120) as client:
|
||||
for url in _azure_tts_urls():
|
||||
try:
|
||||
current = client.post(url, headers=headers, json=payload)
|
||||
except Exception as e:
|
||||
errors.append(f"{url}: {type(e).__name__}: {e}")
|
||||
continue
|
||||
if current.status_code < 400:
|
||||
resp = current
|
||||
break
|
||||
errors.append(f"{url}: HTTP {current.status_code}: {current.text[:180]}")
|
||||
if current.status_code not in {404, 405}:
|
||||
resp = current
|
||||
break
|
||||
if resp is None:
|
||||
raise RuntimeError("Azure OpenAI TTS 不可用;已尝试 " + " | ".join(errors))
|
||||
if resp.status_code >= 400:
|
||||
raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {resp.text[:300]}")
|
||||
detail = " | ".join(errors) or resp.text[:300]
|
||||
raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {detail[:600]}")
|
||||
audio_bytes = resp.content
|
||||
if not audio_bytes:
|
||||
raise RuntimeError("Azure OpenAI TTS 未返回音频内容")
|
||||
@@ -2517,9 +2512,7 @@ def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds
|
||||
|
||||
|
||||
def _tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> tuple[str, str, str]:
    """Synthesize speech for ``text`` and report which backend produced it.

    Azure OpenAI is the only voice provider, so the call goes straight to the
    Azure implementation with no provider branch.

    Returns:
        A ``(voice_url, provider, model)`` tuple, where ``provider`` is always
        ``"azure_openai"`` and ``model`` is ``AZURE_TTS_MODEL``.
    """
    voice_url = _azure_openai_tts_sync(job_id, text, voice_id, target_seconds)
    return voice_url, "azure_openai", AZURE_TTS_MODEL
|
||||
|
||||
|
||||
def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], target_seconds: float = 12.0) -> AudioScript:
|
||||
@@ -2531,8 +2524,8 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], tar
|
||||
speaker_profile, rhythm_profile = _audio_delivery_profile(segments, duration, selected_voice_id)
|
||||
voice_url = ""
|
||||
voice_error = ""
|
||||
voice_provider = "azure_openai" if VOICE_PROVIDER == "azure_openai" else "minimax"
|
||||
voice_model = AZURE_TTS_MODEL if voice_provider == "azure_openai" else MINIMAX_TTS_MODEL
|
||||
voice_provider = "azure_openai"
|
||||
voice_model = AZURE_TTS_MODEL
|
||||
try:
|
||||
voice_url, voice_provider, voice_model = _tts_sync(job_id, rewritten, selected_voice_id, duration)
|
||||
except Exception as e:
|
||||
@@ -2944,6 +2937,83 @@ def _image_text_call(
|
||||
raise RuntimeError(_image_failure_message("image text", max_attempts, last_err, capacity_seen))
|
||||
|
||||
|
||||
def _image_path_to_data_url(path: Path) -> str:
    """Read an image file and return it as a base64 ``data:`` URL.

    The media type comes from the file suffix, compared case-insensitively.
    ``.png``, ``.webp`` and ``.gif`` get their own types; every other suffix
    is labelled ``image/jpeg``, as before.

    Args:
        path: Image file to embed.

    Returns:
        A string of the form ``data:<media type>;base64,<payload>``.
    """
    media_types = {
        ".png": "image/png",
        ".webp": "image/webp",
        ".gif": "image/gif",
    }
    media_type = media_types.get(path.suffix.lower(), "image/jpeg")
    encoded = base64.b64encode(path.read_bytes()).decode("ascii")
    return f"data:{media_type};base64,{encoded}"
|
||||
|
||||
|
||||
def _vision_brief_from_images(image_paths: list[Path], prompt: str, max_images: int = 8) -> str:
    """Ask the vision model for a JSON brief describing the given images.

    Args:
        image_paths: Candidate image files; paths that do not exist are
            dropped and at most ``max_images`` of the rest are sent.
        prompt: Instruction text placed ahead of the images.
        max_images: Upper bound on the number of images attached.

    Returns:
        The brief text (at most 1800 characters), or ``""`` when no image
        exists, ``LLM_API_KEY`` is unset, the request fails, or the reply
        contains no usable JSON fields. Failures are printed, not raised.
    """
    paths = [path for path in image_paths if path.exists()][:max_images]
    if not paths or not LLM_API_KEY:
        return ""

    content: list[dict] = [{"type": "text", "text": prompt}]
    content.extend(
        {"type": "image_url", "image_url": {"url": _image_path_to_data_url(path)}}
        for path in paths
    )
    try:
        resp = llm().chat.completions.create(
            model=VISION_MODEL,
            messages=[{"role": "user", "content": content}],
            response_format={"type": "json_object"},
            temperature=0.1,
            max_tokens=1400,
        )
        message = resp.choices[0].message
        raw = (message.content or "").strip()
        if not raw:
            # Fall back to the reasoning_content attribute when content is empty.
            raw = (getattr(message, "reasoning_content", "") or "").strip()
        # Tolerate text around the JSON object by taking the outermost braces.
        match = re.search(r"\{[\s\S]*\}", raw)
        data = json.loads(match.group(0) if match else raw)
    except Exception as e:
        # Best effort: callers treat an empty brief as "unavailable".
        print(f"[vision brief failed] {e}", flush=True)
        return ""

    return _brief_from_vision_payload(data)


def _brief_from_vision_payload(data: object, limit: int = 1800) -> str:
    """Flatten a parsed vision-model reply into a single brief string.

    Prefers a non-blank ``brief`` field. Otherwise joins the known trait
    fields as ``"key name: value"`` pairs. The field list covers the keys
    requested by both the source-keyframe prompt and the saved-template
    prompt, so neither loses traits when ``brief`` is missing.
    """
    if not isinstance(data, dict):
        return ""
    brief = data.get("brief")
    if isinstance(brief, str) and brief.strip():
        return brief.strip()[:limit]

    trait_keys = (
        "gender_presentation", "age_range", "body_proportion", "hair", "skin_tone",
        "wardrobe_style", "pose_language", "camera_visibility", "commercial_mood",
        "neck_shoulder_readiness", "style_constraints",
        # Keys requested only by the saved-template prompt.
        "material_or_skin", "wardrobe_or_surface_style", "camera_readability",
    )
    parts: list[str] = []
    for key in trait_keys:
        value = data.get(key)
        if isinstance(value, str) and value.strip():
            parts.append(f"{key.replace('_', ' ')}: {value.strip()}")
    return "; ".join(parts)[:limit]
|
||||
|
||||
|
||||
def _describe_source_subject(job_id: str, source_indices: list[int]) -> str:
    """Summarize the selected source keyframes as a non-identifying role brief.

    Resolves each index in ``source_indices`` to its frame path for ``job_id``
    and sends up to eight frames to the vision model with a prompt that
    forbids identifying the person. Returns whatever
    ``_vision_brief_from_images`` returns.
    """
    frame_paths = [_source_frame_path(job_id, frame_idx) for frame_idx in source_indices]
    instructions = "".join((
        "You are preparing a non-identifying character brief for generating a NEW similar but non-identical ad subject. ",
        "Look at these source video keyframes as evidence of one role and style, not as a person to identify. ",
        "Do NOT identify the person, do NOT estimate exact age, do NOT describe biometric identity, and do NOT mention celebrity or real-person likeness. ",
        "Output strict JSON only. Use broad style traits suitable for text-to-image generation.\n",
        "Required keys: gender_presentation, age_range, body_proportion, hair, skin_tone, wardrobe_style, ",
        "pose_language, camera_visibility, commercial_mood, neck_shoulder_readiness, style_constraints, brief.\n",
        "The brief should be 80-140 words and should preserve category, role, energy, camera readability, and commercial atmosphere while explicitly allowing a new non-identical subject.",
    ))
    return _vision_brief_from_images(frame_paths, instructions, max_images=8)
|
||||
|
||||
|
||||
def _describe_subject_template_from_images(name: str, subject_style: str, image_paths: list[Path], note: str = "") -> str:
    """Summarize a saved subject template's images as a reusable text brief.

    Builds a prompt from the template ``name``, its ``subject_style`` and the
    first 500 characters of ``note``, then sends up to ten of ``image_paths``
    to the vision model. Returns whatever ``_vision_brief_from_images``
    returns.
    """
    trimmed_note = note[:500]
    instructions = "".join((
        f"You are summarizing a saved SKG subject template named '{name}' for future text-to-image generation. ",
        f"Subject style: {subject_style}. User note: {trimmed_note}. ",
        "Look at the subject views and describe the reusable creative direction without copying identity or pixels. ",
        "Do NOT identify a person and do NOT describe exact facial identity. ",
        "Output strict JSON only with keys: gender_presentation, age_range, body_proportion, material_or_skin, ",
        "wardrobe_or_surface_style, pose_language, camera_readability, neck_shoulder_readiness, commercial_mood, brief. ",
        "The brief should be 80-140 words and must be useful as a reference character brief for creating a new innovative variation.",
    ))
    return _vision_brief_from_images(image_paths, instructions, max_images=10)
|
||||
|
||||
|
||||
# ---------- API 路由 ----------
|
||||
|
||||
class CreateJobReq(BaseModel):
|
||||
@@ -3130,7 +3200,7 @@ def health() -> dict:
|
||||
"auth_configured": WEB_AUTH_CONFIGURED,
|
||||
"base_url": LLM_BASE_URL or "openai-default",
|
||||
"image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default",
|
||||
"voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
|
||||
"voice_base_url": AZURE_OPENAI_BASE_URL,
|
||||
"models": {
|
||||
"asr": ASR_MODEL,
|
||||
"local_asr": LOCAL_ASR_MODEL,
|
||||
@@ -3147,15 +3217,12 @@ def health() -> dict:
|
||||
"subject_image": SUBJECT_ASSET_IMAGE_MODEL,
|
||||
"subject_image_fallbacks": SUBJECT_ASSET_IMAGE_MODELS,
|
||||
"voice_provider": VOICE_PROVIDER,
|
||||
"voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
|
||||
"voice_tts": AZURE_TTS_MODEL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_MODEL,
|
||||
"voice_id": AZURE_TTS_VOICE_ID if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_VOICE_ID,
|
||||
"voice_pool": AZURE_TTS_VOICE_POOL if VOICE_PROVIDER == "azure_openai" else (MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
|
||||
"voice_configured": bool(AZURE_OPENAI_API_KEY) if VOICE_PROVIDER == "azure_openai" else bool(MINIMAX_API_KEY),
|
||||
"minimax_tts": MINIMAX_TTS_MODEL,
|
||||
"minimax_voice": MINIMAX_TTS_VOICE_ID,
|
||||
"minimax_voice_pool": MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID],
|
||||
"minimax_configured": bool(MINIMAX_API_KEY),
|
||||
"voice_base_url": AZURE_OPENAI_BASE_URL,
|
||||
"voice_tts": AZURE_TTS_MODEL,
|
||||
"voice_tts_paths": AZURE_TTS_PATHS,
|
||||
"voice_id": AZURE_TTS_VOICE_ID,
|
||||
"voice_pool": AZURE_TTS_VOICE_POOL,
|
||||
"voice_configured": bool(AZURE_OPENAI_API_KEY),
|
||||
"video": VIDEO_MODEL,
|
||||
"video_aliases": VIDEO_MODEL_ALIASES,
|
||||
"video_provider": video_provider_name(),
|
||||
@@ -3225,6 +3292,31 @@ async def create_job(req: CreateJobReq, bg: BackgroundTasks) -> Job:
|
||||
return job
|
||||
|
||||
|
||||
@app.post("/jobs/{job_id}/download/retry", response_model=Job)
async def retry_job_download(job_id: str, bg: BackgroundTasks) -> Job:
    """Re-queue the download stage for an existing URL-based job.

    Responds 404 when the job id is unknown, and 409 when the job came from
    an upload or is currently downloading, splitting or transcribing.
    Otherwise removes a zero-byte ``source.mp4`` left in the job directory,
    resets the job's status fields and schedules ``pipeline_download`` as a
    background task.
    """
    job = JOBS.get(job_id)
    if not job:
        raise HTTPException(404, "job not found")

    came_from_upload = job.source_kind == "upload" or job.url.startswith("upload://")
    if came_from_upload:
        raise HTTPException(409, "uploaded videos cannot be redownloaded; upload the file again")

    busy_states = {"downloading", "splitting", "transcribing"}
    if job.status in busy_states:
        raise HTTPException(409, f"job is busy: {job.status}")

    # Only an empty leftover file is removed; a non-empty source.mp4 is kept.
    source_file = job_dir(job_id) / "source.mp4"
    if source_file.exists() and source_file.stat().st_size == 0:
        source_file.unlink()

    reset_fields = {
        "status": "downloading",
        "progress": 1,
        "error": "",
        "message": "重新提交下载…",
        "video_url": "",
    }
    update(job, **reset_fields)
    bg.add_task(pipeline_download, job_id)
    return job
|
||||
|
||||
|
||||
@app.post("/jobs/upload", response_model=Job)
|
||||
async def create_job_from_upload(bg: BackgroundTasks, file: UploadFile = File(...)) -> Job:
|
||||
if not file.filename:
|
||||
@@ -4308,43 +4400,56 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
|
||||
source_indices = [idx] + source_indices
|
||||
source_indices = list(dict.fromkeys(source_indices))[:12]
|
||||
|
||||
similar_mode = req.reconstruction_mode == "similar"
|
||||
character_reference_paths: list[Path] = []
|
||||
character_reference_clause = ""
|
||||
template_brief_clause = ""
|
||||
character_label = ""
|
||||
subject_template_id = (req.subject_template_id or "").strip()
|
||||
character_id = (req.character_id or "").strip()
|
||||
if subject_template_id:
|
||||
template = find_subject_template_item(subject_template_id)
|
||||
character_label = template.name
|
||||
for image in template.images[:10]:
|
||||
character_reference_paths.append(subject_template_image_file(image.filename))
|
||||
character_reference_clause = (
|
||||
f"Selected reusable subject template from database: {template.name}. "
|
||||
"Use these saved generated subject views as a high-quality creative direction and identity bible only; "
|
||||
"do not copy pixels, file artifacts, exact pose, labels, or accidental defects. "
|
||||
"Create a new innovative variation that keeps the same broad subject type, transparent wellness character language, "
|
||||
"camera readability, shoulder/neck product compatibility, and commercial role. "
|
||||
template_paths = [subject_template_image_file(image.filename) for image in template.images[:10]]
|
||||
character_reference_paths.extend(template_paths)
|
||||
brief = template.prompt_brief.strip() or template.note.strip() or template.description.strip()
|
||||
if similar_mode and not brief:
|
||||
brief = _describe_subject_template_from_images(template.name, template.subject_style, template_paths, template.note)
|
||||
template_brief_clause = (
|
||||
f"Reference character brief from saved database template '{template.name}': {brief}. "
|
||||
"Use this as a high-quality creative direction and identity bible only; do not copy a face, exact pose, pixels, file artifacts, labels, or accidental defects. "
|
||||
"Create a new innovative variation that keeps the same broad subject type, transparent wellness character language, camera readability, shoulder/neck product compatibility, and commercial role. "
|
||||
if brief else
|
||||
f"Selected reusable subject template from database: {template.name}. Create a new innovative variation, not a duplicate. "
|
||||
)
|
||||
elif character_id:
|
||||
character = find_character_library_item(character_id)
|
||||
character_label = character.name
|
||||
for image in character.images[:7]:
|
||||
character_reference_paths.append(character_library_file(image.filename))
|
||||
character_reference_clause = (
|
||||
f"Selected built-in creative character reference: {character.name}. "
|
||||
"Use these planned character images as a high-quality creative direction and anatomy/style bible only; "
|
||||
character_reference_paths.extend(character_library_file(image.filename) for image in character.images[:7])
|
||||
brief = character.prompt_brief.strip() or character.description.strip()
|
||||
template_brief_clause = (
|
||||
f"Reference character brief from built-in creative character '{character.name}': {brief}. "
|
||||
"Use this planned character brief as a high-quality creative direction and anatomy/style bible only; "
|
||||
"do not copy the exact face, exact pose, exact silhouette, pixels, or make a duplicate. "
|
||||
"Create a new innovative variation that keeps the same broad role, transparent wellness character language, "
|
||||
"camera readability, and shoulder/neck product compatibility. "
|
||||
"Create a new innovative variation that keeps the same broad role, transparent wellness character language, camera readability, and shoulder/neck product compatibility. "
|
||||
)
|
||||
|
||||
model_src, tmp_focus = _focus_source_for_element(job_id, idx, el)
|
||||
tmp_focus: Path | None = None
|
||||
model_src: Path | list[Path] | None = None
|
||||
frame_reference_paths = [p for p in (_source_frame_path(job_id, i) for i in source_indices) if p.exists()]
|
||||
if character_reference_paths:
|
||||
remaining = max(0, 10 - len(character_reference_paths))
|
||||
model_src = character_reference_paths + frame_reference_paths[:remaining]
|
||||
elif len(frame_reference_paths) > 1:
|
||||
model_src = frame_reference_paths[:10]
|
||||
source_subject_brief = _describe_source_subject(job_id, source_indices) if similar_mode else ""
|
||||
source_subject_clause = (
|
||||
f"Source video role brief from selected keyframes: {source_subject_brief}. "
|
||||
"Use this brief to preserve role category, creator-ad energy, camera readability, and broad styling, while creating a new non-identical subject. "
|
||||
if source_subject_brief else
|
||||
"Source video role brief unavailable; create a new non-identical ad subject guided by the user direction, template brief, and requested view. "
|
||||
)
|
||||
if not similar_mode:
|
||||
model_src, tmp_focus = _focus_source_for_element(job_id, idx, el)
|
||||
if character_reference_paths:
|
||||
remaining = max(0, 10 - len(character_reference_paths))
|
||||
model_src = character_reference_paths + frame_reference_paths[:remaining]
|
||||
elif len(frame_reference_paths) > 1:
|
||||
model_src = frame_reference_paths[:10]
|
||||
|
||||
try:
|
||||
with Image.open(_source_frame_path(job_id, idx)) as src_im:
|
||||
@@ -4371,7 +4476,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
|
||||
)
|
||||
actor_style_clause = (
|
||||
"Generate a believable normal commercial video actor, not a transparent or skeleton character. "
|
||||
"Use the references to understand the source video's casting direction, age range, gender presentation, body proportion, wardrobe category, gesture vocabulary, framing, energy, lighting, and creator-ad style. "
|
||||
"Use the text briefs to understand the source video's casting direction, age range, gender presentation, body proportion, wardrobe category, gesture vocabulary, framing, energy, lighting, and creator-ad style. "
|
||||
"Do not recreate the exact person's face, biometric identity, unique likeness, tattoos, scars, logos, watermarks, captions, or platform UI. "
|
||||
"The output must be a newly designed similar actor that could play the same role in a new ad, with consistent identity across all views. "
|
||||
if similar_actor
|
||||
@@ -4386,7 +4491,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
|
||||
prompt_extra_clause = f"User direction: {prompt_extra[:1200]} " if prompt_extra else ""
|
||||
identity_lock_clause = (
|
||||
"Identity lock: these API calls generate one high-definition multi-view pack for ONE single subject, but each individual output file must show only its one requested view. "
|
||||
"Before rendering, infer one consistent character bible from the reference image(s): gender presentation, age range, body proportions, head shape, face direction cues, material, silhouette, wardrobe/material style, and commercial mood. "
|
||||
"Before rendering, infer one consistent character bible from the supplied text brief and generation instructions: gender presentation, age range, body proportions, head shape, face direction cues, material, silhouette, wardrobe/material style, and commercial mood. "
|
||||
"Keep that same character bible unchanged across every generated view in separate files. "
|
||||
"If user direction requests a gender, age, or style change, apply that one change uniformly to all views; never mix male/female, young/old, or multiple style identities inside the same pack. "
|
||||
"For transparent humanoids, keep the same transparent skin shell, skeleton proportions, visible spine/rib cage/pelvis/limb bones, and non-horror wellness character style in every view. "
|
||||
@@ -4427,14 +4532,22 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
|
||||
if closeup_view and req.subject_kind == "living"
|
||||
else "The subject must be complete, centered, full body or full object, head-to-feet visible when applicable, not cropped by the canvas. Make the subject large and readable: it should occupy about 85-95% of the image height with only small margins. "
|
||||
)
|
||||
reference_strategy_clause = (
|
||||
"Text-only generation mode: no source image is attached to this image request. Use only the written source/video/template briefs below as creative constraints. "
|
||||
"This is intentionally NOT image editing and NOT identity replication. "
|
||||
+ source_subject_clause
|
||||
+ template_brief_clause
|
||||
if similar_mode else
|
||||
"Use the reference image(s) only as visual evidence; do not crop, cut out, paste, trace, or extract pixels from the source. "
|
||||
)
|
||||
prompt = (
|
||||
f"Use the reference image(s) only as visual evidence; do not crop, cut out, paste, trace, or extract pixels from the source. "
|
||||
reference_strategy_clause
|
||||
+
|
||||
f"Generate one newly rendered {view_prompt} for {target}. "
|
||||
f"The subject is a {kind_phrase}. If multiple frames are shown, treat them as evidence of one same subject, not multiple subjects. "
|
||||
f"The subject is a {kind_phrase}. Treat all source evidence as one role and one consistent subject bible, not multiple subjects. "
|
||||
+ single_view_clause
|
||||
+ identity_clause
|
||||
+ identity_lock_clause
|
||||
+ character_reference_clause
|
||||
+ neck_product_clause
|
||||
+ canvas_clause
|
||||
+ prompt_extra_clause
|
||||
@@ -4447,7 +4560,16 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
|
||||
+ transparent_character_clause
|
||||
)
|
||||
try:
|
||||
img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
|
||||
if similar_mode:
|
||||
print(
|
||||
f"[subject assets] reconstruction_mode=similar endpoint=/images/generations view={view} image_refs=0 model={GPT_IMAGE_MODEL}",
|
||||
flush=True,
|
||||
)
|
||||
img_bytes, _mode = _image_text_call(prompt, models=models, max_attempts=3)
|
||||
else:
|
||||
if model_src is None:
|
||||
raise RuntimeError("subject asset edit reference image missing")
|
||||
img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
|
||||
except RuntimeError as e:
|
||||
raise HTTPException(_image_error_status(e), f"subject asset {view} failed: {e}")
|
||||
|
||||
@@ -5026,6 +5148,7 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
|
||||
template_dir.mkdir(parents=True, exist_ok=True)
|
||||
now = _time.time()
|
||||
images: list[SubjectTemplateImage] = []
|
||||
saved_image_paths: list[Path] = []
|
||||
for asset in selected_assets:
|
||||
src = job_dir(job_id) / "assets" / f"{asset.id}.jpg"
|
||||
if not src.exists():
|
||||
@@ -5034,6 +5157,7 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
|
||||
filename = f"{template_id}/{image_id}.jpg"
|
||||
dst = SUBJECT_TEMPLATE_IMAGE_DIR / filename
|
||||
shutil.copy2(src, dst)
|
||||
saved_image_paths.append(dst)
|
||||
images.append(SubjectTemplateImage(
|
||||
id=image_id,
|
||||
view=asset.view,
|
||||
@@ -5053,11 +5177,18 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
|
||||
raise HTTPException(404, "subject asset files missing")
|
||||
|
||||
primary = next((image.id for image in images if image.view == "front"), images[0].id)
|
||||
prompt_brief = _describe_subject_template_from_images(
|
||||
name,
|
||||
req.subject_style,
|
||||
saved_image_paths,
|
||||
req.note.strip(),
|
||||
) or req.note.strip()
|
||||
item = SubjectTemplateItem(
|
||||
id=template_id,
|
||||
name=name,
|
||||
description=req.note.strip(),
|
||||
note=req.note.strip(),
|
||||
prompt_brief=prompt_brief,
|
||||
source_job_id=job_id,
|
||||
source_frame_idx=frame.index,
|
||||
source_element_id=element.id,
|
||||
|
||||
Reference in New Issue
Block a user