feat: improve subject generation workflow

This commit is contained in:
2026-05-18 17:44:52 +08:00
parent 78bd294d57
commit 1f600ae436
12 changed files with 682 additions and 372 deletions

View File

@@ -52,8 +52,18 @@ LOCAL_ASR_BIN = os.getenv("LOCAL_ASR_BIN", "").strip()
LOCAL_ASR_MODEL = os.getenv("LOCAL_ASR_MODEL", "mlx-community/whisper-tiny").strip() or "mlx-community/whisper-tiny"
LOCAL_ASR_TIMEOUT_SECONDS = max(30, int(os.getenv("LOCAL_ASR_TIMEOUT_SECONDS", "180")))
TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
REWRITE_MODEL = os.getenv("REWRITE_MODEL", "gemini-2.5-pro")
VISION_MODEL = os.getenv("VISION_MODEL", "gemini-2.5-flash")
DEFAULT_GPT_TEXT_MODEL = os.getenv("GPT_TEXT_MODEL", "gpt-4o").strip() or "gpt-4o"
def gpt_model_env(name: str, default: str | None = None) -> str:
value = os.getenv(name, default or DEFAULT_GPT_TEXT_MODEL).strip()
if not value or value.lower().startswith("gemini-"):
return default or DEFAULT_GPT_TEXT_MODEL
return value
REWRITE_MODEL = gpt_model_env("REWRITE_MODEL")
VISION_MODEL = gpt_model_env("VISION_MODEL")
IMAGE_BASE_URL = os.getenv("IMAGE_BASE_URL", LLM_BASE_URL).strip()
IMAGE_API_KEY = os.getenv("IMAGE_API_KEY", LLM_API_KEY).strip()
AI_HTTP_PROXY = (
@@ -77,29 +87,14 @@ PRODUCT_ASSET_MIN_LONG_SIDE = max(512, int(os.getenv("PRODUCT_ASSET_MIN_LONG_SID
PRODUCT_ASSET_MIN_SHORT_SIDE = max(320, int(os.getenv("PRODUCT_ASSET_MIN_SHORT_SIDE", "600")))
PRODUCT_ASSET_JPEG_QUALITY = max(80, min(95, int(os.getenv("PRODUCT_ASSET_JPEG_QUALITY", "92"))))
VIDEO_MODEL = os.getenv("VIDEO_MODEL", "seedance").strip() or "seedance"
YTDLP_COOKIES_FILE = os.getenv("YTDLP_COOKIES_FILE", "").strip()
YTDLP_COOKIES_FROM_BROWSER = os.getenv("YTDLP_COOKIES_FROM_BROWSER", "").strip()
AUDIO_PRODUCT_BRIEF = os.getenv(
"AUDIO_PRODUCT_BRIEF",
"SKG 智能按摩产品,主打日常肩颈、腰背、眼部、膝盖或足部放松;广告表达要高级、干净、可信,不做医疗疗效承诺。",
).strip()
AUDIO_REWRITE_MODEL = os.getenv("AUDIO_REWRITE_MODEL", REWRITE_MODEL).strip() or REWRITE_MODEL
MINIMAX_API_KEY = os.getenv("MINIMAX_API_KEY", "").strip()
MINIMAX_TTS_BASE_URL = os.getenv("MINIMAX_TTS_BASE_URL", "https://api.minimax.io").strip().rstrip("/")
MINIMAX_TTS_MODEL = os.getenv("MINIMAX_TTS_MODEL", "speech-2.8-turbo").strip() or "speech-2.8-turbo"
MINIMAX_TTS_VOICE_ID = os.getenv(
"MINIMAX_TTS_VOICE_ID",
"English_expressive_narrator",
).strip() or "English_expressive_narrator"
DEFAULT_MINIMAX_TTS_VOICE_POOL = [
"English_magnetic_voiced_man",
"English_Upbeat_Woman",
"English_MaturePartner",
]
MINIMAX_TTS_VOICE_POOL = [
v.strip()
for v in os.getenv("MINIMAX_TTS_VOICE_POOL", ",".join(DEFAULT_MINIMAX_TTS_VOICE_POOL)).split(",")
if v.strip()
]
VOICE_PROVIDER = os.getenv("VOICE_PROVIDER", "azure_openai").strip().lower() or "azure_openai"
AUDIO_REWRITE_MODEL = gpt_model_env("AUDIO_REWRITE_MODEL", REWRITE_MODEL)
VOICE_PROVIDER = "azure_openai"
AZURE_OPENAI_BASE_URL = os.getenv("AZURE_OPENAI_BASE_URL", "https://ai.skg.com/azure").strip().rstrip("/")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", LLM_API_KEY).strip()
AZURE_TTS_MODEL = os.getenv("AZURE_TTS_MODEL", "gpt-4o-mini-tts").strip() or "gpt-4o-mini-tts"
@@ -111,6 +106,11 @@ AZURE_TTS_VOICE_POOL = [
if v.strip()
]
AZURE_TTS_PATH = os.getenv("AZURE_TTS_PATH", "/audio/speech").strip() or "/audio/speech"
AZURE_TTS_PATHS = [
p.strip()
for p in os.getenv("AZURE_TTS_PATHS", f"{AZURE_TTS_PATH},/audio/speech,/v1/audio/speech").split(",")
if p.strip()
]
POE_API_BASE_URL = os.getenv("POE_API_BASE_URL", "https://api.poe.com/v1").strip() or "https://api.poe.com/v1"
POE_API_KEY = os.getenv("POE_API_KEY", "").strip()
@@ -452,6 +452,7 @@ class CharacterLibraryItem(BaseModel):
name: str
folder: str = ""
description: str = ""
prompt_brief: str = ""
primary_image: str = ""
images: list[CharacterLibraryImage] = Field(default_factory=list)
@@ -477,6 +478,7 @@ class SubjectTemplateItem(BaseModel):
name: str
description: str = ""
note: str = ""
prompt_brief: str = ""
source: Literal["database"] = "database"
source_job_id: str = ""
source_frame_idx: int = -1
@@ -1075,6 +1077,35 @@ def run(cmd: list[str], cwd: Path | None = None) -> str:
return res.stdout
def ytdlp_cookie_args() -> list[str]:
    """Build the yt-dlp command-line options that supply login cookies.

    Returns:
        ``["--cookies", <path>]`` when ``YTDLP_COOKIES_FILE`` is set,
        ``["--cookies-from-browser", <browser>]`` when only
        ``YTDLP_COOKIES_FROM_BROWSER`` is set, otherwise an empty list.

    Raises:
        RuntimeError: ``YTDLP_COOKIES_FILE`` is set but does not point to a
            regular file.
    """
    if YTDLP_COOKIES_FILE:
        cookies = Path(YTDLP_COOKIES_FILE).expanduser()
        # is_file() rather than exists(): a directory would pass exists() and
        # only fail later inside yt-dlp with a much less helpful message.
        if not cookies.is_file():
            raise RuntimeError("TikTok cookies 文件不可用,请检查 YTDLP_COOKIES_FILE 配置。")
        return ["--cookies", str(cookies)]
    if YTDLP_COOKIES_FROM_BROWSER:
        return ["--cookies-from-browser", YTDLP_COOKIES_FROM_BROWSER]
    return []
def normalize_download_error(error: Exception) -> str:
    """Convert a download exception into the text stored on the job.

    When the error text looks like a login/cookie requirement, the original
    message is appended to a hint explaining how to provide a logged-in
    session. Any other error is returned as its plain ``str()`` text.
    """
    raw = str(error)
    text = raw.lower()
    # Each entry is one independent signal that the site demanded a login.
    needs_login = any((
        "log in for access" in text,
        "login" in text and "cookies" in text,
        "cookies-from-browser" in text,
        "sign in" in text and "tiktok" in text,
    ))
    if not needs_login:
        return raw
    hint = (
        "TikTok 下载需要登录态。请上传视频文件,或在后端配置 "
        "YTDLP_COOKIES_FILE / YTDLP_COOKIES_FROM_BROWSER 后重试。"
    )
    return f"{hint}原始错误:{raw}"
# ---- 启发式选帧工具 ----
import imagehash
import numpy as np
@@ -1728,13 +1759,15 @@ def pipeline_download(job_id: str) -> None:
update(job, status="downloading", message="本地上传 · 跳过下载", progress=15)
else:
update(job, status="downloading", message="yt-dlp 下载中…", progress=5)
run([
cmd = [
"yt-dlp", "-f", "best[ext=mp4]/best",
"-o", str(mp4),
"--no-warnings", "--no-playlist",
"--retries", "3",
*ytdlp_cookie_args(),
job.url,
])
]
run(cmd)
if not mp4.exists():
raise RuntimeError("下载完成但找不到 source.mp4")
@@ -1757,7 +1790,7 @@ def pipeline_download(job_id: str) -> None:
)
except Exception as e:
message = "视频元数据解析失败" if stage == "metadata" else "下载失败"
update(job, status="failed", error=str(e), message=message)
update(job, status="failed", error=normalize_download_error(e), message=message)
def pipeline_analyze(
@@ -1929,7 +1962,7 @@ def analyze_queue_worker() -> None:
ANALYZE_WORKER_RUNNING = False
# ---------- 音频转写 + 翻译 + SKG 改写 + MiniMax 配音 ----------
# ---------- 音频转写 + 翻译 + SKG 改写 + Azure OpenAI 配音 ----------
class TranscriptionUnavailable(RuntimeError):
    """RuntimeError subtype for the case where audio transcription is unavailable.

    Adds no behaviour of its own; it exists so callers can catch this
    condition separately from other runtime errors.
    """
@@ -2385,18 +2418,6 @@ def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds
return fallback, f"改写失败,使用本地模板:{e}"
def _minimax_tts_url() -> str:
if MINIMAX_TTS_BASE_URL.endswith("/v1/t2a_v2"):
return MINIMAX_TTS_BASE_URL
return f"{MINIMAX_TTS_BASE_URL}/v1/t2a_v2"
def _choose_minimax_voice_id() -> str:
if MINIMAX_TTS_VOICE_POOL:
return random.choice(MINIMAX_TTS_VOICE_POOL)
return MINIMAX_TTS_VOICE_ID
def _choose_azure_voice_id() -> str:
if AZURE_TTS_VOICE_POOL:
return random.choice(AZURE_TTS_VOICE_POOL)
@@ -2404,9 +2425,7 @@ def _choose_azure_voice_id() -> str:
def _choose_tts_voice_id() -> str:
if VOICE_PROVIDER == "azure_openai":
return _choose_azure_voice_id()
return _choose_minimax_voice_id()
return _choose_azure_voice_id()
def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
@@ -2423,60 +2442,22 @@ def _voice_speed_for(voice_id: str, target_seconds: float, text: str) -> float:
return 0.99
def _minimax_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
if not MINIMAX_API_KEY:
raise RuntimeError("MINIMAX_API_KEY 未配置,未生成配音")
if not text.strip():
raise RuntimeError("改写文案为空,未生成配音")
payload = {
"model": MINIMAX_TTS_MODEL,
"text": text.strip()[:9500],
"stream": False,
"language_boost": "English",
"output_format": "hex",
"voice_setting": {
"voice_id": voice_id,
"speed": _voice_speed_for(voice_id, target_seconds, text),
"vol": 1,
"pitch": 0,
},
"audio_setting": {
"sample_rate": 32000,
"bitrate": 128000,
"format": "mp3",
"channel": 1,
},
}
resp = httpx.post(
_minimax_tts_url(),
headers={"Authorization": f"Bearer {MINIMAX_API_KEY}", "Content-Type": "application/json"},
json=payload,
timeout=90,
)
resp.raise_for_status()
data = resp.json()
base_resp = data.get("base_resp") or {}
if int(base_resp.get("status_code", 0) or 0) != 0:
raise RuntimeError(base_resp.get("status_msg") or "MiniMax TTS 返回失败")
audio_hex = ((data.get("data") or {}).get("audio") or "").strip()
if not audio_hex:
raise RuntimeError("MiniMax TTS 未返回 audio hex")
try:
audio_bytes = bytes.fromhex(audio_hex)
except ValueError as e:
raise RuntimeError(f"MiniMax TTS audio hex 无法解析:{e}") from e
out = job_dir(job_id) / "audio_script.mp3"
out.write_bytes(audio_bytes)
return f"/jobs/{job_id}/audio-script.mp3"
def _azure_tts_url() -> str:
path = AZURE_TTS_PATH if AZURE_TTS_PATH.startswith("/") else f"/{AZURE_TTS_PATH}"
def _azure_tts_url_for(path_value: str) -> str:
    """Join ``AZURE_OPENAI_BASE_URL`` with one TTS endpoint path.

    A leading ``/`` is added to *path_value* when missing. If the base URL
    already ends with that path, the base URL is returned unchanged so the
    path is not appended twice.
    """
    suffix = path_value if path_value.startswith("/") else f"/{path_value}"
    base = AZURE_OPENAI_BASE_URL
    return base if base.endswith(suffix) else f"{base}{suffix}"
def _azure_tts_urls() -> list[str]:
    """Return the candidate TTS endpoint URLs in order, without duplicates.

    Paths come from ``AZURE_TTS_PATHS``, or from the single
    ``AZURE_TTS_PATH`` when that list is empty.
    """
    candidate_paths = AZURE_TTS_PATHS or [AZURE_TTS_PATH]
    # dict.fromkeys keeps first-seen order while dropping repeated URLs.
    return list(dict.fromkeys(_azure_tts_url_for(item) for item in candidate_paths))
def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> str:
if not AZURE_OPENAI_API_KEY:
raise RuntimeError("AZURE_OPENAI_API_KEY 或 LLM_API_KEY 未配置,未生成配音")
@@ -2489,18 +2470,32 @@ def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds
"response_format": "mp3",
"speed": _voice_speed_for(voice_id, target_seconds, text),
}
resp = httpx.post(
_azure_tts_url(),
headers={
"Authorization": f"Bearer {AZURE_OPENAI_API_KEY}",
"api-key": AZURE_OPENAI_API_KEY,
"Content-Type": "application/json",
},
json=payload,
timeout=120,
)
headers = {
"Authorization": f"Bearer {AZURE_OPENAI_API_KEY}",
"api-key": AZURE_OPENAI_API_KEY,
"Content-Type": "application/json",
}
resp: httpx.Response | None = None
errors: list[str] = []
with ai_http_client(timeout=120) as client:
for url in _azure_tts_urls():
try:
current = client.post(url, headers=headers, json=payload)
except Exception as e:
errors.append(f"{url}: {type(e).__name__}: {e}")
continue
if current.status_code < 400:
resp = current
break
errors.append(f"{url}: HTTP {current.status_code}: {current.text[:180]}")
if current.status_code not in {404, 405}:
resp = current
break
if resp is None:
raise RuntimeError("Azure OpenAI TTS 不可用;已尝试 " + " | ".join(errors))
if resp.status_code >= 400:
raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {resp.text[:300]}")
detail = " | ".join(errors) or resp.text[:300]
raise RuntimeError(f"Azure OpenAI TTS HTTP {resp.status_code}: {detail[:600]}")
audio_bytes = resp.content
if not audio_bytes:
raise RuntimeError("Azure OpenAI TTS 未返回音频内容")
@@ -2517,9 +2512,7 @@ def _azure_openai_tts_sync(job_id: str, text: str, voice_id: str, target_seconds
def _tts_sync(job_id: str, text: str, voice_id: str, target_seconds: float = 12.0) -> tuple[str, str, str]:
if VOICE_PROVIDER == "azure_openai":
return _azure_openai_tts_sync(job_id, text, voice_id, target_seconds), "azure_openai", AZURE_TTS_MODEL
return _minimax_tts_sync(job_id, text, voice_id, target_seconds), "minimax", MINIMAX_TTS_MODEL
return _azure_openai_tts_sync(job_id, text, voice_id, target_seconds), "azure_openai", AZURE_TTS_MODEL
def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], target_seconds: float = 12.0) -> AudioScript:
@@ -2531,8 +2524,8 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], tar
speaker_profile, rhythm_profile = _audio_delivery_profile(segments, duration, selected_voice_id)
voice_url = ""
voice_error = ""
voice_provider = "azure_openai" if VOICE_PROVIDER == "azure_openai" else "minimax"
voice_model = AZURE_TTS_MODEL if voice_provider == "azure_openai" else MINIMAX_TTS_MODEL
voice_provider = "azure_openai"
voice_model = AZURE_TTS_MODEL
try:
voice_url, voice_provider, voice_model = _tts_sync(job_id, rewritten, selected_voice_id, duration)
except Exception as e:
@@ -2944,6 +2937,83 @@ def _image_text_call(
raise RuntimeError(_image_failure_message("image text", max_attempts, last_err, capacity_seen))
def _image_path_to_data_url(path: Path) -> str:
    """Encode the image file at *path* as a base64 ``data:`` URL.

    The media type is chosen from the file suffix (case-insensitive):
    PNG, WebP and GIF get their own type; every other suffix is labelled
    ``image/jpeg``, which was the previous default for all non-PNG files.

    Raises:
        OSError: the file cannot be read.
    """
    media_types = {
        ".png": "image/png",
        ".webp": "image/webp",
        ".gif": "image/gif",
    }
    media_type = media_types.get(path.suffix.lower(), "image/jpeg")
    encoded = base64.b64encode(path.read_bytes()).decode("ascii")
    return f"data:{media_type};base64,{encoded}"
def _brief_from_vision_payload(data: object) -> str:
    """Flatten a parsed vision-model JSON reply into one brief (max 1800 chars)."""
    if not isinstance(data, dict):
        return ""
    brief = data.get("brief")
    if isinstance(brief, str) and brief.strip():
        return brief.strip()[:1800]
    # No usable "brief": rebuild one from the individual trait fields. The
    # list covers the keys requested by both the source-keyframe prompt and
    # the saved-template prompt, so neither schema loses fields here.
    trait_keys = (
        "gender_presentation", "age_range", "body_proportion", "hair", "skin_tone",
        "material_or_skin", "wardrobe_style", "wardrobe_or_surface_style",
        "pose_language", "camera_visibility", "camera_readability", "commercial_mood",
        "neck_shoulder_readiness", "style_constraints",
    )
    parts: list[str] = []
    for key in trait_keys:
        value = data.get(key)
        if isinstance(value, str) and value.strip():
            parts.append(f"{key.replace('_', ' ')}: {value.strip()}")
    return "; ".join(parts)[:1800]


def _vision_brief_from_images(image_paths: list[Path], prompt: str, max_images: int = 8) -> str:
    """Ask the vision model for a JSON description of the images and return a text brief.

    Args:
        image_paths: Candidate image files; paths that do not exist are skipped.
        prompt: Instruction text sent as the first content part.
        max_images: Upper bound on how many existing images are attached.

    Returns:
        The reply's ``brief`` field, or a ``"key: value; ..."`` string built
        from its trait fields, truncated to 1800 characters. Returns ``""``
        when no image exists, ``LLM_API_KEY`` is empty, the request fails,
        or the reply cannot be parsed as JSON.
    """
    paths = [path for path in image_paths if path.exists()][:max_images]
    if not paths or not LLM_API_KEY:
        return ""
    content: list[dict] = [{"type": "text", "text": prompt}]
    content.extend(
        {"type": "image_url", "image_url": {"url": _image_path_to_data_url(path)}}
        for path in paths
    )
    try:
        resp = llm().chat.completions.create(
            model=VISION_MODEL,
            messages=[{"role": "user", "content": content}],
            response_format={"type": "json_object"},
            temperature=0.1,
            max_tokens=1400,
        )
        message = resp.choices[0].message
        raw = (message.content or "").strip()
        if not raw:
            # Fall back to the reply's reasoning_content field when content is empty.
            raw = (getattr(message, "reasoning_content", "") or "").strip()
        # Tolerate text around the JSON by taking the outermost {...} span.
        match = re.search(r"\{[\s\S]*\}", raw)
        data = json.loads(match.group(0) if match else raw)
    except Exception as e:
        # Best-effort by design: callers treat "" as "no brief available".
        print(f"[vision brief failed] {e}", flush=True)
        return ""
    return _brief_from_vision_payload(data)
def _describe_source_subject(job_id: str, source_indices: list[int]) -> str:
    """Build a non-identifying text brief of the subject seen in a job's keyframes.

    Resolves each index in *source_indices* to its source frame path and
    passes up to 8 of them, with a fixed instruction prompt, to
    ``_vision_brief_from_images``. Returns whatever that helper returns.
    """
    instruction = "".join((
        "You are preparing a non-identifying character brief for generating a NEW similar but non-identical ad subject. ",
        "Look at these source video keyframes as evidence of one role and style, not as a person to identify. ",
        "Do NOT identify the person, do NOT estimate exact age, do NOT describe biometric identity, and do NOT mention celebrity or real-person likeness. ",
        "Output strict JSON only. Use broad style traits suitable for text-to-image generation.\n",
        "Required keys: gender_presentation, age_range, body_proportion, hair, skin_tone, wardrobe_style, ",
        "pose_language, camera_visibility, commercial_mood, neck_shoulder_readiness, style_constraints, brief.\n",
        "The brief should be 80-140 words and should preserve category, role, energy, camera readability, and commercial atmosphere while explicitly allowing a new non-identical subject.",
    ))
    keyframes = [_source_frame_path(job_id, frame_idx) for frame_idx in source_indices]
    return _vision_brief_from_images(keyframes, instruction, max_images=8)
def _describe_subject_template_from_images(name: str, subject_style: str, image_paths: list[Path], note: str = "") -> str:
    """Summarise a saved subject template's images as a reusable text brief.

    The prompt embeds the template *name*, its *subject_style* and the first
    500 characters of *note*, then up to 10 of *image_paths* are passed to
    ``_vision_brief_from_images``. Returns whatever that helper returns.
    """
    trimmed_note = note[:500]
    header = (
        f"You are summarizing a saved SKG subject template named '{name}' for future text-to-image generation. "
        f"Subject style: {subject_style}. User note: {trimmed_note}. "
    )
    guidance = (
        "Look at the subject views and describe the reusable creative direction without copying identity or pixels. "
        "Do NOT identify a person and do NOT describe exact facial identity. "
        "Output strict JSON only with keys: gender_presentation, age_range, body_proportion, material_or_skin, "
        "wardrobe_or_surface_style, pose_language, camera_readability, neck_shoulder_readiness, commercial_mood, brief. "
        "The brief should be 80-140 words and must be useful as a reference character brief for creating a new innovative variation."
    )
    return _vision_brief_from_images(image_paths, header + guidance, max_images=10)
# ---------- API 路由 ----------
class CreateJobReq(BaseModel):
@@ -3130,7 +3200,7 @@ def health() -> dict:
"auth_configured": WEB_AUTH_CONFIGURED,
"base_url": LLM_BASE_URL or "openai-default",
"image_base_url": IMAGE_BASE_URL or LLM_BASE_URL or "openai-default",
"voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
"voice_base_url": AZURE_OPENAI_BASE_URL,
"models": {
"asr": ASR_MODEL,
"local_asr": LOCAL_ASR_MODEL,
@@ -3147,15 +3217,12 @@ def health() -> dict:
"subject_image": SUBJECT_ASSET_IMAGE_MODEL,
"subject_image_fallbacks": SUBJECT_ASSET_IMAGE_MODELS,
"voice_provider": VOICE_PROVIDER,
"voice_base_url": AZURE_OPENAI_BASE_URL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_BASE_URL,
"voice_tts": AZURE_TTS_MODEL if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_MODEL,
"voice_id": AZURE_TTS_VOICE_ID if VOICE_PROVIDER == "azure_openai" else MINIMAX_TTS_VOICE_ID,
"voice_pool": AZURE_TTS_VOICE_POOL if VOICE_PROVIDER == "azure_openai" else (MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID]),
"voice_configured": bool(AZURE_OPENAI_API_KEY) if VOICE_PROVIDER == "azure_openai" else bool(MINIMAX_API_KEY),
"minimax_tts": MINIMAX_TTS_MODEL,
"minimax_voice": MINIMAX_TTS_VOICE_ID,
"minimax_voice_pool": MINIMAX_TTS_VOICE_POOL or [MINIMAX_TTS_VOICE_ID],
"minimax_configured": bool(MINIMAX_API_KEY),
"voice_base_url": AZURE_OPENAI_BASE_URL,
"voice_tts": AZURE_TTS_MODEL,
"voice_tts_paths": AZURE_TTS_PATHS,
"voice_id": AZURE_TTS_VOICE_ID,
"voice_pool": AZURE_TTS_VOICE_POOL,
"voice_configured": bool(AZURE_OPENAI_API_KEY),
"video": VIDEO_MODEL,
"video_aliases": VIDEO_MODEL_ALIASES,
"video_provider": video_provider_name(),
@@ -3225,6 +3292,31 @@ async def create_job(req: CreateJobReq, bg: BackgroundTasks) -> Job:
return job
@app.post("/jobs/{job_id}/download/retry", response_model=Job)
async def retry_job_download(job_id: str, bg: BackgroundTasks) -> Job:
    """Re-queue the download stage for an existing URL-based job.

    Raises:
        HTTPException: 404 when the job id is unknown; 409 when the job was
            created from an upload, or when its status is ``downloading``,
            ``splitting`` or ``transcribing``.

    Returns:
        The job after its status fields have been reset; the download runs
        as a background task.
    """
    job = JOBS.get(job_id)
    if not job:
        raise HTTPException(404, "job not found")

    from_upload = job.source_kind == "upload" or job.url.startswith("upload://")
    if from_upload:
        raise HTTPException(409, "uploaded videos cannot be redownloaded; upload the file again")

    busy_states = {"downloading", "splitting", "transcribing"}
    if job.status in busy_states:
        raise HTTPException(409, f"job is busy: {job.status}")

    # Remove a zero-byte source.mp4 before retrying; a non-empty file is kept.
    source_video = job_dir(job_id) / "source.mp4"
    if source_video.exists() and source_video.stat().st_size == 0:
        source_video.unlink()

    reset_fields = {
        "status": "downloading",
        "progress": 1,
        "error": "",
        "message": "重新提交下载…",
        "video_url": "",
    }
    update(job, **reset_fields)
    bg.add_task(pipeline_download, job_id)
    return job
@app.post("/jobs/upload", response_model=Job)
async def create_job_from_upload(bg: BackgroundTasks, file: UploadFile = File(...)) -> Job:
if not file.filename:
@@ -4308,43 +4400,56 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
source_indices = [idx] + source_indices
source_indices = list(dict.fromkeys(source_indices))[:12]
similar_mode = req.reconstruction_mode == "similar"
character_reference_paths: list[Path] = []
character_reference_clause = ""
template_brief_clause = ""
character_label = ""
subject_template_id = (req.subject_template_id or "").strip()
character_id = (req.character_id or "").strip()
if subject_template_id:
template = find_subject_template_item(subject_template_id)
character_label = template.name
for image in template.images[:10]:
character_reference_paths.append(subject_template_image_file(image.filename))
character_reference_clause = (
f"Selected reusable subject template from database: {template.name}. "
"Use these saved generated subject views as a high-quality creative direction and identity bible only; "
"do not copy pixels, file artifacts, exact pose, labels, or accidental defects. "
"Create a new innovative variation that keeps the same broad subject type, transparent wellness character language, "
"camera readability, shoulder/neck product compatibility, and commercial role. "
template_paths = [subject_template_image_file(image.filename) for image in template.images[:10]]
character_reference_paths.extend(template_paths)
brief = template.prompt_brief.strip() or template.note.strip() or template.description.strip()
if similar_mode and not brief:
brief = _describe_subject_template_from_images(template.name, template.subject_style, template_paths, template.note)
template_brief_clause = (
f"Reference character brief from saved database template '{template.name}': {brief}. "
"Use this as a high-quality creative direction and identity bible only; do not copy a face, exact pose, pixels, file artifacts, labels, or accidental defects. "
"Create a new innovative variation that keeps the same broad subject type, transparent wellness character language, camera readability, shoulder/neck product compatibility, and commercial role. "
if brief else
f"Selected reusable subject template from database: {template.name}. Create a new innovative variation, not a duplicate. "
)
elif character_id:
character = find_character_library_item(character_id)
character_label = character.name
for image in character.images[:7]:
character_reference_paths.append(character_library_file(image.filename))
character_reference_clause = (
f"Selected built-in creative character reference: {character.name}. "
"Use these planned character images as a high-quality creative direction and anatomy/style bible only; "
character_reference_paths.extend(character_library_file(image.filename) for image in character.images[:7])
brief = character.prompt_brief.strip() or character.description.strip()
template_brief_clause = (
f"Reference character brief from built-in creative character '{character.name}': {brief}. "
"Use this planned character brief as a high-quality creative direction and anatomy/style bible only; "
"do not copy the exact face, exact pose, exact silhouette, pixels, or make a duplicate. "
"Create a new innovative variation that keeps the same broad role, transparent wellness character language, "
"camera readability, and shoulder/neck product compatibility. "
"Create a new innovative variation that keeps the same broad role, transparent wellness character language, camera readability, and shoulder/neck product compatibility. "
)
model_src, tmp_focus = _focus_source_for_element(job_id, idx, el)
tmp_focus: Path | None = None
model_src: Path | list[Path] | None = None
frame_reference_paths = [p for p in (_source_frame_path(job_id, i) for i in source_indices) if p.exists()]
if character_reference_paths:
remaining = max(0, 10 - len(character_reference_paths))
model_src = character_reference_paths + frame_reference_paths[:remaining]
elif len(frame_reference_paths) > 1:
model_src = frame_reference_paths[:10]
source_subject_brief = _describe_source_subject(job_id, source_indices) if similar_mode else ""
source_subject_clause = (
f"Source video role brief from selected keyframes: {source_subject_brief}. "
"Use this brief to preserve role category, creator-ad energy, camera readability, and broad styling, while creating a new non-identical subject. "
if source_subject_brief else
"Source video role brief unavailable; create a new non-identical ad subject guided by the user direction, template brief, and requested view. "
)
if not similar_mode:
model_src, tmp_focus = _focus_source_for_element(job_id, idx, el)
if character_reference_paths:
remaining = max(0, 10 - len(character_reference_paths))
model_src = character_reference_paths + frame_reference_paths[:remaining]
elif len(frame_reference_paths) > 1:
model_src = frame_reference_paths[:10]
try:
with Image.open(_source_frame_path(job_id, idx)) as src_im:
@@ -4371,7 +4476,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
)
actor_style_clause = (
"Generate a believable normal commercial video actor, not a transparent or skeleton character. "
"Use the references to understand the source video's casting direction, age range, gender presentation, body proportion, wardrobe category, gesture vocabulary, framing, energy, lighting, and creator-ad style. "
"Use the text briefs to understand the source video's casting direction, age range, gender presentation, body proportion, wardrobe category, gesture vocabulary, framing, energy, lighting, and creator-ad style. "
"Do not recreate the exact person's face, biometric identity, unique likeness, tattoos, scars, logos, watermarks, captions, or platform UI. "
"The output must be a newly designed similar actor that could play the same role in a new ad, with consistent identity across all views. "
if similar_actor
@@ -4386,7 +4491,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
prompt_extra_clause = f"User direction: {prompt_extra[:1200]} " if prompt_extra else ""
identity_lock_clause = (
"Identity lock: these API calls generate one high-definition multi-view pack for ONE single subject, but each individual output file must show only its one requested view. "
"Before rendering, infer one consistent character bible from the reference image(s): gender presentation, age range, body proportions, head shape, face direction cues, material, silhouette, wardrobe/material style, and commercial mood. "
"Before rendering, infer one consistent character bible from the supplied text brief and generation instructions: gender presentation, age range, body proportions, head shape, face direction cues, material, silhouette, wardrobe/material style, and commercial mood. "
"Keep that same character bible unchanged across every generated view in separate files. "
"If user direction requests a gender, age, or style change, apply that one change uniformly to all views; never mix male/female, young/old, or multiple style identities inside the same pack. "
"For transparent humanoids, keep the same transparent skin shell, skeleton proportions, visible spine/rib cage/pelvis/limb bones, and non-horror wellness character style in every view. "
@@ -4427,14 +4532,22 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
if closeup_view and req.subject_kind == "living"
else "The subject must be complete, centered, full body or full object, head-to-feet visible when applicable, not cropped by the canvas. Make the subject large and readable: it should occupy about 85-95% of the image height with only small margins. "
)
reference_strategy_clause = (
"Text-only generation mode: no source image is attached to this image request. Use only the written source/video/template briefs below as creative constraints. "
"This is intentionally NOT image editing and NOT identity replication. "
+ source_subject_clause
+ template_brief_clause
if similar_mode else
"Use the reference image(s) only as visual evidence; do not crop, cut out, paste, trace, or extract pixels from the source. "
)
prompt = (
f"Use the reference image(s) only as visual evidence; do not crop, cut out, paste, trace, or extract pixels from the source. "
reference_strategy_clause
+
f"Generate one newly rendered {view_prompt} for {target}. "
f"The subject is a {kind_phrase}. If multiple frames are shown, treat them as evidence of one same subject, not multiple subjects. "
f"The subject is a {kind_phrase}. Treat all source evidence as one role and one consistent subject bible, not multiple subjects. "
+ single_view_clause
+ identity_clause
+ identity_lock_clause
+ character_reference_clause
+ neck_product_clause
+ canvas_clause
+ prompt_extra_clause
@@ -4447,7 +4560,16 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
+ transparent_character_clause
)
try:
img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
if similar_mode:
print(
f"[subject assets] reconstruction_mode=similar endpoint=/images/generations view={view} image_refs=0 model={GPT_IMAGE_MODEL}",
flush=True,
)
img_bytes, _mode = _image_text_call(prompt, models=models, max_attempts=3)
else:
if model_src is None:
raise RuntimeError("subject asset edit reference image missing")
img_bytes, _mode = _image_edit_call(model_src, prompt, models=models, fallback_text=False, max_attempts=3, max_side=1280)
except RuntimeError as e:
raise HTTPException(_image_error_status(e), f"subject asset {view} failed: {e}")
@@ -5026,6 +5148,7 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
template_dir.mkdir(parents=True, exist_ok=True)
now = _time.time()
images: list[SubjectTemplateImage] = []
saved_image_paths: list[Path] = []
for asset in selected_assets:
src = job_dir(job_id) / "assets" / f"{asset.id}.jpg"
if not src.exists():
@@ -5034,6 +5157,7 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
filename = f"{template_id}/{image_id}.jpg"
dst = SUBJECT_TEMPLATE_IMAGE_DIR / filename
shutil.copy2(src, dst)
saved_image_paths.append(dst)
images.append(SubjectTemplateImage(
id=image_id,
view=asset.view,
@@ -5053,11 +5177,18 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
raise HTTPException(404, "subject asset files missing")
primary = next((image.id for image in images if image.view == "front"), images[0].id)
prompt_brief = _describe_subject_template_from_images(
name,
req.subject_style,
saved_image_paths,
req.note.strip(),
) or req.note.strip()
item = SubjectTemplateItem(
id=template_id,
name=name,
description=req.note.strip(),
note=req.note.strip(),
prompt_brief=prompt_brief,
source_job_id=job_id,
source_frame_idx=frame.index,
source_element_id=element.id,