feat: enforce English prompt language strategy

This commit is contained in:
2026-05-18 20:07:11 +08:00
parent adf8b2ba0a
commit bc0b010def
4 changed files with 560 additions and 205 deletions

View File

@@ -91,7 +91,7 @@ YTDLP_COOKIES_FILE = os.getenv("YTDLP_COOKIES_FILE", "").strip()
YTDLP_COOKIES_FROM_BROWSER = os.getenv("YTDLP_COOKIES_FROM_BROWSER", "").strip()
AUDIO_PRODUCT_BRIEF = os.getenv(
"AUDIO_PRODUCT_BRIEF",
"SKG 智能按摩产品,主打日常肩颈、腰背、眼部、膝盖或足部放松;广告表达要高级、干净、可信,不做医疗疗效承诺。",
"SKG smart massage products for everyday neck-and-shoulder, back, eye, knee, or foot relaxation. Ads should feel premium, clean, trustworthy, and must not make medical efficacy claims.",
).strip()
AUDIO_REWRITE_MODEL = gpt_model_env("AUDIO_REWRITE_MODEL", REWRITE_MODEL)
VOICE_PROVIDER = "azure_openai"
@@ -454,6 +454,7 @@ class CharacterLibraryItem(BaseModel):
folder: str = ""
description: str = ""
prompt_brief: str = ""
prompt_brief_zh: str = ""
primary_image: str = ""
images: list[CharacterLibraryImage] = Field(default_factory=list)
@@ -480,6 +481,7 @@ class SubjectTemplateItem(BaseModel):
description: str = ""
note: str = ""
prompt_brief: str = ""
prompt_brief_zh: str = ""
source: Literal["database"] = "database"
source_job_id: str = ""
source_frame_idx: int = -1
@@ -534,6 +536,7 @@ class KeyElement(BaseModel):
subject_kind: SubjectKind = "object"
subject_assets: list[SubjectAsset] = Field(default_factory=list)
subject_consensus_brief: str = ""
subject_consensus_brief_zh: str = ""
created_at: float = 0.0
@@ -565,6 +568,7 @@ class AudioScript(BaseModel):
source_text: str = ""
source_zh: str = ""
rewritten_text: str = ""
rewritten_text_zh: str = ""
speaker_profile: str = ""
rhythm_profile: str = ""
background_audio_profile: str = ""
@@ -2307,7 +2311,7 @@ def _audio_profile_model_sync(wav: Path, segments: list[TranscriptSegment], targ
fallback = _fallback_audio_profile(segments, target_seconds)
if not LLM_API_KEY or not wav.exists():
return fallback
transcript = _transcript_join(segments, "en") or _transcript_join(segments, "zh") or "No reliable transcript."
transcript = _ensure_english(_transcript_join(segments, "en") or _transcript_join(segments, "zh") or "No reliable transcript.")
try:
audio_b64 = base64.b64encode(wav.read_bytes()).decode("ascii")
except Exception:
@@ -2373,12 +2377,15 @@ def _build_audio_intake_sync(job_id: str, wav: Path, segments: list[TranscriptSe
)
def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds: float = 12.0) -> tuple[str, str]:
def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds: float = 12.0) -> tuple[str, str, str]:
fallback = _fallback_audio_script(segments, target_seconds)
try:
fallback_zh = _translate_text_sync(fallback, "zh", max_tokens=300) if LLM_API_KEY else ""
except Exception:
fallback_zh = ""
if not LLM_API_KEY:
return fallback, "LLM_API_KEY 未配置,使用本地 SKG 模板"
return fallback, fallback_zh, "LLM_API_KEY 未配置,使用本地 SKG 模板"
source_text = _transcript_join(segments, "en")
source_zh = _transcript_join(segments, "zh")
min_words, max_words = _voiceover_target_words(target_seconds)
prompt = (
"You are an English short-video voice-over writer for SKG wellness massagers. "
@@ -2392,10 +2399,9 @@ def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds
"5. Introduce SKG products directly: smart massage, warmth, rhythm, daily neck/back/eye/knee/foot relaxation.\n"
"6. Keep it easy for TTS: short sentences, spoken phrasing, no hashtags, no stage directions, no quotation marks.\n"
"7. If the source transcript is thin, ignore it and write a general SKG product intro.\n"
'Return strict JSON only: {"rewritten_text":"..."}.\n\n'
f"SKG product context: {AUDIO_PRODUCT_BRIEF}\n\n"
f"English transcript:\n{source_text or 'None'}\n\n"
f"Chinese translation for reference:\n{source_zh or 'None'}"
'Return strict JSON only: {"rewritten_text":"English VO","rewritten_text_zh":"Simplified Chinese mirror for team review"}.\n\n'
f"SKG product context: {_ensure_english(AUDIO_PRODUCT_BRIEF)}\n\n"
f"English transcript:\n{source_text or 'None'}"
)
try:
resp = llm().chat.completions.create(
@@ -2415,9 +2421,12 @@ def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds
raw = match.group(0) if match else raw
data = json.loads(raw)
text = str(data.get("rewritten_text", "")).strip()
return (text or fallback), ""
text_zh = str(data.get("rewritten_text_zh", "")).strip()
if text and not text_zh:
text_zh = _translate_text_sync(text, "zh", max_tokens=300)
return (text or fallback), (text_zh or fallback_zh), ""
except Exception as e:
return fallback, f"改写失败,使用本地模板:{e}"
return fallback, fallback_zh, f"改写失败,使用本地模板:{e}"
def _choose_azure_voice_id() -> str:
@@ -2521,7 +2530,7 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], tar
source_text = _transcript_join(segments, "en")
source_zh = _transcript_join(segments, "zh")
duration = max(float(target_seconds or 0), _segment_duration(segments), 4.0)
rewritten, rewrite_error = _rewrite_audio_script_sync(segments, duration)
rewritten, rewritten_zh, rewrite_error = _rewrite_audio_script_sync(segments, duration)
selected_voice_id = _choose_tts_voice_id()
speaker_profile, rhythm_profile = _audio_delivery_profile(segments, duration, selected_voice_id)
voice_url = ""
@@ -2539,6 +2548,7 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], tar
source_text=source_text,
source_zh=source_zh,
rewritten_text=rewritten,
rewritten_text_zh=rewritten_zh,
speaker_profile=speaker_profile,
rhythm_profile=rhythm_profile,
product_brief=AUDIO_PRODUCT_BRIEF,
@@ -3055,6 +3065,55 @@ class RewriteStoryboardScriptReq(BaseModel):
segments: list[ScriptRewriteSegmentReq] = Field(default_factory=list)
_TRANSLATION_CACHE: dict[str, str] = {}
def _contains_cjk(text: str) -> bool:
return bool(re.search(r"[\u3400-\u9fff]", text or ""))
def _translate_text_sync(text: str, target: Literal["en", "zh"] = "en", *, max_tokens: int = 700) -> str:
text = (text or "").strip()
if not text or not LLM_API_KEY:
return text
target_label = "English" if target == "en" else "Simplified Chinese"
prompt = (
f"Translate the following TikTok ad planning text into concise natural {target_label}. "
"Preserve concrete product, camera, subject, timing, and structure details. "
"Do not add commentary, markdown, quotes, or explanations.\n\n"
f"Input:\n{text}"
)
resp = llm().chat.completions.create(
model=TRANSLATE_MODEL,
messages=[{"role": "user", "content": prompt}],
temperature=0.15,
max_tokens=max_tokens,
)
out = (resp.choices[0].message.content or "").strip()
if not out:
rc = getattr(resp.choices[0].message, "reasoning_content", "") or ""
if rc:
out = rc.strip().splitlines()[-1].strip()
return re.sub(r'^[\'"「『]+|[\'"」』]+$', "", out).strip() or text
def _ensure_english(text: str) -> str:
text = (text or "").strip()
if not text or not _contains_cjk(text):
return text
key = hashlib.sha256(("en\0" + text).encode("utf-8")).hexdigest()
cached = _TRANSLATION_CACHE.get(key)
if cached:
return cached
try:
translated = _translate_text_sync(text, "en", max_tokens=max(700, min(3500, len(text) // 2 + 900)))
_TRANSLATION_CACHE[key] = translated
return translated
except Exception as e:
print(f"[ensure english fallback] {e}", flush=True)
return text
@app.post("/translate")
def translate_text(req: TranslateReq) -> dict:
"""单条文本翻译(给生图自定义提取元素 zh→en 用)"""
@@ -3092,22 +3151,26 @@ def translate_text(req: TranslateReq) -> dict:
def _fallback_script_rewrite_item(segment: ScriptRewriteSegmentReq, author_intent: str = "") -> dict:
source = (segment.source or "").strip()
intent = (author_intent or "").strip()
intent = _ensure_english(author_intent or "")
role = segment.role or ""
templates = {
"开场钩子": "你有没有发现,低头久了以后,脖子和肩膀会先替你喊累。",
"痛点推进": "刷手机、坐电脑、赶通勤叠在一起,肩颈很容易一直绷着放不下来。",
"利益证明": "SKG 这种挂脖按摩仪,重点就是贴住肩颈位置,把热敷感和揉按感带到真正紧的地方。",
"方案过渡": "这一段可以直接拍拿起、戴上、贴合,让产品自然进入日常放松场景。",
"转化收口": "如果你也想把肩颈放松变成每天的小习惯,可以从这台 SKG 开始。",
"节奏承接": "顺着原片节奏,把这一句落到一个具体的肩颈使用场景里。",
"hook": "Have you noticed that after hours of looking down, your neck and shoulders complain before you do?",
"pain": "Phone scrolling, desk work, and commuting can keep your neck and shoulders tight all day.",
"proof": "An SKG wearable massager sits around the neck and shoulders, bringing warm, rhythmic comfort to the spots that feel tense.",
"solution": "This beat can simply show pick up, wear, fit, and relax, so the product enters a normal daily routine.",
"cta": "If you want neck-and-shoulder relaxation to become a daily habit, start with this SKG massager.",
"bridge": "Follow the source rhythm, but land this line in one specific neck-and-shoulder use moment.",
}
rewritten = templates.get(role, templates["节奏承接"])
if source and role not in {"开场钩子", "转化收口"}:
rewritten = f"{rewritten} 原片这一句的节奏可以保留,但内容换成 SKG 的佩戴和放松体验。"
rewritten = templates.get(role, templates["bridge"])
if source and role not in {"hook", "cta"}:
rewritten = f"{rewritten} Keep the source sentence rhythm, but replace the content with SKG wearing and relaxation experience."
if intent:
rewritten = f"{rewritten} 语气按作者想法处理:{intent[:44]}"
return {"index": segment.index, "text": rewritten[:220]}
rewritten = f"{rewritten} Adjust the tone based on the creator note: {intent[:90]}."
try:
zh = _translate_text_sync(rewritten, "zh", max_tokens=260) if LLM_API_KEY else ""
except Exception:
zh = ""
return {"index": segment.index, "text": rewritten[:260], "text_zh": zh}
def _parse_script_rewrite_items(raw: str, requested: list[ScriptRewriteSegmentReq], author_intent: str = "") -> list[dict]:
@@ -3123,7 +3186,7 @@ def _parse_script_rewrite_items(raw: str, requested: list[ScriptRewriteSegmentRe
raw_items = data.get("items") if isinstance(data, dict) else data
if not isinstance(raw_items, list):
raw_items = []
by_index: dict[int, str] = {}
by_index: dict[int, tuple[str, str]] = {}
for item in raw_items:
if not isinstance(item, dict):
continue
@@ -3132,19 +3195,27 @@ def _parse_script_rewrite_items(raw: str, requested: list[ScriptRewriteSegmentRe
except Exception:
continue
value = str(item.get("text") or item.get("rewritten_text") or "").strip()
value_zh = str(item.get("text_zh") or item.get("rewritten_text_zh") or "").strip()
if value:
by_index[idx] = re.sub(r"\s+", " ", value).strip()[:260]
return [
{"index": segment.index, "text": by_index.get(segment.index) or _fallback_script_rewrite_item(segment, author_intent)["text"]}
for segment in requested
]
by_index[idx] = (re.sub(r"\s+", " ", value).strip()[:260], re.sub(r"\s+", " ", value_zh).strip()[:260])
items = []
for segment in requested:
fallback = _fallback_script_rewrite_item(segment, author_intent)
text, text_zh = by_index.get(segment.index, ("", ""))
if text and not text_zh:
try:
text_zh = _translate_text_sync(text, "zh", max_tokens=260) if LLM_API_KEY else ""
except Exception:
text_zh = ""
items.append({"index": segment.index, "text": text or fallback["text"], "text_zh": text_zh or fallback.get("text_zh", "")})
return items
def _rewrite_storyboard_script_sync(req: RewriteStoryboardScriptReq) -> list[dict]:
segments = [segment for segment in req.segments if (segment.source or segment.current_text).strip()]
if not segments:
return []
author_intent = (req.author_intent or "").strip()
author_intent = _ensure_english(req.author_intent or "")
if not LLM_API_KEY:
return [_fallback_script_rewrite_item(segment, author_intent) for segment in segments]
payload = [
@@ -3152,26 +3223,27 @@ def _rewrite_storyboard_script_sync(req: RewriteStoryboardScriptReq) -> list[dic
"index": segment.index,
"time": f"{segment.start:.1f}-{segment.end:.1f}s",
"role": segment.role,
"source_reference": segment.source,
"current_voiceover": segment.current_text,
"source_reference": _ensure_english(segment.source),
"current_voiceover": _ensure_english(segment.current_text),
}
for segment in segments
]
prompt = (
"你是信息流广告脚本文案改写师。任务:基于原参考文案的节奏和信息结构,把每段改写成 SKG 挂脖肩颈按摩仪的新口播文案。\n"
"硬规则:\n"
"1. 输出中文短视频口播,不要英文,不要舞台说明,不要引号。\n"
"2. 不逐字翻译原文,不保留原品牌、价格、优惠码、平台话术;只参考节奏、钩子、痛点、转化结构。\n"
"3. 产品固定为套在脖子上的 U 形肩颈按摩仪,表达肩颈紧绷、久坐低头、热敷感、揉按感、佩戴放松和日常使用场景。\n"
"4. 避免医疗疗效、治疗、治愈、止痛等强功效承诺。\n"
"5. 每段尽量短,适配该段时间;保持自然创作者口吻。\n"
"6. mode=all整片要前后连贯mode=segment 时,只改给定段落但仍要贴合上下文风格。\n"
f"作者想法:{author_intent or '没有额外想法,按原片节奏改成自然卖点口播。'}\n"
f"改写模式:{req.mode}\n"
f"SKG 产品背景:{AUDIO_PRODUCT_BRIEF}\n\n"
"输入段落 JSON\n"
"You are an information-feed ad voice-over rewrite specialist. Rewrite each segment into a new ENGLISH SKG neck-and-shoulder massager voice-over line while preserving the source rhythm and information structure.\n"
"Hard rules:\n"
"1. The main text field must be English short-video VO. No stage directions, no quotes.\n"
"2. Do not translate word-for-word. Do not keep the original brand, price, discount code, platform CTA, or exact claims; only reuse rhythm, hook, pain-point, proof, and conversion structure.\n"
"3. The product is a U-shaped neck-and-shoulder wearable massager worn around the neck. Express neck/shoulder tension, desk posture, looking down, warmth, kneading-like comfort, wearing, relaxation, and daily use.\n"
"4. Avoid medical treatment, cure, pain elimination, clinical, or disease claims.\n"
"5. Keep each segment short enough for its time range and natural for a creator voice.\n"
"6. If mode=all, make the whole piece coherent; if mode=segment, rewrite only the given segment while matching the broader style.\n"
"7. Also return a Simplified Chinese mirror for team review in text_zh; it is not for model prompts.\n"
f"Creator note: {author_intent or 'No extra note; follow the source pacing and turn it into natural SKG product VO.'}\n"
f"Rewrite mode: {req.mode}\n"
f"SKG product context: {_ensure_english(AUDIO_PRODUCT_BRIEF)}\n\n"
"Input segments JSON:\n"
+ json.dumps(payload, ensure_ascii=False)
+ '\n\n只输出严格 JSON{"items":[{"index":0,"text":"改写后的中文口播"}]}'
+ '\n\nReturn strict JSON only: {"items":[{"index":0,"text":"rewritten English VO","text_zh":"中文镜像"}]}'
)
models = []
for model in [AUDIO_REWRITE_MODEL, ASR_FALLBACK_MODEL, TRANSLATE_MODEL]:
@@ -3182,7 +3254,7 @@ def _rewrite_storyboard_script_sync(req: RewriteStoryboardScriptReq) -> list[dic
resp = llm().chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "只返回合法 JSON不要 markdown不要解释。"},
{"role": "system", "content": "Return valid JSON only. No markdown. No explanation."},
{"role": "user", "content": prompt},
],
response_format={"type": "json_object"},
@@ -3950,6 +4022,7 @@ class UpdateElementReq(BaseModel):
name_en: str | None = None
position: str | None = None
subject_consensus_brief: str | None = None
subject_consensus_brief_zh: str | None = None
class GenerateSceneAssetReq(BaseModel):
@@ -3998,8 +4071,8 @@ class GenerateSubjectAssetsReq(BaseModel):
def _subject_profile_prompt_clause(profile: SubjectProfilePreference | None) -> str:
if not profile:
return ""
prompt_summary = (profile.prompt_summary or "").strip()
resolved_summary = (profile.resolved_summary or "").strip()
prompt_summary = _ensure_english(profile.prompt_summary or "")
resolved_summary = _ensure_english(profile.resolved_summary or "")
if prompt_summary:
body = prompt_summary[:1400]
else:
@@ -4013,7 +4086,7 @@ def _subject_profile_prompt_clause(profile: SubjectProfilePreference | None) ->
("hair style", profile.hair),
("commercial mood", profile.mood),
]
body = "; ".join(f"{name}: {value.strip()}" for name, value in parts if value and value.strip())[:1400]
body = "; ".join(f"{name}: {_ensure_english(value.strip())}" for name, value in parts if value and value.strip())[:1400]
if not body and not resolved_summary:
return ""
mode = "random-composed" if profile.mode == "random" else "manually selected"
@@ -4125,7 +4198,9 @@ def update_element(job_id: str, idx: int, element_id: str, req: UpdateElementReq
if req.position is not None:
e.position = req.position.strip()
if req.subject_consensus_brief is not None:
e.subject_consensus_brief = req.subject_consensus_brief.strip()[:2200]
e.subject_consensus_brief = _ensure_english(req.subject_consensus_brief.strip())[:2200]
if req.subject_consensus_brief_zh is not None:
e.subject_consensus_brief_zh = req.subject_consensus_brief_zh.strip()[:2200]
new_frames.append(f)
if not found:
raise HTTPException(404, "element not found")
@@ -4208,7 +4283,7 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
if confirmed_subjects
else "Remove the main foreground subject from the frame if present. "
)
subject_brief = req.subject_brief.strip()
subject_brief = _ensure_english(req.subject_brief.strip())
subject_brief_clause = (
f"Subject identity (text only, no image reference): {subject_brief[:1800]}. "
"Maintain this identity across this and other endpoint frames in the same storyboard. "
@@ -4237,7 +4312,7 @@ def generate_scene_asset(job_id: str, idx: int, req: GenerateSceneAssetReq) -> J
"warm_lifestyle": "Use a warm lifestyle style: realistic lived-in details, soft natural light, approachable atmosphere.",
"cinematic": "Use a cinematic style: dramatic but natural lighting, richer depth, filmic contrast, not fantasy.",
}[req.scene_style]
user_prompt = req.prompt.strip()
user_prompt = _ensure_english(req.prompt.strip())
user_prompt_clause = (
"User scene direction: " + user_prompt[:1200] + " "
if user_prompt
@@ -4483,6 +4558,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
brief = template.prompt_brief.strip() or template.note.strip() or template.description.strip()
if similar_mode and not brief:
brief = _describe_subject_template_from_images(template.name, template.subject_style, template_paths, template.note)
brief = _ensure_english(brief)
selected_template_brief = brief.strip()
template_brief_clause = (
f"Reference character brief from saved database template '{template.name}': {brief}. "
@@ -4496,6 +4572,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
character_label = character.name
character_reference_paths.extend(character_library_file(image.filename) for image in character.images[:7])
brief = character.prompt_brief.strip() or character.description.strip()
brief = _ensure_english(brief)
selected_template_brief = brief.strip()
template_brief_clause = (
f"Reference character brief from built-in creative character '{character.name}': {brief}. "
@@ -4558,7 +4635,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
if req.reconstruction_mode == "similar"
else "Preserve identity, proportions, silhouette, material, colors, styling, and distinctive details across all generated views. "
)
prompt_extra = req.prompt.strip()
prompt_extra = _ensure_english(req.prompt.strip())
prompt_extra_clause = f"User direction: {prompt_extra[:1200]} " if prompt_extra else ""
subject_profile_clause = _subject_profile_prompt_clause(req.subject_profile)
identity_lock_clause = (
@@ -4709,7 +4786,7 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
]
fallback_brief = " ".join(part.strip() for part in fallback_parts if part and part.strip())[:1800]
if selected_template_brief:
e.subject_consensus_brief = selected_template_brief[:1800]
e.subject_consensus_brief = _ensure_english(selected_template_brief)[:1800]
else:
asset_paths = [
job_dir(job_id) / "assets" / f"{asset.id}.jpg"
@@ -4722,9 +4799,14 @@ def generate_subject_assets(job_id: str, idx: int, element_id: str, req: Generat
asset_paths,
fallback_brief,
)
e.subject_consensus_brief = brief or current_brief or fallback_brief or (
e.subject_consensus_brief = _ensure_english(brief or current_brief or fallback_brief or (
"Generated SKG ad subject; identity brief unavailable. Keep one consistent commercial subject with clear neck and shoulder placement area."
)
))[:1800]
if e.subject_consensus_brief and not e.subject_consensus_brief_zh:
try:
e.subject_consensus_brief_zh = _translate_text_sync(e.subject_consensus_brief, "zh", max_tokens=500)[:1800]
except Exception:
e.subject_consensus_brief_zh = ""
new_frames.append(f)
if generation_errors:
msg = f"主体资产包部分生成完成 · {el.name_zh} · {len(generated)} 张,失败 {len(generation_errors)}"
@@ -5296,18 +5378,23 @@ def save_subject_template(job_id: str, req: SaveSubjectTemplateReq) -> SubjectTe
raise HTTPException(404, "subject asset files missing")
primary = next((image.id for image in images if image.view == "front"), images[0].id)
prompt_brief = _describe_subject_template_from_images(
prompt_brief = _ensure_english(_describe_subject_template_from_images(
name,
req.subject_style,
saved_image_paths,
req.note.strip(),
) or req.note.strip()
) or req.note.strip())
try:
prompt_brief_zh = _translate_text_sync(prompt_brief, "zh", max_tokens=500) if prompt_brief else ""
except Exception:
prompt_brief_zh = ""
item = SubjectTemplateItem(
id=template_id,
name=name,
description=req.note.strip(),
note=req.note.strip(),
prompt_brief=prompt_brief,
prompt_brief_zh=prompt_brief_zh,
source_job_id=job_id,
source_frame_idx=frame.index,
source_element_id=element.id,