fix: make AI polish intent-aware

2026-05-26 11:50:01 +08:00
parent 13fa5a08da
commit f5be97b9e7
3 changed files with 325 additions and 57 deletions
--- a/api/main.py
+++ b/api/main.py
@@ -5517,55 +5517,282 @@ _PERSON_INTENT_RE = re.compile(
    re.I,
 )

+# Keyword heuristic for prompts whose subject is an object or product.
+# Matches a fixed list of Chinese terms anywhere in the text and a fixed list
+# of English terms as whole words; matching is case-insensitive.
+# _classify_prompt_intent uses a hit here to label the subject as "object".
+_OBJECT_INTENT_RE = re.compile(
+    r"("
+    r"产品|商品|物体|物件|道具|设备|机器|仪器|建筑|房子|屋顶|椅子|桌子|汽车|飞船|"
+    r"\b(?:product|object|item|device|machine|gadget|building|house|roof|chair|table|car|vehicle|spaceship)\b"
+    r")",
+    re.I,
+)

-def _prompt_has_person_intent(*parts: str) -> bool:
-    text = "\n".join(part for part in parts if part).strip()
-    if not text or _NO_PERSON_INTENT_RE.search(text):
-        return False
-    return bool(_PERSON_INTENT_RE.search(text))
+# Keyword heuristic for prompts that describe a place or environment.
+# Chinese terms match anywhere; English terms match as whole words;
+# case-insensitive. _classify_prompt_intent maps a hit to subject kind "scene".
+_SCENE_INTENT_RE = re.compile(
+    r"("
+    r"场景|街道|房间|室内|室外|空间|城市|森林|海边|天空|太空|夜景|摊位|"
+    r"\b(?:scene|street|room|interior|exterior|space|city|forest|beach|sky|night|stall|booth)\b"
+    r")",
+    re.I,
+)
+
+# Keyword heuristic for prompts whose subject is an animal.
+# Chinese terms match anywhere; English terms match as whole words;
+# case-insensitive. _classify_prompt_intent maps a hit to subject kind "animal".
+# NOTE(review): the single-character Chinese alternatives also match inside
+# longer unrelated words (for example "马" inside "马上") - confirm this is
+# acceptable for the classifier.
+_ANIMAL_INTENT_RE = re.compile(
+    r"("
+    r"动物|猫|狗|鸟|马|鱼|龙|"
+    r"\b(?:animal|cat|dog|bird|horse|fish|dragon)\b"
+    r")",
+    re.I,
+)
+
+# Detects a literal "SKG" brand mention, case-insensitively.
+# ASCII lookarounds are used instead of \b: for str patterns Python's \b
+# treats CJK characters as word characters, so "SKG" written directly against
+# Chinese text (e.g. "SKG按摩仪") has no word boundary after the "G" and a
+# \bskg\b pattern would miss it.
+_SKG_RE = re.compile(r"(?<![A-Za-z0-9_])skg(?![A-Za-z0-9_])", re.I)
+
+# Matches an English "no people" / "without people" / "do not introduce|add|
+# include people" phrase together with the rest of its sentence, up to and
+# including the next sentence terminator (ASCII or full-width) or the end of
+# the text. _remove_no_person_phrases uses it to drop the whole clause so the
+# nouns listed after "people" are not mistaken for a requested subject.
+_NO_PERSON_CLAUSE_RE = re.compile(
+    r"\b(?:no|without)\s+people[^.。!?！？]*(?:[.。!?！？]|$)|"
+    r"\bdo\s+not\s+(?:introduce|add|include)\s+people[^.。!?！？]*(?:[.。!?！？]|$)",
+    re.I,
+)
+
+# Sentences that earlier polish passes appended to prompts. They are stripped
+# by _strip_previous_polish_boilerplate so that re-polishing an already
+# polished prompt does not treat this boilerplate as user intent.
+# The first four patterns correspond to the image/video quality suffixes and
+# the two person-guard sentences of the previous fallback; the last pattern
+# matches the sentence _ensure_fictional_person_subject appends.
+# Each pattern tolerates flexible whitespace after commas and an optional
+# trailing period, and is case-insensitive.
+# NOTE(review): the suffixes emitted by the current _basic_polished_prompt
+# ("Cinematic motion, ..." / "Clear main subject, ...") are not listed here -
+# confirm whether they should be stripped on re-polish as well.
+_PREVIOUS_POLISH_BOILERPLATE_PATTERNS = [
+    re.compile(
+        r"\bDetailed visual prompt,\s*clear main subject,\s*coherent composition,\s*"
+        r"natural lighting,\s*refined color palette,\s*high-quality details\.?",
+        re.I,
+    ),
+    re.compile(
+        r"\bSmooth camera movement,\s*clear subject continuity,\s*stable composition,\s*"
+        r"natural motion,\s*coherent lighting,\s*no subtitles,\s*no watermark\.?",
+        re.I,
+    ),
+    re.compile(
+        r"\bPreserve the original object-only,\s*scene-only,\s*or product-only composition;?\s*"
+        r"do not introduce people,\s*faces,\s*bodies,\s*hands,\s*avatars,\s*characters,\s*"
+        r"crowds,\s*bystanders,\s*or human silhouettes\.?",
+        re.I,
+    ),
+    re.compile(
+        r"\bUse a fully fictional synthetic AI character,\s*not based on any real person,\s*"
+        r"celebrity,\s*public figure,\s*or identifiable private individual\.?",
+        re.I,
+    ),
+    re.compile(
+        r"\bNot based on any real person,\s*celebrity,\s*public figure,\s*or identifiable private individual\.?",
+        re.I,
+    ),
+    re.compile(
+        r"\bThe subject is a fictional synthetic AI character,\s*not based on any real person\.?",
+        re.I,
+    ),
+]


-def _prompt_person_guard(req: PromptPolishReq) -> str:
-    if req.mode not in {"image", "video", "general"}:
+@dataclass(frozen=True)
+class PromptIntent:
+    """Immutable result of classifying a user prompt before polishing.
+
+    Instances are built by ``_classify_prompt_intent`` and consumed by the
+    polish, sanitize and repair helpers.
+    """
+
+    # The text exactly as it was passed to the classifier.
+    raw_text: str
+    # The text after previous-polish boilerplate has been stripped.
+    cleaned_text: str
+    # True when the cleaned text matches the person-intent pattern.
+    person_requested: bool
+    # True when the cleaned text matches the no-person pattern and no person
+    # was requested.
+    no_person_requested: bool
+    # One of "person", "animal", "object", "scene" or "unknown".
+    subject_kind: str
+    # True when the cleaned text literally mentions SKG.
+    skg_requested: bool
+
+
+def _strip_previous_polish_boilerplate(text: str) -> str:
+    """Remove sentences added by earlier polish passes from ``text``.
+
+    Every pattern in ``_PREVIOUS_POLISH_BOILERPLATE_PATTERNS`` is replaced by
+    a space, then whitespace and leftover punctuation are tidied.
+
+    Returns ``""`` for empty or ``None`` input. When stripping would leave
+    nothing, the stripped original text is returned instead.
+    """
+    raw = (text or "").strip()
+    if not raw:
        return ""
-    if _prompt_has_person_intent(req.text, req.system_prompt):
-        return (
-            "The user requested a person, portrait, model, or character subject. "
-            "Describe any such subject as a fully fictional synthetic AI character or virtual avatar, "
-            "not based on any real person, celebrity, public figure, or identifiable private individual. "
-            "Avoid real-person likeness, biometric identity, endorsement, or impersonation.\n"
-        )
-    return (
-        "The user did not request a person or character subject. Preserve the original object-only, "
-        "scene-only, or product-only composition. Do not introduce people, faces, bodies, hands, "
-        "avatars, characters, crowds, bystanders, or human silhouettes.\n"
+    cleaned = raw
+    for pattern in _PREVIOUS_POLISH_BOILERPLATE_PATTERNS:
+        cleaned = pattern.sub(" ", cleaned)
+    # Collapse whitespace runs left behind by the removals.
+    cleaned = re.sub(r"\s+", " ", cleaned).strip()
+    # Re-attach punctuation that is now preceded by a stray space.
+    cleaned = re.sub(r"\s+([,.;:!?])", r"\1", cleaned)
+    # Merge repeated sentence terminators into a single period.
+    cleaned = re.sub(r"(?:[.。]\s*){2,}", ". ", cleaned)
+    # Drop punctuation orphaned at the very start or end of the text.
+    cleaned = re.sub(r"^[,.;:!?，。！？\s]+", "", cleaned)
+    cleaned = re.sub(r"[,;:，；：\s]+$", "", cleaned)
+    return cleaned or raw
+
+
+def _classify_prompt_intent(text: str) -> PromptIntent:
+    """Classify what kind of subject the user asked for.
+
+    The text is first stripped of previous-polish boilerplate. Person intent
+    takes priority over animal, object and scene keywords; a prompt that only
+    asks for "no people" is treated as a scene.
+
+    Args:
+        text: Raw user prompt (may be empty).
+
+    Returns:
+        A ``PromptIntent`` describing the detected subject and constraints.
+    """
+    cleaned = _strip_previous_polish_boilerplate(text)
+    # Remove explicit negations before looking for person keywords. Otherwise
+    # the noun inside a phrase such as "no people" can itself satisfy the
+    # person pattern, which would flag the prompt as a person request and make
+    # no_person_requested unreachable. The pre-change helper likewise gave the
+    # negation precedence.
+    without_negations = _NO_PERSON_INTENT_RE.sub(" ", cleaned)
+    person_requested = bool(_PERSON_INTENT_RE.search(without_negations))
+    no_person_requested = bool(_NO_PERSON_INTENT_RE.search(cleaned)) and not person_requested
+    if person_requested:
+        subject_kind = "person"
+    elif _ANIMAL_INTENT_RE.search(cleaned):
+        subject_kind = "animal"
+    elif _OBJECT_INTENT_RE.search(cleaned):
+        subject_kind = "object"
+    elif _SCENE_INTENT_RE.search(cleaned) or no_person_requested:
+        subject_kind = "scene"
+    else:
+        subject_kind = "unknown"
+    return PromptIntent(
+        raw_text=text,
+        cleaned_text=cleaned,
+        person_requested=person_requested,
+        no_person_requested=no_person_requested,
+        subject_kind=subject_kind,
+        skg_requested=bool(_SKG_RE.search(cleaned)),
    )


-def _prompt_polish_fallback(req: PromptPolishReq) -> PromptPolishResp:
-    text = req.text.strip()
-    base = _ensure_english(text) if req.target_language == "en" else text
+def _remove_no_person_phrases(text: str) -> str:
+    """Blank out "no people" wording so it is not read as a person mention.
+
+    Whole no-person clauses are replaced by a space first, then any remaining
+    no-person phrase is replaced the same way. ``None`` is treated as "".
+    """
+    without_clauses = _NO_PERSON_CLAUSE_RE.sub(" ", text or "")
+    return _NO_PERSON_INTENT_RE.sub(" ", without_clauses)
+
+
+def _output_mentions_person_subject(text: str) -> bool:
+    """Return True when ``text`` mentions a person outside any no-person wording."""
+    remainder = _remove_no_person_phrases(text or "")
+    return _PERSON_INTENT_RE.search(remainder) is not None
+
+
+def _clean_prompt_output(text: str) -> str:
+    """Strip wrapper noise from a model reply and drop old polish boilerplate.
+
+    Removes, in order, a leading code fence (optionally tagged ``text``), a
+    trailing code fence, and leading/trailing quote characters, trimming
+    whitespace after each step. The result is then passed through
+    ``_strip_previous_polish_boilerplate``.
+    """
+    wrapper_patterns = (
+        (r"^```(?:text)?\s*", re.I),
+        (r"\s*```$", 0),
+        (r'^[\'"「『]+|[\'"」』]+$', 0),
+    )
+    stripped = (text or "").strip()
+    for pattern, flags in wrapper_patterns:
+        stripped = re.sub(pattern, "", stripped, flags=flags).strip()
+    return _strip_previous_polish_boilerplate(stripped)
+
+
+def _ensure_fictional_person_subject(text: str) -> str:
+    """Make sure a person prompt describes a fictional synthetic character.
+
+    If the text already says the subject is fictional/synthetic it is returned
+    unchanged. Otherwise the first person noun (optionally with its article)
+    is replaced by "fully fictional synthetic AI character"; when no such noun
+    exists, an explanatory sentence is appended instead.
+
+    Args:
+        text: Prompt text; ``None`` or blank yields "".
+
+    Returns:
+        The stripped prompt with a fictional-character marker guaranteed.
+    """
+    out = (text or "").strip()
+    if not out:
+        return out
+    if re.search(r"\b(?:fictional|synthetic|virtual avatar|AI character|not based on any real person)\b", out, re.I):
+        return out
+
+    def _fictional_subject(match):
+        # Emit an article only when one was replaced or the noun opens the
+        # prompt; after an adjective ("a tall woman") the existing article
+        # already covers the phrase.
+        needs_article = bool(match.group("article")) or match.start() == 0
+        return ("a " if needs_article else "") + "fully fictional synthetic AI character"
+
+    # The article is matched together with its own trailing whitespace. A bare
+    # optional `\s*` in front of the noun would swallow the space after the
+    # preceding word and glue the replacement onto it ("a talla fully ...").
+    out = re.sub(
+        r"(?:\b(?P<article>a|an|the)\s+)?"
+        r"\b(?:person|human|model|woman|man|girl|boy|actor|actress|character|avatar)\b",
+        _fictional_subject,
+        out,
+        count=1,
+        flags=re.I,
+    )
+    if not re.search(r"\b(?:fictional|synthetic|virtual avatar|AI character)\b", out, re.I):
+        out = f"{out}. The subject is a fictional synthetic AI character, not based on any real person."
+    return out
+
+
+def _basic_polished_prompt(req: PromptPolishReq, intent: PromptIntent) -> str:
+    """Build a polished prompt deterministically, without calling the LLM.
+
+    Starts from the boilerplate-free input text, normalises whitespace and
+    trailing punctuation, and appends a fixed quality suffix chosen by
+    ``req.mode`` ("video", "general"/"chat", or anything else as image).
+    A no-person sentence is added only when the user asked for one.
+    """
+    # Fall back to the raw request text if cleaning produced nothing.
+    base = intent.cleaned_text or req.text.strip()
+    # Only English targets are passed through _ensure_english.
+    base = _ensure_english(base) if req.target_language == "en" else base
    base = re.sub(r"\s+", " ", base).strip()
    base = re.sub(r"[。.!！?？]+$", "", base).strip()
-    person_intent = _prompt_has_person_intent(req.text, req.system_prompt)
-    person_guard = (
-        " Use a fully fictional synthetic AI character, not based on any real person, celebrity, public figure, or identifiable private individual."
-        if person_intent
-        else " Preserve the original object-only, scene-only, or product-only composition; do not introduce people, faces, bodies, hands, avatars, characters, crowds, bystanders, or human silhouettes."
-    )
+    if intent.person_requested:
+        base = _ensure_fictional_person_subject(base)
    if req.mode == "video":
        polished = (
-            f"{base}. Smooth camera movement, clear subject continuity, stable composition, "
-            f"natural motion, coherent lighting, no subtitles, no watermark.{person_guard}"
+            f"{base}. Cinematic motion, clear subject continuity, coherent camera movement, "
+            "natural lighting transition, stable composition, detailed environmental interaction, "
+            "no subtitles, no watermark."
        )
    elif req.mode in {"general", "chat"}:
        polished = base
    else:
        polished = (
-            f"{base}. Detailed visual prompt, clear main subject, coherent composition, "
-            f"natural lighting, refined color palette, high-quality details.{person_guard}"
+            f"{base}. Clear main subject, coherent composition, natural lighting, refined color palette, "
+            "detailed textures, cinematic camera framing, high-quality visual detail."
        )
-    return PromptPolishResp(model="fallback", text=polished[:1800])
+    # The prohibition is appended only on an explicit no-person request.
+    if intent.no_person_requested:
+        polished = f"{polished} No people, faces, bodies, hands, crowds, or human silhouettes."
+    return re.sub(r"\s+", " ", polished).strip()
+
+
+def _polished_prompt_issue(intent: PromptIntent, output: str) -> str:
+    """Describe the first way ``output`` contradicts the classified intent.
+
+    Checks, in order: an SKG mention the user never made, a no-person
+    prohibition that conflicts with or was not part of the request, and a
+    person subject the user did not ask for. Returns "" when none applies.
+    """
+    candidate = output or ""
+    if _SKG_RE.search(candidate) and not intent.skg_requested:
+        return "introduced SKG without user input"
+    if _NO_PERSON_INTENT_RE.search(candidate):
+        if intent.person_requested:
+            return "person prompt contains a no-person prohibition"
+        if not intent.no_person_requested:
+            return "added a no-person prohibition that the user did not request"
+    if intent.person_requested or not _output_mentions_person_subject(candidate):
+        return ""
+    return "introduced a person or character subject that the user did not request"
+
+
+def _sanitize_polished_prompt(req: PromptPolishReq, intent: PromptIntent, output: str) -> str:
+    """Deterministically clean a polished prompt so it respects ``intent``.
+
+    Steps: strip wrapper noise (falling back to the basic prompt when nothing
+    is left), remove SKG branding the user did not supply, reconcile person /
+    no-person wording with the request, tidy whitespace and punctuation,
+    capitalise a lowercase ASCII first letter, and cap the result at 1800
+    characters.
+    """
+    out = _clean_prompt_output(output)
+    if not out:
+        out = _basic_polished_prompt(req, intent)
+    if not intent.skg_requested:
+        # ASCII lookarounds replace \b so "SKG" adjacent to CJK text is still
+        # removed. The optional suffix must end at a token boundary, otherwise
+        # "ad" eats the start of words such as "advanced" and "product"
+        # leaves a dangling "s" from "products".
+        out = re.sub(
+            r"(?<![A-Za-z0-9_])SKG(?![A-Za-z0-9_])[-\s]*"
+            r"(?:(?:branded|brand|product|device|campaign|ad)(?![A-Za-z0-9_]))?",
+            "",
+            out,
+            flags=re.I,
+        )
+    if intent.person_requested:
+        # A requested person must not be contradicted by a prohibition.
+        out = _remove_no_person_phrases(out)
+        out = _ensure_fictional_person_subject(out)
+    elif intent.no_person_requested:
+        # The rewrite introduced a person anyway: discard it.
+        if _output_mentions_person_subject(out):
+            out = _basic_polished_prompt(req, intent)
+    else:
+        # Neutral input: drop unrequested prohibitions and invented people.
+        out = _remove_no_person_phrases(out)
+        if _output_mentions_person_subject(out):
+            out = _basic_polished_prompt(req, intent)
+    out = re.sub(r"\s+", " ", out).strip()
+    out = re.sub(r"\s+([,.;:!?])", r"\1", out)
+    if out and "a" <= out[0] <= "z":
+        out = out[0].upper() + out[1:]
+    return out[:1800]
+
+
+def _prompt_has_person_intent(*parts: str) -> bool:
+    """Return True when the newline-joined, non-empty ``parts`` request a person."""
+    non_empty = [chunk for chunk in parts if chunk]
+    combined = "\n".join(non_empty).strip()
+    intent = _classify_prompt_intent(combined)
+    return intent.person_requested
+
+
+def _prompt_person_guard(req: PromptPolishReq) -> str:
+    """Return the person-handling instruction to append to the LLM prompt.
+
+    Modes other than image, video and general get no guard. Otherwise the
+    wording depends on whether ``req.text`` requests a person, requests a
+    no-person composition, or does neither.
+    """
+    guarded_modes = {"image", "video", "general"}
+    if req.mode not in guarded_modes:
+        return ""
+    intent = _classify_prompt_intent(req.text)
+    if intent.person_requested:
+        guard = (
+            "The input explicitly requests a person, portrait, model, or character subject. "
+            "Keep that subject, but describe them as a fully fictional synthetic AI character or virtual avatar. "
+            "Do not imply a real person, celebrity, public figure, private individual, endorsement, or copied likeness.\n"
+        )
+    elif intent.no_person_requested:
+        guard = (
+            "The input explicitly requests a no-person composition. Keep the output free of people, faces, bodies, "
+            "hands, avatars, characters, crowds, bystanders, and human silhouettes.\n"
+        )
+    else:
+        guard = (
+            "The input does not explicitly request a person or character. Do not force a person into the rewrite, "
+            "and do not add a no-person prohibition. If the source input or an attached/reference image already contains "
+            "a person or character, preserve that visible subject conditionally as a fictional AI-generated synthetic "
+            "character; otherwise use neutral wording such as 'main subject' when the subject is unclear.\n"
+        )
+    return guard
+
+
+def _prompt_polish_fallback(req: PromptPolishReq) -> PromptPolishResp:
+    """Produce a polish response without the LLM, labelled as model "fallback"."""
+    intent = _classify_prompt_intent(req.text)
+    draft = _basic_polished_prompt(req, intent)
+    sanitized = _sanitize_polished_prompt(req, intent, draft)
+    return PromptPolishResp(model="fallback", text=sanitized)
+
+
+def _repair_polished_prompt(req: PromptPolishReq, intent: PromptIntent, output: str, *, allow_llm: bool = False) -> str:
+    """Sanitize a polished prompt and, if it still conflicts, ask the LLM to fix it.
+
+    The output is always passed through ``_sanitize_polished_prompt`` first.
+    A second LLM call is made only when an issue remains, ``allow_llm`` is
+    true and ``LLM_API_KEY`` is set. The repaired text is used only if it
+    sanitizes to something with no remaining issue; otherwise, and on any
+    exception, the sanitized original is returned.
+    """
+    out = _sanitize_polished_prompt(req, intent, output)
+    issue = _polished_prompt_issue(intent, out)
+    # Nothing to repair, or repair via LLM is unavailable: keep the sanitized text.
+    if not issue or not allow_llm or not LLM_API_KEY:
+        return out
+    repair_prompt = (
+        "Repair the rewritten generation prompt so it follows the source input exactly.\n"
+        f"Issue to fix: {issue}.\n"
+        "Hard rules:\n"
+        "- Do not introduce SKG or any brand unless it appears literally in the source input.\n"
+        "- Do not introduce products, platforms, ad framing, sales language, slogans, hashtags, or claims unless present in the source input.\n"
+        "- If the source requests a person, keep the person only as a fully fictional synthetic AI character.\n"
+        "- If the source mentions a reference, uploaded, first-frame, last-frame, or current image that may already contain a person, preserve that visible subject conditionally as a fictional AI-generated synthetic character.\n"
+        "- If neither the source nor a referenced image requests or shows a person, do not add a person or character.\n"
+        "- Return one clean prompt only, no explanation.\n\n"
+        f"Source input:\n{intent.cleaned_text[:1800]}\n\n"
+        f"Current rewritten prompt:\n{out[:1800]}"
+    )
+    try:
+        resp = llm().chat.completions.create(
+            model=REWRITE_MODEL,
+            messages=[
+                {"role": "system", "content": "You repair generation prompts by removing contradictions and preserving only source intent."},
+                {"role": "user", "content": repair_prompt},
+            ],
+            temperature=0.15,
+            max_tokens=700,
+        )
+        repaired = _sanitize_polished_prompt(req, intent, resp.choices[0].message.content or "")
+        # Accept the repair only if it actually resolved every detected issue.
+        return repaired if not _polished_prompt_issue(intent, repaired) else out
+    except Exception as e:
+        # Best-effort step: a failed repair must not fail the polish request.
+        print(f"[prompt polish repair fallback] {e}", flush=True)
+        return out


@app.post("/prompt/polish", response_model=PromptPolishResp)
@@ -5573,6 +5800,7 @@ def polish_prompt(req: PromptPolishReq) -> PromptPolishResp:
    text = req.text.strip()
    if not text:
        raise HTTPException(400, "text required")
+    intent = _classify_prompt_intent(text)
    if not LLM_API_KEY:
        return _prompt_polish_fallback(req)

@@ -5588,47 +5816,54 @@ def polish_prompt(req: PromptPolishReq) -> PromptPolishResp:
        "chat": "a professional response to the user's request",
    }.get(req.mode, "an image-generation prompt")
    user_system = req.system_prompt.strip()
+    structure_hint = (
+        "For image prompts, write one polished paragraph covering subject, action/state, setting, composition, camera framing, lighting, material/detail, color mood, and production quality. "
+        if req.mode != "video"
+        else "For video prompts, write one polished paragraph covering opening state, subject motion, camera movement, continuity, environmental interaction, lighting transition, and ending state. "
+    )
    prompt = (
        f"Rewrite the user's input into {mode_hint} in {target_label}.\n"
-        "Preserve the user's actual subject, brand, product, place, style, and intent.\n"
-        "Do not add SKG, health-tech, massage products, TikTok ad framing, product sales language, hashtags, captions, or any brand/product not explicitly present in the input or user-selected guidance.\n"
-        "Do not add medical, wellness, or advertising claims unless the user asked for them.\n"
-        "Improve concrete visual details, composition, lighting, camera language, materials, mood, and quality.\n"
-        "Return only the rewritten prompt. No markdown, labels, JSON, quotes, explanation, or alternatives.\n"
+        "Preserve only the subject, brand, product, place, platform, style, action, and intent explicitly present in the source input.\n"
+        "Do not introduce SKG or any other brand unless the source input literally includes it.\n"
+        "Do not introduce products, platforms, ad framing, sales language, slogans, hashtags, captions, or marketing claims unless explicitly present in the source input.\n"
+        "Improve visual specificity, composition, lighting, camera language, materials, motion, mood, and production quality without changing the subject.\n"
+        "Do not add a no-person prohibition unless the source input explicitly asks for no people.\n"
+        "If the source input mentions uploaded images, reference images, first frames, last frames, or current images, keep any existing visible person in those references as a fictional AI-generated synthetic character; do not invent people for references that have none.\n"
+        f"{structure_hint}"
+        "Return only the rewritten prompt. No markdown, labels, JSON, quotes, explanation, alternatives, or meta-instructions.\n"
        f"{_prompt_person_guard(req)}"
    )
    if req.mode == "chat":
        prompt = (
            f"Answer or rewrite the user's request professionally in {target_label}.\n"
            "Follow the user-selected guidance when provided.\n"
-            "Do not add SKG, health-tech, massage products, TikTok ad framing, product sales language, hashtags, captions, or any brand/product not explicitly present in the input or user-selected guidance.\n"
+            "Do not add SKG or any brand/product/platform not explicitly present in the source input or user-selected guidance.\n"
            "Do not add medical, wellness, or advertising claims unless the user asked for them.\n"
            "Return only the final content in the format requested by the guidance. No markdown fences, labels, explanation, or alternatives unless explicitly requested.\n"
        )
    if req.mode == "video":
        prompt += (
            "For video, describe motion, timing, camera movement, continuity, and what changes over time. "
-            "Do not add people for scale, atmosphere, lifestyle context, or background decoration unless the input explicitly asked for people.\n"
+            "Do not add people for scale, atmosphere, lifestyle context, or background decoration unless the input or reference image already contains or requests people.\n"
        )
    if user_system:
        prompt += f"\nUser-selected polishing guidance:\n{user_system[:1000]}\n"
-    prompt += f"\nInput:\n{text[:2500]}"
+    prompt += f"\nSource input:\n{intent.cleaned_text[:2500]}"

    try:
        resp = llm().chat.completions.create(
            model=REWRITE_MODEL,
            messages=[
-                {"role": "system", "content": "You are a neutral professional prompt editor. You preserve intent and never inject unrelated brands or products."},
+                {"role": "system", "content": "You are a neutral professional prompt editor. Preserve source intent exactly and never inject SKG or unrelated brands, products, platforms, people, or marketing context."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.45,
            max_tokens=900,
        )
-        out = (resp.choices[0].message.content or "").strip()
-        out = re.sub(r"^```(?:text)?\s*", "", out, flags=re.I).strip()
-        out = re.sub(r"\s*```$", "", out).strip()
-        out = re.sub(r'^[\'"「『]+|[\'"」』]+$', "", out).strip()
-        return PromptPolishResp(model=REWRITE_MODEL, text=(out or _prompt_polish_fallback(req).text)[:1800])
+        out = _clean_prompt_output(resp.choices[0].message.content or "")
+        if not out:
+            out = _prompt_polish_fallback(req).text
+        return PromptPolishResp(model=REWRITE_MODEL, text=_repair_polished_prompt(req, intent, out, allow_llm=True))
    except Exception as e:
        print(f"[prompt polish fallback] {e}", flush=True)
        return _prompt_polish_fallback(req)
@@ -8197,6 +8432,24 @@ def _storyboard_video_prompt(scene: StoryboardScene, seed: int | None = None) ->
    return "\n".join([p for p in parts if p.strip()])


+# Instruction appended to prompts that are submitted together with reference
+# images: any visible person is to be treated as a fictional synthetic subject.
+_REFERENCE_IMAGE_SYNTHETIC_PERSON_GUARD = (
+    "Reference images may be AI-generated visual assets. If any person, face, body, hand, avatar, or character "
+    "appears in the provided reference image(s), treat the visible subject as a fully fictional AI-generated "
+    "synthetic character, not a real person, celebrity, public figure, private individual, or copied likeness. "
+    "Preserve the reference subject and composition without identifying, impersonating, or implying endorsement "
+    "by any real person."
+)
+
+
+def _append_reference_image_person_guard(prompt: str, has_reference: bool) -> str:
+    """Append the reference-image person guard to ``prompt`` when applicable.
+
+    The stripped prompt is returned unchanged when there is no reference
+    image, when the prompt is empty, or when it already contains one of the
+    guard's marker phrases. Otherwise the guard follows after a blank line.
+    """
+    trimmed = (prompt or "").strip()
+    if not (has_reference and trimmed):
+        return trimmed
+    markers = (
+        "Reference images may be AI-generated visual assets",
+        "fully fictional AI-generated synthetic character",
+    )
+    if any(marker in trimmed for marker in markers):
+        return trimmed
+    return "\n\n".join((trimmed, _REFERENCE_IMAGE_SYNTHETIC_PERSON_GUARD))
+
+
 class ProductFusionDescriptionReq(BaseModel):
    shots: list[ProductFusionShot] = Field(default_factory=list)

@@ -8287,8 +8540,9 @@ def _video_public_error(raw: object) -> str:
        "人脸",
    )):
        return (
-            "视频生成失败：参考图里有清晰人物或疑似真实人脸，视频模型出于肖像/隐私风控拒绝生成。"
-            "请换成无可识别人脸的首帧，或先裁掉/模糊人物脸，再重新生成视频。"
+            "视频生成失败：参考图被视频模型判定为疑似真实人脸或肖像隐私。"
+            "系统会按 AI 生成的虚拟角色提交，但上游仍可能误判；请尝试换更低识别度的首帧，"
+            "或裁掉/弱化脸部后重新生成视频。"
        )

    if any(token in lower for token in (
@@ -8687,6 +8941,7 @@ def _enqueue_storyboard_videos(job: Job, frame: KeyFrame, req: GenerateStoryboar
    source_ref = req.source_ref
    if source_ref and source_ref.kind == "source_video" and not source_ref.url:
        source_ref = None
+    has_visual_reference = bool(ref_path.exists() or last_ref_path or reference_ref_paths)
    items: list[GeneratedVideo] = []
    ids: list[str] = []
    queued_tasks: list[tuple[str, tuple]] = []
@@ -8695,6 +8950,7 @@ def _enqueue_storyboard_videos(job: Job, frame: KeyFrame, req: GenerateStoryboar
        ids.append(local_id)
        variant_seed = (req.seed + i) if req.seed is not None else random.randint(100000, 999999)
        variant_prompt = _ensure_english(f"{prompt}\n\nCreate variation {i + 1} of {count}. Variation seed: {variant_seed}. Keep the same compact row meaning but vary camera motion, gesture timing, and composition.")
+        variant_prompt = _append_reference_image_person_guard(variant_prompt, has_visual_reference)
        items.append(GeneratedVideo(
            id=local_id,
            provider_id="",