feat: add automatic production start workflow

2026-05-17 12:33:13 +08:00
parent 08f18373b9
commit b02bc3f583
7 changed files with 76 additions and 25 deletions
--- a/api/main.py
+++ b/api/main.py
@@ -435,6 +435,8 @@ class AudioScript(BaseModel):
    source_text: str = ""
    source_zh: str = ""
    rewritten_text: str = ""
+    speaker_profile: str = ""
+    rhythm_profile: str = ""
    product_brief: str = ""
    rewrite_model: str = ""
    voice_provider: str = ""
@@ -1763,6 +1765,26 @@ def _fallback_audio_script(segments: list[TranscriptSegment], target_seconds: fl
    )


+def _audio_delivery_profile(segments: list[TranscriptSegment], target_seconds: float, voice_id: str) -> tuple[str, str]:
+    """Build the speaker and rhythm description strings for an audio script.
+
+    Args:
+        segments: Transcript segments; each is read for its ``en`` and ``zh`` text.
+        target_seconds: Requested duration in seconds; a falsy value counts as 0.
+        voice_id: Selected TTS voice id; when empty, the speaker text states
+            that a voice still has to be chosen.
+
+    Returns:
+        A ``(speaker_profile, rhythm_profile)`` tuple of display strings. The
+        rhythm text falls back to a generic message when the duration is not
+        positive or no segment carries any text.
+    """
+    # Use the longer of the requested duration and the transcript's own span.
+    duration = max(float(target_seconds or 0), _segment_duration(segments), 0.0)
+    # str.split() with no separator splits on any whitespace run (spaces,
+    # newlines, tabs) and never yields empty tokens.
+    words = sum(len(s.en.split()) for s in segments)
+    # A segment counts when its English text, or failing that its Chinese text, is non-blank.
+    sentence_count = sum(1 for s in segments if (s.en or s.zh).strip())
+    # Clamp the divisor to one second so very short clips do not inflate the rate.
+    wpm = int(round(words / max(duration, 1.0) * 60)) if words else 0
+    avg_sentence = duration / sentence_count if sentence_count else 0.0
+    speaker = (
+        f"按原素材的短视频单人旁白处理；当前近似音色为 {voice_id}，用于保持商业口播的亲近感和节奏。"
+        if voice_id
+        else "按原素材的短视频单人旁白处理；等待选择 TTS 音色。"
+    )
+    rhythm = (
+        f"源音频约 {duration:.1f}s，{sentence_count} 个语义段，语速约 {wpm} wpm，平均每段 {avg_sentence:.1f}s；"
+        "新配音按相同时长、短句停顿和信息密度改写。"
+        if duration > 0 and sentence_count
+        else "源音频节奏信息不足；新配音按 8-12 秒信息流广告口播节奏生成。"
+    )
+    return speaker, rhythm
+
+
 def _rewrite_audio_script_sync(segments: list[TranscriptSegment], target_seconds: float = 12.0) -> tuple[str, str]:
    fallback = _fallback_audio_script(segments, target_seconds)
    if not LLM_API_KEY:
@@ -1889,6 +1911,7 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], tar
    duration = max(float(target_seconds or 0), _segment_duration(segments), 4.0)
    rewritten, rewrite_error = _rewrite_audio_script_sync(segments, duration)
    selected_voice_id = _choose_minimax_voice_id()
+    speaker_profile, rhythm_profile = _audio_delivery_profile(segments, duration, selected_voice_id)
    voice_url = ""
    voice_error = ""
    try:
@@ -1902,6 +1925,8 @@ def _build_audio_script_sync(job_id: str, segments: list[TranscriptSegment], tar
        source_text=source_text,
        source_zh=source_zh,
        rewritten_text=rewritten,
+        speaker_profile=speaker_profile,
+        rhythm_profile=rhythm_profile,
        product_brief=AUDIO_PRODUCT_BRIEF,
        rewrite_model=AUDIO_REWRITE_MODEL,
        voice_provider="minimax",
@@ -2472,6 +2497,8 @@ async def trigger_transcribe(job_id: str, bg: BackgroundTasks) -> Job:
    manage_job_status = job.status != "splitting"
    audio_payload = AudioScript(
        status="rewriting",
+        speaker_profile="正在分析原音频讲话人和口播节奏…",
+        rhythm_profile="正在按原音频时长、语速和停顿生成 SKG 产品配音脚本…",
        product_brief=AUDIO_PRODUCT_BRIEF,
        rewrite_model=AUDIO_REWRITE_MODEL,
        voice_provider="minimax",