fix: add cloud local asr fallback
This commit is contained in:
@@ -1,92 +1,5 @@
|
||||
{
|
||||
"entries": [
|
||||
{
|
||||
"files_changed" : 1,
|
||||
"message" : "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-16 16:46 (~2)",
|
||||
"ts" : "2026-05-16T08:48:08Z",
|
||||
"type" : "session-heartbeat"
|
||||
},
|
||||
{
|
||||
"files_changed" : 1,
|
||||
"hash" : "8d39539",
|
||||
"message" : "auto-save 2026-05-16 16:51 (~1)",
|
||||
"ts" : "2026-05-16T16:51:42+08:00",
|
||||
"type" : "commit"
|
||||
},
|
||||
{
|
||||
"files_changed" : 1,
|
||||
"hash" : "da46de9",
|
||||
"message" : "auto-save 2026-05-16 16:56 (~1)",
|
||||
"ts" : "2026-05-16T16:57:10+08:00",
|
||||
"type" : "commit"
|
||||
},
|
||||
{
|
||||
"files_changed" : 1,
|
||||
"message" : "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-16 16:56 (~1)",
|
||||
"ts" : "2026-05-16T08:58:08Z",
|
||||
"type" : "session-heartbeat"
|
||||
},
|
||||
{
|
||||
"files_changed" : 1,
|
||||
"hash" : "eedfceb",
|
||||
"message" : "auto-save 2026-05-16 17:02 (~1)",
|
||||
"ts" : "2026-05-16T17:02:38+08:00",
|
||||
"type" : "commit"
|
||||
},
|
||||
{
|
||||
"files_changed" : 1,
|
||||
"hash" : "a1979cb",
|
||||
"message" : "auto-save 2026-05-16 17:07 (~1)",
|
||||
"ts" : "2026-05-16T17:08:06+08:00",
|
||||
"type" : "commit"
|
||||
},
|
||||
{
|
||||
"files_changed" : 1,
|
||||
"message" : "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-16 17:07 (~1)",
|
||||
"ts" : "2026-05-16T09:08:08Z",
|
||||
"type" : "session-heartbeat"
|
||||
},
|
||||
{
|
||||
"files_changed" : 1,
|
||||
"hash" : "5f27130",
|
||||
"message" : "auto-save 2026-05-16 17:13 (~1)",
|
||||
"ts" : "2026-05-16T17:13:37+08:00",
|
||||
"type" : "commit"
|
||||
},
|
||||
{
|
||||
"files_changed" : 1,
|
||||
"hash" : "3b5f9b5",
|
||||
"message" : "auto-save 2026-05-16 17:13 (~1)",
|
||||
"ts" : "2026-05-16T17:14:28+08:00",
|
||||
"type" : "commit"
|
||||
},
|
||||
{
|
||||
"files_changed" : 1,
|
||||
"hash" : "1d3b1ab",
|
||||
"message" : "auto-save 2026-05-16 17:14 (~1)",
|
||||
"ts" : "2026-05-16T17:14:42+08:00",
|
||||
"type" : "commit"
|
||||
},
|
||||
{
|
||||
"files_changed" : 1,
|
||||
"message" : "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-16 17:14 (~1)",
|
||||
"ts" : "2026-05-16T09:18:08Z",
|
||||
"type" : "session-heartbeat"
|
||||
},
|
||||
{
|
||||
"files_changed" : 1,
|
||||
"hash" : "cfb79fd",
|
||||
"message" : "auto-save 2026-05-16 17:20 (~1)",
|
||||
"ts" : "2026-05-16T17:20:39+08:00",
|
||||
"type" : "commit"
|
||||
},
|
||||
{
|
||||
"files_changed" : 1,
|
||||
"hash" : "14ee9f3",
|
||||
"message" : "auto-save 2026-05-16 17:25 (~1)",
|
||||
"ts" : "2026-05-16T17:26:09+08:00",
|
||||
"type" : "commit"
|
||||
},
|
||||
{
|
||||
"files_changed": 1,
|
||||
"message": "Codex 会话活跃 · 最近命令:codex · 1 项未提交变更 · 最近提交:auto-save 2026-05-16 17:25 (~1)",
|
||||
@@ -3205,6 +3118,89 @@
|
||||
"message": "启动 Codex 接力会话 · 已载入 Cursor / Claude / Codex 最近会话,等待下一条指令 · 分支 main · 3 项未提交变更 · 最近提交:chore: persist resource libraries in production",
|
||||
"ts": "2026-05-19T00:44:28Z",
|
||||
"type": "assistant-session"
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-19T08:46:28+08:00",
|
||||
"type": "commit",
|
||||
"message": "auto-save 2026-05-19 08:46 (~3)",
|
||||
"hash": "49c998f",
|
||||
"files_changed": 3
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-19T00:49:51Z",
|
||||
"type": "session-end",
|
||||
"message": "Codex 会话结束 · 持续 0 秒 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:auto-save 2026-05-19 08:46 (~3)",
|
||||
"files_changed": 1
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-19T00:54:30Z",
|
||||
"type": "session-heartbeat",
|
||||
"message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:auto-save 2026-05-19 08:46 (~3)",
|
||||
"files_changed": 1
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-19T01:04:30Z",
|
||||
"type": "session-heartbeat",
|
||||
"message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:auto-save 2026-05-19 08:46 (~3)",
|
||||
"files_changed": 1
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-19T09:10:04+08:00",
|
||||
"type": "commit",
|
||||
"message": "chore: mount tiktok cookies in production",
|
||||
"hash": "2fe3db8",
|
||||
"files_changed": 5
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-19T01:14:30Z",
|
||||
"type": "session-heartbeat",
|
||||
"message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:chore: mount tiktok cookies in production",
|
||||
"files_changed": 1
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-19T09:17:36+08:00",
|
||||
"type": "commit",
|
||||
"message": "fix: allow yt-dlp to update production cookies",
|
||||
"hash": "e5652c4",
|
||||
"files_changed": 3
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-19T01:24:30Z",
|
||||
"type": "session-heartbeat",
|
||||
"message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:fix: allow yt-dlp to update production cookies",
|
||||
"files_changed": 1
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-19T09:27:47+08:00",
|
||||
"type": "commit",
|
||||
"message": "fix: enforce asr client timeout",
|
||||
"hash": "9a42682",
|
||||
"files_changed": 2
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-19T01:34:31Z",
|
||||
"type": "session-heartbeat",
|
||||
"message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:fix: enforce asr client timeout",
|
||||
"files_changed": 1
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-19T09:38:17+08:00",
|
||||
"type": "commit",
|
||||
"message": "fix: configure dedicated asr upload gateway",
|
||||
"hash": "2954e58",
|
||||
"files_changed": 5
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-19T01:44:31Z",
|
||||
"type": "session-heartbeat",
|
||||
"message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:fix: configure dedicated asr upload gateway",
|
||||
"files_changed": 1
|
||||
},
|
||||
{
|
||||
"ts": "2026-05-19T01:54:31Z",
|
||||
"type": "session-heartbeat",
|
||||
"message": "Codex 会话活跃 · 最近命令:codex · 分支 main · 1 项未提交变更 · 最近提交:fix: configure dedicated asr upload gateway",
|
||||
"files_changed": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends ffmpeg ca-certificates curl \
|
||||
&& apt-get install -y --no-install-recommends ffmpeg ca-certificates curl libgomp1 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY api/requirements.txt /app/requirements.txt
|
||||
|
||||
2
RULES.md
2
RULES.md
@@ -54,6 +54,8 @@
|
||||
- `LLM_BASE_URL` / `LLM_API_KEY`:OpenAI 兼容网关,用于翻译、文案改写、音频分析等文本/多模态理解模型调用
|
||||
- `ASR_BASE_URL` / `ASR_API_KEY`:OpenAI Audio Transcriptions 兼容网关,用于上传 `audio.wav` 做真实转写;未配置 `ASR_API_KEY` 时复用 `LLM_API_KEY`,生产默认指向 `https://ai.skg.com/azure/v1`
|
||||
- `ASR_MODEL`:OpenAI Audio Transcriptions 音频转写模型,默认 `whisper-1`
|
||||
- `ASR_REMOTE_ENABLED`:是否启用远端 OpenAI Audio Transcriptions;云端音频网关不可用时可设为 `false`,直接走容器内 CPU 版 `faster-whisper`
|
||||
- `FASTER_WHISPER_MODEL` / `FASTER_WHISPER_DEVICE` / `FASTER_WHISPER_COMPUTE_TYPE`:容器内本地 ASR 兜底,生产可用 `tiny.en` / `cpu` / `int8`
|
||||
- `ASR_FALLBACK_MODEL`:远端 ASR 和本机 ASR 都不可用时才尝试的多模态兜底,默认 `gemini-2.5-flash`;如果模型不能真实听到音频或返回疑似逐秒假字幕,后端必须拒绝写入时间轴
|
||||
- `ASR_TIMEOUT_SECONDS`:远端 ASR / 音频分析单次请求超时,默认 45 秒,避免第一步长时间停在转录中
|
||||
- `LOCAL_ASR_BIN` / `LOCAL_ASR_MODEL` / `LOCAL_ASR_TIMEOUT_SECONDS`:本机 ASR 兜底,默认使用 `/opt/homebrew/bin/mlx_whisper` + `mlx-community/whisper-tiny`,用于当前 SKG 网关 `/audio/transcriptions` 不可用时生成真实逐句时间轴
|
||||
|
||||
39
api/main.py
39
api/main.py
@@ -61,8 +61,12 @@ LLM_API_KEY = os.getenv("LLM_API_KEY", "").strip()
|
||||
ASR_BASE_URL = os.getenv("ASR_BASE_URL", LLM_BASE_URL).strip()
|
||||
ASR_API_KEY = (os.getenv("ASR_API_KEY") or LLM_API_KEY).strip()
|
||||
ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1")
|
||||
ASR_REMOTE_ENABLED = os.getenv("ASR_REMOTE_ENABLED", "true").strip().lower() not in {"0", "false", "no", "off"}
|
||||
ASR_FALLBACK_MODEL = os.getenv("ASR_FALLBACK_MODEL", "gemini-2.5-flash").strip() or "gemini-2.5-flash"
|
||||
ASR_TIMEOUT_SECONDS = max(15, int(os.getenv("ASR_TIMEOUT_SECONDS", "45")))
|
||||
FASTER_WHISPER_MODEL = os.getenv("FASTER_WHISPER_MODEL", "tiny.en").strip() or "tiny.en"
|
||||
FASTER_WHISPER_DEVICE = os.getenv("FASTER_WHISPER_DEVICE", "cpu").strip() or "cpu"
|
||||
FASTER_WHISPER_COMPUTE_TYPE = os.getenv("FASTER_WHISPER_COMPUTE_TYPE", "int8").strip() or "int8"
|
||||
LOCAL_ASR_BIN = os.getenv("LOCAL_ASR_BIN", "").strip()
|
||||
LOCAL_ASR_MODEL = os.getenv("LOCAL_ASR_MODEL", "mlx-community/whisper-tiny").strip() or "mlx-community/whisper-tiny"
|
||||
LOCAL_ASR_TIMEOUT_SECONDS = max(30, int(os.getenv("LOCAL_ASR_TIMEOUT_SECONDS", "180")))
|
||||
@@ -2794,6 +2798,32 @@ def _transcribe_mlx_sync(wav: Path) -> list[dict]:
|
||||
return _validate_asr_segments(segments, duration, "mlx_whisper")
|
||||
|
||||
|
||||
def _transcribe_faster_whisper_sync(wav: Path) -> list[dict]:
    """Transcribe ``wav`` with the faster-whisper library and return validated segments.

    The ``faster_whisper`` import is deferred until the function is called so
    that a failing import is reported as ``TranscriptionUnavailable`` instead of
    an import-time error. The model name, device and compute type are taken from
    ``FASTER_WHISPER_MODEL``, ``FASTER_WHISPER_DEVICE`` and
    ``FASTER_WHISPER_COMPUTE_TYPE``.

    Args:
        wav: Path of the audio file to transcribe.

    Returns:
        Whatever ``_validate_asr_segments`` returns for the collected
        ``{"start", "end", "text"}`` dicts, the media duration and the source
        label ``faster-whisper:<model>``.

    Raises:
        TranscriptionUnavailable: If ``faster_whisper`` cannot be imported.
    """
    try:
        from faster_whisper import WhisperModel as _WhisperModel
    except Exception as exc:
        raise TranscriptionUnavailable(f"faster-whisper 不可用:{exc}") from exc

    duration = media_duration(wav)
    whisper = _WhisperModel(
        FASTER_WHISPER_MODEL,
        device=FASTER_WHISPER_DEVICE,
        compute_type=FASTER_WHISPER_COMPUTE_TYPE,
    )
    # Decoding is pinned to English with beam size 1, VAD filtering enabled and
    # no conditioning on previously decoded text.
    decoded, _info = whisper.transcribe(
        str(wav.resolve()),
        language="en",
        beam_size=1,
        vad_filter=True,
        condition_on_previous_text=False,
    )

    segments: list[dict] = []
    for piece in decoded:
        text = str(piece.text or "").strip()
        if not text:
            # Skip segments whose text is empty after stripping whitespace.
            continue
        segments.append({"start": float(piece.start), "end": float(piece.end), "text": text})

    source_label = f"faster-whisper:{FASTER_WHISPER_MODEL}"
    return _validate_asr_segments(segments, duration, source_label)
|
||||
|
||||
|
||||
def _transcribe_gemini_sync(wav: Path) -> list[dict]:
|
||||
duration = media_duration(wav)
|
||||
audio_b64 = base64.b64encode(wav.read_bytes()).decode("ascii")
|
||||
@@ -2828,6 +2858,7 @@ def _transcribe_sync(wav: Path) -> list[dict]:
|
||||
"""Remote ASR first (when enabled), in-container faster-whisper second, local mlx_whisper third. Gemini fallback is guarded against fake timelines."""
|
||||
errors: list[str] = []
|
||||
duration = media_duration(wav)
|
||||
if ASR_REMOTE_ENABLED:
|
||||
try:
|
||||
with wav.open("rb") as f:
|
||||
resp = asr_llm().with_options(timeout=ASR_TIMEOUT_SECONDS).audio.transcriptions.create(
|
||||
@@ -2844,6 +2875,12 @@ def _transcribe_sync(wav: Path) -> list[dict]:
|
||||
return _validate_asr_segments(segments, duration, ASR_MODEL)
|
||||
except Exception as e:
|
||||
errors.append(f"{ASR_MODEL}: {e}")
|
||||
else:
|
||||
errors.append(f"{ASR_MODEL}: remote disabled")
|
||||
try:
|
||||
return _transcribe_faster_whisper_sync(wav)
|
||||
except Exception as e:
|
||||
errors.append(f"faster-whisper: {e}")
|
||||
try:
|
||||
return _transcribe_mlx_sync(wav)
|
||||
except Exception as e:
|
||||
@@ -3956,6 +3993,8 @@ def health() -> dict:
|
||||
"models": {
|
||||
"asr": ASR_MODEL,
|
||||
"asr_base_url": ASR_BASE_URL or LLM_BASE_URL or "openai-default",
|
||||
"asr_remote_enabled": ASR_REMOTE_ENABLED,
|
||||
"faster_whisper": FASTER_WHISPER_MODEL,
|
||||
"local_asr": LOCAL_ASR_MODEL,
|
||||
"asr_fallback": ASR_FALLBACK_MODEL,
|
||||
"translate": TRANSLATE_MODEL,
|
||||
|
||||
@@ -9,3 +9,4 @@ httpx==0.27.2
|
||||
imagehash==4.3.1
|
||||
Pillow>=11.0
|
||||
numpy>=2.0
|
||||
faster-whisper==1.1.1
|
||||
|
||||
@@ -24,7 +24,12 @@ LLM_API_KEY=
|
||||
ASR_BASE_URL=https://ai.skg.com/azure/v1
|
||||
ASR_API_KEY=
|
||||
ASR_MODEL=whisper-1
|
||||
ASR_REMOTE_ENABLED=true
|
||||
ASR_FALLBACK_MODEL=gemini-2.5-flash
|
||||
ASR_TIMEOUT_SECONDS=45
|
||||
FASTER_WHISPER_MODEL=tiny.en
|
||||
FASTER_WHISPER_DEVICE=cpu
|
||||
FASTER_WHISPER_COMPUTE_TYPE=int8
|
||||
TRANSLATE_MODEL=gemini-2.5-flash
|
||||
GPT_TEXT_MODEL=gpt-4o
|
||||
REWRITE_MODEL=gpt-4o
|
||||
|
||||
@@ -950,14 +950,14 @@ ProductRefStateItem {
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr><td>网页登录</td><td><code>POST /auth/login</code>、<code>GET /auth/check</code>、<code>POST /auth/logout</code></td><td><code>web/app/login/page.tsx</code>、Nginx <code>auth_request</code></td><td>登录页提交账号密码到 <code>/api/auth/login</code>,后端设置 HttpOnly 会话 Cookie;生产 Nginx 对工作台和 <code>/api/</code> 调 <code>/auth/check</code> 做统一校验,未登录页面跳 <code>/login/</code>,API 返回 JSON 401。</td></tr>
|
||||
<tr><td>运行配置 / 模型标注</td><td><code>GET /health</code></td><td><code>getRuntimeHealth</code>、<code>ModelTrace</code></td><td>返回 <code>models</code>:ASR、<code>asr_base_url</code>、本机 ASR、ASR fallback、翻译、GPT 改写、GPT 画面理解、产品视角识别 <code>product_view</code>、GPT 图像模型、主体 6 视图 GPT 图像模型、Azure OpenAI TTS、视频别名和 Seedance 服务商。当前 <code>REWRITE_MODEL</code>、<code>AUDIO_REWRITE_MODEL</code> 和 <code>VISION_MODEL</code> 默认使用 <code>gpt-4o</code>;如果旧环境变量仍写 <code>gemini-*</code>,后端会归一化回 <code>GPT_TEXT_MODEL</code> / <code>REWRITE_MODEL</code>。语音只走 Azure OpenAI TTS,<code>models.voice_tts_paths</code> 会回传当前尝试的语音路径,方便区分路径错误和语音服务不可用。前端所有当前主路径里会调用模型的按钮旁显示模型名,点击弹出小窗口查看模型链路和输入输出逻辑;不返回 API Key 或敏感凭证。</td></tr>
|
||||
<tr><td>运行配置 / 模型标注</td><td><code>GET /health</code></td><td><code>getRuntimeHealth</code>、<code>ModelTrace</code></td><td>返回 <code>models</code>:ASR、<code>asr_base_url</code>、<code>asr_remote_enabled</code>、<code>faster_whisper</code>、本机 ASR、ASR fallback、翻译、GPT 改写、GPT 画面理解、产品视角识别 <code>product_view</code>、GPT 图像模型、主体 6 视图 GPT 图像模型、Azure OpenAI TTS、视频别名和 Seedance 服务商。当前 <code>REWRITE_MODEL</code>、<code>AUDIO_REWRITE_MODEL</code> 和 <code>VISION_MODEL</code> 默认使用 <code>gpt-4o</code>;如果旧环境变量仍写 <code>gemini-*</code>,后端会归一化回 <code>GPT_TEXT_MODEL</code> / <code>REWRITE_MODEL</code>。语音只走 Azure OpenAI TTS,<code>models.voice_tts_paths</code> 会回传当前尝试的语音路径,方便区分路径错误和语音服务不可用。前端所有当前主路径里会调用模型的按钮旁显示模型名,点击弹出小窗口查看模型链路和输入输出逻辑;不返回 API Key 或敏感凭证。</td></tr>
|
||||
<tr><td>历史列表</td><td><code>GET /jobs</code></td><td><code>listJobs</code></td><td>所有 job 精简列表(id/url/status/thumbnail/mtime…),按 state.json mtime 倒序。前端 URL 无 <code>?job=</code> 时拉它回填全部历史;带 <code>limit</code> 可截断。</td></tr>
|
||||
<tr><td>创建任务</td><td><code>POST /jobs</code></td><td><code>createJob</code></td><td>提交 TK 链接,后台开始下载;前端“开始”队列会在 downloaded 后自动触发音频解析。下载阶段优先使用 <code>YTDLP_COOKIES_FILE</code>,其次使用 <code>YTDLP_COOKIES_FROM_BROWSER</code>;生产云端固定走 <code>/run/secrets/tiktok_cookies.txt</code>,由宿主机 <code>./secrets/tiktok_cookies.txt</code> 挂载进容器。TikTok 要求登录态时会提示上传 MP4 或配置后端 cookies。</td></tr>
|
||||
<tr><td>重试下载</td><td><code>POST /jobs/{id}/download/retry</code></td><td><code>retryJobDownload</code></td><td>用于 TK 链接下载失败且没有 <code>video_url</code> 的素材;清空错误、重新进入下载状态,并在后台再次执行 <code>pipeline_download</code>。上传视频不能重下载,需要重新上传文件。</td></tr>
|
||||
<tr><td>上传视频</td><td><code>POST /jobs/upload</code></td><td><code>uploadJob</code></td><td>保存 source.mp4,然后同样进入下载完成状态;当前上传后也加入第一步队列,下载完成后自动解析音频。</td></tr>
|
||||
<tr><td>删除输入视频</td><td><code>DELETE /jobs/{id}</code></td><td><code>deleteJob</code></td><td>从任务队列、URL 和磁盘 <code>jobs/<id></code> 目录移除整个 job,包括源视频、关键帧、元素提取图和生成视频。</td></tr>
|
||||
<tr><td>解析视频</td><td><code>POST /jobs/{id}/analyze?frames=&target=&mode=&quality=</code></td><td><code>analyzeJob</code></td><td>抽参考帧能力。当前开始流程会在视频下载完成后自动调用一次,默认 <code>frames=12</code>、<code>target=motion</code>、<code>quality=accurate</code>、<code>mode=replace</code>,形成全局动作/节奏参考帧池;原版视频旁的“抽参考 12 帧”也会用同一参数显式重跑。<code>target</code> 仍支持透明骨架人、综合、清晰主体、转场变化、表情瞬间、动作峰值。</td></tr>
|
||||
<tr><td>音频文案轨</td><td><code>POST /jobs/{id}/transcribe</code></td><td><code>triggerTranscribe</code></td><td>若尚未拆轨,先从 <code>source.mp4</code> 提取 <code>audio.wav</code> 并回填 <code>source_audio_url</code>;随后把 <code>audio.wav</code> 上传到 <code>ASR_BASE_URL</code> 的 OpenAI Audio Transcriptions 兼容接口,用 <code>ASR_MODEL</code> 提取原始文案,翻译成中文,写入 <code>audio_script.source_text</code>、<code>source_zh</code> 和逐句 <code>transcript</code>。远端 ASR 失败后先走本机 <code>LOCAL_ASR_BIN</code>/<code>LOCAL_ASR_MODEL</code>(默认 <code>mlx_whisper</code>),再尝试 <code>ASR_FALLBACK_MODEL</code>。后端会拒绝重复文本、逐秒假字幕或覆盖率过低的结果,不再把不可听的多模态输出写进时间轴。中文翻译由 <code>TRANSLATE_MODEL</code> 按 ASR 段落补齐,失败时保留原文时间轴且中文可为空。再用 <code>ASR_FALLBACK_MODEL</code> 读取 <code>audio.wav</code> 和已有转写时间轴,多模态音频分析讲话人、语速节奏、停顿、背景音乐/环境声/音效,写入 <code>speaker_profile</code>、<code>rhythm_profile</code>、<code>background_audio_profile</code>;若模型分析失败,则用转写段落、时长和语速做本地估算兜底。当前第一步不默认生成 SKG 新口播和 Azure OpenAI 配音。</td></tr>
|
||||
<tr><td>音频文案轨</td><td><code>POST /jobs/{id}/transcribe</code></td><td><code>triggerTranscribe</code></td><td>若尚未拆轨,先从 <code>source.mp4</code> 提取 <code>audio.wav</code> 并回填 <code>source_audio_url</code>;远端启用时把 <code>audio.wav</code> 上传到 <code>ASR_BASE_URL</code> 的 OpenAI Audio Transcriptions 兼容接口,用 <code>ASR_MODEL</code> 提取原始文案;远端不可用或关闭时走容器内 CPU 版 <code>faster-whisper</code>,仍失败则依次尝试本机 <code>LOCAL_ASR_BIN</code>/<code>LOCAL_ASR_MODEL</code>(默认 <code>mlx_whisper</code>)和 <code>ASR_FALLBACK_MODEL</code>;转写成功后再补中文翻译并写入 <code>audio_script.source_text</code>、<code>source_zh</code> 和逐句 <code>transcript</code>。后端会拒绝重复文本、逐秒假字幕或覆盖率过低的结果,不再把不可听的多模态输出写进时间轴。中文翻译由 <code>TRANSLATE_MODEL</code> 按 ASR 段落补齐,失败时保留原文时间轴且中文可为空。再用 <code>ASR_FALLBACK_MODEL</code> 读取 <code>audio.wav</code> 和已有转写时间轴,多模态音频分析讲话人、语速节奏、停顿、背景音乐/环境声/音效,写入 <code>speaker_profile</code>、<code>rhythm_profile</code>、<code>background_audio_profile</code>;若模型分析失败,则用转写段落、时长和语速做本地估算兜底。当前第一步不默认生成 SKG 新口播和 Azure OpenAI 配音。</td></tr>
|
||||
<tr><td>分镜脚本改写</td><td><code>POST /jobs/{id}/script/rewrite</code></td><td><code>rewriteStoryboardScript</code></td><td>根据原英文参考文案、当前英文新口播、英文 role enum、时间段和作者想法改写英文口播;作者想法若含中文,后端会先经 <code>_ensure_english</code> 兜底翻译。<code>mode=segment</code> 只改一段;<code>mode=all</code> 一次改完整片,要求整片前后连贯。后端按 <code>AUDIO_REWRITE_MODEL</code>、<code>ASR_FALLBACK_MODEL</code>、<code>TRANSLATE_MODEL</code> 依次尝试,全部失败时用英文本地模板保留可编辑文案。接口返回 <code>items[index,text,text_zh]</code>,其中 <code>text</code> 是写入模型链路的英文主值,<code>text_zh</code> 只供团队审稿镜像显示;点击保存规划后写入 <code>StoryboardScene.action</code>。</td></tr>
|
||||
<tr><td>原始音频文件</td><td><code>GET /jobs/{id}/audio.wav</code></td><td><code>sourceAudioUrl</code></td><td>返回拆轨得到的 wav;当前主界面不再渲染底部吸附音频条,右侧复刻工作表会读取该文件生成参考图式横向响度波形,并和原视频、逐句时间轴联动;波形标题栏显示当前播放秒数、总时长和鼠标指针停点秒数。</td></tr>
|
||||
<tr><td>改写配音文件</td><td><code>GET /jobs/{id}/audio-script.mp3</code></td><td><code>apiAssetUrl(job.audio_script.voice_url)</code></td><td>后续新配音阶段保留的 TTS 产物;服务端固定走 <code>VOICE_PROVIDER=azure_openai</code>,通过 <code>AZURE_OPENAI_BASE_URL</code> 的 OpenAI 协议生成 mp3,并按 <code>AZURE_TTS_PATHS</code> 依次尝试 <code>/audio/speech</code>、<code>/v1/audio/speech</code> 等路径。当前第一步不默认生成该文件。</td></tr>
|
||||
@@ -1248,6 +1248,7 @@ ProductRefStateItem {
|
||||
<p><strong>问题:</strong>生产只配置 <code>LLM_BASE_URL=https://ai.skg.com/ezlink/v1</code>,文本网关不一定提供 <code>/audio/transcriptions</code> 文件上传接口,导致音频文案步骤无法真实转写。</p>
|
||||
<p><strong>改动:</strong><code>api/main.py</code> 新增 <code>ASR_BASE_URL</code> / <code>ASR_API_KEY</code> 和独立 ASR OpenAI client;音频转写只通过该 client 上传 <code>audio.wav</code>,不再绑死 <code>LLM_BASE_URL</code>。<code>deploy/.env.production.example</code> 增加生产 ASR 网关示例。</p>
|
||||
<p><strong>影响:</strong>文本/视觉模型仍走 <code>LLM_BASE_URL</code>,音频文件上传可单独切换到支持 Audio Transcriptions 的网关;<code>/health</code> 会回传 <code>asr_base_url</code> 供排障。</p>
|
||||
<p><strong>补充:</strong>当云端音频网关不兼容 OpenAI Audio Transcriptions 时,可设置 <code>ASR_REMOTE_ENABLED=false</code>,直接使用容器内 <code>faster-whisper</code> 做 CPU 转写。</p>
|
||||
</div>
|
||||
</article>
|
||||
<article class="change">
|
||||
|
||||
Reference in New Issue
Block a user