fix: support multilingual audio transcription
This commit is contained in:
@@ -35,6 +35,6 @@ uvicorn main:app --host 127.0.0.1 --port 4291
|
||||
|
||||
- `ffmpeg` 系统二进制(拆轨 / 抽帧)
|
||||
- `yt-dlp` 系统二进制(也可走 Python 包)
|
||||
- OpenAI 兼容 LLM 网关(ASR / 翻译 / 文案改写 / 视觉 brief);远端 `whisper-1` 失败后先走本机 `mlx_whisper`,再用 `ASR_FALLBACK_MODEL` 走 Gemini 多模态音频识别,后端会拒绝疑似假字幕或覆盖率过低的时间轴
|
||||
- OpenAI 兼容 LLM 网关(ASR / 翻译 / 文案改写 / 视觉 brief);ASR 默认自动识别中文、英文和其他多语言,远端失败后先走容器内多语言 `faster-whisper` / 本机 `mlx_whisper`,再按开关用 `ASR_FALLBACK_MODEL` 走多模态音频识别,后端会拒绝疑似假字幕或覆盖率过低的时间轴
|
||||
- GPT 图片网关(当前所有生图 / 修图 / 产品视角识别 / 主体资产 / 首尾帧都强制使用 `gpt-image-2`,不做其他图片模型 fallback)
|
||||
- Azure OpenAI TTS(后续新配音阶段使用 `AZURE_OPENAI_API_KEY`;默认模型 `gpt-4o-mini-tts`,按 `AZURE_TTS_PATHS` 依次尝试语音路径)
|
||||
|
||||
64
api/main.py
64
api/main.py
@@ -63,13 +63,13 @@ LLM_API_KEY = os.getenv("LLM_API_KEY", "").strip()
|
||||
# --- ASR (speech-to-text) configuration, read once from the environment ---

# Remote ASR endpoint and key; each falls back to the general LLM gateway
# setting (the key also falls back when the variable is set but empty).
ASR_BASE_URL = os.getenv("ASR_BASE_URL", LLM_BASE_URL).strip()
ASR_API_KEY = (os.getenv("ASR_API_KEY") or LLM_API_KEY).strip()
ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1")

# Empty by default, so no language is forced and the ASR backends auto-detect.
# (The earlier default "en" pinned every transcription to English.)
ASR_LANGUAGE = os.getenv("ASR_LANGUAGE", "").strip()

# Feature switches: enabled unless the variable is explicitly set to one of
# 0 / false / no / off (case-insensitive, surrounding whitespace ignored).
ASR_REMOTE_ENABLED = os.getenv("ASR_REMOTE_ENABLED", "true").strip().lower() not in {"0", "false", "no", "off"}
ASR_LOCAL_FALLBACK_ENABLED = os.getenv("ASR_LOCAL_FALLBACK_ENABLED", "true").strip().lower() not in {"0", "false", "no", "off"}
ASR_AUDIO_FALLBACK_ENABLED = os.getenv("ASR_AUDIO_FALLBACK_ENABLED", "true").strip().lower() not in {"0", "false", "no", "off"}

# Model used for the multimodal audio fallback; a blank value reverts to the default.
ASR_FALLBACK_MODEL = os.getenv("ASR_FALLBACK_MODEL", "gemini-2.5-flash").strip() or "gemini-2.5-flash"

# Request timeout in seconds, never below 15. A blank value falls back to the
# default (like the sibling settings) instead of crashing at import time with
# int(""); a non-numeric value still raises ValueError so misconfiguration is loud.
ASR_TIMEOUT_SECONDS = max(15, int(os.getenv("ASR_TIMEOUT_SECONDS", "45").strip() or "45"))

# faster-whisper settings. "base" replaces the earlier "tiny.en" default
# because Whisper's ".en" checkpoints are English-only.
FASTER_WHISPER_MODEL = os.getenv("FASTER_WHISPER_MODEL", "base").strip() or "base"
FASTER_WHISPER_DEVICE = os.getenv("FASTER_WHISPER_DEVICE", "cpu").strip() or "cpu"
FASTER_WHISPER_COMPUTE_TYPE = os.getenv("FASTER_WHISPER_COMPUTE_TYPE", "int8").strip() or "int8"

# Optional local ASR executable override; empty string when unset.
# NOTE(review): how this value is consumed is not visible in this hunk.
LOCAL_ASR_BIN = os.getenv("LOCAL_ASR_BIN", "").strip()
|
||||
@@ -79,6 +79,20 @@ TRANSLATE_MODEL = os.getenv("TRANSLATE_MODEL", "gemini-2.5-flash")
|
||||
DEFAULT_GPT_TEXT_MODEL = os.getenv("GPT_TEXT_MODEL", "gpt-4o").strip() or "gpt-4o"
|
||||
|
||||
|
||||
# ASR_LANGUAGE values that mean "do not force a language" (auto-detect).
# _asr_language_hint() lower-cases ASR_LANGUAGE before checking membership,
# so entries here must be lowercase.
ASR_AUTO_LANGUAGE_VALUES = {"", "auto", "detect", "multilingual", "multi"}
|
||||
|
||||
|
||||
def _asr_language_hint() -> str:
    """Return the language code to force on ASR backends, or "" for auto-detect.

    Reads the module-level ``ASR_LANGUAGE`` setting. Any value listed in
    ``ASR_AUTO_LANGUAGE_VALUES`` (matched case-insensitively) means
    "let the backend detect the language" and yields an empty string.

    Returns:
        The configured language code in lowercase, or "" when no language
        should be forced.
    """
    # Normalise once and return the normalised value: the auto keywords were
    # already matched case-insensitively, but an explicit code such as "EN"
    # used to be passed through verbatim, and the ASR backends expect
    # lowercase codes like "en" / "zh".
    language = ASR_LANGUAGE.strip().lower()
    if language in ASR_AUTO_LANGUAGE_VALUES:
        return ""
    return language
|
||||
|
||||
|
||||
def _asr_language_label() -> str:
    """Return a display label for the ASR language setting.

    Gives the forced language hint when one is configured, otherwise the
    literal string "auto".
    """
    hint = _asr_language_hint()
    return hint if hint else "auto"
|
||||
|
||||
|
||||
def gpt_model_env(name: str, default: str | None = None) -> str:
|
||||
value = os.getenv(name, default or DEFAULT_GPT_TEXT_MODEL).strip()
|
||||
if not value or value.lower().startswith("gemini-"):
|
||||
@@ -2811,7 +2825,7 @@ def _clean_asr_segments(segments: list[dict], duration: float) -> list[dict]:
|
||||
|
||||
|
||||
def _segment_text_key(text: str) -> str:
|
||||
return re.sub(r"[^a-z0-9]+", " ", text.lower()).strip()
|
||||
return re.sub(r"[^\w]+", " ", text.casefold(), flags=re.UNICODE).strip()
|
||||
|
||||
|
||||
def _validate_asr_segments(segments: list[dict], duration: float, source: str) -> list[dict]:
|
||||
@@ -2909,19 +2923,22 @@ def _transcribe_faster_whisper_sync(wav: Path) -> list[dict]:
|
||||
device=FASTER_WHISPER_DEVICE,
|
||||
compute_type=FASTER_WHISPER_COMPUTE_TYPE,
|
||||
)
|
||||
raw_segments, _info = model.transcribe(
|
||||
str(wav.resolve()),
|
||||
language="en",
|
||||
beam_size=1,
|
||||
vad_filter=True,
|
||||
condition_on_previous_text=False,
|
||||
)
|
||||
language_hint = _asr_language_hint()
|
||||
transcribe_options = {
|
||||
"beam_size": 1,
|
||||
"vad_filter": True,
|
||||
"condition_on_previous_text": False,
|
||||
}
|
||||
if language_hint:
|
||||
transcribe_options["language"] = language_hint
|
||||
raw_segments, _info = model.transcribe(str(wav.resolve()), **transcribe_options)
|
||||
detected_language = str(getattr(_info, "language", "") or language_hint or "auto")
|
||||
segments = [
|
||||
{"start": float(seg.start), "end": float(seg.end), "text": str(seg.text or "").strip()}
|
||||
for seg in raw_segments
|
||||
if str(seg.text or "").strip()
|
||||
]
|
||||
return _validate_asr_segments(segments, duration, f"faster-whisper:{FASTER_WHISPER_MODEL}")
|
||||
return _validate_asr_segments(segments, duration, f"faster-whisper:{FASTER_WHISPER_MODEL}:{detected_language}")
|
||||
|
||||
|
||||
def _transcribe_gemini_sync(wav: Path) -> list[dict]:
|
||||
@@ -2931,8 +2948,9 @@ def _transcribe_gemini_sync(wav: Path) -> list[dict]:
|
||||
"Transcribe the attached audio. Return strict JSON only, no markdown. "
|
||||
"If you cannot truly hear the audio, return {\"can_hear\": false}. Do not guess. "
|
||||
"If you can hear it, return {\"can_hear\": true, \"segments\": "
|
||||
"[{\"start\": 0.0, \"end\": 1.2, \"text\": \"English transcript\"}]}. "
|
||||
"Use English for the transcript. Only include timestamps you can infer from the audio."
|
||||
"[{\"start\": 0.0, \"end\": 1.2, \"text\": \"original-language transcript\"}]}. "
|
||||
"Keep the transcript in the spoken source language; do not translate it here. "
|
||||
"Only include timestamps you can infer from the audio."
|
||||
)
|
||||
last_error: Exception | None = None
|
||||
for attempt in range(3):
|
||||
@@ -2961,19 +2979,21 @@ def _transcribe_sync(wav: Path) -> list[dict]:
|
||||
if ASR_REMOTE_ENABLED:
|
||||
try:
|
||||
with wav.open("rb") as f:
|
||||
language_hint = _asr_language_hint()
|
||||
resp = asr_llm().with_options(timeout=ASR_TIMEOUT_SECONDS).audio.transcriptions.create(
|
||||
file=(wav.name, f, "audio/wav"),
|
||||
model=ASR_MODEL,
|
||||
response_format="verbose_json",
|
||||
timestamp_granularities=["segment"],
|
||||
**({"language": ASR_LANGUAGE} if ASR_LANGUAGE else {}),
|
||||
**({"language": language_hint} if language_hint else {}),
|
||||
)
|
||||
raw = resp.model_dump() if hasattr(resp, "model_dump") else resp
|
||||
segments = raw.get("segments") or []
|
||||
# 兜底:网关如果不返回 segments,把全文当一段
|
||||
if not segments and raw.get("text"):
|
||||
segments = [{"start": 0.0, "end": float(raw.get("duration", 0) or 0), "text": raw["text"]}]
|
||||
return _validate_asr_segments(segments, duration, ASR_MODEL)
|
||||
detected_language = str(raw.get("language") or language_hint or "auto")
|
||||
return _validate_asr_segments(segments, duration, f"{ASR_MODEL}:{detected_language}")
|
||||
except Exception as e:
|
||||
errors.append(f"{ASR_MODEL}: {e}")
|
||||
else:
|
||||
@@ -3001,11 +3021,13 @@ def _transcribe_sync(wav: Path) -> list[dict]:
|
||||
|
||||
def _translate_sync(segments: list[dict]) -> list[str]:
|
||||
"""批量翻译为中文,按段返回"""
|
||||
payload = [{"i": i, "en": s.get("text", "").strip()} for i, s in enumerate(segments)]
|
||||
payload = [{"i": i, "text": s.get("text", "").strip()} for i, s in enumerate(segments)]
|
||||
prompt = (
|
||||
"你是字幕翻译。把下列英文字幕段翻译为简体中文,保持原意、口语化、自然流畅。"
|
||||
"严格返回 JSON 数组,不要任何 markdown 或多余文字,schema: "
|
||||
'[{"i": 0, "zh": "..."}, ...]\n\n输入:\n'
|
||||
"你是多语言字幕翻译。把下列原语言字幕段翻译为简体中文;"
|
||||
"如果原文已经是中文,只做简体中文规范化和口语化整理,不要改写意思。"
|
||||
"保持原意、口语化、自然流畅。"
|
||||
"严格返回 JSON object,不要任何 markdown 或多余文字,schema: "
|
||||
'{"translations":[{"i": 0, "zh": "..."}]}\n\n输入:\n'
|
||||
+ json.dumps(payload, ensure_ascii=False)
|
||||
)
|
||||
try:
|
||||
@@ -3432,7 +3454,7 @@ def pipeline_transcribe(job_id: str, manage_job_status: bool = True) -> None:
|
||||
return
|
||||
|
||||
# 1) whisper ASR
|
||||
progress(f"{ASR_MODEL} 转录中…", 78)
|
||||
progress(f"{ASR_MODEL} {_asr_language_label()} 语种转录中…", 78)
|
||||
segments = _transcribe_sync(wav)
|
||||
if not segments:
|
||||
raise TranscriptionUnavailable("ASR 未返回可用字幕段")
|
||||
@@ -4494,7 +4516,7 @@ def health() -> dict:
|
||||
"voice_base_url": AZURE_OPENAI_BASE_URL,
|
||||
"models": {
|
||||
"asr": ASR_MODEL,
|
||||
"asr_language": ASR_LANGUAGE,
|
||||
"asr_language": _asr_language_label(),
|
||||
"asr_base_url": ASR_BASE_URL or LLM_BASE_URL or "openai-default",
|
||||
"asr_remote_enabled": ASR_REMOTE_ENABLED,
|
||||
"asr_local_fallback_enabled": ASR_LOCAL_FALLBACK_ENABLED,
|
||||
|
||||
Reference in New Issue
Block a user