fix: force azure asr mode

2026-05-19 10:31:31 +08:00
parent 5b44d35316
commit ff7bf00f6d
5 changed files with 49 additions and 22 deletions
--- a/api/main.py
+++ b/api/main.py
@@ -62,6 +62,8 @@ ASR_BASE_URL = os.getenv("ASR_BASE_URL", LLM_BASE_URL).strip()
 ASR_API_KEY = (os.getenv("ASR_API_KEY") or LLM_API_KEY).strip()
 ASR_MODEL = os.getenv("ASR_MODEL", "whisper-1")
 ASR_REMOTE_ENABLED = os.getenv("ASR_REMOTE_ENABLED", "true").strip().lower() not in {"0", "false", "no", "off"}
+ASR_LOCAL_FALLBACK_ENABLED = os.getenv("ASR_LOCAL_FALLBACK_ENABLED", "true").strip().lower() not in {"0", "false", "no", "off"}
+ASR_AUDIO_FALLBACK_ENABLED = os.getenv("ASR_AUDIO_FALLBACK_ENABLED", "true").strip().lower() not in {"0", "false", "no", "off"}
 ASR_FALLBACK_MODEL = os.getenv("ASR_FALLBACK_MODEL", "gemini-2.5-flash").strip() or "gemini-2.5-flash"
 ASR_TIMEOUT_SECONDS = max(15, int(os.getenv("ASR_TIMEOUT_SECONDS", "45")))
 FASTER_WHISPER_MODEL = os.getenv("FASTER_WHISPER_MODEL", "tiny.en").strip() or "tiny.en"
@@ -2855,7 +2857,7 @@ def _transcribe_gemini_sync(wav: Path) -> list[dict]:


 def _transcribe_sync(wav: Path) -> list[dict]:
-    """Remote ASR first, local mlx_whisper second. Gemini fallback is guarded against fake timelines."""
+    """Try remote ASR first, then the local Whisper fallbacks, then the multimodal audio fallback; each stage runs only if its runtime switch is enabled."""
    errors: list[str] = []
    duration = media_duration(wav)
    if ASR_REMOTE_ENABLED:
@@ -2877,18 +2879,24 @@ def _transcribe_sync(wav: Path) -> list[dict]:
            errors.append(f"{ASR_MODEL}: {e}")
    else:
        errors.append(f"{ASR_MODEL}: remote disabled")
-    try:
-        return _transcribe_faster_whisper_sync(wav)
-    except Exception as e:
-        errors.append(f"faster-whisper: {e}")
-    try:
-        return _transcribe_mlx_sync(wav)
-    except Exception as e:
-        errors.append(f"mlx_whisper: {e}")
-    try:
-        return _transcribe_gemini_sync(wav)
-    except Exception as e:
-        errors.append(f"{ASR_FALLBACK_MODEL}: {e}")
+    if ASR_LOCAL_FALLBACK_ENABLED:
+        try:
+            return _transcribe_faster_whisper_sync(wav)
+        except Exception as e:
+            errors.append(f"faster-whisper: {e}")
+        try:
+            return _transcribe_mlx_sync(wav)
+        except Exception as e:
+            errors.append(f"mlx_whisper: {e}")
+    else:
+        errors.append("local ASR fallback disabled")
+    if ASR_AUDIO_FALLBACK_ENABLED:
+        try:
+            return _transcribe_gemini_sync(wav)
+        except Exception as e:
+            errors.append(f"{ASR_FALLBACK_MODEL}: {e}")
+    else:
+        errors.append("multimodal audio fallback disabled")
    raise TranscriptionUnavailable("；".join(errors))


@@ -3994,6 +4002,8 @@ def health() -> dict:
            "asr": ASR_MODEL,
            "asr_base_url": ASR_BASE_URL or LLM_BASE_URL or "openai-default",
            "asr_remote_enabled": ASR_REMOTE_ENABLED,
+            "asr_local_fallback_enabled": ASR_LOCAL_FALLBACK_ENABLED,
+            "asr_audio_fallback_enabled": ASR_AUDIO_FALLBACK_ENABLED,
            "faster_whisper": FASTER_WHISPER_MODEL,
            "local_asr": LOCAL_ASR_MODEL,
            "asr_fallback": ASR_FALLBACK_MODEL,
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -6,6 +6,7 @@ python-dotenv==1.0.1
 yt-dlp==2026.3.17
 openai==1.55.3
 httpx==0.27.2
+requests==2.32.5
 imagehash==4.3.1
 Pillow>=11.0
 numpy>=2.0