feat: auto-detect language and prefer gpu for windows asr

2026-04-06 11:14:12 +08:00
parent 7698b5e1e4
commit 4ff7efb61c
4 changed files with 144 additions and 17 deletions
--- a/deploy/storyforge-windows-asr-http/app.py
+++ b/deploy/storyforge-windows-asr-http/app.py
@@ -10,9 +10,6 @@ from fastapi import FastAPI, File, UploadFile
 from fastapi.responses import JSONResponse

 MODEL_NAME = os.getenv("WHISPER_MODEL", "base")
-LANGUAGE = os.getenv("WHISPER_LANGUAGE", "zh")
-DEVICE = os.getenv("WHISPER_DEVICE", "cpu")
-COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "int8")
 BEAM_SIZE = int(os.getenv("WHISPER_BEAM_SIZE", "5"))
 VAD_FILTER = os.getenv("WHISPER_VAD_FILTER", "1").strip().lower() not in {"0", "false", "no"}
 DOWNLOAD_ROOT = Path(os.getenv("WHISPER_DOWNLOAD_ROOT", str(Path(__file__).resolve().parent / "models-cache")))
@@ -20,17 +17,59 @@ DOWNLOAD_ROOT = Path(os.getenv("WHISPER_DOWNLOAD_ROOT", str(Path(__file__).resol
 app = FastAPI(title="storyforge-windows-asr", version="1.0.0")


+def describe_language_mode() -> str:
+    """Return the configured transcription language, or ``"auto"``.
+
+    ``WHISPER_LANGUAGE`` is read on every call. An unset or blank value, or
+    the keywords ``auto`` / ``detect`` in any case, yield ``"auto"``. Any
+    other value is returned stripped and lower-cased, so ``ZH`` and ``zh``
+    select the same language (mirrors how ``WHISPER_DEVICE`` is normalised).
+    """
+    value = (os.getenv("WHISPER_LANGUAGE", "") or "").strip().lower()
+    if value in {"", "auto", "detect"}:
+        return "auto"
+    return value
+
+
+def resolve_language() -> str | None:
+    """Return the configured language, or ``None`` when it is ``"auto"``.
+
+    Thin adapter over ``describe_language_mode()``: the descriptive string
+    ``"auto"`` is mapped to ``None``; every other value passes through.
+    """
+    mode = describe_language_mode()
+    if mode == "auto":
+        return None
+    return mode
+
+
+def describe_device_mode() -> str:
+    """Return the device requested via ``WHISPER_DEVICE``, or ``"auto"``.
+
+    The value is stripped and lower-cased; an unset or blank variable is
+    reported as ``"auto"``.
+    """
+    raw = os.getenv("WHISPER_DEVICE", "") or ""
+    normalized = raw.strip().lower()
+    if not normalized:
+        return "auto"
+    return normalized
+
+
+def describe_compute_mode() -> str:
+    """Return the compute type requested via ``WHISPER_COMPUTE_TYPE``.
+
+    The value is stripped and lower-cased, matching how ``WHISPER_DEVICE``
+    is normalised, so ``AUTO`` is recognised as ``"auto"`` and ``INT8`` as
+    ``"int8"``. An unset or blank variable is reported as ``"auto"``.
+    """
+    value = (os.getenv("WHISPER_COMPUTE_TYPE", "") or "").strip().lower()
+    return value or "auto"
+
+
+def build_runtime_profiles() -> list[tuple[str, str]]:
+    """Return the ``(device, compute_type)`` pairs to try, in order.
+
+    * Explicit device: a single pair, using ``"int8"`` when the compute
+      type is left on auto.
+    * Auto device, explicit compute type: ``cuda`` then ``cpu``, both with
+      that compute type.
+    * Both on auto: ``("cuda", "int8_float16")`` then ``("cpu", "int8")``.
+    """
+    requested_device = describe_device_mode()
+    requested_compute = describe_compute_mode()
+    compute_is_auto = requested_compute == "auto"
+
+    if requested_device == "auto":
+        if compute_is_auto:
+            return [("cuda", "int8_float16"), ("cpu", "int8")]
+        return [(name, requested_compute) for name in ("cuda", "cpu")]
+
+    return [(requested_device, "int8" if compute_is_auto else requested_compute)]
+
+
@lru_cache(maxsize=1)
 def get_model():
    from faster_whisper import WhisperModel

    DOWNLOAD_ROOT.mkdir(parents=True, exist_ok=True)
-    return WhisperModel(
-        MODEL_NAME,
-        device=DEVICE,
-        compute_type=COMPUTE_TYPE,
-        download_root=str(DOWNLOAD_ROOT),
-    )
+    last_error: Exception | None = None
+    for device, compute_type in build_runtime_profiles():
+        try:
+            model = WhisperModel(
+                MODEL_NAME,
+                device=device,
+                compute_type=compute_type,
+                download_root=str(DOWNLOAD_ROOT),
+            )
+            app.state.runtime_device = device
+            app.state.runtime_compute_type = compute_type
+            return model
+        except Exception as exc:  # pragma: no cover - exercised on real hosts
+            last_error = exc
+    assert last_error is not None
+    raise last_error


@app.get("/health")
@@ -39,9 +78,11 @@ def health() -> dict[str, object]:
        "status": "ok",
        "service": "storyforge-windows-asr",
        "model_name": MODEL_NAME,
-        "language": LANGUAGE,
-        "device": DEVICE,
-        "compute_type": COMPUTE_TYPE,
+        "language": describe_language_mode(),
+        "device": describe_device_mode(),
+        "compute_type": describe_compute_mode(),
+        "active_device": getattr(app.state, "runtime_device", ""),
+        "active_compute_type": getattr(app.state, "runtime_compute_type", ""),
        "download_root": str(DOWNLOAD_ROOT),
        "model_loaded": get_model.cache_info().currsize > 0,
    }
@@ -62,9 +103,9 @@ async def transcribe(wav: UploadFile = File(...)):

    try:
        model = get_model()
-        segments, _info = model.transcribe(
+        segments, info = model.transcribe(
            str(temp_path),
-            language=LANGUAGE or None,
+            language=resolve_language(),
            beam_size=max(1, BEAM_SIZE),
            vad_filter=VAD_FILTER,
        )
@@ -74,6 +115,8 @@ async def transcribe(wav: UploadFile = File(...)):
            "text": text,
            "success": bool(text),
            "duration_ms": duration_ms,
+            "detected_language": getattr(info, "language", None),
+            "detected_language_probability": getattr(info, "language_probability", None),
            "error_message": None if text else "empty transcription",
        }
    except Exception as exc:
--- a/deploy/storyforge-windows-asr-http/run.ps1
+++ b/deploy/storyforge-windows-asr-http/run.ps1
@@ -13,9 +13,9 @@ $venvPython = Join-Path $venvDir "Scripts\python.exe"
 & $venvPython -m pip install -r (Join-Path $scriptDir "requirements.txt")

 $env:WHISPER_MODEL = if ($env:WHISPER_MODEL) { $env:WHISPER_MODEL } else { "base" }
-$env:WHISPER_LANGUAGE = if ($env:WHISPER_LANGUAGE) { $env:WHISPER_LANGUAGE } else { "zh" }
-$env:WHISPER_DEVICE = if ($env:WHISPER_DEVICE) { $env:WHISPER_DEVICE } else { "cpu" }
-$env:WHISPER_COMPUTE_TYPE = if ($env:WHISPER_COMPUTE_TYPE) { $env:WHISPER_COMPUTE_TYPE } else { "int8" }
+$env:WHISPER_LANGUAGE = if ($env:WHISPER_LANGUAGE) { $env:WHISPER_LANGUAGE } else { "" }
+$env:WHISPER_DEVICE = if ($env:WHISPER_DEVICE) { $env:WHISPER_DEVICE } else { "auto" }
+$env:WHISPER_COMPUTE_TYPE = if ($env:WHISPER_COMPUTE_TYPE) { $env:WHISPER_COMPUTE_TYPE } else { "" }
 $env:WHISPER_BEAM_SIZE = if ($env:WHISPER_BEAM_SIZE) { $env:WHISPER_BEAM_SIZE } else { "5" }
 $env:WHISPER_VAD_FILTER = if ($env:WHISPER_VAD_FILTER) { $env:WHISPER_VAD_FILTER } else { "1" }
 $env:WHISPER_DOWNLOAD_ROOT = if ($env:WHISPER_DOWNLOAD_ROOT) { $env:WHISPER_DOWNLOAD_ROOT } else { (Join-Path $scriptDir "models-cache") }