fix: make STT config env-overridable and fix doc issues
Code fixes:
- STT model, Groq base URL, and OpenAI STT base URL are now configurable via env vars (STT_GROQ_MODEL, STT_OPENAI_MODEL, GROQ_BASE_URL, STT_OPENAI_BASE_URL) instead of hardcoded
- Gateway and Discord VC now read stt.model from config.yaml (previously only CLI did this — gateway always used defaults)

Doc fixes:
- voice-mode.md: move Web UI troubleshooting to web.md (was duplicated)
- voice-mode.md: simplify "How It Works" for end users (remove NaCl, DAVE, RTP internals)
- voice-mode.md: clarify STT priority (OpenAI used first if both keys set, Groq recommended for free tier)
- voice-mode.md: document new STT env overrides in config reference
- web.md: remove duplicate Quick Start / Step 1-3 sections
- web.md: add mobile HTTPS mic workarounds (moved from voice-mode.md)
- web.md: clarify STT fallback order
This commit is contained in:
parent
79ed0effdd
commit
238a431545
5 changed files with 78 additions and 118 deletions
|
|
@ -881,7 +881,18 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||||
await asyncio.to_thread(VoiceReceiver.pcm_to_wav, pcm_data, wav_path)
|
await asyncio.to_thread(VoiceReceiver.pcm_to_wav, pcm_data, wav_path)
|
||||||
|
|
||||||
from tools.transcription_tools import transcribe_audio
|
from tools.transcription_tools import transcribe_audio
|
||||||
result = await asyncio.to_thread(transcribe_audio, wav_path)
|
# Read STT model from config.yaml
|
||||||
|
stt_model = None
|
||||||
|
try:
|
||||||
|
import yaml as _y
|
||||||
|
from pathlib import Path as _P
|
||||||
|
_cfg = _P(os.getenv("HERMES_HOME", _P.home() / ".hermes")) / "config.yaml"
|
||||||
|
if _cfg.exists():
|
||||||
|
with open(_cfg) as _f:
|
||||||
|
stt_model = (_y.safe_load(_f) or {}).get("stt", {}).get("model")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
result = await asyncio.to_thread(transcribe_audio, wav_path, model=stt_model)
|
||||||
|
|
||||||
if not result.get("success"):
|
if not result.get("success"):
|
||||||
return
|
return
|
||||||
|
|
|
||||||
|
|
@ -3326,11 +3326,23 @@ class GatewayRunner:
|
||||||
from tools.transcription_tools import transcribe_audio
|
from tools.transcription_tools import transcribe_audio
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
|
# Read STT model from config.yaml (same key the CLI uses)
|
||||||
|
stt_model = None
|
||||||
|
try:
|
||||||
|
import yaml as _y
|
||||||
|
_cfg = _hermes_home / "config.yaml"
|
||||||
|
if _cfg.exists():
|
||||||
|
with open(_cfg) as _f:
|
||||||
|
_data = _y.safe_load(_f) or {}
|
||||||
|
stt_model = _data.get("stt", {}).get("model")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
enriched_parts = []
|
enriched_parts = []
|
||||||
for path in audio_paths:
|
for path in audio_paths:
|
||||||
try:
|
try:
|
||||||
logger.debug("Transcribing user voice: %s", path)
|
logger.debug("Transcribing user voice: %s", path)
|
||||||
result = await asyncio.to_thread(transcribe_audio, path)
|
result = await asyncio.to_thread(transcribe_audio, path, model=stt_model)
|
||||||
if result["success"]:
|
if result["success"]:
|
||||||
transcript = result["transcript"]
|
transcript = result["transcript"]
|
||||||
enriched_parts.append(
|
enriched_parts.append(
|
||||||
|
|
|
||||||
|
|
@ -32,13 +32,13 @@ from typing import Optional, Dict, Any, Tuple
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Default STT models per provider
|
# Default STT models per provider (overridable via env)
|
||||||
DEFAULT_STT_MODEL = "whisper-1"
|
DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1")
|
||||||
DEFAULT_GROQ_STT_MODEL = "whisper-large-v3-turbo"
|
DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo")
|
||||||
|
|
||||||
# Provider endpoints
|
# Provider endpoints (overridable via env for proxies / self-hosted)
|
||||||
GROQ_BASE_URL = "https://api.groq.com/openai/v1"
|
GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
|
||||||
OPENAI_BASE_URL = "https://api.openai.com/v1"
|
OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1")
|
||||||
|
|
||||||
|
|
||||||
def _resolve_stt_provider() -> Tuple[Optional[str], Optional[str], str]:
|
def _resolve_stt_provider() -> Tuple[Optional[str], Optional[str], str]:
|
||||||
|
|
|
||||||
|
|
@ -78,8 +78,8 @@ Add to `~/.hermes/.env`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Speech-to-Text (at least one required)
|
# Speech-to-Text (at least one required)
|
||||||
GROQ_API_KEY=your-key # Groq Whisper — fast, free tier available (recommended)
|
GROQ_API_KEY=your-key # Groq Whisper — fast, free tier (recommended for most users)
|
||||||
VOICE_TOOLS_OPENAI_KEY=your-key # OpenAI Whisper — alternative
|
VOICE_TOOLS_OPENAI_KEY=your-key # OpenAI Whisper — used first if both keys are set
|
||||||
|
|
||||||
# Text-to-Speech (optional — Edge TTS works without any key)
|
# Text-to-Speech (optional — Edge TTS works without any key)
|
||||||
ELEVENLABS_API_KEY=your-key # ElevenLabs — premium quality
|
ELEVENLABS_API_KEY=your-key # ElevenLabs — premium quality
|
||||||
|
|
@ -327,16 +327,11 @@ You must be in a voice channel before running `/voice join`. The bot joins the s
|
||||||
|
|
||||||
When the bot joins a voice channel, it:
|
When the bot joins a voice channel, it:
|
||||||
|
|
||||||
1. **Captures audio** via Discord's UDP socket (RTP packets)
|
1. **Listens** to each user's audio stream independently
|
||||||
2. **Decrypts** using NaCl transport encryption (aead_xchacha20_poly1305_rtpsize)
|
2. **Detects silence** — 1.5s of silence after at least 0.5s of speech triggers processing
|
||||||
3. **Decrypts** DAVE end-to-end encryption (Discord Audio/Video Encryption)
|
3. **Transcribes** the audio via Whisper STT (Groq or OpenAI)
|
||||||
4. **Decodes** Opus audio to raw PCM (48kHz stereo, per-user decoder)
|
4. **Processes** through the full agent pipeline (session, tools, memory)
|
||||||
5. **Detects silence** — 1.5s of silence after at least 0.5s of speech triggers processing
|
5. **Speaks** the reply back in the voice channel via TTS
|
||||||
6. **Converts** PCM to 16kHz mono WAV via ffmpeg
|
|
||||||
7. **Transcribes** via Whisper STT (Groq or OpenAI)
|
|
||||||
8. **Processes** through the full agent pipeline (session, tools, memory)
|
|
||||||
9. **Generates TTS** reply audio
|
|
||||||
10. **Plays** the reply in the voice channel
|
|
||||||
|
|
||||||
### Text Channel Integration
|
### Text Channel Integration
|
||||||
|
|
||||||
|
|
@ -397,7 +392,13 @@ tts:
|
||||||
```bash
|
```bash
|
||||||
# Speech-to-Text providers
|
# Speech-to-Text providers
|
||||||
GROQ_API_KEY=... # Groq Whisper (recommended — fast, free tier)
|
GROQ_API_KEY=... # Groq Whisper (recommended — fast, free tier)
|
||||||
VOICE_TOOLS_OPENAI_KEY=... # OpenAI Whisper (alternative)
|
VOICE_TOOLS_OPENAI_KEY=... # OpenAI Whisper (used first if both set)
|
||||||
|
|
||||||
|
# STT advanced overrides (optional)
|
||||||
|
STT_GROQ_MODEL=whisper-large-v3-turbo # Override default Groq STT model
|
||||||
|
STT_OPENAI_MODEL=whisper-1 # Override default OpenAI STT model
|
||||||
|
GROQ_BASE_URL=https://api.groq.com/openai/v1 # Custom Groq endpoint
|
||||||
|
STT_OPENAI_BASE_URL=https://api.openai.com/v1 # Custom OpenAI STT endpoint
|
||||||
|
|
||||||
# Text-to-Speech providers (Edge TTS needs no key)
|
# Text-to-Speech providers (Edge TTS needs no key)
|
||||||
ELEVENLABS_API_KEY=... # ElevenLabs (premium quality)
|
ELEVENLABS_API_KEY=... # ElevenLabs (premium quality)
|
||||||
|
|
@ -464,63 +465,9 @@ The bot requires an @mention by default in server channels. Make sure you:
|
||||||
- Edge TTS (free, no key) is the default fallback
|
- Edge TTS (free, no key) is the default fallback
|
||||||
- Check logs for TTS errors
|
- Check logs for TTS errors
|
||||||
|
|
||||||
### Web UI not accessible from other devices on the network
|
### Web UI issues (firewall, mobile mic)
|
||||||
|
|
||||||
The macOS firewall may block incoming connections. Allow the gateway through:
|
See the [Web UI Troubleshooting](../messaging/web.md#troubleshooting) guide for firewall, HTTPS, and mobile microphone issues.
|
||||||
|
|
||||||
1. **System Settings** → **Network** → **Firewall** → **Options**
|
|
||||||
2. Add `/usr/local/bin/python3` (or your Python path) to the allowed list
|
|
||||||
3. Or temporarily disable the firewall for testing
|
|
||||||
|
|
||||||
On Linux, allow the port through `ufw`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
sudo ufw allow 8765/tcp
|
|
||||||
```
|
|
||||||
|
|
||||||
### Web UI microphone not working on mobile
|
|
||||||
|
|
||||||
Mobile browsers require **HTTPS** for microphone access (`navigator.mediaDevices` API). When accessing the Web UI over HTTP on a LAN IP (e.g. `http://192.168.1.x:8765`), the mic button will appear dimmed.
|
|
||||||
|
|
||||||
**Workarounds:**
|
|
||||||
|
|
||||||
**Android Chrome** — flag the LAN IP as secure:
|
|
||||||
1. Open `chrome://flags/#unsafely-treat-insecure-origin-as-secure`
|
|
||||||
2. Add your Web UI URL (e.g. `http://192.168.1.106:8765`)
|
|
||||||
3. Set to **Enabled** and relaunch Chrome
|
|
||||||
|
|
||||||
**iOS Safari / Chrome** — no flag bypass available. Use one of these instead:
|
|
||||||
|
|
||||||
1. **Self-signed HTTPS** with mkcert (recommended):
|
|
||||||
```bash
|
|
||||||
# Install mkcert
|
|
||||||
brew install mkcert
|
|
||||||
mkcert -install
|
|
||||||
|
|
||||||
# Generate cert for your LAN IP
|
|
||||||
mkcert 192.168.1.106
|
|
||||||
|
|
||||||
# Run a simple HTTPS reverse proxy (requires Node.js)
|
|
||||||
npx local-ssl-proxy --source 8443 --target 8765 \
|
|
||||||
--cert 192.168.1.106.pem --key 192.168.1.106-key.pem
|
|
||||||
```
|
|
||||||
Then access `https://192.168.1.106:8443` on your iPhone. You'll need to trust the mkcert root CA on iOS: **Settings → General → About → Certificate Trust Settings**.
|
|
||||||
|
|
||||||
2. **Caddy reverse proxy** (auto-HTTPS for local networks):
|
|
||||||
```bash
|
|
||||||
brew install caddy
|
|
||||||
caddy reverse-proxy --from https://192.168.1.106:8443 --to http://127.0.0.1:8765
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **SSH tunnel from mobile** (if you have an SSH client like Termius):
|
|
||||||
```bash
|
|
||||||
ssh -L 8765:127.0.0.1:8765 user@your-mac-ip
|
|
||||||
```
|
|
||||||
Then access `http://localhost:8765` on the mobile browser — localhost is exempt from HTTPS requirement.
|
|
||||||
|
|
||||||
:::tip
|
|
||||||
Text chat works on mobile over HTTP without any workaround — only the microphone feature requires HTTPS.
|
|
||||||
:::
|
|
||||||
|
|
||||||
### Whisper returns garbage text
|
### Whisper returns garbage text
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -61,46 +61,6 @@ The web UI starts automatically alongside your other platforms.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Step 1: Configure
|
|
||||||
|
|
||||||
Add to `~/.hermes/.env`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Enable Web UI
|
|
||||||
WEB_UI_ENABLED=true
|
|
||||||
|
|
||||||
# Port to listen on (default: 8765)
|
|
||||||
WEB_UI_PORT=8765
|
|
||||||
|
|
||||||
# Bind address (default: 0.0.0.0 = all interfaces, for LAN access)
|
|
||||||
# Set to 127.0.0.1 for localhost-only access
|
|
||||||
WEB_UI_HOST=0.0.0.0
|
|
||||||
|
|
||||||
# Access token (leave empty to auto-generate on each startup)
|
|
||||||
WEB_UI_TOKEN=your-secret-token
|
|
||||||
```
|
|
||||||
|
|
||||||
## Step 2: Start the Gateway
|
|
||||||
|
|
||||||
```bash
|
|
||||||
hermes gateway
|
|
||||||
```
|
|
||||||
|
|
||||||
You'll see output like:
|
|
||||||
|
|
||||||
```
|
|
||||||
[Web] Web UI: http://192.168.1.106:8765
|
|
||||||
[Web] Access token: your-secret-token
|
|
||||||
```
|
|
||||||
|
|
||||||
## Step 3: Open in Browser
|
|
||||||
|
|
||||||
1. Open the URL shown in the console on any device on the same network
|
|
||||||
2. Enter the access token
|
|
||||||
3. Start chatting
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
### Markdown & Code Highlighting
|
### Markdown & Code Highlighting
|
||||||
|
|
@ -111,7 +71,7 @@ Bot responses render full GitHub-flavored Markdown with syntax-highlighted code
|
||||||
|
|
||||||
Click the microphone button to record a voice message. The audio is transcribed via Whisper STT (using OpenAI, with Groq as the fallback) and sent to the agent. The bot automatically replies with audio playback — voice first, then the text response appears. No extra configuration needed.
|
Click the microphone button to record a voice message. The audio is transcribed via Whisper STT (using OpenAI, with Groq as the fallback) and sent to the agent. The bot automatically replies with audio playback — voice first, then the text response appears. No extra configuration needed.
|
||||||
|
|
||||||
STT priority: `VOICE_TOOLS_OPENAI_KEY` (OpenAI Whisper) > `GROQ_API_KEY` (Groq Whisper). TTS uses Edge TTS (free, no key) by default, or ElevenLabs/OpenAI if configured in `~/.hermes/config.yaml`.
|
STT uses `VOICE_TOOLS_OPENAI_KEY` (OpenAI Whisper) if set, otherwise falls back to `GROQ_API_KEY` (Groq Whisper, free tier). If you only need STT, setting `GROQ_API_KEY` is the simplest option. TTS uses Edge TTS (free, no key) by default, or ElevenLabs/OpenAI if configured in `~/.hermes/config.yaml`.
|
||||||
|
|
||||||
### Images & Files
|
### Images & Files
|
||||||
|
|
||||||
|
|
@ -211,6 +171,36 @@ WEB_UI_PORT=9000
|
||||||
- HTTPS is required for microphone access on non-localhost origins
|
- HTTPS is required for microphone access on non-localhost origins
|
||||||
- On localhost (`127.0.0.1`), HTTP works fine for microphone
|
- On localhost (`127.0.0.1`), HTTP works fine for microphone
|
||||||
|
|
||||||
|
### Microphone not working on mobile
|
||||||
|
|
||||||
|
Mobile browsers require **HTTPS** for microphone access (`navigator.mediaDevices` API). When accessing the Web UI over HTTP on a LAN IP (e.g. `http://192.168.1.x:8765`), the mic button will appear dimmed.
|
||||||
|
|
||||||
|
**Android Chrome** — flag the LAN IP as secure:
|
||||||
|
1. Open `chrome://flags/#unsafely-treat-insecure-origin-as-secure`
|
||||||
|
2. Add your Web UI URL (e.g. `http://192.168.1.106:8765`)
|
||||||
|
3. Set to **Enabled** and relaunch Chrome
|
||||||
|
|
||||||
|
**iOS Safari / Chrome** — no flag bypass available. Use one of these instead:
|
||||||
|
|
||||||
|
1. **Self-signed HTTPS** with mkcert (recommended):
|
||||||
|
```bash
|
||||||
|
brew install mkcert && mkcert -install
|
||||||
|
mkcert 192.168.1.106
|
||||||
|
npx local-ssl-proxy --source 8443 --target 8765 \
|
||||||
|
--cert 192.168.1.106.pem --key 192.168.1.106-key.pem
|
||||||
|
```
|
||||||
|
Then access `https://192.168.1.106:8443`. Trust the mkcert root CA on iOS: **Settings > General > About > Certificate Trust Settings**.
|
||||||
|
|
||||||
|
2. **SSH tunnel from mobile** (if you have Termius or similar):
|
||||||
|
```bash
|
||||||
|
ssh -L 8765:127.0.0.1:8765 user@your-mac-ip
|
||||||
|
```
|
||||||
|
Then access `http://localhost:8765` — localhost is exempt from the HTTPS requirement.
|
||||||
|
|
||||||
|
:::tip
|
||||||
|
Text chat works on mobile over HTTP without any workaround — only the microphone feature requires HTTPS.
|
||||||
|
:::
|
||||||
|
|
||||||
### CDN resources not loading
|
### CDN resources not loading
|
||||||
|
|
||||||
The UI loads `marked.js` and `highlight.js` from CDN. If you're offline or behind a restrictive proxy, markdown rendering and code highlighting won't work but basic chat still functions.
|
The UI loads `marked.js` and `highlight.js` from CDN. If you're offline or behind a restrictive proxy, markdown rendering and code highlighting won't work but basic chat still functions.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue