feat: add continuous voice mode with VAD silence detection
- Voice mode: press mic once to enter, press again to exit
- VAD (Voice Activity Detection) auto-stops recording after 1.5s silence
- Continuous loop: speak → transcribe → agent responds → TTS plays → auto-listen
- Voice mode UI: input bar hides, large mic button centered
- Auto-restart listening when TTS playback finishes
- Fallback: restart listening on text response if no TTS arrives
This commit is contained in:
parent
d3e09df01a
commit
c477f660da
1 changed files with 102 additions and 6 deletions
|
|
@ -947,6 +947,21 @@ body {
|
||||||
animation: pulse 1.5s infinite;
|
animation: pulse 1.5s infinite;
|
||||||
box-shadow: 0 0 16px rgba(255,107,107,0.4);
|
box-shadow: 0 0 16px rgba(255,107,107,0.4);
|
||||||
}
|
}
|
||||||
|
#voice-btn.voice-mode {
|
||||||
|
background: var(--accent);
|
||||||
|
border-color: var(--accent);
|
||||||
|
color: #fff;
|
||||||
|
box-shadow: 0 0 16px var(--accent-glow);
|
||||||
|
}
|
||||||
|
#input-bar.voice-mode-active {
|
||||||
|
justify-content: center;
|
||||||
|
}
|
||||||
|
#input-bar.voice-mode-active #input,
|
||||||
|
#input-bar.voice-mode-active #send-btn { display: none; }
|
||||||
|
#input-bar.voice-mode-active #voice-btn {
|
||||||
|
width: 56px; height: 56px;
|
||||||
|
}
|
||||||
|
#input-bar.voice-mode-active #voice-btn svg { width: 26px; height: 26px; }
|
||||||
@keyframes pulse {
|
@keyframes pulse {
|
||||||
0%, 100% { opacity: 1; }
|
0%, 100% { opacity: 1; }
|
||||||
50% { opacity: 0.6; }
|
50% { opacity: 0.6; }
|
||||||
|
|
@ -1014,6 +1029,9 @@ let authToken = '';
|
||||||
let isRecording = false;
|
let isRecording = false;
|
||||||
let mediaRecorder = null;
|
let mediaRecorder = null;
|
||||||
let audioChunks = [];
|
let audioChunks = [];
|
||||||
|
let voiceMode = false;
|
||||||
|
let voiceAwaitingResponse = false;
|
||||||
|
let currentTtsAudio = null;
|
||||||
let typingTimeout = null;
|
let typingTimeout = null;
|
||||||
let autoScroll = true;
|
let autoScroll = true;
|
||||||
|
|
||||||
|
|
@ -1090,6 +1108,16 @@ function handleServerMessage(data) {
|
||||||
case 'message':
|
case 'message':
|
||||||
hideTyping();
|
hideTyping();
|
||||||
addBotMessage(data.id, data.content, data.timestamp);
|
addBotMessage(data.id, data.content, data.timestamp);
|
||||||
|
// In voice mode, if no TTS audio is coming, restart listening after text
|
||||||
|
if (voiceMode && voiceAwaitingResponse) {
|
||||||
|
// Give a short delay for play_audio to arrive
|
||||||
|
setTimeout(() => {
|
||||||
|
if (voiceMode && voiceAwaitingResponse && !currentTtsAudio) {
|
||||||
|
voiceAwaitingResponse = false;
|
||||||
|
startRecording();
|
||||||
|
}
|
||||||
|
}, 2000);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'edit':
|
case 'edit':
|
||||||
|
|
@ -1122,7 +1150,16 @@ function handleServerMessage(data) {
|
||||||
|
|
||||||
case 'play_audio':
|
case 'play_audio':
|
||||||
// Invisible TTS playback — no UI element, just play audio
|
// Invisible TTS playback — no UI element, just play audio
|
||||||
{ const a = new Audio(data.url); a.play().catch(() => {}); }
|
{
|
||||||
|
const a = new Audio(data.url);
|
||||||
|
currentTtsAudio = a;
|
||||||
|
voiceAwaitingResponse = false;
|
||||||
|
a.onended = () => {
|
||||||
|
currentTtsAudio = null;
|
||||||
|
if (voiceMode) startRecording();
|
||||||
|
};
|
||||||
|
a.play().catch(() => { currentTtsAudio = null; if (voiceMode) startRecording(); });
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'error':
|
case 'error':
|
||||||
|
|
@ -1155,28 +1192,63 @@ function autoGrow(el) {
|
||||||
el.style.height = Math.min(el.scrollHeight, 120) + 'px';
|
el.style.height = Math.min(el.scrollHeight, 120) + 'px';
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Voice Recording ---
|
// --- Voice Mode & Recording with VAD ---
|
||||||
|
const SILENCE_THRESHOLD = 0.015;
|
||||||
|
const SILENCE_DURATION = 1500; // ms of silence to auto-stop
|
||||||
|
|
||||||
async function toggleVoice() {
|
async function toggleVoice() {
|
||||||
if (isRecording) {
|
if (voiceMode) {
|
||||||
stopRecording();
|
exitVoiceMode();
|
||||||
} else {
|
} else {
|
||||||
await startRecording();
|
enterVoiceMode();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function enterVoiceMode() {
|
||||||
|
voiceMode = true;
|
||||||
|
document.getElementById('voice-btn').classList.add('voice-mode');
|
||||||
|
document.getElementById('input-bar').classList.add('voice-mode-active');
|
||||||
|
startRecording();
|
||||||
|
}
|
||||||
|
|
||||||
|
function exitVoiceMode() {
|
||||||
|
voiceMode = false;
|
||||||
|
voiceAwaitingResponse = false;
|
||||||
|
document.getElementById('voice-btn').classList.remove('voice-mode');
|
||||||
|
document.getElementById('input-bar').classList.remove('voice-mode-active');
|
||||||
|
if (currentTtsAudio) { currentTtsAudio.pause(); currentTtsAudio = null; }
|
||||||
|
stopRecording();
|
||||||
|
}
|
||||||
|
|
||||||
async function startRecording() {
|
async function startRecording() {
|
||||||
|
if (isRecording) return;
|
||||||
try {
|
try {
|
||||||
const stream = await navigator.mediaDevices.getUserMedia({audio: true});
|
const stream = await navigator.mediaDevices.getUserMedia({audio: true});
|
||||||
audioChunks = [];
|
audioChunks = [];
|
||||||
|
|
||||||
|
// Set up VAD with AnalyserNode
|
||||||
|
const audioCtx = new (window.AudioContext || window.webkitAudioContext)();
|
||||||
|
const source = audioCtx.createMediaStreamSource(stream);
|
||||||
|
const analyser = audioCtx.createAnalyser();
|
||||||
|
analyser.fftSize = 512;
|
||||||
|
source.connect(analyser);
|
||||||
|
const dataArray = new Float32Array(analyser.fftSize);
|
||||||
|
let silenceStart = null;
|
||||||
|
let hasSpoken = false;
|
||||||
|
let vadActive = true;
|
||||||
|
|
||||||
mediaRecorder = new MediaRecorder(stream, {mimeType: 'audio/webm;codecs=opus'});
|
mediaRecorder = new MediaRecorder(stream, {mimeType: 'audio/webm;codecs=opus'});
|
||||||
mediaRecorder.ondataavailable = (e) => { if (e.data.size > 0) audioChunks.push(e.data); };
|
mediaRecorder.ondataavailable = (e) => { if (e.data.size > 0) audioChunks.push(e.data); };
|
||||||
mediaRecorder.onstop = async () => {
|
mediaRecorder.onstop = async () => {
|
||||||
|
vadActive = false;
|
||||||
|
audioCtx.close();
|
||||||
stream.getTracks().forEach(t => t.stop());
|
stream.getTracks().forEach(t => t.stop());
|
||||||
if (audioChunks.length === 0) return;
|
if (audioChunks.length === 0 || !hasSpoken) return;
|
||||||
const blob = new Blob(audioChunks, {type: 'audio/webm'});
|
const blob = new Blob(audioChunks, {type: 'audio/webm'});
|
||||||
const reader = new FileReader();
|
const reader = new FileReader();
|
||||||
reader.onloadend = () => {
|
reader.onloadend = () => {
|
||||||
const b64 = reader.result.split(',')[1];
|
const b64 = reader.result.split(',')[1];
|
||||||
|
voiceAwaitingResponse = true;
|
||||||
ws.send(JSON.stringify({type: 'voice', audio: b64, format: 'webm'}));
|
ws.send(JSON.stringify({type: 'voice', audio: b64, format: 'webm'}));
|
||||||
};
|
};
|
||||||
reader.readAsDataURL(blob);
|
reader.readAsDataURL(blob);
|
||||||
|
|
@ -1184,8 +1256,32 @@ async function startRecording() {
|
||||||
mediaRecorder.start();
|
mediaRecorder.start();
|
||||||
isRecording = true;
|
isRecording = true;
|
||||||
document.getElementById('voice-btn').classList.add('recording');
|
document.getElementById('voice-btn').classList.add('recording');
|
||||||
|
|
||||||
|
// VAD loop — detect silence to auto-stop
|
||||||
|
function checkVAD() {
|
||||||
|
if (!vadActive || !isRecording) return;
|
||||||
|
analyser.getFloatTimeDomainData(dataArray);
|
||||||
|
let sum = 0;
|
||||||
|
for (let i = 0; i < dataArray.length; i++) sum += dataArray[i] * dataArray[i];
|
||||||
|
const rms = Math.sqrt(sum / dataArray.length);
|
||||||
|
|
||||||
|
if (rms > SILENCE_THRESHOLD) {
|
||||||
|
hasSpoken = true;
|
||||||
|
silenceStart = null;
|
||||||
|
} else if (hasSpoken) {
|
||||||
|
if (!silenceStart) silenceStart = Date.now();
|
||||||
|
else if (Date.now() - silenceStart > SILENCE_DURATION) {
|
||||||
|
stopRecording();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
requestAnimationFrame(checkVAD);
|
||||||
|
}
|
||||||
|
requestAnimationFrame(checkVAD);
|
||||||
|
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
addSystemMessage('Microphone access denied.');
|
addSystemMessage('Microphone access denied.');
|
||||||
|
if (voiceMode) exitVoiceMode();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue