Do audio recognition

This commit is contained in:
Слонова Анна 2026-03-22 21:15:39 +03:00
parent 4f8e10df16
commit 471e9bcc3b
4 changed files with 191 additions and 28 deletions

View file

@ -9,3 +9,7 @@ ACCESS_TOKEN=syt_...
# Allowed rooms (comma-separated, no spaces) # Allowed rooms (comma-separated, no spaces)
ALLOWED_ROOMS=!roomid1:matrix.org,!roomid2:matrix.org ALLOWED_ROOMS=!roomid1:matrix.org,!roomid2:matrix.org
# Whisper settings
WHISPER_LANGUAGE=ru
WHISPER_MODEL=small

8
.gitignore vendored
View file

@ -4,6 +4,14 @@ __pycache__/
*.py[cod] *.py[cod]
*$py.class *$py.class
.idea/*
.idea
venv1
venv1/
venv1/*
venv1/**
# C extensions # C extensions
*.so *.so

205
main.py
View file

@ -3,6 +3,8 @@ import asyncio
import os import os
import tempfile import tempfile
import time import time
import shutil
import subprocess
from typing import Dict, Optional, Tuple from typing import Dict, Optional, Tuple
from dotenv import load_dotenv from dotenv import load_dotenv
@ -16,62 +18,182 @@ from nio import (
ErrorResponse, ErrorResponse,
) )
from faster_whisper import WhisperModel
load_dotenv() load_dotenv()
HOMESERVER = os.getenv("HOMESERVER", "https://matrix.org") HOMESERVER = os.getenv("HOMESERVER", "https://matrix.org")
USERNAME = os.getenv("MATRIX_USERNAME") USERNAME = os.getenv("MATRIX_USERNAME")
PASSWORD = os.getenv("PASSWORD") PASSWORD = os.getenv("PASSWORD")
ALLOWED_ROOMS = set(room.strip() for room in os.getenv("ALLOWED_ROOMS", "").split(",") if room.strip()) ALLOWED_ROOMS = set(room.strip() for room in os.getenv("ALLOWED_ROOMS", "").split(",") if room.strip())
WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "ru")
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "small")
TEMP_DIR = tempfile.gettempdir() TEMP_DIR = tempfile.gettempdir()
GROUPING_TIMEOUT = 15.0 GROUPING_TIMEOUT = 15.0
client: AsyncClient = None client: AsyncClient = None
pending_by_conversation: Dict[Tuple[str, str], Dict] = {} pending_by_conversation: Dict[Tuple[str, str], Dict] = {}
pending_by_event_id: Dict[str, Dict] = {} pending_by_event_id: Dict[str, Dict] = {}
whisper_model = None
async def process_audio(audio_bytes: bytes) -> str:
print(f"[AUDIO] Получено {len(audio_bytes)} байт аудио")
return "Placeholder"
async def process_image(image_bytes: bytes) -> str: def get_whisper_model():
print(f"[IMAGE] Получено {len(image_bytes)} байт изображения") global whisper_model
return "Placeholder" if whisper_model is None:
whisper_model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
print(f"Whisper модель {WHISPER_MODEL} загружена (faster-whisper).")
return whisper_model
async def generate_report(text: str, image_descriptions: list, audio_texts: list) -> str: def ffmpeg_available() -> bool:
found = shutil.which("ffmpeg") is not None
if not found:
print("[ERROR] ffmpeg не найден в системе. Установите ffmpeg и добавьте в PATH.")
return found
def get_file_extension(mimetype: str) -> str:
    """Map an audio MIME type to a temp-file extension.

    MIME parameters are stripped before lookup (e.g.
    "audio/ogg; codecs=opus" -> "audio/ogg"), and matching is
    case-insensitive.  Unknown types fall back to ".tmp"; ffmpeg
    detects the real container from content, so the extension is
    only a hint.
    """
    ext_map = {
        "audio/ogg": ".ogg",
        "audio/mpeg": ".mp3",
        "audio/mp4": ".m4a",
        "audio/x-m4a": ".m4a",
        "audio/wav": ".wav",
        "audio/x-wav": ".wav",
        "audio/webm": ".webm",
        "audio/flac": ".flac",
        "audio/aac": ".aac",
    }
    # Drop parameters such as "; codecs=opus" that Matrix clients may send.
    base = mimetype.split(";", 1)[0].strip().lower()
    return ext_map.get(base, ".tmp")
async def convert_to_wav(input_path: str) -> Optional[str]:
    """Convert an arbitrary audio file to 16 kHz mono PCM WAV via ffmpeg.

    Runs ffmpeg in a thread-pool executor so the event loop is not
    blocked.  Returns the path of the new WAV file on success, or
    None on failure (the partially written temp file is removed).
    The caller is responsible for deleting the returned file.
    """
    output_fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(output_fd)  # mkstemp opens the fd; ffmpeg writes by path
    cmd = [
        "ffmpeg", "-i", input_path,
        "-map", "0:a:0",        # first audio stream only
        "-map_metadata", "-1",  # drop metadata
        "-vn",                  # ignore video streams (embedded cover art)
        "-acodec", "pcm_s16le",
        "-ar", "16000",         # Whisper expects 16 kHz input
        "-ac", "1",             # mono
        "-y",                   # overwrite the mkstemp placeholder file
        output_path,
    ]
    loop = asyncio.get_running_loop()
    try:
        await loop.run_in_executor(
            None, lambda: subprocess.run(cmd, capture_output=True, check=True)
        )
        return output_path
    except subprocess.CalledProcessError as e:
        print(f"[AUDIO] Ошибка конвертации ffmpeg: {e.stderr.decode()}")
    except OSError as e:
        # ffmpeg may disappear between the availability check and this call;
        # previously this leaked the temp file and propagated the exception.
        print(f"[AUDIO] Не удалось запустить ffmpeg: {e}")
    if os.path.exists(output_path):
        os.unlink(output_path)
    return None
async def transcribe_audio(audio_bytes: bytes, mimetype: str) -> Optional[str]:
    """Transcribe raw audio bytes with faster-whisper.

    Pipeline: dump bytes to a temp file, convert to 16 kHz mono WAV
    with ffmpeg, then run the lazily loaded Whisper model in a
    thread-pool executor so the event loop stays responsive.
    Returns the recognized text, or None on any failure.  Both temp
    files are always removed in the ``finally`` block.
    """
    if not ffmpeg_available():
        print("[AUDIO] Ошибка: ffmpeg не установлен.")
        return None
    ext = get_file_extension(mimetype)
    loop = asyncio.get_running_loop()
    # delete=False: the file must outlive the `with` so ffmpeg can read it.
    with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
        tmp.write(audio_bytes)
        input_path = tmp.name
    wav_path = None
    try:
        wav_path = await convert_to_wav(input_path)
        if not wav_path:
            print("[AUDIO] Конвертация в WAV не удалась.")
            return None
        model = get_whisper_model()
        # transcribe() is CPU-heavy — run it off the event loop.
        segments, _info = await loop.run_in_executor(
            None,
            lambda: model.transcribe(wav_path, beam_size=5, language=WHISPER_LANGUAGE)
        )
        text = " ".join(segment.text for segment in segments)
        return text.strip()
    except Exception as e:
        # Best-effort: a failed transcription must not crash the bot.
        print(f"[AUDIO] Ошибка при распознавании: {e}")
        return None
    finally:
        if os.path.exists(input_path):
            os.unlink(input_path)
        if wav_path and os.path.exists(wav_path):
            os.unlink(wav_path)
async def process_audio(audio_data: Dict) -> str:
    """Transcribe one downloaded audio attachment.

    Expects a dict with "bytes" (raw file content) and an optional
    "mimetype" (defaults to "audio/ogg", the usual Matrix voice
    message type).  Returns the recognized text, or "" on failure.
    """
    raw = audio_data["bytes"]
    mime = audio_data.get("mimetype", "audio/ogg")
    print(f"[AUDIO] Получено {len(raw)} байт аудио, тип: {mime}")
    recognized = await transcribe_audio(raw, mime)
    if recognized is None:
        print("[AUDIO] Распознавание не удалось.")
        return ""
    return recognized
async def process_image(image_data: Dict) -> str:
    """Stub for image handling; captioning is not implemented yet.

    Logs the payload size and returns a fixed placeholder string.
    """
    print(f"[IMAGE] Получено {len(image_data['bytes'])} байт изображения")
    placeholder = "[Описание изображения будет добавлено позже]"
    return placeholder
async def generate_report(text: str, images_data: list, audios_data: list) -> str:
    """Build the reply message from text, audio transcripts and image stubs.

    Transcribes every audio attachment and describes every image
    (sequentially), then assembles the non-empty sections.  Returns a
    fixed failure message when nothing could be extracted at all.
    """
    audio_texts = []
    for audio in audios_data:
        audio_text = await process_audio(audio)
        if audio_text:
            audio_texts.append(audio_text)
    image_descriptions = []
    for img in images_data:
        desc = await process_image(img)
        if desc:
            image_descriptions.append(desc)
    parts = []
    if text:
        parts.append(f"**Текст сообщения:**\n{text}")
    if audio_texts:
        parts.append("**Распознанный текст из аудио:**\n" + "\n\n".join(audio_texts))
    if image_descriptions:
        parts.append("**Описания изображений:**\n" + "\n".join(image_descriptions))
    if not parts:
        return "Не удалось обработать сообщение (нет текста, не распознано аудио или ошибка)."
    print(f"[REPORT] text: {text}, images: {len(image_descriptions)}, audio: {len(audio_texts)}")
    return "\n\n".join(parts)
async def send_error_message(room_id: str, error_text: str): async def send_error_message(room_id: str, error_text: str):
await client.room_send( await client.room_send(
room_id, room_id,
"m.room.message", "m.room.message",
{"msgtype": "m.text", "body": f"❌ Ошибка: {error_text}"} {"msgtype": "m.text", "body": f" {error_text}"}
) )
async def process_complete_message(data: Dict): async def process_complete_message(data: Dict):
room_id = data["room_id"] room_id = data["room_id"]
image_descriptions = [] # Объединяем все текстовые сообщения, которые были в этой группе
for img_bytes in data.get("images", []): text_parts = data.get("text", [])
desc = await process_image(img_bytes) text = "\n".join(text_parts) if text_parts else ""
image_descriptions.append(desc) images_data = data.get("images", [])
audio_texts = [] audios_data = data.get("audio", [])
for aud_bytes in data.get("audio", []):
text = await process_audio(aud_bytes) report = await generate_report(text, images_data, audios_data)
audio_texts.append(text)
report = await generate_report(data.get("text", ""), image_descriptions, audio_texts)
await client.room_send( await client.room_send(
room_id, room_id,
"m.room.message", "m.room.message",
{"msgtype": "m.text", "body": report} {"msgtype": "m.text", "body": report}
) )
if "event_id" in data: if "event_id" in data:
pending_by_event_id.pop(data["event_id"], None) pending_by_event_id.pop(data["event_id"], None)
pending_by_conversation.pop((room_id, data["sender"]), None) pending_by_conversation.pop((room_id, data["sender"]), None)
@ -95,7 +217,7 @@ def get_or_create_pending(room_id: str, sender: str, event_id: Optional[str] = N
data = { data = {
"room_id": room_id, "room_id": room_id,
"sender": sender, "sender": sender,
"text": None, "text": [], # список строк, а не одна строка
"images": [], "images": [],
"audio": [], "audio": [],
"timestamp": time.time(), "timestamp": time.time(),
@ -124,7 +246,8 @@ async def on_text_message(room, event: RoomMessageText):
event_id = event.event_id event_id = event.event_id
data = get_or_create_pending(room.room_id, event.sender, event_id) data = get_or_create_pending(room.room_id, event.sender, event_id)
data["text"] = event.body # Добавляем текст в список, а не заменяем
data["text"].append(event.body)
reset_timer(data) reset_timer(data)
print(f"[TEXT] Добавлен текст в сообщение от {event.sender}: {event.body}") print(f"[TEXT] Добавлен текст в сообщение от {event.sender}: {event.body}")
@ -145,11 +268,20 @@ async def on_image_message(room, event: RoomMessageImage):
download_result = await client.download(event.url) download_result = await client.download(event.url)
if isinstance(download_result, ErrorResponse): if isinstance(download_result, ErrorResponse):
print(f"Ошибка скачивания изображения: {download_result.status_code}") print(f"[IMAGE] Ошибка скачивания: {download_result.status_code} - {download_result.message}")
await send_error_message(room.room_id, "Не удалось загрузить изображение.") await send_error_message(room.room_id, "Не удалось загрузить изображение.")
return return
data["images"].append(download_result.body) mimetype = getattr(event, "mimetype", None)
if not mimetype and hasattr(event, "info") and isinstance(event.info, dict):
mimetype = event.info.get("mimetype")
if not mimetype:
mimetype = "image/jpeg"
data["images"].append({
"bytes": download_result.body,
"mimetype": mimetype,
})
reset_timer(data) reset_timer(data)
print(f"[IMAGE] Добавлено изображение в сообщение от {event.sender}") print(f"[IMAGE] Добавлено изображение в сообщение от {event.sender}")
@ -170,11 +302,20 @@ async def on_audio_message(room, event: RoomMessageAudio):
download_result = await client.download(event.url) download_result = await client.download(event.url)
if isinstance(download_result, ErrorResponse): if isinstance(download_result, ErrorResponse):
print(f"Ошибка скачивания аудио: {download_result.status_code}") print(f"[AUDIO] Ошибка скачивания: {download_result.status_code} - {download_result.message}")
await send_error_message(room.room_id, "Не удалось загрузить аудио.") await send_error_message(room.room_id, "Не удалось загрузить аудио.")
return return
data["audio"].append(download_result.body) mimetype = None
if hasattr(event, "info") and isinstance(event.info, dict):
mimetype = event.info.get("mimetype")
if not mimetype:
mimetype = "audio/ogg"
data["audio"].append({
"bytes": download_result.body,
"mimetype": mimetype,
})
reset_timer(data) reset_timer(data)
print(f"[AUDIO] Добавлено аудио в сообщение от {event.sender}") print(f"[AUDIO] Добавлено аудио в сообщение от {event.sender}")
@ -210,6 +351,15 @@ async def main():
print(f"Исключение при авторизации: {e}") print(f"Исключение при авторизации: {e}")
return return
if not ffmpeg_available():
print("ВНИМАНИЕ: ffmpeg не найден. Бот не сможет распознавать аудио.")
print("Установите ffmpeg (https://ffmpeg.org/download.html) и добавьте в PATH.")
else:
print("ffmpeg найден, аудио будет обрабатываться.")
if WHISPER_LANGUAGE:
print(f"Язык распознавания: {WHISPER_LANGUAGE}")
client.add_event_callback(on_text_message, RoomMessageText) client.add_event_callback(on_text_message, RoomMessageText)
client.add_event_callback(on_image_message, RoomMessageImage) client.add_event_callback(on_image_message, RoomMessageImage)
client.add_event_callback(on_audio_message, RoomMessageAudio) client.add_event_callback(on_audio_message, RoomMessageAudio)
@ -222,5 +372,6 @@ async def main():
finally: finally:
await client.close() await client.close()
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())

Binary file not shown.