Do audio recognition
This commit is contained in:
parent
4f8e10df16
commit
471e9bcc3b
4 changed files with 191 additions and 28 deletions
|
|
@ -8,4 +8,8 @@ PASSWORD=
|
|||
ACCESS_TOKEN=syt_...
|
||||
|
||||
# Allowed rooms (comma-separated, no spaces)
|
||||
ALLOWED_ROOMS=!roomid1:matrix.org,!roomid2:matrix.org
|
||||
ALLOWED_ROOMS=!roomid1:matrix.org,!roomid2:matrix.org
|
||||
|
||||
# Whisper settings
|
||||
WHISPER_LANGUAGE=ru
|
||||
WHISPER_MODEL=small
|
||||
8
.gitignore
vendored
8
.gitignore
vendored
|
|
@ -4,6 +4,14 @@ __pycache__/
|
|||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
.idea/*
|
||||
.idea
|
||||
|
||||
venv1
|
||||
venv1/
|
||||
venv1/*
|
||||
venv1/**
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
|
|
|
|||
205
main.py
205
main.py
|
|
@ -3,6 +3,8 @@ import asyncio
|
|||
import os
|
||||
import tempfile
|
||||
import time
|
||||
import shutil
|
||||
import subprocess
|
||||
from typing import Dict, Optional, Tuple
|
||||
from dotenv import load_dotenv
|
||||
|
||||
|
|
@ -16,62 +18,182 @@ from nio import (
|
|||
ErrorResponse,
|
||||
)
|
||||
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
load_dotenv()
|
||||
|
||||
HOMESERVER = os.getenv("HOMESERVER", "https://matrix.org")
|
||||
USERNAME = os.getenv("MATRIX_USERNAME")
|
||||
PASSWORD = os.getenv("PASSWORD")
|
||||
ALLOWED_ROOMS = set(room.strip() for room in os.getenv("ALLOWED_ROOMS", "").split(",") if room.strip())
|
||||
WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "ru")
|
||||
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "small")
|
||||
|
||||
TEMP_DIR = tempfile.gettempdir()
|
||||
GROUPING_TIMEOUT = 15.0
|
||||
|
||||
client: AsyncClient = None
|
||||
|
||||
pending_by_conversation: Dict[Tuple[str, str], Dict] = {}
|
||||
pending_by_event_id: Dict[str, Dict] = {}
|
||||
|
||||
|
||||
async def process_audio(audio_bytes: bytes) -> str:
|
||||
print(f"[AUDIO] Получено {len(audio_bytes)} байт аудио")
|
||||
return "Placeholder"
|
||||
whisper_model = None
|
||||
|
||||
|
||||
async def process_image(image_bytes: bytes) -> str:
|
||||
print(f"[IMAGE] Получено {len(image_bytes)} байт изображения")
|
||||
return "Placeholder"
|
||||
def get_whisper_model():
    """Return the shared Whisper model, constructing it on first use.

    The instance is cached in the module-level ``whisper_model`` variable, so
    the model named by ``WHISPER_MODEL`` is built at most once per process
    (CPU device, int8 compute type).
    """
    global whisper_model
    # Fast path: the model was already created by an earlier call.
    if whisper_model is not None:
        return whisper_model
    whisper_model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
    print(f"Whisper модель {WHISPER_MODEL} загружена (faster-whisper).")
    return whisper_model
|
||||
|
||||
|
||||
async def generate_report(text: str, image_descriptions: list, audio_texts: list) -> str:
|
||||
def ffmpeg_available() -> bool:
    """Tell whether an ``ffmpeg`` executable can be located via PATH.

    Prints an error message when the executable is missing.

    Returns:
        True if ``shutil.which`` finds ``ffmpeg``, False otherwise.
    """
    if shutil.which("ffmpeg") is None:
        print("[ERROR] ffmpeg не найден в системе. Установите ffmpeg и добавьте в PATH.")
        return False
    return True
|
||||
|
||||
|
||||
def get_file_extension(mimetype: str) -> str:
    """Map an audio MIME type to a file suffix for the temporary input file.

    Only the ``type/subtype`` part is considered: parameters after ``;``
    (for example ``audio/ogg; codecs=opus``), surrounding whitespace and
    letter case are ignored.

    Args:
        mimetype: MIME type reported for the audio; may be empty or None.

    Returns:
        The matching extension including the leading dot, or ``".tmp"`` when
        the type is missing or not in the table.
    """
    ext_map = {
        "audio/ogg": ".ogg",
        "audio/opus": ".opus",
        "audio/mpeg": ".mp3",
        "audio/mp4": ".m4a",
        "audio/x-m4a": ".m4a",
        "audio/aac": ".aac",
        "audio/flac": ".flac",
        "audio/wav": ".wav",
        "audio/x-wav": ".wav",
        "audio/wave": ".wav",
        "audio/webm": ".webm",
    }
    if not mimetype:
        return ".tmp"
    # Drop any ";parameter" tail and normalise before the lookup.
    essence = mimetype.split(";", 1)[0].strip().lower()
    return ext_map.get(essence, ".tmp")
|
||||
|
||||
|
||||
async def convert_to_wav(input_path: str) -> Optional[str]:
    """Convert an audio file to 16 kHz mono 16-bit PCM WAV using ffmpeg.

    ffmpeg is run in the default executor so the event loop is not blocked
    while the conversion is in progress.

    Args:
        input_path: Path of the source audio file.

    Returns:
        Path of a newly created temporary ``.wav`` file (the caller is
        responsible for deleting it), or None if the conversion failed.
        On any failure the temporary output file is removed.
    """
    output_fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(output_fd)
    cmd = [
        "ffmpeg", "-i", input_path,
        "-map", "0:a:0",          # first audio stream only
        "-map_metadata", "-1",    # strip metadata
        "-vn",                    # no video
        "-acodec", "pcm_s16le",
        "-ar", "16000",
        "-ac", "1",
        "-y",                     # overwrite the file created by mkstemp
        output_path,
    ]

    def _run_ffmpeg():
        # stdin is detached so ffmpeg never tries to read the bot's terminal.
        return subprocess.run(
            cmd,
            stdin=subprocess.DEVNULL,
            capture_output=True,
            check=True,
        )

    succeeded = False
    try:
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, _run_ffmpeg)
        succeeded = True
        return output_path
    except subprocess.CalledProcessError as e:
        # ffmpeg output is not guaranteed to be valid UTF-8.
        stderr_text = e.stderr.decode(errors="replace") if e.stderr else ""
        print(f"[AUDIO] Ошибка конвертации ffmpeg: {stderr_text}")
        return None
    except OSError as e:
        # Raised e.g. when the ffmpeg executable cannot be started at all.
        print(f"[AUDIO] Ошибка конвертации ffmpeg: {e}")
        return None
    finally:
        # Covers handled errors as well as cancellation or unexpected
        # exceptions: never leave the pre-created output file behind.
        if not succeeded:
            try:
                os.unlink(output_path)
            except OSError:
                pass
|
||||
|
||||
|
||||
async def transcribe_audio(audio_bytes: bytes, mimetype: str) -> Optional[str]:
    """Transcribe raw audio bytes to text with the Whisper model.

    The bytes are written to a temporary file, converted to WAV via
    ``convert_to_wav`` and then passed to the model from ``get_whisper_model``.
    Both temporary files are removed before returning.

    Args:
        audio_bytes: Encoded audio content.
        mimetype: MIME type of the audio, used to pick the temp-file suffix.

    Returns:
        The recognised text with surrounding whitespace stripped, or None if
        ffmpeg is unavailable, conversion fails, or recognition raises.
    """
    if not ffmpeg_available():
        print("[AUDIO] Ошибка: ffmpeg не установлен.")
        return None

    ext = get_file_extension(mimetype)
    loop = asyncio.get_running_loop()

    input_path = None
    wav_path = None
    try:
        # The path is recorded before writing so that a failed write is still
        # cleaned up by the ``finally`` block below.
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            input_path = tmp.name
            tmp.write(audio_bytes)

        wav_path = await convert_to_wav(input_path)
        if not wav_path:
            print("[AUDIO] Конвертация в WAV не удалась.")
            return None

        # NOTE(review): the first call constructs the model synchronously on
        # the event-loop thread; consider preloading it at startup.
        model = get_whisper_model()

        def _run_transcription() -> str:
            # An empty language setting is passed as None rather than "".
            # NOTE(review): presumably None makes faster-whisper detect the
            # language — confirm against its documentation.
            segments, _info = model.transcribe(
                wav_path,
                beam_size=5,
                language=WHISPER_LANGUAGE or None,
            )
            # ``segments`` is a lazy generator: the decoding work happens
            # while iterating, so the iteration must stay in this worker
            # thread. Segment texts typically carry their own leading space,
            # hence the per-segment strip before joining.
            return " ".join(segment.text.strip() for segment in segments)

        text = await loop.run_in_executor(None, _run_transcription)
        return text.strip()
    except Exception as e:
        print(f"[AUDIO] Ошибка при распознавании: {e}")
        return None
    finally:
        for path in (input_path, wav_path):
            if path and os.path.exists(path):
                os.unlink(path)
|
||||
|
||||
|
||||
async def process_audio(audio_data: Dict) -> str:
    """Transcribe one collected audio attachment.

    Args:
        audio_data: Mapping with the audio content under ``"bytes"`` and,
            optionally, its MIME type under ``"mimetype"`` (``"audio/ogg"``
            is used when the key is absent).

    Returns:
        The recognised text, or an empty string when transcription failed.
    """
    payload = audio_data["bytes"]
    content_type = audio_data.get("mimetype", "audio/ogg")
    print(f"[AUDIO] Получено {len(payload)} байт аудио, тип: {content_type}")

    recognized = await transcribe_audio(payload, content_type)
    if recognized is not None:
        return recognized

    print("[AUDIO] Распознавание не удалось.")
    return ""
|
||||
|
||||
|
||||
async def process_image(image_data: Dict) -> str:
    """Produce a description for one collected image attachment.

    Currently a stub: it logs the size of ``image_data["bytes"]`` and returns
    a fixed placeholder string.

    Args:
        image_data: Mapping holding the image content under ``"bytes"``.

    Returns:
        The placeholder description text.
    """
    byte_count = len(image_data['bytes'])
    print(f"[IMAGE] Получено {byte_count} байт изображения")
    return "[Описание изображения будет добавлено позже]"
|
||||
|
||||
|
||||
async def generate_report(text: str, images_data: list, audios_data: list) -> str:
|
||||
audio_texts = []
|
||||
for audio in audios_data:
|
||||
audio_text = await process_audio(audio)
|
||||
if audio_text:
|
||||
audio_texts.append(audio_text)
|
||||
|
||||
image_descriptions = []
|
||||
for img in images_data:
|
||||
desc = await process_image(img)
|
||||
if desc:
|
||||
image_descriptions.append(desc)
|
||||
|
||||
parts = []
|
||||
if text:
|
||||
parts.append(f"**Текст сообщения:**\n{text}")
|
||||
if audio_texts:
|
||||
parts.append("**Распознанный текст из аудио:**\n" + "\n\n".join(audio_texts))
|
||||
if image_descriptions:
|
||||
parts.append("**Описания изображений:**\n" + "\n".join(image_descriptions))
|
||||
|
||||
if not parts:
|
||||
return "Не удалось обработать сообщение (нет текста, не распознано аудио или ошибка)."
|
||||
|
||||
print(f"[REPORT] text: {text}, images: {len(image_descriptions)}, audio: {len(audio_texts)}")
|
||||
#TODO whisper + отчёт
|
||||
return "Placeholder"
|
||||
return "\n\n".join(parts)
|
||||
|
||||
|
||||
async def send_error_message(room_id: str, error_text: str):
|
||||
await client.room_send(
|
||||
room_id,
|
||||
"m.room.message",
|
||||
{"msgtype": "m.text", "body": f"❌ Ошибка: {error_text}"}
|
||||
{"msgtype": "m.text", "body": f"❌ {error_text}"}
|
||||
)
|
||||
|
||||
|
||||
async def process_complete_message(data: Dict):
|
||||
room_id = data["room_id"]
|
||||
image_descriptions = []
|
||||
for img_bytes in data.get("images", []):
|
||||
desc = await process_image(img_bytes)
|
||||
image_descriptions.append(desc)
|
||||
audio_texts = []
|
||||
for aud_bytes in data.get("audio", []):
|
||||
text = await process_audio(aud_bytes)
|
||||
audio_texts.append(text)
|
||||
report = await generate_report(data.get("text", ""), image_descriptions, audio_texts)
|
||||
# Объединяем все текстовые сообщения, которые были в этой группе
|
||||
text_parts = data.get("text", [])
|
||||
text = "\n".join(text_parts) if text_parts else ""
|
||||
images_data = data.get("images", [])
|
||||
audios_data = data.get("audio", [])
|
||||
|
||||
report = await generate_report(text, images_data, audios_data)
|
||||
|
||||
await client.room_send(
|
||||
room_id,
|
||||
"m.room.message",
|
||||
{"msgtype": "m.text", "body": report}
|
||||
)
|
||||
|
||||
if "event_id" in data:
|
||||
pending_by_event_id.pop(data["event_id"], None)
|
||||
pending_by_conversation.pop((room_id, data["sender"]), None)
|
||||
|
|
@ -95,7 +217,7 @@ def get_or_create_pending(room_id: str, sender: str, event_id: Optional[str] = N
|
|||
data = {
|
||||
"room_id": room_id,
|
||||
"sender": sender,
|
||||
"text": None,
|
||||
"text": [], # список строк, а не одна строка
|
||||
"images": [],
|
||||
"audio": [],
|
||||
"timestamp": time.time(),
|
||||
|
|
@ -124,7 +246,8 @@ async def on_text_message(room, event: RoomMessageText):
|
|||
|
||||
event_id = event.event_id
|
||||
data = get_or_create_pending(room.room_id, event.sender, event_id)
|
||||
data["text"] = event.body
|
||||
# Добавляем текст в список, а не заменяем
|
||||
data["text"].append(event.body)
|
||||
reset_timer(data)
|
||||
print(f"[TEXT] Добавлен текст в сообщение от {event.sender}: {event.body}")
|
||||
|
||||
|
|
@ -145,11 +268,20 @@ async def on_image_message(room, event: RoomMessageImage):
|
|||
|
||||
download_result = await client.download(event.url)
|
||||
if isinstance(download_result, ErrorResponse):
|
||||
print(f"Ошибка скачивания изображения: {download_result.status_code}")
|
||||
print(f"[IMAGE] Ошибка скачивания: {download_result.status_code} - {download_result.message}")
|
||||
await send_error_message(room.room_id, "Не удалось загрузить изображение.")
|
||||
return
|
||||
|
||||
data["images"].append(download_result.body)
|
||||
mimetype = getattr(event, "mimetype", None)
|
||||
if not mimetype and hasattr(event, "info") and isinstance(event.info, dict):
|
||||
mimetype = event.info.get("mimetype")
|
||||
if not mimetype:
|
||||
mimetype = "image/jpeg"
|
||||
|
||||
data["images"].append({
|
||||
"bytes": download_result.body,
|
||||
"mimetype": mimetype,
|
||||
})
|
||||
reset_timer(data)
|
||||
print(f"[IMAGE] Добавлено изображение в сообщение от {event.sender}")
|
||||
|
||||
|
|
@ -170,11 +302,20 @@ async def on_audio_message(room, event: RoomMessageAudio):
|
|||
|
||||
download_result = await client.download(event.url)
|
||||
if isinstance(download_result, ErrorResponse):
|
||||
print(f"Ошибка скачивания аудио: {download_result.status_code}")
|
||||
print(f"[AUDIO] Ошибка скачивания: {download_result.status_code} - {download_result.message}")
|
||||
await send_error_message(room.room_id, "Не удалось загрузить аудио.")
|
||||
return
|
||||
|
||||
data["audio"].append(download_result.body)
|
||||
mimetype = None
|
||||
if hasattr(event, "info") and isinstance(event.info, dict):
|
||||
mimetype = event.info.get("mimetype")
|
||||
if not mimetype:
|
||||
mimetype = "audio/ogg"
|
||||
|
||||
data["audio"].append({
|
||||
"bytes": download_result.body,
|
||||
"mimetype": mimetype,
|
||||
})
|
||||
reset_timer(data)
|
||||
print(f"[AUDIO] Добавлено аудио в сообщение от {event.sender}")
|
||||
|
||||
|
|
@ -210,6 +351,15 @@ async def main():
|
|||
print(f"Исключение при авторизации: {e}")
|
||||
return
|
||||
|
||||
if not ffmpeg_available():
|
||||
print("ВНИМАНИЕ: ffmpeg не найден. Бот не сможет распознавать аудио.")
|
||||
print("Установите ffmpeg (https://ffmpeg.org/download.html) и добавьте в PATH.")
|
||||
else:
|
||||
print("ffmpeg найден, аудио будет обрабатываться.")
|
||||
|
||||
if WHISPER_LANGUAGE:
|
||||
print(f"Язык распознавания: {WHISPER_LANGUAGE}")
|
||||
|
||||
client.add_event_callback(on_text_message, RoomMessageText)
|
||||
client.add_event_callback(on_image_message, RoomMessageImage)
|
||||
client.add_event_callback(on_audio_message, RoomMessageAudio)
|
||||
|
|
@ -222,5 +372,6 @@ async def main():
|
|||
finally:
|
||||
await client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
BIN
requirements.txt
BIN
requirements.txt
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue