From 471e9bcc3b92500bd897cb66700c149b913271e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A1=D0=BB=D0=BE=D0=BD=D0=BE=D0=B2=D0=B0=20=D0=90=D0=BD?= =?UTF-8?q?=D0=BD=D0=B0?= Date: Sun, 22 Mar 2026 21:15:39 +0300 Subject: [PATCH] Do audio recognition --- .env.example | 6 +- .gitignore | 8 ++ main.py | 205 ++++++++++++++++++++++++++++++++++++++++------- requirements.txt | Bin 916 -> 972 bytes 4 files changed, 191 insertions(+), 28 deletions(-) diff --git a/.env.example b/.env.example index 9175207..e6010b4 100644 --- a/.env.example +++ b/.env.example @@ -8,4 +8,8 @@ PASSWORD= ACCESS_TOKEN=syt_... # Allowed rooms (comma-separated, no spaces) -ALLOWED_ROOMS=!roomid1:matrix.org,!roomid2:matrix.org \ No newline at end of file +ALLOWED_ROOMS=!roomid1:matrix.org,!roomid2:matrix.org + +# Whisper settings +WHISPER_LANGUAGE=ru +WHISPER_MODEL=small \ No newline at end of file diff --git a/.gitignore b/.gitignore index ab3e8ce..a11843e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,14 @@ __pycache__/ *.py[cod] *$py.class +.idea/* +.idea + +venv1 +venv1/ +venv1/* +venv1/** + # C extensions *.so diff --git a/main.py b/main.py index 005b623..75af493 100644 --- a/main.py +++ b/main.py @@ -3,6 +3,8 @@ import asyncio import os import tempfile import time +import shutil +import subprocess from typing import Dict, Optional, Tuple from dotenv import load_dotenv @@ -16,62 +18,182 @@ from nio import ( ErrorResponse, ) +from faster_whisper import WhisperModel + load_dotenv() HOMESERVER = os.getenv("HOMESERVER", "https://matrix.org") USERNAME = os.getenv("MATRIX_USERNAME") PASSWORD = os.getenv("PASSWORD") ALLOWED_ROOMS = set(room.strip() for room in os.getenv("ALLOWED_ROOMS", "").split(",") if room.strip()) +WHISPER_LANGUAGE = os.getenv("WHISPER_LANGUAGE", "ru") +WHISPER_MODEL = os.getenv("WHISPER_MODEL", "small") TEMP_DIR = tempfile.gettempdir() GROUPING_TIMEOUT = 15.0 client: AsyncClient = None - pending_by_conversation: Dict[Tuple[str, str], Dict] = {} pending_by_event_id: Dict[str, Dict] = {} - -async def process_audio(audio_bytes: bytes) -> str: - print(f"[AUDIO] Получено {len(audio_bytes)} байт аудио") - return "Placeholder" +whisper_model = None -async def process_image(image_bytes: bytes) -> str: - print(f"[IMAGE] Получено {len(image_bytes)} байт изображения") - return "Placeholder" +def get_whisper_model(): + global whisper_model + if whisper_model is None: + whisper_model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8") + print(f"Whisper модель {WHISPER_MODEL} загружена (faster-whisper).") + return whisper_model -async def generate_report(text: str, image_descriptions: list, audio_texts: list) -> str: +def ffmpeg_available() -> bool: + found = shutil.which("ffmpeg") is not None + if not found: + print("[ERROR] ffmpeg не найден в системе. Установите ffmpeg и добавьте в PATH.") + return found + + +def get_file_extension(mimetype: str) -> str: + ext_map = { + "audio/ogg": ".ogg", + "audio/mpeg": ".mp3", + "audio/mp4": ".m4a", + "audio/x-m4a": ".m4a", + "audio/wav": ".wav", + "audio/webm": ".webm", + } + return ext_map.get(mimetype, ".tmp") + + +async def convert_to_wav(input_path: str) -> Optional[str]: + output_fd, output_path = tempfile.mkstemp(suffix=".wav") + os.close(output_fd) + cmd = [ + "ffmpeg", "-i", input_path, + "-map", "0:a:0", + "-map_metadata", "-1", + "-vn", + "-acodec", "pcm_s16le", + "-ar", "16000", + "-ac", "1", + "-y", + output_path + ] + try: + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, lambda: subprocess.run(cmd, capture_output=True, check=True)) + return output_path + except subprocess.CalledProcessError as e: + print(f"[AUDIO] Ошибка конвертации ffmpeg: {e.stderr.decode()}") + if os.path.exists(output_path): + os.unlink(output_path) + return None + + +async def transcribe_audio(audio_bytes: bytes, mimetype: str) -> Optional[str]: + if not ffmpeg_available(): + print("[AUDIO] Ошибка: ffmpeg не установлен.") + return None + + ext = get_file_extension(mimetype) + loop = asyncio.get_running_loop() + with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: + tmp.write(audio_bytes) + input_path = tmp.name + + wav_path = None + try: + wav_path = await convert_to_wav(input_path) + if not wav_path: + print("[AUDIO] Конвертация в WAV не удалась.") + return None + + model = get_whisper_model() + segments, info = await loop.run_in_executor( + None, + lambda: model.transcribe(wav_path, beam_size=5, language=WHISPER_LANGUAGE) + ) + text = " ".join([segment.text for segment in segments]) + return text.strip() + except Exception as e: + print(f"[AUDIO] Ошибка при распознавании: {e}") + return None + finally: + if os.path.exists(input_path): + os.unlink(input_path) + if wav_path and os.path.exists(wav_path): + os.unlink(wav_path) + + +async def process_audio(audio_data: Dict) -> str: + audio_bytes = audio_data["bytes"] + mimetype = audio_data.get("mimetype", "audio/ogg") + print(f"[AUDIO] Получено {len(audio_bytes)} байт аудио, тип: {mimetype}") + text = await transcribe_audio(audio_bytes, mimetype) + if text is None: + print("[AUDIO] Распознавание не удалось.") + return "" + return text + + +async def process_image(image_data: Dict) -> str: + print(f"[IMAGE] Получено {len(image_data['bytes'])} байт изображения") + return "[Описание изображения будет добавлено позже]" + + +async def generate_report(text: str, images_data: list, audios_data: list) -> str: + audio_texts = [] + for audio in audios_data: + audio_text = await process_audio(audio) + if audio_text: + audio_texts.append(audio_text) + + image_descriptions = [] + for img in images_data: + desc = await process_image(img) + if desc: + image_descriptions.append(desc) + + parts = [] + if text: + parts.append(f"**Текст сообщения:**\n{text}") + if audio_texts: + parts.append("**Распознанный текст из аудио:**\n" + "\n\n".join(audio_texts)) + if image_descriptions: + parts.append("**Описания изображений:**\n" + "\n".join(image_descriptions)) + + if not parts: + return "Не удалось обработать сообщение (нет текста, не распознано аудио или ошибка)." + print(f"[REPORT] text: {text}, images: {len(image_descriptions)}, audio: {len(audio_texts)}") - #TODO whisper + отчёт - return "Placeholder" + return "\n\n".join(parts) async def send_error_message(room_id: str, error_text: str): await client.room_send( room_id, "m.room.message", - {"msgtype": "m.text", "body": f"❌ Ошибка: {error_text}"} + {"msgtype": "m.text", "body": f"❌ {error_text}"} ) async def process_complete_message(data: Dict): room_id = data["room_id"] - image_descriptions = [] - for img_bytes in data.get("images", []): - desc = await process_image(img_bytes) - image_descriptions.append(desc) - audio_texts = [] - for aud_bytes in data.get("audio", []): - text = await process_audio(aud_bytes) - audio_texts.append(text) - report = await generate_report(data.get("text", ""), image_descriptions, audio_texts) + # Объединяем все текстовые сообщения, которые были в этой группе + text_parts = data.get("text", []) + text = "\n".join(text_parts) if text_parts else "" + images_data = data.get("images", []) + audios_data = data.get("audio", []) + + report = await generate_report(text, images_data, audios_data) + await client.room_send( room_id, "m.room.message", {"msgtype": "m.text", "body": report} ) + if "event_id" in data: pending_by_event_id.pop(data["event_id"], None) pending_by_conversation.pop((room_id, data["sender"]), None) @@ -95,7 +217,7 @@ def get_or_create_pending(room_id: str, sender: str, event_id: Optional[str] = N data = { "room_id": room_id, "sender": sender, - "text": None, + "text": [], # список строк, а не одна строка "images": [], "audio": [], "timestamp": time.time(), @@ -124,7 +246,8 @@ async def on_text_message(room, event: RoomMessageText): event_id = event.event_id data = get_or_create_pending(room.room_id, event.sender, event_id) - data["text"] = event.body + # Добавляем текст в список, а не заменяем + data["text"].append(event.body) reset_timer(data) print(f"[TEXT] Добавлен текст в сообщение от {event.sender}: {event.body}") @@ -145,11 +268,20 @@ async def on_image_message(room, event: RoomMessageImage): download_result = await client.download(event.url) if isinstance(download_result, ErrorResponse): - print(f"Ошибка скачивания изображения: {download_result.status_code}") + print(f"[IMAGE] Ошибка скачивания: {download_result.status_code} - {download_result.message}") await send_error_message(room.room_id, "Не удалось загрузить изображение.") return - data["images"].append(download_result.body) + mimetype = getattr(event, "mimetype", None) + if not mimetype and hasattr(event, "info") and isinstance(event.info, dict): + mimetype = event.info.get("mimetype") + if not mimetype: + mimetype = "image/jpeg" + + data["images"].append({ + "bytes": download_result.body, + "mimetype": mimetype, + }) reset_timer(data) print(f"[IMAGE] Добавлено изображение в сообщение от {event.sender}") @@ -170,11 +302,20 @@ async def on_audio_message(room, event: RoomMessageAudio): download_result = await client.download(event.url) if isinstance(download_result, ErrorResponse): - print(f"Ошибка скачивания аудио: {download_result.status_code}") + print(f"[AUDIO] Ошибка скачивания: {download_result.status_code} - {download_result.message}") await send_error_message(room.room_id, "Не удалось загрузить аудио.") return - data["audio"].append(download_result.body) + mimetype = None + if hasattr(event, "info") and isinstance(event.info, dict): + mimetype = event.info.get("mimetype") + if not mimetype: + mimetype = "audio/ogg" + + data["audio"].append({ + "bytes": download_result.body, + "mimetype": mimetype, + }) reset_timer(data) print(f"[AUDIO] Добавлено аудио в сообщение от {event.sender}") @@ -210,6 +351,15 @@ async def main(): print(f"Исключение при авторизации: {e}") return + if not ffmpeg_available(): + print("ВНИМАНИЕ: ffmpeg не найден. Бот не сможет распознавать аудио.") + print("Установите ffmpeg (https://ffmpeg.org/download.html) и добавьте в PATH.") + else: + print("ffmpeg найден, аудио будет обрабатываться.") + + if WHISPER_LANGUAGE: + print(f"Язык распознавания: {WHISPER_LANGUAGE}") + client.add_event_callback(on_text_message, RoomMessageText) client.add_event_callback(on_image_message, RoomMessageImage) client.add_event_callback(on_audio_message, RoomMessageAudio) @@ -222,5 +372,6 @@ async def main(): finally: await client.close() + if __name__ == "__main__": asyncio.run(main()) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d8456314e1bdb9396d4af963de03c3bf1c46f49c..3fc940d775a2ab714ed750f3b23cc291bd8a4c2f 100644 GIT binary patch literal 972 zcmYLI%aYq55WMp*qJ%6z_Mk)VNmZ^nxiCV8SONu5*4nR6H{K*SG0gPz18E;8KifPE zT`!evh4#_ub=~c5Z+f12Ax~1(dMWEL9HctP!{75YNAo~aE7j+8_V=LYVNkLsTnMq~ zV#j8ThHb6vQQUnT1}yA|TuR~^Y~jFfjGocf3S(38lgCwy%T9JjVPAm$?oOnwTsRK~eDhBt&-s zF}L0CKHMj5?A2{IHl2yEgZz~Ag0YuJehj(>p~TM|LpIJXC}16#{4X`+3kQuZ9&MHR z5zZ*F==2!lduV)V)}5m?e99>IMJJB+!De>ygaPvaO6du%QhE3dIPDaYJS)j zVs!}qAYus5w1whH8^b+vrxltK(L|gh87mR&O)2&ON80D*xb=>Px# literal 916 zcmZvb(N2S44264b;-j!^pi?h=7Nf8Y)B$14I3Hg9PP>0>#*qFndU|?#=zhMRt+CG5 zobR@>ojr2pc4ISJ5Oce>D+^4`z2`cBQ@rr|oh#MMZ0S|!6{C$*h_mpV&f z!CvEvda9qMNN;=8NO#IjnrmF@={Ml+0BXy#P?x$It9&A5!lQ-z^XiVYoTb%NZed70 za_0yI|KT#K)VX9%ec6)Qk$Xp{^tI;9bITkLM08XK`PbN|WgvxDM>!GL+?8!@kAvc? zi1O&8rn)okM?Oip|oFXp>5Ca%rSjMI?{pM1JP8up>Tbg3OU4^R@Qj3gvn3 zk$5NU<%WCvVSCe%+m^$X8ahI#-1