#!/bin/bash
# concat_wav.sh — concatenate several WAV/audio files into one mono 16 kHz
# MP3 (64 kbit/s) using the ffmpeg concat demuxer.
#
# Usage:   ./concat_wav.sh <output.mp3> <input1> <input2> [input3 ...]
# Example: ./concat_wav.sh transcription/saramonic.mp3 20260325-091912.WAV 20260325-095007.WAV
#
# Requires ffmpeg and ffprobe on PATH.

set -euo pipefail

if [ $# -lt 3 ]; then
    echo "Usage: $0 <output.mp3> <input1> <input2> [input3 ...]" >&2
    exit 1
fi

OUTPUT="$1"
shift

# Fail early with a readable message: a missing input would otherwise only
# surface as an ffmpeg error about the temporary list file.
for f in "$@"; do
    if [ ! -f "$f" ]; then
        echo "Error: input file not found: $f" >&2
        exit 1
    fi
done

# Keep the X's at the very end of the template; not every mktemp
# implementation substitutes them when a suffix follows.
LISTFILE=$(mktemp "${TMPDIR:-/tmp}/ffmpeg_concat_XXXXXX")
# Single-quoted so the variable is expanded when the trap fires, which keeps
# paths containing quotes or spaces intact.
trap 'rm -f "$LISTFILE"' EXIT

# The concat list wraps each path in single quotes, so a literal single quote
# inside a path has to be written as '\'' (close, escaped quote, reopen).
SQ="'"
ESCAPED_SQ="'\\''"

for f in "$@"; do
    # The list file lives in the temp directory, so every entry must be an
    # absolute path.
    ABSPATH="$(cd -- "$(dirname -- "$f")" && pwd)/$(basename -- "$f")"
    printf "file '%s'\n" "${ABSPATH//$SQ/$ESCAPED_SQ}" >> "$LISTFILE"
done

echo "Concatenating $# files -> $OUTPUT"
# -nostdin keeps ffmpeg from consuming the caller's stdin; -loglevel error
# silences the banner and progress output but still shows real failures.
ffmpeg -nostdin -hide_banner -loglevel error -y \
    -f concat -safe 0 -i "$LISTFILE" \
    -ac 1 -ar 16000 -b:a 64k "$OUTPUT"

DUR=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$OUTPUT" | cut -d. -f1)
SIZE=$(du -h "$OUTPUT" | cut -f1)

# ffprobe can print nothing or "N/A"; only do arithmetic on a plain integer.
if [[ "$DUR" =~ ^[0-9]+$ ]]; then
    echo "Done: ${DUR}s ($(( DUR / 60 ))m$(( DUR % 60 ))s), $SIZE"
else
    echo "Done: duration unknown, $SIZE"
fi
#!/usr/bin/env python3
"""Generate a PDF from a Markdown file via WeasyPrint.

INSTALL: pip install weasyprint
USAGE:   python3 generate_pdf.py report.md report.pdf

Only a small Markdown subset is understood: ATX headings (``#`` to
``######``) and paragraphs separated by blank lines. All other text is
emitted as HTML-escaped paragraph content.
"""

import html
import re
import sys

# One to six leading '#', mandatory whitespace, then the heading text.
_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")

# Basic page setup for the rendered document.
_PAGE_CSS = (
    "@page { margin: 2cm; } "
    'body { font-family: "DejaVu Serif", serif; font-size: 11pt; }'
)


def markdown_to_html(md_content):
    """Convert a small Markdown subset to an HTML fragment.

    A line is a heading only when it *starts* with one to six ``#``
    followed by whitespace, so a ``# `` in the middle of a sentence is left
    alone. Consecutive non-blank lines are joined into one paragraph. All
    text is HTML-escaped.

    Args:
        md_content: Markdown source text.

    Returns:
        The HTML fragment, one element per line; an empty string for empty
        input.
    """
    parts = []
    paragraph = []

    def flush_paragraph():
        # Emit the pending paragraph lines (if any) as a single <p>.
        if paragraph:
            parts.append("<p>" + " ".join(paragraph) + "</p>")
            paragraph.clear()

    for raw_line in md_content.splitlines():
        line = raw_line.strip()
        heading = _HEADING_RE.match(line)
        if heading:
            flush_paragraph()
            level = len(heading.group(1))
            text = html.escape(heading.group(2), quote=False)
            parts.append(f"<h{level}>{text}</h{level}>")
        elif not line:
            flush_paragraph()
        else:
            paragraph.append(html.escape(line, quote=False))

    flush_paragraph()
    return "\n".join(parts)


def markdown_to_pdf(md_path, pdf_path):
    """Convert the Markdown file at *md_path* into a PDF at *pdf_path*.

    Args:
        md_path: Path of the UTF-8 Markdown file to read.
        pdf_path: Path of the PDF file to write.

    Raises:
        OSError: If *md_path* cannot be read.
        ImportError: If WeasyPrint is not installed.
    """
    # Imported lazily so the module, its usage message and
    # markdown_to_html() stay usable without WeasyPrint installed.
    from weasyprint import HTML

    with open(md_path, "r", encoding="utf-8") as f:
        md_content = f.read()

    html_content = (
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<head>\n"
        '<meta charset="utf-8">\n'
        f"<style>{_PAGE_CSS}</style>\n"
        "</head>\n"
        "<body>\n"
        f"{markdown_to_html(md_content)}\n"
        "</body>\n"
        "</html>\n"
    )

    HTML(string=html_content).write_pdf(pdf_path)


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print(
            "Usage: python3 generate_pdf.py <input.md> <output.pdf>",
            file=sys.stderr,
        )
        sys.exit(1)

    markdown_to_pdf(sys.argv[1], sys.argv[2])
    print(f"PDF created: {sys.argv[2]}")
#!/bin/bash
# generate_report.sh — Full pipeline for generating a meeting report (without diagrams)
#
# Steps:
#   1. Transcribe the meeting via transcribe.sh (skipped when
#      <meeting_dir>/transcription/plain_text.txt already exists).
#   2. Render <meeting_dir>/report.md to <meeting_dir>/report.pdf with
#      pandoc + xelatex.
#
# Usage:   ./generate_report.sh /absolute/path/to/meeting_folder
# Example: ./generate_report.sh /app/hermes_data/meetings/2026-04-15
#
# A relative <meeting_dir> is resolved against the directory containing this
# script, not against the caller's working directory.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Load .env if it exists (for hotwords etc.); set -a exports every variable
# the file defines so child processes see them.
if [ -f "$SCRIPT_DIR/.env" ]; then
    set -a
    # shellcheck disable=SC1091
    source "$SCRIPT_DIR/.env"
    set +a
fi

if [ $# -lt 1 ]; then
    echo "Usage: $0 <meeting_dir>" >&2
    echo "Example: $0 /app/hermes_data/meetings/2026-04-15" >&2
    exit 1
fi

MEETING_DIR="$1"

# If a relative path was provided, anchor it at the script directory.
if [[ "$MEETING_DIR" != /* ]]; then
    MEETING_DIR="$SCRIPT_DIR/$MEETING_DIR"
fi

# Check existence BEFORE canonicalizing: realpath can fail on a path whose
# parent directories are missing, and under `set -e` that would abort the
# script without this message.
if [ ! -d "$MEETING_DIR" ]; then
    echo "Error: Meeting directory not found: $MEETING_DIR" >&2
    exit 1
fi

MEETING_DIR="$(realpath "$MEETING_DIR")"

# ============================================================
# Step 1: Transcription (skip if already done)
# ============================================================
if [ -f "$MEETING_DIR/transcription/plain_text.txt" ]; then
    echo "[1/2] Transcription already exists, skipping."
else
    TRANSCRIBE="$SCRIPT_DIR/transcribe.sh"
    if [ ! -f "$TRANSCRIBE" ]; then
        echo "Error: transcription script not found: $TRANSCRIBE" >&2
        exit 1
    fi
    echo "[1/2] Running transcription..."
    bash "$TRANSCRIBE" "$MEETING_DIR"
fi

# ============================================================
# Step 2: Generate PDF from markdown
# ============================================================
echo "[2/2] Generating PDF..."

REPORT_MD="$MEETING_DIR/report.md"
REPORT_PDF="$MEETING_DIR/report.pdf"

if [ ! -f "$REPORT_MD" ]; then
    echo "  Error: report.md not found at $REPORT_MD" >&2
    exit 1
fi

if ! command -v pandoc >/dev/null 2>&1; then
    echo "  Error: pandoc not found on PATH" >&2
    exit 1
fi

# Run pandoc from inside the meeting folder and address the files by bare
# name, as the original invocation did.
# NOTE(review): presumably so relative resources referenced by report.md
# resolve against the meeting folder — confirm.
cd "$MEETING_DIR"
pandoc report.md -o report.pdf \
    --pdf-engine=xelatex \
    -V mainfont="DejaVu Serif" \
    -V sansfont="DejaVu Sans" \
    -V monofont="DejaVu Sans Mono" \
    -V geometry:margin=2cm \
    -V fontsize=11pt \
    -V lang=ru \
    --highlight-style=tango

PDF_SIZE=$(du -h "$REPORT_PDF" | cut -f1)
echo ""
echo "Done! Report: $REPORT_PDF ($PDF_SIZE)"
#!/usr/bin/env python3
"""Transcribe audio with faster-whisper, including the MKL error workaround.

USAGE (as a script):
    python3 local_whisper.py <audio_path> [hotwords_file]

USAGE (the workaround in your own code):
    import os
    os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
    os.environ["OMP_NUM_THREADS"] = "2"

    from faster_whisper import WhisperModel
    model = WhisperModel("small")
"""

import os
import sys

# CRITICAL: must be set BEFORE faster_whisper is imported.
os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
# setdefault keeps a thread count the caller already exported instead of
# silently overwriting it.
os.environ.setdefault("OMP_NUM_THREADS", "2")


def parse_hotwords(text):
    """Turn the contents of a hotwords file into one comma-separated string.

    Blank lines and lines starting with ``#`` are skipped. Every other line
    may hold one term or several comma-separated terms.

    Args:
        text: Raw contents of the hotwords file.

    Returns:
        All terms joined with ``", "``; an empty string if there are none.
    """
    terms = []
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        terms.extend(term.strip() for term in line.split(",") if term.strip())
    return ", ".join(terms)


def transcribe_audio(audio_path, model_size="small", language="ru", hotwords=None):
    """Transcribe an audio file and return its segments.

    Progress messages are written to stderr so that stdout carries only
    whatever the caller prints (the transcript, when run as a script).

    Args:
        audio_path: Path of the audio file to transcribe.
        model_size: Model name passed to ``WhisperModel``.
        language: Language code passed to ``model.transcribe``.
        hotwords: Optional string of hint terms. It is forwarded as the
            ``hotwords`` argument only when non-empty, so installations
            without that parameter keep working with the default.

    Returns:
        A list of the segment objects produced by ``model.transcribe``;
        each exposes the recognized text as ``.text``.
    """
    # Imported lazily, after the environment variables above are in place,
    # so helpers and the usage message work without faster-whisper installed.
    from faster_whisper import WhisperModel

    print(f"Loading model {model_size}...", file=sys.stderr)
    model = WhisperModel(model_size)

    options = {"language": language}
    if hotwords:
        options["hotwords"] = hotwords

    print(f"Transcribing {audio_path}...", file=sys.stderr)
    segments, _info = model.transcribe(audio_path, **options)

    # Materialize the result so callers get a plain list they can iterate
    # more than once.
    return list(segments)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print(
            "Usage: python3 local_whisper.py <audio_path> [hotwords_file]",
            file=sys.stderr,
        )
        sys.exit(1)

    hotwords = None
    if len(sys.argv) > 2:
        with open(sys.argv[2], encoding="utf-8") as f:
            hotwords = parse_hotwords(f.read())

    for segment in transcribe_audio(sys.argv[1], hotwords=hotwords):
        print(segment.text)