From 992a748c51877d188240f832496fd119b4b2458c Mon Sep 17 00:00:00 2001
From: slonovaad <mai.slonovaad@gmail.com>
Date: Tue, 21 Apr 2026 18:09:10 +0000
Subject: [PATCH] Add scripts

---
 scripts/concat_wav.sh      | 30 ++++++++++++++++
 scripts/generate_pdf.py    | 45 +++++++++++++++++++++++
 scripts/generate_report.sh | 73 ++++++++++++++++++++++++++++++++++++++
 scripts/hotwords.txt       | 38 ++++++++++++++++++++
 scripts/local_whisper.py   | 44 +++++++++++++++++++++++
 5 files changed, 230 insertions(+)
 create mode 100644 scripts/concat_wav.sh
 create mode 100644 scripts/generate_pdf.py
 create mode 100644 scripts/generate_report.sh
 create mode 100644 scripts/hotwords.txt
 create mode 100644 scripts/local_whisper.py
diff --git a/scripts/concat_wav.sh b/scripts/concat_wav.sh
new file mode 100644
index 0000000..8ab8df6
--- /dev/null
+++ b/scripts/concat_wav.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Concatenate multiple WAV/audio files into a single mp3 using ffmpeg concat demuxer
+#
+# Usage: ./concat_wav.sh <output.mp3> <input1.WAV> <input2.WAV> ...
+# Example: ./concat_wav.sh transcription/saramonic.mp3 20260325-091912.WAV 20260325-095007.WAV
+#
+# The output is re-encoded as mono, 16 kHz, 64 kbit/s (see the ffmpeg call below).
+
+set -euo pipefail
+
+# One output path plus at least two inputs are required.
+if [ $# -lt 3 ]; then
+    echo "Usage: $0 <output.mp3> <input1> <input2> [input3 ...]"
+    exit 1
+fi
+
+OUTPUT="$1"
+shift
+
+# Fail early with a readable message rather than an ffmpeg error later on.
+for f in "$@"; do
+    if [ ! -f "$f" ]; then
+        echo "Error: input file not found: $f" >&2
+        exit 1
+    fi
+done
+
+LISTFILE=$(mktemp /tmp/ffmpeg_concat_XXXXXX.txt)
+# Single-quoted so the path is expanded when the trap fires, not spliced into the command now.
+trap 'rm -f "$LISTFILE"' EXIT
+
+for f in "$@"; do
+    ABSPATH="$(cd "$(dirname "$f")" && pwd)/$(basename "$f")"
+    # Each list entry is wrapped in single quotes, so a quote inside the path is written as '\''.
+    ESCAPED="$(printf '%s' "$ABSPATH" | sed "s/'/'\\\\''/g")"
+    printf "file '%s'\n" "$ESCAPED" >> "$LISTFILE"
+done
+
+echo "Concatenating $# files -> $OUTPUT"
+# Only errors are logged, so a failed run explains itself instead of exiting silently under set -e.
+ffmpeg -hide_banner -loglevel error -y -f concat -safe 0 -i "$LISTFILE" -ac 1 -ar 16000 -b:a 64k "$OUTPUT"
+
+DUR=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$OUTPUT" | cut -d. -f1)
+SIZE=$(du -h "$OUTPUT" | cut -f1)
+# Guard the arithmetic: it would abort the script if DUR is empty or not a whole number.
+if [[ "$DUR" =~ ^[0-9]+$ ]]; then
+    echo "Done: ${DUR}s ($(( DUR / 60 ))m$(( DUR % 60 ))s), $SIZE"
+else
+    echo "Done: duration unknown, $SIZE"
+fi
\ No newline at end of file
diff --git a/scripts/generate_pdf.py b/scripts/generate_pdf.py
new file mode 100644
index 0000000..962de0b
--- /dev/null
+++ b/scripts/generate_pdf.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""
+Generate a PDF from a Markdown file via weasyprint.
+
+INSTALL: pip install weasyprint
+USAGE: python3 generate_pdf.py report.md report.pdf
+
+Only a small Markdown subset is converted: ATX headings ("# " .. "###### ")
+and blank-line-separated paragraphs. Everything else, including raw HTML,
+is passed through to the page body unchanged.
+"""
+
+import re
+import sys
+from weasyprint import HTML
+
+# One to six '#' at the start of a line, whitespace, then the heading text.
+_HEADING_RE = re.compile(r"^(#{1,6})[ \t]+(.*?)[ \t]*$")
+
+
+def _markdown_to_html(md_content):
+    """Convert the headings and paragraphs of *md_content* to HTML.
+
+    Returns the HTML fragment as a string. A block of lines that starts
+    with "<" is treated as raw HTML and emitted as-is rather than wrapped
+    in <p>.
+    """
+    html_parts = []
+    pending = []  # lines of the paragraph currently being collected
+
+    def flush():
+        if not pending:
+            return
+        text = "\n".join(pending)
+        pending.clear()
+        if text.lstrip().startswith("<"):
+            html_parts.append(text)
+        else:
+            html_parts.append(f"<p>{text}</p>")
+
+    for line in md_content.splitlines():
+        heading = _HEADING_RE.match(line)
+        if heading:
+            flush()
+            level = len(heading.group(1))
+            html_parts.append(f"<h{level}>{heading.group(2)}</h{level}>")
+        elif line.strip():
+            pending.append(line)
+        else:
+            flush()
+    flush()
+    return "\n".join(html_parts)
+
+
+def markdown_to_pdf(md_path, pdf_path):
+    """Render the Markdown file at *md_path* to a PDF at *pdf_path*.
+
+    The file is read as UTF-8, converted to a styled HTML page and handed
+    to weasyprint, which writes the PDF.
+    """
+    with open(md_path, 'r', encoding='utf-8') as f:
+        md_content = f.read()
+
+    # Headings are matched per line and emitted with closing tags; a plain
+    # str.replace() on "# " would also hit '#' inside text and leave tags open.
+    html_content = f"""
+    <html lang="ru">
+    <head>
+        <meta charset="UTF-8">
+        <style>
+            @page {{ margin: 2cm; }}
+            body {{ font-family: sans-serif; line-height: 1.6; }}
+            h1 {{ color: #2c3e50; border-bottom: 2px solid #3498db; }}
+            h2 {{ color: #34495e; border-bottom: 1px solid #bdc3c7; }}
+            table {{ border-collapse: collapse; width: 100%; }}
+            th, td {{ border: 1px solid #ddd; padding: 8px; }}
+        </style>
+    </head>
+    <body>
+    {_markdown_to_html(md_content)}
+    </body>
+    </html>
+    """
+
+    HTML(string=html_content).write_pdf(pdf_path)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print("Usage: python3 generate_pdf.py <report.md> <output.pdf>")
+        sys.exit(1)
+
+    markdown_to_pdf(sys.argv[1], sys.argv[2])
+    print(f"PDF created: {sys.argv[2]}")
diff --git a/scripts/generate_report.sh b/scripts/generate_report.sh
new file mode 100644
index 0000000..ee42d3b
--- /dev/null
+++ b/scripts/generate_report.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# generate_report.sh — meeting report pipeline (without diagrams):
+# run the transcription step unless its output exists, then render report.md to PDF.
+# Usage: ./generate_report.sh /absolute/path/to/meeting_folder
+# Example: ./generate_report.sh /app/hermes_data/meetings/2026-04-15
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+
+# An optional .env beside this script is sourced with auto-export switched on.
+if [[ -f "$SCRIPT_DIR/.env" ]]; then
+    set -a
+    . "$SCRIPT_DIR/.env"
+    set +a
+fi
+
+if (( $# < 1 )); then
+    echo "Usage: $0 <absolute_path_to_meeting_folder>"
+    echo "Example: $0 /app/hermes_data/meetings/2026-04-15"
+    exit 1
+fi
+
+MEETING_DIR="$1"
+
+# A relative argument is anchored at the script's directory, then canonicalized.
+case "$MEETING_DIR" in
+    /*) ;;
+    *) MEETING_DIR="$SCRIPT_DIR/$MEETING_DIR" ;;
+esac
+MEETING_DIR="$(realpath "$MEETING_DIR")"
+
+if [[ ! -d "$MEETING_DIR" ]]; then
+    echo "Error: Meeting directory not found: $MEETING_DIR"
+    exit 1
+fi
+
+# ------------------------------------------------------------
+# Step 1: transcription — skipped when transcription/plain_text.txt is present
+# ------------------------------------------------------------
+if [[ ! -d "$MEETING_DIR/transcription" || ! -f "$MEETING_DIR/transcription/plain_text.txt" ]]; then
+    echo "[1/2] Running transcription..."
+    bash "$SCRIPT_DIR/transcribe.sh" "$MEETING_DIR"
+else
+    echo "[1/2] Transcription already exists, skipping."
+fi
+
+# ------------------------------------------------------------
+# Step 2: render report.md to report.pdf with pandoc
+# ------------------------------------------------------------
+echo "[2/2] Generating PDF..."
+REPORT_MD="$MEETING_DIR/report.md"
+REPORT_PDF="$MEETING_DIR/report.pdf"
+
+if [[ ! -f "$REPORT_MD" ]]; then
+    echo "  Error: report.md not found at $REPORT_MD"
+    exit 1
+fi
+
+# pandoc runs inside the meeting folder, so the file names below are relative to it.
+cd "$MEETING_DIR"
+pandoc report.md -o report.pdf --pdf-engine=xelatex \
+    -V mainfont="DejaVu Serif" -V sansfont="DejaVu Sans" -V monofont="DejaVu Sans Mono" \
+    -V geometry:margin=2cm -V fontsize=11pt -V lang=ru \
+    --highlight-style=tango
+
+PDF_SIZE=$(du -h "$REPORT_PDF" | cut -f1)
+printf '\nDone! Report: %s (%s)\n' "$REPORT_PDF" "$PDF_SIZE"
\ No newline at end of file
diff --git a/scripts/hotwords.txt b/scripts/hotwords.txt
new file mode 100644
index 0000000..1f96698
--- /dev/null
+++ b/scripts/hotwords.txt
@@ -0,0 +1,38 @@
+# Hotwords for Whisper transcription of Lambda Lab meetings
+# One term per line, or comma-separated on one line
+# Used with faster-whisper "hotwords" parameter to improve recognition
+# of domain-specific terms, names, and abbreviations
+#
+# Project & product names
+OpenClaw, NanoClaw, IronClaw, ZeroClaw, PicoClaw, BlueClaw, MicroClaw
+ClawHub, Manus
+SOUL.md, MEMORY.md, HEARTBEAT.md
+#
+# Infrastructure & tools
+LiteLLM, LangFuse, Forgejo, Gitea
+Docker, LXC, MCP, API, RAG
+Telegram, Matrix, Element
+Playwright, Selenium
+#
+# AI/ML terms
+Claude, Anthropic, GPT, Qwen, DeepSeek
+промпт, агент, токен, эмбеддинг
+#
+# University & lab
+МАИ, лаборатория Лямбда, практика, бригада, кафедра
+#
+# People (lab members)
+Туревич, Нураев, Батурин, Бахтиозин, Слонова
+Кондрушин, Курноскин, Смирнов, Путиловский
+Эль-Тахир, Кобылкевич, Малинин, Шварц
+Тимошенко, Чубинец, Яшков, Соболев
+#
+# Companies & orgs
+ВТБ, T1, Сбербанк, МТС Диджитал
+Wildberries, Хабр
+#
+# General terms often misrecognized
+тимлид, календарный план, конгломерат, квиз
+стартап, деплой, инфраструктура, безопасность
+IT-льготы, GitHub, open source, VPS
+петличка, Saramonic, диктофон, скрипт
diff --git a/scripts/local_whisper.py b/scripts/local_whisper.py
new file mode 100644
index 0000000..d5c7d53
--- /dev/null
+++ b/scripts/local_whisper.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+"""
+Audio transcription via faster-whisper, with a workaround for the MKL error.
+
+USAGE (set the environment variables before the import):
+    import os
+    os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
+    os.environ["OMP_NUM_THREADS"] = "2"
+
+    from faster_whisper import WhisperModel
+    model = WhisperModel("small")
+"""
+
+import os
+import sys
+
+# CRITICAL: both variables have to be in place BEFORE faster_whisper is imported
+os.environ.update(MKL_SERVICE_FORCE_INTEL="1", OMP_NUM_THREADS="2")
+
+from faster_whisper import WhisperModel
+
+
+def transcribe_audio(audio_path, model_size="small", language="ru"):
+    """Transcribe *audio_path* and return the resulting segments as a list.
+
+    A WhisperModel of size *model_size* is loaded on every call; *language*
+    is forwarded to its transcribe() method. Progress is printed to stdout.
+    """
+    print(f"Loading model {model_size}...")
+    whisper = WhisperModel(model_size)
+
+    print(f"Transcribing {audio_path}...")
+    pieces, _info = whisper.transcribe(audio_path, language=language)
+
+    # Only the first element of the returned pair is used; it is turned into a list.
+    return list(pieces)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python3 local_whisper.py <audio.wav>")
+        sys.exit(1)
+
+    for piece in transcribe_audio(sys.argv[1]):
+        print(piece.text)