Add scripts

This commit is contained in:
Слонова Анна 2026-04-21 18:09:10 +00:00
parent c196f14ae9
commit 992a748c51
5 changed files with 230 additions and 0 deletions

30
scripts/concat_wav.sh Normal file
View file

@@ -0,0 +1,30 @@
#!/bin/bash
# Concatenate multiple WAV/audio files into a single mp3 using ffmpeg concat demuxer
#
# Usage: ./concat_wav.sh <output.mp3> <input1.WAV> <input2.WAV> ...
# Example: ./concat_wav.sh transcription/saramonic.mp3 20260325-091912.WAV 20260325-095007.WAV
set -euo pipefail

if [ $# -lt 3 ]; then
    echo "Usage: $0 <output.mp3> <input1> <input2> [input3 ...]"
    exit 1
fi

OUTPUT="$1"
shift

# Build the concat-demuxer list file; absolute paths make the list valid
# regardless of the directory ffmpeg is invoked from.
LISTFILE=$(mktemp /tmp/ffmpeg_concat_XXXXXX.txt)
# Single-quote the trap so $LISTFILE is expanded when the trap fires,
# and double-quote inside so the path survives word splitting.
trap 'rm -f "$LISTFILE"' EXIT
for f in "$@"; do
    # Fail fast with a clear message instead of letting ffmpeg error out mid-run.
    if [ ! -f "$f" ]; then
        echo "Error: input file not found: $f" >&2
        exit 1
    fi
    ABSPATH="$(cd "$(dirname "$f")" && pwd)/$(basename "$f")"
    echo "file '$ABSPATH'" >> "$LISTFILE"
done

echo "Concatenating $# files -> $OUTPUT"
# Was `2>/dev/null`: that discarded ALL ffmpeg output, so under `set -e` a
# failed encode aborted the script with no diagnostic. -loglevel error keeps
# the console quiet on success but still shows real errors.
ffmpeg -y -loglevel error -f concat -safe 0 -i "$LISTFILE" -ac 1 -ar 16000 -b:a 64k "$OUTPUT"

# Report duration (whole seconds) and size of the result.
DUR=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$OUTPUT" | cut -d. -f1)
SIZE=$(du -h "$OUTPUT" | cut -f1)
echo "Done: ${DUR}s ($(( DUR / 60 ))m$(( DUR % 60 ))s), $SIZE"

45
scripts/generate_pdf.py Normal file
View file

@@ -0,0 +1,45 @@
#!/usr/bin/env python3
"""
Generate a PDF from a Markdown file via weasyprint.

INSTALL: pip install weasyprint
USAGE: python3 generate_pdf.py report.md report.pdf
"""
import sys
from weasyprint import HTML
def markdown_to_pdf(md_path, pdf_path):
"""Конвертирует Markdown в PDF через HTML."""
with open(md_path, 'r', encoding='utf-8') as f:
md_content = f.read()
# Simple MD to HTML conversion
html_content = f"""
<html lang="ru">
<head>
<meta charset="UTF-8">
<style>
@page {{ margin: 2cm; }}
body {{ font-family: sans-serif; line-height: 1.6; }}
h1 {{ color: #2c3e50; border-bottom: 2px solid #3498db; }}
h2 {{ color: #34495e; border-bottom: 1px solid #bdc3c7; }}
table {{ border-collapse: collapse; width: 100%; }}
th, td {{ border: 1px solid #ddd; padding: 8px; }}
</style>
</head>
<body>
{md_content.replace('## ', '<h2>').replace('# ', '<h1>')}
</body>
</html>
"""
HTML(string=html_content).write_pdf(pdf_path)
if __name__ == "__main__":
    # CLI entry point: expects a source Markdown path and a destination PDF path.
    args = sys.argv[1:]
    if len(args) < 2:
        print("Usage: python3 generate_pdf.py <report.md> <output.pdf>")
        sys.exit(1)
    src, dst = args[0], args[1]
    markdown_to_pdf(src, dst)
    print(f"PDF created: {dst}")

View file

@@ -0,0 +1,73 @@
#!/bin/bash
# generate_report.sh — Full pipeline for generating meeting report (without diagrams)
# Usage: ./generate_report.sh /absolute/path/to/meeting_folder
# Example: ./generate_report.sh /app/hermes_data/meetings/2026-04-15
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Pick up optional overrides (hotwords etc.) from a sibling .env; set -a
# exports everything the file defines.
if [[ -f "$SCRIPT_DIR/.env" ]]; then
    set -a
    # shellcheck disable=SC1091
    source "$SCRIPT_DIR/.env"
    set +a
fi

if [[ $# -lt 1 ]]; then
    echo "Usage: $0 <absolute_path_to_meeting_folder>"
    echo "Example: $0 /app/hermes_data/meetings/2026-04-15"
    exit 1
fi

MEETING_DIR="$1"
# Accept relative paths: anchor them at the script's own directory,
# then canonicalize.
[[ "$MEETING_DIR" == /* ]] || MEETING_DIR="$SCRIPT_DIR/$MEETING_DIR"
MEETING_DIR="$(realpath "$MEETING_DIR")"

if [[ ! -d "$MEETING_DIR" ]]; then
    echo "Error: Meeting directory not found: $MEETING_DIR"
    exit 1
fi

# ============================================================
# Step 1: Transcription (no-op when a previous run left results behind)
# ============================================================
if [[ -d "$MEETING_DIR/transcription" && -f "$MEETING_DIR/transcription/plain_text.txt" ]]; then
    echo "[1/2] Transcription already exists, skipping."
else
    echo "[1/2] Running transcription..."
    bash "$SCRIPT_DIR/transcribe.sh" "$MEETING_DIR"
fi

# ============================================================
# Step 2: Render report.md to report.pdf with pandoc + xelatex
# ============================================================
echo "[2/2] Generating PDF..."
REPORT_MD="$MEETING_DIR/report.md"
REPORT_PDF="$MEETING_DIR/report.pdf"

if [[ ! -f "$REPORT_MD" ]]; then
    echo " Error: report.md not found at $REPORT_MD"
    exit 1
fi

cd "$MEETING_DIR"
# DejaVu fonts cover Cyrillic, which the default LaTeX fonts do not.
pandoc report.md -o report.pdf \
    --pdf-engine=xelatex \
    -V mainfont="DejaVu Serif" \
    -V sansfont="DejaVu Sans" \
    -V monofont="DejaVu Sans Mono" \
    -V geometry:margin=2cm \
    -V fontsize=11pt \
    -V lang=ru \
    --highlight-style=tango

PDF_SIZE=$(du -h "$REPORT_PDF" | cut -f1)
echo ""
echo "Done! Report: $REPORT_PDF ($PDF_SIZE)"

38
scripts/hotwords.txt Normal file
View file

@@ -0,0 +1,38 @@
# Hotwords for Whisper transcription of Lambda Lab meetings
# One term per line, or comma-separated on one line
# Used with faster-whisper "hotwords" parameter to improve recognition
# of domain-specific terms, names, and abbreviations
#
# Project & product names
OpenClaw, NanoClaw, IronClaw, ZeroClaw, PicoClaw, BlueClaw, MicroClaw
ClawHub, Manus
SOUL.md, MEMORY.md, HEARTBEAT.md
#
# Infrastructure & tools
LiteLLM, LangFuse, Forgejo, Gitea
Docker, LXC, MCP, API, RAG
Telegram, Matrix, Element
Playwright, Selenium
#
# AI/ML terms
Claude, Anthropic, GPT, Qwen, DeepSeek
промпт, агент, токен, эмбеддинг
#
# University & lab
МАИ, лаборатория Лямбда, практика, бригада, кафедра
#
# People (lab members)
Туревич, Нураев, Батурин, Бахтиозин, Слонова
Кондрушин, Курноскин, Смирнов, Путиловский
Эль-Тахир, Кобылкевич, Малинин, Шварц
Тимошенко, Чубинец, Яшков, Соболев
#
# Companies & orgs
ВТБ, T1, Сбербанк, МТС Диджитал
Wildberries, Хабр
#
# General terms often misrecognized
тимлид, календарный план, конгломерат, квиз
стартап, деплой, инфраструктура, безопасность
IT-льготы, GitHub, open source, VPS
петличка, Saramonic, диктофон, скрипт

44
scripts/local_whisper.py Normal file
View file

@@ -0,0 +1,44 @@
#!/usr/bin/env python3
"""
Audio transcription via faster-whisper, with a workaround for an MKL
threading error (see the environment variables set below).

USAGE:
    import os
    os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
    os.environ["OMP_NUM_THREADS"] = "2"
    from faster_whisper import WhisperModel
    model = WhisperModel("small")
"""
import os
import sys
# CRITICAL: must be set BEFORE importing faster_whisper — the original author
# notes these fix an MKL error that occurs otherwise.
os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
os.environ["OMP_NUM_THREADS"] = "2"  # low thread count; presumably tuned for the host — TODO confirm
from faster_whisper import WhisperModel
def transcribe_audio(audio_path, model_size="small", language="ru"):
    """Transcribe an audio file with faster-whisper.

    :param audio_path: path to the audio file
    :param model_size: whisper model size identifier (default "small")
    :param language: language code passed to the decoder (default "ru")
    :return: list of transcription segments
    """
    print(f"Loading model {model_size}...")
    whisper = WhisperModel(model_size)
    print(f"Transcribing {audio_path}...")
    seg_iter, _info = whisper.transcribe(audio_path, language=language)
    # Materialize the lazy iterator so all decoding happens here,
    # not piecemeal at the caller.
    return list(seg_iter)
if __name__ == "__main__":
    # CLI entry point: one positional argument — the audio file to transcribe.
    if len(sys.argv) < 2:
        print("Usage: python3 local_whisper.py <audio.wav>")
        sys.exit(1)
    for seg in transcribe_audio(sys.argv[1]):
        print(seg.text)