Add scripts
This commit is contained in:
parent
c196f14ae9
commit
992a748c51
5 changed files with 230 additions and 0 deletions
30
scripts/concat_wav.sh
Normal file
30
scripts/concat_wav.sh
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash
# Concatenate multiple WAV/audio files into a single mp3 using ffmpeg concat demuxer
#
# Usage:   ./concat_wav.sh <output.mp3> <input1.WAV> <input2.WAV> ...
# Example: ./concat_wav.sh transcription/saramonic.mp3 20260325-091912.WAV 20260325-095007.WAV
#
# Requires: ffmpeg and ffprobe on PATH.
# Output is re-encoded to mono (-ac 1), 16 kHz (-ar 16000), 64 kbit/s (-b:a 64k).

set -euo pipefail

# One output plus at least two inputs are required.
if [ $# -lt 3 ]; then
  echo "Usage: $0 <output.mp3> <input1> <input2> [input3 ...]" >&2
  exit 1
fi

OUTPUT="$1"
shift

# The template ends in X's so it is accepted by both GNU and BSD mktemp.
LISTFILE=$(mktemp "${TMPDIR:-/tmp}/ffmpeg_concat_XXXXXX")
# Single-quoted so $LISTFILE is expanded (and properly quoted) when the trap fires.
trap 'rm -f -- "$LISTFILE"' EXIT

# Inside a single-quoted concat-list entry a literal ' must be written as '\''.
SQ_ESCAPED="'\\''"

for f in "$@"; do
  # Checked explicitly: a failing `cd` inside the ABSPATH assignment below would
  # be masked by the exit status of the later `basename` substitution.
  if [ ! -f "$f" ]; then
    echo "Error: input file not found: $f" >&2
    exit 1
  fi
  ABSPATH="$(cd "$(dirname -- "$f")" && pwd)/$(basename -- "$f")"
  ESCAPED=${ABSPATH//\'/"$SQ_ESCAPED"}
  printf "file '%s'\n" "$ESCAPED" >> "$LISTFILE"
done

echo "Concatenating $# files -> $OUTPUT"
# -loglevel error keeps the run quiet but still shows why ffmpeg failed;
# -nostdin stops ffmpeg from consuming the caller's stdin.
ffmpeg -nostdin -hide_banner -loglevel error -y \
  -f concat -safe 0 -i "$LISTFILE" \
  -ac 1 -ar 16000 -b:a 64k "$OUTPUT"

# ffprobe prints seconds with a fractional part; keep the integer part only.
DUR=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$OUTPUT" | cut -d. -f1)
SIZE=$(du -h "$OUTPUT" | cut -f1)
if [[ "$DUR" =~ ^[0-9]+$ ]]; then
  echo "Done: ${DUR}s ($(( DUR / 60 ))m$(( DUR % 60 ))s), $SIZE"
else
  # Empty or non-numeric duration would make the arithmetic above abort.
  echo "Done: duration unknown, $SIZE"
fi
|
||||
45
scripts/generate_pdf.py
Normal file
45
scripts/generate_pdf.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Генерация PDF из Markdown через weasyprint.
|
||||
|
||||
УСТАНОВКА: pip install weasyprint
|
||||
ИСПОЛЬЗОВАНИЕ: python3 generate_pdf.py report.md report.pdf
|
||||
"""
|
||||
|
||||
import sys
|
||||
from weasyprint import HTML
|
||||
|
||||
def markdown_to_pdf(md_path, pdf_path):
    """Convert a Markdown file to PDF by way of a small HTML document.

    Only ATX headings are translated: a line that starts with one to six
    ``#`` characters followed by whitespace becomes ``<h1>``..``<h6>`` with a
    matching closing tag. All other text is placed into ``<body>`` unchanged
    (it is not HTML-escaped), so inline HTML in the source passes through.

    Args:
        md_path: Path of the UTF-8 encoded Markdown file to read.
        pdf_path: Destination handed to ``HTML.write_pdf``.
    """
    # Imported locally so this function does not depend on a change to the
    # module's import block.
    import re

    with open(md_path, 'r', encoding='utf-8') as f:
        md_content = f.read()

    def _heading_tag(match):
        """Build a closed heading element whose level equals the '#' count."""
        level = len(match.group(1))
        return f"<h{level}>{match.group(2)}</h{level}>"

    # Anchored to line starts: a '# ' in the middle of a sentence is left
    # alone, and '### x' becomes <h3> instead of a stray '#' plus <h2>.
    body_html = re.sub(
        r'^(#{1,6})[ \t]+(\S.*?)[ \t]*$',
        _heading_tag,
        md_content,
        flags=re.MULTILINE,
    )

    # Simple MD to HTML conversion
    html_content = f"""
<html lang="ru">
<head>
<meta charset="UTF-8">
<style>
@page {{ margin: 2cm; }}
body {{ font-family: sans-serif; line-height: 1.6; }}
h1 {{ color: #2c3e50; border-bottom: 2px solid #3498db; }}
h2 {{ color: #34495e; border-bottom: 1px solid #bdc3c7; }}
table {{ border-collapse: collapse; width: 100%; }}
th, td {{ border: 1px solid #ddd; padding: 8px; }}
</style>
</head>
<body>
{body_html}
</body>
</html>
"""

    HTML(string=html_content).write_pdf(pdf_path)
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry: <script> <markdown input> <pdf output>.
    cli_args = sys.argv
    if not len(cli_args) >= 3:
        print("Usage: python3 generate_pdf.py <report.md> <output.pdf>")
        sys.exit(1)

    source_md, target_pdf = cli_args[1], cli_args[2]
    markdown_to_pdf(source_md, target_pdf)
    print(f"PDF created: {target_pdf}")
|
||||
73
scripts/generate_report.sh
Normal file
73
scripts/generate_report.sh
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
#!/bin/bash
# generate_report.sh — Full pipeline for generating meeting report (without diagrams)
# Usage:   ./generate_report.sh /absolute/path/to/meeting_folder
# Example: ./generate_report.sh /app/hermes_data/meetings/2026-04-15
#
# Steps:
#   1. Run transcribe.sh (from this script's directory) unless
#      <meeting_folder>/transcription/plain_text.txt already exists.
#   2. Convert <meeting_folder>/report.md to report.pdf with pandoc + xelatex.
#
# A relative folder argument is resolved against this script's directory,
# not against the caller's working directory.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Load .env if exists (for hotwords etc.)
# set -a exports every variable the file assigns; set +a restores the default.
if [ -f "$SCRIPT_DIR/.env" ]; then
  set -a
  # shellcheck source=/dev/null
  source "$SCRIPT_DIR/.env"
  set +a
fi

if [ $# -lt 1 ]; then
  echo "Usage: $0 <absolute_path_to_meeting_folder>" >&2
  echo "Example: $0 /app/hermes_data/meetings/2026-04-15" >&2
  exit 1
fi

MEETING_DIR="$1"

# If relative path provided, convert to absolute
if [[ "$MEETING_DIR" != /* ]]; then
  MEETING_DIR="$SCRIPT_DIR/$MEETING_DIR"
fi

# Check before realpath: with set -e a failing realpath (e.g. a missing parent
# directory) would abort the script before this message could be printed.
if [ ! -d "$MEETING_DIR" ]; then
  echo "Error: Meeting directory not found: $MEETING_DIR" >&2
  exit 1
fi

# Resolve absolute path
MEETING_DIR="$(realpath "$MEETING_DIR")"

# ============================================================
# Step 1: Transcription (skip if already done)
# ============================================================
# -f on the file is sufficient; it cannot succeed unless the directory exists.
if [ -f "$MEETING_DIR/transcription/plain_text.txt" ]; then
  echo "[1/2] Transcription already exists, skipping."
else
  echo "[1/2] Running transcription..."
  bash "$SCRIPT_DIR/transcribe.sh" "$MEETING_DIR"
fi

# ============================================================
# Step 2: Generate PDF from markdown
# ============================================================
echo "[2/2] Generating PDF..."
REPORT_MD="$MEETING_DIR/report.md"
REPORT_PDF="$MEETING_DIR/report.pdf"

if [ ! -f "$REPORT_MD" ]; then
  echo "  Error: report.md not found at $REPORT_MD" >&2
  exit 1
fi

# NOTE(review): pandoc is run from inside the meeting folder, presumably so that
# relative resource paths in report.md resolve — confirm before removing the cd.
cd "$MEETING_DIR"
pandoc "$REPORT_MD" -o "$REPORT_PDF" \
  --pdf-engine=xelatex \
  -V mainfont="DejaVu Serif" \
  -V sansfont="DejaVu Sans" \
  -V monofont="DejaVu Sans Mono" \
  -V geometry:margin=2cm \
  -V fontsize=11pt \
  -V lang=ru \
  --highlight-style=tango

PDF_SIZE=$(du -h "$REPORT_PDF" | cut -f1)
echo ""
echo "Done! Report: $REPORT_PDF ($PDF_SIZE)"
|
||||
38
scripts/hotwords.txt
Normal file
38
scripts/hotwords.txt
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
# Hotwords for Whisper transcription of Lambda Lab meetings
|
||||
# One term per line, or comma-separated on one line
|
||||
# Used with faster-whisper "hotwords" parameter to improve recognition
|
||||
# of domain-specific terms, names, and abbreviations
|
||||
#
|
||||
# Project & product names
|
||||
OpenClaw, NanoClaw, IronClaw, ZeroClaw, PicoClaw, BlueClaw, MicroClaw
|
||||
ClawHub, Manus
|
||||
SOUL.md, MEMORY.md, HEARTBEAT.md
|
||||
#
|
||||
# Infrastructure & tools
|
||||
LiteLLM, LangFuse, Forgejo, Gitea
|
||||
Docker, LXC, MCP, API, RAG
|
||||
Telegram, Matrix, Element
|
||||
Playwright, Selenium
|
||||
#
|
||||
# AI/ML terms
|
||||
Claude, Anthropic, GPT, Qwen, DeepSeek
|
||||
промпт, агент, токен, эмбеддинг
|
||||
#
|
||||
# University & lab
|
||||
МАИ, лаборатория Лямбда, практика, бригада, кафедра
|
||||
#
|
||||
# People (lab members)
|
||||
Туревич, Нураев, Батурин, Бахтиозин, Слонова
|
||||
Кондрушин, Курноскин, Смирнов, Путиловский
|
||||
Эль-Тахир, Кобылкевич, Малинин, Шварц
|
||||
Тимошенко, Чубинец, Яшков, Соболев
|
||||
#
|
||||
# Companies & orgs
|
||||
ВТБ, T1, Сбербанк, МТС Диджитал
|
||||
Wildberries, Хабр
|
||||
#
|
||||
# General terms often misrecognized
|
||||
тимлид, календарный план, конгломерат, квиз
|
||||
стартап, деплой, инфраструктура, безопасность
|
||||
IT-льготы, GitHub, open source, VPS
|
||||
петличка, Saramonic, диктофон, скрипт
|
||||
44
scripts/local_whisper.py
Normal file
44
scripts/local_whisper.py
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Транскрипция аудио через faster-whisper с исправлением MKL ошибки.
|
||||
|
||||
ИСПОЛЬЗОВАНИЕ:
|
||||
import os
|
||||
os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
|
||||
os.environ["OMP_NUM_THREADS"] = "2"
|
||||
|
||||
from faster_whisper import WhisperModel
|
||||
model = WhisperModel("small")
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
# CRITICAL: Must be set BEFORE importing faster_whisper
|
||||
os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"
|
||||
os.environ["OMP_NUM_THREADS"] = "2"
|
||||
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
def transcribe_audio(audio_path, model_size="small", language="ru"):
    """Transcribe an audio file and return its segments as a list.

    Args:
        audio_path: Audio input forwarded to ``WhisperModel.transcribe``.
        model_size: Value passed to the ``WhisperModel`` constructor.
        language: Passed as the ``language`` keyword of ``transcribe``.

    Returns:
        A list built from the first element of the tuple returned by
        ``model.transcribe``; the second element is discarded.
    """
    # Progress messages go to stderr so that stdout carries only the
    # transcript text printed by the command-line entry point.
    print(f"Loading model {model_size}...", file=sys.stderr)
    model = WhisperModel(model_size)

    print(f"Transcribing {audio_path}...", file=sys.stderr)
    segments, _ = model.transcribe(audio_path, language=language)

    # Materialise the result up front; presumably the returned iterable is
    # lazy, so this is where the transcription work actually happens.
    return list(segments)
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry: the single required argument is the audio file.
    cli_args = sys.argv
    if not len(cli_args) >= 2:
        print("Usage: python3 local_whisper.py <audio.wav>")
        sys.exit(1)

    audio_file = cli_args[1]
    # Emit the text of each transcribed segment on its own line.
    for piece in transcribe_audio(audio_file):
        print(piece.text)
|
||||
Loading…
Add table
Add a link
Reference in a new issue