From ba56147e953947002b56365e00c7ea1114517d1e Mon Sep 17 00:00:00 2001
From: slonovaad <mai.slonovaad@gmail.com>
Date: Tue, 21 Apr 2026 18:09:28 +0000
Subject: [PATCH] Add scripts

---
 scripts/merge_transcriptions.py |  25 ++++
 scripts/overlay.sh              |  30 +++++
 scripts/transcribe.sh           | 210 ++++++++++++++++++++++++++++++++
 scripts/transcribe_chunked.sh   |  64 ++++++++++
 4 files changed, 329 insertions(+)
 create mode 100644 scripts/merge_transcriptions.py
 create mode 100644 scripts/overlay.sh
 create mode 100644 scripts/transcribe.sh
 create mode 100644 scripts/transcribe_chunked.sh

diff --git a/scripts/merge_transcriptions.py b/scripts/merge_transcriptions.py
new file mode 100644
index 0000000..112df43
--- /dev/null
+++ b/scripts/merge_transcriptions.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+"""Объединение транскрипций из нескольких файлов."""
+
+import sys
+import os
+
+def merge_transcriptions(timeline_dir, output_path="merged_plain.txt"):
+    """Merge the non-empty .txt files of timeline_dir (sorted by name) into output_path.
+
+    Names containing 'merged' and the output file itself are skipped; the printed count covers merged files only."""
+    merged = []
+    for txt_file in sorted(os.listdir(timeline_dir)):
+        if not txt_file.endswith('.txt') or 'merged' in txt_file or txt_file == output_path:
+            continue
+        with open(os.path.join(timeline_dir, txt_file), 'r', encoding='utf-8') as f:
+            content = f.read().strip()
+        if content:
+            merged.append(f"--- {txt_file} ---\n{content}\n")
+    with open(os.path.join(timeline_dir, output_path), 'w', encoding='utf-8') as f:
+        f.write('\n\n'.join(merged))
+    print(f"Merged {len(merged)} files into {output_path}")
+
+if __name__ == "__main__":
+    # The first CLI argument selects the directory; without it "transcription" is used.
+    merge_transcriptions(sys.argv[1] if len(sys.argv) > 1 else "transcription")
diff --git a/scripts/overlay.sh b/scripts/overlay.sh
new file mode 100644
index 0000000..156899d
--- /dev/null
+++ b/scripts/overlay.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Wrapper that starts transcription with the correct environment setup.
+#
+# Usage: overlay.sh [meeting_dir]   (defaults to the current directory)
+
+set -euo pipefail
+
+# CRITICAL: variables for Intel oneMKL
+export MKL_SERVICE_FORCE_INTEL=1
+export OMP_NUM_THREADS=2
+
+MEETING_DIR="${1:-.}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Collect the recordings: lowercase ".wav" first, ".WAV" as the fallback.
+# nullglob makes an unmatched pattern expand to nothing instead of itself.
+shopt -s nullglob
+WAV_FILES=("$MEETING_DIR"/*.wav)
+if [ ${#WAV_FILES[@]} -eq 0 ]; then
+    WAV_FILES=("$MEETING_DIR"/*.WAV)
+fi
+shopt -u nullglob
+
+if [ ${#WAV_FILES[@]} -eq 0 ]; then
+    echo "Error: No WAV file found in $MEETING_DIR"
+    exit 1
+fi
+
+if [ ${#WAV_FILES[@]} -gt 1 ]; then
+    # transcribe_chunked.sh expects a single WAV per directory, so several
+    # recordings always go through transcribe.sh.
+    echo "Found ${#WAV_FILES[@]} WAV files. Using standard transcription..."
+    bash "$SCRIPT_DIR/transcribe.sh" "$MEETING_DIR"
+else
+    # Check audio duration (whole seconds). A failed probe leaves DURATION empty
+    # so it is reported below instead of silently ending the script.
+    DURATION=$(ffprobe -i "${WAV_FILES[0]}" -show_entries format=duration -v quiet -of csv="p=0" 2>/dev/null | cut -d. -f1) || DURATION=""
+    if ! [[ "$DURATION" =~ ^[0-9]+$ ]]; then
+        echo "Error: Could not determine duration of ${WAV_FILES[0]}" >&2
+        exit 1
+    fi
+
+    if [ "$DURATION" -gt 1800 ]; then  # > 30 minutes
+        echo "Audio is $DURATION seconds. Using chunked transcription..."
+        bash "$SCRIPT_DIR/transcribe_chunked.sh" "$MEETING_DIR"
+    else
+        echo "Audio is $DURATION seconds. Using standard transcription..."
+        bash "$SCRIPT_DIR/transcribe.sh" "$MEETING_DIR"
+    fi
+fi
+
+echo "Transcription complete. Check $MEETING_DIR/transcription/"
diff --git a/scripts/transcribe.sh b/scripts/transcribe.sh
new file mode 100644
index 0000000..0d37f7d
--- /dev/null
+++ b/scripts/transcribe.sh
@@ -0,0 +1,210 @@
+#!/bin/bash
+# Transcribe audio recordings using local faster-whisper
+# Supports multiple sources: Zoom H2n (4ch WAV), Saramonic (mono WAV), etc.
+#
+# Usage:
+#   ./transcribe.sh /path/to/meeting_folder
+#   ./transcribe.sh /path/to/meeting_folder specific.WAV output_name
+#
+# Examples:
+#   ./transcribe.sh /app/hermes_data/meetings/2026-02-18
+#   ./transcribe.sh /app/hermes_data/meetings/2026-02-18 SR003XY.WAV h2n_xy
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+WHISPER_MODEL="base"
+LANGUAGE="ru"
+
+# Validate arguments before doing any other work.
+if [ $# -lt 1 ]; then
+    echo "Usage: $0 <absolute_meeting_dir> [<file.WAV> <output_name>]"
+    echo "Example: $0 /app/hermes_data/meetings/2026-02-18"
+    exit 1
+fi
+
+# Load hotwords: one per line, '#' comments and blank lines are dropped, the
+# rest is joined with commas. The "|| true" keeps a comment-only file from
+# aborting the script (grep exits 1 when it selects nothing, which pipefail
+# would otherwise turn into a fatal error).
+HOTWORDS_FILE="$SCRIPT_DIR/hotwords.txt"
+if [ -f "$HOTWORDS_FILE" ]; then
+    HOTWORDS=$( { grep -v '^#' "$HOTWORDS_FILE" | grep -v '^$' || true; } | tr '\n' ',' | sed 's/,,*/,/g; s/^,//; s/,$//')
+    echo "Loaded hotwords from $HOTWORDS_FILE"
+else
+    HOTWORDS=""
+    echo "Warning: hotwords.txt not found, proceeding without hotwords"
+fi
+
+# Resolve the meeting directory to an absolute path (relative input is accepted).
+MEETING_DIR="$(realpath "$1")"
+[ -d "$MEETING_DIR" ] || { echo "Error: $MEETING_DIR is not a directory" >&2; exit 1; }
+
+WORK_DIR="$MEETING_DIR"
+OUTPUT_DIR="$WORK_DIR/transcription"
+mkdir -p "$OUTPUT_DIR"
+# Function: convert WAV(s) to one mono 16 kHz mp3
+#   $1 - output mp3 (reused if it already exists); $2... - inputs, concatenated in order
+# Encodes to a ".part.mp3" sibling and renames it on success, so a failed run
+# never leaves a truncated mp3 behind for the "already exists" shortcut.
+convert_to_mp3() {
+    local output_mp3="$1"
+    shift
+    local inputs=("$@")
+    local partial="${output_mp3%.mp3}.part.mp3" listfile f dur rc=0
+    local quote="'" escaped="'\\''"  # the concat list format spells ' as '\''
+    if [ -f "$output_mp3" ]; then
+        echo "  $output_mp3 already exists, skipping conversion"
+        return
+    fi
+    if [ ${#inputs[@]} -eq 1 ]; then
+        echo "  Converting ${inputs[0]} -> $output_mp3"
+        ffmpeg -y -i "${inputs[0]}" -ac 1 -ar 16000 -b:a 64k "$partial" 2>/dev/null || rc=$?
+    else
+        listfile=$(mktemp /tmp/ffmpeg_concat_XXXXXX.txt)
+        for f in "${inputs[@]}"; do printf "file '%s'\n" "${f//$quote/$escaped}" >> "$listfile"; done
+        echo "  Concatenating ${#inputs[@]} files -> $output_mp3"
+        ffmpeg -y -f concat -safe 0 -i "$listfile" -ac 1 -ar 16000 -b:a 64k "$partial" 2>/dev/null || rc=$?
+        rm -f "$listfile"
+    fi
+    if [ "$rc" -ne 0 ]; then rm -f "$partial"; echo "Error: ffmpeg failed for $output_mp3" >&2; return "$rc"; fi
+    mv -f "$partial" "$output_mp3"
+    dur=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$output_mp3" | cut -d. -f1)
+    echo "  Duration: ${dur}s ($(( dur / 60 ))m$(( dur % 60 ))s)"
+}
+
+# Function: transcribe one mp3 with local faster-whisper (chunked if > 30 min)
+#   $1 - mp3 file to transcribe
+#   $2 - output base name for <name>.json, <name>.txt and <name>_plain.txt in OUTPUT_DIR
+# Globals (read): OUTPUT_DIR, SCRIPT_DIR, WHISPER_MODEL, HOTWORDS
+transcribe_file() {
+    local mp3_file="$1" name="$2"
+    local json_file="$OUTPUT_DIR/${name}.json"
+    local duration started elapsed
+
+    if [ -f "$json_file" ]; then
+        echo "  $name already transcribed, skipping"
+        return
+    fi
+
+    # Duration in whole seconds. Assigned apart from "local" so a probe failure is
+    # not masked; a non-numeric result is reported rather than compared.
+    duration=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$mp3_file" | cut -d. -f1) || duration=""
+    if ! [[ "$duration" =~ ^[0-9]+$ ]]; then
+        echo "Error: cannot determine duration of $mp3_file" >&2
+        return 1
+    fi
+    if [ "$duration" -gt 1800 ]; then  # > 30 minutes
+        echo "  Audio is ${duration}s long (>30 min), using chunked transcription..."
+        # NOTE(review): confirm transcribe_chunked.sh accepts <audio> <name> <output_dir>.
+        bash "$SCRIPT_DIR/transcribe_chunked.sh" "$mp3_file" "$name" "$OUTPUT_DIR"
+        return
+    fi
+
+    echo "  Transcribing $name (local faster-whisper)..."
+    started=$(date +%s)
+    MKL_SERVICE_FORCE_INTEL=1 OMP_NUM_THREADS=2 python3 "$SCRIPT_DIR/local_whisper.py" "$mp3_file" "$json_file" "$WHISPER_MODEL" "$HOTWORDS"
+    elapsed=$(( $(date +%s) - started ))
+    echo "    Done in ${elapsed}s"
+
+    # Extract plain text and timestamped text
+    python3 - "$json_file" "$OUTPUT_DIR" "$name" <<'PYEOF'
+import json, sys, os
+
+json_path, output_dir, name = sys.argv[1:4]
+# Explicit UTF-8 keeps non-ASCII transcripts independent of the locale encoding.
+with open(json_path, encoding="utf-8") as f:
+    data = json.load(f)
+segs = data.get("segments", [])
+
+# Timestamped text
+lines = []
+for seg in segs:
+    start = seg.get("start", 0)
+    h, m, s = int(start // 3600), int((start % 3600) // 60), int(start % 60)
+    lines.append(f"[{h:02d}:{m:02d}:{s:02d}] {seg['text'].strip()}")
+with open(os.path.join(output_dir, f"{name}.txt"), "w", encoding="utf-8") as f:
+    f.write("\n".join(lines))
+
+# Plain text
+plain = " ".join(seg["text"].strip() for seg in segs)
+with open(os.path.join(output_dir, f"{name}_plain.txt"), "w", encoding="utf-8") as f:
+    f.write(plain)
+
+print(f"    {len(segs)} segments, {len(plain)} chars")
+PYEOF
+}
+
+# Manual mode: specific file ($2 = WAV name inside the meeting dir, $3 = output name)
+if [ $# -ge 3 ]; then
+    WAV_FILE="$WORK_DIR/$2"
+    NAME="$3"
+    MP3_FILE="$OUTPUT_DIR/${NAME}.mp3"
+    # convert_to_mp3 discards ffmpeg's stderr, so report a missing input here.
+    if [ ! -f "$WAV_FILE" ]; then
+        echo "Error: $WAV_FILE not found" >&2
+        exit 1
+    fi
+    echo "=== Transcribing $2 as '$NAME' ==="
+    convert_to_mp3 "$MP3_FILE" "$WAV_FILE"
+    transcribe_file "$MP3_FILE" "$NAME"
+    echo "=== Done ==="
+    exit 0
+fi
+
+# Auto mode: detect and transcribe all sources
+echo "=== Auto-detecting audio sources in $WORK_DIR ==="
+# Detect H2n files (SR*XY.WAV, SR*MS.WAV). The extension part is matched case-insensitively
+# so ".wav" is found too; sorting makes the pick deterministic when several files match.
+H2N_XY=$(find "$WORK_DIR" -maxdepth 1 -name "SR*" -iname "*XY.WAV" | sort | head -1)
+H2N_MS=$(find "$WORK_DIR" -maxdepth 1 -name "SR*" -iname "*MS.WAV" | sort | head -1)
+# Detect Saramonic / other timestamped WAV files (not SR*)
+mapfile -t SARAMONIC_FILES < <(find "$WORK_DIR" -maxdepth 1 -iname "*.WAV" ! -name "SR*" | sort)
+# Each entry is "<name>:<path>", with several paths joined by '|'.
+SOURCES=()
+if [ -n "$H2N_XY" ]; then
+    echo "  Found H2n XY: $(basename "$H2N_XY")"
+    SOURCES+=("h2n_xy:$H2N_XY")
+fi
+if [ -n "$H2N_MS" ]; then
+    echo "  Found H2n MS: $(basename "$H2N_MS")"
+    SOURCES+=("h2n_ms:$H2N_MS")
+fi
+if [ ${#SARAMONIC_FILES[@]} -gt 0 ]; then
+    echo "  Found Saramonic files: ${SARAMONIC_FILES[*]##*/}"
+    joined=$(printf "|%s" "${SARAMONIC_FILES[@]}")
+    joined="${joined:1}"
+    SOURCES+=("saramonic:$joined")
+fi
+if [ ${#SOURCES[@]} -eq 0 ]; then
+    echo "Error: No WAV files found in $WORK_DIR"
+    exit 1
+fi
+
+echo ""
+echo "=== Step 1: Converting to mp3 ==="
+for entry in "${SOURCES[@]}"; do
+    name="${entry%%:*}"
+    paths="${entry#*:}"
+    mp3="$OUTPUT_DIR/${name}.mp3"
+    IFS='|' read -ra files <<< "$paths"
+    convert_to_mp3 "$mp3" "${files[@]}"
+done
+
+echo ""
+echo "=== Step 2: Transcribing ==="
+for entry in "${SOURCES[@]}"; do
+    name="${entry%%:*}"
+    mp3="$OUTPUT_DIR/${name}.mp3"
+    transcribe_file "$mp3" "$name"
+done
+
+echo ""
+echo "=== Done! ==="
+echo "Results in: $OUTPUT_DIR/"
+for entry in "${SOURCES[@]}"; do
+    name="${entry%%:*}"
+    echo "  ${name}.json        - whisper JSON with segments"
+    echo "  ${name}.txt         - timestamped transcription"
+    echo "  ${name}_plain.txt   - plain text"
+done
\ No newline at end of file
diff --git a/scripts/transcribe_chunked.sh b/scripts/transcribe_chunked.sh
new file mode 100644
index 0000000..f8e68e0
--- /dev/null
+++ b/scripts/transcribe_chunked.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+# Chunked transcription for long audio files.
+#
+# Usage:
+#   transcribe_chunked.sh [meeting_dir]
+#       Uses the WAV file found in meeting_dir (default: current directory);
+#       chunks and merged_raw.txt are written to <meeting_dir>/transcription.
+#   transcribe_chunked.sh <audio_file> [name] [output_dir]
+#       Splits the given file; outputs are prefixed with "<name>_" and written
+#       to output_dir (default: "transcription" next to the audio file).
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INPUT="${1:-.}"
+PREFIX=""
+
+if [ -f "$INPUT" ]; then
+    # File mode: this is how transcribe.sh invokes the script.
+    WAV_FILE="$INPUT"
+    if [ -n "${2:-}" ]; then
+        PREFIX="${2}_"
+    fi
+    CHUNKS_DIR="${3:-$(dirname "$INPUT")/transcription}"
+else
+    # Directory mode: prefer "*.wav", fall back to "*.WAV".
+    MEETING_DIR="$INPUT"
+    CHUNKS_DIR="$MEETING_DIR/transcription"
+    shopt -s nullglob
+    candidates=("$MEETING_DIR"/*.wav)
+    if [ ${#candidates[@]} -eq 0 ]; then
+        candidates=("$MEETING_DIR"/*.WAV)
+    fi
+    shopt -u nullglob
+    if [ ${#candidates[@]} -eq 0 ]; then
+        echo "Error: No WAV file found"
+        exit 1
+    fi
+    if [ ${#candidates[@]} -gt 1 ]; then
+        echo "Error: Expected a single WAV file in $MEETING_DIR, found ${#candidates[@]}" >&2
+        exit 1
+    fi
+    WAV_FILE="${candidates[0]}"
+fi
+
+mkdir -p "$CHUNKS_DIR"
+
+# Duration in (fractional) seconds; anything non-numeric means the probe failed.
+DURATION=$(ffprobe -i "$WAV_FILE" -show_entries format=duration -v quiet -of csv="p=0") || DURATION=""
+if ! [[ "$DURATION" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+    echo "Error: Could not determine duration of $WAV_FILE" >&2
+    exit 1
+fi
+
+echo "Audio duration: $DURATION seconds"
+
+# Chunk settings
+chunk_duration=600
+offset=0
+chunk_num=0
+
+echo "Extracting chunks..."
+
+# awk does the comparison because DURATION is fractional.
+while awk -v o="$offset" -v d="$DURATION" 'BEGIN { exit !(o < d) }'; do
+    chunk_file="$CHUNKS_DIR/${PREFIX}chunk_${chunk_num}.wav"
+    echo "Extracting chunk $chunk_num at offset $offset..."
+
+    # Retry logic
+    for attempt in 1 2 3; do
+        if ffmpeg -i "$WAV_FILE" -ss "$offset" -t "$chunk_duration" -acodec pcm_s16le -ar 16000 "$chunk_file" -y 2>/dev/null; then
+            break
+        elif [ "$attempt" -eq 3 ]; then
+            echo "Error: Failed to extract chunk $chunk_num"
+            exit 1
+        fi
+        sleep 1
+    done
+
+    offset=$((offset + chunk_duration))
+    # Plain assignment: "((chunk_num++))" returns status 1 when the old value
+    # is 0, which would abort the script under "set -e" after the first chunk.
+    chunk_num=$((chunk_num + 1))
+done
+
+echo "Transcribing $chunk_num chunks..."
+
+# Transcribe each chunk
+# NOTE(review): transcribe.sh passes local_whisper.py an output path, a model
+# and hotwords; here only the input is passed and stdout is captured - confirm
+# that local_whisper.py supports this form.
+for ((i = 0; i < chunk_num; i++)); do
+    chunk_file="$CHUNKS_DIR/${PREFIX}chunk_${i}.wav"
+    output_file="$CHUNKS_DIR/${PREFIX}chunk_${i}.txt"
+
+    echo "Transcribing chunk $i..."
+    MKL_SERVICE_FORCE_INTEL=1 OMP_NUM_THREADS=2 python3 "$SCRIPT_DIR/local_whisper.py" "$chunk_file" > "$output_file"
+done
+
+# Merge in numeric chunk order; a "chunk_*.txt" glob would sort chunk_10 before
+# chunk_2 and could also pick up stale files from earlier runs.
+echo "Merging transcriptions..."
+merged_file="$CHUNKS_DIR/${PREFIX}merged_raw.txt"
+: > "$merged_file"
+for ((i = 0; i < chunk_num; i++)); do
+    cat "$CHUNKS_DIR/${PREFIX}chunk_${i}.txt" >> "$merged_file"
+done
+
+echo "Done. Output: $merged_file"