From ba56147e953947002b56365e00c7ea1114517d1e Mon Sep 17 00:00:00 2001
From: slonovaad
Date: Tue, 21 Apr 2026 18:09:28 +0000
Subject: [PATCH] Add scripts

---
 scripts/merge_transcriptions.py |  47 ++++++
 scripts/overlay.sh              |  49 +++++++
 scripts/transcribe.sh           | 248 ++++++++++++++++++++++++++++++++
 scripts/transcribe_chunked.sh   | 122 ++++++++++++++++
 4 files changed, 466 insertions(+)
 create mode 100644 scripts/merge_transcriptions.py
 create mode 100644 scripts/overlay.sh
 create mode 100644 scripts/transcribe.sh
 create mode 100644 scripts/transcribe_chunked.sh

diff --git a/scripts/merge_transcriptions.py b/scripts/merge_transcriptions.py
new file mode 100644
index 0000000..112df43
--- /dev/null
+++ b/scripts/merge_transcriptions.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+"""Merge the transcription text files of one meeting into a single file."""
+
+import os
+import re
+import sys
+
+
+def _natural_key(name):
+    """Sort key ordering embedded numbers numerically (chunk_2 < chunk_10)."""
+    # re.split with a capturing group alternates text and digit runs, so
+    # the digit runs always sit at the odd indexes.
+    parts = re.split(r'(\d+)', name)
+    return [int(p) if i % 2 else p for i, p in enumerate(parts)]
+
+
+def merge_transcriptions(timeline_dir, output_path="merged_plain.txt"):
+    """Concatenate every non-empty .txt file of timeline_dir into one file.
+
+    Files whose name contains 'merged' are skipped, so the result of an
+    earlier merge is never fed back in. Each file's text is preceded by a
+    '--- <file name> ---' header. The result is written to output_path
+    inside timeline_dir.
+    """
+    txt_files = sorted(
+        (f for f in os.listdir(timeline_dir)
+         if f.endswith('.txt') and 'merged' not in f),
+        key=_natural_key,
+    )
+
+    merged = []
+    for txt_file in txt_files:
+        with open(os.path.join(timeline_dir, txt_file), 'r', encoding='utf-8') as f:
+            content = f.read().strip()
+        if content:
+            merged.append(f"--- {txt_file} ---\n{content}\n")
+
+    with open(os.path.join(timeline_dir, output_path), 'w', encoding='utf-8') as f:
+        f.write('\n\n'.join(merged))
+
+    # Empty inputs are skipped above, so report what was actually written.
+    print(f"Merged {len(merged)} files into {output_path}")
+
+
+if __name__ == "__main__":
+    dir_path = sys.argv[1] if len(sys.argv) > 1 else "transcription"
+    merge_transcriptions(dir_path)
diff --git a/scripts/overlay.sh b/scripts/overlay.sh
new file mode 100644
index 0000000..156899d
--- /dev/null
+++ b/scripts/overlay.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Wrapper that runs the transcription with the environment set up correctly.
+#
+# Usage:
+#   ./overlay.sh [meeting_folder]    (defaults to the current directory)
+
+set -euo pipefail
+
+# CRITICAL: variables for Intel oneMKL
+export MKL_SERVICE_FORCE_INTEL=1
+export OMP_NUM_THREADS=2
+
+MEETING_DIR="${1:-.}"
+
+# Collect WAV files with either extension case. A glob inside `[ -f ... ]`
+# breaks as soon as it matches more than one file; nullglob makes an
+# unmatched pattern expand to nothing instead of staying a literal string.
+shopt -s nullglob
+WAV_FILES=("$MEETING_DIR"/*.wav "$MEETING_DIR"/*.WAV)
+shopt -u nullglob
+
+if [ ${#WAV_FILES[@]} -eq 0 ]; then
+  echo "Error: No WAV file found in $MEETING_DIR"
+  exit 1
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Check audio duration: the longest recording, in whole seconds. A file that
+# ffprobe cannot read counts as 0 instead of aborting the script.
+DURATION=0
+for wav in "${WAV_FILES[@]}"; do
+  secs=$(ffprobe -i "$wav" -show_entries format=duration -v quiet -of csv="p=0" 2>/dev/null | cut -d. -f1) || secs=""
+  if [[ "$secs" =~ ^[0-9]+$ ]] && [ "$secs" -gt "$DURATION" ]; then
+    DURATION=$secs
+  fi
+done
+
+# transcribe_chunked.sh handles exactly one WAV per folder; with several
+# recordings transcribe.sh is used, which chunks long sources on its own.
+if [ "$DURATION" -gt 1800 ] && [ ${#WAV_FILES[@]} -eq 1 ]; then # >30 minutes
+  echo "Audio is $DURATION seconds. Using chunked transcription..."
+  bash "$SCRIPT_DIR/transcribe_chunked.sh" "$MEETING_DIR"
+else
+  echo "Audio is $DURATION seconds. Using standard transcription..."
+  bash "$SCRIPT_DIR/transcribe.sh" "$MEETING_DIR"
+fi
+
+echo "Transcription complete. Check $MEETING_DIR/transcription/"
diff --git a/scripts/transcribe.sh b/scripts/transcribe.sh
new file mode 100644
index 0000000..0d37f7d
--- /dev/null
+++ b/scripts/transcribe.sh
@@ -0,0 +1,248 @@
+#!/bin/bash
+# Transcribe audio recordings using local faster-whisper
+# Supports multiple sources: Zoom H2n (4ch WAV), Saramonic (mono WAV), etc.
+#
+# Usage:
+#   ./transcribe.sh /absolute/path/to/meeting_folder
+#   ./transcribe.sh /absolute/path/to/meeting_folder specific.WAV output_name
+#
+# Examples:
+#   ./transcribe.sh /app/hermes_data/meetings/2026-02-18
+#   ./transcribe.sh /app/hermes_data/meetings/2026-02-18 SR003XY.WAV h2n_xy
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+WHISPER_MODEL="base"
+# NOTE(review): LANGUAGE is not passed to local_whisper.py anywhere in this
+# script - confirm whether the helper is supposed to receive it.
+LANGUAGE="ru"
+
+if [ $# -lt 1 ]; then
+  echo "Usage: $0 <meeting_folder> [<file.WAV> <output_name>]"
+  echo "Example: $0 /app/hermes_data/meetings/2026-02-18"
+  exit 1
+fi
+
+# Load hotwords
+HOTWORDS_FILE="$SCRIPT_DIR/hotwords.txt"
+if [ -f "$HOTWORDS_FILE" ]; then
+  # One awk pass drops comment and blank lines and joins the rest with
+  # commas. Chained `grep -v` exits 1 when nothing is left, which under
+  # `set -e -o pipefail` aborted the script for a comments-only file.
+  HOTWORDS=$(awk '!/^#/ && NF { printf "%s%s", sep, $0; sep = "," }' "$HOTWORDS_FILE")
+  echo "Loaded hotwords from $HOTWORDS_FILE"
+else
+  HOTWORDS=""
+  echo "Warning: hotwords.txt not found, proceeding without hotwords"
+fi
+
+# realpath resolves relative and absolute paths alike
+MEETING_DIR="$(realpath "$1")"
+
+WORK_DIR="$MEETING_DIR"
+OUTPUT_DIR="$WORK_DIR/transcription"
+mkdir -p "$OUTPUT_DIR"
+
+# Function: convert WAV(s) to mono mp3
+# Arguments: $1 - output mp3 path; $2... - one or more input audio files
+#            (several inputs are concatenated in the given order)
+# Returns:   0 on success or if the mp3 already exists, non-zero otherwise
+convert_to_mp3() {
+  local output_mp3="$1"
+  shift
+  local inputs=("$@")
+
+  if [ -f "$output_mp3" ]; then
+    echo "  $output_mp3 already exists, skipping conversion"
+    return
+  fi
+
+  local f
+  for f in "${inputs[@]}"; do
+    if [ ! -f "$f" ]; then
+      echo "Error: input file not found: $f" >&2
+      return 1
+    fi
+  done
+
+  # Encode into a temporary file and rename it on success, so that a failed
+  # or interrupted run never leaves a truncated mp3 behind that the
+  # "already exists" check above would accept on the next run.
+  local tmp_mp3="${output_mp3%.mp3}.partial.mp3"
+  local status=0
+
+  if [ ${#inputs[@]} -eq 1 ]; then
+    echo "  Converting ${inputs[0]} -> $output_mp3"
+    ffmpeg -nostdin -y -i "${inputs[0]}" -ac 1 -ar 16000 -b:a 64k "$tmp_mp3" 2>/dev/null || status=$?
+  else
+    local listfile
+    listfile=$(mktemp /tmp/ffmpeg_concat_XXXXXX.txt)
+    for f in "${inputs[@]}"; do
+      printf "file '%s'\n" "$f" >> "$listfile"
+    done
+    echo "  Concatenating ${#inputs[@]} files -> $output_mp3"
+    ffmpeg -nostdin -y -f concat -safe 0 -i "$listfile" -ac 1 -ar 16000 -b:a 64k "$tmp_mp3" 2>/dev/null || status=$?
+    rm -f "$listfile"
+  fi
+
+  if [ "$status" -ne 0 ]; then
+    rm -f "$tmp_mp3"
+    echo "Error: ffmpeg failed (exit $status) while creating $output_mp3" >&2
+    return "$status"
+  fi
+  mv -f "$tmp_mp3" "$output_mp3"
+
+  local dur
+  dur=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$output_mp3" | cut -d. -f1)
+  # Skip the summary when ffprobe printed nothing usable; an empty value
+  # would be a syntax error inside $(( )).
+  if [[ "$dur" =~ ^[0-9]+$ ]]; then
+    echo "  Duration: ${dur}s ($(( dur / 60 ))m$(( dur % 60 ))s)"
+  fi
+}
+
+# Function: transcribe using local faster-whisper (with chunking if needed)
+# Arguments: $1 - mp3 file to transcribe; $2 - base name of the outputs
+# Globals:   OUTPUT_DIR, SCRIPT_DIR, WHISPER_MODEL, HOTWORDS (all read)
+# Outputs:   <name>.json, <name>.txt and <name>_plain.txt in OUTPUT_DIR;
+#            audio longer than 30 minutes is handed to transcribe_chunked.sh
+transcribe_file() {
+  local mp3_file="$1"
+  local name="$2"
+  local json_file="$OUTPUT_DIR/${name}.json"
+
+  if [ -f "$json_file" ]; then
+    echo "  $name already transcribed, skipping"
+    return
+  fi
+
+  # Check duration of mp3. An unreadable duration falls through to the
+  # normal (non-chunked) path below.
+  local duration
+  duration=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$mp3_file" | cut -d. -f1) || duration=""
+  if [[ "$duration" =~ ^[0-9]+$ ]] && [ "$duration" -gt 1800 ]; then # > 30 minutes
+    echo "  Audio is ${duration}s long (>30 min), using chunked transcription..."
+    bash "$SCRIPT_DIR/transcribe_chunked.sh" "$mp3_file" "$name" "$OUTPUT_DIR"
+    return
+  fi
+
+  echo "  Transcribing $name (local faster-whisper)..."
+  local started
+  started=$(date +%s)
+
+  MKL_SERVICE_FORCE_INTEL=1 OMP_NUM_THREADS=2 python3 "$SCRIPT_DIR/local_whisper.py" "$mp3_file" "$json_file" "$WHISPER_MODEL" "$HOTWORDS"
+
+  local elapsed=$(( $(date +%s) - started ))
+  echo "  Done in ${elapsed}s"
+
+  # Extract plain text and timestamped text. Files are opened as UTF-8
+  # explicitly so the result does not depend on the locale of the host.
+  python3 - "$json_file" "$OUTPUT_DIR" "$name" <<'PYEOF'
+import json, sys, os
+
+json_path = sys.argv[1]
+output_dir = sys.argv[2]
+name = sys.argv[3]
+
+with open(json_path, encoding="utf-8") as f:
+    data = json.load(f)
+
+segs = data.get("segments", [])
+
+# Timestamped text
+lines = []
+for seg in segs:
+    start = seg.get("start", 0)
+    h, m, s = int(start // 3600), int((start % 3600) // 60), int(start % 60)
+    lines.append(f"[{h:02d}:{m:02d}:{s:02d}] {seg['text'].strip()}")
+
+txt_path = os.path.join(output_dir, f"{name}.txt")
+with open(txt_path, "w", encoding="utf-8") as f:
+    f.write("\n".join(lines))
+
+# Plain text
+plain = " ".join(seg["text"].strip() for seg in segs)
+plain_path = os.path.join(output_dir, f"{name}_plain.txt")
+with open(plain_path, "w", encoding="utf-8") as f:
+    f.write(plain)
+
+print(f"  {len(segs)} segments, {len(plain)} chars")
+PYEOF
+}
+
+# Manual mode: specific file
+if [ $# -ge 3 ]; then
+  WAV_FILE="$WORK_DIR/$2"
+  NAME="$3"
+  MP3_FILE="$OUTPUT_DIR/${NAME}.mp3"
+
+  echo "=== Transcribing $2 as '$NAME' ==="
+  convert_to_mp3 "$MP3_FILE" "$WAV_FILE"
+  transcribe_file "$MP3_FILE" "$NAME"
+  echo "=== Done ==="
+  exit 0
+fi
+
+# Auto mode: detect and transcribe all sources
+echo "=== Auto-detecting audio sources in $WORK_DIR ==="
+
+# Detect H2n files (SR*XY.WAV, SR*MS.WAV). `-print -quit` stops at the first
+# match without the SIGPIPE that `find | head -1` can raise under pipefail.
+H2N_XY=$(find "$WORK_DIR" -maxdepth 1 -name "SR*XY.WAV" -print -quit)
+H2N_MS=$(find "$WORK_DIR" -maxdepth 1 -name "SR*MS.WAV" -print -quit)
+
+# Detect Saramonic / other timestamped WAV files (not SR*). The extension is
+# matched case-insensitively because overlay.sh accepts *.wav as well.
+mapfile -t SARAMONIC_FILES < <(find "$WORK_DIR" -maxdepth 1 -iname "*.wav" ! -name "SR*" | sort)
+
+SOURCES=()
+
+if [ -n "$H2N_XY" ]; then
+  echo "  Found H2n XY: $(basename "$H2N_XY")"
+  SOURCES+=("h2n_xy:$H2N_XY")
+fi
+if [ -n "$H2N_MS" ]; then
+  echo "  Found H2n MS: $(basename "$H2N_MS")"
+  SOURCES+=("h2n_ms:$H2N_MS")
+fi
+if [ ${#SARAMONIC_FILES[@]} -gt 0 ]; then
+  echo "  Found Saramonic files: ${SARAMONIC_FILES[*]##*/}"
+  joined=$(printf "|%s" "${SARAMONIC_FILES[@]}")
+  joined="${joined:1}"
+  SOURCES+=("saramonic:$joined")
+fi
+
+if [ ${#SOURCES[@]} -eq 0 ]; then
+  echo "Error: No WAV files found in $WORK_DIR"
+  exit 1
+fi
+
+echo ""
+echo "=== Step 1: Converting to mp3 ==="
+for entry in "${SOURCES[@]}"; do
+  name="${entry%%:*}"
+  paths="${entry#*:}"
+  mp3="$OUTPUT_DIR/${name}.mp3"
+
+  IFS='|' read -ra files <<< "$paths"
+  convert_to_mp3 "$mp3" "${files[@]}"
+done
+
+echo ""
+echo "=== Step 2: Transcribing ==="
+for entry in "${SOURCES[@]}"; do
+  name="${entry%%:*}"
+  mp3="$OUTPUT_DIR/${name}.mp3"
+  transcribe_file "$mp3" "$name"
+done
+
+echo ""
+echo "=== Done! ==="
+echo "Results in: $OUTPUT_DIR/"
+for entry in "${SOURCES[@]}"; do
+  name="${entry%%:*}"
+  echo "  ${name}.json      - whisper JSON with segments"
+  echo "  ${name}.txt       - timestamped transcription"
+  echo "  ${name}_plain.txt - plain text"
+done
diff --git a/scripts/transcribe_chunked.sh b/scripts/transcribe_chunked.sh
new file mode 100644
index 0000000..f8e68e0
--- /dev/null
+++ b/scripts/transcribe_chunked.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+# Chunked transcription for long audio recordings
+#
+# Usage:
+#   ./transcribe_chunked.sh [meeting_folder]
+#       Takes the single WAV file of meeting_folder (default: current
+#       directory) and writes chunk_N.wav, chunk_N.txt and merged_raw.txt
+#       to meeting_folder/transcription.
+#   ./transcribe_chunked.sh audio_file [name [output_dir]]
+#       Form used by transcribe.sh: splits audio_file and writes
+#       <name>_chunk_N.wav, <name>_chunk_N.txt and <name>_merged_raw.txt
+#       to output_dir (default: the directory of audio_file).
+
+set -euo pipefail
+
+# Resolved through cd/pwd: "${BASH_SOURCE[0]%/*}" yields the script name
+# itself when the script is started as `bash transcribe_chunked.sh`.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+INPUT="${1:-.}"
+
+if [ -f "$INPUT" ]; then
+  # File mode: outputs carry a "<name>_" prefix
+  AUDIO_FILE="$INPUT"
+  base="$(basename "$INPUT")"
+  NAME="${2:-${base%.*}}"
+  CHUNKS_DIR="${3:-$(dirname "$INPUT")}"
+  PREFIX="${NAME}_"
+else
+  # Folder mode: outputs keep their original, unprefixed names
+  MEETING_DIR="$INPUT"
+  CHUNKS_DIR="$MEETING_DIR/transcription"
+  PREFIX=""
+
+  # Get audio file: lower-case *.wav first, then *.WAV, without parsing ls
+  shopt -s nullglob
+  candidates=("$MEETING_DIR"/*.wav)
+  if [ ${#candidates[@]} -eq 0 ]; then
+    candidates=("$MEETING_DIR"/*.WAV)
+  fi
+  shopt -u nullglob
+
+  if [ ${#candidates[@]} -eq 0 ]; then
+    echo "Error: No WAV file found"
+    exit 1
+  elif [ ${#candidates[@]} -gt 1 ]; then
+    echo "Error: expected one WAV file in $MEETING_DIR, found ${#candidates[@]}" >&2
+    exit 1
+  fi
+  AUDIO_FILE="${candidates[0]}"
+fi
+
+mkdir -p "$CHUNKS_DIR"
+
+# Duration
+DURATION=$(ffprobe -i "$AUDIO_FILE" -show_entries format=duration -v quiet -of csv="p=0")
+if ! [[ "$DURATION" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+  echo "Error: could not determine the duration of $AUDIO_FILE" >&2
+  exit 1
+fi
+
+echo "Audio duration: $DURATION seconds"
+
+# Round the duration up to whole seconds so the loop below can use integer
+# arithmetic (no bc needed) and still covers a trailing partial second.
+duration_secs="${DURATION%%.*}"
+if [[ "$DURATION" == *.* && "${DURATION#*.}" =~ [1-9] ]]; then
+  duration_secs=$((10#$duration_secs + 1))
+fi
+
+# Chunk settings
+chunk_duration=600
+offset=0
+chunk_num=0
+
+echo "Extracting chunks..."
+
+while [ "$offset" -lt "$duration_secs" ]; do
+  chunk_file="$CHUNKS_DIR/${PREFIX}chunk_${chunk_num}.wav"
+  echo "Extracting chunk $chunk_num at offset $offset..."
+
+  # Retry logic
+  for attempt in 1 2 3; do
+    if ffmpeg -nostdin -i "$AUDIO_FILE" -ss "$offset" -t "$chunk_duration" -acodec pcm_s16le -ar 16000 "$chunk_file" -y 2>/dev/null; then
+      break
+    elif [ "$attempt" -eq 3 ]; then
+      echo "Error: Failed to extract chunk $chunk_num"
+      exit 1
+    fi
+    sleep 1
+  done
+
+  offset=$((offset + chunk_duration))
+  # Not ((chunk_num++)): that returns status 1 while chunk_num is 0, which
+  # made `set -e` abort the script right after the first chunk.
+  chunk_num=$((chunk_num + 1))
+done
+
+echo "Transcribing $chunk_num chunks..."
+
+# Transcribe each chunk
+# NOTE(review): transcribe.sh calls local_whisper.py with four arguments
+# (audio, json, model, hotwords); here it gets the audio file only and its
+# stdout is kept as the transcript - confirm the helper supports this form.
+for ((i = 0; i < chunk_num; i++)); do
+  chunk_file="$CHUNKS_DIR/${PREFIX}chunk_${i}.wav"
+  output_file="$CHUNKS_DIR/${PREFIX}chunk_${i}.txt"
+
+  echo "Transcribing chunk $i..."
+  MKL_SERVICE_FORCE_INTEL=1 OMP_NUM_THREADS=2 python3 "$SCRIPT_DIR/local_whisper.py" "$chunk_file" > "$output_file"
+done
+
+# Merge in numeric order. A `chunk_*.txt` glob sorts chunk_10 before chunk_2
+# and would also pick up stale chunks left over from an earlier run.
+echo "Merging transcriptions..."
+merged_file="$CHUNKS_DIR/${PREFIX}merged_raw.txt"
+: > "$merged_file"
+for ((i = 0; i < chunk_num; i++)); do
+  cat "$CHUNKS_DIR/${PREFIX}chunk_${i}.txt" >> "$merged_file"
+done
+
+echo "Done. Output: $merged_file"