Add scripts
This commit is contained in:
parent
992a748c51
commit
ba56147e95
4 changed files with 329 additions and 0 deletions
25
scripts/merge_transcriptions.py
Normal file
25
scripts/merge_transcriptions.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Merge transcriptions from several files into one."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
def merge_transcriptions(timeline_dir, output_path="merged_plain.txt"):
    """Concatenate the non-empty ``.txt`` files of a directory into one file.

    Files are processed in sorted name order. Each non-empty file contributes
    a section of the form ``--- <file name> ---`` followed by its stripped
    content; sections are separated by blank lines.

    Args:
        timeline_dir: Directory that holds the transcription ``.txt`` files.
        output_path: Name of the merged file, resolved relative to
            ``timeline_dir`` (an absolute path is used as-is).

    Skipped inputs: names containing ``merged``, anything that is not a
    regular file, the output file itself, and files that are empty after
    stripping whitespace.
    """
    destination = os.path.join(timeline_dir, output_path)
    # Compared against every candidate so a previous run's output is never
    # fed back into the merge, even when its name does not contain "merged".
    destination_abs = os.path.abspath(destination)

    sections = []
    for txt_file in sorted(os.listdir(timeline_dir)):
        if not txt_file.endswith('.txt') or 'merged' in txt_file:
            continue
        source = os.path.join(timeline_dir, txt_file)
        # A directory may also be named "*.txt"; open() would fail on it.
        if not os.path.isfile(source):
            continue
        if os.path.abspath(source) == destination_abs:
            continue
        with open(source, 'r', encoding='utf-8') as f:
            content = f.read().strip()
        if content:
            sections.append(f"--- {txt_file} ---\n{content}\n")

    with open(destination, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(sections))

    # Report the number of files that actually ended up in the output.
    print(f"Merged {len(sections)} files into {output_path}")
|
||||
|
||||
if __name__ == "__main__":
    # The directory comes from the first command-line argument; without one
    # the script falls back to the "transcription" directory.
    cli_args = sys.argv[1:]
    if cli_args:
        target_dir = cli_args[0]
    else:
        target_dir = "transcription"
    merge_transcriptions(target_dir)
|
||||
30
scripts/overlay.sh
Normal file
30
scripts/overlay.sh
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash
# Wrapper that starts transcription with the environment configured correctly.
#
# Usage:
#   overlay.sh [meeting_dir]     (meeting_dir defaults to the current directory)
#
# Picks transcribe_chunked.sh for audio longer than 30 minutes and
# transcribe.sh otherwise; both are expected next to this script.

set -euo pipefail

# CRITICAL: variables for Intel oneMKL
export MKL_SERVICE_FORCE_INTEL=1
export OMP_NUM_THREADS=2

MEETING_DIR="${1:-.}"

# Collect WAV files of either extension case into an array. A glob inside
# `[ -f ... ]` breaks as soon as it expands to more than one file, and an
# unmatched glob would otherwise be passed on literally.
shopt -s nullglob
WAV_FILES=("$MEETING_DIR"/*.wav "$MEETING_DIR"/*.WAV)
shopt -u nullglob

if [ ${#WAV_FILES[@]} -eq 0 ]; then
    echo "Error: No WAV file found in $MEETING_DIR"
    exit 1
fi

AUDIO_FILE="${WAV_FILES[0]}"
if [ ${#WAV_FILES[@]} -gt 1 ]; then
    echo "Warning: several WAV files found; measuring duration of $AUDIO_FILE only"
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Check audio duration (whole seconds). `|| true` stops set -e/pipefail from
# killing the script without any message when ffprobe fails; the result is
# validated explicitly right below.
DURATION=$(ffprobe -i "$AUDIO_FILE" -show_entries format=duration -v quiet -of csv="p=0" 2>/dev/null | cut -d. -f1 || true)

if ! [[ "$DURATION" =~ ^[0-9]+$ ]]; then
    echo "Error: could not determine duration of $AUDIO_FILE"
    exit 1
fi

if [ "$DURATION" -gt 1800 ]; then  # > 30 minutes
    echo "Audio is $DURATION seconds. Using chunked transcription..."
    bash "$SCRIPT_DIR/transcribe_chunked.sh" "$MEETING_DIR"
else
    echo "Audio is $DURATION seconds. Using standard transcription..."
    bash "$SCRIPT_DIR/transcribe.sh" "$MEETING_DIR"
fi

echo "Transcription complete. Check $MEETING_DIR/transcription/"
|
||||
210
scripts/transcribe.sh
Normal file
210
scripts/transcribe.sh
Normal file
|
|
@ -0,0 +1,210 @@
|
|||
#!/bin/bash
# Transcribe audio recordings using local faster-whisper
# Supports multiple sources: Zoom H2n (4ch WAV), Saramonic (mono WAV), etc.
#
# Usage:
#   ./transcribe.sh /absolute/path/to/meeting_folder
#   ./transcribe.sh /absolute/path/to/meeting_folder specific.WAV output_name
#
# Examples:
#   ./transcribe.sh /app/hermes_data/meetings/2026-02-18
#   ./transcribe.sh /app/hermes_data/meetings/2026-02-18 SR003XY.WAV h2n_xy

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
WHISPER_MODEL="base"
# NOTE(review): LANGUAGE is assigned here but not referenced again in this
# script - confirm whether local_whisper.py is supposed to receive it.
LANGUAGE="ru"

# Load hotwords: one per line, '#' comment lines and blank lines are dropped,
# the rest is joined into a single comma-separated string.
HOTWORDS_FILE="$SCRIPT_DIR/hotwords.txt"
if [ -f "$HOTWORDS_FILE" ]; then
    # grep exits with status 1 when it selects no lines; without `|| true`
    # a hotwords file holding only comments would abort the whole script
    # under `set -e -o pipefail`.
    HOTWORDS=$({ grep -Ev '^(#|$)' "$HOTWORDS_FILE" || true; } | tr '\n' ',' | sed 's/,,*/,/g; s/^,//; s/,$//')
    echo "Loaded hotwords from $HOTWORDS_FILE"
else
    HOTWORDS=""
    echo "Warning: hotwords.txt not found, proceeding without hotwords"
fi

if [ $# -lt 1 ]; then
    echo "Usage: $0 <absolute_meeting_dir> [<file.WAV> <output_name>]"
    echo "Example: $0 /app/hermes_data/meetings/2026-02-18"
    exit 1
fi

# Normalise to an absolute path; realpath handles relative and absolute
# input alike, so no branching on the leading slash is needed.
MEETING_DIR="$(realpath "$1")"

WORK_DIR="$MEETING_DIR"
OUTPUT_DIR="$WORK_DIR/transcription"
mkdir -p "$OUTPUT_DIR"
|
||||
|
||||
# Function: convert WAV(s) to mono mp3
#
# Usage: convert_to_mp3 <output.mp3> <input> [<input> ...]
#
# A single input is converted directly; several inputs are concatenated in
# the given order through ffmpeg's concat demuxer. The result is mono,
# 16 kHz, 64 kbit/s. An existing output file is kept and the conversion is
# skipped. Returns non-zero when no input is given or ffmpeg fails.
convert_to_mp3() {
    local output_mp3="$1"
    shift
    local inputs=("$@")

    if [ -f "$output_mp3" ]; then
        echo " $output_mp3 already exists, skipping conversion"
        return
    fi

    if [ ${#inputs[@]} -eq 0 ]; then
        echo " Error: no input files given for $output_mp3" >&2
        return 1
    fi

    local status=0
    if [ ${#inputs[@]} -eq 1 ]; then
        echo " Converting ${inputs[0]} -> $output_mp3"
        ffmpeg -y -i "${inputs[0]}" -ac 1 -ar 16000 -b:a 64k "$output_mp3" 2>/dev/null || status=$?
    else
        local listfile f
        # The concat list wraps each path in single quotes, so a quote inside
        # a path has to be written as '\'' to keep the list parseable.
        local sq="'" esc="'\\''"
        listfile=$(mktemp /tmp/ffmpeg_concat_XXXXXX.txt)
        for f in "${inputs[@]}"; do
            printf "file '%s'\n" "${f//$sq/$esc}" >> "$listfile"
        done
        echo " Concatenating ${#inputs[@]} files -> $output_mp3"
        ffmpeg -y -f concat -safe 0 -i "$listfile" -ac 1 -ar 16000 -b:a 64k "$output_mp3" 2>/dev/null || status=$?
        # Remove the list on success and on failure alike.
        rm -f "$listfile"
    fi

    if [ "$status" -ne 0 ]; then
        # A partial file would be taken for a finished conversion by the
        # "already exists" check on the next run, so drop it.
        rm -f "$output_mp3"
        echo " Error: ffmpeg failed (exit $status) while creating $output_mp3" >&2
        return "$status"
    fi

    local dur
    dur=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$output_mp3" | cut -d. -f1)
    echo " Duration: ${dur}s ($(( dur / 60 ))m$(( dur % 60 ))s)"
}
|
||||
|
||||
# Function: transcribe using local faster-whisper (with chunking if needed)
#
# Usage: transcribe_file <audio.mp3> <name>
#
# Writes $OUTPUT_DIR/<name>.json via local_whisper.py and derives
# <name>.txt (one "[HH:MM:SS] text" line per segment) and <name>_plain.txt
# (all segment texts joined by spaces) from it. Does nothing when the JSON
# file already exists. Audio longer than 30 minutes is handed over to
# transcribe_chunked.sh instead.
# Reads the globals OUTPUT_DIR, SCRIPT_DIR, WHISPER_MODEL and HOTWORDS.
transcribe_file() {
    local mp3_file="$1"
    local name="$2"
    local json_file="$OUTPUT_DIR/${name}.json"

    if [ -f "$json_file" ]; then
        echo " $name already transcribed, skipping"
        return
    fi

    # Check duration of mp3. Declaration and assignment are separate because
    # `local var=$(...)` always reports the exit status of `local`.
    local duration
    duration=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$mp3_file" | cut -d. -f1) || duration=""
    if ! [[ "$duration" =~ ^[0-9]+$ ]]; then
        # Without a usable number the integer test below would error out;
        # fall back to the standard (non-chunked) path explicitly.
        echo " Warning: could not determine duration of $mp3_file, using standard transcription" >&2
        duration=0
    fi

    if [ "$duration" -gt 1800 ]; then  # > 30 minutes
        echo " Audio is ${duration}s long (>30 min), using chunked transcription..."
        # NOTE(review): transcribe_chunked.sh as committed reads only $1 and
        # treats it as a meeting directory containing a WAV file; the three
        # arguments passed here do not match that interface - confirm which
        # side is intended before relying on this branch.
        bash "$SCRIPT_DIR/transcribe_chunked.sh" "$mp3_file" "$name" "$OUTPUT_DIR"
        return
    fi

    echo " Transcribing $name (local faster-whisper)..."
    local started
    started=$(date +%s)

    MKL_SERVICE_FORCE_INTEL=1 OMP_NUM_THREADS=2 python3 "$SCRIPT_DIR/local_whisper.py" "$mp3_file" "$json_file" "$WHISPER_MODEL" "$HOTWORDS"

    local elapsed=$(( $(date +%s) - started ))
    echo " Done in ${elapsed}s"

    # Extract plain text and timestamped text
    python3 - "$json_file" "$OUTPUT_DIR" "$name" <<'PYEOF'
import json, sys, os

json_path = sys.argv[1]
output_dir = sys.argv[2]
name = sys.argv[3]

# Explicit UTF-8 everywhere: the default encoding depends on the locale and
# the transcripts contain non-ASCII text.
with open(json_path, encoding="utf-8") as f:
    data = json.load(f)

segs = data.get("segments", [])

# Timestamped text
lines = []
for seg in segs:
    start = seg.get("start", 0)
    h, m, s = int(start // 3600), int((start % 3600) // 60), int(start % 60)
    lines.append(f"[{h:02d}:{m:02d}:{s:02d}] {seg['text'].strip()}")

txt_path = os.path.join(output_dir, f"{name}.txt")
with open(txt_path, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

# Plain text
plain = " ".join(seg["text"].strip() for seg in segs)
plain_path = os.path.join(output_dir, f"{name}_plain.txt")
with open(plain_path, "w", encoding="utf-8") as f:
    f.write(plain)

print(f" {len(segs)} segments, {len(plain)} chars")
PYEOF
}
|
||||
|
||||
# Manual mode: specific file
# Requires both <file.WAV> (relative to the meeting directory) and
# <output_name>; converts and transcribes that single file, then exits.
if [ $# -ge 3 ]; then
    WAV_FILE="$WORK_DIR/$2"
    NAME="$3"
    MP3_FILE="$OUTPUT_DIR/${NAME}.mp3"

    # ffmpeg's stderr is discarded during conversion, so a missing input
    # would otherwise end the script without any message.
    if [ ! -f "$WAV_FILE" ]; then
        echo "Error: input file not found: $WAV_FILE" >&2
        exit 1
    fi

    echo "=== Transcribing $2 as '$NAME' ==="
    convert_to_mp3 "$MP3_FILE" "$WAV_FILE"
    transcribe_file "$MP3_FILE" "$NAME"
    echo "=== Done ==="
    exit 0
elif [ $# -eq 2 ]; then
    # Behaviour is unchanged (auto mode runs), but say so instead of
    # silently ignoring the argument.
    echo "Warning: <output_name> is missing; ignoring '$2' and running auto mode" >&2
fi
|
||||
|
||||
# Auto mode: detect and transcribe all sources
echo "=== Auto-detecting audio sources in $WORK_DIR ==="

# Detect H2n files (SR*XY.WAV, SR*MS.WAV). Matching is case-insensitive so
# lower-case ".wav" names are found too. Results are sorted and the first
# one is taken, which keeps the choice deterministic and avoids
# `find | head`, where an early exit of head can fail the pipeline under
# `set -o pipefail`.
mapfile -t h2n_xy_matches < <(find "$WORK_DIR" -maxdepth 1 -iname "SR*XY.WAV" | sort)
mapfile -t h2n_ms_matches < <(find "$WORK_DIR" -maxdepth 1 -iname "SR*MS.WAV" | sort)
H2N_XY="${h2n_xy_matches[0]:-}"
H2N_MS="${h2n_ms_matches[0]:-}"

# Detect Saramonic / other timestamped WAV files (not SR*)
mapfile -t SARAMONIC_FILES < <(find "$WORK_DIR" -maxdepth 1 -iname "*.WAV" ! -iname "SR*" | sort)

# Each entry is "<name>:<path>[|<path>...]"; several paths are concatenated
# into one mp3 later on.
SOURCES=()

if [ -n "$H2N_XY" ]; then
    echo " Found H2n XY: $(basename "$H2N_XY")"
    SOURCES+=("h2n_xy:$H2N_XY")
fi
if [ -n "$H2N_MS" ]; then
    echo " Found H2n MS: $(basename "$H2N_MS")"
    SOURCES+=("h2n_ms:$H2N_MS")
fi
if [ ${#SARAMONIC_FILES[@]} -gt 0 ]; then
    echo " Found Saramonic files: ${SARAMONIC_FILES[*]##*/}"
    # Join all paths with '|' and strip the leading separator.
    joined=$(printf "|%s" "${SARAMONIC_FILES[@]}")
    joined="${joined:1}"
    SOURCES+=("saramonic:$joined")
fi

if [ ${#SOURCES[@]} -eq 0 ]; then
    echo "Error: No WAV files found in $WORK_DIR"
    exit 1
fi

echo ""
echo "=== Step 1: Converting to mp3 ==="
for entry in "${SOURCES[@]}"; do
    # Name is everything before the first ':'; the rest is the path list.
    name="${entry%%:*}"
    paths="${entry#*:}"
    mp3="$OUTPUT_DIR/${name}.mp3"

    IFS='|' read -ra files <<< "$paths"
    convert_to_mp3 "$mp3" "${files[@]}"
done

echo ""
echo "=== Step 2: Transcribing ==="
for entry in "${SOURCES[@]}"; do
    name="${entry%%:*}"
    mp3="$OUTPUT_DIR/${name}.mp3"
    transcribe_file "$mp3" "$name"
done

echo ""
echo "=== Done! ==="
echo "Results in: $OUTPUT_DIR/"
for entry in "${SOURCES[@]}"; do
    name="${entry%%:*}"
    echo " ${name}.json - whisper JSON with segments"
    echo " ${name}.txt - timestamped transcription"
    echo " ${name}_plain.txt - plain text"
done
|
||||
64
scripts/transcribe_chunked.sh
Normal file
64
scripts/transcribe_chunked.sh
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
#!/bin/bash
# Transcription of long audio by splitting it into chunks.
#
# Usage:
#   transcribe_chunked.sh [meeting_dir]   (defaults to the current directory)
#
# Takes the single WAV file in meeting_dir, cuts it into 600-second 16 kHz
# chunks under meeting_dir/transcription, runs local_whisper.py on every
# chunk and concatenates the per-chunk output into merged_raw.txt.

set -euo pipefail

# Resolved through dirname so it also works when the script is invoked
# without any slash in its path (e.g. `bash transcribe_chunked.sh`).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

MEETING_DIR="${1:-.}"
CHUNKS_DIR="$MEETING_DIR/transcription"

mkdir -p "$CHUNKS_DIR"

# Get audio file: lower-case extension first, upper-case as fallback.
# Globbing into an array avoids parsing `ls` output and keeps `set -e` from
# ending the script before the error message below can be printed.
shopt -s nullglob
wav_candidates=("$MEETING_DIR"/*.wav)
if [ ${#wav_candidates[@]} -eq 0 ]; then
    wav_candidates=("$MEETING_DIR"/*.WAV)
fi
shopt -u nullglob

if [ ${#wav_candidates[@]} -eq 0 ]; then
    echo "Error: No WAV file found"
    exit 1
fi
if [ ${#wav_candidates[@]} -gt 1 ]; then
    echo "Error: expected exactly one WAV file in $MEETING_DIR, found ${#wav_candidates[@]}"
    exit 1
fi
WAV_FILE="${wav_candidates[0]}"

# Duration (seconds, possibly fractional)
DURATION=$(ffprobe -i "$WAV_FILE" -show_entries format=duration -v quiet -of csv="p=0" || true)

number_re='^[0-9]+([.][0-9]+)?$'
if ! [[ "$DURATION" =~ $number_re ]]; then
    echo "Error: could not determine duration of $WAV_FILE"
    exit 1
fi

echo "Audio duration: $DURATION seconds"

# Chunk settings
chunk_duration=600
offset=0
chunk_num=0

echo "Extracting chunks..."

# bc is used because DURATION may carry a fractional part.
while (( $(echo "$offset < $DURATION" | bc -l) )); do
    chunk_file="$CHUNKS_DIR/chunk_${chunk_num}.wav"
    echo "Extracting chunk $chunk_num at offset $offset..."

    # Retry logic
    for attempt in 1 2 3; do
        if ffmpeg -i "$WAV_FILE" -ss "$offset" -t "$chunk_duration" -acodec pcm_s16le -ar 16000 "$chunk_file" -y 2>/dev/null; then
            break
        elif [ "$attempt" -eq 3 ]; then
            echo "Error: Failed to extract chunk $chunk_num"
            exit 1
        fi
        sleep 1
    done

    offset=$((offset + chunk_duration))
    # Not `((chunk_num++))`: that expression evaluates to the old value, so
    # it returns status 1 when chunk_num is 0 and `set -e` would end the
    # script right after the first chunk.
    chunk_num=$((chunk_num + 1))
done

echo "Transcribing $chunk_num chunks..."

# Transcribe each chunk
for ((i = 0; i < chunk_num; i++)); do
    chunk_file="$CHUNKS_DIR/chunk_${i}.wav"
    output_file="$CHUNKS_DIR/chunk_${i}.txt"

    echo "Transcribing chunk $i..."
    # NOTE(review): transcribe.sh calls local_whisper.py with four arguments
    # (audio, json output, model, hotwords); here only the audio path is
    # passed and stdout is captured - confirm the helper supports both forms.
    MKL_SERVICE_FORCE_INTEL=1 OMP_NUM_THREADS=2 python3 "$SCRIPT_DIR/local_whisper.py" "$chunk_file" > "$output_file"
done

# Merge in numeric chunk order. A `chunk_*.txt` glob sorts lexicographically
# (chunk_10 before chunk_2) and would also pick up stale files left over
# from an earlier, longer run.
echo "Merging transcriptions..."
: > "$CHUNKS_DIR/merged_raw.txt"
for ((i = 0; i < chunk_num; i++)); do
    cat "$CHUNKS_DIR/chunk_${i}.txt" >> "$CHUNKS_DIR/merged_raw.txt"
done

echo "Done. Output: $CHUNKS_DIR/merged_raw.txt"
|
||||
Loading…
Add table
Add a link
Reference in a new issue