Add scripts

This commit is contained in:
Слонова Анна 2026-04-21 18:09:28 +00:00
parent 992a748c51
commit ba56147e95
4 changed files with 329 additions and 0 deletions

View file

@ -0,0 +1,64 @@
#!/bin/bash
# Transcription of long audio recordings by splitting them into chunks.
#
# Usage: pass the meeting directory as the first argument (defaults to the
# current directory). The directory must contain exactly one .wav/.WAV file.
# Chunk files and transcripts are written to <meeting dir>/transcription.
set -euo pipefail
MEETING_DIR="${1:-.}"
CHUNKS_DIR="$MEETING_DIR/transcription"
mkdir -p "$CHUNKS_DIR"

# Get audio file.
# Use globs rather than parsing `ls`: under `set -e` a failing `ls` inside the
# command substitution aborted the script before any error message was shown,
# and several matches produced a multi-line string that is not a file path.
# Lower-case *.wav is preferred; *.WAV is only consulted when none is found.
shopt -s nullglob
wav_candidates=("$MEETING_DIR"/*.wav)
if (( ${#wav_candidates[@]} == 0 )); then
  wav_candidates=("$MEETING_DIR"/*.WAV)
fi
shopt -u nullglob

if (( ${#wav_candidates[@]} == 0 )); then
  echo "Error: No WAV file found" >&2
  exit 1
fi
if (( ${#wav_candidates[@]} > 1 )); then
  echo "Error: Expected exactly one WAV file in $MEETING_DIR, found ${#wav_candidates[@]}" >&2
  exit 1
fi
WAV_FILE=${wav_candidates[0]}
if [[ ! -f "$WAV_FILE" ]]; then
  echo "Error: No WAV file found" >&2
  exit 1
fi

# Duration (seconds, possibly fractional) as reported by ffprobe.
if ! DURATION=$(ffprobe -i "$WAV_FILE" -show_entries format=duration -v quiet -of csv="p=0"); then
  echo "Error: ffprobe failed on $WAV_FILE" >&2
  exit 1
fi
# ffprobe can print nothing or "N/A"; refuse anything that is not a plain
# decimal number so later arithmetic on it cannot misbehave silently.
if [[ ! "$DURATION" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
  echo "Error: Could not determine duration of $WAV_FILE" >&2
  exit 1
fi
echo "Audio duration: $DURATION seconds"
# Chunk settings
chunk_duration=600
offset=0
chunk_num=0

#######################################
# Split WAV_FILE into consecutive 16 kHz 16-bit PCM chunks, each at most
# chunk_duration seconds long, named chunk_<n>.wav inside CHUNKS_DIR.
# Globals:   WAV_FILE, DURATION, CHUNKS_DIR, chunk_duration (read)
#            offset, chunk_num (written; chunk_num ends as the chunk count)
# Outputs:   progress on stdout, failure message on stderr
# Returns:   1 if a chunk still cannot be extracted after three attempts
#######################################
extract_chunks() {
  local chunk_file attempt
  offset=0
  chunk_num=0
  # DURATION may be fractional, which bash arithmetic cannot compare. awk is
  # used instead of bc so a missing bc cannot silently skip the whole loop;
  # "+ 0" forces a numeric (not string) comparison.
  while awk -v o="$offset" -v d="$DURATION" 'BEGIN { exit !((o + 0) < (d + 0)) }'; do
    chunk_file="$CHUNKS_DIR/chunk_${chunk_num}.wav"
    echo "Extracting chunk $chunk_num at offset $offset..."
    # Retry logic: up to three attempts, one second apart.
    for attempt in 1 2 3; do
      # -nostdin keeps ffmpeg from reading the script's standard input.
      # ffmpeg's own diagnostics are intentionally discarded.
      if ffmpeg -nostdin -i "$WAV_FILE" -ss "$offset" -t "$chunk_duration" -acodec pcm_s16le -ar 16000 "$chunk_file" -y 2>/dev/null; then
        break
      elif (( attempt == 3 )); then
        echo "Error: Failed to extract chunk $chunk_num" >&2
        return 1
      fi
      sleep 1
    done
    offset=$((offset + chunk_duration))
    # Plain assignment instead of ((chunk_num++)): the post-increment
    # evaluates to 0 on the first pass, which gives exit status 1 and makes
    # `set -e` abort the script right after the first chunk.
    chunk_num=$((chunk_num + 1))
  done
}

echo "Extracting chunks..."
extract_chunks
echo "Transcribing $chunk_num chunks..."
# Directory that holds this script; local_whisper.py is run from there.
# It is resolved with dirname + cd/pwd because "${BASH_SOURCE[0]%/*}" leaves
# the script's own file name untouched when the script is invoked without a
# slash in its path (e.g. `bash script.sh`), yielding a bogus path.
script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null && pwd)
# Transcribe each chunk, writing chunk_<i>.txt next to chunk_<i>.wav.
# A C-style loop is used instead of `seq`, which is an external tool and
# counts downwards on some platforms when the upper bound is -1.
for ((i = 0; i < chunk_num; i++)); do
  chunk_file="$CHUNKS_DIR/chunk_${i}.wav"
  output_file="$CHUNKS_DIR/chunk_${i}.txt"
  echo "Transcribing chunk $i..."
  MKL_SERVICE_FORCE_INTEL=1 OMP_NUM_THREADS=2 \
    python3 "$script_dir/local_whisper.py" "$chunk_file" > "$output_file"
done
# Merge
#######################################
# Concatenate per-chunk transcripts into <dir>/merged_raw.txt in numeric
# chunk order. Files are addressed by index rather than by the glob
# chunk_*.txt, because glob results sort lexically (chunk_10 before chunk_2)
# and would also include stale transcripts left by an earlier, longer run.
# Arguments: $1 - directory containing chunk_<n>.txt files
#            $2 - number of chunks; indices 0..$2-1 are merged
# Outputs:   writes $1/merged_raw.txt; errors on stderr
# Returns:   1 if the directory or an expected transcript is missing
#######################################
merge_transcriptions() {
  local dir=$1 count=$2
  local merged part i
  if [[ ! -d "$dir" ]]; then
    echo "Error: Transcription directory not found: $dir" >&2
    return 1
  fi
  merged="$dir/merged_raw.txt"
  : > "$merged"
  for ((i = 0; i < count; i++)); do
    part="$dir/chunk_${i}.txt"
    if [[ ! -f "$part" ]]; then
      echo "Error: Missing transcription $part" >&2
      return 1
    fi
    cat -- "$part" >> "$merged"
  done
}

echo "Merging transcriptions..."
merge_transcriptions "$CHUNKS_DIR" "$chunk_num"
echo "Done. Output: $CHUNKS_DIR/merged_raw.txt"