Make scripts better

This commit is contained in:
Слонова Анна 2026-04-30 10:57:18 +03:00
parent ba56147e95
commit e8ad7df469
12 changed files with 614 additions and 432 deletions

View file

@ -1,64 +1,197 @@
#!/bin/bash
# Chunked transcription for long audio files
# Transcribe long audio by splitting at silence boundaries
# Uses Whisper API with authentication
# Abort on command failure, on use of an unset variable, and when any stage
# of a pipeline fails.
set -euo pipefail
MEETING_DIR="${1:-.}"
CHUNKS_DIR="$MEETING_DIR/transcription"
# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Required environment: stop with the given message when unset or empty.
: ${WHISPER_URL:?ERROR: WHISPER_URL not set}
: ${WHISPER_API_KEY:?ERROR: WHISPER_API_KEY not set}
MODEL="whisper-1" # Changed from a specific model to the generic name
# Sent to the API as the "language" form field.
LANGUAGE="ru"
# Desired chunk length in seconds; actual cuts are moved to nearby silences.
TARGET_CHUNK=600
#######################################
# Print the hotwords stored in a file as one comma-separated line.
# Lines starting with '#' and empty lines are dropped; the remaining lines are
# joined with commas, runs of commas are collapsed and leading/trailing commas
# are removed.
# Arguments: $1 - path to the hotwords file (it may be missing)
# Outputs:   the comma-separated list on stdout; nothing when the file is
#            missing or has no entries
# Returns:   0 when the file is missing or was processed
#######################################
load_hotwords() {
  local file=$1
  [ -f "$file" ] || return 0
  # sed exits 0 even when it deletes every line. `grep -v` exits 1 when it
  # selects nothing, which aborted the whole script under `set -eo pipefail`
  # for an empty or comment-only hotwords file.
  sed -e '/^#/d' -e '/^$/d' "$file" \
    | tr '\n' ',' \
    | sed 's/,,*/,/g; s/^,//; s/,$//'
}

# HOTWORDS_PATH is the intended override; the misspelled HOTWORKS_PATH is still
# honoured so existing callers keep working.
HOTWORDS_FILE="${HOTWORDS_PATH:-${HOTWORKS_PATH:-$SCRIPT_DIR/hotwords.txt}}"
HOTWORDS="$(load_hotwords "$HOTWORDS_FILE")"
# Positional arguments: $1 = input audio file, $2 = base name used for all
# output files, $3 = directory that receives the results.
MP3_FILE="$1"
NAME="$2"
OUTPUT_DIR="$3"
# Per-recording working directory for chunks and intermediate files.
CHUNKS_DIR="$OUTPUT_DIR/chunks_${NAME}"
# The merged JSON is written last, so its presence means this recording was
# already processed and the run can stop early.
if [ -f "$OUTPUT_DIR/${NAME}.json" ]; then
echo "$NAME already transcribed, skipping"
exit 0
fi
mkdir -p "$CHUNKS_DIR"
# Get audio file
WAV_FILE=$(ls "$MEETING_DIR"/*.wav 2>/dev/null || ls "$MEETING_DIR"/*.WAV 2>/dev/null)
if [ -z "$WAV_FILE" ] || [ ! -f "$WAV_FILE" ]; then
echo "Error: No WAV file found"
exit 1
DURATION=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$MP3_FILE" | cut -d. -f1)
echo "=== Chunked transcription: $NAME ($DURATION s / $((DURATION/60))m) ==="
# Find silence gaps
#######################################
# Reduce ffmpeg silencedetect log output to the silence end timestamps.
# Inputs:  ffmpeg log text on stdin
# Outputs: one timestamp per line on stdout (the number that follows
#          "silence_end: "); nothing when no such line is present
# Returns: 0 even when nothing matches, so a recording without detectable
#          silence does not abort the script under `set -eo pipefail`
#######################################
extract_silence_ends() {
  sed -n 's/.*silence_end: \([0-9.]*\).*/\1/p'
}

SILENCES_FILE="$CHUNKS_DIR/silences.txt"
if [ ! -f "$SILENCES_FILE" ]; then
  # Write under a temporary name and rename only after the pipeline succeeded:
  # silences.txt doubles as a cache, so a file left behind by a failed ffmpeg
  # run must not be mistaken for a finished result on the next invocation.
  ffmpeg -i "$MP3_FILE" -af "silencedetect=noise=-35dB:d=0.5" -f null - 2>&1 \
    | extract_silence_ends > "$SILENCES_FILE.tmp"
  mv "$SILENCES_FILE.tmp" "$SILENCES_FILE"
fi
echo " Found $(wc -l < "$SILENCES_FILE") silence gaps"
# Duration
DURATION=$(ffprobe -i "$WAV_FILE" -show_entries format=duration -v quiet -of csv="p=0")
# Compute split points
SPLIT_POINTS=$(python3 - "$SILENCES_FILE" "$TARGET_CHUNK" "$DURATION" <<'PYEOF'
import sys
silences_file = sys.argv[1]
target = float(sys.argv[2])
duration = float(sys.argv[3])
echo "Audio duration: $DURATION seconds"
with open(silences_file) as f:
silences = [float(line.strip()) for line in f if line.strip()]
# Chunk settings
chunk_duration=600
offset=0
chunk_num=0
if not silences:
n = max(2, int(duration / target))
splits = [duration * i / n for i in range(1, n)]
else:
splits = []
t = target
while t < duration - 30:
best = min(silences, key=lambda s: abs(s - t))
if not splits or best > splits[-1] + 30:
splits.append(best)
t += target
echo "Extracting chunks..."
print(" ".join(f"{s:.2f}" for s in splits))
PYEOF
)
while (( $(echo "$offset < $DURATION" | bc -l) )); do
chunk_file="$CHUNKS_DIR/chunk_${chunk_num}.wav"
echo "Extracting chunk $chunk_num at offset $offset..."
# Retry logic
for attempt in 1 2 3; do
if ffmpeg -i "$WAV_FILE" -ss "$offset" -t "$chunk_duration" -acodec pcm_s16le -ar 16000 "$chunk_file" -y 2>/dev/null; then
break
elif [ $attempt -eq 3 ]; then
echo "Error: Failed to extract chunk $chunk_num"
exit 1
IFS=' ' read -ra POINTS <<< "$SPLIT_POINTS"
N_CHUNKS=$((${#POINTS[@]} + 1))
echo " Will create $N_CHUNKS chunks"
# Split audio
PREV=0
for i in $(seq 0 $((N_CHUNKS - 1))); do
CHUNK_FILE="$CHUNKS_DIR/chunk_$(printf '%03d' $i).mp3"
if [ -f "$CHUNK_FILE" ] && [ $(stat -c%s "$CHUNK_FILE") -gt 1000 ]; then
if [ $i -lt ${#POINTS[@]} ]; then
PREV="${POINTS[$i]}"
fi
sleep 1
done
offset=$((offset + chunk_duration))
((chunk_num++))
done
echo " chunk_$(printf '%03d' $i): exists, skipping"
continue
fi
echo "Transcribing $chunk_num chunks..."
if [ $i -lt ${#POINTS[@]} ]; then
END="${POINTS[$i]}"
DUR=$(python3 -c "print(f'{$END - $PREV:.2f}')")
ffmpeg -y -i "$MP3_FILE" -ss "$PREV" -t "$DUR" -c copy "$CHUNK_FILE" 2>/dev/null
PREV="$END"
else
ffmpeg -y -i "$MP3_FILE" -ss "$PREV" -c copy "$CHUNK_FILE" 2>/dev/null
fi
CHUNK_DUR=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$CHUNK_FILE" | cut -d. -f1)
echo " chunk_$(printf '%03d' $i): ${CHUNK_DUR}s"
done
# Transcribe each chunk
for i in $(seq 0 $((chunk_num - 1))); do
chunk_file="$CHUNKS_DIR/chunk_${i}.wav"
output_file="$CHUNKS_DIR/chunk_${i}.txt"
echo "Transcribing chunk $i..."
MKL_SERVICE_FORCE_INTEL=1 OMP_NUM_THREADS=2 python3 "${BASH_SOURCE[0]%/*}/local_whisper.py" "$chunk_file" > "$output_file"
echo "Transcribing chunks..."
#######################################
# Upload one audio chunk to the transcription endpoint and store the response.
# The response is downloaded to "<json>.part" and renamed to its final name
# only after an HTTP 200, so an interrupted or failed request never leaves a
# file that a later run would treat as a finished transcription.
# Globals:   WHISPER_URL, WHISPER_API_KEY, MODEL, LANGUAGE, HOTWORDS (read)
# Arguments: $1 - audio chunk to upload
#            $2 - path where the JSON response is stored on success
# Outputs:   an error line (and the response body, if any) on stdout when the
#            request fails
# Returns:   0 on HTTP 200; 1 when curl itself fails or the status differs
#######################################
transcribe_chunk() {
  local audio=$1 json=$2
  local part="${json}.part"
  local code rc=0
  local -a curl_args=(
    -s -w "%{http_code}" -o "$part"
    -X POST "${WHISPER_URL}/audio/transcriptions"
    -H "Authorization: Bearer $WHISPER_API_KEY"
    -F "file=@${audio}"
    -F "model=${MODEL}"
    -F "language=${LANGUAGE}"
    -F "response_format=verbose_json"
    -F "temperature=0.0"
    --max-time 600
  )
  if [ -n "$HOTWORDS" ]; then
    curl_args+=(-F "hotwords=${HOTWORDS}")
  fi

  # Capture curl's own exit status instead of letting `set -e` abort here:
  # a timeout or connection error must still go through the cleanup below.
  code=$(curl "${curl_args[@]}") || rc=$?
  if [ "$rc" -ne 0 ]; then
    echo "ERROR (curl exit code $rc)"
    rm -f "$part"
    return 1
  fi
  if [ "$code" != "200" ]; then
    echo "ERROR (HTTP $code)"
    if [ -f "$part" ]; then
      cat "$part"
    fi
    rm -f "$part"
    return 1
  fi
  mv "$part" "$json"
}

# Transcribe every chunk that does not have a stored response yet; stop the
# whole run on the first failure so it can be resumed later.
for ((i = 0; i < N_CHUNKS; i++)); do
  printf -v CHUNK_ID '%03d' "$i"
  CHUNK_FILE="$CHUNKS_DIR/chunk_${CHUNK_ID}.mp3"
  CHUNK_JSON="$CHUNKS_DIR/chunk_${CHUNK_ID}.json"
  if [ -f "$CHUNK_JSON" ]; then
    echo " chunk_${CHUNK_ID}: already transcribed"
    continue
  fi
  echo -n " chunk_${CHUNK_ID}: transcribing... "
  STARTED=$(date +%s)
  transcribe_chunk "$CHUNK_FILE" "$CHUNK_JSON" || exit 1
  ELAPSED=$(( $(date +%s) - STARTED ))
  echo "done in ${ELAPSED}s"
done
# Merge
echo "Merging transcriptions..."
cat "$CHUNKS_DIR"/chunk_*.txt > "$CHUNKS_DIR/merged_raw.txt"
# Merge chunks into final JSON
echo "Merging chunks..."
python3 - "$CHUNKS_DIR" "$OUTPUT_DIR" "$NAME" "$SPLIT_POINTS" <<'PYEOF'
import json, sys, os, glob
echo "Done. Output: $CHUNKS_DIR/merged_raw.txt"
# Command-line arguments: chunk directory, output directory, base name of the
# output files and the space-separated split points (may be an empty string).
chunks_dir = sys.argv[1]
output_dir = sys.argv[2]
name = sys.argv[3]
split_points_str = sys.argv[4] if len(sys.argv) > 4 else ""
if split_points_str.strip():
split_points = [float(x) for x in split_points_str.strip().split()]
else:
split_points = []
# Start offset of every chunk: the first chunk starts at 0, each following
# chunk starts at the preceding split point.
offsets = [0.0] + split_points
# Sorted by file name, so the list index is used as the chunk index below.
chunk_files = sorted(glob.glob(os.path.join(chunks_dir, "chunk_*.json")))
all_segments = []
total_duration = 0
for idx, cf in enumerate(chunk_files):
with open(cf) as f:
data = json.load(f)
# Falls back to the last known offset when there are more chunk files than
# offsets.
offset = offsets[idx] if idx < len(offsets) else offsets[-1]
# Shift the chunk-relative segment times by the chunk's start offset.
for seg in data.get("segments", []):
all_segments.append({
"start": round(seg.get("start", 0) + offset, 2),
"end": round(seg.get("end", 0) + offset, 2),
"text": seg.get("text", "").strip(),
})
chunk_dur = data.get("duration", 0)
total_duration = max(total_duration, offset + chunk_dur)
all_segments.sort(key=lambda s: s["start"])
merged = {"segments": all_segments, "duration": total_duration}
# Output 1: merged segments and total duration as JSON.
json_path = os.path.join(output_dir, f"{name}.json")
with open(json_path, "w") as f:
json.dump(merged, f, ensure_ascii=False, indent=2)
# Output 2: one line per segment, prefixed with its start time as [HH:MM:SS].
txt_path = os.path.join(output_dir, f"{name}.txt")
with open(txt_path, "w") as f:
for seg in all_segments:
start = seg["start"]
h, m, s = int(start // 3600), int((start % 3600) // 60), int(start % 60)
f.write(f"[{h:02d}:{m:02d}:{s:02d}] {seg['text']}\n")
# Output 3: all segment texts joined by single spaces, without timestamps.
plain_path = os.path.join(output_dir, f"{name}_plain.txt")
with open(plain_path, "w") as f:
f.write(" ".join(seg["text"] for seg in all_segments))
print(f" {len(all_segments)} segments total")
print(f" Written: {json_path}, {txt_path}, {plain_path}")
PYEOF
echo "=== Done: $NAME ==="
echo "=== Done: $NAME ==="