auto-report-skill/scripts/transcribe_chunked.sh

197 lines
No EOL
6 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Transcribe long audio by splitting at silence boundaries.
# Uses a Whisper-compatible API with bearer-token authentication.
#
# Required env: WHISPER_URL, WHISPER_API_KEY
# Optional env: HOTWORDS_PATH (path to hotwords file)
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Fail fast with a clear message when required credentials are missing.
# (Quoted expansions: unquoted ${VAR:?} is subject to word splitting.)
: "${WHISPER_URL:?ERROR: WHISPER_URL not set}"
: "${WHISPER_API_KEY:?ERROR: WHISPER_API_KEY not set}"
MODEL="whisper-1" # generic model alias rather than a pinned model name
LANGUAGE="ru"
TARGET_CHUNK=600 # target chunk length in seconds
# Hotwords file: prefer the intended HOTWORDS_PATH variable, but keep the
# historical HOTWORKS_PATH misspelling as a fallback for backward compatibility.
HOTWORDS_FILE="${HOTWORDS_PATH:-${HOTWORKS_PATH:-$SCRIPT_DIR/hotwords.txt}}"
HOTWORDS=""
if [ -f "$HOTWORDS_FILE" ]; then
  # Strip comment/blank lines, join terms with commas, squeeze and trim separators.
  HOTWORDS=$(grep -v '^#' "$HOTWORDS_FILE" | grep -v '^$' | tr '\n' ',' | sed 's/,,*/,/g; s/^,//; s/,$//')
fi
# Positional arguments; fail with a usage hint instead of a bare `set -u` error.
MP3_FILE="${1:?Usage: transcribe_chunked.sh <mp3-file> <name> <output-dir>}"
NAME="${2:?Usage: transcribe_chunked.sh <mp3-file> <name> <output-dir>}"
OUTPUT_DIR="${3:?Usage: transcribe_chunked.sh <mp3-file> <name> <output-dir>}"
CHUNKS_DIR="$OUTPUT_DIR/chunks_${NAME}"
# Validate the input before doing any work.
if [ ! -f "$MP3_FILE" ]; then
  echo "ERROR: input file not found: $MP3_FILE" >&2
  exit 1
fi
# Idempotency: a finished transcript means there is nothing to do.
if [ -f "$OUTPUT_DIR/${NAME}.json" ]; then
  echo "$NAME already transcribed, skipping"
  exit 0
fi
mkdir -p "$CHUNKS_DIR"
# Whole seconds only; the fractional part is irrelevant for chunk planning.
DURATION=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$MP3_FILE" | cut -d. -f1)
echo "=== Chunked transcription: $NAME ($DURATION s / $((DURATION/60))m) ==="
# Detect silence gaps once and cache the timestamps next to the chunks,
# so re-runs skip the (slow) full decode pass.
SILENCES_FILE="$CHUNKS_DIR/silences.txt"
if [ ! -f "$SILENCES_FILE" ]; then
  # silencedetect logs lines like "... silence_end: 123.45 | silence_duration: ...";
  # keep only the numeric end-of-silence timestamps, one per line.
  ffmpeg -i "$MP3_FILE" -af "silencedetect=noise=-35dB:d=0.5" -f null - 2>&1 \
    | awk '{ for (i = 1; i < NF; i++) if ($i == "silence_end:") print $(i + 1) }' \
    > "$SILENCES_FILE"
fi
echo " Found $(wc -l < "$SILENCES_FILE") silence gaps"
# Compute split points: aim for TARGET_CHUNK-second chunks, but snap each
# cut to the nearest detected silence so audio is never split mid-word.
SPLIT_POINTS=$(python3 - "$SILENCES_FILE" "$TARGET_CHUNK" "$DURATION" <<'PYEOF'
import sys
silences_file = sys.argv[1]
target = float(sys.argv[2])    # desired chunk length, seconds
duration = float(sys.argv[3])  # total audio length, seconds
with open(silences_file) as f:
    silences = [float(line.strip()) for line in f if line.strip()]
if not silences:
    # No silence detected: fall back to evenly spaced cuts (at least 2 chunks).
    n = max(2, int(duration / target))
    splits = [duration * i / n for i in range(1, n)]
else:
    splits = []
    t = target
    # Walk forward in target-sized steps; at each step pick the silence closest
    # to the ideal cut. Skip candidates within 30 s of the previous cut, and
    # never cut inside the final 30 s of audio.
    while t < duration - 30:
        best = min(silences, key=lambda s: abs(s - t))
        if not splits or best > splits[-1] + 30:
            splits.append(best)
        t += target
print(" ".join(f"{s:.2f}" for s in splits))
PYEOF
)
# POINTS holds the interior cut timestamps; N chunks = (number of cuts) + 1.
IFS=' ' read -ra POINTS <<< "$SPLIT_POINTS"
N_CHUNKS=$((${#POINTS[@]} + 1))
echo " Will create $N_CHUNKS chunks"
# Split audio at the computed points. Finished chunks are cached on disk so a
# re-run after a crash resumes cutting where it left off.
PREV=0
for i in $(seq 0 $((N_CHUNKS - 1))); do
  CHUNK_FILE="$CHUNKS_DIR/chunk_$(printf '%03d' $i).mp3"
  # Reuse an existing chunk (>1000 bytes guards against truncated leftovers),
  # but still advance PREV so subsequent chunks start at the correct offset.
  # NOTE(review): `stat -c%s` is GNU coreutils; BSD/macOS needs `stat -f%z` — confirm target platform.
  if [ -f "$CHUNK_FILE" ] && [ $(stat -c%s "$CHUNK_FILE") -gt 1000 ]; then
    if [ $i -lt ${#POINTS[@]} ]; then
      PREV="${POINTS[$i]}"
    fi
    echo " chunk_$(printf '%03d' $i): exists, skipping"
    continue
  fi
  if [ $i -lt ${#POINTS[@]} ]; then
    # Interior chunk: runs from PREV to the next split point.
    END="${POINTS[$i]}"
    DUR=$(python3 -c "print(f'{$END - $PREV:.2f}')")
    # -c copy avoids re-encoding; cut precision at frame granularity is
    # acceptable since splits already sit inside silence.
    ffmpeg -y -i "$MP3_FILE" -ss "$PREV" -t "$DUR" -c copy "$CHUNK_FILE" 2>/dev/null
    PREV="$END"
  else
    # Final chunk: everything from the last split point to the end of audio.
    ffmpeg -y -i "$MP3_FILE" -ss "$PREV" -c copy "$CHUNK_FILE" 2>/dev/null
  fi
  CHUNK_DUR=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$CHUNK_FILE" | cut -d. -f1)
  echo " chunk_$(printf '%03d' $i): ${CHUNK_DUR}s"
done
# Transcribe each chunk via the Whisper-compatible API. Each chunk's response
# is cached as JSON so interrupted runs resume without re-transcribing.
echo "Transcribing chunks..."
for i in $(seq 0 $((N_CHUNKS - 1))); do
  CHUNK_FILE="$CHUNKS_DIR/chunk_$(printf '%03d' $i).mp3"
  CHUNK_JSON="$CHUNKS_DIR/chunk_$(printf '%03d' $i).json"
  if [ -f "$CHUNK_JSON" ]; then
    echo " chunk_$(printf '%03d' $i): already transcribed"
    continue
  fi
  printf ' chunk_%03d: transcribing... ' "$i"
  STARTED=$(date +%s)
  full_url="${WHISPER_URL}/audio/transcriptions"
  CURL_ARGS=(
    -s -w "%{http_code}" -o "$CHUNK_JSON"
    -X POST "$full_url"
    -H "Authorization: Bearer $WHISPER_API_KEY"
    -F "file=@${CHUNK_FILE}"
    -F "model=${MODEL}"
    -F "language=${LANGUAGE}"
    -F "response_format=verbose_json"
    -F "temperature=0.0"
    --max-time 600
  )
  [ -n "$HOTWORDS" ] && CURL_ARGS+=(-F "hotwords=${HOTWORDS}")
  # Guard the substitution: under `set -e` a transport-level curl failure
  # (DNS, timeout, refused connection) would otherwise kill the script with
  # no message at all. "000" marks "no HTTP response received".
  HTTP_CODE=$(curl "${CURL_ARGS[@]}") || HTTP_CODE="000"
  ELAPSED=$(( $(date +%s) - STARTED ))
  if [ "$HTTP_CODE" != "200" ]; then
    # Diagnostics belong on stderr; show the API error body, then delete it
    # so a retry does not mistake it for a cached success.
    echo "ERROR (HTTP $HTTP_CODE)" >&2
    if [ -f "$CHUNK_JSON" ]; then
      cat "$CHUNK_JSON" >&2
    fi
    rm -f "$CHUNK_JSON"
    exit 1
  fi
  echo "done in ${ELAPSED}s"
done
# Merge per-chunk transcripts into the final JSON, a timestamped TXT, and a
# plain-text concatenation.
echo "Merging chunks..."
python3 - "$CHUNKS_DIR" "$OUTPUT_DIR" "$NAME" "$SPLIT_POINTS" <<'PYEOF'
import json, sys, os, glob
chunks_dir = sys.argv[1]
output_dir = sys.argv[2]
name = sys.argv[3]
# Space-separated interior split timestamps; empty when there was one chunk.
split_points_str = sys.argv[4] if len(sys.argv) > 4 else ""
if split_points_str.strip():
    split_points = [float(x) for x in split_points_str.strip().split()]
else:
    split_points = []
# Chunk k starts at offsets[k]: 0.0 for the first chunk, then each split point.
offsets = [0.0] + split_points
chunk_files = sorted(glob.glob(os.path.join(chunks_dir, "chunk_*.json")))
all_segments = []
total_duration = 0
for idx, cf in enumerate(chunk_files):
    with open(cf) as f:
        data = json.load(f)
    # Shift each chunk's local timestamps by the chunk's start offset so the
    # merged transcript uses absolute times.
    offset = offsets[idx] if idx < len(offsets) else offsets[-1]
    for seg in data.get("segments", []):
        all_segments.append({
            "start": round(seg.get("start", 0) + offset, 2),
            "end": round(seg.get("end", 0) + offset, 2),
            "text": seg.get("text", "").strip(),
        })
    chunk_dur = data.get("duration", 0)
    total_duration = max(total_duration, offset + chunk_dur)
all_segments.sort(key=lambda s: s["start"])
merged = {"segments": all_segments, "duration": total_duration}
json_path = os.path.join(output_dir, f"{name}.json")
with open(json_path, "w") as f:
    json.dump(merged, f, ensure_ascii=False, indent=2)
# Timestamped transcript: one "[HH:MM:SS] text" line per segment.
txt_path = os.path.join(output_dir, f"{name}.txt")
with open(txt_path, "w") as f:
    for seg in all_segments:
        start = seg["start"]
        h, m, s = int(start // 3600), int((start % 3600) // 60), int(start % 60)
        f.write(f"[{h:02d}:{m:02d}:{s:02d}] {seg['text']}\n")
# Plain transcript: all segment texts joined with single spaces.
plain_path = os.path.join(output_dir, f"{name}_plain.txt")
with open(plain_path, "w") as f:
    f.write(" ".join(seg["text"] for seg in all_segments))
print(f" {len(all_segments)} segments total")
print(f" Written: {json_path}, {txt_path}, {plain_path}")
PYEOF
# Final status marker (the original printed this line twice).
echo "=== Done: $NAME ==="