197 lines
No EOL
6 KiB
Bash
197 lines
No EOL
6 KiB
Bash
#!/bin/bash
# Transcribe long audio by splitting at silence boundaries.
# Uses a Whisper-compatible API with bearer-token authentication.
#
# Required environment:
#   WHISPER_URL      - base URL of the Whisper API (endpoint path is appended)
#   WHISPER_API_KEY  - bearer token sent in the Authorization header
# Optional environment:
#   HOTWORDS_PATH    - hotwords list file (one term per line, '#' comments)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Fail fast with a clear message if the API endpoint/credentials are missing.
: "${WHISPER_URL:?ERROR: WHISPER_URL not set}"
: "${WHISPER_API_KEY:?ERROR: WHISPER_API_KEY not set}"

MODEL="whisper-1"  # changed from a specific model to the generic name
LANGUAGE="ru"
TARGET_CHUNK=600   # target chunk length in seconds (10 minutes)

# Hotwords bias the transcription toward domain-specific terms.
# BUGFIX: the original only read HOTWORKS_PATH (typo), so the documented
# HOTWORDS_PATH override never worked.  Accept the correct spelling first
# and fall back to the old name for backward compatibility.
HOTWORDS_FILE="${HOTWORDS_PATH:-${HOTWORKS_PATH:-$SCRIPT_DIR/hotwords.txt}}"
HOTWORDS=""
if [ -f "$HOTWORDS_FILE" ]; then
  # Drop comment/blank lines, join with commas, squeeze repeats, trim edges.
  # `|| true`: under pipefail a hotwords file containing only comments makes
  # grep exit 1, which must not abort the script (empty HOTWORDS is fine).
  HOTWORDS=$(grep -v '^#' "$HOTWORDS_FILE" | grep -v '^$' \
    | tr '\n' ',' | sed 's/,,*/,/g; s/^,//; s/,$//') || true
fi
# --- Positional arguments ----------------------------------------------------
# $1: input MP3, $2: base name for outputs, $3: output directory.
# BUGFIX: missing arguments previously surfaced as an opaque `set -u`
# "unbound variable" error; validate and print a usage line instead.
if [ "$#" -ne 3 ]; then
  echo "Usage: $0 <mp3_file> <name> <output_dir>" >&2
  exit 2
fi
MP3_FILE="$1"
NAME="$2"
OUTPUT_DIR="$3"
CHUNKS_DIR="$OUTPUT_DIR/chunks_${NAME}"

if [ ! -f "$MP3_FILE" ]; then
  echo "ERROR: input file not found: $MP3_FILE" >&2
  exit 1
fi

# Idempotency: the final JSON on disk means this file is already done.
if [ -f "$OUTPUT_DIR/${NAME}.json" ]; then
  echo "$NAME already transcribed, skipping"
  exit 0
fi

mkdir -p "$CHUNKS_DIR"

# Whole-file duration in integer seconds (ffprobe prints a float; keep the
# integer part — it feeds display and the chunk planner below).
DURATION=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$MP3_FILE" | cut -d. -f1)
echo "=== Chunked transcription: $NAME ($DURATION s / $((DURATION/60))m) ==="
# --- Find silence gaps -------------------------------------------------------
# Cache silence_end timestamps so reruns skip the (slow) ffmpeg analysis pass.
SILENCES_FILE="$CHUNKS_DIR/silences.txt"
if [ ! -f "$SILENCES_FILE" ]; then
  # BUGFIX: under `set -o pipefail`, grep exits 1 when the audio contains no
  # silences, which aborted the whole script here and made the even-split
  # fallback in the planner unreachable.  `|| true` keeps an empty cache file
  # and lets the planner handle the no-silence case.
  ffmpeg -i "$MP3_FILE" -af "silencedetect=noise=-35dB:d=0.5" -f null - 2>&1 \
    | grep "silence_end" \
    | sed 's/.*silence_end: \([0-9.]*\).*/\1/' \
    > "$SILENCES_FILE" || true
fi
echo " Found $(wc -l < "$SILENCES_FILE") silence gaps"
# --- Compute split points ----------------------------------------------------
# For every multiple of TARGET_CHUNK seconds, pick the nearest detected
# silence as a cut point (cuts stay >= 30 s apart and clear of the final
# 30 s).  If no silences were detected, fall back to evenly spaced cuts.
SPLIT_POINTS=$(python3 - "$SILENCES_FILE" "$TARGET_CHUNK" "$DURATION" <<'PYEOF'
import sys

path, target, duration = sys.argv[1], float(sys.argv[2]), float(sys.argv[3])

with open(path) as fh:
    marks = [float(s) for s in (line.strip() for line in fh) if s]

if marks:
    cuts = []
    goal = target
    while goal < duration - 30:
        nearest = min(marks, key=lambda m: abs(m - goal))
        if not cuts or nearest > cuts[-1] + 30:
            cuts.append(nearest)
        goal += target
else:
    pieces = max(2, int(duration / target))
    cuts = [duration * k / pieces for k in range(1, pieces)]

print(" ".join(f"{c:.2f}" for c in cuts))
PYEOF
)

# Parse the space-separated cut list into an array; N chunks = cuts + 1.
IFS=' ' read -ra POINTS <<< "$SPLIT_POINTS"
N_CHUNKS=$((${#POINTS[@]} + 1))
echo " Will create $N_CHUNKS chunks"
# --- Split audio -------------------------------------------------------------
# Cut the source MP3 into chunks at the chosen points.  Stream copy (-c copy)
# avoids re-encoding; cuts land on frame boundaries, which is fine for MP3.
PREV=0
for i in $(seq 0 $((N_CHUNKS - 1))); do
  TAG=$(printf 'chunk_%03d' "$i")
  CHUNK_FILE="$CHUNKS_DIR/$TAG.mp3"

  # Resume support: keep an existing chunk if it looks non-trivially sized.
  # FIX: the size check used GNU-only `stat -c%s`, unquoted (SC2046);
  # `wc -c <` is portable and the substitution is quoted.
  if [ -f "$CHUNK_FILE" ] && [ "$(wc -c < "$CHUNK_FILE")" -gt 1000 ]; then
    if [ "$i" -lt "${#POINTS[@]}" ]; then
      PREV="${POINTS[$i]}"  # keep PREV in sync for the next chunk's start
    fi
    echo " $TAG: exists, skipping"
    continue
  fi

  if [ "$i" -lt "${#POINTS[@]}" ]; then
    END="${POINTS[$i]}"
    # Split points are fractional seconds; bash arithmetic is integer-only,
    # so compute the chunk duration with python.
    DUR=$(python3 -c "print(f'{$END - $PREV:.2f}')")
    ffmpeg -y -i "$MP3_FILE" -ss "$PREV" -t "$DUR" -c copy "$CHUNK_FILE" 2>/dev/null
    PREV="$END"
  else
    # Last chunk: everything from the final cut to the end of the file.
    ffmpeg -y -i "$MP3_FILE" -ss "$PREV" -c copy "$CHUNK_FILE" 2>/dev/null
  fi

  CHUNK_DUR=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$CHUNK_FILE" | cut -d. -f1)
  echo " $TAG: ${CHUNK_DUR}s"
done
# --- Transcribe each chunk ---------------------------------------------------
echo "Transcribing chunks..."
for i in $(seq 0 $((N_CHUNKS - 1))); do
  TAG=$(printf 'chunk_%03d' "$i")
  CHUNK_FILE="$CHUNKS_DIR/$TAG.mp3"
  CHUNK_JSON="$CHUNKS_DIR/$TAG.json"

  # Resume support: a chunk JSON on disk means it was already transcribed.
  if [ -f "$CHUNK_JSON" ]; then
    echo " $TAG: already transcribed"
    continue
  fi

  printf ' %s: transcribing... ' "$TAG"
  STARTED=$(date +%s)

  full_url="${WHISPER_URL}/audio/transcriptions"

  # Build the request as an array so values with spaces survive intact.
  # -w writes only the status code to stdout; the body goes to CHUNK_JSON.
  CURL_ARGS=(
    -s -w "%{http_code}" -o "$CHUNK_JSON"
    -X POST "$full_url"
    -H "Authorization: Bearer $WHISPER_API_KEY"
    -F "file=@${CHUNK_FILE}"
    -F "model=${MODEL}"
    -F "language=${LANGUAGE}"
    -F "response_format=verbose_json"
    -F "temperature=0.0"
    --max-time 600
  )
  [ -n "$HOTWORDS" ] && CURL_ARGS+=(-F "hotwords=${HOTWORDS}")

  # BUGFIX: a transport-level curl failure (timeout, DNS, connection refused)
  # used to kill the script via `set -e` with no diagnostics and leave a
  # partial JSON behind; map it to HTTP 000 so the error branch below
  # reports the failure and removes the partial file.
  HTTP_CODE=$(curl "${CURL_ARGS[@]}") || HTTP_CODE="000"
  ELAPSED=$(( $(date +%s) - STARTED ))

  if [ "$HTTP_CODE" != "200" ]; then
    echo "ERROR (HTTP $HTTP_CODE)"
    # Show the server's error body (if any), then remove the partial file so
    # a rerun does not mistake it for a finished transcript.
    if [ -f "$CHUNK_JSON" ]; then
      cat "$CHUNK_JSON"
    fi
    rm -f "$CHUNK_JSON"
    exit 1
  fi
  echo "done in ${ELAPSED}s"
done
# --- Merge chunks into final JSON --------------------------------------------
echo "Merging chunks..."
python3 - "$CHUNKS_DIR" "$OUTPUT_DIR" "$NAME" "$SPLIT_POINTS" <<'PYEOF'
"""Merge per-chunk verbose_json transcripts into one timeline.

Each chunk's segment timestamps are relative to the chunk's own start, so
shift them by the chunk's split-point offset before concatenating.
"""
import json, sys, os, glob

chunks_dir = sys.argv[1]
output_dir = sys.argv[2]
name = sys.argv[3]
split_points_str = sys.argv[4] if len(sys.argv) > 4 else ""

if split_points_str.strip():
    split_points = [float(x) for x in split_points_str.strip().split()]
else:
    split_points = []
# Chunk k starts at offsets[k]; chunk 0 starts at 0.
offsets = [0.0] + split_points

chunk_files = sorted(glob.glob(os.path.join(chunks_dir, "chunk_*.json")))
all_segments = []
total_duration = 0

for idx, cf in enumerate(chunk_files):
    with open(cf) as f:
        data = json.load(f)
    # Defensive: if there are somehow more chunk files than offsets, reuse
    # the last offset instead of crashing.
    offset = offsets[idx] if idx < len(offsets) else offsets[-1]
    for seg in data.get("segments", []):
        all_segments.append({
            "start": round(seg.get("start", 0) + offset, 2),
            "end": round(seg.get("end", 0) + offset, 2),
            "text": seg.get("text", "").strip(),
        })
    chunk_dur = data.get("duration", 0)
    total_duration = max(total_duration, offset + chunk_dur)

all_segments.sort(key=lambda s: s["start"])
merged = {"segments": all_segments, "duration": total_duration}
json_path = os.path.join(output_dir, f"{name}.json")
with open(json_path, "w") as f:
    json.dump(merged, f, ensure_ascii=False, indent=2)

# Human-readable transcript with [HH:MM:SS] timestamps.
txt_path = os.path.join(output_dir, f"{name}.txt")
with open(txt_path, "w") as f:
    for seg in all_segments:
        start = seg["start"]
        h, m, s = int(start // 3600), int((start % 3600) // 60), int(start % 60)
        f.write(f"[{h:02d}:{m:02d}:{s:02d}] {seg['text']}\n")

# Plain text without timestamps (for search / downstream processing).
plain_path = os.path.join(output_dir, f"{name}_plain.txt")
with open(plain_path, "w") as f:
    f.write(" ".join(seg["text"] for seg in all_segments))

print(f" {len(all_segments)} segments total")
print(f" Written: {json_path}, {txt_path}, {plain_path}")
PYEOF

# BUGFIX: the final status line was duplicated and the second copy ended in a
# dangling `|` with no trailing newline (a parse hazard).  Print it once.
echo "=== Done: $NAME ==="