Make scripts better
This commit is contained in:
parent
ba56147e95
commit
e8ad7df469
12 changed files with 614 additions and 432 deletions
|
|
@ -1,64 +1,197 @@
|
|||
#!/bin/bash
|
||||
# Chunked transcription for long audio files
|
||||
# Transcribe long audio by splitting at silence boundaries
|
||||
# Uses Whisper API with authentication
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
MEETING_DIR="${1:-.}"
|
||||
CHUNKS_DIR="$MEETING_DIR/transcription"
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
# Required: base URL of the Whisper API. Abort with a clear message if it is
# unset or empty. The expansion is quoted so the value is never word-split or
# glob-expanded when handed to the ':' no-op.
: "${WHISPER_URL:?ERROR: WHISPER_URL not set}"
|
||||
# Required: bearer token for the Whisper API. Abort with a clear message if it
# is unset or empty. The expansion is quoted so the secret is never word-split
# or glob-expanded when handed to the ':' no-op.
: "${WHISPER_API_KEY:?ERROR: WHISPER_API_KEY not set}"
|
||||
MODEL="whisper-1" # Changed from a specific model name to the generic one
|
||||
LANGUAGE="ru"
|
||||
TARGET_CHUNK=600
|
||||
|
||||
# Path to the hotwords list. HOTWORDS_PATH overrides the default; the
# misspelled HOTWORKS_PATH is still honoured as a fallback so existing
# environments keep working. Defaults to hotwords.txt next to this script.
HOTWORDS_FILE="${HOTWORDS_PATH:-${HOTWORKS_PATH:-$SCRIPT_DIR/hotwords.txt}}"
|
||||
HOTWORDS=""
|
||||
if [ -f "$HOTWORDS_FILE" ]; then
|
||||
HOTWORDS=$(grep -v '^#' "$HOTWORDS_FILE" | grep -v '^$' | tr '\n' ',' | sed 's/,,*/,/g; s/^,//; s/,$//')
|
||||
fi
|
||||
|
||||
MP3_FILE="$1"
|
||||
NAME="$2"
|
||||
OUTPUT_DIR="$3"
|
||||
CHUNKS_DIR="$OUTPUT_DIR/chunks_${NAME}"
|
||||
|
||||
if [ -f "$OUTPUT_DIR/${NAME}.json" ]; then
|
||||
echo "$NAME already transcribed, skipping"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
mkdir -p "$CHUNKS_DIR"
|
||||
|
||||
# Get audio file
|
||||
WAV_FILE=$(ls "$MEETING_DIR"/*.wav 2>/dev/null || ls "$MEETING_DIR"/*.WAV 2>/dev/null)
|
||||
if [ -z "$WAV_FILE" ] || [ ! -f "$WAV_FILE" ]; then
|
||||
echo "Error: No WAV file found"
|
||||
exit 1
|
||||
DURATION=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$MP3_FILE" | cut -d. -f1)
|
||||
echo "=== Chunked transcription: $NAME ($DURATION s / $((DURATION/60))m) ==="
|
||||
|
||||
# Find silence gaps
|
||||
SILENCES_FILE="$CHUNKS_DIR/silences.txt"
|
||||
if [ ! -f "$SILENCES_FILE" ]; then
|
||||
ffmpeg -i "$MP3_FILE" -af "silencedetect=noise=-35dB:d=0.5" -f null - 2>&1 \
|
||||
| grep "silence_end" \
|
||||
| sed 's/.*silence_end: \([0-9.]*\).*/\1/' \
|
||||
> "$SILENCES_FILE"
|
||||
fi
|
||||
echo " Found $(wc -l < "$SILENCES_FILE") silence gaps"
|
||||
|
||||
# Duration
|
||||
DURATION=$(ffprobe -i "$WAV_FILE" -show_entries format=duration -v quiet -of csv="p=0")
|
||||
# Compute split points
|
||||
SPLIT_POINTS=$(python3 - "$SILENCES_FILE" "$TARGET_CHUNK" "$DURATION" <<'PYEOF'
|
||||
import sys
|
||||
silences_file = sys.argv[1]
|
||||
target = float(sys.argv[2])
|
||||
duration = float(sys.argv[3])
|
||||
|
||||
echo "Audio duration: $DURATION seconds"
|
||||
with open(silences_file) as f:
|
||||
silences = [float(line.strip()) for line in f if line.strip()]
|
||||
|
||||
# Chunk settings
|
||||
chunk_duration=600
|
||||
offset=0
|
||||
chunk_num=0
|
||||
if not silences:
|
||||
n = max(2, int(duration / target))
|
||||
splits = [duration * i / n for i in range(1, n)]
|
||||
else:
|
||||
splits = []
|
||||
t = target
|
||||
while t < duration - 30:
|
||||
best = min(silences, key=lambda s: abs(s - t))
|
||||
if not splits or best > splits[-1] + 30:
|
||||
splits.append(best)
|
||||
t += target
|
||||
|
||||
echo "Extracting chunks..."
|
||||
print(" ".join(f"{s:.2f}" for s in splits))
|
||||
PYEOF
|
||||
)
|
||||
|
||||
while (( $(echo "$offset < $DURATION" | bc -l) )); do
|
||||
chunk_file="$CHUNKS_DIR/chunk_${chunk_num}.wav"
|
||||
echo "Extracting chunk $chunk_num at offset $offset..."
|
||||
|
||||
# Retry logic
|
||||
for attempt in 1 2 3; do
|
||||
if ffmpeg -i "$WAV_FILE" -ss "$offset" -t "$chunk_duration" -acodec pcm_s16le -ar 16000 "$chunk_file" -y 2>/dev/null; then
|
||||
break
|
||||
elif [ $attempt -eq 3 ]; then
|
||||
echo "Error: Failed to extract chunk $chunk_num"
|
||||
exit 1
|
||||
IFS=' ' read -ra POINTS <<< "$SPLIT_POINTS"
|
||||
N_CHUNKS=$((${#POINTS[@]} + 1))
|
||||
echo " Will create $N_CHUNKS chunks"
|
||||
|
||||
# Split audio
|
||||
PREV=0
|
||||
for i in $(seq 0 $((N_CHUNKS - 1))); do
|
||||
CHUNK_FILE="$CHUNKS_DIR/chunk_$(printf '%03d' $i).mp3"
|
||||
if [ -f "$CHUNK_FILE" ] && [ $(stat -c%s "$CHUNK_FILE") -gt 1000 ]; then
|
||||
if [ $i -lt ${#POINTS[@]} ]; then
|
||||
PREV="${POINTS[$i]}"
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
|
||||
offset=$((offset + chunk_duration))
|
||||
((chunk_num++))
|
||||
done
|
||||
echo " chunk_$(printf '%03d' $i): exists, skipping"
|
||||
continue
|
||||
fi
|
||||
|
||||
echo "Transcribing $chunk_num chunks..."
|
||||
if [ $i -lt ${#POINTS[@]} ]; then
|
||||
END="${POINTS[$i]}"
|
||||
DUR=$(python3 -c "print(f'{$END - $PREV:.2f}')")
|
||||
ffmpeg -y -i "$MP3_FILE" -ss "$PREV" -t "$DUR" -c copy "$CHUNK_FILE" 2>/dev/null
|
||||
PREV="$END"
|
||||
else
|
||||
ffmpeg -y -i "$MP3_FILE" -ss "$PREV" -c copy "$CHUNK_FILE" 2>/dev/null
|
||||
fi
|
||||
CHUNK_DUR=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$CHUNK_FILE" | cut -d. -f1)
|
||||
echo " chunk_$(printf '%03d' $i): ${CHUNK_DUR}s"
|
||||
done
|
||||
|
||||
# Transcribe each chunk
|
||||
for i in $(seq 0 $((chunk_num - 1))); do
|
||||
chunk_file="$CHUNKS_DIR/chunk_${i}.wav"
|
||||
output_file="$CHUNKS_DIR/chunk_${i}.txt"
|
||||
|
||||
echo "Transcribing chunk $i..."
|
||||
MKL_SERVICE_FORCE_INTEL=1 OMP_NUM_THREADS=2 python3 "${BASH_SOURCE[0]%/*}/local_whisper.py" "$chunk_file" > "$output_file"
|
||||
echo "Transcribing chunks..."
|
||||
for i in $(seq 0 $((N_CHUNKS - 1))); do
|
||||
CHUNK_FILE="$CHUNKS_DIR/chunk_$(printf '%03d' $i).mp3"
|
||||
CHUNK_JSON="$CHUNKS_DIR/chunk_$(printf '%03d' $i).json"
|
||||
|
||||
if [ -f "$CHUNK_JSON" ]; then
|
||||
echo " chunk_$(printf '%03d' $i): already transcribed"
|
||||
continue
|
||||
fi
|
||||
|
||||
echo -n " chunk_$(printf '%03d' $i): transcribing... "
|
||||
STARTED=$(date +%s)
|
||||
|
||||
full_url="${WHISPER_URL}/audio/transcriptions"
|
||||
|
||||
CURL_ARGS=(
|
||||
-s -w "%{http_code}" -o "$CHUNK_JSON"
|
||||
-X POST "$full_url"
|
||||
-H "Authorization: Bearer $WHISPER_API_KEY"
|
||||
-F "file=@${CHUNK_FILE}"
|
||||
-F "model=${MODEL}"
|
||||
-F "language=${LANGUAGE}"
|
||||
-F "response_format=verbose_json"
|
||||
-F "temperature=0.0"
|
||||
--max-time 600
|
||||
)
|
||||
[ -n "$HOTWORDS" ] && CURL_ARGS+=(-F "hotwords=${HOTWORDS}")
|
||||
|
||||
HTTP_CODE=$(curl "${CURL_ARGS[@]}")
|
||||
ELAPSED=$(( $(date +%s) - STARTED ))
|
||||
|
||||
if [ "$HTTP_CODE" != "200" ]; then
|
||||
echo "ERROR (HTTP $HTTP_CODE)"
|
||||
if [ -f "$CHUNK_JSON" ]; then
|
||||
cat "$CHUNK_JSON"
|
||||
fi
|
||||
rm -f "$CHUNK_JSON"
|
||||
exit 1
|
||||
fi
|
||||
echo "done in ${ELAPSED}s"
|
||||
done
|
||||
|
||||
# Merge
|
||||
echo "Merging transcriptions..."
|
||||
cat "$CHUNKS_DIR"/chunk_*.txt > "$CHUNKS_DIR/merged_raw.txt"
|
||||
# Merge chunks into final JSON
|
||||
echo "Merging chunks..."
|
||||
python3 - "$CHUNKS_DIR" "$OUTPUT_DIR" "$NAME" "$SPLIT_POINTS" <<'PYEOF'
|
||||
import json, sys, os, glob
|
||||
|
||||
echo "Done. Output: $CHUNKS_DIR/merged_raw.txt"
|
||||
chunks_dir = sys.argv[1]
|
||||
output_dir = sys.argv[2]
|
||||
name = sys.argv[3]
|
||||
split_points_str = sys.argv[4] if len(sys.argv) > 4 else ""
|
||||
|
||||
if split_points_str.strip():
|
||||
split_points = [float(x) for x in split_points_str.strip().split()]
|
||||
else:
|
||||
split_points = []
|
||||
offsets = [0.0] + split_points
|
||||
|
||||
chunk_files = sorted(glob.glob(os.path.join(chunks_dir, "chunk_*.json")))
|
||||
all_segments = []
|
||||
total_duration = 0
|
||||
|
||||
for idx, cf in enumerate(chunk_files):
|
||||
with open(cf) as f:
|
||||
data = json.load(f)
|
||||
offset = offsets[idx] if idx < len(offsets) else offsets[-1]
|
||||
for seg in data.get("segments", []):
|
||||
all_segments.append({
|
||||
"start": round(seg.get("start", 0) + offset, 2),
|
||||
"end": round(seg.get("end", 0) + offset, 2),
|
||||
"text": seg.get("text", "").strip(),
|
||||
})
|
||||
chunk_dur = data.get("duration", 0)
|
||||
total_duration = max(total_duration, offset + chunk_dur)
|
||||
|
||||
all_segments.sort(key=lambda s: s["start"])
|
||||
merged = {"segments": all_segments, "duration": total_duration}
|
||||
json_path = os.path.join(output_dir, f"{name}.json")
|
||||
with open(json_path, "w") as f:
|
||||
json.dump(merged, f, ensure_ascii=False, indent=2)
|
||||
|
||||
txt_path = os.path.join(output_dir, f"{name}.txt")
|
||||
with open(txt_path, "w") as f:
|
||||
for seg in all_segments:
|
||||
start = seg["start"]
|
||||
h, m, s = int(start // 3600), int((start % 3600) // 60), int(start % 60)
|
||||
f.write(f"[{h:02d}:{m:02d}:{s:02d}] {seg['text']}\n")
|
||||
|
||||
plain_path = os.path.join(output_dir, f"{name}_plain.txt")
|
||||
with open(plain_path, "w") as f:
|
||||
f.write(" ".join(seg["text"] for seg in all_segments))
|
||||
|
||||
print(f" {len(all_segments)} segments total")
|
||||
print(f" Written: {json_path}, {txt_path}, {plain_path}")
|
||||
PYEOF
|
||||
|
||||
echo "=== Done: $NAME ==="
|
||||
echo "=== Done: $NAME ==="
|
||||
Loading…
Add table
Add a link
Reference in a new issue