Make scripts better

2026-04-30 10:57:18 +03:00 · 2026-04-30 10:57:18 +03:00 · e8ad7df469
commit e8ad7df469
parent ba56147e95
12 changed files with 614 additions and 432 deletions
--- a/scripts/transcribe_chunked.sh
+++ b/scripts/transcribe_chunked.sh
@ -1,64 +1,197 @@
 #!/bin/bash
-# Транскрипция с разбивкой на чанки для длинных аудио
+# Transcribe long audio by splitting at silence boundaries
+# Uses Whisper API with authentication
+#
+# Usage: transcribe_chunked.sh <audio_file> <name> <output_dir>
+#
+# Environment:
+#   WHISPER_URL      (required) API base URL; /audio/transcriptions is appended
+#   WHISPER_API_KEY  (required) sent as a Bearer token
+#   HOTWORDS_PATH    (optional) hotwords list, one per line, '#' lines ignored;
+#                    defaults to hotwords.txt next to this script

 set -euo pipefail

-MEETING_DIR="${1:-.}"
-CHUNKS_DIR="$MEETING_DIR/transcription"
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+: "${WHISPER_URL:?ERROR: WHISPER_URL not set}"
+: "${WHISPER_API_KEY:?ERROR: WHISPER_API_KEY not set}"
+MODEL="whisper-1"  # Changed from a specific model to the generic name
+LANGUAGE="ru"
+TARGET_CHUNK=600  # target chunk length in seconds
+
+# HOTWORKS_PATH is the original, misspelled variable name; it is still honoured
+# so existing callers keep working, but HOTWORDS_PATH takes precedence.
+HOTWORDS_FILE="${HOTWORDS_PATH:-${HOTWORKS_PATH:-$SCRIPT_DIR/hotwords.txt}}"
+HOTWORDS=""
+if [ -f "$HOTWORDS_FILE" ]; then
+    # Drop comment lines and empty lines, then join the rest with commas.
+    # sed does the filtering instead of grep -v: grep exits 1 when it selects
+    # no lines, which under pipefail + set -e would abort the whole script for
+    # an empty or comment-only hotwords file.
+    HOTWORDS=$(sed -e '/^#/d' -e '/^$/d' "$HOTWORDS_FILE" | tr '\n' ',' | sed 's/,,*/,/g; s/^,//; s/,$//')
+fi
+
+# Fail with a usage message rather than a bare "unbound variable" error.
+if [ "$#" -lt 3 ]; then
+    echo "Usage: $0 <audio_file> <name> <output_dir>" >&2
+    exit 2
+fi
+MP3_FILE="$1"
+NAME="$2"
+OUTPUT_DIR="$3"
+CHUNKS_DIR="$OUTPUT_DIR/chunks_${NAME}"
+
+# The merged <name>.json marks a finished transcription.
+if [ -f "$OUTPUT_DIR/${NAME}.json" ]; then
+    echo "$NAME already transcribed, skipping"
+    exit 0
+fi
+
+# Check the input before any working directory is created for it.
+if [ ! -f "$MP3_FILE" ]; then
+    echo "ERROR: audio file not found: $MP3_FILE" >&2
+    exit 1
+fi

 mkdir -p "$CHUNKS_DIR"

-# Get audio file
-WAV_FILE=$(ls "$MEETING_DIR"/*.wav 2>/dev/null || ls "$MEETING_DIR"/*.WAV 2>/dev/null)
-if [ -z "$WAV_FILE" ] || [ ! -f "$WAV_FILE" ]; then
-    echo "Error: No WAV file found"
-    exit 1
+# Total duration in whole seconds (everything after the decimal point is cut).
+DURATION=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$MP3_FILE" | cut -d. -f1)
+# If ffprobe printed nothing or something non-numeric, stop with a clear
+# message instead of failing inside the arithmetic expansion below.
+case "$DURATION" in
+    ''|*[!0-9]*)
+        echo "ERROR: could not determine duration of $MP3_FILE" >&2
+        exit 1
+        ;;
+esac
+echo "=== Chunked transcription: $NAME ($DURATION s / $((DURATION/60))m) ==="
+
+# Find silence gaps: collect the silence_end timestamps that ffmpeg's
+# silencedetect filter logs, one per line. The list is cached in silences.txt
+# and reused on later runs.
+SILENCES_FILE="$CHUNKS_DIR/silences.txt"
+if [ ! -f "$SILENCES_FILE" ]; then
+    # sed -n '.../p' both selects and extracts. A separate grep stage would
+    # exit 1 when the audio has no silence at all, and under pipefail + set -e
+    # that aborts the script before the "no silences" fallback can run.
+    # The list is written to a temporary name and renamed at the end so that
+    # an interrupted ffmpeg run never leaves a truncated cache behind.
+    ffmpeg -nostdin -i "$MP3_FILE" -af "silencedetect=noise=-35dB:d=0.5" -f null - 2>&1 \
+        | sed -n 's/.*silence_end: \([0-9.]*\).*/\1/p' \
+        > "$SILENCES_FILE.tmp"
+    mv "$SILENCES_FILE.tmp" "$SILENCES_FILE"
 fi
+echo "  Found $(wc -l < "$SILENCES_FILE") silence gaps"

-# Duration
-DURATION=$(ffprobe -i "$WAV_FILE" -show_entries format=duration -v quiet -of csv="p=0")
+# Compute split points: choose where to cut the audio so that chunks are
+# roughly TARGET_CHUNK seconds long and the cuts fall on detected silences.
+# The embedded Python gets the silences file, the target chunk length and the
+# total duration as argv[1..3] and prints the chosen split times (seconds, two
+# decimals) separated by spaces; the output is empty when no split is chosen.
+SPLIT_POINTS=$(python3 - "$SILENCES_FILE" "$TARGET_CHUNK" "$DURATION" <<'PYEOF'
+import sys
+silences_file = sys.argv[1]
+target = float(sys.argv[2])
+duration = float(sys.argv[3])

-echo "Audio duration: $DURATION seconds"
+# One timestamp per non-blank line of the silences file.
+with open(silences_file) as f:
+    silences = [float(line.strip()) for line in f if line.strip()]

-# Chunk settings
-chunk_duration=600
-offset=0
-chunk_num=0
+if not silences:
+    # No silence detected: cut into n equal parts, always at least two.
+    n = max(2, int(duration / target))
+    splits = [duration * i / n for i in range(1, n)]
+else:
+    splits = []
+    t = target
+    # Walk the ideal cut positions (target, 2*target, ...), stopping before
+    # the last 30 seconds of the audio.
+    while t < duration - 30:
+        # Snap the ideal position to the nearest silence timestamp.
+        best = min(silences, key=lambda s: abs(s - t))
+        # Keep it only if it lies more than 30 s after the previous split;
+        # this also rejects the same silence being picked twice.
+        if not splits or best > splits[-1] + 30:
+            splits.append(best)
+        t += target

-echo "Extracting chunks..."
+print(" ".join(f"{s:.2f}" for s in splits))
+PYEOF
+)

-while (( $(echo "$offset < $DURATION" | bc -l) )); do
-    chunk_file="$CHUNKS_DIR/chunk_${chunk_num}.wav"
-    echo "Extracting chunk $chunk_num at offset $offset..."
-    
-    # Retry logic
-    for attempt in 1 2 3; do
-        if ffmpeg -i "$WAV_FILE" -ss "$offset" -t "$chunk_duration" -acodec pcm_s16le -ar 16000 "$chunk_file" -y 2>/dev/null; then
-            break
-        elif [ $attempt -eq 3 ]; then
-            echo "Error: Failed to extract chunk $chunk_num"
-            exit 1
+# SPLIT_POINTS is a space-separated list of cut times; N cuts give N+1 chunks.
+IFS=' ' read -ra POINTS <<< "$SPLIT_POINTS"
+N_CHUNKS=$((${#POINTS[@]} + 1))
+echo "  Will create $N_CHUNKS chunks"
+
+# Split audio. Chunk i runs from PREV to POINTS[i]; the last chunk runs from
+# the final split point to the end of the file. Streams are copied (-c copy),
+# not re-encoded.
+PREV=0
+for i in $(seq 0 $((N_CHUNKS - 1))); do
+    CHUNK_ID=$(printf '%03d' "$i")
+    CHUNK_FILE="$CHUNKS_DIR/chunk_${CHUNK_ID}.mp3"
+    # Resume support: reuse a chunk left by an earlier run if it is larger
+    # than 1000 bytes. wc -c is used for the size because stat -c is specific
+    # to GNU stat.
+    if [ -f "$CHUNK_FILE" ] && [ "$(wc -c < "$CHUNK_FILE")" -gt 1000 ]; then
+        # Still advance PREV so the next chunk starts at the right time.
+        if [ "$i" -lt "${#POINTS[@]}" ]; then
+            PREV="${POINTS[$i]}"
         fi
-        sleep 1
-    done
-    
-    offset=$((offset + chunk_duration))
-    ((chunk_num++))
-done
+        echo "  chunk_${CHUNK_ID}: exists, skipping"
+        continue
+    fi

-echo "Transcribing $chunk_num chunks..."
+    # -loglevel error keeps ffmpeg quiet on success but, unlike 2>/dev/null,
+    # still shows why a split failed before set -e stops the script.
+    if [ "$i" -lt "${#POINTS[@]}" ]; then
+        END="${POINTS[$i]}"
+        DUR=$(python3 -c "print(f'{$END - $PREV:.2f}')")
+        ffmpeg -nostdin -loglevel error -y -i "$MP3_FILE" -ss "$PREV" -t "$DUR" -c copy "$CHUNK_FILE"
+        PREV="$END"
+    else
+        ffmpeg -nostdin -loglevel error -y -i "$MP3_FILE" -ss "$PREV" -c copy "$CHUNK_FILE"
+    fi
+    CHUNK_DUR=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$CHUNK_FILE" | cut -d. -f1)
+    echo "  chunk_${CHUNK_ID}: ${CHUNK_DUR}s"
+done

 # Transcribe each chunk
-for i in $(seq 0 $((chunk_num - 1))); do
-    chunk_file="$CHUNKS_DIR/chunk_${i}.wav"
-    output_file="$CHUNKS_DIR/chunk_${i}.txt"
-    
-    echo "Transcribing chunk $i..."
-    MKL_SERVICE_FORCE_INTEL=1 OMP_NUM_THREADS=2 python3 "${BASH_SOURCE[0]%/*}/local_whisper.py" "$chunk_file" > "$output_file"
+# Each chunk is POSTed to $WHISPER_URL/audio/transcriptions and the
+# verbose_json response is stored next to it as chunk_NNN.json. A chunk whose
+# JSON already exists is skipped, so an interrupted run can be resumed.
+echo "Transcribing chunks..."
+for i in $(seq 0 $((N_CHUNKS - 1))); do
+    CHUNK_FILE="$CHUNKS_DIR/chunk_$(printf '%03d' "$i").mp3"
+    CHUNK_JSON="$CHUNKS_DIR/chunk_$(printf '%03d' "$i").json"
+    # The response is downloaded to a temporary name and renamed only after a
+    # successful request, so an interrupted download can never look like a
+    # finished transcription to the skip check below or to the merge step.
+    CHUNK_TMP="${CHUNK_JSON}.part"
+
+    if [ -f "$CHUNK_JSON" ]; then
+        echo "  chunk_$(printf '%03d' "$i"): already transcribed"
+        continue
+    fi
+
+    echo -n "  chunk_$(printf '%03d' "$i"): transcribing... "
+    STARTED=$(date +%s)
+
+    full_url="${WHISPER_URL}/audio/transcriptions"
+
+    # -w prints the HTTP status on stdout while -o sends the body to the file.
+    # -S lets curl report transport errors even though -s hides the progress.
+    # NOTE(review): the API key is passed on the command line and may be
+    # visible in the process list - confirm this is acceptable on this host.
+    CURL_ARGS=(
+        -sS -w "%{http_code}" -o "$CHUNK_TMP"
+        -X POST "$full_url"
+        -H "Authorization: Bearer $WHISPER_API_KEY"
+        -F "file=@${CHUNK_FILE}"
+        -F "model=${MODEL}"
+        -F "language=${LANGUAGE}"
+        -F "response_format=verbose_json"
+        -F "temperature=0.0"
+        --max-time 600
+    )
+    [ -n "$HOTWORDS" ] && CURL_ARGS+=(-F "hotwords=${HOTWORDS}")
+
+    # curl exits non-zero on transport errors (timeout, refused connection).
+    # Capture that status instead of letting set -e abort right here, so the
+    # error branch below can report the failure and clean up.
+    CURL_RC=0
+    HTTP_CODE=$(curl "${CURL_ARGS[@]}") || CURL_RC=$?
+    ELAPSED=$(( $(date +%s) - STARTED ))
+
+    if [ "$CURL_RC" -ne 0 ] || [ "$HTTP_CODE" != "200" ]; then
+        echo "ERROR (HTTP $HTTP_CODE)"
+        if [ "$CURL_RC" -ne 0 ]; then
+            echo "  curl failed with exit status $CURL_RC" >&2
+        fi
+        # Show whatever body the server returned; it may explain the error.
+        if [ -f "$CHUNK_TMP" ]; then
+            cat "$CHUNK_TMP"
+        fi
+        rm -f "$CHUNK_TMP"
+        exit 1
+    fi
+    mv "$CHUNK_TMP" "$CHUNK_JSON"
+    echo "done in ${ELAPSED}s"
 done

-# Merge
-echo "Merging transcriptions..."
-cat "$CHUNKS_DIR"/chunk_*.txt > "$CHUNKS_DIR/merged_raw.txt"
+# Merge chunks into final JSON
+# Segment times of each chunk are shifted by that chunk's start offset (0 for
+# the first chunk, then the split points). Writes <name>.txt (timestamped
+# lines), <name>_plain.txt (text only) and <name>.json into the output dir.
+echo "Merging chunks..."
+python3 - "$CHUNKS_DIR" "$OUTPUT_DIR" "$NAME" "$SPLIT_POINTS" <<'PYEOF'
+import json, sys, os, glob

-echo "Done. Output: $CHUNKS_DIR/merged_raw.txt"
+chunks_dir = sys.argv[1]
+output_dir = sys.argv[2]
+name = sys.argv[3]
+split_points_str = sys.argv[4] if len(sys.argv) > 4 else ""
+
+# Chunk i starts at offsets[i]. An empty split list leaves only offset 0.
+split_points = [float(x) for x in split_points_str.split()]
+offsets = [0.0] + split_points
+
+# Zero-padded names make the sorted order equal to the chunk order.
+chunk_files = sorted(glob.glob(os.path.join(chunks_dir, "chunk_*.json")))
+all_segments = []
+total_duration = 0
+
+for idx, cf in enumerate(chunk_files):
+    # Read as UTF-8 explicitly instead of relying on the locale default.
+    with open(cf, encoding="utf-8") as f:
+        data = json.load(f)
+    offset = offsets[idx] if idx < len(offsets) else offsets[-1]
+    for seg in data.get("segments", []):
+        all_segments.append({
+            "start": round(seg.get("start", 0) + offset, 2),
+            "end": round(seg.get("end", 0) + offset, 2),
+            "text": seg.get("text", "").strip(),
+        })
+    chunk_dur = data.get("duration", 0)
+    total_duration = max(total_duration, offset + chunk_dur)
+
+all_segments.sort(key=lambda s: s["start"])
+merged = {"segments": all_segments, "duration": total_duration}
+
+# Text is written with ensure_ascii=False, so every output file is opened as
+# UTF-8 explicitly; a non-UTF-8 locale would otherwise fail or mis-encode.
+txt_path = os.path.join(output_dir, f"{name}.txt")
+with open(txt_path, "w", encoding="utf-8") as f:
+    for seg in all_segments:
+        start = seg["start"]
+        h, m, s = int(start // 3600), int((start % 3600) // 60), int(start % 60)
+        f.write(f"[{h:02d}:{m:02d}:{s:02d}] {seg['text']}\n")
+
+plain_path = os.path.join(output_dir, f"{name}_plain.txt")
+with open(plain_path, "w", encoding="utf-8") as f:
+    f.write(" ".join(seg["text"] for seg in all_segments))
+
+# <name>.json is what the script checks to decide that a recording is already
+# transcribed, so it is written last and moved into place in one step: a
+# failure part-way through must not leave a file that makes later runs skip.
+json_path = os.path.join(output_dir, f"{name}.json")
+tmp_json_path = json_path + ".tmp"
+with open(tmp_json_path, "w", encoding="utf-8") as f:
+    json.dump(merged, f, ensure_ascii=False, indent=2)
+os.replace(tmp_json_path, json_path)
+
+print(f"  {len(all_segments)} segments total")
+print(f"  Written: {json_path}, {txt_path}, {plain_path}")
+PYEOF
+
+echo "=== Done: $NAME ==="