auto-report-skill/scripts/transcribe_chunked.sh

197 lines
No EOL
6 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Transcribe long audio by splitting at silence boundaries.
# Uses a Whisper-compatible API with bearer-token authentication.
#
# Required env: WHISPER_URL, WHISPER_API_KEY
# Optional env: HOTWORDS_PATH (path to hotwords file)
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Fail fast with a clear message when required credentials are missing.
# (Quoted expansions: unquoted ${VAR:?} is subject to word splitting.)
: "${WHISPER_URL:?ERROR: WHISPER_URL not set}"
: "${WHISPER_API_KEY:?ERROR: WHISPER_API_KEY not set}"
MODEL="whisper-1" # generic model alias rather than a pinned model name
LANGUAGE="ru"
TARGET_CHUNK=600 # target chunk length in seconds
# Hotwords file: prefer the intended HOTWORDS_PATH variable, but keep the
# historical HOTWORKS_PATH misspelling as a fallback for backward compatibility.
HOTWORDS_FILE="${HOTWORDS_PATH:-${HOTWORKS_PATH:-$SCRIPT_DIR/hotwords.txt}}"
HOTWORDS=""
if [ -f "$HOTWORDS_FILE" ]; then
  # Strip comment/blank lines, join terms with commas, squeeze and trim separators.
  HOTWORDS=$(grep -v '^#' "$HOTWORDS_FILE" | grep -v '^$' | tr '\n' ',' | sed 's/,,*/,/g; s/^,//; s/,$//')
fi
# Positional arguments; fail with a usage hint instead of a bare `set -u` error.
MP3_FILE="${1:?Usage: transcribe_chunked.sh <mp3-file> <name> <output-dir>}"
NAME="${2:?Usage: transcribe_chunked.sh <mp3-file> <name> <output-dir>}"
OUTPUT_DIR="${3:?Usage: transcribe_chunked.sh <mp3-file> <name> <output-dir>}"
CHUNKS_DIR="$OUTPUT_DIR/chunks_${NAME}"
# Validate the input before doing any work.
if [ ! -f "$MP3_FILE" ]; then
  echo "ERROR: input file not found: $MP3_FILE" >&2
  exit 1
fi
# Idempotency: a finished transcript means there is nothing to do.
if [ -f "$OUTPUT_DIR/${NAME}.json" ]; then
  echo "$NAME already transcribed, skipping"
  exit 0
fi
mkdir -p "$CHUNKS_DIR"
# Whole seconds only; the fractional part is irrelevant for chunk planning.
DURATION=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$MP3_FILE" | cut -d. -f1)
echo "=== Chunked transcription: $NAME ($DURATION s / $((DURATION/60))m) ==="
# Detect silence gaps once and cache the timestamps next to the chunks,
# so re-runs skip the (slow) full decode pass.
SILENCES_FILE="$CHUNKS_DIR/silences.txt"
if [ ! -f "$SILENCES_FILE" ]; then
  # silencedetect logs lines like "... silence_end: 123.45 | silence_duration: ...";
  # keep only the numeric end-of-silence timestamps, one per line.
  ffmpeg -i "$MP3_FILE" -af "silencedetect=noise=-35dB:d=0.5" -f null - 2>&1 \
    | awk '{ for (i = 1; i < NF; i++) if ($i == "silence_end:") print $(i + 1) }' \
    > "$SILENCES_FILE"
fi
echo " Found $(wc -l < "$SILENCES_FILE") silence gaps"
# Compute split points: aim for TARGET_CHUNK-second chunks, but snap each
# cut to the nearest detected silence so audio is never split mid-word.
SPLIT_POINTS=$(python3 - "$SILENCES_FILE" "$TARGET_CHUNK" "$DURATION" <<'PYEOF'
import sys
silences_file = sys.argv[1]
target = float(sys.argv[2])    # desired chunk length, seconds
duration = float(sys.argv[3])  # total audio length, seconds
with open(silences_file) as f:
    silences = [float(line.strip()) for line in f if line.strip()]
if not silences:
    # No silence detected: fall back to evenly spaced cuts (at least 2 chunks).
    n = max(2, int(duration / target))
    splits = [duration * i / n for i in range(1, n)]
else:
    splits = []
    t = target
    # Walk forward in target-sized steps; at each step pick the silence closest
    # to the ideal cut. Skip candidates within 30 s of the previous cut, and
    # never cut inside the final 30 s of audio.
    while t < duration - 30:
        best = min(silences, key=lambda s: abs(s - t))
        if not splits or best > splits[-1] + 30:
            splits.append(best)
        t += target
print(" ".join(f"{s:.2f}" for s in splits))
PYEOF
)
# POINTS holds the interior cut timestamps; N chunks = (number of cuts) + 1.
IFS=' ' read -ra POINTS <<< "$SPLIT_POINTS"
N_CHUNKS=$((${#POINTS[@]} + 1))
echo " Will create $N_CHUNKS chunks"
# Split audio at the computed points. Finished chunks are cached on disk so a
# re-run after a crash resumes cutting where it left off.
PREV=0
for i in $(seq 0 $((N_CHUNKS - 1))); do
  CHUNK_FILE="$CHUNKS_DIR/chunk_$(printf '%03d' $i).mp3"
  # Reuse an existing chunk (>1000 bytes guards against truncated leftovers),
  # but still advance PREV so subsequent chunks start at the correct offset.
  # NOTE(review): `stat -c%s` is GNU coreutils; BSD/macOS needs `stat -f%z` — confirm target platform.
  if [ -f "$CHUNK_FILE" ] && [ $(stat -c%s "$CHUNK_FILE") -gt 1000 ]; then
    if [ $i -lt ${#POINTS[@]} ]; then
      PREV="${POINTS[$i]}"
    fi
    echo " chunk_$(printf '%03d' $i): exists, skipping"
    continue
  fi
  if [ $i -lt ${#POINTS[@]} ]; then
    # Interior chunk: runs from PREV to the next split point.
    END="${POINTS[$i]}"
    DUR=$(python3 -c "print(f'{$END - $PREV:.2f}')")
    # -c copy avoids re-encoding; cut precision at frame granularity is
    # acceptable since splits already sit inside silence.
    ffmpeg -y -i "$MP3_FILE" -ss "$PREV" -t "$DUR" -c copy "$CHUNK_FILE" 2>/dev/null
    PREV="$END"
  else
    # Final chunk: everything from the last split point to the end of audio.
    ffmpeg -y -i "$MP3_FILE" -ss "$PREV" -c copy "$CHUNK_FILE" 2>/dev/null
  fi
  CHUNK_DUR=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "$CHUNK_FILE" | cut -d. -f1)
  echo " chunk_$(printf '%03d' $i): ${CHUNK_DUR}s"
done
# Transcribe each chunk via the Whisper-compatible API. Each chunk's response
# is cached as JSON so interrupted runs resume without re-transcribing.
echo "Transcribing chunks..."
for i in $(seq 0 $((N_CHUNKS - 1))); do
  CHUNK_FILE="$CHUNKS_DIR/chunk_$(printf '%03d' $i).mp3"
  CHUNK_JSON="$CHUNKS_DIR/chunk_$(printf '%03d' $i).json"
  if [ -f "$CHUNK_JSON" ]; then
    echo " chunk_$(printf '%03d' $i): already transcribed"
    continue
  fi
  printf ' chunk_%03d: transcribing... ' "$i"
  STARTED=$(date +%s)
  full_url="${WHISPER_URL}/audio/transcriptions"
  CURL_ARGS=(
    -s -w "%{http_code}" -o "$CHUNK_JSON"
    -X POST "$full_url"
    -H "Authorization: Bearer $WHISPER_API_KEY"
    -F "file=@${CHUNK_FILE}"
    -F "model=${MODEL}"
    -F "language=${LANGUAGE}"
    -F "response_format=verbose_json"
    -F "temperature=0.0"
    --max-time 600
  )
  [ -n "$HOTWORDS" ] && CURL_ARGS+=(-F "hotwords=${HOTWORDS}")
  # Guard the substitution: under `set -e` a transport-level curl failure
  # (DNS, timeout, refused connection) would otherwise kill the script with
  # no message at all. "000" marks "no HTTP response received".
  HTTP_CODE=$(curl "${CURL_ARGS[@]}") || HTTP_CODE="000"
  ELAPSED=$(( $(date +%s) - STARTED ))
  if [ "$HTTP_CODE" != "200" ]; then
    # Diagnostics belong on stderr; show the API error body, then delete it
    # so a retry does not mistake it for a cached success.
    echo "ERROR (HTTP $HTTP_CODE)" >&2
    if [ -f "$CHUNK_JSON" ]; then
      cat "$CHUNK_JSON" >&2
    fi
    rm -f "$CHUNK_JSON"
    exit 1
  fi
  echo "done in ${ELAPSED}s"
done
# Merge per-chunk transcripts into the final JSON, a timestamped TXT, and a
# plain-text concatenation.
echo "Merging chunks..."
python3 - "$CHUNKS_DIR" "$OUTPUT_DIR" "$NAME" "$SPLIT_POINTS" <<'PYEOF'
import json, sys, os, glob
chunks_dir = sys.argv[1]
output_dir = sys.argv[2]
name = sys.argv[3]
# Space-separated interior split timestamps; empty when there was one chunk.
split_points_str = sys.argv[4] if len(sys.argv) > 4 else ""
if split_points_str.strip():
    split_points = [float(x) for x in split_points_str.strip().split()]
else:
    split_points = []
# Chunk k starts at offsets[k]: 0.0 for the first chunk, then each split point.
offsets = [0.0] + split_points
chunk_files = sorted(glob.glob(os.path.join(chunks_dir, "chunk_*.json")))
all_segments = []
total_duration = 0
for idx, cf in enumerate(chunk_files):
    with open(cf) as f:
        data = json.load(f)
    # Shift each chunk's local timestamps by the chunk's start offset so the
    # merged transcript uses absolute times.
    offset = offsets[idx] if idx < len(offsets) else offsets[-1]
    for seg in data.get("segments", []):
        all_segments.append({
            "start": round(seg.get("start", 0) + offset, 2),
            "end": round(seg.get("end", 0) + offset, 2),
            "text": seg.get("text", "").strip(),
        })
    chunk_dur = data.get("duration", 0)
    total_duration = max(total_duration, offset + chunk_dur)
all_segments.sort(key=lambda s: s["start"])
merged = {"segments": all_segments, "duration": total_duration}
json_path = os.path.join(output_dir, f"{name}.json")
with open(json_path, "w") as f:
    json.dump(merged, f, ensure_ascii=False, indent=2)
# Timestamped transcript: one "[HH:MM:SS] text" line per segment.
txt_path = os.path.join(output_dir, f"{name}.txt")
with open(txt_path, "w") as f:
    for seg in all_segments:
        start = seg["start"]
        h, m, s = int(start // 3600), int((start % 3600) // 60), int(start % 60)
        f.write(f"[{h:02d}:{m:02d}:{s:02d}] {seg['text']}\n")
# Plain transcript: all segment texts joined with single spaces.
plain_path = os.path.join(output_dir, f"{name}_plain.txt")
with open(plain_path, "w") as f:
    f.write(" ".join(seg["text"] for seg in all_segments))
print(f" {len(all_segments)} segments total")
print(f" Written: {json_path}, {txt_path}, {plain_path}")
PYEOF
# Final status marker (the original printed this line twice).
echo "=== Done: $NAME ==="