# ege-skill/evaluate_ocr.py

import os
import json
import re
import difflib

# Settings
# File names accepted as the ground-truth (reference) transcript inside a
# student folder; main() tries them in this order and uses the first it finds.
POSSIBLE_GT_NAMES = ["gt.txt", "ground_truth.txt", "ref.txt"]
# Directory (relative to the current working directory) whose immediate
# subfolders main() treats as one student each.
BASE_DIR = "photo"


def calculate_accuracy(text1, text2):
    """Return the character-level similarity of two texts as a percentage.

    Both texts are whitespace-normalised first (every run of whitespace is
    collapsed to a single space and leading/trailing whitespace is dropped),
    then compared with ``difflib.SequenceMatcher``.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        ``SequenceMatcher.ratio() * 100`` rounded to one decimal place, or
        ``0.0`` when either text is ``None``, empty or whitespace-only.
    """
    if not text1 or not text2:
        return 0.0
    t1 = " ".join(text1.split())
    t2 = " ".join(text2.split())
    # Whitespace-only input normalises to "", so treat it like empty input
    # instead of letting two blank texts score as a perfect match.
    if not t1 or not t2:
        return 0.0
    # autojunk=False: with the default heuristic, any character that makes up
    # more than 1% of a sequence of 200+ items is ignored as "popular", which
    # badly underestimates the similarity of essay-length texts.
    matcher = difflib.SequenceMatcher(None, t1, t2, autojunk=False)
    return round(matcher.ratio() * 100, 1)


def extract_ocr_from_md(file_path):
    """Extract the body of the ``### [БЛОК 1]`` section from a Markdown report.

    The section body starts on the line after the ``### [БЛОК 1]`` heading
    and runs until the next line starting with ``###`` or ``---``, or until
    the end of the file when Block 1 is the last section.

    Args:
        file_path: Path to a UTF-8 encoded Markdown report.

    Returns:
        The stripped section text, or ``""`` when the file cannot be read or
        decoded, or when it contains no Block 1 section.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Best effort: an unreadable report is treated as having no OCR text.
        return ""
    # Look for the text in Block 1. "\Z" lets the section end at end-of-file
    # when no further heading or horizontal rule follows it.
    match = re.search(
        r"### \[БЛОК 1\].*?\n(.*?)(?=\n###|\n---|\Z)", content, re.DOTALL)
    return match.group(1).strip() if match else ""


def identify_model(filename):
    """Map a report file name to a human-readable model name.

    The file name is matched case-insensitively against a list of known
    model markers; the first marker found as a substring wins.

    Args:
        filename: Report file name, e.g. ``REPORT_gpt-4o.md``.

    Returns:
        The display name of the recognised model, or, when no marker
        matches, *filename* with every ``REPORT_`` and ``.md`` occurrence
        removed.
    """
    # Order matters: a more specific marker must precede any marker that is
    # a prefix of it (e.g. "claude-opus-4-6" before "claude-opus-4").
    known_models = (
        # OpenAI
        ("gpt-5-pro", "GPT-5 Pro"),
        ("gpt-5.4", "GPT-5.4"),
        ("gpt-4o", "GPT-4o"),
        # Claude
        ("claude-opus-4-6", "Claude 4.6 Opus"),
        ("claude-opus-4", "Claude 4 Opus"),
        # Alibaba / Qwen
        ("qwen3.5-122b", "Qwen 3.5 (122B)"),
        ("qwen3.6", "Qwen 3.6"),
    )

    lowered = filename.lower()
    for marker, label in known_models:
        if marker in lowered:
            return label

    # Nothing matched: fall back to the bare identifier from the file name.
    return filename.replace("REPORT_", "").replace(".md", "")


def _read_ground_truth(folder):
    """Return the stripped text of the first ground-truth file in *folder*, or None."""
    for name in POSSIBLE_GT_NAMES:
        path = os.path.join(folder, name)
        if os.path.isfile(path):
            with open(path, 'r', encoding='utf-8') as f:
                return f.read().strip()
    return None


def main():
    """Score every OCR report against its ground truth and dump the results.

    Each immediate subfolder of ``BASE_DIR`` is treated as one student. A
    folder is skipped when none of ``POSSIBLE_GT_NAMES`` exists in it or the
    first one found is empty. For every ``REPORT_*.md`` file in the folder,
    the Block 1 text is extracted and compared with the ground truth.

    Side effects:
        Prints one progress line per student/report and writes all records
        to ``full_comparison.json`` in the current working directory.
        Returns early, without writing the file, when ``BASE_DIR`` is not an
        existing directory.
    """
    final_results = []

    if not os.path.isdir(BASE_DIR):
        print(f"[!] Папка {BASE_DIR} не найдена!")
        return

    # os.listdir() returns entries in arbitrary order; sort so the console
    # output and the JSON file are reproducible between runs.
    students = sorted(
        d for d in os.listdir(BASE_DIR)
        if os.path.isdir(os.path.join(BASE_DIR, d)))

    for student in students:
        folder = os.path.join(BASE_DIR, student)

        # 1. Look for the reference transcript (gt.txt or an alias).
        gt_content = _read_ground_truth(folder)
        if not gt_content:
            print(f"  [-] {student}: нет gt.txt, пропускаем")
            continue

        # 2. Collect ALL reports in the folder.
        reports = sorted(
            f for f in os.listdir(folder)
            if f.startswith("REPORT_") and f.endswith(".md"))

        for r_file in reports:
            model_name = identify_model(r_file)
            ocr_text = extract_ocr_from_md(os.path.join(folder, r_file))

            if not ocr_text:
                # Make skipped reports visible instead of dropping them silently.
                print(f"  [-] {student} | {model_name}: "
                      f"блок 1 не найден в {r_file}, пропускаем")
                continue

            acc = calculate_accuracy(gt_content, ocr_text)
            final_results.append({
                "surname": student,
                "model": model_name,
                "accuracy": acc,
                "ocr": ocr_text,
                "original": gt_content,
                "file": r_file
            })
            print(f"  [+] {student} | {model_name}: {acc}%")

    # Save everything into a single file.
    with open("full_comparison.json", "w", encoding="utf-8") as f:
        json.dump(final_results, f, ensure_ascii=False, indent=4)

    print("\nГотово! Сформирован 'full_comparison.json' с данными по всем моделям.")


# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()