add counts to each student + OCR score

2026-04-19 22:25:44 +03:00 · 2026-04-19 22:25:44 +03:00 · dcc36f8f26
commit dcc36f8f26
parent 811e4d3ffa
257 changed files with 12550 additions and 93 deletions
--- a/evaluate_ocr.py
+++ b/evaluate_ocr.py
@@ -0,0 +1,113 @@
+import os
+import json
+import re
+import difflib
+
+# Settings
+# Candidate file names for a student's ground-truth transcription; main()
+# tries them in this order and uses the first one that exists.
+POSSIBLE_GT_NAMES = ["gt.txt", "ground_truth.txt", "ref.txt"]
+# Root directory; main() treats each immediate subdirectory as one student.
+BASE_DIR = "photo"
+
+
+def calculate_accuracy(text1, text2):
+    """Return the similarity of two texts as a percentage.
+
+    Both texts are whitespace-normalised first (runs of whitespace collapse
+    to a single space, leading/trailing whitespace is dropped), so line
+    wrapping and indentation do not affect the score. The score is
+    ``difflib.SequenceMatcher.ratio() * 100`` rounded to one decimal place.
+
+    Args:
+        text1: First text (e.g. the ground truth); may be None.
+        text2: Second text (e.g. the OCR output); may be None.
+
+    Returns:
+        A float between 0.0 and 100.0. 0.0 is returned when either text is
+        None, empty, or consists only of whitespace.
+    """
+    t1 = " ".join((text1 or "").split())
+    t2 = " ".join((text2 or "").split())
+    # Guard AFTER normalisation: two whitespace-only inputs both collapse to
+    # "" and SequenceMatcher would otherwise report them as a 100% match.
+    if not t1 or not t2:
+        return 0.0
+    return round(difflib.SequenceMatcher(None, t1, t2).ratio() * 100, 1)
+
+
+def extract_ocr_from_md(file_path):
+    """Return the text of the "[БЛОК 1]" section of a Markdown report.
+
+    The section starts on the line after a ``### [БЛОК 1]`` heading and runs
+    up to (not including) the next line that starts with ``###`` or ``---``,
+    or to the end of the file when Block 1 is the last section.
+
+    Args:
+        file_path: Path to a UTF-8 encoded Markdown report.
+
+    Returns:
+        The section text with surrounding whitespace stripped, or "" when the
+        file cannot be read/decoded or contains no Block 1 heading.
+    """
+    # Keep the try body to the read itself so only I/O and decoding problems
+    # are treated as "no text"; anything else is a bug and should surface.
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+    except (OSError, UnicodeDecodeError):
+        return ""
+
+    # Lazy capture up to the next heading / horizontal rule; \Z lets the
+    # block be the final section of the file.
+    match = re.search(
+        r"### \[БЛОК 1\].*?\n(.*?)(?=\n###|\n---|\Z)", content, re.DOTALL)
+    return match.group(1).strip() if match else ""
+
+
+def identify_model(filename):
+    """Derive a display name for the model from a report file name.
+
+    The file name is matched case-insensitively against a list of known
+    model markers; the first marker found wins. If none matches, the file
+    name with "REPORT_" and ".md" removed is returned as the identifier.
+    """
+    lowered = filename.lower()
+
+    # Ordered (marker, display name) pairs. Order matters: a more specific
+    # marker must come before any marker that is its prefix
+    # (e.g. "claude-opus-4-6" before "claude-opus-4").
+    known_models = (
+        # OpenAI
+        ("gpt-5-pro", "GPT-5 Pro"),
+        ("gpt-5.4", "GPT-5.4"),
+        ("gpt-4o", "GPT-4o"),
+        # Anthropic
+        ("claude-opus-4-6", "Claude 4.6 Opus"),
+        ("claude-opus-4", "Claude 4 Opus"),
+        # Alibaba / Qwen
+        ("qwen3.5-122b", "Qwen 3.5 (122B)"),
+        ("qwen3.6", "Qwen 3.6"),
+    )
+    for marker, display_name in known_models:
+        if marker in lowered:
+            return display_name
+
+    # Nothing matched: fall back to the bare identifier from the file name.
+    return filename.replace("REPORT_", "").replace(".md", "")
+
+
+def main():
+    """Score every OCR report under BASE_DIR against its ground truth.
+
+    For each student subdirectory of BASE_DIR, the first existing file from
+    POSSIBLE_GT_NAMES is read as the ground truth. Every ``REPORT_*.md`` file
+    in that directory is then parsed with extract_ocr_from_md() and compared
+    to the ground truth with calculate_accuracy(). One record per report with
+    non-empty OCR text is collected and the whole list is written to
+    ``full_comparison.json`` in the current working directory.
+
+    Students and reports are processed in sorted order so the console output
+    and the JSON file are reproducible between runs.
+    """
+    final_results = []
+
+    # isdir rather than exists: a plain file named BASE_DIR would make the
+    # listdir call below fail.
+    if not os.path.isdir(BASE_DIR):
+        print(f"[!] Папка {BASE_DIR} не найдена!")
+        return
+
+    # os.listdir returns entries in arbitrary order; sort for stable output.
+    students = sorted(
+        d for d in os.listdir(BASE_DIR)
+        if os.path.isdir(os.path.join(BASE_DIR, d)))
+
+    for student in students:
+        folder = os.path.join(BASE_DIR, student)
+
+        # 1. Find the ground truth: the first candidate file that exists wins.
+        gt_content = None
+        for name in POSSIBLE_GT_NAMES:
+            path = os.path.join(folder, name)
+            # isfile: a directory with a candidate name cannot be opened.
+            if os.path.isfile(path):
+                with open(path, 'r', encoding='utf-8') as f:
+                    gt_content = f.read().strip()
+                break
+
+        # A missing or empty ground truth means there is nothing to score.
+        if not gt_content:
+            print(f"  [-] {student}: нет gt.txt, пропускаем")
+            continue
+
+        # 2. Collect ALL reports in the student's folder.
+        reports = sorted(
+            f for f in os.listdir(folder)
+            if f.startswith("REPORT_") and f.endswith(".md"))
+
+        for r_file in reports:
+            model_name = identify_model(r_file)
+            ocr_text = extract_ocr_from_md(os.path.join(folder, r_file))
+
+            # Reports without extractable OCR text are silently skipped.
+            if ocr_text:
+                acc = calculate_accuracy(gt_content, ocr_text)
+                final_results.append({
+                    "surname": student,
+                    "model": model_name,
+                    "accuracy": acc,
+                    "ocr": ocr_text,
+                    "original": gt_content,
+                    "file": r_file
+                })
+                print(f"  [+] {student} | {model_name}: {acc}%")
+
+    # Save everything into a single file.
+    with open("full_comparison.json", "w", encoding="utf-8") as f:
+        json.dump(final_results, f, ensure_ascii=False, indent=4)
+
+    print("\nГотово! Сформирован 'full_comparison.json' с данными по всем моделям.")
+
+
+# Run the evaluation only when the file is executed as a script, not when it
+# is imported as a module.
+if __name__ == "__main__":
+    main()