ege-skill/evaluate_ocr.py

113 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import json
import re
import difflib
# Settings: accepted ground-truth file names (tried in this order) and the
# root folder that is scanned for per-student sub-folders.
POSSIBLE_GT_NAMES = [
    "gt.txt",
    "ground_truth.txt",
    "ref.txt",
]
BASE_DIR = "photo"
def calculate_accuracy(text1, text2):
    """Return the similarity of two texts as a percentage.

    Both texts are whitespace-normalized (every run of whitespace collapses
    to a single space, leading/trailing whitespace is dropped) and then
    compared character by character with ``difflib.SequenceMatcher``.

    Args:
        text1: First text (main() passes the ground truth here).
        text2: Second text (main() passes the OCR output here).

    Returns:
        ``SequenceMatcher.ratio() * 100`` rounded to one decimal place, or
        ``0.0`` when either text is missing, empty, or whitespace-only.
    """
    if not text1 or not text2:
        return 0.0
    t1 = " ".join(text1.split())
    t2 = " ".join(text2.split())
    # Re-check after normalization: whitespace-only inputs collapse to "",
    # and SequenceMatcher reports two empty sequences as a perfect match.
    if not t1 or not t2:
        return 0.0
    # autojunk=False: the default heuristic drops every character that makes
    # up more than ~1% of a 200+ character text from matching, which badly
    # underestimates similarity for essay-length inputs. Disabling it costs
    # some speed on long texts but yields a faithful ratio.
    matcher = difflib.SequenceMatcher(None, t1, t2, autojunk=False)
    return round(matcher.ratio() * 100, 1)
def extract_ocr_from_md(file_path):
    """Return the text of the "[БЛОК 1]" section of a Markdown report.

    The section starts on the line after the ``### [БЛОК 1]`` heading and
    runs up to the next ``###`` heading, the next line starting with
    ``---``, or the end of the file, whichever comes first.

    Args:
        file_path: Path to a UTF-8 encoded Markdown report.

    Returns:
        The stripped section text, or ``""`` when the file cannot be read
        or decoded, or when it contains no such section.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Best effort: an unreadable report is treated as having no OCR text.
        return ""
    # Look for the text in Block 1. The \Z alternative lets the section end
    # at end-of-file, so a report whose last section is Block 1 still matches.
    match = re.search(
        r"### \[БЛОК 1\].*?\n(.*?)(?=\n###|\n---|\Z)", content, re.DOTALL)
    return match.group(1).strip() if match else ""
def identify_model(filename):
    """Map a report file name to a human-readable model name.

    The lower-cased file name is searched for known model identifiers; the
    first identifier found (in table order) decides the result. If none is
    found, the original file name is returned with every "REPORT_" and
    ".md" occurrence removed.
    """
    known_models = (
        # OpenAI
        ("gpt-5-pro", "GPT-5 Pro"),
        ("gpt-5.4", "GPT-5.4"),
        ("gpt-4o", "GPT-4o"),
        # Claude: the longer identifier must come first, because
        # "claude-opus-4" is a prefix of "claude-opus-4-6".
        ("claude-opus-4-6", "Claude 4.6 Opus"),
        ("claude-opus-4", "Claude 4 Opus"),
        # Alibaba / Qwen
        ("qwen3.5-122b", "Qwen 3.5 (122B)"),
        ("qwen3.6", "Qwen 3.6"),
    )
    lowered = filename.lower()
    for marker, label in known_models:
        if marker in lowered:
            return label
    # Nothing matched: return the bare identifier taken from the file name.
    return filename.replace("REPORT_", "").replace(".md", "")
def _load_ground_truth(folder):
    """Return the stripped text of the first ground-truth file in *folder*.

    File names are tried in ``POSSIBLE_GT_NAMES`` order. Returns ``None``
    when none of them exists.
    """
    for name in POSSIBLE_GT_NAMES:
        path = os.path.join(folder, name)
        if os.path.exists(path):
            with open(path, 'r', encoding='utf-8') as f:
                return f.read().strip()
    return None


def main():
    """Compare every OCR report with its ground truth and dump the results.

    For each sub-folder of ``BASE_DIR`` that has a non-empty ground-truth
    file, every ``REPORT_*.md`` file is scored against it with
    ``calculate_accuracy``. One record per report with non-empty OCR text is
    collected, and all records are written to ``full_comparison.json`` in the
    current working directory. Progress is printed to stdout.

    Nothing is written when ``BASE_DIR`` is not an existing directory.
    """
    final_results = []
    # isdir (not exists): a regular file named like BASE_DIR would otherwise
    # pass the check and make os.listdir() raise.
    if not os.path.isdir(BASE_DIR):
        print(f"[!] Папка {BASE_DIR} не найдена!")
        return
    # os.listdir() returns entries in arbitrary order; sort so the console
    # output and the JSON file are reproducible between runs and machines.
    students = sorted(
        d for d in os.listdir(BASE_DIR)
        if os.path.isdir(os.path.join(BASE_DIR, d)))
    for student in students:
        folder = os.path.join(BASE_DIR, student)
        # 1. Look for the ground truth (gt.txt or one of its aliases).
        gt_content = _load_ground_truth(folder)
        if not gt_content:
            print(f" [-] {student}: нет gt.txt, пропускаем")
            continue
        # 2. Collect ALL reports in the folder.
        reports = sorted(
            f for f in os.listdir(folder)
            if f.startswith("REPORT_") and f.endswith(".md"))
        for r_file in reports:
            model_name = identify_model(r_file)
            ocr_text = extract_ocr_from_md(os.path.join(folder, r_file))
            if not ocr_text:
                # Reports without extractable OCR text are skipped silently.
                continue
            acc = calculate_accuracy(gt_content, ocr_text)
            final_results.append({
                "surname": student,
                "model": model_name,
                "accuracy": acc,
                "ocr": ocr_text,
                "original": gt_content,
                "file": r_file
            })
            print(f" [+] {student} | {model_name}: {acc}%")
    # Save everything into a single file.
    with open("full_comparison.json", "w", encoding="utf-8") as f:
        json.dump(final_results, f, ensure_ascii=False, indent=4)
    print("\nГотово! Сформирован 'full_comparison.json' с данными по всем моделям.")


# Run the evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    main()