add counts to each student + OCR score

This commit is contained in:
chubinho 2026-04-19 22:25:44 +03:00
parent 811e4d3ffa
commit dcc36f8f26
257 changed files with 12550 additions and 93 deletions

113
evaluate_ocr.py Normal file
View file

@ -0,0 +1,113 @@
import os
import json
import re
import difflib
# Settings
# Candidate file names for a student's ground-truth (reference) text;
# main() tries them in this order and uses the first one that exists.
POSSIBLE_GT_NAMES = ["gt.txt", "ground_truth.txt", "ref.txt"]
# Root directory scanned by main(); each sub-directory is treated as one student.
BASE_DIR = "photo"
def calculate_accuracy(text1, text2):
    """Return the similarity of two texts as a percentage.

    Both texts are whitespace-normalized first (every run of whitespace
    collapses to a single space, leading/trailing whitespace is dropped) and
    then compared character by character with ``difflib.SequenceMatcher``.

    Args:
        text1: First text (e.g. the reference transcription).
        text2: Second text (e.g. the OCR output).

    Returns:
        The match ratio scaled to 0.0-100.0 and rounded to one decimal place.
        0.0 is returned when either text is ``None``, empty, or consists only
        of whitespace.
    """
    t1 = " ".join(text1.split()) if text1 else ""
    t2 = " ".join(text2.split()) if text2 else ""
    # Check emptiness *after* normalization so whitespace-only input is
    # treated the same way as an empty string.
    if not t1 or not t2:
        return 0.0
    # autojunk=False: the default heuristic treats characters that occur in
    # more than ~1% of a 200+ character sequence as junk, which makes the
    # ratio of ordinary long texts wildly (and arbitrarily) too low.
    matcher = difflib.SequenceMatcher(None, t1, t2, autojunk=False)
    return round(matcher.ratio() * 100, 1)
def extract_ocr_from_md(file_path):
    """Extract the text stored under the ``### [БЛОК 1]`` heading of a report.

    The captured text starts on the line after the heading and ends right
    before the next line that begins with ``###`` or ``---``, or at the end of
    the file when Block 1 is the last section.

    Args:
        file_path: Path to a UTF-8 encoded Markdown report.

    Returns:
        The stripped Block 1 text, or ``""`` when the file cannot be read or
        decoded, or when it contains no Block 1 heading.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError):
        # Best effort: an unreadable or undecodable report (UnicodeDecodeError
        # is a ValueError) is treated as having no OCR text.
        return ""
    # Look for the text in Block 1. The lazy capture stops at the first
    # delimiter; \Z lets a trailing Block 1 (no delimiter after it) match too.
    match = re.search(
        r"### \[БЛОК 1\].*?\n(.*?)(?=\n###|\n---|\Z)", content, re.DOTALL)
    return match.group(1).strip() if match else ""
def identify_model(filename):
    """Return a display name for the model that produced a report file.

    The lower-cased file name is searched for known model markers; the first
    marker found, in table order, decides the result.

    Args:
        filename: Report file name, e.g. ``REPORT_gpt-4o.md``.

    Returns:
        The display name of the recognized model. If no marker matches, the
        file name with a leading ``REPORT_`` and a trailing ``.md`` removed.
    """
    # Order matters: a more specific marker must come before any marker that
    # is a prefix of it (e.g. "claude-opus-4-6" before "claude-opus-4").
    known_models = (
        # OpenAI
        ("gpt-5-pro", "GPT-5 Pro"),
        ("gpt-5.4", "GPT-5.4"),
        ("gpt-4o", "GPT-4o"),
        # Anthropic
        ("claude-opus-4-6", "Claude 4.6 Opus"),
        ("claude-opus-4", "Claude 4 Opus"),
        # Alibaba / Qwen
        ("qwen3.5-122b", "Qwen 3.5 (122B)"),
        ("qwen3.6", "Qwen 3.6"),
    )
    fn = filename.lower()
    for marker, label in known_models:
        if marker in fn:
            return label

    # Nothing matched: return the bare identifier from the file name. Only the
    # leading prefix and trailing extension are stripped, so identifiers that
    # happen to contain "REPORT_" or ".md" in the middle stay intact.
    clean_name = filename
    if clean_name.startswith("REPORT_"):
        clean_name = clean_name[len("REPORT_"):]
    if clean_name.endswith(".md"):
        clean_name = clean_name[:-len(".md")]
    return clean_name
def _find_ground_truth(folder):
    """Return the stripped text of the first ground-truth file in *folder*.

    Candidate names are tried in POSSIBLE_GT_NAMES order. Returns ``None``
    when none of them exists as a regular file.
    """
    for name in POSSIBLE_GT_NAMES:
        path = os.path.join(folder, name)
        # isfile (not exists): a directory with a matching name must not be
        # opened as if it were the reference text.
        if os.path.isfile(path):
            with open(path, 'r', encoding='utf-8') as f:
                return f.read().strip()
    return None


def main():
    """Score every OCR report of every student and dump the results to JSON.

    For each sub-directory of BASE_DIR that has a non-empty ground-truth file,
    every ``REPORT_*.md`` file is parsed, its Block 1 text is compared with the
    ground truth, and one record per report is collected. All records are
    written to ``full_comparison.json`` in the current working directory.
    Students without ground truth and reports without OCR text are skipped.
    """
    final_results = []
    if not os.path.isdir(BASE_DIR):
        print(f"[!] Папка {BASE_DIR} не найдена!")
        return

    # os.listdir returns entries in arbitrary order; sort so that the console
    # log and the JSON output are reproducible between runs and machines.
    students = sorted(
        d for d in os.listdir(BASE_DIR)
        if os.path.isdir(os.path.join(BASE_DIR, d)))

    for student in students:
        folder = os.path.join(BASE_DIR, student)

        # 1. Look for the reference text (gt.txt or one of its aliases).
        gt_content = _find_ground_truth(folder)
        if not gt_content:
            print(f" [-] {student}: нет gt.txt, пропускаем")
            continue

        # 2. Collect ALL reports in the student's folder.
        reports = sorted(
            f for f in os.listdir(folder)
            if f.startswith("REPORT_") and f.endswith(".md"))

        for r_file in reports:
            ocr_text = extract_ocr_from_md(os.path.join(folder, r_file))
            if not ocr_text:
                # Report is unreadable or has no Block 1 text: nothing to score.
                continue
            model_name = identify_model(r_file)
            acc = calculate_accuracy(gt_content, ocr_text)
            final_results.append({
                "surname": student,
                "model": model_name,
                "accuracy": acc,
                "ocr": ocr_text,
                "original": gt_content,
                "file": r_file,
            })
            print(f" [+] {student} | {model_name}: {acc}%")

    # Save everything into a single file.
    with open("full_comparison.json", "w", encoding="utf-8") as f:
        json.dump(final_results, f, ensure_ascii=False, indent=4)
    print("\nГотово! Сформирован 'full_comparison.json' с данными по всем моделям.")
# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()