add counts to each student + OCR score
This commit is contained in:
parent
811e4d3ffa
commit
dcc36f8f26
257 changed files with 12550 additions and 93 deletions
113
evaluate_ocr.py
Normal file
113
evaluate_ocr.py
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
import os
|
||||
import json
|
||||
import re
|
||||
import difflib
|
||||
|
||||
# Settings
|
||||
# File names under which a student's reference (ground-truth) transcript may
# be stored; they are tried in this order and the first one found is used.
POSSIBLE_GT_NAMES = ["gt.txt", "ground_truth.txt", "ref.txt"]
# Root directory to scan; every sub-directory is treated as one student.
BASE_DIR = "photo"
|
||||
|
||||
|
||||
def calculate_accuracy(text1, text2):
    """Return the character-level similarity of two texts as a percentage.

    Both texts are whitespace-normalised first (every run of whitespace is
    collapsed to a single space and the ends are trimmed), then compared with
    ``difflib.SequenceMatcher``. The ratio is scaled to 0-100 and rounded to
    one decimal place.

    Args:
        text1: First text, e.g. the ground-truth transcript.
        text2: Second text, e.g. the OCR output.

    Returns:
        ``0.0`` when either text is ``None``, empty or whitespace-only;
        otherwise the similarity percentage as a float.
    """
    if not text1 or not text2:
        return 0.0

    reference = " ".join(text1.split())
    candidate = " ".join(text2.split())
    # A whitespace-only text carries no content: treat it like an empty one
    # instead of letting two blank strings score 100%.
    if not reference or not candidate:
        return 0.0

    # autojunk must be off: with the default heuristic, any character that
    # makes up more than ~1% of a 200+ character text is ignored as "popular",
    # which removes almost every letter of a natural-language transcript and
    # drives the score towards zero.
    matcher = difflib.SequenceMatcher(None, reference, candidate, autojunk=False)
    return round(matcher.ratio() * 100, 1)
|
||||
|
||||
|
||||
def extract_ocr_from_md(file_path):
    """Extract the text of the "[БЛОК 1]" section from a Markdown report.

    The section starts after the first occurrence of ``### [БЛОК 1]`` (the
    rest of that heading line is skipped) and ends right before the next
    line that begins with ``###`` or ``---``.

    Args:
        file_path: Path of the UTF-8 encoded report file.

    Returns:
        The section text with surrounding whitespace stripped, or ``""`` when
        the file cannot be read or decoded, or when no such section followed
        by a terminator line is found.
    """
    # Only the read itself is guarded; reading is best-effort, so an
    # unreadable or undecodable report simply yields no text.
    # UnicodeDecodeError is a ValueError subclass, so both are covered.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, ValueError):
        return ""

    # Look for the text in Block 1.
    match = re.search(
        r"### \[БЛОК 1\].*?\n(.*?)(?=\n###|\n---)", content, re.DOTALL)
    return match.group(1).strip() if match else ""
|
||||
|
||||
|
||||
def identify_model(filename):
    """Derive a display name for the model from a report file name.

    The file name is matched case-insensitively against a list of known
    model markers; the first marker found decides the result. If none
    matches, the file name itself is returned with every ``REPORT_`` and
    ``.md`` substring removed.
    """
    lowered = filename.lower()

    # Checked top to bottom, so a more specific marker must come before any
    # marker that is a prefix of it (e.g. "claude-opus-4-6" before
    # "claude-opus-4").
    known_models = (
        # OpenAI
        ("gpt-5-pro", "GPT-5 Pro"),
        ("gpt-5.4", "GPT-5.4"),
        ("gpt-4o", "GPT-4o"),
        # Claude
        ("claude-opus-4-6", "Claude 4.6 Opus"),
        ("claude-opus-4", "Claude 4 Opus"),
        # Alibaba / Qwen
        ("qwen3.5-122b", "Qwen 3.5 (122B)"),
        ("qwen3.6", "Qwen 3.6"),
    )
    for marker, display_name in known_models:
        if marker in lowered:
            return display_name

    # Nothing matched: fall back to the bare identifier from the file name.
    return filename.replace("REPORT_", "").replace(".md", "")
|
||||
|
||||
|
||||
def _read_ground_truth(folder):
    """Return the stripped text of the first ground-truth file in *folder*, or None."""
    for name in POSSIBLE_GT_NAMES:
        path = os.path.join(folder, name)
        # isfile (not exists) so a directory with a matching name is skipped
        # instead of making open() fail.
        if os.path.isfile(path):
            with open(path, "r", encoding="utf-8") as f:
                return f.read().strip()
    return None


def main():
    """Score every OCR report against its student's ground truth.

    Each sub-directory of ``BASE_DIR`` is treated as one student. A student
    without a non-empty ground-truth file is skipped with a message. For each
    ``REPORT_*.md`` file in the student's folder the OCR text is extracted
    and compared with the ground truth; reports with no extractable text are
    skipped silently. All results are written to ``full_comparison.json`` in
    the current working directory, one record per scored report, and a
    progress line is printed for each.

    Students and reports are processed in sorted order so that the console
    output and the JSON file are reproducible between runs (``os.listdir``
    returns entries in arbitrary order).
    """
    final_results = []

    # isdir rather than exists: a plain file named like BASE_DIR must not
    # reach os.listdir, which would raise.
    if not os.path.isdir(BASE_DIR):
        print(f"[!] Папка {BASE_DIR} не найдена!")
        return

    students = sorted(
        d for d in os.listdir(BASE_DIR)
        if os.path.isdir(os.path.join(BASE_DIR, d))
    )

    for student in students:
        folder = os.path.join(BASE_DIR, student)

        # 1. Find the reference transcript.
        gt_content = _read_ground_truth(folder)
        if not gt_content:
            print(f" [-] {student}: нет gt.txt, пропускаем")
            continue

        # 2. Collect ALL reports in the folder.
        reports = sorted(
            f for f in os.listdir(folder)
            if f.startswith("REPORT_") and f.endswith(".md")
        )

        for r_file in reports:
            model_name = identify_model(r_file)
            ocr_text = extract_ocr_from_md(os.path.join(folder, r_file))
            if not ocr_text:
                continue

            acc = calculate_accuracy(gt_content, ocr_text)
            final_results.append({
                "surname": student,
                "model": model_name,
                "accuracy": acc,
                "ocr": ocr_text,
                "original": gt_content,
                "file": r_file,
            })
            print(f" [+] {student} | {model_name}: {acc}%")

    # Save everything into a single file.
    with open("full_comparison.json", "w", encoding="utf-8") as f:
        json.dump(final_results, f, ensure_ascii=False, indent=4)

    print("\nГотово! Сформирован 'full_comparison.json' с данными по всем моделям.")
|
||||
|
||||
|
||||
# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue