113 lines
3.4 KiB
Python
113 lines
3.4 KiB
Python
import os
|
||
import json
|
||
import re
|
||
import difflib
|
||
|
||
# Settings
|
||
# File names tried, in this order, when looking for the ground-truth text
# inside a student's folder; the first one that exists is used.
POSSIBLE_GT_NAMES = ["gt.txt", "ground_truth.txt", "ref.txt"]
|
||
# Root directory whose immediate subdirectories are scanned by main().
# It is a relative path, so it is resolved against the current working
# directory.
BASE_DIR = "photo"
|
||
|
||
|
||
def calculate_accuracy(text1, text2):
    """Return the similarity of two texts as a percentage (0.0-100.0).

    Both texts are whitespace-normalized (every run of whitespace collapsed
    to a single space, both ends trimmed) and then compared character by
    character with ``difflib.SequenceMatcher``. The resulting ratio is
    scaled to percent and rounded to one decimal place.

    Args:
        text1: First text (the reference, as used by ``main``).
        text2: Second text (the OCR output, as used by ``main``).

    Returns:
        A float between 0.0 and 100.0. 0.0 is returned when either text is
        ``None``, empty or consists of whitespace only.
    """
    if not text1 or not text2:
        return 0.0

    t1 = " ".join(text1.split())
    t2 = " ".join(text2.split())
    # Whitespace-only input collapses to "". Two empty strings would score
    # 100 %, contradicting the "empty input scores 0" rule above.
    if not t1 or not t2:
        return 0.0

    # autojunk=False: by default SequenceMatcher treats any item that makes
    # up more than ~1 % of a sequence of 200+ items as "popular" and drops
    # it from matching. At character level that is almost every letter of a
    # natural-language text, which makes the score collapse for long texts.
    # The price is quadratic worst-case time, acceptable for page-sized text.
    matcher = difflib.SequenceMatcher(None, t1, t2, autojunk=False)
    return round(matcher.ratio() * 100, 1)
|
||
|
||
|
||
def extract_ocr_from_md(file_path):
    """Return the text of the "БЛОК 1" section of a Markdown report.

    The section starts after the line containing the first occurrence of
    ``### [БЛОК 1]`` (the rest of that line is skipped) and runs up to the
    next line that starts with ``###`` or ``---``. If no such line follows,
    the section runs to the end of the file.

    Args:
        file_path: Path to a UTF-8 encoded Markdown file.

    Returns:
        The section text with surrounding whitespace stripped, or ``""``
        when the file cannot be read or decoded, or contains no
        ``### [БЛОК 1]`` heading.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Best effort: an unreadable or wrongly encoded report is treated
        # as having no OCR text. Only I/O and decoding failures are caught,
        # so programming errors and KeyboardInterrupt still propagate.
        return ""

    # Look for the text inside block 1. The trailing \Z alternative lets
    # the section be the last one in the file; the lazy group still stops
    # at the earliest "\n###" / "\n---" when one is present.
    match = re.search(
        r"### \[БЛОК 1\].*?\n(.*?)(?=\n###|\n---|\Z)", content, re.DOTALL)
    return match.group(1).strip() if match else ""
|
||
|
||
|
||
def identify_model(filename):
    """Return a display name for the model named in a report file name.

    The file name is lower-cased and searched for known model markers; the
    first marker found decides the result. When none is found, the file
    name itself is returned with a leading ``REPORT_`` and a trailing
    ``.md`` removed.

    Args:
        filename: Report file name, e.g. ``REPORT_gpt-4o.md``.

    Returns:
        A human-readable model name, or the cleaned file name as a fallback.
    """
    fn = filename.lower()

    # Checked top to bottom. A more specific marker must come before a
    # marker that is a substring of it ("claude-opus-4-6" before
    # "claude-opus-4"), otherwise the shorter one would always win.
    known_models = (
        # OpenAI
        ("gpt-5-pro", "GPT-5 Pro"),
        ("gpt-5.4", "GPT-5.4"),
        ("gpt-4o", "GPT-4o"),
        # Claude
        ("claude-opus-4-6", "Claude 4.6 Opus"),
        ("claude-opus-4", "Claude 4 Opus"),
        # Alibaba / Qwen
        ("qwen3.5-122b", "Qwen 3.5 (122B)"),
        ("qwen3.6", "Qwen 3.6"),
    )
    for marker, label in known_models:
        if marker in fn:
            return label

    # Nothing matched: return the bare identifier from the file name.
    # Only the prefix and the extension are removed; str.replace() would
    # also delete "REPORT_" or ".md" from the middle of the identifier.
    clean_name = filename
    if clean_name.startswith("REPORT_"):
        clean_name = clean_name[len("REPORT_"):]
    if clean_name.endswith(".md"):
        clean_name = clean_name[:-len(".md")]
    return clean_name
|
||
|
||
|
||
def _read_ground_truth(folder):
    """Return the stripped text of the first ground-truth file in *folder*.

    Candidate names are tried in the order given by POSSIBLE_GT_NAMES.
    Returns None when none of them exists as a regular file.
    """
    for name in POSSIBLE_GT_NAMES:
        path = os.path.join(folder, name)
        # isfile rather than exists: a directory that happens to carry one
        # of the candidate names must not be opened as a text file.
        if os.path.isfile(path):
            with open(path, "r", encoding="utf-8") as f:
                return f.read().strip()
    return None


def main():
    """Compare every report with its ground truth and save the results.

    For each subdirectory of BASE_DIR the ground-truth text is loaded, every
    ``REPORT_*.md`` file in that subdirectory is parsed, and the OCR text
    found in it is scored against the ground truth. One record per scored
    report is written to ``full_comparison.json`` in the current working
    directory, and a progress line is printed for each folder and report.

    Folders without a non-empty ground-truth file and reports without
    extractable OCR text are skipped.
    """
    final_results = []

    if not os.path.isdir(BASE_DIR):
        print(f"[!] Папка {BASE_DIR} не найдена!")
        return

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # console log and the JSON output reproducible between runs.
    students = sorted(
        d for d in os.listdir(BASE_DIR)
        if os.path.isdir(os.path.join(BASE_DIR, d)))

    for student in students:
        folder = os.path.join(BASE_DIR, student)

        # 1. Locate the reference text (gt.txt or one of its aliases).
        gt_content = _read_ground_truth(folder)
        if not gt_content:
            print(f" [-] {student}: нет gt.txt, пропускаем")
            continue

        # 2. Collect ALL reports in the folder.
        reports = sorted(
            f for f in os.listdir(folder)
            if f.startswith("REPORT_") and f.endswith(".md"))

        for r_file in reports:
            model_name = identify_model(r_file)
            ocr_text = extract_ocr_from_md(os.path.join(folder, r_file))
            if not ocr_text:
                # Nothing to score: unreadable report or no OCR section.
                continue

            acc = calculate_accuracy(gt_content, ocr_text)
            final_results.append({
                "surname": student,
                "model": model_name,
                "accuracy": acc,
                "ocr": ocr_text,
                "original": gt_content,
                "file": r_file,
            })
            print(f" [+] {student} | {model_name}: {acc}%")

    # Save everything into a single file.
    with open("full_comparison.json", "w", encoding="utf-8") as f:
        json.dump(final_results, f, ensure_ascii=False, indent=4)

    print("\nГотово! Сформирован 'full_comparison.json' с данными по всем моделям.")
|
||
|
||
|
||
# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|