"""Score OCR text found in per-student model reports against ground truth.

Expected layout: ``BASE_DIR/<student>/`` holds one ground-truth text file
(any name from ``POSSIBLE_GT_NAMES``) and any number of ``REPORT_*.md``
files. The OCR text is taken from the ``### [БЛОК 1]`` section of each
report, and all scores are written to ``full_comparison.json``.
"""
import difflib
import json
import os
import re

# Settings
POSSIBLE_GT_NAMES = ["gt.txt", "ground_truth.txt", "ref.txt"]
BASE_DIR = "photo"

# Body of the "### [БЛОК 1]" section: everything after the header line up to
# the next "###" heading, a "---" rule, or the end of the file (so a report
# whose Block 1 is the last section is still parsed).
_BLOCK1_RE = re.compile(
    r"### \[БЛОК 1\].*?\n(.*?)(?=\n###|\n---|\Z)", re.DOTALL)

# (filename marker, display name). Order matters: the first marker found in
# the lowercased filename wins, so more specific markers must come before
# their prefixes (e.g. "claude-opus-4-6" before "claude-opus-4").
_KNOWN_MODELS = (
    # OpenAI
    ("gpt-5-pro", "GPT-5 Pro"),
    ("gpt-5.4", "GPT-5.4"),
    ("gpt-4o", "GPT-4o"),
    ("claude-opus-4-6", "Claude 4.6 Opus"),
    ("claude-opus-4", "Claude 4 Opus"),
    # Alibaba / Qwen
    ("qwen3.5-122b", "Qwen 3.5 (122B)"),
    ("qwen3.6", "Qwen 3.6"),
)

_REPORT_PREFIX = "REPORT_"
_REPORT_SUFFIX = ".md"


def calculate_accuracy(text1: str, text2: str) -> float:
    """Return the similarity of two texts as a percentage (0.0-100.0).

    Runs of whitespace are collapsed to single spaces before comparison.
    If either text is empty after that normalisation the result is 0.0.
    The value is ``difflib.SequenceMatcher.ratio() * 100`` rounded to one
    decimal place.
    """
    t1 = " ".join(text1.split()) if text1 else ""
    t2 = " ".join(text2.split()) if text2 else ""
    if not t1 or not t2:
        return 0.0
    # autojunk=False: the default heuristic marks frequent items as junk once
    # a sequence reaches 200 items, which distorts character-level ratios on
    # ordinary long texts.
    matcher = difflib.SequenceMatcher(None, t1, t2, autojunk=False)
    return round(matcher.ratio() * 100, 1)


def extract_ocr_from_md(file_path: str) -> str:
    """Return the stripped text of the ``### [БЛОК 1]`` section of a report.

    Returns an empty string when the file cannot be read or decoded as
    UTF-8, or when it contains no such section.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Best effort: an unreadable report is treated as having no OCR text.
        return ""
    match = _BLOCK1_RE.search(content)
    return match.group(1).strip() if match else ""


def identify_model(filename: str) -> str:
    """Return a display name for the model encoded in a report filename.

    The filename is matched case-insensitively against the known markers.
    If none matches, the filename itself is returned with a leading
    ``REPORT_`` and a trailing ``.md`` removed.
    """
    fn = filename.lower()
    for marker, display_name in _KNOWN_MODELS:
        if marker in fn:
            return display_name

    # Nothing matched: fall back to the bare identifier from the filename.
    clean_name = filename
    if clean_name.startswith(_REPORT_PREFIX):
        clean_name = clean_name[len(_REPORT_PREFIX):]
    if clean_name.endswith(_REPORT_SUFFIX):
        clean_name = clean_name[:-len(_REPORT_SUFFIX)]
    return clean_name


def main() -> None:
    """Score every report under ``BASE_DIR`` and write the combined JSON.

    Each sub-directory of ``BASE_DIR`` is treated as one student. Students
    without a non-empty ground-truth file are skipped with a message, and
    reports with no extractable OCR text are skipped silently. Results are
    written to ``full_comparison.json`` in the current working directory.
    """
    final_results = []

    if not os.path.exists(BASE_DIR):
        print(f"[!] Папка {BASE_DIR} не найдена!")
        return

    # os.listdir() order is arbitrary; sort so the output is reproducible.
    students = sorted(
        d for d in os.listdir(BASE_DIR)
        if os.path.isdir(os.path.join(BASE_DIR, d)))

    for student in students:
        folder = os.path.join(BASE_DIR, student)

        # 1. Look for the ground-truth file; the first existing name wins.
        gt_content = None
        for name in POSSIBLE_GT_NAMES:
            path = os.path.join(folder, name)
            if os.path.isfile(path):
                with open(path, 'r', encoding='utf-8') as f:
                    gt_content = f.read().strip()
                break

        if not gt_content:
            print(f" [-] {student}: нет gt.txt, пропускаем")
            continue

        # 2. Collect ALL reports in the folder.
        reports = sorted(
            f for f in os.listdir(folder)
            if f.startswith(_REPORT_PREFIX) and f.endswith(_REPORT_SUFFIX))

        for r_file in reports:
            model_name = identify_model(r_file)
            ocr_text = extract_ocr_from_md(os.path.join(folder, r_file))
            if not ocr_text:
                continue

            acc = calculate_accuracy(gt_content, ocr_text)
            final_results.append({
                "surname": student,
                "model": model_name,
                "accuracy": acc,
                "ocr": ocr_text,
                "original": gt_content,
                "file": r_file,
            })
            print(f" [+] {student} | {model_name}: {acc}%")

    # Save everything into a single file.
    with open("full_comparison.json", "w", encoding="utf-8") as f:
        json.dump(final_results, f, ensure_ascii=False, indent=4)

    print("\nГотово! Сформирован 'full_comparison.json' с данными по всем моделям.")


if __name__ == "__main__":
    main()