|
__pycache__
|
mind2web
|
2026-04-23 00:04:11 +03:00 |
|
runs_dataset
|
252 tests
|
2026-04-23 23:30:20 +03:00 |
|
summaries
|
252 tests
|
2026-04-23 23:30:20 +03:00 |
|
.DS_Store
|
252 tests
|
2026-04-23 23:30:20 +03:00 |
|
agent_parser.py
|
mind2web
|
2026-04-23 00:04:11 +03:00 |
|
batch_run.log
|
252 tests
|
2026-04-23 23:30:20 +03:00 |
|
big_json_loader.py
|
mind2web
|
2026-04-23 00:04:11 +03:00 |
|
comparator.py
|
mind2web
|
2026-04-23 00:04:11 +03:00 |
|
comparator_loose.py
|
mind2web
|
2026-04-23 00:04:11 +03:00 |
|
dataset_loader.py
|
mind2web
|
2026-04-23 00:04:11 +03:00 |
|
final_answer_loader.py
|
mind2web
|
2026-04-23 00:04:11 +03:00 |
|
gold_parser.py
|
mind2web
|
2026-04-23 00:04:11 +03:00 |
|
llm_judge.py
|
mind2web
|
2026-04-23 00:04:11 +03:00 |
|
run_batch_eval.py
|
45 tests
|
2026-04-23 01:21:29 +03:00 |
|
run_eval_v2.py
|
mind2web
|
2026-04-23 00:04:11 +03:00 |
|
semantic_comparator.py
|
mind2web
|
2026-04-23 00:04:11 +03:00 |