suite_config dict | split stringclasses 1
value | results listlengths 1 11 | submission dict |
|---|---|---|---|
{
"name": "asta-bench",
"version": "1.0.0",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "ArxivDIGESTables_Clean_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_validation",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "PaperFindingBench_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_validation",
"path": "astabench/paper_finder_litqa2_validation",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_validation",
"path": "astabench/e2e_discovery_hard_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_validation",
"path": "astabench/super_validation",
"primary_metric": "entrypoint/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_validation",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_validation",
"weight": 0.5
}
]
},
{
"name": "test",
"tasks": [
{
"name": "PaperFindingBench_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_test",
"path": "astabench/paper_finder_litqa2_test",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "ArxivDIGESTables_Clean_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_test",
"path": "astabench/core_bench_test",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_test",
"path": "astabench/e2e_discovery_hard_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_test",
"path": "astabench/super_test",
"primary_metric": "output_match/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_test",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_test",
"weight": 0.5
}
]
}
]
} | validation | [
{
"task_name": "PaperFindingBench_validation",
"eval_spec": {
"solver": "astabench/solvers/search/paper_finder.py@ai2i_paper_finder",
"solver_args": "{\"base_url\": \"http://35.247.123.160:8000\"}",
"model": "openai/gpt-4o-mini",
"model_args": "{}",
"task_args": "{\"with_search... | {
"submit_time": "2025-08-07T19:06:21.862673Z",
"username": "Ai2",
"agent_name": "Asta Paper Finder",
"agent_description": null,
"agent_url": null,
"logs_url": null,
"logs_url_public": "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/aryeh_tiktinsky_ai2_Asta_Paper_Finder_2025-08-07T19-06-21",
"summary_url": null,
"openness": "Open source & closed weights",
"tool_usage": "Custom interface"
} |
{
"name": "asta-bench",
"version": "1.0.0",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "ArxivDIGESTables_Clean_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_validation",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "PaperFindingBench_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_validation",
"path": "astabench/paper_finder_litqa2_validation",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_validation",
"path": "astabench/e2e_discovery_hard_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_validation",
"path": "astabench/super_validation",
"primary_metric": "entrypoint/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_validation",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_validation",
"weight": 0.5
}
]
},
{
"name": "test",
"tasks": [
{
"name": "PaperFindingBench_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_test",
"path": "astabench/paper_finder_litqa2_test",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "ArxivDIGESTables_Clean_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_test",
"path": "astabench/core_bench_test",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_test",
"path": "astabench/e2e_discovery_hard_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_test",
"path": "astabench/super_test",
"primary_metric": "output_match/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_test",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_test",
"weight": 0.5
}
]
}
]
} | validation | [
{
"task_name": "PaperFindingBench_validation",
"eval_spec": {
"solver": "/home/aryeht/PycharmProjects/asta-bench/astabench/solvers/search/youcom_search.py@youcom_solver",
"solver_args": "{}",
"model": "openai/gpt-4o-mini",
"model_args": "{}",
"task_args": "{\"with_search_tools\... | {
"submit_time": "2025-08-07T18:54:02.182157Z",
"username": "Ai2",
"agent_name": "You.com Search API",
"agent_description": null,
"agent_url": null,
"logs_url": null,
"logs_url_public": "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/aryeh_tiktinsky_ai2_You.com_Search_API_2025-08-07T18-54-02",
"summary_url": null,
"openness": "Closed source & API available",
"tool_usage": "Fully custom"
} |
{
"name": "asta-bench",
"version": "1.0.0",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "ArxivDIGESTables_Clean_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_validation",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "PaperFindingBench_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_validation",
"path": "astabench/paper_finder_litqa2_validation",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_validation",
"path": "astabench/e2e_discovery_hard_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_validation",
"path": "astabench/super_validation",
"primary_metric": "entrypoint/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_validation",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_validation",
"weight": 0.5
}
]
},
{
"name": "test",
"tasks": [
{
"name": "PaperFindingBench_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_test",
"path": "astabench/paper_finder_litqa2_test",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "ArxivDIGESTables_Clean_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_test",
"path": "astabench/core_bench_test",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_test",
"path": "astabench/e2e_discovery_hard_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_test",
"path": "astabench/super_test",
"primary_metric": "output_match/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_test",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_test",
"weight": 0.5
}
]
}
]
} | validation | [
{
"task_name": "SUPER_Expert_validation",
"eval_spec": {
"solver": "astabench/solvers/code_agent/agent.py@code_agent",
"solver_args": "{\"json_output\": 1, \"max_context_tokens\": 1000000, \"max_tries\": 200}",
"model": "openai/gpt-5-mini-2025-08-07",
"model_args": "{}",
"task_... | {
"submit_time": "2025-08-14T19:05:15.098359Z",
"username": "Ai2",
"agent_name": "Asta Code",
"agent_description": null,
"agent_url": "https://github.com/allenai/asta-bench",
"logs_url": null,
"logs_url_public": "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/miked-ai_Asta_Code_GPT-5-mini_2025-08-14T19-05-15",
"summary_url": null,
"openness": "Open source & closed weights",
"tool_usage": "Custom interface"
} |
{
"name": "asta-bench",
"version": "1.0.0",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "ArxivDIGESTables_Clean_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_validation",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "PaperFindingBench_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_validation",
"path": "astabench/paper_finder_litqa2_validation",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_validation",
"path": "astabench/e2e_discovery_hard_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_validation",
"path": "astabench/super_validation",
"primary_metric": "entrypoint/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_validation",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_validation",
"weight": 0.5
}
]
},
{
"name": "test",
"tasks": [
{
"name": "PaperFindingBench_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_test",
"path": "astabench/paper_finder_litqa2_test",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "ArxivDIGESTables_Clean_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_test",
"path": "astabench/core_bench_test",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_test",
"path": "astabench/e2e_discovery_hard_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_test",
"path": "astabench/super_test",
"primary_metric": "output_match/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_test",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_test",
"weight": 0.5
}
]
}
]
} | validation | [
{
"task_name": "SUPER_Expert_validation",
"eval_spec": {
"solver": "astabench/solvers/code_agent/agent.py@code_agent",
"solver_args": "{\"json_output\": 1, \"max_context_tokens\": 1000000, \"max_tries\": 200}",
"model": "openai/gpt-5-2025-08-07",
"model_args": "{}",
"task_args"... | {
"submit_time": "2025-08-14T19:06:15.866442Z",
"username": "Ai2",
"agent_name": "Asta Code",
"agent_description": null,
"agent_url": "https://github.com/allenai/asta-bench",
"logs_url": null,
"logs_url_public": "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/miked-ai_Asta_Code_GPT-5_2025-08-14T19-06-15",
"summary_url": null,
"openness": "Open source & closed weights",
"tool_usage": "Custom interface"
} |
{
"name": "asta-bench",
"version": "1.0.0",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "ArxivDIGESTables_Clean_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_validation",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "PaperFindingBench_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_validation",
"path": "astabench/paper_finder_litqa2_validation",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_validation",
"path": "astabench/e2e_discovery_hard_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_validation",
"path": "astabench/super_validation",
"primary_metric": "entrypoint/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_validation",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_validation",
"weight": 0.5
}
]
},
{
"name": "test",
"tasks": [
{
"name": "PaperFindingBench_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_test",
"path": "astabench/paper_finder_litqa2_test",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "ArxivDIGESTables_Clean_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_test",
"path": "astabench/core_bench_test",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_test",
"path": "astabench/e2e_discovery_hard_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_test",
"path": "astabench/super_test",
"primary_metric": "output_match/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_test",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_test",
"weight": 0.5
}
]
}
]
} | validation | [
{
"task_name": "SUPER_Expert_validation",
"eval_spec": {
"solver": "agent_baselines/solvers/react/basic_agent.py@instantiated_basic_agent",
"solver_args": "{\"max_steps\": 100, \"with_report_editor\": 0, \"with_search_tools\": 0, \"with_stateful_python\": 0, \"with_table_editor\": 0, \"with_thin... | {
"submit_time": "2025-08-26T06:45:15.968802Z",
"username": "Ai2",
"agent_name": "ReAct",
"agent_description": "Simple ReAct agent using built-in LLM tool-calling. This variant uses Anthropic Claude 3.5 Haiku as the base model.",
"agent_url": "https://github.com/allenai/agent-baselines",
"logs_url": null,
"logs_url_public": "hf://datasets/allenai/asta-bench-submissions/1.0.0/validation/miked-ai_ReAct_2025-08-26T06-45-15",
"summary_url": null,
"openness": "Open source & closed weights",
"tool_usage": "Standard"
} |
{
"name": "asta-bench",
"version": "1.0.0",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "ArxivDIGESTables_Clean_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_validation",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "PaperFindingBench_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_validation",
"path": "astabench/paper_finder_litqa2_validation",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_validation",
"path": "astabench/e2e_discovery_hard_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_validation",
"path": "astabench/super_validation",
"primary_metric": "entrypoint/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_validation",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_validation",
"weight": 0.5
}
]
},
{
"name": "test",
"tasks": [
{
"name": "PaperFindingBench_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_test",
"path": "astabench/paper_finder_litqa2_test",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "ArxivDIGESTables_Clean_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_test",
"path": "astabench/core_bench_test",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_test",
"path": "astabench/e2e_discovery_hard_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_test",
"path": "astabench/super_test",
"primary_metric": "output_match/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_test",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_test",
"weight": 0.5
}
]
}
]
} | validation | [
{
"task_name": "E2E_Bench_Hard_validation",
"eval_spec": {
"solver": "astabench/evals/e2e_discovery/solvers/codescientist/codescientist_cached.py@codescientist_cached_solver",
"solver_args": "{}",
"model": "openai/gpt-4o-mini",
"model_args": "{}",
"task_args": "{}",
"revi... | {
"submit_time": "2025-07-10T18:12:16.960575Z",
"username": "Ai2",
"agent_name": "Asta CodeScientist",
"agent_description": null,
"agent_url": null,
"logs_url": null,
"logs_url_public": "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/pclark425_Asta_CodeScientist_2025-07-10T18-12-16",
"summary_url": null,
"openness": "Open source & closed weights",
"tool_usage": "Fully custom"
} |
{
"name": "asta-bench",
"version": "1.0.0",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "ArxivDIGESTables_Clean_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_validation",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "PaperFindingBench_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_validation",
"path": "astabench/paper_finder_litqa2_validation",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_validation",
"path": "astabench/e2e_discovery_hard_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_validation",
"path": "astabench/super_validation",
"primary_metric": "entrypoint/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_validation",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_validation",
"weight": 0.5
}
]
},
{
"name": "test",
"tasks": [
{
"name": "PaperFindingBench_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_test",
"path": "astabench/paper_finder_litqa2_test",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "ArxivDIGESTables_Clean_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_test",
"path": "astabench/core_bench_test",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_test",
"path": "astabench/e2e_discovery_hard_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_test",
"path": "astabench/super_test",
"primary_metric": "output_match/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_test",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_test",
"weight": 0.5
}
]
}
]
} | validation | [
{
"task_name": "DiscoveryBench_validation",
"eval_spec": {
"solver": "astabench/solvers/datavoyager/agent.py@datavoyager_solver",
"solver_args": "{\"config_file\": \"astabench/solvers/datavoyager/dv_core/config/datavoyager_modal_deployment_magentic_one_config_20250617_o3.yaml\"}",
"model":... | {
"submit_time": "2025-08-14T04:16:47.886330Z",
"username": "Ai2",
"agent_name": "Asta DataVoyager",
"agent_description": "o3",
"agent_url": null,
"logs_url": null,
"logs_url_public": "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/pclark425_Asta_DataVoyager_2025-08-14T04-16-47",
"summary_url": null,
"openness": "Open source & closed weights",
"tool_usage": "Custom interface"
} |
{
"name": "asta-bench",
"version": "1.0.0",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "ArxivDIGESTables_Clean_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_validation",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "PaperFindingBench_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_validation",
"path": "astabench/paper_finder_litqa2_validation",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_validation",
"path": "astabench/e2e_discovery_hard_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_validation",
"path": "astabench/super_validation",
"primary_metric": "entrypoint/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_validation",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_validation",
"weight": 0.5
}
]
},
{
"name": "test",
"tasks": [
{
"name": "PaperFindingBench_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_test",
"path": "astabench/paper_finder_litqa2_test",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "ArxivDIGESTables_Clean_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_test",
"path": "astabench/core_bench_test",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_test",
"path": "astabench/e2e_discovery_hard_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_test",
"path": "astabench/super_test",
"primary_metric": "output_match/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_test",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_test",
"weight": 0.5
}
]
}
]
} | validation | [
{
"task_name": "DiscoveryBench_validation",
"eval_spec": {
"solver": "astabench/solvers/datavoyager/agent.py@datavoyager_solver",
"solver_args": "{\"config_file\": \"astabench/solvers/datavoyager/dv_core/config/datavoyager_modal_deployment_magentic_one_config_20250617_sonnet_4.yaml\"}",
"m... | {
"submit_time": "2025-08-14T19:32:30.253858Z",
"username": "Ai2",
"agent_name": "Asta DataVoyager",
"agent_description": "claude-sonnet-4",
"agent_url": null,
"logs_url": null,
"logs_url_public": "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/pclark425_Asta_DataVoyager_2025-08-14T19-32-30",
"summary_url": null,
"openness": "Open source & closed weights",
"tool_usage": "Custom interface"
} |
{
"name": "asta-bench",
"version": "1.0.0",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "ArxivDIGESTables_Clean_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_validation",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "PaperFindingBench_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_validation",
"path": "astabench/paper_finder_litqa2_validation",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_validation",
"path": "astabench/e2e_discovery_hard_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_validation",
"path": "astabench/super_validation",
"primary_metric": "entrypoint/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_validation",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_validation",
"weight": 0.5
}
]
},
{
"name": "test",
"tasks": [
{
"name": "PaperFindingBench_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_test",
"path": "astabench/paper_finder_litqa2_test",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "ArxivDIGESTables_Clean_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_test",
"path": "astabench/core_bench_test",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_test",
"path": "astabench/e2e_discovery_hard_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_test",
"path": "astabench/super_test",
"primary_metric": "output_match/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_test",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_test",
"weight": 0.5
}
]
}
]
} | validation | [
{
"task_name": "DiscoveryBench_validation",
"eval_spec": {
"solver": "astabench/solvers/datavoyager/agent.py@datavoyager_solver",
"solver_args": "{\"config_file\": \"astabench/solvers/datavoyager/dv_core/config/datavoyager_modal_deployment_magentic_one_config_20250617_gpt5_minimal.yaml\"}",
... | {
"submit_time": "2025-08-14T19:42:20.164747Z",
"username": "Ai2",
"agent_name": "Asta DataVoyager",
"agent_description": "gpt-5/minimal_reasoning",
"agent_url": null,
"logs_url": null,
"logs_url_public": "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/pclark425_Asta_DataVoyager_2025-08-14T19-42-20",
"summary_url": null,
"openness": "Open source & closed weights",
"tool_usage": "Custom interface"
} |
{
"name": "asta-bench",
"version": "1.0.0",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "ArxivDIGESTables_Clean_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_validation",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "PaperFindingBench_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_validation",
"path": "astabench/paper_finder_litqa2_validation",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_validation",
"path": "astabench/e2e_discovery_hard_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_validation",
"path": "astabench/super_validation",
"primary_metric": "entrypoint/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_validation",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_validation",
"weight": 0.5
}
]
},
{
"name": "test",
"tasks": [
{
"name": "PaperFindingBench_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_test",
"path": "astabench/paper_finder_litqa2_test",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "ArxivDIGESTables_Clean_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_test",
"path": "astabench/core_bench_test",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_test",
"path": "astabench/e2e_discovery_hard_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_test",
"path": "astabench/super_test",
"primary_metric": "output_match/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_test",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_test",
"weight": 0.5
}
]
}
]
} | validation | [
{
"task_name": "DiscoveryBench_validation",
"eval_spec": {
"solver": "astabench/solvers/datavoyager/agent.py@datavoyager_solver",
"solver_args": "{\"config_file\": \"astabench/solvers/datavoyager/dv_core/config/datavoyager_modal_deployment_magentic_one_config_20250617_gpt-41.yaml\"}",
"mod... | {
"submit_time": "2025-08-14T21:44:09.404096Z",
"username": "Ai2",
"agent_name": "Asta DataVoyager",
"agent_description": "gpt-4.1",
"agent_url": null,
"logs_url": null,
"logs_url_public": "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/pclark425_Asta_DataVoyager_2025-08-14T21-44-09",
"summary_url": null,
"openness": "Open source & closed weights",
"tool_usage": "Custom interface"
} |
{
"name": "asta-bench",
"version": "1.0.0",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "ArxivDIGESTables_Clean_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_validation",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "PaperFindingBench_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_validation",
"path": "astabench/paper_finder_litqa2_validation",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_validation",
"path": "astabench/e2e_discovery_hard_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_validation",
"path": "astabench/super_validation",
"primary_metric": "entrypoint/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_validation",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_validation",
"weight": 0.5
}
]
},
{
"name": "test",
"tasks": [
{
"name": "PaperFindingBench_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_test",
"path": "astabench/paper_finder_litqa2_test",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "ArxivDIGESTables_Clean_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_test",
"path": "astabench/core_bench_test",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_test",
"path": "astabench/e2e_discovery_hard_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_test",
"path": "astabench/super_test",
"primary_metric": "output_match/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_test",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_test",
"weight": 0.5
}
]
}
]
} | validation | [
{
"task_name": "E2E_Bench_Hard_validation",
"eval_spec": {
"solver": "astabench/evals/e2e_discovery/solvers/autoasta/autoasta_cached.py@autoasta_cached_solver",
"solver_args": "{\"model\": \"claude-sonnet-4-20250514\"}",
"model": "openai/gpt-4o-mini",
"model_args": "{}",
"task_... | {
"submit_time": "2025-08-03T22:39:22.125992Z",
"username": "Ai2",
"agent_name": "Asta Panda",
"agent_description": "v1.4.7-Claude4",
"agent_url": null,
"logs_url": null,
"logs_url_public": "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/pclark425_Asta_Panda_2025-08-03T22-39-22",
"summary_url": null,
"openness": "Open source & closed weights",
"tool_usage": "Fully custom"
} |
{
"name": "asta-bench",
"version": "1.0.0",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "ArxivDIGESTables_Clean_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_validation",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "PaperFindingBench_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_validation",
"path": "astabench/paper_finder_litqa2_validation",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_validation",
"path": "astabench/e2e_discovery_hard_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_validation",
"path": "astabench/super_validation",
"primary_metric": "entrypoint/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_validation",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_validation",
"weight": 0.5
}
]
},
{
"name": "test",
"tasks": [
{
"name": "PaperFindingBench_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_test",
"path": "astabench/paper_finder_litqa2_test",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "ArxivDIGESTables_Clean_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_test",
"path": "astabench/core_bench_test",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_test",
"path": "astabench/e2e_discovery_hard_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_test",
"path": "astabench/super_test",
"primary_metric": "output_match/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_test",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_test",
"weight": 0.5
}
]
}
]
} | validation | [
{
"task_name": "E2E_Bench_Hard_validation",
"eval_spec": {
"solver": "astabench/evals/e2e_discovery/solvers/autoasta/autoasta_cached.py@autoasta_cached_solver",
"solver_args": "{\"model\": \"gpt-4.1\"}",
"model": "openai/gpt-4o-mini",
"model_args": "{}",
"task_args": "{}",
... | {
"submit_time": "2025-08-04T03:31:48.698423Z",
"username": "Ai2",
"agent_name": "Asta Panda",
"agent_description": "v1.4.7-gpt41",
"agent_url": null,
"logs_url": null,
"logs_url_public": "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/pclark425_Asta_Panda_2025-08-04T03-31-48",
"summary_url": null,
"openness": "Open source & closed weights",
"tool_usage": "Fully custom"
} |
{
"name": "asta-bench",
"version": "1.0.0",
"splits": [
{
"name": "validation",
"tasks": [
{
"name": "ArxivDIGESTables_Clean_validation",
"path": "astabench/arxivdigestables_validation",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_validation",
"path": "astabench/sqa_dev",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_validation",
"path": "astabench/litqa2_validation",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "PaperFindingBench_validation",
"path": "astabench/paper_finder_validation",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_validation",
"path": "astabench/paper_finder_litqa2_validation",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_validation",
"path": "astabench/discoverybench_validation",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_validation",
"path": "astabench/core_bench_validation",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_validation",
"path": "astabench/ds1000_validation",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_validation",
"path": "astabench/e2e_discovery_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_validation",
"path": "astabench/e2e_discovery_hard_validation",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_validation",
"path": "astabench/super_validation",
"primary_metric": "entrypoint/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_validation",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_validation",
"weight": 0.5
}
]
},
{
"name": "test",
"tasks": [
{
"name": "PaperFindingBench_test",
"path": "astabench/paper_finder_test",
"primary_metric": "score_paper_finder/adjusted_f1_micro_avg",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_Search_test",
"path": "astabench/paper_finder_litqa2_test",
"primary_metric": "score_paper_finder/recall_at_30",
"tags": [
"lit"
]
},
{
"name": "ScholarQA_CS2_test",
"path": "astabench/sqa_test",
"primary_metric": "global_avg/mean",
"tags": [
"lit"
]
},
{
"name": "ArxivDIGESTables_Clean_test",
"path": "astabench/arxivdigestables_test",
"primary_metric": "score_tables/mean",
"tags": [
"lit"
]
},
{
"name": "LitQA2_FullText_test",
"path": "astabench/litqa2_test",
"primary_metric": "is_correct/accuracy",
"tags": [
"lit"
]
},
{
"name": "DiscoveryBench_test",
"path": "astabench/discoverybench_test",
"primary_metric": "score_discoverybench/mean",
"tags": [
"data"
]
},
{
"name": "CORE_Bench_Hard_test",
"path": "astabench/core_bench_test",
"primary_metric": "score_with_stderr/accuracy",
"tags": [
"code"
]
},
{
"name": "DS_1000_test",
"path": "astabench/ds1000_test",
"primary_metric": "ds1000_scorer/accuracy",
"tags": [
"code"
]
},
{
"name": "E2E_Bench_test",
"path": "astabench/e2e_discovery_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "E2E_Bench_Hard_test",
"path": "astabench/e2e_discovery_hard_test",
"primary_metric": "score_rubric/accuracy",
"tags": [
"discovery"
]
},
{
"name": "SUPER_Expert_test",
"path": "astabench/super_test",
"primary_metric": "output_match/mean",
"tags": [
"code"
]
}
],
"macro_average_weight_adjustments": [
{
"tag": "lit",
"task": "LitQA2_FullText_test",
"weight": 0.5
},
{
"tag": "lit",
"task": "LitQA2_FullText_Search_test",
"weight": 0.5
}
]
}
]
} | validation | [
{
"task_name": "E2E_Bench_Hard_validation",
"eval_spec": {
"solver": "astabench/evals/e2e_discovery/solvers/faker/faker_cached.py@faker_cached_solver",
"solver_args": "{}",
"model": "openai/gpt-4.1",
"model_args": "{}",
"task_args": "{}",
"revision": {
"type": "gi... | {
"submit_time": "2025-07-10T18:14:56.434410Z",
"username": "Ai2",
"agent_name": "Faker",
"agent_description": null,
"agent_url": null,
"logs_url": null,
"logs_url_public": "hf://datasets/allenai/asta-bench-submissions/1.0.0-dev1/validation/pclark425_Faker_2025-07-10T18-14-56",
"summary_url": null,
"openness": "Open source & closed weights",
"tool_usage": "Standard"
} |
Subsets and Splits
No community queries yet
The top public SQL queries from the community will appear here once available.