| Metric | Spestly/Atlas-Pro-1.5B-Preview | Spestly/Atlas-Pro-7B-Preview | 01-ai/Yi-6B | deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B | deepseek-ai/DeepSeek-R1-Distill-Qwen-7B | meta-llama/Meta-Llama-3.1-8B | meta-llama/Llama-3.2-3B-Instruct |
|---|---|---|---|---|---|---|---|
| bbh.acc_norm | 0.348030 | 0.465891 | 0.426662 | 0.321298 | 0.341087 | 0.463808 | 0.458601 |
| bbh_boolean_expressions.acc_norm | 0.724000 | 0.884000 | 0.656000 | 0.500000 | 0.532000 | 0.800000 | 0.772000 |
| bbh_causal_judgement.acc_norm | 0.508021 | 0.561497 | 0.572193 | 0.518717 | 0.518717 | 0.577540 | 0.631016 |
| bbh_date_understanding.acc_norm | 0.284000 | 0.548000 | 0.412000 | 0.208000 | 0.384000 | 0.480000 | 0.452000 |
| bbh_disambiguation_qa.acc_norm | 0.432000 | 0.564000 | 0.608000 | 0.340000 | 0.312000 | 0.524000 | 0.292000 |
| bbh_formal_fallacies.acc_norm | 0.476000 | 0.588000 | 0.540000 | 0.464000 | 0.468000 | 0.560000 | 0.484000 |
| bbh_geometric_shapes.acc_norm | 0.156000 | 0.444000 | 0.376000 | 0.220000 | 0.428000 | 0.336000 | 0.368000 |
| bbh_hyperbaton.acc_norm | 0.496000 | 0.588000 | 0.504000 | 0.516000 | 0.484000 | 0.632000 | 0.668000 |
| bbh_logical_deduction_five_objects.acc_norm | 0.228000 | 0.464000 | 0.240000 | 0.208000 | 0.288000 | 0.356000 | 0.436000 |
| bbh_logical_deduction_seven_objects.acc_norm | 0.156000 | 0.344000 | 0.176000 | 0.144000 | 0.160000 | 0.328000 | 0.388000 |
| bbh_logical_deduction_three_objects.acc_norm | 0.408000 | 0.692000 | 0.400000 | 0.340000 | 0.484000 | 0.512000 | 0.504000 |
| bbh_movie_recommendation.acc_norm | 0.496000 | 0.584000 | 0.796000 | 0.264000 | 0.296000 | 0.792000 | 0.548000 |
| bbh_navigate.acc_norm | 0.504000 | 0.680000 | 0.464000 | 0.420000 | 0.484000 | 0.508000 | 0.612000 |
| bbh_object_counting.acc_norm | 0.336000 | 0.376000 | 0.316000 | 0.356000 | 0.116000 | 0.496000 | 0.360000 |
| bbh_penguins_in_a_table.acc_norm | 0.253425 | 0.424658 | 0.424658 | 0.212329 | 0.321918 | 0.438356 | 0.390411 |
| bbh_reasoning_about_colored_objects.acc_norm | 0.272000 | 0.488000 | 0.308000 | 0.192000 | 0.176000 | 0.372000 | 0.452000 |
| bbh_ruin_names.acc_norm | 0.216000 | 0.424000 | 0.440000 | 0.232000 | 0.364000 | 0.504000 | 0.608000 |
| bbh_salient_translation_error_detection.acc_norm | 0.132000 | 0.264000 | 0.272000 | 0.144000 | 0.220000 | 0.396000 | 0.372000 |
| bbh_snarks.acc_norm | 0.500000 | 0.516854 | 0.657303 | 0.539326 | 0.477528 | 0.634831 | 0.606742 |
| bbh_sports_understanding.acc_norm | 0.540000 | 0.564000 | 0.740000 | 0.524000 | 0.480000 | 0.752000 | 0.708000 |
| bbh_temporal_sequences.acc_norm | 0.072000 | 0.132000 | 0.256000 | 0.240000 | 0.244000 | 0.096000 | 0.260000 |
| bbh_tracking_shuffled_objects_five_objects.acc_norm | 0.212000 | 0.144000 | 0.160000 | 0.224000 | 0.112000 | 0.160000 | 0.152000 |
| bbh_tracking_shuffled_objects_seven_objects.acc_norm | 0.168000 | 0.120000 | 0.176000 | 0.152000 | 0.120000 | 0.124000 | 0.144000 |
| bbh_tracking_shuffled_objects_three_objects.acc_norm | 0.340000 | 0.320000 | 0.352000 | 0.332000 | 0.304000 | 0.332000 | 0.324000 |
| bbh_web_of_lies.acc_norm | 0.488000 | 0.488000 | 0.496000 | 0.488000 | 0.488000 | 0.488000 | 0.532000 |
| gpqa.acc_norm | 0.296980 | 0.337248 | 0.269295 | 0.255872 | 0.279362 | 0.296141 | 0.278523 |
| gpqa_diamond.acc_norm | 0.267677 | 0.318182 | 0.267677 | 0.237374 | 0.232323 | 0.242424 | 0.257576 |
| gpqa_extended.acc_norm | 0.313187 | 0.351648 | 0.267399 | 0.239927 | 0.276557 | 0.305861 | 0.293040 |
| gpqa_main.acc_norm | 0.290179 | 0.328125 | 0.272321 | 0.283482 | 0.303571 | 0.308036 | 0.270089 |
| ifeval.prompt_level_strict_acc | 0.188540 | 0.249538 | 0.227357 | 0.275416 | 0.332717 | 0.081331 | 0.696858 |
| ifeval.inst_level_strict_acc | 0.297362 | 0.381295 | 0.351319 | 0.417266 | 0.474820 | 0.172662 | 0.781775 |
| ifeval.prompt_level_loose_acc | 0.214418 | 0.314233 | 0.243993 | 0.280961 | 0.340111 | 0.092421 | 0.757856 |
| ifeval.inst_level_loose_acc | 0.323741 | 0.437650 | 0.368106 | 0.423261 | 0.484412 | 0.179856 | 0.826139 |
| math_hard.exact_match | 0.258308 | 0.388973 | 0.015861 | 0.000000 | 0.000000 | 0.051360 | 0.171450 |
| math_algebra_hard.exact_match | 0.501629 | 0.618893 | 0.026059 | 0.000000 | 0.000000 | 0.100977 | 0.351792 |
| math_counting_and_prob_hard.exact_match | 0.162602 | 0.349593 | 0.024390 | 0.000000 | 0.000000 | 0.016260 | 0.130081 |
| math_geometry_hard.exact_match | 0.121212 | 0.250000 | 0.030303 | 0.000000 | 0.000000 | 0.022727 | 0.083333 |
| math_intermediate_algebra_hard.exact_match | 0.064286 | 0.153571 | 0.007143 | 0.000000 | 0.000000 | 0.007143 | 0.028571 |
| math_num_theory_hard.exact_match | 0.266234 | 0.415584 | 0.012987 | 0.000000 | 0.000000 | 0.045455 | 0.097403 |
| math_prealgebra_hard.exact_match | 0.430052 | 0.647668 | 0.005181 | 0.000000 | 0.000000 | 0.103627 | 0.326425 |
| math_precalculus_hard.exact_match | 0.074074 | 0.125926 | 0.007407 | 0.000000 | 0.000000 | 0.022222 | 0.044444 |
| mmlu_pro.acc | 0.192487 | 0.297041 | 0.299119 | 0.118684 | 0.232131 | 0.324551 | 0.319481 |
| musr.acc_norm | 0.334656 | 0.390212 | 0.392857 | 0.362434 | 0.365079 | 0.382275 | 0.351852 |
| musr_murder_mysteries.acc_norm | 0.504000 | 0.520000 | 0.524000 | 0.492000 | 0.492000 | 0.540000 | 0.504000 |
| musr_object_placements.acc_norm | 0.238281 | 0.281250 | 0.289062 | 0.234375 | 0.214844 | 0.351562 | 0.226562 |
| musr_team_allocation.acc_norm | 0.264000 | 0.372000 | 0.368000 | 0.364000 | 0.392000 | 0.256000 | 0.328000 |