Spestly/Atlas-Pro-1.5B-Preview Spestly/Atlas-Pro-7B-Preview 01-ai/Yi-6B deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B deepseek-ai/DeepSeek-R1-Distill-Qwen-7B meta-llama/Meta-Llama-3.1-8B meta-llama/Llama-3.2-3B-Instruct
bbh.acc_norm 0.348030 0.465891 0.426662 0.321298 0.341087 0.463808 0.458601
bbh_boolean_expressions.acc_norm 0.724000 0.884000 0.656000 0.500000 0.532000 0.800000 0.772000
bbh_causal_judgement.acc_norm 0.508021 0.561497 0.572193 0.518717 0.518717 0.577540 0.631016
bbh_date_understanding.acc_norm 0.284000 0.548000 0.412000 0.208000 0.384000 0.480000 0.452000
bbh_disambiguation_qa.acc_norm 0.432000 0.564000 0.608000 0.340000 0.312000 0.524000 0.292000
bbh_formal_fallacies.acc_norm 0.476000 0.588000 0.540000 0.464000 0.468000 0.560000 0.484000
bbh_geometric_shapes.acc_norm 0.156000 0.444000 0.376000 0.220000 0.428000 0.336000 0.368000
bbh_hyperbaton.acc_norm 0.496000 0.588000 0.504000 0.516000 0.484000 0.632000 0.668000
bbh_logical_deduction_five_objects.acc_norm 0.228000 0.464000 0.240000 0.208000 0.288000 0.356000 0.436000
bbh_logical_deduction_seven_objects.acc_norm 0.156000 0.344000 0.176000 0.144000 0.160000 0.328000 0.388000
bbh_logical_deduction_three_objects.acc_norm 0.408000 0.692000 0.400000 0.340000 0.484000 0.512000 0.504000
bbh_movie_recommendation.acc_norm 0.496000 0.584000 0.796000 0.264000 0.296000 0.792000 0.548000
bbh_navigate.acc_norm 0.504000 0.680000 0.464000 0.420000 0.484000 0.508000 0.612000
bbh_object_counting.acc_norm 0.336000 0.376000 0.316000 0.356000 0.116000 0.496000 0.360000
bbh_penguins_in_a_table.acc_norm 0.253425 0.424658 0.424658 0.212329 0.321918 0.438356 0.390411
bbh_reasoning_about_colored_objects.acc_norm 0.272000 0.488000 0.308000 0.192000 0.176000 0.372000 0.452000
bbh_ruin_names.acc_norm 0.216000 0.424000 0.440000 0.232000 0.364000 0.504000 0.608000
bbh_salient_translation_error_detection.acc_norm 0.132000 0.264000 0.272000 0.144000 0.220000 0.396000 0.372000
bbh_snarks.acc_norm 0.500000 0.516854 0.657303 0.539326 0.477528 0.634831 0.606742
bbh_sports_understanding.acc_norm 0.540000 0.564000 0.740000 0.524000 0.480000 0.752000 0.708000
bbh_temporal_sequences.acc_norm 0.072000 0.132000 0.256000 0.240000 0.244000 0.096000 0.260000
bbh_tracking_shuffled_objects_five_objects.acc_norm 0.212000 0.144000 0.160000 0.224000 0.112000 0.160000 0.152000
bbh_tracking_shuffled_objects_seven_objects.acc_norm 0.168000 0.120000 0.176000 0.152000 0.120000 0.124000 0.144000
bbh_tracking_shuffled_objects_three_objects.acc_norm 0.340000 0.320000 0.352000 0.332000 0.304000 0.332000 0.324000
bbh_web_of_lies.acc_norm 0.488000 0.488000 0.496000 0.488000 0.488000 0.488000 0.532000
gpqa.acc_norm 0.296980 0.337248 0.269295 0.255872 0.279362 0.296141 0.278523
gpqa_diamond.acc_norm 0.267677 0.318182 0.267677 0.237374 0.232323 0.242424 0.257576
gpqa_extended.acc_norm 0.313187 0.351648 0.267399 0.239927 0.276557 0.305861 0.293040
gpqa_main.acc_norm 0.290179 0.328125 0.272321 0.283482 0.303571 0.308036 0.270089
ifeval.prompt_level_strict_acc 0.188540 0.249538 0.227357 0.275416 0.332717 0.081331 0.696858
ifeval.inst_level_strict_acc 0.297362 0.381295 0.351319 0.417266 0.474820 0.172662 0.781775
ifeval.prompt_level_loose_acc 0.214418 0.314233 0.243993 0.280961 0.340111 0.092421 0.757856
ifeval.inst_level_loose_acc 0.323741 0.437650 0.368106 0.423261 0.484412 0.179856 0.826139
math_hard.exact_match 0.258308 0.388973 0.015861 0.000000 0.000000 0.051360 0.171450
math_algebra_hard.exact_match 0.501629 0.618893 0.026059 0.000000 0.000000 0.100977 0.351792
math_counting_and_prob_hard.exact_match 0.162602 0.349593 0.024390 0.000000 0.000000 0.016260 0.130081
math_geometry_hard.exact_match 0.121212 0.250000 0.030303 0.000000 0.000000 0.022727 0.083333
math_intermediate_algebra_hard.exact_match 0.064286 0.153571 0.007143 0.000000 0.000000 0.007143 0.028571
math_num_theory_hard.exact_match 0.266234 0.415584 0.012987 0.000000 0.000000 0.045455 0.097403
math_prealgebra_hard.exact_match 0.430052 0.647668 0.005181 0.000000 0.000000 0.103627 0.326425
math_precalculus_hard.exact_match 0.074074 0.125926 0.007407 0.000000 0.000000 0.022222 0.044444
mmlu_pro.acc 0.192487 0.297041 0.299119 0.118684 0.232131 0.324551 0.319481
musr.acc_norm 0.334656 0.390212 0.392857 0.362434 0.365079 0.382275 0.351852
musr_murder_mysteries.acc_norm 0.504000 0.520000 0.524000 0.492000 0.492000 0.540000 0.504000
musr_object_placements.acc_norm 0.238281 0.281250 0.289062 0.234375 0.214844 0.351562 0.226562
musr_team_allocation.acc_norm 0.264000 0.372000 0.368000 0.364000 0.392000 0.256000 0.328000