| Metric | Spestly/Atlas-Pro-1.5B-Preview | Spestly/Atlas-Pro-7B-Preview | 01-ai/Yi-6B | deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B | deepseek-ai/DeepSeek-R1-Distill-Qwen-7B | meta-llama/Meta-Llama-3.1-8B | meta-llama/Llama-3.2-3B-Instruct |
|---|---|---|---|---|---|---|---|
| bbh.acc_norm | 0.348030 | 0.465891 | 0.426662 | 0.321298 | 0.341087 | 0.463808 | 0.458601 |
| bbh_boolean_expressions.acc_norm | 0.724000 | 0.884000 | 0.656000 | 0.500000 | 0.532000 | 0.800000 | 0.772000 |
| bbh_causal_judgement.acc_norm | 0.508021 | 0.561497 | 0.572193 | 0.518717 | 0.518717 | 0.577540 | 0.631016 |
| bbh_date_understanding.acc_norm | 0.284000 | 0.548000 | 0.412000 | 0.208000 | 0.384000 | 0.480000 | 0.452000 |
| bbh_disambiguation_qa.acc_norm | 0.432000 | 0.564000 | 0.608000 | 0.340000 | 0.312000 | 0.524000 | 0.292000 |
| bbh_formal_fallacies.acc_norm | 0.476000 | 0.588000 | 0.540000 | 0.464000 | 0.468000 | 0.560000 | 0.484000 |
| bbh_geometric_shapes.acc_norm | 0.156000 | 0.444000 | 0.376000 | 0.220000 | 0.428000 | 0.336000 | 0.368000 |
| bbh_hyperbaton.acc_norm | 0.496000 | 0.588000 | 0.504000 | 0.516000 | 0.484000 | 0.632000 | 0.668000 |
| bbh_logical_deduction_five_objects.acc_norm | 0.228000 | 0.464000 | 0.240000 | 0.208000 | 0.288000 | 0.356000 | 0.436000 |
| bbh_logical_deduction_seven_objects.acc_norm | 0.156000 | 0.344000 | 0.176000 | 0.144000 | 0.160000 | 0.328000 | 0.388000 |
| bbh_logical_deduction_three_objects.acc_norm | 0.408000 | 0.692000 | 0.400000 | 0.340000 | 0.484000 | 0.512000 | 0.504000 |
| bbh_movie_recommendation.acc_norm | 0.496000 | 0.584000 | 0.796000 | 0.264000 | 0.296000 | 0.792000 | 0.548000 |
| bbh_navigate.acc_norm | 0.504000 | 0.680000 | 0.464000 | 0.420000 | 0.484000 | 0.508000 | 0.612000 |
| bbh_object_counting.acc_norm | 0.336000 | 0.376000 | 0.316000 | 0.356000 | 0.116000 | 0.496000 | 0.360000 |
| bbh_penguins_in_a_table.acc_norm | 0.253425 | 0.424658 | 0.424658 | 0.212329 | 0.321918 | 0.438356 | 0.390411 |
| bbh_reasoning_about_colored_objects.acc_norm | 0.272000 | 0.488000 | 0.308000 | 0.192000 | 0.176000 | 0.372000 | 0.452000 |
| bbh_ruin_names.acc_norm | 0.216000 | 0.424000 | 0.440000 | 0.232000 | 0.364000 | 0.504000 | 0.608000 |
| bbh_salient_translation_error_detection.acc_norm | 0.132000 | 0.264000 | 0.272000 | 0.144000 | 0.220000 | 0.396000 | 0.372000 |
| bbh_snarks.acc_norm | 0.500000 | 0.516854 | 0.657303 | 0.539326 | 0.477528 | 0.634831 | 0.606742 |
| bbh_sports_understanding.acc_norm | 0.540000 | 0.564000 | 0.740000 | 0.524000 | 0.480000 | 0.752000 | 0.708000 |
| bbh_temporal_sequences.acc_norm | 0.072000 | 0.132000 | 0.256000 | 0.240000 | 0.244000 | 0.096000 | 0.260000 |
| bbh_tracking_shuffled_objects_five_objects.acc_norm | 0.212000 | 0.144000 | 0.160000 | 0.224000 | 0.112000 | 0.160000 | 0.152000 |
| bbh_tracking_shuffled_objects_seven_objects.acc_norm | 0.168000 | 0.120000 | 0.176000 | 0.152000 | 0.120000 | 0.124000 | 0.144000 |
| bbh_tracking_shuffled_objects_three_objects.acc_norm | 0.340000 | 0.320000 | 0.352000 | 0.332000 | 0.304000 | 0.332000 | 0.324000 |
| bbh_web_of_lies.acc_norm | 0.488000 | 0.488000 | 0.496000 | 0.488000 | 0.488000 | 0.488000 | 0.532000 |
| gpqa.acc_norm | 0.296980 | 0.337248 | 0.269295 | 0.255872 | 0.279362 | 0.296141 | 0.278523 |
| gpqa_diamond.acc_norm | 0.267677 | 0.318182 | 0.267677 | 0.237374 | 0.232323 | 0.242424 | 0.257576 |
| gpqa_extended.acc_norm | 0.313187 | 0.351648 | 0.267399 | 0.239927 | 0.276557 | 0.305861 | 0.293040 |
| gpqa_main.acc_norm | 0.290179 | 0.328125 | 0.272321 | 0.283482 | 0.303571 | 0.308036 | 0.270089 |
| ifeval.prompt_level_strict_acc | 0.188540 | 0.249538 | 0.227357 | 0.275416 | 0.332717 | 0.081331 | 0.696858 |
| ifeval.inst_level_strict_acc | 0.297362 | 0.381295 | 0.351319 | 0.417266 | 0.474820 | 0.172662 | 0.781775 |
| ifeval.prompt_level_loose_acc | 0.214418 | 0.314233 | 0.243993 | 0.280961 | 0.340111 | 0.092421 | 0.757856 |
| ifeval.inst_level_loose_acc | 0.323741 | 0.437650 | 0.368106 | 0.423261 | 0.484412 | 0.179856 | 0.826139 |
| math_hard.exact_match | 0.258308 | 0.388973 | 0.015861 | 0.000000 | 0.000000 | 0.051360 | 0.171450 |
| math_algebra_hard.exact_match | 0.501629 | 0.618893 | 0.026059 | 0.000000 | 0.000000 | 0.100977 | 0.351792 |
| math_counting_and_prob_hard.exact_match | 0.162602 | 0.349593 | 0.024390 | 0.000000 | 0.000000 | 0.016260 | 0.130081 |
| math_geometry_hard.exact_match | 0.121212 | 0.250000 | 0.030303 | 0.000000 | 0.000000 | 0.022727 | 0.083333 |
| math_intermediate_algebra_hard.exact_match | 0.064286 | 0.153571 | 0.007143 | 0.000000 | 0.000000 | 0.007143 | 0.028571 |
| math_num_theory_hard.exact_match | 0.266234 | 0.415584 | 0.012987 | 0.000000 | 0.000000 | 0.045455 | 0.097403 |
| math_prealgebra_hard.exact_match | 0.430052 | 0.647668 | 0.005181 | 0.000000 | 0.000000 | 0.103627 | 0.326425 |
| math_precalculus_hard.exact_match | 0.074074 | 0.125926 | 0.007407 | 0.000000 | 0.000000 | 0.022222 | 0.044444 |
| mmlu_pro.acc | 0.192487 | 0.297041 | 0.299119 | 0.118684 | 0.232131 | 0.324551 | 0.319481 |
| musr.acc_norm | 0.334656 | 0.390212 | 0.392857 | 0.362434 | 0.365079 | 0.382275 | 0.351852 |
| musr_murder_mysteries.acc_norm | 0.504000 | 0.520000 | 0.524000 | 0.492000 | 0.492000 | 0.540000 | 0.504000 |
| musr_object_placements.acc_norm | 0.238281 | 0.281250 | 0.289062 | 0.234375 | 0.214844 | 0.351562 | 0.226562 |
| musr_team_allocation.acc_norm | 0.264000 | 0.372000 | 0.368000 | 0.364000 | 0.392000 | 0.256000 | 0.328000 |