| Model | Composite | Judge | DeepEval | Avg Latency | Efficiency |
|---|---|---|---|---|---|
| qwen3-235b | 0.93 | 4.60/5 | 0.95 | 8.4s | 0.48 |
| qwen3-32b | 0.86 | 4.22/5 | 0.92 | 4.3s | 0.44 |
| qwen3-coder-30b | 0.84 | 4.12/5 | 0.90 | 1.9s | 0.45 |

| Model | Composite | Judge | DeepEval | Avg Latency | Efficiency |
|---|---|---|---|---|---|
| nova-2-lite | 0.84 | 4.15/5 | 0.89 | 17.6s | 0.41 |
| nova-pro | 0.78 | 3.79/5 | 0.85 | 9.7s | 0.42 |
| nova-lite | 0.70 | 3.35/5 | 0.80 | 2.3s | 0.37 |
| nova-micro | 0.68 | 3.35/5 | 0.78 | 14.1s | 0.37 |

| Model | Composite | Judge | DeepEval | Avg Latency | Efficiency |
|---|---|---|---|---|---|
| claude-sonnet-4.6 | 0.95 | 4.78/5 | 0.96 | 19.2s | 0.48 |
| claude-opus-4.6 | 0.95 | 4.73/5 | 0.97 | 22.0s | 0.47 |
| claude-opus-4.5 | 0.94 | 4.73/5 | 0.96 | 20.7s | 0.48 |
| claude-sonnet-4.5 | 0.94 | 4.71/5 | 0.95 | 13.9s | 0.50 |
| claude-opus-4 | 0.93 | 4.62/5 | 0.95 | 16.5s | 0.49 |
| claude-sonnet-4 | 0.93 | 4.60/5 | 0.95 | 13.3s | 0.48 |
| claude-sonnet-3.7 | 0.87 | 4.31/5 | 0.92 | 7.3s | 0.49 |
| claude-haiku-3 | 0.71 | 3.48/5 | 0.80 | 3.9s | 0.40 |

| Model | Composite | Judge | DeepEval | Avg Latency | Efficiency |
|---|---|---|---|---|---|
| command-a | 0.83 | 4.12/5 | 0.89 | 21.0s | 0.45 |

| Model | Composite | Judge | DeepEval | Avg Latency | Efficiency |
|---|---|---|---|---|---|
| gemini-3-flash | 0.94 | 4.71/5 | 0.95 | 10.1s | 0.49 |
| gemini-3-pro | 0.93 | 4.67/5 | 0.95 | 21.1s | 0.48 |
| gemini-3.1-pro | 0.93 | 4.65/5 | 0.94 | 39.2s | 0.48 |
| gemini-2.5-flash | 0.88 | 4.39/5 | 0.91 | 13.7s | 0.44 |
| gemma-3-27b | 0.86 | 4.19/5 | 0.91 | 20.9s | 0.42 |

| Model | Composite | Judge | DeepEval | Avg Latency | Efficiency |
|---|---|---|---|---|---|
| llama-4-maverick | 0.83 | 4.07/5 | 0.88 | 1.4s | 0.45 |
| llama-4-scout | 0.79 | 3.84/5 | 0.87 | 1.8s | 0.43 |
| llama3.1 | 0.67 | 3.27/5 | 0.77 | 15.6s | 0.38 |
| llama3.2-vision-11b | 0.62 | 3.08/5 | 0.72 | 30.4s | 0.35 |
| llama3.2 | 0.59 | 2.95/5 | 0.70 | 13.5s | 0.34 |

| Model | Composite | Judge | DeepEval | Avg Latency | Efficiency |
|---|---|---|---|---|---|
| minimax-m2.5 | 0.93 | 4.63/5 | 0.94 | 39.7s | 0.44 |

| Model | Composite | Judge | DeepEval | Avg Latency | Efficiency |
|---|---|---|---|---|---|
| mistral-large-3 | 0.88 | 4.34/5 | 0.93 | 24.4s | 0.43 |
| codestral | 0.65 | 3.27/5 | 0.73 | 58.2s | 0.38 |

| Model | Composite | Judge | DeepEval | Avg Latency | Efficiency |
|---|---|---|---|---|---|
| kimi-k2.5 | 0.94 | 4.67/5 | 0.95 | 41.1s | 0.43 |

| Model | Composite | Judge | DeepEval | Avg Latency | Efficiency |
|---|---|---|---|---|---|
| gpt-5.3 | 0.96 | 4.83/5 | 0.97 | 5.7s | 0.55 |
| gpt-5.2 | 0.94 | 4.72/5 | 0.96 | 12.8s | 0.50 |
| gpt-5.4 | 0.94 | 4.70/5 | 0.96 | 13.7s | 0.49 |
| gpt-5.1 | 0.94 | 4.68/5 | 0.96 | 15.4s | 0.49 |
| gpt-4.1 | 0.92 | 4.55/5 | 0.95 | 9.0s | 0.51 |
| gpt-oss-120b | 0.92 | 4.58/5 | 0.93 | 6.5s | 0.41 |
| o4-mini | 0.91 | 4.52/5 | 0.95 | 13.0s | 0.44 |
| o3-mini | 0.91 | 4.51/5 | 0.93 | 10.6s | 0.45 |
| gpt-4.1-mini | 0.91 | 4.51/5 | 0.93 | 10.4s | 0.50 |
| gpt-oss-20b | 0.88 | 4.43/5 | 0.90 | 142.1s | 0.40 |
| gpt-4.1-nano | 0.87 | 4.26/5 | 0.92 | 5.1s | 0.49 |
| gpt-5 | 0.86 | 4.07/5 | 0.96 | 49.6s | 0.36 |
| gpt-4o | 0.83 | 4.11/5 | 0.89 | 7.7s | 0.48 |
| gpt-4o-mini | 0.82 | 4.01/5 | 0.88 | 7.8s | 0.45 |

| Model | Composite | Judge | DeepEval | Avg Latency | Efficiency |
|---|---|---|---|---|---|
| glm-5 | 0.94 | 4.70/5 | 0.95 | 66.5s | 0.42 |
| glm-4.7-flash | 0.86 | 4.26/5 | 0.90 | 38.6s | 0.38 |

| Model | Composite | Judge | DeepEval | Avg Latency | Efficiency |
|---|---|---|---|---|---|
| grok-4 | 0.92 | 4.61/5 | 0.95 | 37.0s | 0.47 |
| grok-4.1-fast | 0.90 | 4.44/5 | 0.94 | 9.4s | 0.47 |