# ollama-speedtest/ollama-speedtest.py
# 2025-03-14 18:40:23 +01:00
# 233 lines, 8.9 KiB, Python
# (file-listing metadata from the original paste, kept here as a comment
#  so the module remains importable)
import time
import ollama
from ollama import AsyncClient
from rich.console import Console
from rich.table import Table
import datetime
import signal
import sys
import asyncio
def nanosec_to_sec(nanosec):
    """Convert a duration from nanoseconds to seconds.

    Ollama reports all timing metadata in nanoseconds; this converts
    those values to seconds for tokens-per-second math.
    """
    # 1e9 is exactly 1_000_000_000 as a float, so the result is identical.
    return nanosec / 1e9
async def generate_tokens_async(model_name, host_ip):
    """Run one chat completion against an Ollama host and measure throughput.

    Args:
        model_name: Ollama model tag to run (e.g. "llama3.1:8b-instruct-q4_0").
        host_ip: IP/hostname of the Ollama server (port 11434 assumed).

    Returns:
        On success: a dict with "success": True plus tokens-per-second rates,
        raw token counts, and per-phase durations in seconds.
        On any error: {"success": False, "error": <message>}.
    """
    # Fixed prompt so every host/model combination runs the same workload.
    prompt = """Why is the sky blue? Give a comprehensive explanation."""
    # Client bound to this specific host; 11434 is Ollama's default port.
    client = AsyncClient(host=f"http://{host_ip}:11434")
    # Wall-clock start, used only for the fallback rate calculation below.
    start_time = time.time()
    # Print a message to indicate we're waiting
    print(f" Generating tokens with {model_name} on {host_ip}...")
    try:
        # Non-streaming chat request. NOTE(review): no explicit timeout is
        # set here despite what an earlier comment claimed — this awaits as
        # long as the server takes; confirm whether a timeout should be added.
        response = await client.chat(
            model=model_name,
            messages=[
                {
                    'role': 'user',
                    'content': prompt,
                }
            ]
        )
        # If we get here, the request was successful.
        end_time = time.time()
        print(f" Generation finished with {model_name} on {host_ip}")
        # Wall-clock duration; only used when total_duration is missing.
        generation_time = end_time - start_time
        # Get the generated content from the response
        generated_content = response['message']['content']
        # Ollama metadata: token counts per phase and durations in nanoseconds.
        # Defaults of 0 keep the math safe if a field is absent.
        prompt_eval_count = response.get('prompt_eval_count', 0)
        prompt_eval_duration = response.get('prompt_eval_duration', 0)
        eval_count = response.get('eval_count', 0)
        eval_duration = response.get('eval_duration', 0)
        total_duration = response.get('total_duration', 0)
        # Guard each division so a zero/missing duration yields a 0 rate
        # instead of ZeroDivisionError.
        if prompt_eval_duration > 0:
            prompt_tokens_per_second = prompt_eval_count / nanosec_to_sec(prompt_eval_duration)
        else:
            prompt_tokens_per_second = 0
        if eval_duration > 0:
            response_tokens_per_second = eval_count / nanosec_to_sec(eval_duration)
        else:
            response_tokens_per_second = 0
        if total_duration > 0:
            total_tokens = prompt_eval_count + eval_count
            total_tokens_per_second = total_tokens / nanosec_to_sec(total_duration)
        else:
            # Fall back to wall-clock time if total_duration not provided
            total_tokens = prompt_eval_count + eval_count
            total_tokens_per_second = total_tokens / generation_time
        return {
            "success": True,
            "total_tokens_per_second": total_tokens_per_second,
            "prompt_tokens_per_second": prompt_tokens_per_second,
            "response_tokens_per_second": response_tokens_per_second,
            "generation_time": generation_time,
            "content_length": len(generated_content),
            "prompt_eval_count": prompt_eval_count,
            "eval_count": eval_count,
            "total_duration_seconds": nanosec_to_sec(total_duration),
            "prompt_eval_duration_seconds": nanosec_to_sec(prompt_eval_duration),
            "eval_duration_seconds": nanosec_to_sec(eval_duration)
        }
    except Exception as e:
        # Broad catch is deliberate: any connection/model error becomes a
        # failed result so the rest of the benchmark keeps running.
        error_msg = str(e)
        print(f" Request Error: {error_msg}")
        return {
            "success": False,
            "error": error_msg
        }
async def test_host(host, models, console):
    """Test models sequentially on a single host.

    Args:
        host: IP/hostname of the Ollama server to benchmark.
        models: list of model tags to run, in order, on this host.
        console: shared rich Console used for progress output.

    Returns:
        A list of {"host", "model", "result"} dicts, one per model,
        where "result" is the dict returned by generate_tokens_async.
    """
    results = []
    console.print(f"[bold cyan]Testing host: {host}[/bold cyan]")
    for model in models:
        console.print(f"[bold green]Testing model: {model} on {host}[/bold green]")
        # Run the test
        result = await generate_tokens_async(model, host)
        # Store the result
        results.append({
            "host": host,
            "model": model,
            "result": result
        })
        # Small delay so back-to-back requests don't pile up on one server.
        await asyncio.sleep(1)
        # Print a separator line between models.
        # Bug fix: was '"" * 50', which is the empty string — the intended
        # 50-dash separator never appeared.
        console.print("-" * 50)
    return results
def print_report(results):
    """Render a table of per-host/per-model results, plus summary statistics.

    Args:
        results: list of {"host", "model", "result"} dicts as produced by
            test_host; "result" carries either metrics or an error message.
    """
    console = Console()
    report_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    table = Table(title=f"Ollama Performance Test Report - {report_time}")
    # Column spec: (header, rich style) pairs, added in display order.
    for header, colour in (
        ("Host IP", "cyan"),
        ("Model", "green"),
        ("Total T/S", "magenta"),
        ("Prompt T/S", "blue"),
        ("Response T/S", "yellow"),
        ("Prompt Tokens", "blue"),
        ("Response Tokens", "yellow"),
        ("Status", "red"),
    ):
        table.add_column(header, style=colour)
    # One row per benchmark run; failures show N/A metrics plus the error.
    for entry in results:
        outcome = entry["result"]
        if outcome["success"]:
            cells = (
                entry["host"],
                entry["model"],
                f"{outcome.get('total_tokens_per_second', 0):.2f}",
                f"{outcome.get('prompt_tokens_per_second', 0):.2f}",
                f"{outcome.get('response_tokens_per_second', 0):.2f}",
                str(outcome.get('prompt_eval_count', 0)),
                str(outcome.get('eval_count', 0)),
                "✅ Success",
            )
        else:
            cells = (
                entry["host"],
                entry["model"],
                "N/A",
                "N/A",
                "N/A",
                "N/A",
                "N/A",
                f"❌ Failed: {outcome['error']}",
            )
        table.add_row(*cells)
    console.print(table)
    # Summary statistics only make sense when at least one run succeeded.
    successes = [r for r in results if r["result"]["success"]]
    if not successes:
        return
    summary_table = Table(title="Summary Statistics")
    summary_table.add_column("Metric", style="cyan")
    summary_table.add_column("Value", style="green")
    # Response token speed is the headline number, so extremes use it.
    def response_speed(run):
        return run["result"].get("response_tokens_per_second", 0)
    avg_total_tps = sum(
        r["result"].get("total_tokens_per_second", 0) for r in successes
    ) / len(successes)
    avg_response_tps = sum(response_speed(r) for r in successes) / len(successes)
    fastest = max(successes, key=response_speed)
    slowest = min(successes, key=response_speed)
    summary_table.add_row("Average Total Tokens/Second", f"{avg_total_tps:.2f}")
    summary_table.add_row("Average Response Tokens/Second", f"{avg_response_tps:.2f}")
    summary_table.add_row(
        "Fastest Response",
        f"{fastest['host']} with {fastest['model']} "
        f"({response_speed(fastest):.2f} tokens/s)")
    summary_table.add_row(
        "Slowest Response",
        f"{slowest['host']} with {slowest['model']} "
        f"({response_speed(slowest):.2f} tokens/s)")
    console.print(summary_table)
async def main_async():
    """Benchmark every configured host in parallel.

    Hosts run concurrently; the models on any one host run sequentially
    (see test_host). Results from all hosts are merged into one report.
    """
    # Test matrix: host -> list of model tags to benchmark on that host.
    test_matrix = {
        #"localhost": ["llama3.2:3b-instruct-q4_0"],
        "192.168.50.3": ["llama3.2:3b-instruct-q4_0", "llama3.1:8b-instruct-q4_0", "llama3.1:8b-instruct-q8_0"],
        "192.168.50.121": ["llama3.2:3b-instruct-q4_0", "llama3.1:8b-instruct-q4_0", "llama3.1:8b-instruct-q8_0"]
    }
    console = Console()
    try:
        # One task per host so hosts are exercised concurrently.
        host_tasks = [
            asyncio.create_task(test_host(host, models, console))
            for host, models in test_matrix.items()
        ]
        per_host = await asyncio.gather(*host_tasks)
        # Flatten the per-host result lists into a single report input.
        all_results = [entry for host_results in per_host for entry in host_results]
        print_report(all_results)
    except KeyboardInterrupt:
        console.print("\n[bold red]Test interrupted by user.[/bold red]")
        # Partial results are not easily recoverable once gather is cancelled.
    except Exception as e:
        console.print(f"[bold red]Error during testing: {str(e)}[/bold red]")
def main():
    """Script entry point: run the async benchmark to completion."""
    asyncio.run(main_async())


if __name__ == "__main__":
    main()