initial commit

This commit is contained in:
2025-03-14 17:44:08 +01:00
commit 197512a6bc
2 changed files with 241 additions and 0 deletions

74
.gitignore vendored Normal file
View File

@@ -0,0 +1,74 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a PyInstaller build script
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Jupyter Notebook
.ipynb_checkpoints
# VSCode
.vscode/
# PyCharm
.idea/
# Local development settings
.env.local
.env.development.local
# Log files
*.log

167
ollama-speedtest.py Normal file
View File

@@ -0,0 +1,167 @@
import time
import ollama
import base64
import tabulate
from rich.console import Console
from rich.table import Table
import datetime
import asyncio
import concurrent.futures
async def generate_tokens_async(model_name, host_ip):
    """Benchmark token generation for *model_name* on the Ollama server at *host_ip*.

    Runs one blocking generation request in a worker thread (so other hosts
    can be benchmarked concurrently) and times it.

    Returns:
        dict: on success ``{"success": True, "tokens_per_second": float,
        "generation_time": float, "content_length": int,
        "estimated_tokens": float}``; on failure
        ``{"success": False, "error": str}``.
    """
    # Define the prompt to generate tokens
    prompt = """Generate 1000 random words with no spaces, each word should be between 3-5 letters long. Separate them with line breaks.\n\n"""
    # BUG FIX: the ollama package does not honor a module-level ``ollama.host``
    # attribute — the module-level ``ollama.generate`` always uses its default
    # client, so every "host" would silently hit the same server. Mutating a
    # module global while hosts run in parallel would also be a race. Use a
    # per-call Client bound to the target host instead.
    client = ollama.Client(host=f"http://{host_ip}:11434")
    try:
        # Start timing the generation
        start_time = time.time()
        # client.generate is blocking; off-load it to the loop's default
        # executor so the event loop stays free for the other hosts.
        # (get_event_loop() inside a coroutine is deprecated — use the
        # running loop explicitly.)
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: client.generate(model=model_name, prompt=prompt, stream=False),
        )
        # Calculate the time taken
        generation_time = time.time() - start_time
        # Get the generated content
        generated_content = response['response']
        # Estimate the number of tokens in the response.
        # Rough heuristic: 1 token is approximately 4 characters for English text.
        estimated_tokens = len(generated_content) / 4
        # Guard against a zero-duration clock reading on very fast responses.
        tokens_per_second = (
            estimated_tokens / generation_time if generation_time > 0 else 0.0
        )
        return {
            "success": True,
            "tokens_per_second": tokens_per_second,
            "generation_time": generation_time,
            "content_length": len(generated_content),
            "estimated_tokens": estimated_tokens
        }
    except Exception as e:
        print(f"Error for {model_name} on {host_ip}: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }
async def test_host(host, models):
    """Benchmark each model in *models* against a single *host*, one at a time."""
    outcomes = []
    for model_name in models:
        print(f"Testing model: {model_name} on host: {host}")
        # Benchmark this model, then record where it ran and what happened.
        outcome = await generate_tokens_async(model_name, host)
        outcomes.append({
            "host": host,
            "model": model_name,
            "result": outcome
        })
        # Brief pause so back-to-back runs on the same host don't interfere.
        await asyncio.sleep(1)
    return outcomes
def print_report(results):
    """Render per-test results, plus summary statistics, as rich tables."""
    console = Console()

    # --- Per-test results table ------------------------------------------
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    report = Table(title=f"Ollama Performance Test Report - {timestamp}")
    for heading, colour in (
        ("Host IP", "cyan"),
        ("Model", "green"),
        ("Tokens/Second", "magenta"),
        ("Generation Time (s)", "yellow"),
        ("Content Length", "blue"),
        ("Status", "red"),
    ):
        report.add_column(heading, style=colour)

    for entry in results:
        outcome = entry["result"]
        if outcome["success"]:
            cells = (
                f"{outcome['tokens_per_second']:.2f}",
                f"{outcome['generation_time']:.2f}",
                str(outcome['content_length']),
                "✅ Success",
            )
        else:
            cells = ("N/A", "N/A", "N/A", f"❌ Failed: {outcome['error']}")
        report.add_row(entry["host"], entry["model"], *cells)
    console.print(report)

    # --- Summary statistics (only when something succeeded) --------------
    succeeded = [r for r in results if r["result"]["success"]]
    if not succeeded:
        return

    summary = Table(title="Summary Statistics")
    summary.add_column("Metric", style="cyan")
    summary.add_column("Value", style="green")

    def speed_of(entry):
        # Sort/aggregate key: throughput of one successful run.
        return entry["result"]["tokens_per_second"]

    average_speed = sum(speed_of(r) for r in succeeded) / len(succeeded)
    fastest = max(succeeded, key=speed_of)
    slowest = min(succeeded, key=speed_of)

    summary.add_row("Average Tokens/Second", f"{average_speed:.2f}")
    summary.add_row("Fastest Configuration",
                    f"{fastest['host']} with {fastest['model']} " +
                    f"({fastest['result']['tokens_per_second']:.2f} tokens/s)")
    summary.add_row("Slowest Configuration",
                    f"{slowest['host']} with {slowest['model']} " +
                    f"({slowest['result']['tokens_per_second']:.2f} tokens/s)")
    console.print(summary)
async def main_async():
    """Fan benchmarks out across every configured host, then print the report."""
    # Test matrix: host address -> models to benchmark on that host.
    test_matrix = {
        "localhost": ["llama3.2:3b-instruct-q4_0"],
        "192.168.50.3": ["llama3.2:3b-instruct-q4_0", "llama3.1:8b-instruct-q4_0", "llama3.1:8b-instruct-q8_0"],
        "192.168.50.121": ["llama3.2:3b-instruct-q4_0", "llama3.1:8b-instruct-q4_0", "llama3.1:8b-instruct-q8_0"]
    }
    # One task per host: hosts run in parallel, while each host works
    # through its own model list sequentially inside test_host.
    per_host_batches = await asyncio.gather(
        *(asyncio.create_task(test_host(host, models))
          for host, models in test_matrix.items())
    )
    # Flatten the per-host result lists into a single list for reporting.
    all_results = [entry for batch in per_host_batches for entry in batch]
    print_report(all_results)
def main():
    """Synchronous entry point: drive the async benchmark to completion."""
    asyncio.run(main_async())
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()