initial commit

2025-03-14 17:44:08 +01:00
commit 197512a6bc
2 changed files with 241 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,74 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a PyInstaller build script
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Jupyter Notebook
 .ipynb_checkpoints
 # VSCode
 .vscode/
 # PyCharm
 .idea/
 # Local development settings
 .env.local
 .env.development.local
 # Log files
 *.log
--- a/ollama-speedtest.py
+++ b/ollama-speedtest.py
@@ -0,0 +1,167 @@
 import time
 import ollama
 import base64
 import tabulate
 from rich.console import Console
 from rich.table import Table
 import datetime
 import asyncio
 import concurrent.futures
 async def generate_tokens_async(model_name, host_ip):
    """Time one non-streaming generation of ``model_name`` on the Ollama server at ``host_ip``.

    The blocking client call runs in a worker thread so other hosts can be
    tested concurrently on the same event loop.

    Args:
        model_name: Name of the model to run, passed to the client as ``model``.
        host_ip: Host name or IP of the server; port 11434 is always used.

    Returns:
        On success, a dict with ``success=True`` plus ``tokens_per_second``,
        ``generation_time`` (seconds), ``content_length`` (characters) and
        ``estimated_tokens``. On any failure, ``{"success": False, "error": str}``.
        This coroutine never raises for ordinary errors, so a single bad host
        cannot abort the other benchmarks.
    """
    # Define the prompt to generate tokens
    prompt = """Generate 1000 random words with no spaces, each word should be between 3-5 letters long. Separate them with line breaks.\n\n"""
    try:
        # Use a dedicated client bound to this host. Assigning ``ollama.host``
        # does not redirect the module-level ``ollama.generate`` helper, and a
        # shared global would also race between hosts tested in parallel.
        client = ollama.Client(host=f"http://{host_ip}:11434")
        loop = asyncio.get_running_loop()
        # perf_counter is monotonic, so clock adjustments cannot skew the result
        start_time = time.perf_counter()
        # Run the blocking request in its own thread to avoid blocking the loop
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            response = await loop.run_in_executor(
                pool,
                lambda: client.generate(model=model_name, prompt=prompt, stream=False)
            )
        generation_time = time.perf_counter() - start_time
        # Get the generated content
        generated_content = response['response']
        # Rough estimate: 1 token is approximately 4 characters for English text
        estimated_tokens = len(generated_content) / 4
        # Guard against a zero-length interval on coarse timers
        if generation_time > 0:
            tokens_per_second = estimated_tokens / generation_time
        else:
            tokens_per_second = 0.0
        return {
            "success": True,
            "tokens_per_second": tokens_per_second,
            "generation_time": generation_time,
            "content_length": len(generated_content),
            "estimated_tokens": estimated_tokens
        }
    except Exception as e:
        # Top-level boundary for one benchmark: report the failure as data
        print(f"Error for {model_name} on {host_ip}: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }
 async def test_host(host, models):
    """Process all models for a single host sequentially"""
    results = []
    for model in models:
        print(f"Testing model: {model} on host: {host}")
        # Run the test
        result = await generate_tokens_async(model, host)
        # Store the result
        results.append({
            "host": host,
            "model": model,
            "result": result
        })
        # Add a small delay between tests on same host
        await asyncio.sleep(1)
    return results
 def print_report(results):
    """Render the benchmark results as Rich tables on the console.

    Args:
        results: Iterable of dicts with the keys ``host``, ``model`` and
            ``result``. ``result`` holds ``success`` plus either the metrics
            (``tokens_per_second``, ``generation_time``, ``content_length``)
            or an ``error`` string.

    Prints one row per entry. If at least one entry succeeded, a second table
    shows the average tokens/second and the fastest and slowest configuration.
    """
    # Local import keeps this function self-contained; rich is already a dependency
    from rich.markup import escape

    console = Console()
    # Create a table
    table = Table(title=f"Ollama Performance Test Report - {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    # Add columns
    table.add_column("Host IP", style="cyan")
    table.add_column("Model", style="green")
    table.add_column("Tokens/Second", style="magenta")
    table.add_column("Generation Time (s)", style="yellow")
    table.add_column("Content Length", style="blue")
    table.add_column("Status", style="red")
    # Add rows
    for result in results:
        host = result["host"]
        model = result["model"]
        outcome = result["result"]
        if outcome["success"]:
            tokens_per_second = f"{outcome['tokens_per_second']:.2f}"
            generation_time = f"{outcome['generation_time']:.2f}"
            content_length = str(outcome['content_length'])
            status = "✅ Success"
        else:
            tokens_per_second = "N/A"
            generation_time = "N/A"
            content_length = "N/A"
            # Exception text is arbitrary: escape it so bracketed fragments are
            # shown literally instead of being parsed as console markup.
            status = f"❌ Failed: {escape(str(outcome['error']))}"
        table.add_row(host, model, tokens_per_second, generation_time, content_length, status)
    # Print the table
    console.print(table)
    # Print summary statistics if there are successful results
    successful_results = [r for r in results if r["result"]["success"]]
    if successful_results:
        summary_table = Table(title="Summary Statistics")
        summary_table.add_column("Metric", style="cyan")
        summary_table.add_column("Value", style="green")
        avg_tokens_per_second = sum(r["result"]["tokens_per_second"] for r in successful_results) / len(successful_results)
        fastest_host_model = max(successful_results, key=lambda x: x["result"]["tokens_per_second"])
        slowest_host_model = min(successful_results, key=lambda x: x["result"]["tokens_per_second"])
        summary_table.add_row("Average Tokens/Second", f"{avg_tokens_per_second:.2f}")
        summary_table.add_row("Fastest Configuration", 
                             f"{fastest_host_model['host']} with {fastest_host_model['model']} " +
                             f"({fastest_host_model['result']['tokens_per_second']:.2f} tokens/s)")
        summary_table.add_row("Slowest Configuration", 
                             f"{slowest_host_model['host']} with {slowest_host_model['model']} " +
                             f"({slowest_host_model['result']['tokens_per_second']:.2f} tokens/s)")
        console.print(summary_table)
 async def main_async():
    """Benchmark all configured hosts concurrently and print a combined report.

    Each host gets its own task; within a host the models run sequentially.
    """
    # Host -> models to benchmark on that host
    test_matrix = {
        "localhost": ["llama3.2:3b-instruct-q4_0"],
        "192.168.50.3": ["llama3.2:3b-instruct-q4_0", "llama3.1:8b-instruct-q4_0", "llama3.1:8b-instruct-q8_0"],
        "192.168.50.121": ["llama3.2:3b-instruct-q4_0", "llama3.1:8b-instruct-q4_0", "llama3.1:8b-instruct-q8_0"]
    }
    # One task per host so the hosts are exercised in parallel
    pending = [
        asyncio.create_task(test_host(host, models))
        for host, models in test_matrix.items()
    ]
    per_host = await asyncio.gather(*pending)
    # Merge the per-host lists into a single flat list, preserving order
    flattened = [entry for batch in per_host for entry in batch]
    print_report(flattened)
 def main():
    """Synchronous entry point: run ``main_async`` to completion via ``asyncio.run``."""
    coroutine = main_async()
    asyncio.run(coroutine)
 # Start the benchmark only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()