initial commit

This commit is contained in:
2025-03-14 17:44:08 +01:00
commit 197512a6bc
2 changed files with 241 additions and 0 deletions

74
.gitignore vendored Normal file
View File

@@ -0,0 +1,74 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a PyInstaller build script
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Jupyter Notebook
.ipynb_checkpoints
# VSCode
.vscode/
# PyCharm
.idea/
# Local development settings
.env.local
.env.development.local
# Log files
*.log

167
ollama-speedtest.py Normal file
View File

@@ -0,0 +1,167 @@
import time
import ollama
import base64
import tabulate
from rich.console import Console
from rich.table import Table
import datetime
import asyncio
import concurrent.futures
async def generate_tokens_async(model_name, host_ip):
    """Benchmark token generation for *model_name* on the Ollama server at *host_ip*.

    Runs one blocking generation request in a worker thread (so other hosts
    can be benchmarked concurrently) and times it.

    Returns:
        dict: on success ``{"success": True, "tokens_per_second": float,
        "generation_time": float, "content_length": int,
        "estimated_tokens": float}``; on failure
        ``{"success": False, "error": str}``.
    """
    # Define the prompt to generate tokens
    prompt = """Generate 1000 random words with no spaces, each word should be between 3-5 letters long. Separate them with line breaks.\n\n"""
    # BUG FIX: the ollama package does not honor a module-level ``ollama.host``
    # attribute — the module-level ``ollama.generate`` always uses its default
    # client, so every "host" would silently hit the same server. Mutating a
    # module global while hosts run in parallel would also be a race. Use a
    # per-call Client bound to the target host instead.
    client = ollama.Client(host=f"http://{host_ip}:11434")
    try:
        # Start timing the generation
        start_time = time.time()
        # client.generate is blocking; off-load it to the loop's default
        # executor so the event loop stays free for the other hosts.
        # (get_event_loop() inside a coroutine is deprecated — use the
        # running loop explicitly.)
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: client.generate(model=model_name, prompt=prompt, stream=False),
        )
        # Calculate the time taken
        generation_time = time.time() - start_time
        # Get the generated content
        generated_content = response['response']
        # Estimate the number of tokens in the response.
        # Rough heuristic: 1 token is approximately 4 characters for English text.
        estimated_tokens = len(generated_content) / 4
        # Guard against a zero-duration clock reading on very fast responses.
        tokens_per_second = (
            estimated_tokens / generation_time if generation_time > 0 else 0.0
        )
        return {
            "success": True,
            "tokens_per_second": tokens_per_second,
            "generation_time": generation_time,
            "content_length": len(generated_content),
            "estimated_tokens": estimated_tokens
        }
    except Exception as e:
        print(f"Error for {model_name} on {host_ip}: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }
async def test_host(host, models):
    """Benchmark each model in *models* against a single *host*, one at a time."""
    outcomes = []
    for model_name in models:
        print(f"Testing model: {model_name} on host: {host}")
        # Benchmark this model, then record where it ran and what happened.
        outcome = await generate_tokens_async(model_name, host)
        outcomes.append({
            "host": host,
            "model": model_name,
            "result": outcome
        })
        # Brief pause so back-to-back runs on the same host don't interfere.
        await asyncio.sleep(1)
    return outcomes
def print_report(results):
    """Render per-test results, plus summary statistics, as rich tables."""
    console = Console()

    # --- Per-test results table ------------------------------------------
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    report = Table(title=f"Ollama Performance Test Report - {timestamp}")
    for heading, colour in (
        ("Host IP", "cyan"),
        ("Model", "green"),
        ("Tokens/Second", "magenta"),
        ("Generation Time (s)", "yellow"),
        ("Content Length", "blue"),
        ("Status", "red"),
    ):
        report.add_column(heading, style=colour)

    for entry in results:
        outcome = entry["result"]
        if outcome["success"]:
            cells = (
                f"{outcome['tokens_per_second']:.2f}",
                f"{outcome['generation_time']:.2f}",
                str(outcome['content_length']),
                "✅ Success",
            )
        else:
            cells = ("N/A", "N/A", "N/A", f"❌ Failed: {outcome['error']}")
        report.add_row(entry["host"], entry["model"], *cells)
    console.print(report)

    # --- Summary statistics (only when something succeeded) --------------
    succeeded = [r for r in results if r["result"]["success"]]
    if not succeeded:
        return

    summary = Table(title="Summary Statistics")
    summary.add_column("Metric", style="cyan")
    summary.add_column("Value", style="green")

    def speed_of(entry):
        # Sort/aggregate key: throughput of one successful run.
        return entry["result"]["tokens_per_second"]

    average_speed = sum(speed_of(r) for r in succeeded) / len(succeeded)
    fastest = max(succeeded, key=speed_of)
    slowest = min(succeeded, key=speed_of)

    summary.add_row("Average Tokens/Second", f"{average_speed:.2f}")
    summary.add_row("Fastest Configuration",
                    f"{fastest['host']} with {fastest['model']} " +
                    f"({fastest['result']['tokens_per_second']:.2f} tokens/s)")
    summary.add_row("Slowest Configuration",
                    f"{slowest['host']} with {slowest['model']} " +
                    f"({slowest['result']['tokens_per_second']:.2f} tokens/s)")
    console.print(summary)
async def main_async():
    """Fan benchmarks out across every configured host, then print the report."""
    # Test matrix: host address -> models to benchmark on that host.
    test_matrix = {
        "localhost": ["llama3.2:3b-instruct-q4_0"],
        "192.168.50.3": ["llama3.2:3b-instruct-q4_0", "llama3.1:8b-instruct-q4_0", "llama3.1:8b-instruct-q8_0"],
        "192.168.50.121": ["llama3.2:3b-instruct-q4_0", "llama3.1:8b-instruct-q4_0", "llama3.1:8b-instruct-q8_0"]
    }
    # One task per host: hosts run in parallel, while each host works
    # through its own model list sequentially inside test_host.
    per_host_batches = await asyncio.gather(
        *(asyncio.create_task(test_host(host, models))
          for host, models in test_matrix.items())
    )
    # Flatten the per-host result lists into a single list for reporting.
    all_results = [entry for batch in per_host_batches for entry in batch]
    print_report(all_results)
def main():
    """Synchronous entry point: drive the async benchmark to completion."""
    asyncio.run(main_async())
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()