predict-otron-9001/scripts/performance_test_inference.sh

#!/bin/bash
# Performance testing script for inference-engine
# This script sends a series of chat completion requests to measure performance

echo "===== Inference Engine Performance Test ====="
echo "Testing with varying prompt sizes to establish baseline performance"

# Test parameters
SERVER_URL="http://localhost:8080"
ITERATIONS=3  # Lower than embeddings test due to longer processing time
TEST_SIZES=("small" "medium" "large")
MAX_TOKENS=50 # Limit token generation to keep tests shorter

# Define test prompts of different sizes
SMALL_PROMPT="What is the capital of France?"
MEDIUM_PROMPT="Explain the basic principles of machine learning. Include a brief overview of supervised and unsupervised learning."
LARGE_PROMPT="Write a comprehensive explanation of large language models. Include details about their architecture, training process, capabilities, limitations, and potential future developments. Also discuss ethical considerations around their use and deployment."

# Create a temp directory for test results
TEMP_DIR=$(mktemp -d)
echo "Storing test results in: $TEMP_DIR"

# Function to run a single test and record the results
run_test() {
    local size=$1
    local prompt=$2
    local output_file="${TEMP_DIR}/${size}_results.txt"

    echo -e "\n===== Testing $size prompt =====" | tee -a "$output_file"
    echo "Prompt length: $(echo "$prompt" | wc -w) words" | tee -a "$output_file"

    # Prepare JSON payload
    local json_payload=$(cat <<EOF
{
  "model": "gemma-3-1b-it",
  "messages": [{"role": "user", "content": "$prompt"}],
  "max_tokens": $MAX_TOKENS
}
EOF
)
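    # Note: interpolating $prompt directly into the heredoc breaks if the
    # prompt contains double quotes or backslashes. The prompts above are
    # safe, but a more robust sketch (assuming jq is installed) would be:
    #   local json_payload=$(jq -n --arg prompt "$prompt" --argjson max "$MAX_TOKENS" \
    #     '{model: "gemma-3-1b-it", messages: [{role: "user", content: $prompt}], max_tokens: $max}')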

    # Run the test multiple times
    for i in $(seq 1 $ITERATIONS); do
        echo "Iteration $i:" | tee -a "$output_file"

        # Send the chat completion request and measure time
        start_time=$(date +%s.%N)
        response=$(curl -s -X POST \
            -H "Content-Type: application/json" \
            -d "$json_payload" \
            "$SERVER_URL/v1/chat/completions")
        end_time=$(date +%s.%N)

        # Calculate elapsed time
        elapsed=$(echo "$end_time - $start_time" | bc)
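        # Note: sub-second precision via %N requires GNU date; macOS/BSD date
        # prints a literal "N", which breaks the bc subtraction. A portable
        # fallback (an assumption, not part of the original script) would be:
        #   python3 -c 'import time; print(time.time())'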

        # Extract response content length
        content_length=$(echo "$response" | grep -o '"content":"[^"]*"' | wc -c)
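        # The grep pattern stops at the first quote character, so it
        # undercounts content containing escaped quotes. A sketch with jq
        # (assuming an OpenAI-style chat completion response shape) would be:
        #   content_length=$(echo "$response" | jq -r '.choices[0].message.content' | wc -c)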

        # Check if we got an error (for troubleshooting)
        error_check=$(echo "$response" | grep -c "error")
        if [ "$error_check" -gt 0 ]; then
            echo "  Error in response: $response" | tee -a "$output_file"
        fi

        # Log results
        echo "  Time: ${elapsed}s, Response size: $content_length bytes" | tee -a "$output_file"

        # Add a delay between requests to allow server to recover
        sleep 2
    done

    # Calculate average time
    avg_time=$(grep "Time:" "$output_file" | grep -v "Error" | awk '{sum+=$2} END {if(NR>0) print sum/NR; else print "N/A"}')
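    # awk coerces the "${elapsed}s," field to a number, silently dropping the
    # trailing "s," suffix, so summing $2 works as intended here.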
echo "Average time for $size prompt: ${avg_time}s" | tee -a "$output_file"
}

# Make sure the server is running
echo "Checking if the server is running..."
if ! curl -s "$SERVER_URL" > /dev/null; then
    echo "Server doesn't appear to be running at $SERVER_URL"
    echo "Please start the server with: ./run_server.sh"
    exit 1
fi
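
# Note: this only checks that something answers at $SERVER_URL; the root path
# may return an error while the API itself is healthy. A stricter probe
# (assuming the server exposes an OpenAI-style /v1/models endpoint) would be:
#   curl -sf --max-time 5 "$SERVER_URL/v1/models" > /dev/null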

# Run tests for each prompt size
echo "Starting performance tests..."
run_test "small" "$SMALL_PROMPT"
run_test "medium" "$MEDIUM_PROMPT"
run_test "large" "$LARGE_PROMPT"

echo -e "\n===== Performance Test Summary ====="
for size in "${TEST_SIZES[@]}"; do
    # Field 6 already carries the "s" suffix from the per-test log line,
    # so it is reported as-is rather than appending a second "s".
    avg=$(grep "Average time for $size prompt" "${TEMP_DIR}/${size}_results.txt" | awk '{print $6}')
    if [ -z "$avg" ]; then
        avg="N/A (possible errors)"
    fi
    echo "$size prompt: $avg"
done

# Provide more detailed analysis if possible
echo -e "\n===== Performance Analysis ====="
echo "Note: The inference-engine response times include:"
echo " - Input prompt tokenization"
echo " - Model inference (token generation)"
echo " - Response post-processing"
echo "Check server logs for more detailed performance breakdown"
echo -e "\nDetailed results are available in: $TEMP_DIR"
echo "===== Test Complete ====="