Refactor apply_cached_repeat_penalty for optimized caching and reuse, add extensive unit tests, and integrate special handling for gemma-specific models.

Removed `test_request.sh`, deprecated functionality, and unused imports; introduced a new CLI tool (`cli.ts`) for testing inference engine and adjusted handling of non-streaming/streaming chat completions. - Add CPU fallback support for text generation when primary device is unsupported - Introduce `execute_with_fallback` method to handle device compatibility and shape mismatch errors - Extend unit tests to reproduce tensor shape mismatch errors specific to model configurations - Increase HTTP timeout limits in `curl_chat_stream.sh` script for reliable API testing chat completion endpoint functions with gemma3 (no streaming) Add benchmarking guide with HTML reporting, Leptos chat crate, and middleware for metrics tracking
2025-09-08 22:46:44 +00:00 · 2025-08-26 01:30:26 -04:00
parent 7dd23213c9
commit 8338750beb
64 changed files with 14997 additions and 220 deletions
--- a/crates/legacy-inference-engine/api_test.html
+++ b/crates/legacy-inference-engine/api_test.html
@@ -0,0 +1,295 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>OpenAI-Compatible API Tester</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            max-width: 800px;
+            margin: 0 auto;
+            padding: 20px;
+            line-height: 1.6;
+        }
+        h1, h2 {
+            color: #333;
+        }
+        .container {
+            margin-bottom: 20px;
+        }
+        textarea {
+            width: 100%;
+            height: 150px;
+            padding: 10px;
+            margin-bottom: 10px;
+            border: 1px solid #ddd;
+            border-radius: 4px;
+            font-family: monospace;
+        }
+        button {
+            background-color: #4CAF50;
+            color: white;
+            padding: 10px 15px;
+            border: none;
+            border-radius: 4px;
+            cursor: pointer;
+            font-size: 16px;
+        }
+        button:hover {
+            background-color: #45a049;
+        }
+        pre {
+            background-color: #f5f5f5;
+            padding: 15px;
+            border-radius: 4px;
+            overflow-x: auto;
+            white-space: pre-wrap;
+        }
+        .response {
+            margin-top: 20px;
+        }
+        .error {
+            color: red;
+        }
+        .settings {
+            display: flex;
+            flex-wrap: wrap;
+            gap: 10px;
+            margin-bottom: 15px;
+        }
+        .settings div {
+            display: flex;
+            flex-direction: column;
+        }
+        label {
+            margin-bottom: 5px;
+            font-weight: bold;
+        }
+        input {
+            padding: 8px;
+            border: 1px solid #ddd;
+            border-radius: 4px;
+        }
+        .examples {
+            margin-top: 30px;
+        }
+        .example-btn {
+            background-color: #2196F3;
+            margin-right: 10px;
+            margin-bottom: 10px;
+        }
+        .example-btn:hover {
+            background-color: #0b7dda;
+        }
+    </style>
+</head>
+<body>
+    <h1>OpenAI-Compatible API Tester</h1>
+    <p>Use this page to test the OpenAI-compatible chat completions endpoint of the local inference engine.</p>
+    
+    <div class="container">
+        <h2>Request Settings</h2>
+        <div class="settings">
+            <div>
+                <label for="serverUrl">Server URL:</label>
+                <input type="text" id="serverUrl" value="http://localhost:3777" />
+            </div>
+            <div>
+                <label for="model">Model:</label>
+                <input type="text" id="model" value="gemma-3-1b-it" />
+            </div>
+            <div>
+                <label for="maxTokens">Max Tokens:</label>
+                <input type="number" id="maxTokens" value="150" />
+            </div>
+            <div>
+                <label for="temperature">Temperature:</label>
+                <input type="number" id="temperature" value="0.7" step="0.1" min="0" max="2" />
+            </div>
+            <div>
+                <label for="topP">Top P:</label>
+                <input type="number" id="topP" value="0.9" step="0.1" min="0" max="1" />
+            </div>
+        </div>
+        
+        <h2>Request Body</h2>
+        <textarea id="requestBody">{
+  "model": "gemma-3-1b-it",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Hello, how are you today?"
+    }
+  ],
+  "max_tokens": 150,
+  "temperature": 0.7,
+  "top_p": 0.9
+}</textarea>
+        <button id="sendRequest">Send Request</button>
+        
+        <div class="examples">
+            <h3>Example Requests</h3>
+            <button class="example-btn" id="example1">Basic Question</button>
+            <button class="example-btn" id="example2">Multi-turn Conversation</button>
+            <button class="example-btn" id="example3">Creative Writing</button>
+            <button class="example-btn" id="example4">Code Generation</button>
+        </div>
+        
+        <div class="response">
+            <h2>Response</h2>
+            <pre id="responseOutput">Response will appear here...</pre>
+        </div>
+    </div>
+
+    <script>
+        document.addEventListener('DOMContentLoaded', function() {
+            // Update request body when settings change
+            const serverUrlInput = document.getElementById('serverUrl');
+            const modelInput = document.getElementById('model');
+            const maxTokensInput = document.getElementById('maxTokens');
+            const temperatureInput = document.getElementById('temperature');
+            const topPInput = document.getElementById('topP');
+            const requestBodyTextarea = document.getElementById('requestBody');
+            const responseOutput = document.getElementById('responseOutput');
+            
+            // Function to update request body from settings
+            function updateRequestBodyFromSettings() {
+                try {
+                    const requestBody = JSON.parse(requestBodyTextarea.value);
+                    requestBody.model = modelInput.value;
+                    requestBody.max_tokens = parseInt(maxTokensInput.value);
+                    requestBody.temperature = parseFloat(temperatureInput.value);
+                    requestBody.top_p = parseFloat(topPInput.value);
+                    requestBodyTextarea.value = JSON.stringify(requestBody, null, 2);
+                } catch (error) {
+                    console.error("Error updating request body:", error);
+                }
+            }
+            
+            // Update settings when request body changes
+            function updateSettingsFromRequestBody() {
+                try {
+                    const requestBody = JSON.parse(requestBodyTextarea.value);
+                    if (requestBody.model) modelInput.value = requestBody.model;
+                    if (requestBody.max_tokens) maxTokensInput.value = requestBody.max_tokens;
+                    if (requestBody.temperature) temperatureInput.value = requestBody.temperature;
+                    if (requestBody.top_p) topPInput.value = requestBody.top_p;
+                } catch (error) {
+                    console.error("Error updating settings:", error);
+                }
+            }
+            
+            // Add event listeners for settings changes
+            modelInput.addEventListener('change', updateRequestBodyFromSettings);
+            maxTokensInput.addEventListener('change', updateRequestBodyFromSettings);
+            temperatureInput.addEventListener('change', updateRequestBodyFromSettings);
+            topPInput.addEventListener('change', updateRequestBodyFromSettings);
+            
+            // Add event listener for request body changes
+            requestBodyTextarea.addEventListener('blur', updateSettingsFromRequestBody);
+            
+            // Send request button
+            document.getElementById('sendRequest').addEventListener('click', async function() {
+                try {
+                    responseOutput.textContent = "Sending request...";
+                    const serverUrl = serverUrlInput.value;
+                    const endpoint = '/v1/chat/completions';
+                    const url = serverUrl + endpoint;
+                    
+                    const requestBody = JSON.parse(requestBodyTextarea.value);
+                    
+                    const response = await fetch(url, {
+                        method: 'POST',
+                        headers: {
+                            'Content-Type': 'application/json',
+                        },
+                        body: JSON.stringify(requestBody)
+                    });
+                    
+                    const data = await response.json();
+                    responseOutput.textContent = JSON.stringify(data, null, 2);
+                } catch (error) {
+                    responseOutput.textContent = "Error: " + error.message;
+                    responseOutput.classList.add('error');
+                }
+            });
+            
+            // Example requests
+            document.getElementById('example1').addEventListener('click', function() {
+                requestBodyTextarea.value = JSON.stringify({
+                    model: modelInput.value,
+                    messages: [
+                        {
+                            role: "user",
+                            content: "Who was the 16th president of the United States?"
+                        }
+                    ],
+                    max_tokens: parseInt(maxTokensInput.value),
+                    temperature: parseFloat(temperatureInput.value),
+                    top_p: parseFloat(topPInput.value)
+                }, null, 2);
+            });
+            
+            document.getElementById('example2').addEventListener('click', function() {
+                requestBodyTextarea.value = JSON.stringify({
+                    model: modelInput.value,
+                    messages: [
+                        {
+                            role: "system",
+                            content: "You are a helpful assistant that provides concise answers."
+                        },
+                        {
+                            role: "user",
+                            content: "What is machine learning?"
+                        },
+                        {
+                            role: "assistant",
+                            content: "Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed."
+                        },
+                        {
+                            role: "user",
+                            content: "Give me an example of a machine learning algorithm."
+                        }
+                    ],
+                    max_tokens: parseInt(maxTokensInput.value),
+                    temperature: parseFloat(temperatureInput.value),
+                    top_p: parseFloat(topPInput.value)
+                }, null, 2);
+            });
+            
+            document.getElementById('example3').addEventListener('click', function() {
+                requestBodyTextarea.value = JSON.stringify({
+                    model: modelInput.value,
+                    messages: [
+                        {
+                            role: "user",
+                            content: "Write a short poem about artificial intelligence."
+                        }
+                    ],
+                    max_tokens: parseInt(maxTokensInput.value),
+                    temperature: 0.9, // Higher temperature for creative tasks
+                    top_p: 0.9
+                }, null, 2);
+                temperatureInput.value = 0.9;
+            });
+            
+            document.getElementById('example4').addEventListener('click', function() {
+                requestBodyTextarea.value = JSON.stringify({
+                    model: modelInput.value,
+                    messages: [
+                        {
+                            role: "user",
+                            content: "Write a Python function to calculate the Fibonacci sequence up to n terms."
+                        }
+                    ],
+                    max_tokens: parseInt(maxTokensInput.value),
+                    temperature: 0.3, // Lower temperature for code generation
+                    top_p: 0.9
+                }, null, 2);
+                temperatureInput.value = 0.3;
+            });
+        });
+    </script>
+</body>
+</html>