Mirror of https://github.com/geoffsee/predict-otron-9001.git, synced 2025-09-08 22:46:44 +00:00.
Refactor `apply_cached_repeat_penalty` for optimized caching and reuse, add extensive unit tests, and integrate special handling for Gemma-specific models.
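A repeat penalty damps the logits of tokens that already appear in the generation context so the sampler is less likely to loop on itself. The project's actual `apply_cached_repeat_penalty` lives in the Rust inference engine and is not shown on this page; the TypeScript below is only a rough sketch of what caching can buy in this situation, with every name hypothetical: track the set of already-seen token ids incrementally so each decoding step scans only newly appended tokens rather than the whole context.

// Illustrative sketch only; the real implementation is Rust and may differ.
class RepeatPenaltyCache {
    private seen = new Set<number>();   // token ids already observed in the context
    private consumed = 0;               // how many context tokens are folded into `seen`

    apply(logits: number[], contextTokens: number[], penalty: number): number[] {
        // Fold in only the tokens appended since the previous call.
        for (let i = this.consumed; i < contextTokens.length; i++) {
            this.seen.add(contextTokens[i]);
        }
        this.consumed = contextTokens.length;

        // Usual repeat-penalty convention: divide positive logits, multiply negative ones.
        const out = logits.slice();
        for (const token of this.seen) {
            if (token < 0 || token >= out.length) continue;
            out[token] = out[token] > 0 ? out[token] / penalty : out[token] * penalty;
        }
        return out;
    }
}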
Other changes rolled into this commit:
- Remove `test_request.sh`, deprecated functionality, and unused imports; introduce a new CLI tool (`cli.ts`) for testing the inference engine and adjust handling of non-streaming and streaming chat completions.
- Add CPU fallback support for text generation when the primary device is unsupported.
- Introduce an `execute_with_fallback` method to handle device compatibility and shape mismatch errors (a sketch of the control flow follows this list).
- Extend unit tests to reproduce tensor shape mismatch errors specific to certain model configurations.
- Increase HTTP timeout limits in the `curl_chat_stream.sh` script for reliable API testing.
- Make the chat completion endpoint function with gemma3 (no streaming).
- Add a benchmarking guide with HTML reporting, a Leptos chat crate, and middleware for metrics tracking.
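The real `execute_with_fallback` is a Rust method in the inference engine; the TypeScript below is only a hedged sketch of the control flow described above, with hypothetical names and a hypothetical error check: run generation on the preferred device, and retry once on CPU when the failure looks like a device-support or tensor-shape problem.

// Sketch of the fallback control flow; not the project's Rust implementation.
type Device = "gpu" | "cpu";

async function executeWithFallback<T>(
    run: (device: Device) => Promise<T>,
    primary: Device = "gpu",
): Promise<T> {
    try {
        return await run(primary);
    } catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        // Only retry for errors that look like device-support or shape problems.
        const recoverable = /unsupported|not supported|shape mismatch/i.test(message);
        if (primary === "cpu" || !recoverable) throw err;
        console.warn(`Primary device failed (${message}); retrying on CPU`);
        return await run("cpu");
    }
}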
integration/openai-client-tests/local_openai.test.ts (Executable file, 43 lines added)
@@ -0,0 +1,43 @@
import OpenAI from "openai";
import {describe, test, expect} from "bun:test";

const supportedModels = ["gemma-3-1b-it"];


async function requestLocalOpenAI(model: string, userPrompt: string) {
    const openai = new OpenAI({
        baseURL: "http://localhost:8080/v1",
        apiKey: "not used",
    });
    try {
        return openai.chat.completions.create({
            model: model,
            max_tokens: 100,
            stream: true,
            messages: [
                {name: "assistant_1", role: "system", content: "I am a helpful assistant" },
                {name: "user_1", role: "user", content: userPrompt}
            ]
        });
    } catch (e) {
        console.error(e);
        throw e;
    }
}

describe("Local OpenAI Completions", () => {
    test("Should return a valid message", async () => {
        const model = supportedModels.pop();
        const userPrompt = "Who was the 16th president of the United States?";
        const response = await requestLocalOpenAI(model, userPrompt);

        const chunks = [];
        for await (const chunk of response) {
            console.log('Received chunk:', chunk);
            chunks.push(chunk);
        }

        expect(chunks.length).toBeGreaterThan(0);
    })
})
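Assuming Bun is installed and a local server exposing the OpenAI-compatible API is listening on http://localhost:8080 (the baseURL hard-coded above), this test would typically be run with `bun test integration/openai-client-tests/local_openai.test.ts`. Note that with `stream: true` the test only asserts that at least one chunk arrives; it makes no claim about the content of the completion.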