update docs

2025-09-08 22:46:44 +00:00 · 2025-08-28 12:54:09 -04:00
parent 0488bddfdb
commit d04340d9ac
18 changed files with 22 additions and 651 deletions
--- a/scripts/cli.ts
+++ b/scripts/cli.ts
@@ -0,0 +1,340 @@
+#!/usr/bin/env bun
+
+import OpenAI from "openai";
+import { parseArgs } from "util";
+
+// =====================
+// Config
+// =====================
+// Model id sent to the server when --model is not given
+const DEFAULT_MODEL = "gemma-3-1b-it";
+// Value passed as max_tokens on every chat completion request
+const DEFAULT_MAX_TOKENS = 256;
+
+// When true, every streamed chunk is logged as JSON and the spinner is skipped.
+// Keep this false to reduce log overhead during timing runs.
+const PRINT_CHUNK_DEBUG = false;
+
+// How many rows to show in the timing tables
+// (first chunks received / chunks with the largest gap before them)
+const SHOW_FIRST_N = 3;
+const SHOW_SLOWEST_N = 3;
+
+// =====================
+// Helpers
+// =====================
+// Current high-resolution timestamp in milliseconds (thin wrapper over performance.now())
+const now = () => performance.now();
+
+// Timing record for one streamed chunk, collected by main()
+type ChunkStat = {
+    // 1-based position of the chunk in the stream
+    index: number;
+    // Milliseconds between the timestamp taken just before the request and this chunk's arrival
+    tSinceRequestStartMs: number;
+    // Milliseconds since the previous chunk arrived; 0 for the first chunk
+    dtSincePrevMs: number;
+    // Length of the chunk's delta content string (0 when the chunk carried no content)
+    contentChars: number;
+};
+
+/**
+ * Prints the CLI usage text (options, examples, and how to start the server)
+ * to stdout. The default model shown comes from DEFAULT_MODEL.
+ */
+function printHelp() {
+    console.log(`
+Usage: ./cli.ts [options] [prompt]
+
+Simple CLI tool for testing the local OpenAI-compatible API server.
+
+Options:
+  --model <model>     Model to use (default: ${DEFAULT_MODEL})
+  --prompt <prompt>   The prompt to send (can also be provided as positional argument)
+  --list-models       List all available models from the server
+  --help              Show this help message
+
+Examples:
+  ./cli.ts "What is the capital of France?"
+  ./cli.ts --model gemma-3-1b-it --prompt "Hello, world!"
+  ./cli.ts --prompt "Who was the 16th president of the United States?"
+  ./cli.ts --list-models
+
+The server should be running at http://localhost:8080
+Start it with: ./run_server.sh
+`);
+}
+
+// Parse the command line once at module load.
+// `strict: false` keeps parseArgs from throwing on flags that are not declared
+// below; `allowPositionals` lets the prompt be passed as a bare argument
+// instead of via --prompt.
+const { values, positionals } = parseArgs({
+    args: process.argv.slice(2),
+    options: {
+        model: { type: "string" },
+        prompt: { type: "string" },
+        help: { type: "boolean" },
+        "list-models": { type: "boolean" },
+    },
+    strict: false,
+    allowPositionals: true,
+});
+
+/**
+ * Sends a streaming chat completion request to the local OpenAI-compatible
+ * server at http://localhost:8080/v1.
+ *
+ * @param model      Model id to request.
+ * @param userPrompt Text sent as the user message (a fixed system message is prepended).
+ * @returns The stream returned by `chat.completions.create`, to be consumed with `for await`.
+ * @throws Rethrows the underlying error after logging connection hints.
+ */
+async function requestLocalOpenAI(model: string, userPrompt: string) {
+    const openai = new OpenAI({
+        baseURL: "http://localhost:8080/v1",
+        apiKey: "not used",
+    });
+    try {
+        console.log("[DEBUG] Creating chat completion request...");
+        // `await` is required here: returning the bare promise would leave the
+        // try block before a rejection happens, so the catch below (and its
+        // hints) would never run for a failed request.
+        return await openai.chat.completions.create({
+            model,
+            max_tokens: DEFAULT_MAX_TOKENS,
+            stream: true,
+            messages: [
+                {
+                    role: "system",
+                    content: "You are a helpful assistant who responds thoughtfully and concisely.",
+                },
+                { role: "user", content: userPrompt },
+            ],
+        });
+    } catch (e: unknown) {
+        const message = e instanceof Error ? e.message : String(e);
+        console.error("[ERROR] Failed to connect to local OpenAI server:", message);
+        console.error("[HINT] Make sure the server is running at http://localhost:8080");
+        console.error("[HINT] Start it with: ./run_server.sh");
+        throw e;
+    }
+}
+
+/**
+ * Fetches the model list from the local server and prints one numbered entry
+ * per model (id, owner, creation time) followed by a total, or a notice when
+ * the list is empty. On failure it logs hints and rethrows the error.
+ */
+async function listModels() {
+    const client = new OpenAI({
+        baseURL: "http://localhost:8080/v1",
+        apiKey: "not used",
+    });
+    try {
+        const listing = await client.models.list();
+        console.log(`[INFO] Available models from http://localhost:8080/v1:`);
+        console.log("---");
+
+        const entries = listing.data;
+        if (!entries || entries.length === 0) {
+            console.log("No models found.");
+            return;
+        }
+
+        let position = 0;
+        for (const entry of entries) {
+            position += 1;
+            // `created` is multiplied by 1000 before being handed to Date
+            const createdAt = new Date(entry.created * 1000).toISOString();
+            console.log(`${position}. ${entry.id}`);
+            console.log(`   Owner: ${entry.owned_by}`);
+            console.log(`   Created: ${createdAt}`);
+            console.log("");
+        }
+        console.log(`Total: ${entries.length} models available`);
+    } catch (e: any) {
+        console.error("[ERROR] Failed to fetch models from local OpenAI server:", e.message);
+        console.error("[HINT] Make sure the server is running at http://localhost:8080");
+        console.error("[HINT] Start it with: ./run_server.sh");
+        throw e;
+    }
+}
+
+// =====================
+// Timing math
+// =====================
+/**
+ * Median of a list of numbers. Returns 0 for an empty list; for an even
+ * count, returns the mean of the two middle values. The input is not mutated.
+ */
+function median(nums: number[]) {
+    const count = nums.length;
+    if (count === 0) {
+        return 0;
+    }
+    const ordered = nums.slice().sort((x, y) => x - y);
+    const upper = Math.floor(count / 2);
+    if (count % 2 === 1) {
+        return ordered[upper];
+    }
+    return (ordered[upper - 1] + ordered[upper]) / 2;
+}
+
+/**
+ * Quantile `q` of a list of numbers, linearly interpolated between the two
+ * nearest sorted values. Returns 0 for an empty list. The input is not mutated.
+ */
+function quantile(nums: number[], q: number) {
+    if (nums.length === 0) {
+        return 0;
+    }
+    const ordered = nums.slice().sort((x, y) => x - y);
+    const position = (ordered.length - 1) * q;
+    const lower = Math.floor(position);
+    const fraction = position - lower;
+    const next = ordered[lower + 1];
+    if (next === undefined) {
+        // No neighbour above: nothing to interpolate towards
+        return ordered[lower];
+    }
+    return ordered[lower] + fraction * (next - ordered[lower]);
+}
+
+/** Formats a millisecond value with one decimal place and a " ms" suffix. */
+function ms(n: number) {
+    const rounded = n.toFixed(1);
+    return rounded + " ms";
+}
+
+// =====================
+// Main
+// =====================
+/** Best-effort text for a thrown value: an Error's message, otherwise its string form. */
+function describeError(error: unknown): string {
+    return error instanceof Error ? error.message : String(error);
+}
+
+/** Formats one ChunkStat as a row for console.table. */
+function toTimingRow(c: ChunkStat) {
+    return {
+        chunk: c.index,
+        "t since request": `${c.tSinceRequestStartMs.toFixed(1)} ms`,
+        "dt since prev": `${c.dtSincePrevMs.toFixed(1)} ms`,
+        "chars": c.contentChars,
+    };
+}
+
+/**
+ * CLI entry point.
+ *
+ * Handles --help and --list-models, otherwise sends the prompt to the local
+ * server, echoes the streamed answer to stdout while recording the arrival
+ * time of every chunk, and finally prints a timing summary plus two small
+ * tables (first chunks, slowest gaps).
+ *
+ * Exits the process with code 0 after --help / --list-models and with code 1
+ * when no prompt is given or a request fails.
+ */
+async function main() {
+    const tProgramStart = now();
+
+    if (values.help) {
+        printHelp();
+        process.exit(0);
+    }
+
+    if (values["list-models"]) {
+        try {
+            await listModels();
+            process.exit(0);
+        } catch (error: unknown) {
+            console.error("\n[ERROR] Failed to list models:", describeError(error));
+            process.exit(1);
+        }
+    }
+
+    // parseArgs runs with `strict: false`, so a flag given without a value may
+    // not be a string; only string values are accepted here.
+    const prompt = typeof values.prompt === "string" ? values.prompt : positionals[0];
+
+    if (!prompt) {
+        console.error("[ERROR] No prompt provided!");
+        printHelp();
+        process.exit(1);
+    }
+
+    const model =
+        typeof values.model === "string" && values.model !== "" ? values.model : DEFAULT_MODEL;
+
+    console.log(`[INFO] Using model: ${model}`);
+    console.log(`[INFO] Prompt: ${prompt}`);
+    console.log(`[INFO] Connecting to: http://localhost:8080/v1`);
+    console.log("---");
+
+    const tBeforeRequest = now();
+
+    try {
+        console.log("[DEBUG] Initiating request to OpenAI server...");
+        const response = await requestLocalOpenAI(model, prompt);
+        const tAfterCreate = now();
+
+        // Streaming handling + timing
+        let fullResponse = "";
+        let chunkCount = 0;
+
+        const chunkStats: ChunkStat[] = [];
+        let tFirstChunk: number | null = null;
+        let tPrevChunk: number | null = null;
+
+        console.log("[INFO] Waiting for model to generate response...");
+        let loadingInterval: ReturnType<typeof setInterval> | undefined;
+
+        // Stops the spinner and wipes its line. Safe to call repeatedly: after
+        // the first call it does nothing, so streamed text is never overwritten.
+        const stopSpinner = () => {
+            if (loadingInterval === undefined) return;
+            clearInterval(loadingInterval);
+            loadingInterval = undefined;
+            process.stdout.write('\r                      \r');
+        };
+
+        if (!PRINT_CHUNK_DEBUG) {
+            // Show loading animation only if not in debug mode
+            const loadingChars = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
+            let i = 0;
+            process.stdout.write('\r[INFO] Thinking  ');
+            loadingInterval = setInterval(() => {
+                process.stdout.write(`\r[INFO] Thinking ${loadingChars[i++ % loadingChars.length]} `);
+            }, 80);
+        } else {
+            console.log("[DEBUG] Starting to receive streaming response...");
+        }
+
+        try {
+            for await (const chunk of response) {
+                // Clear loading animation on first chunk (no-op afterwards)
+                stopSpinner();
+                const tNow = now();
+                chunkCount++;
+
+                // Extract content (delta) if present
+                const content = chunk.choices?.[0]?.delta?.content ?? "";
+                if (PRINT_CHUNK_DEBUG) {
+                    console.log(`[DEBUG] Received chunk #${chunkCount}:`, JSON.stringify(chunk));
+                    if (content) console.log(`[DEBUG] Chunk content: "${content}"`);
+                }
+
+                if (content) {
+                    process.stdout.write(content);
+                    fullResponse += content;
+                }
+
+                if (tFirstChunk === null) tFirstChunk = tNow;
+
+                const dtSincePrev = tPrevChunk === null ? 0 : tNow - tPrevChunk;
+                chunkStats.push({
+                    index: chunkCount,
+                    tSinceRequestStartMs: tNow - tBeforeRequest,
+                    dtSincePrevMs: dtSincePrev,
+                    contentChars: content.length,
+                });
+
+                tPrevChunk = tNow;
+            }
+        } finally {
+            // Also covers an empty or failed stream, where no chunk ever
+            // arrives: a live interval would keep animating over the output
+            // and keep the process from exiting.
+            stopSpinner();
+        }
+
+        // =========
+        // Summary
+        // =========
+        const tStreamEnd = now();
+        const totalChars = fullResponse.length;
+
+        console.log("\n---");
+        console.log(`[DEBUG] Stream completed after ${chunkCount} chunks`);
+        console.log(`[INFO] Response completed. Total length: ${totalChars} characters`);
+
+        // Build timing metrics
+        const ttfbMs = (tFirstChunk ?? tStreamEnd) - tAfterCreate; // time from create() resolved → first chunk
+        const createOverheadMs = tAfterCreate - tBeforeRequest;    // time spent awaiting create() promise
+        const totalSinceRequestMs = tStreamEnd - tBeforeRequest;   // from just before create() to last chunk
+        const streamDurationMs =
+            tFirstChunk === null ? 0 : tStreamEnd - tFirstChunk;
+
+        const gaps = chunkStats
+            .map((c) => c.dtSincePrevMs)
+            // ignore the first "gap" which is 0 by construction
+            .slice(1);
+
+        const avgGapMs = gaps.length ? gaps.reduce((a, b) => a + b, 0) / gaps.length : 0;
+        const medGapMs = median(gaps);
+        const p95GapMs = quantile(gaps, 0.95);
+
+        let maxGapMs = 0;
+        let maxGapAtChunk = 0;
+        for (let i = 0; i < gaps.length; i++) {
+            if (gaps[i] > maxGapMs) {
+                maxGapMs = gaps[i];
+                maxGapAtChunk = i + 2; // +1 to move from 0-based, +1 because we sliced starting at second chunk
+            }
+        }
+
+        // Pretty print summary
+        console.log("\n=== Timing Summary ===");
+        console.log(`create() await time:        ${ms(createOverheadMs)}`);
+        console.log(`TTFB (to 1st chunk):        ${ms(ttfbMs)}`);
+        console.log(`Stream duration:            ${ms(streamDurationMs)}`);
+        console.log(`End-to-end (req→last):      ${ms(totalSinceRequestMs)}`);
+        console.log(`Chunks:                     ${chunkCount}`);
+        console.log(`Total content chars:        ${totalChars}`);
+        console.log(`Avg chars/chunk:            ${(chunkCount ? totalChars / chunkCount : 0).toFixed(1)}`);
+        console.log(`Inter-chunk gap (avg):      ${ms(avgGapMs)}`);
+        console.log(`Inter-chunk gap (median):   ${ms(medGapMs)}`);
+        console.log(`Inter-chunk gap (p95):      ${ms(p95GapMs)}`);
+        if (gaps.length > 0) {
+            console.log(`Largest gap:                ${ms(maxGapMs)} (before chunk #${maxGapAtChunk})`);
+        }
+
+        // Small tables: first N and slowest N gaps
+        const firstRows = chunkStats.slice(0, SHOW_FIRST_N).map(toTimingRow);
+
+        const slowestRows = chunkStats
+            .slice(1) // skip first (no meaningful gap); also copies, so sort() leaves chunkStats intact
+            .sort((a, b) => b.dtSincePrevMs - a.dtSincePrevMs)
+            .slice(0, SHOW_SLOWEST_N)
+            .map(toTimingRow);
+
+        if (firstRows.length > 0) {
+            console.log("\n--- First chunk timings ---");
+            console.table(firstRows);
+        }
+
+        if (slowestRows.length > 0) {
+            console.log(`\n--- Slowest ${SHOW_SLOWEST_N} gaps ---`);
+            console.table(slowestRows);
+        }
+
+        const tProgramEnd = now();
+        console.log("\n=== Program Overhead ===");
+        console.log(`Total program runtime:      ${ms(tProgramEnd - tProgramStart)}`);
+
+    } catch (error: unknown) {
+        console.error("\n[ERROR] Request failed:", describeError(error));
+        process.exit(1);
+    }
+}
+
+// Run the main function. Any rejection that escapes main() is logged in full
+// and turned into exit code 1.
+main().catch((error) => {
+    console.error("[FATAL ERROR]:", error);
+    process.exit(1);
+});
--- a/scripts/run.sh
+++ b/scripts/run.sh
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-cargo run --bin ptron
--- a/scripts/run_server.sh
+++ b/scripts/run_server.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# Start the unified predict-otron-9000 server on port 8080
+export SERVER_PORT=${SERVER_PORT:-8080}
+export RUST_LOG=${RUST_LOG:-info}
+
+cargo run --bin predict-otron-9000 --release
--- a/scripts/test.sh
+++ b/scripts/test.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+# Script to test predict-otron-9000 server with 2 sequential CLI requests
+# Ensures proper cleanup of child processes on exit
+
+set -e  # Exit on any error
+
+# Function to cleanup background processes
+cleanup() {
+    echo "[INFO] Cleaning up background processes..."
+    if [[ -n "$SERVER_PID" ]]; then
+        echo "[INFO] Killing server process (PID: $SERVER_PID)"
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+    
+    # Kill any remaining cargo processes related to predict-otron-9000
+    pkill -f "predict-otron-9000" 2>/dev/null || true
+    
+    echo "[INFO] Cleanup complete"
+}
+
+# Set up trap to ensure cleanup on script exit
+trap cleanup EXIT INT TERM
+
+# Set environment variables
+export SERVER_PORT=${SERVER_PORT:-8080}
+export RUST_LOG=${RUST_LOG:-info}
+
+echo "[INFO] Starting predict-otron-9000 server in background..."
+
+# Start the server in background and capture its PID
+cargo run --bin predict-otron-9000 --release > server.log 2>&1 &
+SERVER_PID=$!
+
+echo "[INFO] Server started with PID: $SERVER_PID"
+
+# Function to check if server is ready
+check_server() {
+    curl -s -f http://localhost:8080/v1/models > /dev/null 2>&1
+}
+
+# Wait for server to be ready
+echo "[INFO] Waiting for server to be ready..."
+TIMEOUT=60  # 60 seconds timeout
+ELAPSED=0
+
+while ! check_server; do
+    if [[ $ELAPSED -ge $TIMEOUT ]]; then
+        echo "[ERROR] Server did not start within $TIMEOUT seconds"
+        exit 1
+    fi
+    sleep 2
+    ELAPSED=$((ELAPSED + 2))
+    echo "[INFO] Still waiting for server... (${ELAPSED}s elapsed)"
+done
+
+echo "[INFO] Server is ready!"
+
+# Run first CLI request
+echo "[INFO] Running first CLI request - listing models..."
+./cli.ts --list-models
+
+echo ""
+echo "[INFO] Running second CLI request - chat completion..."
+./cli.ts "What is 2+2?"
+
+echo ""
+echo "[INFO] Both CLI requests completed successfully!"
--- a/scripts/test_request.sh
+++ b/scripts/test_request.sh
@@ -1,69 +0,0 @@
-#!/bin/bash
-
-# Simple test script for inference-engine
-# This script sends a single chat completion request
-
-echo "===== Inference Engine Test ====="
-
-# Test parameters
-SERVER_URL="http://localhost:8080"  # Changed from 8080 to 3777 to match main.rs default port
-MAX_TOKENS=10
-PROMPT="What is the capital of France?"
-MODEL="${MODEL_ID:-gemma-2-2b-it}"  # Using gemma-2-2b-it as specified in the original test
-
-# Create a temp directory for test results
-TEMP_DIR=$(mktemp -d)
-echo "Storing test results in: $TEMP_DIR"
-
-# Prepare JSON payload
-json_payload=$(cat <<EOF
-{
-    "model": "$MODEL", 
-    "messages": [{"role": "user", "content": "$PROMPT"}],
-    "max_tokens": $MAX_TOKENS
-}
-EOF
-)
-
-# Make sure the server is running
-echo "Checking if the server is running..."
-if ! curl -s "$SERVER_URL" > /dev/null; then
-    echo "Server doesn't appear to be running at $SERVER_URL"
-    echo "Please start the server with: ./run_server.sh"
-    exit 1
-fi
-
-echo "Sending request..."
-
-# Send request and measure time
-start_time=$(date +%s.%N)
-
-# Send the chat completion request with 30 second timeout
-# Note: The gemma-2-2b-it model takes ~12.57 seconds per token on average
-# So even with MAX_TOKENS=10, the request might time out before completion
-# The timeout ensures the script doesn't hang indefinitely
-response=$(curl -s -X POST \
-    -H "Content-Type: application/json" \
-    -d "$json_payload" \
-    --max-time 30 \
-    "$SERVER_URL/v1/chat/completions")
-
-end_time=$(date +%s.%N)
-
-# Calculate elapsed time
-elapsed=$(echo "$end_time - $start_time" | bc)
-
-# Extract response content length
-content_length=$(echo "$response" | grep -o '"content":"[^"]*"' | wc -c)
-
-# Check if we got an error
-error_check=$(echo "$response" | grep -c "error")
-if [ "$error_check" -gt 0 ]; then
-    echo "Error in response: $response"
-fi
-
-# Log results
-echo "Time: ${elapsed}s, Response size: $content_length bytes"
-echo "Response: $response"
-
-echo -e "\nTest Complete"