mirror of
https://github.com/geoffsee/predict-otron-9001.git
synced 2025-09-08 22:46:44 +00:00

Removed `test_request.sh`, deprecated functionality, and unused imports; introduced a new CLI tool (`cli.ts`) for testing the inference engine; and adjusted handling of non-streaming/streaming chat completions. - Add CPU fallback support for text generation when the primary device is unsupported - Introduce `execute_with_fallback` method to handle device compatibility and shape mismatch errors - Extend unit tests to reproduce tensor shape mismatch errors specific to model configurations - Increase HTTP timeout limits in `curl_chat_stream.sh` script for reliable API testing. Chat completion endpoint functions with gemma3 (no streaming). Add benchmarking guide with HTML reporting, Leptos chat crate, and middleware for metrics tracking.
50 lines
1.4 KiB
Bash
Executable File
50 lines
1.4 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Simple curl helper for non-streaming chat completions.
#
# Usage:
#   scripts/curl_chat.sh "Who was the 16th president of the United States?"
#   MODEL_ID=google/gemma-2b-it scripts/curl_chat.sh "Hello!"
#
# Arguments:
#   $1  prompt text (default: "What is the capital of France?")
#
# Environment (all optional):
#   SERVER_URL       base URL of the server           (default: http://localhost:8080)
#   MODEL_ID         model name sent in the request   (default: gemma-3-1b-it)
#   MAX_TOKENS       value of "max_tokens"            (default: 128)
#   CONNECT_TIMEOUT  connect timeout in seconds; also caps the preflight (default: 2)
#   MAX_TIME         total time limit in seconds for the chat request    (default: 20)
#
# Output:
#   stdout: the response body followed by a newline
#   stderr: [info] / [warn] / [hint] / [error] diagnostics
#
# Exit status:
#   2  MAX_TOKENS is not a valid JSON integer
#   7  preflight request to SERVER_URL failed
#   otherwise the script stops with curl's status if the chat request fails

set -euo pipefail

#######################################
# Escape a string so it can be placed between double quotes in a JSON document.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no surrounding quotes, no trailing newline)
#######################################
json_escape() {
  local raw=$1 escaped='' ch code i
  for ((i = 0; i < ${#raw}; i++)); do
    ch=${raw:i:1}
    case "$ch" in
      '"') escaped+='\"' ;;
      '\') escaped+='\\' ;;
      $'\n') escaped+='\n' ;;
      $'\r') escaped+='\r' ;;
      $'\t') escaped+='\t' ;;
      [[:cntrl:]])
        # Remaining control characters must be written as \uXXXX in JSON.
        printf -v code '%04x' "'$ch"
        escaped+="\\u${code}"
        ;;
      *) escaped+=$ch ;;
    esac
  done
  printf '%s' "$escaped"
}

SERVER_URL=${SERVER_URL:-http://localhost:8080}
MODEL_ID=${MODEL_ID:-gemma-3-1b-it}
PROMPT=${1:-"What is the capital of France?"}
MAX_TOKENS=${MAX_TOKENS:-128}
# Timeout controls (seconds)
CONNECT_TIMEOUT=${CONNECT_TIMEOUT:-2}
MAX_TIME=${MAX_TIME:-20}

# MAX_TOKENS is inserted into the body as a bare JSON number, so it has to be
# a plain non-negative integer without leading zeros.
json_int_re='^(0|[1-9][0-9]*)$'
if [[ ! "$MAX_TOKENS" =~ $json_int_re ]]; then
  printf '[error] MAX_TOKENS must be a non-negative integer, got: %s\n' \
    "$MAX_TOKENS" >&2
  exit 2
fi

# Diagnostics go to stderr so stdout carries only the response body.
cat >&2 <<EOF
[info] POST $SERVER_URL/v1/chat/completions
[info] model=$MODEL_ID, max_tokens=$MAX_TOKENS
[info] prompt=$PROMPT
[info] timeouts: connect=${CONNECT_TIMEOUT}s, max=${MAX_TIME}s
EOF

# Quick preflight to avoid long hangs when server is down.
# curl exits 0 for any HTTP response (no --fail), so a server that answers
# 404 on "/" still counts as reachable; only transport failures abort here.
if ! curl -sS -o /dev/null \
  --connect-timeout "$CONNECT_TIMEOUT" \
  --max-time "$CONNECT_TIMEOUT" \
  "$SERVER_URL/"; then
  echo "[warn] Server not reachable at $SERVER_URL (preflight failed)." >&2
  echo "[hint] Start it with ./run_server.sh or adjust SERVER_URL." >&2
  exit 7
fi

# Escape user-controlled strings before splicing them into the JSON body.
model_json=$(json_escape "$MODEL_ID")
prompt_json=$(json_escape "$PROMPT")

curl -sS -X POST \
  --connect-timeout "$CONNECT_TIMEOUT" \
  --max-time "$MAX_TIME" \
  -H "Content-Type: application/json" \
  "$SERVER_URL/v1/chat/completions" \
  -d @- <<JSON
{
  "model": "${model_json}",
  "messages": [
    {"role": "user", "content": "${prompt_json}"}
  ],
  "max_tokens": ${MAX_TOKENS},
  "stream": false
}
JSON

# Terminate the response with a newline for clean terminal output.
echo