Mirror of https://github.com/geoffsee/predict-otron-9001.git, synced 2025-09-08 22:46:44 +00:00
Refactor `apply_cached_repeat_penalty` for optimized caching and reuse, add extensive unit tests, and integrate special handling for Gemma-specific models (see the sketch below).

- Remove `test_request.sh`, deprecated functionality, and unused imports; introduce a new CLI tool (`cli.ts`) for testing the inference engine, and adjust handling of non-streaming and streaming chat completions.
- Add CPU fallback support for text generation when the primary device is unsupported.
- Introduce an `execute_with_fallback` method to handle device-compatibility and shape-mismatch errors.
- Extend unit tests to reproduce tensor shape-mismatch errors specific to certain model configurations.
- Increase HTTP timeout limits in the `curl_chat_stream.sh` script for reliable API testing; the chat completion endpoint now functions with gemma3 (non-streaming).
- Add a benchmarking guide with HTML reporting, a Leptos chat crate, and middleware for metrics tracking.
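The refactored `apply_cached_repeat_penalty` itself is not part of this diff, so the following is only a minimal, self-contained sketch of the general idea: apply a CTRL-style repetition penalty to the logits while caching which token ids have already been penalized, so repeated tokens in the context are processed only once per step. The function name, the `HashSet` cache, and the penalty rule are assumptions for illustration, not the repository's actual implementation.

```rust
use std::collections::HashSet;

/// Simplified illustration of a repeat-penalty pass over raw logits.
/// `penalized` caches which token ids were already adjusted, so repeated
/// occurrences of the same token in the context are processed only once.
fn apply_repeat_penalty_cached(
    logits: &mut [f32],
    penalty: f32,
    context_tokens: &[u32],
    penalized: &mut HashSet<u32>,
) {
    if (penalty - 1.0).abs() < f32::EPSILON {
        return; // a penalty of 1.0 is a no-op
    }
    for &token in context_tokens {
        // Skip tokens we have already penalized during this step.
        if !penalized.insert(token) {
            continue;
        }
        if let Some(logit) = logits.get_mut(token as usize) {
            // CTRL-style repetition penalty: shrink positive logits,
            // push negative logits further down.
            *logit = if *logit >= 0.0 { *logit / penalty } else { *logit * penalty };
        }
    }
}

fn main() {
    let mut logits = vec![2.0, -1.0, 0.5, 3.0];
    let mut penalized = HashSet::new();
    apply_repeat_penalty_cached(&mut logits, 1.5, &[0, 3, 0], &mut penalized);
    println!("{logits:?}"); // tokens 0 and 3 are each penalized exactly once
}
```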
@@ -13,8 +13,6 @@ pub use text_generation::TextGeneration;
 pub use token_output_stream::TokenOutputStream;
 pub use server::{AppState, create_router};
 
-use axum::{Json, http::StatusCode, routing::post, Router};
-use serde_json;
 use std::env;
 use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
 
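The `axum` and `serde_json` imports removed above appear to have been needed only by the stub `/v1/chat/completions` handler deleted in the next hunk; the endpoint is now served by the real router re-exported via `pub use server::{AppState, create_router}`. Since the commit message notes that the chat completion endpoint works with gemma3 in non-streaming mode, a hypothetical client call might look like the sketch below; the port, model id, and the use of `reqwest`/`serde_json` as client-side crates are assumptions, not part of this repository.

```rust
// Assumed Cargo deps: reqwest = { version = "0.12", features = ["blocking", "json"] }, serde_json = "1"
use serde_json::json;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical local address; adjust to wherever the inference engine listens.
    let body = json!({
        "model": "gemma-3-1b-it",   // illustrative model id
        "stream": false,            // the commit notes non-streaming works with gemma3
        "messages": [
            { "role": "user", "content": "Say hello in one sentence." }
        ]
    });

    let resp = reqwest::blocking::Client::new()
        .post("http://localhost:8080/v1/chat/completions")
        .json(&body)
        .send()?;

    println!("status: {}", resp.status());
    println!("body:   {}", resp.text()?);
    Ok(())
}
```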
@@ -45,26 +43,3 @@ pub fn init_tracing() {
         .with(tracing_subscriber::fmt::layer())
         .init();
 }
-
-/// Create a simplified inference router that returns appropriate error messages
-/// indicating that full model loading is required for production use
-pub fn create_inference_router() -> Router {
-    Router::new()
-        .route("/v1/chat/completions", post(simplified_chat_completions))
-}
-
-async fn simplified_chat_completions(
-    axum::Json(request): axum::Json<serde_json::Value>,
-) -> Result<Json<serde_json::Value>, (StatusCode, Json<serde_json::Value>)> {
-    // Return the same error message as the actual server implementation
-    // to indicate that full inference functionality requires proper model initialization
-    Err((
-        StatusCode::BAD_REQUEST,
-        Json(serde_json::json!({
-            "error": {
-                "message": "The OpenAI API is currently not supported due to compatibility issues with the tensor operations. Please use the CLI mode instead with: cargo run --bin inference-engine -- --prompt \"Your prompt here\"",
-                "type": "unsupported_api"
-            }
-        })),
-    ))
-}
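The deleted stub above rejected every request, pointing users at the CLI because of tensor-operation compatibility issues. The commit message addresses that class of failure with an `execute_with_fallback` method that retries text generation on the CPU when the primary device is unsupported or a shape mismatch occurs. The repository's actual signature is not shown here, so the following is a minimal, hypothetical sketch of the fallback pattern; the `Device` and `InferenceError` types and their variants are invented for illustration.

```rust
/// Hypothetical device-fallback helper: run the operation on the primary
/// device first, and retry on CPU only for the error classes the commit
/// message calls out (unsupported device, tensor shape mismatch).
#[derive(Clone, Copy, Debug, PartialEq)]
enum Device {
    Gpu,
    Cpu,
}

#[derive(Debug)]
enum InferenceError {
    UnsupportedDevice(String),
    ShapeMismatch(String),
}

fn execute_with_fallback<T>(
    primary: Device,
    mut op: impl FnMut(Device) -> Result<T, InferenceError>,
) -> Result<T, InferenceError> {
    match op(primary) {
        // Retry on CPU only for recoverable device/shape problems.
        Err(InferenceError::UnsupportedDevice(_)) | Err(InferenceError::ShapeMismatch(_))
            if primary != Device::Cpu =>
        {
            op(Device::Cpu)
        }
        other => other,
    }
}

fn main() {
    let result = execute_with_fallback(Device::Gpu, |device| {
        if device == Device::Gpu {
            Err(InferenceError::ShapeMismatch("gemma3 attention shapes".into()))
        } else {
            Ok("generated text on CPU")
        }
    });
    println!("{result:?}");
}
```

Restricting the retry to those two error classes keeps genuine bugs from being silently masked by a slower CPU path.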