@@ -49,9 +49,75 @@ async def chat_to_async_iterator(
         yield StreamingChatCompletion(**item)


+ENGINE = LlamaCppInferenceEngine()
+
+
+async def complete(request, api_key, model_path):
+    stream = request.get_stream()
+    full_path = f"{model_path}/{request.get_model()}.gguf"
+    request_dict = request.dict(
+        exclude={
+            "best_of",
+            "frequency_penalty",
+            "n",
+            "stream_options",
+            "user",
+        }
+    )
+
+    response = await ENGINE.complete(
+        full_path,
+        Config.get_config().chat_model_n_ctx,
+        Config.get_config().chat_model_n_gpu_layers,
+        **request_dict,
+    )
+
+    if stream:
+        return completion_to_async_iterator(response)
+    # TODO: fix this; the non-streaming code path is broken
+    return LegacyCompletion(**response)
+
+
+async def chat(request, api_key, model_path):
+    stream = request.get_stream()
+    full_path = f"{model_path}/{request.get_model()}.gguf"
+    request_dict = request.dict(
+        exclude={
+            "audio",
+            "frequency_penalty",
+            "include_reasoning",
+            "metadata",
+            "max_completion_tokens",
+            "modalities",
+            "n",
+            "parallel_tool_calls",
+            "prediction",
+            "prompt",
+            "reasoning_effort",
+            "service_tier",
+            "store",
+            "stream_options",
+            "user",
+        }
+    )
+
+    response = await ENGINE.chat(
+        full_path,
+        Config.get_config().chat_model_n_ctx,
+        Config.get_config().chat_model_n_gpu_layers,
+        **request_dict,
+    )
+
+    if stream:
+        return chat_to_async_iterator(response)
+    else:
+        # TODO: fix this; the non-streaming code path is broken
+        return StreamingChatCompletion(**response)
+
+
 class LlamaCppCompletionHandler(BaseCompletionHandler):
     def __init__(self, base_url):
-        self.inference_engine = LlamaCppInferenceEngine()
+        self.inference_engine = ENGINE
         self.base_url = base_url

     async def execute_completion(
@@ -65,64 +131,15 @@ async def execute_completion(
         """
         Execute the completion request with inference engine API
         """
-        model_path = f"{self.base_url}/{request.get_model()}.gguf"
-
         # Create a copy of the request dict and remove stream_options
         # Reason - Request error as JSON:
         # {'error': "Llama.create_completion() got an unexpected keyword argument 'stream_options'"}
         if is_fim_request:
-            request_dict = request.dict(
-                exclude={
-                    "best_of",
-                    "frequency_penalty",
-                    "n",
-                    "stream_options",
-                    "user",
-                }
-            )
-
-            response = await self.inference_engine.complete(
-                model_path,
-                Config.get_config().chat_model_n_ctx,
-                Config.get_config().chat_model_n_gpu_layers,
-                **request_dict,
-            )
-
-            if stream:
-                return completion_to_async_iterator(response)
-            return LegacyCompletion(**response)
+            # base_url == model_path in this case
+            return await complete(request, api_key, self.base_url)
         else:
-            request_dict = request.dict(
-                exclude={
-                    "audio",
-                    "frequency_penalty",
-                    "include_reasoning",
-                    "metadata",
-                    "max_completion_tokens",
-                    "modalities",
-                    "n",
-                    "parallel_tool_calls",
-                    "prediction",
-                    "prompt",
-                    "reasoning_effort",
-                    "service_tier",
-                    "store",
-                    "stream_options",
-                    "user",
-                }
-            )
-
-            response = await self.inference_engine.chat(
-                model_path,
-                Config.get_config().chat_model_n_ctx,
-                Config.get_config().chat_model_n_gpu_layers,
-                **request_dict,
-            )
-
-            if stream:
-                return chat_to_async_iterator(response)
-            else:
-                return StreamingChatCompletion(**response)
+            # base_url == model_path in this case
+            return await chat(request, api_key, self.base_url)

     def _create_streaming_response(
         self,
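For context, a minimal, self-contained sketch of the pattern this commit introduces: one module-level engine instance shared by the free helper functions and the handler class, with the handler delegating to a helper and passing its base_url as the model directory. Everything below is stubbed for illustration; the stub engine, the n_ctx/n_gpu_layers values, the request dict, and main() are hypothetical, and only the shape of ENGINE, complete(), and LlamaCppCompletionHandler mirrors the diff above.

import asyncio


class LlamaCppInferenceEngine:
    """Stub engine; the real one presumably loads and caches .gguf models."""

    async def complete(self, model_path, n_ctx, n_gpu_layers, **request):
        # Pretend to run inference and return an OpenAI-style payload.
        return {"model": model_path, "choices": [{"text": "..."}]}


ENGINE = LlamaCppInferenceEngine()  # single shared instance, as in the diff


async def complete(request_dict, model_path):
    # Free helper (here for FIM requests): it delegates to the shared
    # engine, so every caller reuses any engine-level model cache.
    full_path = f"{model_path}/{request_dict.pop('model')}.gguf"
    return await ENGINE.complete(full_path, n_ctx=2048, n_gpu_layers=0, **request_dict)


class LlamaCppCompletionHandler:
    def __init__(self, base_url):
        # The handler no longer constructs its own engine; it aliases the
        # module-level singleton, matching `self.inference_engine = ENGINE`
        # in the diff.
        self.inference_engine = ENGINE
        self.base_url = base_url

    async def execute_completion(self, request_dict):
        # base_url doubles as the model directory, as the diff's
        # "base_url == model_path" comments note.
        return await complete(request_dict, self.base_url)


async def main():
    handler = LlamaCppCompletionHandler("./models")
    print(await handler.execute_completion({"model": "some-model", "prompt": "hi"}))


if __name__ == "__main__":
    asyncio.run(main())

Sharing a single engine this way means the module-level helpers and every handler instance hit the same cache, at the cost of making the engine implicit global state rather than an injected dependency.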