This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Commit 6405e64 (1 parent: 7212f18)

trying to fix llamacpp muxing

File tree: 5 files changed, +32 −11 lines

src/codegate/muxing/adapter.py

Lines changed: 13 additions & 0 deletions
@@ -9,6 +9,7 @@
 from litellm import ModelResponse
 from litellm.types.utils import Delta, StreamingChoices
 
+from codegate.config import Config
 from codegate.db import models as db_models
 from codegate.muxing import rulematcher
 from codegate.muxing.ollama_mappers import (
@@ -25,6 +26,16 @@ class MuxingAdapterError(Exception):
     pass
 
 
+
+# Note: this is yet another awful hack to get the correct folder where
+# llamacpp models are stored. This is currently retrieved inside the
+# providers, but it should probably be refactored and injected,
+# implementing a basic inversion-of-control pattern.
+def get_llamacpp_models_folder():
+    override = Config.get_config().provider_urls.get("llamacpp")
+    return override if override else "./codegate_volume/models"
+
+
 class BodyAdapter:
     """
     Format the body to the destination provider format.
@@ -42,6 +53,8 @@ def _get_provider_formatted_url(self, model_route: rulematcher.ModelRoute) -> st
             return urljoin(model_route.endpoint.endpoint, "/v1")
         if model_route.endpoint.provider_type == db_models.ProviderType.openrouter:
             return urljoin(model_route.endpoint.endpoint, "/api/v1")
+        if model_route.endpoint.provider_type == db_models.ProviderType.llamacpp:
+            return get_llamacpp_models_folder()
         return model_route.endpoint.endpoint
 
     def get_destination_info(self, model_route: rulematcher.ModelRoute) -> dict:
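For context on the new helper: when a route targets a llama.cpp model, the "destination URL" the muxer hands back is really a filesystem folder of .gguf files, taken from the `llamacpp` entry in the configured provider URLs or falling back to the bundled volume path. A minimal standalone sketch of that fallback, using a plain dict in place of codegate's `Config.get_config().provider_urls` (the dict shape here is an assumption for illustration):

```python
# Standalone sketch of the fallback in get_llamacpp_models_folder().
# `provider_urls` is a plain dict standing in for codegate's Config object.
DEFAULT_LLAMACPP_FOLDER = "./codegate_volume/models"


def resolve_llamacpp_models_folder(provider_urls: dict) -> str:
    # An explicit "llamacpp" override wins; otherwise use the volume path.
    override = provider_urls.get("llamacpp")
    return override if override else DEFAULT_LLAMACPP_FOLDER


print(resolve_llamacpp_models_folder({}))                           # ./codegate_volume/models
print(resolve_llamacpp_models_folder({"llamacpp": "/srv/models"}))  # /srv/models
```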

src/codegate/muxing/router.py

Lines changed: 12 additions & 8 deletions
@@ -147,6 +147,10 @@ async def route_to_dest_provider(
             completion_function = anthropic.acompletion
             from_openai = anthropic_from_openai
             to_openai = anthropic_to_openai
+        case ProviderType.llamacpp:
+            completion_function = provider._completion_handler.execute_completion
+            from_openai = identity
+            to_openai = identity
         case ProviderType.ollama:
             if is_fim_request:
                 completion_function = ollama.generate_streaming
@@ -227,15 +231,15 @@ async def _inner(
         new_request = from_openai(request)
         new_request.model = model
 
+        # Execute e.g. acompletion from Anthropic types
+        response = completion_handler(
+            new_request,
+            api_key,
+            base_url,
+        )
+
         # Wrap with an async generator that maps from
         # e.g. Anthropic types to OpenAI's.
-        return to_openai(
-            # Execute e.g. acompletion from Anthropic types
-            completion_handler(
-                new_request,
-                api_key,
-                base_url,
-            ),
-        )
+        return to_openai(response)
 
     return _inner
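`identity` is not defined in this diff; assuming it is a plain pass-through helper, the new `llamacpp` case means no request or response translation happens at the mux boundary, because the llama.cpp completion handler already consumes and produces OpenAI-shaped objects. A sketch of what that pass-through amounts to:

```python
from typing import Any


def identity(x: Any) -> Any:
    # Pass-through adapter: llama.cpp already speaks the OpenAI request/response
    # shapes, so no conversion (unlike the Anthropic or Ollama cases) is needed.
    return x


request = {"model": "example-model", "messages": [{"role": "user", "content": "hi"}]}
assert identity(request) is request  # nothing is copied or rewritten
```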

src/codegate/providers/base.py

Lines changed: 3 additions & 0 deletions
@@ -319,6 +319,9 @@ async def complete(
             is_fim_request=is_fim_request,
         )
 
+        import asyncio
+        if asyncio.iscoroutine(model_response):
+            model_response = await model_response
         # Pass the request through the output pipeline
         if not streaming:
            return await self._run_output_pipeline(input_pipeline_result.context, model_response)
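This guard exists because the mux path can now hand back either an awaitable (for example llama.cpp's `execute_completion` called without awaiting) or an async iterator for streaming responses; `asyncio.iscoroutine` lets `complete()` normalize the first case without touching the second. A small self-contained illustration of that distinction:

```python
import asyncio


async def returns_value():
    # Calling this produces a coroutine object that must be awaited.
    return {"object": "chat.completion"}


async def streams_chunks():
    # Calling this produces an async generator, which iscoroutine() rejects.
    yield {"object": "chat.completion.chunk"}


async def main():
    for result in (returns_value(), streams_chunks()):
        if asyncio.iscoroutine(result):
            print("coroutine ->", await result)
        else:
            print("async generator ->", [chunk async for chunk in result])


asyncio.run(main())
```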

src/codegate/providers/llamacpp/completion_handler.py

Lines changed: 3 additions & 2 deletions
@@ -50,8 +50,9 @@ async def chat_to_async_iterator(
 
 
 class LlamaCppCompletionHandler(BaseCompletionHandler):
-    def __init__(self):
+    def __init__(self, base_url):
         self.inference_engine = LlamaCppInferenceEngine()
+        self.base_url = base_url
 
     async def execute_completion(
         self,
@@ -64,7 +65,7 @@ async def execute_completion(
         """
         Execute the completion request with inference engine API
         """
-        model_path = f"{base_url}/{request.get_model()}.gguf"
+        model_path = f"{self.base_url}/{request.get_model()}.gguf"
 
         # Create a copy of the request dict and remove stream_options
         # Reason - Request error as JSON:
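The point of stashing `base_url` on the handler is that, when a request arrives through the mux, the `base_url` argument passed into `execute_completion` is whatever the route resolved to and may not be the models folder; the folder captured at construction time is what the .gguf path must be built from. A toy sketch of that path construction (class and model names are made up for illustration):

```python
class ToyLlamaCppHandler:
    """Toy stand-in for LlamaCppCompletionHandler, only to show the path logic."""

    def __init__(self, base_url: str):
        self.base_url = base_url

    def model_path(self, model: str, ignored_base_url: str = "") -> str:
        # The per-request base_url is ignored; the folder fixed at init is used.
        return f"{self.base_url}/{model}.gguf"


handler = ToyLlamaCppHandler("./codegate_volume/models")
print(handler.model_path("example-model"))
# ./codegate_volume/models/example-model.gguf
```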

src/codegate/providers/llamacpp/provider.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ def __init__(
             self.base_url = self._get_base_url()
         else:
             self.base_url = "./codegate_volume/models"
-        completion_handler = LlamaCppCompletionHandler()
+        completion_handler = LlamaCppCompletionHandler(self.base_url)
         super().__init__(
             None,
             None,
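End to end, the wiring is now: config override (or the `./codegate_volume/models` default) → provider `base_url` → completion handler → `.gguf` path, and the same default is what `get_llamacpp_models_folder()` in the muxing adapter falls back to, so mux routes and the provider resolve the same folder. A compressed sketch of that chain with toy classes (not the real codegate constructors):

```python
DEFAULT_FOLDER = "./codegate_volume/models"


def resolve_folder(provider_urls: dict) -> str:
    return provider_urls.get("llamacpp") or DEFAULT_FOLDER


class ToyHandler:
    def __init__(self, base_url: str):
        self.base_url = base_url


class ToyLlamaCppProvider:
    def __init__(self, provider_urls: dict):
        # Mirrors provider.py: resolve the folder once, then hand it to the handler.
        self.base_url = resolve_folder(provider_urls)
        self.completion_handler = ToyHandler(self.base_url)


provider = ToyLlamaCppProvider({"llamacpp": "/srv/gguf-models"})
print(provider.completion_handler.base_url)  # /srv/gguf-models
```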
