stacklok · lukehinds · Mar 18, 2025 · Jan 23, 2025 · Mar 10, 2025 · Mar 10, 2025
diff --git a/prompts/default.yaml b/prompts/default.yaml
@@ -46,7 +46,7 @@ pii_redacted: |
   The context files contain redacted personally identifiable information (PII) that is represented by a UUID encased within <>. For example:
   - <123e4567-e89b-12d3-a456-426614174000>
   - <2d040296-98e9-4350-84be-fda4336057eb>
-  If you encounter any PII redacted with a UUID, DO NOT WARN the user about it. Simplt respond to the user request and keep the PII redacted and intact, using the same UUID.
+  If you encounter any PII redacted with a UUID, DO NOT WARN the user about it. Simply respond to the user request and keep the PII redacted and intact, using the same UUID.
 # Security-focused prompts
 security_audit: "You are a security expert conducting a thorough code review. Identify potential security vulnerabilities, suggest improvements, and explain security best practices."
 
@@ -56,6 +56,6 @@ red_team: "You are a red team member conducting a security assessment. Identify
 # BlueTeam prompts
 blue_team: "You are a blue team member conducting a security assessment. Identify security controls, misconfigurations, and potential vulnerabilities."
 
-# Per client prompts
+# Per client prompts
 client_prompts:
     kodu: "If malicious packages or leaked secrets are found, please end the task, sending the problems found embedded in <attempt_completion><result> tags"
diff --git a/src/codegate/config.py b/src/codegate/config.py
@@ -16,9 +16,9 @@
 
 # Default provider URLs
 DEFAULT_PROVIDER_URLS = {
-    "openai": "https://api.openai.com/v1",
-    "openrouter": "https://openrouter.ai/api/v1",
-    "anthropic": "https://api.anthropic.com/v1",
+    "openai": "https://api.openai.com",
+    "openrouter": "https://openrouter.ai/api",
+    "anthropic": "https://api.anthropic.com",
     "vllm": "http://localhost:8000",  # Base URL without /v1 path
     "ollama": "http://localhost:11434",  # Default Ollama server URL
     "lm_studio": "http://localhost:1234",

diff --git a/src/codegate/db/connection.py b/src/codegate/db/connection.py
@@ -123,6 +123,17 @@ def does_db_exist(self):
         return self._db_path.is_file()
 
 
+def row_from_model(model: BaseModel) -> dict:
+    return dict(
+        id=model.id,
+        timestamp=model.timestamp,
+        provider=model.provider,
+        request=model.request.json(exclude_defaults=True, exclude_unset=True),
+        type=model.type,
+        workspace_id=model.workspace_id,
+    )
+
+
 class DbRecorder(DbCodeGate):
     def __init__(self, sqlite_path: Optional[str] = None, *args, **kwargs):
         super().__init__(sqlite_path, *args, **kwargs)
@@ -133,7 +144,10 @@ async def _execute_update_pydantic_model(
         """Execute an update or insert command for a Pydantic model."""
         try:
             async with self._async_db_engine.begin() as conn:
-                result = await conn.execute(sql_command, model.model_dump())
+                row = model
+                if isinstance(model, BaseModel):
+                    row = model.model_dump()
+                result = await conn.execute(sql_command, row)
                 row = result.first()
                 if row is None:
                     return None
@@ -175,7 +189,8 @@ async def record_request(self, prompt_params: Optional[Prompt] = None) -> Option
                 RETURNING *
                 """
         )
-        recorded_request = await self._execute_update_pydantic_model(prompt_params, sql)
+        row = row_from_model(prompt_params)
+        recorded_request = await self._execute_update_pydantic_model(row, sql)
         # Uncomment to debug the recorded request
         # logger.debug(f"Recorded request: {recorded_request}")
         return recorded_request  # type: ignore
@@ -194,7 +209,8 @@ async def update_request(
                 RETURNING *
                 """
         )
-        updated_request = await self._execute_update_pydantic_model(prompt_params, sql)
+        row = row_from_model(prompt_params)
+        updated_request = await self._execute_update_pydantic_model(row, sql)
         # Uncomment to debug the recorded request
         # logger.debug(f"Recorded request: {recorded_request}")
         return updated_request  # type: ignore
@@ -217,7 +233,7 @@ async def record_outputs(
             output=first_output.output,
         )
         full_outputs = []
-        # Just store the model respnses in the list of JSON objects.
+        # Just store the model responses in the list of JSON objects.
         for output in outputs:
             full_outputs.append(output.output)
 
@@ -341,7 +357,7 @@ async def record_context(self, context: Optional[PipelineContext]) -> None:
                     f"Alerts: {len(context.alerts_raised)}."
                 )
         except Exception as e:
-            logger.error(f"Failed to record context: {context}.", error=str(e))
+            logger.error(f"Failed to record context: {context}.", error=str(e), exc_info=e)
 
     async def add_workspace(self, workspace_name: str) -> WorkspaceRow:
         """Add a new workspace to the DB.

diff --git a/src/codegate/db/fim_cache.py b/src/codegate/db/fim_cache.py
@@ -33,6 +33,18 @@ def __init__(self):
 
     def _extract_message_from_fim_request(self, request: str) -> Optional[str]:
         """Extract the user message from the FIM request"""
+        ### NEW CODE PATH ###
+        if not isinstance(request, str):
+            content_message = None
+            for message in request.get_messages():
+                for content in message.get_content():
+                    if content_message is None:
+                        content_message = content.get_text()
+                    else:
+                        logger.warning("Expected one user message, found multiple.")
+                        return None
+            return content_message
+
         try:
             parsed_request = json.loads(request)
         except Exception as e:

diff --git a/src/codegate/extract_snippets/body_extractor.py b/src/codegate/extract_snippets/body_extractor.py
@@ -9,6 +9,7 @@
     KoduCodeSnippetExtractor,
     OpenInterpreterCodeSnippetExtractor,
 )
+from codegate.types.common import MessageTypeFilter
 
 
 class BodyCodeSnippetExtractorError(Exception):
@@ -32,25 +33,22 @@ def _extract_from_user_messages(self, data: dict) -> set[str]:
             raise BodyCodeSnippetExtractorError("Code Extractor not set.")
 
         filenames: List[str] = []
-        for msg in data.get("messages", []):
-            if msg.get("role", "") == "user":
+        for msg in data.get_messages(filters=[MessageTypeFilter.USER]):
+            for content in msg.get_content():
                 extracted_snippets = self._snippet_extractor.extract_unique_snippets(
-                    msg.get("content")
+                    content.get_text(),
                 )
                 filenames.extend(extracted_snippets.keys())
         return set(filenames)
 
     def _extract_from_list_user_messages(self, data: dict) -> set[str]:
         filenames: List[str] = []
-        for msg in data.get("messages", []):
-            if msg.get("role", "") == "user":
-                msgs_content = msg.get("content", [])
-                for msg_content in msgs_content:
-                    if msg_content.get("type", "") == "text":
-                        extracted_snippets = self._snippet_extractor.extract_unique_snippets(
-                            msg_content.get("text")
-                        )
-                        filenames.extend(extracted_snippets.keys())
+        for msg in data.get_messages(filters=[MessageTypeFilter.USER]):
+            for content in msg.get_content():
+                extracted_snippets = self._snippet_extractor.extract_unique_snippets(
+                    content.get_text(),
+                )
+                filenames.extend(extracted_snippets.keys())
         return set(filenames)
 
     @abstractmethod
@@ -93,43 +91,27 @@ class OpenInterpreterBodySnippetExtractor(BodyCodeSnippetExtractor):
     def __init__(self):
         self._snippet_extractor = OpenInterpreterCodeSnippetExtractor()
 
-    def _is_msg_tool_call(self, msg: dict) -> bool:
-        return msg.get("role", "") == "assistant" and msg.get("tool_calls", [])
-
-    def _is_msg_tool_result(self, msg: dict) -> bool:
-        return msg.get("role", "") == "tool" and msg.get("content", "")
-
-    def _extract_args_from_tool_call(self, msg: dict) -> str:
-        """
-        Extract the arguments from the tool call message.
-        """
-        tool_calls = msg.get("tool_calls", [])
-        if not tool_calls:
-            return ""
-        return tool_calls[0].get("function", {}).get("arguments", "")
-
-    def _extract_result_from_tool_result(self, msg: dict) -> str:
-        """
-        Extract the result from the tool result message.
-        """
-        return msg.get("content", "")
-
     def extract_unique_filenames(self, data: dict) -> set[str]:
-        messages = data.get("messages", [])
-        if not messages:
-            return set()
-
         filenames: List[str] = []
-        for i_msg in range(len(messages) - 1):
-            msg = messages[i_msg]
-            next_msg = messages[i_msg + 1]
-            if self._is_msg_tool_call(msg) and self._is_msg_tool_result(next_msg):
-                tool_args = self._extract_args_from_tool_call(msg)
-                tool_response = self._extract_result_from_tool_result(next_msg)
-                extracted_snippets = self._snippet_extractor.extract_unique_snippets(
-                    f"{tool_args}\n{tool_response}"
-                )
-                filenames.extend(extracted_snippets.keys())
+        # Note: the previous version of this code used to analyze
+        # tool-call and tool-results pairs to ensure that the regex
+        # matched.
+        #
+        # Given it was not a business or functional requirement, but
+        # rather a technical decision to avoid adding more regexes,
+        # we decided to analyze contents on a per-message basis, to
+        # avoid creating more dependency on the behaviour of the
+        # coding assistant.
+        #
+        # We still filter only tool-calls and tool-results.
+        filters = [MessageTypeFilter.ASSISTANT, MessageTypeFilter.TOOL]
+        for msg in data.get_messages(filters=filters):
+            for content in msg.get_content():
+                if content.get_text() is not None:
+                    extracted_snippets = self._snippet_extractor.extract_unique_snippets(
+                        f"{content.get_text()}\n\nbackwards compatibility"
+                    )
+                    filenames.extend(extracted_snippets.keys())
         return set(filenames)
 
 

diff --git a/src/codegate/extract_snippets/message_extractor.py b/src/codegate/extract_snippets/message_extractor.py
@@ -279,10 +279,16 @@ def extract_snippets(self, message: str, require_filepath: bool = False) -> List
         """
         regexes = self._choose_regex(require_filepath)
         # Find all code block matches
+        if isinstance(message, str):
+            return [
+                self._get_snippet_for_match(match)
+                for regex in regexes
+                for match in regex.finditer(message)
+            ]
         return [
             self._get_snippet_for_match(match)
             for regex in regexes
-            for match in regex.finditer(message)
+            for match in regex.finditer(message.get_text())
         ]
 
     def extract_unique_snippets(self, message: str) -> Dict[str, CodeSnippet]:

diff --git a/src/codegate/llm_utils/__init__.py b/src/codegate/llm_utils/__init__.py