feat: do not redact input on output guardrail intervention

leotac · leotac · commit 0ea2ce78b7ef · 2025-10-23T14:40:31.000+02:00
Change the behavior so that input is redacted only when an input guardrail
intervened, and output is redacted only when an output guardrail intervened.
diff --git a/src/strands/models/bedrock.py b/src/strands/models/bedrock.py
@@ -8,7 +8,7 @@
 import logging
 import os
 import warnings
-from typing import Any, AsyncGenerator, Callable, Iterable, Literal, Optional, Type, TypeVar, Union, cast
+from typing import Any, AsyncGenerator, Callable, Iterable, Literal, Optional, Tuple, Type, TypeVar, Union, cast
 
 import boto3
 from botocore.config import Config as BotocoreConfig
@@ -518,7 +518,7 @@ def _format_request_message_content(self, content: ContentBlock) -> dict[str, An
 
         raise TypeError(f"content_type=<{next(iter(content))}> | unsupported type")
 
-    def _has_blocked_guardrail(self, guardrail_data: dict[str, Any]) -> bool:
+    def _has_blocked_guardrail(self, guardrail_data: dict[str, Any]) -> Tuple[bool, bool]:
         """Check if guardrail data contains any blocked policies.
 
         Args:
@@ -530,25 +530,27 @@ def _has_blocked_guardrail(self, guardrail_data: dict[str, Any]) -> bool:
         input_assessment = guardrail_data.get("inputAssessment", {})
         output_assessments = guardrail_data.get("outputAssessments", {})
 
+        blocked_input, blocked_output = False, False
+
         # Check input assessments
         if any(self._find_detected_and_blocked_policy(assessment) for assessment in input_assessment.values()):
-            return True
+            blocked_input = True
 
         # Check output assessments
         if any(self._find_detected_and_blocked_policy(assessment) for assessment in output_assessments.values()):
-            return True
+            blocked_output = True
 
-        return False
+        return blocked_input, blocked_output
 
-    def _generate_redaction_events(self) -> list[StreamEvent]:
+    def _generate_redaction_events(self, redact_input: bool, redact_output: bool) -> list[StreamEvent]:
         """Generate redaction events based on configuration.
 
         Returns:
             List of redaction events to yield.
         """
         events: list[StreamEvent] = []
 
-        if self.config.get("guardrail_redact_input", True):
+        if redact_input and self.config.get("guardrail_redact_input", True):
             logger.debug("Redacting user input due to guardrail.")
             events.append(
                 {
@@ -560,7 +562,7 @@ def _generate_redaction_events(self) -> list[StreamEvent]:
                 }
             )
 
-        if self.config.get("guardrail_redact_output", False):
+        if redact_output and self.config.get("guardrail_redact_output", False):
             logger.debug("Redacting assistant output due to guardrail.")
             events.append(
                 {
@@ -669,9 +671,9 @@ def _stream(
                         and "guardrail" in chunk["metadata"]["trace"]
                     ):
                         guardrail_data = chunk["metadata"]["trace"]["guardrail"]
-                        if self._has_blocked_guardrail(guardrail_data):
-                            for event in self._generate_redaction_events():
-                                callback(event)
+                        blocked_input, blocked_output = self._has_blocked_guardrail(guardrail_data)
+                        for event in self._generate_redaction_events(blocked_input, blocked_output):
+                            callback(event)
 
                     # Track if we see tool use events
                     if "contentBlockStart" in chunk and chunk["contentBlockStart"].get("start", {}).get("toolUse"):
@@ -697,12 +699,10 @@ def _stream(
                 for event in self._convert_non_streaming_to_streaming(response):
                     callback(event)
 
-                if (
-                    "trace" in response
-                    and "guardrail" in response["trace"]
-                    and self._has_blocked_guardrail(response["trace"]["guardrail"])
-                ):
-                    for event in self._generate_redaction_events():
+                if "trace" in response and "guardrail" in response["trace"]:
+                    guardrail_data = response["trace"]["guardrail"]
+                    blocked_input, blocked_output = self._has_blocked_guardrail(guardrail_data)
+                    for event in self._generate_redaction_events(blocked_input, blocked_output):
                         callback(event)
 
         except ClientError as e:
diff --git a/tests/strands/models/test_bedrock.py b/tests/strands/models/test_bedrock.py
@@ -687,7 +687,7 @@ async def test_stream_stream_output_guardrails(
 
 
 @pytest.mark.asyncio
-async def test_stream_output_guardrails_redacts_input_and_output(
+async def test_stream_output_guardrails_redacts_output(
     bedrock_client, model, messages, tool_spec, model_id, additional_request_fields, alist
 ):
     model.update_config(guardrail_redact_output=True)
@@ -735,7 +735,6 @@ async def test_stream_output_guardrails_redacts_input_and_output(
 
     tru_chunks = await alist(response)
     exp_chunks = [
-        {"redactContent": {"redactUserContentMessage": "[User input redacted.]"}},
         {"redactContent": {"redactAssistantContentMessage": "[Assistant output redacted.]"}},
         metadata_event,
     ]
@@ -1070,7 +1069,10 @@ async def test_stream_input_guardrails(bedrock_client, alist, messages):
 
 @pytest.mark.asyncio
 async def test_stream_output_guardrails(bedrock_client, alist, messages):
-    """Test stream method with streaming=False."""
+    """Test stream method with streaming=False.
+
+    Output guardrail should not redact the input.
+    """
     bedrock_client.converse.return_value = {
         "output": {"message": {"role": "assistant", "content": [{"text": "test"}]}},
         "trace": {
@@ -1113,7 +1115,6 @@ async def test_stream_output_guardrails(bedrock_client, alist, messages):
                 }
             }
         },
-        {"redactContent": {"redactUserContentMessage": "[User input redacted.]"}},
     ]
     assert tru_events == exp_events
 
@@ -1122,7 +1123,7 @@ async def test_stream_output_guardrails(bedrock_client, alist, messages):
 
 
 @pytest.mark.asyncio
-async def test_stream_output_guardrails_redacts_output(bedrock_client, alist, messages):
+async def test_stream_output_guardrails_does_not_redact_input(bedrock_client, alist, messages):
     """Test stream method with streaming=False."""
     bedrock_client.converse.return_value = {
         "output": {"message": {"role": "assistant", "content": [{"text": "test"}]}},
@@ -1166,7 +1167,6 @@ async def test_stream_output_guardrails_redacts_output(bedrock_client, alist, me
                 }
             }
         },
-        {"redactContent": {"redactUserContentMessage": "[User input redacted.]"}},
     ]
     assert tru_events == exp_events
 
diff --git a/tests_integ/test_bedrock_guardrails.py b/tests_integ/test_bedrock_guardrails.py
@@ -105,6 +105,7 @@ def test_guardrail_input_intervention(boto_session, bedrock_guardrail):
         guardrail_id=bedrock_guardrail,
         guardrail_version="DRAFT",
         boto_session=boto_session,
+        guardrail_redact_input_message="Redacted.",
     )
 
     agent = Agent(model=bedrock_model, system_prompt="You are a helpful assistant.", callback_handler=None)
@@ -116,6 +117,7 @@ def test_guardrail_input_intervention(boto_session, bedrock_guardrail):
     assert str(response1).strip() == BLOCKED_INPUT
     assert response2.stop_reason != "guardrail_intervened"
     assert str(response2).strip() != BLOCKED_INPUT
+    assert agent.messages[0]["content"][0]["text"] == "Redacted."
 
 
 @pytest.mark.parametrize("processing_mode", ["sync", "async"])
@@ -193,6 +195,10 @@ def test_guardrail_output_intervention_redact_output(bedrock_guardrail, processi
         assert REDACT_MESSAGE in str(response1)
         assert response2.stop_reason != "guardrail_intervened"
         assert REDACT_MESSAGE not in str(response2)
+        # Input is not redacted because this was an output intervention
+        assert agent.messages[0]["content"][0]["text"] != REDACT_MESSAGE
+        # Output correctly redacted
+        assert agent.messages[1]["content"][0]["text"] == REDACT_MESSAGE
     else:
         cactus_returned_in_response1_blocked_by_input_guardrail = BLOCKED_INPUT in str(response2)
         cactus_blocked_in_response1_allows_next_response = (