
Commit 2c679e8

feat: add llm_time_to_first_token_seconds and llm_tokens_per_second metrics (#1440)
> [!NOTE]
> Introduce `llm_time_to_first_token_seconds` and `llm_tokens_per_second` metrics, instrument OpenAI/Anthropic streaming to emit them, and add Grafana panels/docs/queries.
>
> - **Backend Observability**:
>   - Add histograms `llm_time_to_first_token_seconds` and `llm_tokens_per_second` in `platform/backend/src/llm-metrics.ts` with init/cleanup, label handling, and buckets.
>   - New reporters: `reportTimeToFirstToken` and `reportTokensPerSecond`.
> - **LLM Proxies (Streaming)**:
>   - `routes/proxy/openai.ts` and `routes/proxy/anthropic.ts`: capture the first stream chunk to record TTFT; compute and report tokens/sec at stream end from usage and elapsed time (a hedged sketch of this pattern appears after the `llm-metrics.ts` diff below).
> - **Tests**:
>   - Extend `llm-metrics.test.ts` with cases for TTFT/tokens-per-second and initialization paths; minor auth plugin test update to mock permission success for the API-key flow.
> - **Docs**:
>   - `docs/pages/platform-observability.md`: document the new metrics and add PromQL examples; minor YAML quoting fix.
> - **Grafana Dashboard**:
>   - `platform/dev/grafana/dashboards/platform.json`: add timeseries panels for TTFT (p50/p95) and Tokens/sec (p50/p95); adjust layout/legend formatting.
>
> <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 137b2da.</sup>
1 parent d910ce3 · commit 2c679e8

7 files changed: +684 additions, −59 deletions

docs/pages/platform-observability.md

Lines changed: 29 additions & 3 deletions
````diff
@@ -4,7 +4,7 @@ category: Archestra Platform
 order: 5
 ---

-<!--
+<!--
 Check ../docs_writer_prompt.md before changing this file.

 This document is human-built, shouldn't be updated with AI. Don't change anything here.
@@ -42,6 +42,8 @@ The endpoint `http://localhost:9050/metrics` exposes Prometheus-formatted metric
 - `llm_request_duration_seconds` - LLM API request duration by provider, profile_id, profile_name, and status code
 - `llm_tokens_total` - Token consumption by provider, profile_id, profile_name, and type (input/output)
 - `llm_blocked_tool_total` - Counter of tool calls blocked by tool invocation policies, grouped by provider, profile_id, and profile_name
+- `llm_time_to_first_token_seconds` - Time to first token (TTFT) for streaming requests, by provider, profile_id, profile_name, and model. Helps developers choose models with lower initial response latency.
+- `llm_tokens_per_second` - Output tokens per second throughput, by provider, profile_id, profile_name, and model. Allows comparing model response speeds for latency-sensitive applications.

 > **Note:** The `agent_id` and `agent_name` labels are deprecated and will be removed in a future release. Please migrate your dashboards and alerts to use `profile_id` and `profile_name` instead. During the transition period, both label variants are emitted.

@@ -156,9 +158,9 @@ Add the following to your `prometheus.yml`:

 ```yaml
 scrape_configs:
-  - job_name: 'archestra-backend'
+  - job_name: "archestra-backend"
     static_configs:
-      - targets: ['localhost:9050'] # Platform API base URL
+      - targets: ["localhost:9050"] # Platform API base URL
     scrape_interval: 15s
     metrics_path: /metrics
 ```
@@ -229,4 +231,28 @@ Here are some PromQL queries for Grafana charts to get you started:
   sum(rate(llm_request_duration_seconds_count{status_code!~"2.."}[5m])) by (profile_name) / sum(rate(llm_request_duration_seconds_count[5m])) by (profile_name)
   ```

+- Time to first token (TTFT) p95 by model:
+
+  ```promql
+  histogram_quantile(0.95, sum(rate(llm_time_to_first_token_seconds_bucket[5m])) by (model, le))
+  ```
+
+- Average time to first token by provider:
+
+  ```promql
+  sum(rate(llm_time_to_first_token_seconds_sum[5m])) by (provider) / sum(rate(llm_time_to_first_token_seconds_count[5m])) by (provider)
+  ```
+
+- Tokens per second throughput p50 by model:
+
+  ```promql
+  histogram_quantile(0.50, sum(rate(llm_tokens_per_second_bucket[5m])) by (model, le))
+  ```
+
+- Average tokens per second by provider and model:
+
+  ```promql
+  sum(rate(llm_tokens_per_second_sum[5m])) by (provider, model) / sum(rate(llm_tokens_per_second_count[5m])) by (provider, model)
+  ```
+
 > **Note:** The `agent_name` label in PromQL queries is deprecated. Please migrate to `profile_name` for new dashboards and alerts.
````
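The "average" PromQL examples above divide a histogram's `_sum` by its `_count`. For a quick local sanity check of the new series without running Prometheus, a small script against the documented `/metrics` endpoint can compute the same ratio. This is a hedged sketch (Node 18+ with top-level `await` assumed), not part of the commit:

```ts
// Fetch the raw Prometheus exposition text from the endpoint documented above.
const body = await (await fetch("http://localhost:9050/metrics")).text();

// Sum all samples of a given series name across labels,
// e.g. "llm_time_to_first_token_seconds_sum".
const sumSeries = (name: string): number =>
  body
    .split("\n")
    .filter((line) => line.startsWith(name))
    .reduce((total, line) => total + Number(line.trim().split(/\s+/).pop()), 0);

const ttftSum = sumSeries("llm_time_to_first_token_seconds_sum");
const ttftCount = sumSeries("llm_time_to_first_token_seconds_count");

// Mirrors the "average time to first token" query: sum / count.
console.log(
  ttftCount > 0 ? `mean TTFT: ${(ttftSum / ttftCount).toFixed(3)}s` : "no TTFT samples yet",
);
```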

platform/backend/src/auth/fastify-plugin/plugin.test.ts

Lines changed: 4 additions & 0 deletions
```diff
@@ -105,6 +105,10 @@ describe("authPlugin integration", () => {
         updatedAt: new Date(),
       } as ApiKey,
     });
+    mockHasPermission.mockResolvedValue({
+      success: true,
+      error: null,
+    });
     mockUserModel.getById.mockResolvedValue({
       id: "user1",
       name: "Test User",
```

platform/backend/src/llm-metrics.test.ts

Lines changed: 163 additions & 0 deletions
```diff
@@ -35,6 +35,8 @@ import {
   reportBlockedTools,
   reportLLMCost,
   reportLLMTokens,
+  reportTimeToFirstToken,
+  reportTokensPerSecond,
 } from "./llm-metrics";

 describe("getObservableFetch", () => {
@@ -627,3 +629,164 @@ describe("reportBlockedTools with model", () => {
     );
   });
 });
+
+describe("reportTimeToFirstToken", () => {
+  let testAgent: Agent;
+
+  beforeEach(async ({ makeAgent }) => {
+    vi.clearAllMocks();
+    testAgent = await makeAgent();
+    initializeMetrics([]);
+  });
+
+  test("records time to first token with model", () => {
+    reportTimeToFirstToken("openai", testAgent, "gpt-4", 0.5);
+
+    expect(histogramObserve).toHaveBeenCalledWith(
+      {
+        provider: "openai",
+        agent_id: testAgent.id,
+        agent_name: testAgent.name,
+        profile_id: testAgent.id,
+        profile_name: testAgent.name,
+        model: "gpt-4",
+      },
+      0.5,
+    );
+  });
+
+  test("records time to first token without model", () => {
+    reportTimeToFirstToken("anthropic", testAgent, undefined, 0.25);
+
+    expect(histogramObserve).toHaveBeenCalledWith(
+      {
+        provider: "anthropic",
+        agent_id: testAgent.id,
+        agent_name: testAgent.name,
+        profile_id: testAgent.id,
+        profile_name: testAgent.name,
+        model: "unknown",
+      },
+      0.25,
+    );
+  });
+
+  test("skips reporting for invalid TTFT value", () => {
+    reportTimeToFirstToken("openai", testAgent, "gpt-4", 0);
+    reportTimeToFirstToken("openai", testAgent, "gpt-4", -1);
+
+    expect(histogramObserve).not.toHaveBeenCalled();
+  });
+
+  test("records TTFT for different providers", () => {
+    reportTimeToFirstToken("gemini", testAgent, "gemini-pro", 0.3);
+
+    expect(histogramObserve).toHaveBeenCalledWith(
+      {
+        provider: "gemini",
+        agent_id: testAgent.id,
+        agent_name: testAgent.name,
+        profile_id: testAgent.id,
+        profile_name: testAgent.name,
+        model: "gemini-pro",
+      },
+      0.3,
+    );
+  });
+});
+
+describe("reportTokensPerSecond", () => {
+  let testAgent: Agent;
+
+  beforeEach(async ({ makeAgent }) => {
+    vi.clearAllMocks();
+    testAgent = await makeAgent();
+    initializeMetrics([]);
+  });
+
+  test("records tokens per second with model", () => {
+    // 100 tokens in 2 seconds = 50 tokens/sec
+    reportTokensPerSecond("openai", testAgent, "gpt-4", 100, 2);
+
+    expect(histogramObserve).toHaveBeenCalledWith(
+      {
+        provider: "openai",
+        agent_id: testAgent.id,
+        agent_name: testAgent.name,
+        profile_id: testAgent.id,
+        profile_name: testAgent.name,
+        model: "gpt-4",
+      },
+      50,
+    );
+  });
+
+  test("records tokens per second without model", () => {
+    // 150 tokens in 3 seconds = 50 tokens/sec
+    reportTokensPerSecond("anthropic", testAgent, undefined, 150, 3);
+
+    expect(histogramObserve).toHaveBeenCalledWith(
+      {
+        provider: "anthropic",
+        agent_id: testAgent.id,
+        agent_name: testAgent.name,
+        profile_id: testAgent.id,
+        profile_name: testAgent.name,
+        model: "unknown",
+      },
+      50,
+    );
+  });
+
+  test("skips reporting for zero output tokens", () => {
+    reportTokensPerSecond("openai", testAgent, "gpt-4", 0, 2);
+
+    expect(histogramObserve).not.toHaveBeenCalled();
+  });
+
+  test("skips reporting for zero duration", () => {
+    reportTokensPerSecond("openai", testAgent, "gpt-4", 100, 0);
+
+    expect(histogramObserve).not.toHaveBeenCalled();
+  });
+
+  test("skips reporting for negative duration", () => {
+    reportTokensPerSecond("openai", testAgent, "gpt-4", 100, -1);
+
+    expect(histogramObserve).not.toHaveBeenCalled();
+  });
+
+  test("calculates correct tokens/sec for fast response", () => {
+    // 50 tokens in 0.5 seconds = 100 tokens/sec
+    reportTokensPerSecond("gemini", testAgent, "gemini-pro", 50, 0.5);
+
+    expect(histogramObserve).toHaveBeenCalledWith(
+      {
+        provider: "gemini",
+        agent_id: testAgent.id,
+        agent_name: testAgent.name,
+        profile_id: testAgent.id,
+        profile_name: testAgent.name,
+        model: "gemini-pro",
+      },
+      100,
+    );
+  });
+
+  test("calculates correct tokens/sec for slow response", () => {
+    // 200 tokens in 10 seconds = 20 tokens/sec
+    reportTokensPerSecond("anthropic", testAgent, "claude-3", 200, 10);
+
+    expect(histogramObserve).toHaveBeenCalledWith(
+      {
+        provider: "anthropic",
+        agent_id: testAgent.id,
+        agent_name: testAgent.name,
+        profile_id: testAgent.id,
+        profile_name: testAgent.name,
+        model: "claude-3",
+      },
+      20,
+    );
+  });
+});
```
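For context, the `histogramObserve` spy asserted on above is defined earlier in `llm-metrics.test.ts` and is not part of this diff. A hypothetical wiring, purely an assumption about the existing setup and shown only to make the assertions readable, could look like:

```ts
import { vi } from "vitest";

// vi.hoisted makes the spy visible inside the hoisted vi.mock factory below.
const { histogramObserve } = vi.hoisted(() => ({ histogramObserve: vi.fn() }));

// Replace prom-client so every Histogram created by llm-metrics.ts routes its
// observations through the shared spy. The Counter/register shapes are guesses.
vi.mock("prom-client", () => ({
  default: {
    Histogram: vi.fn(() => ({ observe: histogramObserve })),
    Counter: vi.fn(() => ({ inc: vi.fn() })),
    register: { removeSingleMetric: vi.fn() },
  },
}));
```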

platform/backend/src/llm-metrics.ts

Lines changed: 91 additions & 2 deletions
```diff
@@ -25,6 +25,8 @@ let llmRequestDuration: client.Histogram<string>;
 let llmTokensCounter: client.Counter<string>;
 let llmBlockedToolCounter: client.Counter<string>;
 let llmCostTotal: client.Counter<string>;
+let llmTimeToFirstToken: client.Histogram<string>;
+let llmTokensPerSecond: client.Histogram<string>;

 // Store current label keys for comparison
 let currentLabelKeys: string[] = [];
@@ -50,7 +52,9 @@ export function initializeMetrics(labelKeys: string[]): void {
     llmRequestDuration &&
     llmTokensCounter &&
     llmBlockedToolCounter &&
-    llmCostTotal
+    llmCostTotal &&
+    llmTimeToFirstToken &&
+    llmTokensPerSecond
   ) {
     logger.info(
       "Metrics already initialized with same label keys, skipping reinitialization",
@@ -74,6 +78,12 @@ export function initializeMetrics(labelKeys: string[]): void {
     if (llmCostTotal) {
      client.register.removeSingleMetric("llm_cost_total");
     }
+    if (llmTimeToFirstToken) {
+      client.register.removeSingleMetric("llm_time_to_first_token_seconds");
+    }
+    if (llmTokensPerSecond) {
+      client.register.removeSingleMetric("llm_tokens_per_second");
+    }
   } catch (_error) {
     // Ignore errors if metrics don't exist
   }
@@ -117,8 +127,26 @@ export function initializeMetrics(labelKeys: string[]): void {
     labelNames: [...baseLabelNames, ...nextLabelKeys],
   });

+  llmTimeToFirstToken = new client.Histogram({
+    name: "llm_time_to_first_token_seconds",
+    help: "Time to first token in seconds (streaming latency)",
+    labelNames: [...baseLabelNames, ...nextLabelKeys],
+    // Buckets optimized for TTFT - typically faster than full response
+    buckets: [0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10],
+  });
+
+  llmTokensPerSecond = new client.Histogram({
+    name: "llm_tokens_per_second",
+    help: "Output tokens per second throughput",
+    labelNames: [...baseLabelNames, ...nextLabelKeys],
+    // Buckets for tokens/sec throughput - typical range 10-200 tokens/sec
+    buckets: [5, 10, 25, 50, 75, 100, 150, 200, 300],
+  });
+
   logger.info(
-    `Metrics initialized with ${nextLabelKeys.length} agent label keys: ${nextLabelKeys.join(", ")}`,
+    `Metrics initialized with ${
+      nextLabelKeys.length
+    } agent label keys: ${nextLabelKeys.join(", ")}`,
   );
 }

@@ -223,6 +251,67 @@ export function reportLLMCost(
   llmCostTotal.inc(buildMetricLabels(agent, { provider }, model), cost);
 }

+/**
+ * Reports time to first token (TTFT) for streaming LLM requests.
+ * This metric helps application developers understand streaming latency
+ * and choose models with lower initial response times.
+ * @param provider The LLM provider
+ * @param agent The agent/profile making the request
+ * @param model The model name
+ * @param ttftSeconds Time to first token in seconds
+ */
+export function reportTimeToFirstToken(
+  provider: SupportedProvider,
+  agent: Agent,
+  model: string | undefined,
+  ttftSeconds: number,
+): void {
+  if (!llmTimeToFirstToken) {
+    logger.warn("LLM metrics not initialized, skipping TTFT reporting");
+    return;
+  }
+  if (ttftSeconds <= 0) {
+    logger.warn("Invalid TTFT value, must be positive");
+    return;
+  }
+  llmTimeToFirstToken.observe(
+    buildMetricLabels(agent, { provider }, model),
+    ttftSeconds,
+  );
+}
+
+/**
+ * Reports tokens per second throughput for LLM requests.
+ * This metric allows comparing model response speeds and helps
+ * developers choose models for latency-sensitive applications.
+ * @param provider The LLM provider
+ * @param agent The agent/profile making the request
+ * @param model The model name
+ * @param outputTokens Number of output tokens generated
+ * @param durationSeconds Total request duration in seconds
+ */
+export function reportTokensPerSecond(
+  provider: SupportedProvider,
+  agent: Agent,
+  model: string | undefined,
+  outputTokens: number,
+  durationSeconds: number,
+): void {
+  if (!llmTokensPerSecond) {
+    logger.warn("LLM metrics not initialized, skipping tokens/sec reporting");
+    return;
+  }
+  if (durationSeconds <= 0 || outputTokens <= 0) {
+    // Skip reporting if no output tokens or invalid duration
+    return;
+  }
+  const tokensPerSecond = outputTokens / durationSeconds;
+  llmTokensPerSecond.observe(
+    buildMetricLabels(agent, { provider }, model),
+    tokensPerSecond,
+  );
+}
+
 /**
  * Returns a fetch wrapped in observability. Use it as OpenAI or Anthropic provider custom fetch implementation.
  */
```
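The commit also instruments the streaming proxies (`routes/proxy/openai.ts`, `routes/proxy/anthropic.ts`) to call the reporters defined above, but those diffs are not shown on this page. As a hedged sketch only — the handler shape, import paths, `Agent` type source, and usage field names are illustrative assumptions, not the commit's actual proxy code — the pattern looks roughly like this:

```ts
import { performance } from "node:perf_hooks";

// Illustrative import paths; the real proxy code in this commit may differ.
import { reportTimeToFirstToken, reportTokensPerSecond } from "../../llm-metrics";
import type { Agent } from "../../types";
import type { SupportedProvider } from "../../llm-metrics";

async function observeStream(
  provider: SupportedProvider,
  agent: Agent,
  model: string | undefined,
  // Assumed chunk shape: providers typically attach usage to the final chunk(s).
  stream: AsyncIterable<{ usage?: { completion_tokens?: number } }>,
): Promise<void> {
  const startedAt = performance.now();
  let sawFirstChunk = false;
  let outputTokens = 0;

  for await (const chunk of stream) {
    if (!sawFirstChunk) {
      sawFirstChunk = true;
      // TTFT: seconds from request start until the first streamed chunk arrives.
      reportTimeToFirstToken(provider, agent, model, (performance.now() - startedAt) / 1000);
    }
    if (chunk.usage?.completion_tokens) {
      outputTokens = chunk.usage.completion_tokens;
    }
  }

  // Tokens/sec: output tokens over total elapsed stream time; the reporter
  // itself skips non-positive values, matching the guards shown above.
  reportTokensPerSecond(provider, agent, model, outputTokens, (performance.now() - startedAt) / 1000);
}
```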
