Merge pull request #556 from buster-so/eval-style-changes

style fixes for eval files
Nate Kelley 2025-07-18 15:13:12 -06:00 committed by GitHub
commit 9dca49f0b8
3 changed files with 366 additions and 351 deletions

View File

@@ -51,7 +51,7 @@ export const usesExpectedPrecomputedMetricPrompt = `
 {{expected}}
 Return Y if the condition is met, N if it is not.
-`
+`;
 export const acceptableAnswersScorerPrompt = `
 You are an evaluator tasked with determining if the results returned by 'createMetrics' tool calls match any of the acceptable answers provided in the metadata. Your goal is to assess whether the actual metric results align with the expected acceptable answers, allowing for reasonable flexibility in formatting, ordering, and additional data.
@@ -117,7 +117,7 @@ export const acceptableAnswersScorerPrompt = `
 Return 'Y' if the metric results match any acceptable answer (with flexible matching applied).
 Return 'N' if no acceptable answers match the metric results.
 Return 'Y' if no 'createMetrics' tool calls are found (nothing to evaluate).
-`
+`;
 export const preferredAnswerScorerPrompt = `
 You are an evaluator tasked with determining if the results returned by 'createMetrics' tool calls match the preferred answer provided in the metadata. Your goal is to assess whether the actual metric results align with the specific preferred answer, representing the ideal response we want the system to provide. This scorer is more strict than acceptable answers since it evaluates against a single preferred outcome.
@@ -186,7 +186,7 @@ export const preferredAnswerScorerPrompt = `
 Return 'Y' if the metric results match the preferred answer (with flexible matching applied).
 Return 'N' if the preferred answer does not match the metric results.
 Return 'Y' if no 'createMetrics' or 'updateMetrics' tool calls are found (nothing to evaluate).
-`
+`;
 export const doneMessageMatchesSqlResultsPrompt = `
 You are evaluating whether the final response in the 'doneTool' tool call accurately reflects the results of the SQL query executed in the 'createMetrics' tool call. The output is a structured JSON array of messages representing a conversation, including user queries, assistant responses, tool calls, and tool results.
@@ -232,7 +232,7 @@ export const doneMessageMatchesSqlResultsPrompt = `
 - Do not include any text outside of the JSON object.
 Return Y if the condition is met, N if it is not.
-`
+`;
 export const checkUsesExecuteSQLToCreateMetricsPrompt = `
 You are evaluating a conversation between a user, an LLM, and tool calls/results to determine if any SQL query in an 'executeSql' tool call is nearly identical to the SQL query in a 'createMetrics' tool call. The goal is to check for redundant SQL queries where the model unnecessarily runs a query in 'executeSql' that matches the final query used in 'createMetrics'. If no 'executeSql' tool calls are present, the evaluation should return null.
@@ -279,4 +279,4 @@ export const checkUsesExecuteSQLToCreateMetricsPrompt = `
 The JSON output is:
 {{output}}
-`
+`;
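
Each exported string above is a mustache-style template: autoevals substitutes {{expected}} and {{output}} from the eval case before the grading model sees the prompt. The scorers file below wires these templates up via LLMClassifierFromTemplate; a minimal sketch of that pattern (the scorer name and template text here are illustrative, not from this repo):

import { LLMClassifierFromTemplate } from 'autoevals';

// Illustrative only: a Y/N grader built from a template string.
const exampleScorer = LLMClassifierFromTemplate({
  name: 'exampleScorer',
  promptTemplate: 'Expected: {{expected}}\nActual: {{output}}\nReturn Y if they match, N if not.',
  choiceScores: { Y: 1, N: 0 }, // map the grader's choice to a numeric score
  useCoT: true, // let the grader reason step-by-step before answering
  model: 'gpt-4.1',
});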

View File

@@ -1,361 +1,363 @@
import { LLMClassifierFromTemplate } from 'autoevals';
import {
  acceptableAnswersScorerPrompt,
  checkUsesExecuteSQLToCreateMetricsPrompt,
  doneMessageMatchesSqlResultsPrompt,
  preferredAnswerScorerPrompt,
  usesExpectedPrecomputedMetricPrompt,
} from './example_scorer_prompts';
// Checks if the output SQL uses the precomputed metric from the expected SQL in braintrust
export const usesExpectedPrecomputedMetric = LLMClassifierFromTemplate({
  name: 'usesExpectedPrecomputedMetric',
  promptTemplate: usesExpectedPrecomputedMetricPrompt,
  choiceScores: {
    Y: 1,
    N: 0,
  },
  useCoT: true,
  model: 'gpt-4.1',
});
// Checks if the createMetrics tool call output is one of the acceptable answers from the braintrust metadata
export const acceptableAnswersScorer = LLMClassifierFromTemplate({
  name: 'acceptableAnswersScorer',
  promptTemplate: acceptableAnswersScorerPrompt,
  choiceScores: {
    Y: 1,
    N: 0,
  },
  useCoT: true,
  model: 'gpt-4.1',
});
// Checks if the createMetrics tool call output is the preferred answer from the braintrust metadata
export const preferredAnswerScorer = LLMClassifierFromTemplate({
  name: 'preferredAnswerScorer',
  promptTemplate: preferredAnswerScorerPrompt,
  choiceScores: {
    Y: 1,
    N: 0,
  },
  useCoT: true,
  model: 'gpt-4.1',
});
// Checks if the response given to the user matches the actual metric output; used to catch hallucinations or cases where it pulls incorrect data but misreports it
export const doneMessageMatchesSqlResults = LLMClassifierFromTemplate({
  name: 'doneMessageMatchesSqlResults',
  promptTemplate: doneMessageMatchesSqlResultsPrompt,
  choiceScores: {
    Y: 1,
    N: 0,
  },
  useCoT: true,
  model: 'gpt-4.1',
});
//Checks to make sure that the model does not build the output SQL in ExecuteSQL, really just a cost-saving check
export const checkUsesExecuteSQLToCreateMetrics = LLMClassifierFromTemplate({
  name: 'checkUsesExecuteSQLToCreateMetrics',
  promptTemplate: checkUsesExecuteSQLToCreateMetricsPrompt,
  choiceScores: {
    Y: 1,
    N: 0,
  },
  useCoT: true,
});
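// A hedged usage sketch, assuming the autoevals scorer signature: each classifier above is an
// async scorer that Braintrust calls per eval case; autoevals fills the {{output}}/{{expected}}
// slots, the grader model answers Y or N, and choiceScores maps that choice to 1 or 0.
//   const result = await acceptableAnswersScorer({ output: messages, expected: undefined });
//   // result.score is 1 for 'Y', 0 for 'N'; useCoT makes the grader reason before choosing.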
//Makes sure that the todo list has the right format
export const todoMarkdownBoxes = ({ output }: { output: any[] }) => {
  try {
    const messages = Array.isArray(output) ? output : JSON.parse(output);
    const todoListMessages = messages.filter(
      (msg: any) =>
        msg.role === 'user' &&
        Array.isArray(msg.content) &&
        msg.content.some((c: any) => c.type === 'text' && c.text.includes('<todo_list>'))
    );
    const todoListMessage = todoListMessages[0];
    const todoContent = todoListMessage.content.find(
      (c: any) => c.type === 'text' && c.text.includes('<todo_list>')
    );
    const todoMatch = todoContent.text.match(/<todo_list>([\s\S]*?)<\/todo_list>/);
    const todoItems = todoMatch[1]
      .split('\n')
      .map((line: string) => line.trim())
      .filter((line: string) => line.length > 0) // Include all non-empty lines for checking
      .filter((line: string) => line !== '- Below are the items on your TODO list:');
    if (todoItems.length === 0) {
      return 0; // No TODO items
    }
    let allValid = true;
    todoItems.forEach((item: string) => {
      if (!item.startsWith('[ ]')) {
        allValid = false;
      }
    });
    return allValid ? 1 : 0; // Return 1 if all items start with [ ], 0 otherwise
  } catch (error) {
    console.error('Error in todoMarkdownBoxes scorer:', error);
    return null;
  }
};
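// Illustration (assumed message shape, matching the checks above): this scores 1 because
// every trimmed, non-empty item line starts with '[ ]':
//   [{ role: 'user', content: [{ type: 'text',
//      text: '<todo_list>\n[ ] Find the revenue table\n[ ] Write the metric SQL\n</todo_list>' }] }]
// A line like '[x] Write the metric SQL' would score 0, and a missing or malformed
// <todo_list> block lands in the catch and returns null (the scorer abstains).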
//Makes sure that executeSQL is always followed by either another executeSQL or a sequentialThinking tool call
export const executeSqlFollowedByValidTool = ({ output }: { output: any }) => {
  try {
    // const op = output.result.outputMessages;
    // const messages = Array.isArray(op) ? op : JSON.parse(op);
    const messages = Array.isArray(output) ? output : JSON.parse(output);
    // Find all executeSql tool calls
    const executeSqlCalls = messages
      .map((msg: any, index: number) => ({
        msg,
        index,
      }))
      .filter(
        ({ msg }: { msg: any }) =>
          msg.role === 'assistant' &&
          Array.isArray(msg.content) &&
          msg.content.some((c: any) => c.type === 'tool-call' && c.toolName === 'executeSql')
      );
    // If no executeSql calls, pass by default
    if (executeSqlCalls.length === 0) {
      return 1;
    }
    for (const { index } of executeSqlCalls) {
      // Find the next assistant message with a tool call
      let nextToolIndex = index + 1;
      while (nextToolIndex < messages.length) {
        const nextMsg = messages[nextToolIndex];
        if (
          nextMsg.role === 'assistant' &&
          Array.isArray(nextMsg.content) &&
          nextMsg.content.some((c: any) => c.type === 'tool-call')
        ) {
          const nextToolCall = nextMsg.content.find((c: any) => c.type === 'tool-call');
          const nextToolName = nextToolCall?.toolName;
          // Check if the next tool is either executeSql or sequentialThinking
          if (nextToolName === 'executeSql' || nextToolName === 'sequentialThinking') {
            break;
          } else {
            return 0; // Fail if next tool is neither executeSql nor sequentialThinking
          }
        }
        nextToolIndex++;
      }
      // If no next tool call is found, pass (no invalid transition occurred)
    }
    return 1;
  } catch (error) {
    console.error('Error in executeSqlFollowedByValidTool scorer:', error);
    return null;
  }
};
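// Illustration of the transitions this enforces (derived from the loop above):
//   executeSql -> sequentialThinking -> createMetrics   passes (valid successor)
//   executeSql -> executeSql -> sequentialThinking      passes (chains are allowed)
//   executeSql -> createMetrics                         returns 0 (invalid successor)
//   executeSql as the final tool call                   passes (no successor to check)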
//Makes sure the SQL field in the YML uses the block scalar format (|) so that it does not break
export const allFilesUseYmlBlockScalar = (args: {
  input: string;
  output: any;
}) => {
  const { output } = args;
  for (const message of output) {
    if (message.content && Array.isArray(message.content)) {
      for (const contentItem of message.content) {
        if (contentItem.type === 'tool-call' && contentItem.toolName === 'createMetricsFileTool') {
          if (contentItem.args && contentItem.args.files) {
            for (const file of contentItem.args.files) {
              const yml = file.yml_content;
              if (yml.includes('sql') && !yml.includes('sql: |')) {
                return 0;
              }
            }
          }
        }
      }
    }
  }
  return 1;
};
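// Illustration (hypothetical yml_content): the block scalar form passes, the inline form fails.
//   sql: |                                        <- 'sql: |' present, scores 1
//     SELECT customer_id, SUM(total)
//     FROM orders GROUP BY customer_id
//   sql: SELECT customer_id, SUM(total) ...       <- inline SQL, scores 0; raw colons and
//                                                    newlines can break the YAML parse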
//Makes sure that a metric is created successfully, primarily fails if there is a bug or if the model sends back a clarifying question
export const MetricCreatedSuccessfully = ({ output }: { output: any[] }) => {
  try {
    const hasSuccessfulCreation = output.some(
      (message) =>
        message.role === 'tool' &&
        Array.isArray(message.content) &&
        message.content.some(
          (toolResult) =>
            toolResult.toolName === 'createMetrics' &&
            toolResult.type === 'tool-result' &&
            toolResult.result &&
            Array.isArray(toolResult.result.files) &&
            toolResult.result.files.length > 0
        )
    );
    return hasSuccessfulCreation ? 1 : 0;
  } catch {
    return null;
  }
};
//Makes sure that the timeFrame field in the YML is a string, not a number so that it does not break
export const timeFrameIsString = ({ output }: { output: any }) => {
  try {
    // Parse the output, expecting an array of messages
    const messages = Array.isArray(output) ? output : JSON.parse(output);
    // Filter for createMetrics tool calls
    const createMetricsCalls = messages.filter(
      (msg: any) =>
        msg.role === 'assistant' &&
        Array.isArray(msg.content) &&
        msg.content.some((c: any) => c.type === 'tool-call' && c.toolName === 'createMetrics')
    );
    // If no createMetrics calls, pass by default
    if (createMetricsCalls.length === 0) {
      return 1;
    }
    // Check each createMetrics tool call
    for (const msg of createMetricsCalls) {
      const toolCall = msg.content.find(
        (c: any) => c.type === 'tool-call' && c.toolName === 'createMetrics'
      );
      const files = toolCall?.args?.files || [];
      for (const file of files) {
        const ymlContent = file.yml_content;
        if (!ymlContent) {
          return 0; // Fail if YML content is missing
        }
        // Extract timeFrame from YML content
        const timeFrameMatch = ymlContent.match(/timeFrame:\s*([^\n]+)/);
        if (timeFrameMatch && timeFrameMatch[1]) {
          const timeFrame = timeFrameMatch[1].trim();
          // Check if timeFrame is a number (invalid)
          if (!isNaN(Number(timeFrame))) {
            return 0; // Fail if timeFrame is a number
          }
        } else {
          return 0; // Fail if timeFrame is missing
        }
      }
    }
    // Pass if all timeFrame values are strings
    return 1;
  } catch (error) {
    console.error('Error in timeFrameIsString scorer:', error);
    return 0; // Fail on any parsing or unexpected errors
  }
};
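// Illustration (hypothetical timeFrame values against the regex above):
//   timeFrame: last 30 days   -> passes, Number('last 30 days') is NaN
//   timeFrame: 2024           -> returns 0, a bare year parses as a number
//   (no timeFrame line)       -> returns 0, the field is required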
//Makes sure that there is exactly one doneTool tool call. If there is more than one, it's wasting time/money; if there is none, the model broke somehow
export const exactlyOneDoneTool = ({ output }: { output: any[] }) => {
  try {
    const doneToolCount = output.filter(
      (message) =>
        message.role === 'assistant' &&
        Array.isArray(message.content) &&
        message.content.some(
          (toolCall) => toolCall.toolName === 'doneTool' && toolCall.type === 'tool-call'
        )
    ).length;
    return doneToolCount === 1 ? 1 : 0;
  } catch {
    return 0;
  }
};
//Makes sure that all metrics are created successfully. Even if a failed attempt is followed by a successful rebuild of the SQL, this scorer will fail
export const NoFailureToCreateMetrics = ({ output }: { output: any[] }) => {
  try {
    const hasUnsuccessfulCreation = output.some(
      (message) =>
        message.role === 'tool' &&
        Array.isArray(message.content) &&
        message.content.some(
          (toolResult) =>
            toolResult.toolName === 'createMetrics' &&
            toolResult.type === 'tool-result' &&
            toolResult.result &&
            Array.isArray(toolResult.result.failed_files) &&
            toolResult.result.failed_files.length > 0
        )
    );
    return hasUnsuccessfulCreation ? 0 : 1;
  } catch {
    return null;
  }
};
//Makes sure that when multiple metrics are created, a dashboard is created for them
export const dashboardCreatedForMultipleMetrics = ({ output }: { output: any }) => {
  try {
    const messages = Array.isArray(output) ? output : JSON.parse(output);
    // Check for createMetrics tool calls
    const createMetricsCalls = messages.filter(
      (msg: any) =>
        msg.role === 'assistant' &&
        Array.isArray(msg.content) &&
        msg.content.some((c: any) => c.type === 'tool-call' && c.toolName === 'createMetrics')
    );
    // If no createMetrics calls, return null
    if (createMetricsCalls.length === 0) {
      return null;
    }
    // Check if multiple metrics are created in any createMetrics call
    let hasMultipleMetrics = false;
    for (const msg of createMetricsCalls) {
      const toolCall = msg.content.find(
        (c: any) => c.type === 'tool-call' && c.toolName === 'createMetrics'
      );
      const files = toolCall?.args?.files || [];
      if (files.length > 1) {
        hasMultipleMetrics = true;
        break;
      }
    }
    // If no multiple metrics, return null
    if (!hasMultipleMetrics) {
      return null;
    }
    // Check for createDashboards tool call
    const createDashboardsCalls = messages.filter(
      (msg: any) =>
        msg.role === 'assistant' &&
        Array.isArray(msg.content) &&
        msg.content.some((c: any) => c.type === 'tool-call' && c.toolName === 'createDashboards')
    );
    // If multiple metrics exist and createDashboards call is found, return 1
    if (createDashboardsCalls.length > 0) {
      return 1;
    }
    // If multiple metrics exist but no createDashboards call, return 0
    return 0;
  } catch (error) {
    console.error('Error in dashboardCreatedForMultipleMetrics scorer:', error);
    return null;
  }
};

View File

@@ -1,76 +1,89 @@
import { RuntimeContext } from '@mastra/core/runtime-context';
import { Eval, initDataset } from 'braintrust';
import analystWorkflow, {
  type AnalystRuntimeContext,
} from '../../../../src/workflows/analyst-workflow';
import {
  MetricCreatedSuccessfully,
  NoFailureToCreateMetrics,
  acceptableAnswersScorer,
  allFilesUseYmlBlockScalar,
  checkUsesExecuteSQLToCreateMetrics,
  dashboardCreatedForMultipleMetrics,
  doneMessageMatchesSqlResults,
  exactlyOneDoneTool,
  executeSqlFollowedByValidTool,
  preferredAnswerScorer,
  timeFrameIsString,
  todoMarkdownBoxes,
  usesExpectedPrecomputedMetric,
} from './example_scorers';
const basicSuite = [
  executeSqlFollowedByValidTool,
  MetricCreatedSuccessfully,
  exactlyOneDoneTool,
  NoFailureToCreateMetrics,
  dashboardCreatedForMultipleMetrics,
];
const formatSuite = [todoMarkdownBoxes, allFilesUseYmlBlockScalar, timeFrameIsString];
const basicLLMSuite = [checkUsesExecuteSQLToCreateMetrics, doneMessageMatchesSqlResults];
const expectedSuite = [
  usesExpectedPrecomputedMetric,
  acceptableAnswersScorer,
  preferredAnswerScorer,
];
const getMetricCreation = async (input: string) => {
  const runtimeContext = new RuntimeContext<AnalystRuntimeContext>();
  runtimeContext.set('userId', 'c2dd64cd-f7f3-4884-bc91-d46ae431901e');
  runtimeContext.set('organizationId', 'bf58d19a-8bb9-4f1d-a257-2d2105e7f1ce');
  runtimeContext.set('dataSourceId', 'cc3ef3bc-44ec-4a43-8dc4-681cae5c996a');
  runtimeContext.set('dataSourceSyntax', 'postgresql');
  const run = analystWorkflow.createRun();
  const response = await run.start({
    inputData: { prompt: input },
    runtimeContext,
  });
  if (response.status === 'failed') {
    throw new Error(`Workflow failed: ${response.error}`);
  }
  const formatOutputStep = response.steps['format-output'];
  if (formatOutputStep.status === 'failed') {
    throw new Error(`Format output step failed: ${formatOutputStep.error}`);
  }
  return formatOutputStep.output.outputMessages || [];
};
//basic function that just returns the input, used for super basic testing
const returnOutput = async (input: string) => {
  return input;
};
//Example eval for testing evals with a pre-made output
Eval("Eval-Testing",
{
experimentName: 'check-answer-scorers',
data: initDataset({
Eval('Eval-Testing', {
experimentName: 'check-answer-scorers',
data: initDataset({
project: 'Eval-Testing',
dataset: 'premade-badmath-runs',
}),
task: (input, hooks) => {
}),
task: (input, hooks) => {
return hooks.metadata.output;
},
scores: []
},
);
},
scores: [],
});
Eval('development', {
  experimentName: 'bad-math-prompt-changes',
  data: initDataset({
    project: 'development',
    dataset: 'Does-Bad-Math',
  }),
  task: getMetricCreation,
  scores: [],
  maxConcurrency: 5,
});
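
Both Evals above run with scores: [], so the suite arrays defined at the top of this file are not exercised by this commit; presumably they get spread into scores when a run should actually be graded. A minimal sketch of that wiring, assuming the Braintrust Eval API used above (the experiment name here is a placeholder):

Eval('development', {
  experimentName: 'graded-run', // placeholder
  data: initDataset({ project: 'development', dataset: 'Does-Bad-Math' }),
  task: getMetricCreation,
  // Spread the rule-based and LLM-graded suites into the scorer list:
  scores: [...basicSuite, ...formatSuite, ...basicLLMSuite, ...expectedSuite],
  maxConcurrency: 5,
});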