// buster/packages/ai/evals/agents/analyst-agent/metrics/metric.eval.ts

import { RuntimeContext } from '@mastra/core/runtime-context';
import { Eval, initDataset } from 'braintrust';
import analystWorkflow, {
  type AnalystRuntimeContext,
} from '../../../../src/workflows/analyst-workflow';
import {
  MetricCreatedSuccessfully,
  NoFailureToCreateMetrics,
  acceptableAnswersScorer,
  allFilesUseYmlBlockScalar,
  checkUsesExecuteSQLToCreateMetrics,
  dashboardCreatedForMultipleMetrics,
  doneMessageMatchesSqlResults,
  exactlyOneDoneTool,
  executeSqlFollowedByValidTool,
  preferredAnswerScorer,
  timeFrameIsString,
  todoMarkdownBoxes,
  usesExpectedPrecomputedMetric,
} from './example_scorers';

// Scorer groupings imported from './example_scorers'.
// NOTE(review): both Eval calls in this file currently pass `scores: []`, so
// none of these suites is wired in here — presumably they are kept ready to be
// spread into `scores` when needed; confirm before removing.

// Suite named "basic".
const basicSuite = [
  executeSqlFollowedByValidTool,
  MetricCreatedSuccessfully,
  exactlyOneDoneTool,
  NoFailureToCreateMetrics,
  dashboardCreatedForMultipleMetrics,
];

// Suite named "format".
const formatSuite = [
  todoMarkdownBoxes,
  allFilesUseYmlBlockScalar,
  timeFrameIsString,
];

// Suite named "basicLLM".
const basicLLMSuite = [
  checkUsesExecuteSQLToCreateMetrics,
  doneMessageMatchesSqlResults,
];

// Suite named "expected".
const expectedSuite = [
  usesExpectedPrecomputedMetric,
  acceptableAnswersScorer,
  preferredAnswerScorer,
];

/**
 * Runs the analyst workflow for a single eval prompt and returns the messages
 * produced by its `format-output` step.
 *
 * The runtime context is populated with fixed user / organization /
 * data-source identifiers and the `postgresql` syntax before the run starts.
 *
 * @param input - Prompt forwarded to the workflow as `inputData.prompt`.
 * @returns The step's `outputMessages`, or an empty array when that field is
 *   nullish.
 * @throws Error when the workflow reports `failed`, or when the
 *   `format-output` step is missing, failed, or did not reach `success`.
 */
const getMetricCreation = async (input: string) => {
  const runtimeContext = new RuntimeContext<AnalystRuntimeContext>();
  runtimeContext.set('userId', 'c2dd64cd-f7f3-4884-bc91-d46ae431901e');
  runtimeContext.set('organizationId', 'bf58d19a-8bb9-4f1d-a257-2d2105e7f1ce');
  runtimeContext.set('dataSourceId', 'cc3ef3bc-44ec-4a43-8dc4-681cae5c996a');
  runtimeContext.set('dataSourceSyntax', 'postgresql');
  const run = analystWorkflow.createRun();

  const response = await run.start({
    inputData: { prompt: input },
    runtimeContext,
  });

  if (response.status === 'failed') {
    throw new Error(`Workflow failed: ${response.error}`);
  }

  const formatOutputStep = response.steps['format-output'];
  // A workflow that did not fail may still lack this step's result; report
  // that explicitly instead of crashing on an undefined property access.
  if (!formatOutputStep) {
    throw new Error(
      `Format output step missing from workflow result (workflow status: ${response.status})`
    );
  }

  // Captured before any narrowing so it can be reported in the error below.
  const stepStatus: string = formatOutputStep.status;

  if (formatOutputStep.status === 'failed') {
    throw new Error(`Format output step failed: ${formatOutputStep.error}`);
  }

  // Only a successful step is expected to carry `output`.
  if (formatOutputStep.status !== 'success') {
    throw new Error(`Format output step did not complete (status: ${stepStatus})`);
  }

  return formatOutputStep.output.outputMessages ?? [];
};

// Identity task used for the most basic eval testing: resolves with the
// input string unchanged.
const returnOutput = async (input: string): Promise<string> => input;

// Example eval for exercising scorers against pre-made outputs.
// The task does no work of its own: it hands back the `output` value found on
// `hooks.metadata`. No scorers are attached (`scores` is empty).
Eval('Eval-Testing', {
  experimentName: 'check-answer-scorers',
  data: initDataset({ project: 'Eval-Testing', dataset: 'premade-badmath-runs' }),
  task: (_input, hooks) => hooks.metadata.output,
  scores: [],
});

// Eval that feeds each row of the 'Does-Bad-Math' dataset (project
// 'development') through getMetricCreation. No scorers are attached
// (`scores` is empty); `maxConcurrency` is set to 5.
Eval('development', {
  experimentName: 'bad-math-prompt-changes',
  maxConcurrency: 5,
  scores: [],
  task: getMetricCreation,
  data: initDataset({
    dataset: 'Does-Bad-Math',
    project: 'development',
  }),
});
example evals and scorers so they can be shared 2025-07-19 01:24:24 +08:00			`import { RuntimeContext } from '@mastra/core/runtime-context';`
style fixes for eval files 2025-07-19 05:08:15 +08:00			`import { Eval, initDataset } from 'braintrust';`
			`import analystWorkflow, {`
			`type AnalystRuntimeContext,`
			`} from '../../../../src/workflows/analyst-workflow';`
			`import {`
			`MetricCreatedSuccessfully,`
			`NoFailureToCreateMetrics,`
			`acceptableAnswersScorer,`
			`allFilesUseYmlBlockScalar,`
			`checkUsesExecuteSQLToCreateMetrics,`
			`dashboardCreatedForMultipleMetrics,`
			`doneMessageMatchesSqlResults,`
			`exactlyOneDoneTool,`
			`executeSqlFollowedByValidTool,`
			`preferredAnswerScorer,`
			`timeFrameIsString,`
			`todoMarkdownBoxes,`
			`usesExpectedPrecomputedMetric,`
			`} from './example_scorers';`
example evals and scorers so they can be shared 2025-07-19 01:24:24 +08:00
style fixes for eval files 2025-07-19 05:08:15 +08:00			`const basicSuite = [`
			`executeSqlFollowedByValidTool,`
			`MetricCreatedSuccessfully,`
			`exactlyOneDoneTool,`
			`NoFailureToCreateMetrics,`
			`dashboardCreatedForMultipleMetrics,`
			`];`
			`const formatSuite = [todoMarkdownBoxes, allFilesUseYmlBlockScalar, timeFrameIsString];`
			`const basicLLMSuite = [checkUsesExecuteSQLToCreateMetrics, doneMessageMatchesSqlResults];`
			`const expectedSuite = [`
			`usesExpectedPrecomputedMetric,`
			`acceptableAnswersScorer,`
			`preferredAnswerScorer,`
			`];`
example evals and scorers so they can be shared 2025-07-19 01:24:24 +08:00
			`const getMetricCreation = async (input: string) => {`
style fixes for eval files 2025-07-19 05:08:15 +08:00			`const runtimeContext = new RuntimeContext<AnalystRuntimeContext>();`
			`runtimeContext.set('userId', 'c2dd64cd-f7f3-4884-bc91-d46ae431901e');`
			`runtimeContext.set('organizationId', 'bf58d19a-8bb9-4f1d-a257-2d2105e7f1ce');`
			`runtimeContext.set('dataSourceId', 'cc3ef3bc-44ec-4a43-8dc4-681cae5c996a');`
			`runtimeContext.set('dataSourceSyntax', 'postgresql');`
			`const run = analystWorkflow.createRun();`

			`const response = await run.start({`
			`inputData: { prompt: input },`
			`runtimeContext,`
			`});`

			`if (response.status === 'failed') {`
			throw new Error(`Workflow failed: ${response.error}`);
			`}`
example evals and scorers so they can be shared 2025-07-19 01:24:24 +08:00
style fixes for eval files 2025-07-19 05:08:15 +08:00			`const formatOutputStep = response.steps['format-output'];`
			`if (formatOutputStep.status === 'failed') {`
			throw new Error(`Format output step failed: ${formatOutputStep.error}`);
			`}`

			`return formatOutputStep.output.outputMessages \|\| [];`
			`};`
example evals and scorers so they can be shared 2025-07-19 01:24:24 +08:00
			`//basic function that just returns the input, used for super basic testing`
			`const returnOutput = async (input: string) => {`
style fixes for eval files 2025-07-19 05:08:15 +08:00			`return input;`
			`};`
example evals and scorers so they can be shared 2025-07-19 01:24:24 +08:00
			`//Example eval for testing evals with a pre-made output`
style fixes for eval files 2025-07-19 05:08:15 +08:00			`Eval('Eval-Testing', {`
			`experimentName: 'check-answer-scorers',`
			`data: initDataset({`
example evals and scorers so they can be shared 2025-07-19 01:24:24 +08:00			`project: 'Eval-Testing',`
			`dataset: 'premade-badmath-runs',`
style fixes for eval files 2025-07-19 05:08:15 +08:00			`}),`
			`task: (input, hooks) => {`
example evals and scorers so they can be shared 2025-07-19 01:24:24 +08:00			`return hooks.metadata.output;`
style fixes for eval files 2025-07-19 05:08:15 +08:00			`},`
			`scores: [],`
			`});`
example evals and scorers so they can be shared 2025-07-19 01:24:24 +08:00
			`Eval('development', {`
style fixes for eval files 2025-07-19 05:08:15 +08:00			`experimentName: 'bad-math-prompt-changes',`
			`data: initDataset({`
example evals and scorers so they can be shared 2025-07-19 01:24:24 +08:00			`project: 'development',`
			`dataset: 'Does-Bad-Math',`
style fixes for eval files 2025-07-19 05:08:15 +08:00			`}),`
			`task: getMetricCreation,`
			`scores: [],`
			`maxConcurrency: 5,`
			`});`