2025-07-19 01:24:24 +08:00
|
|
|
import { RuntimeContext } from '@mastra/core/runtime-context';
|
2025-07-19 05:08:15 +08:00
|
|
|
import { Eval, initDataset } from 'braintrust';
|
|
|
|
import analystWorkflow, {
|
|
|
|
type AnalystRuntimeContext,
|
|
|
|
} from '../../../../src/workflows/analyst-workflow';
|
|
|
|
import {
|
|
|
|
MetricCreatedSuccessfully,
|
|
|
|
NoFailureToCreateMetrics,
|
|
|
|
acceptableAnswersScorer,
|
|
|
|
allFilesUseYmlBlockScalar,
|
|
|
|
checkUsesExecuteSQLToCreateMetrics,
|
|
|
|
dashboardCreatedForMultipleMetrics,
|
|
|
|
doneMessageMatchesSqlResults,
|
|
|
|
exactlyOneDoneTool,
|
|
|
|
executeSqlFollowedByValidTool,
|
|
|
|
preferredAnswerScorer,
|
|
|
|
timeFrameIsString,
|
|
|
|
todoMarkdownBoxes,
|
|
|
|
usesExpectedPrecomputedMetric,
|
|
|
|
} from './example_scorers';
|
2025-07-19 01:24:24 +08:00
|
|
|
|
2025-07-19 05:08:15 +08:00
|
|
|
// Scorer group: tool-usage and metric-creation checks imported from './example_scorers'.
// NOTE(review): the Eval calls in this file pass `scores: []`, so this suite is not
// referenced by them — confirm whether it should be wired in.
const basicSuite = [
  executeSqlFollowedByValidTool,
  MetricCreatedSuccessfully,
  exactlyOneDoneTool,
  NoFailureToCreateMetrics,
  dashboardCreatedForMultipleMetrics,
];
|
|
|
|
// Scorer group: output-format checks imported from './example_scorers'.
const formatSuite = [
  todoMarkdownBoxes,
  allFilesUseYmlBlockScalar,
  timeFrameIsString,
];
|
|
|
|
// Scorer group: the two SQL-related scorers imported from './example_scorers'.
const basicLLMSuite = [
  checkUsesExecuteSQLToCreateMetrics,
  doneMessageMatchesSqlResults,
];
|
|
|
|
// Scorer group: expected-metric and answer scorers imported from './example_scorers'.
// NOTE(review): presumably these compare against the dataset's expected values — verify
// against the scorer implementations.
const expectedSuite = [
  usesExpectedPrecomputedMetric,
  acceptableAnswersScorer,
  preferredAnswerScorer,
];
|
2025-07-19 01:24:24 +08:00
|
|
|
|
|
|
|
/**
 * Runs the analyst workflow for a single prompt and returns the messages produced by
 * its `format-output` step.
 *
 * @param input - Prompt text passed to the workflow as `inputData.prompt`.
 * @returns The step's `outputMessages`, or an empty array when that value is falsy.
 * @throws Error when the workflow run reports `failed`, when the `format-output` step
 *   is absent from the run result, or when that step itself reports `failed`.
 */
const getMetricCreation = async (input: string) => {
  // Hard-coded context values: every invocation runs with the same user, organization
  // and data source.
  const runtimeContext = new RuntimeContext<AnalystRuntimeContext>();
  runtimeContext.set('userId', 'c2dd64cd-f7f3-4884-bc91-d46ae431901e');
  runtimeContext.set('organizationId', 'bf58d19a-8bb9-4f1d-a257-2d2105e7f1ce');
  runtimeContext.set('dataSourceId', 'cc3ef3bc-44ec-4a43-8dc4-681cae5c996a');
  runtimeContext.set('dataSourceSyntax', 'postgresql');

  const run = analystWorkflow.createRun();

  const response = await run.start({
    inputData: { prompt: input },
    runtimeContext,
  });

  if (response.status === 'failed') {
    throw new Error(`Workflow failed: ${response.error}`);
  }

  const formatOutputStep = response.steps['format-output'];

  // A run that did not fail may still lack this step; fail with a clear message
  // instead of a TypeError on the property access below.
  if (!formatOutputStep) {
    throw new Error(
      `Format output step missing from workflow result (workflow status: ${response.status})`
    );
  }

  if (formatOutputStep.status === 'failed') {
    throw new Error(`Format output step failed: ${formatOutputStep.error}`);
  }

  return formatOutputStep.output.outputMessages || [];
};
|
2025-07-19 01:24:24 +08:00
|
|
|
|
|
|
|
/**
 * Basic function that just returns the input, used for super basic testing.
 *
 * @param input - Any string.
 * @returns A promise resolving to `input`, unchanged.
 */
const returnOutput = async (input: string): Promise<string> => {
  return input;
};
|
2025-07-19 01:24:24 +08:00
|
|
|
|
|
|
|
// Example eval for testing evals with a pre-made output: the task does not invoke the
// workflow; it returns `hooks.metadata.output` as the task result.
// NOTE(review): `scores` is empty, so no scorer is applied to this experiment —
// confirm that is intended given the experiment name.
Eval('Eval-Testing', {
  experimentName: 'check-answer-scorers',
  data: initDataset({
    project: 'Eval-Testing',
    dataset: 'premade-badmath-runs',
  }),
  // The input is deliberately ignored; only the metadata attached to the run is used.
  task: (_input, hooks) => {
    return hooks.metadata.output;
  },
  scores: [],
});
|
2025-07-19 01:24:24 +08:00
|
|
|
|
|
|
|
// Eval that runs `getMetricCreation` as the task over the 'Does-Bad-Math' dataset of
// the 'development' project.
// NOTE(review): `scores` is empty, so no scorer is applied to this experiment —
// confirm that is intended.
Eval('development', {
  experimentName: 'bad-math-prompt-changes',
  data: initDataset({
    project: 'development',
    dataset: 'Does-Bad-Math',
  }),
  task: getMetricCreation,
  scores: [],
  maxConcurrency: 5,
});
|