Merge remote-tracking branch 'origin/staging' into dallin/bus-1469-getting-errors-for-flagging-assumptions-in-production

This commit is contained in:
dal 2025-07-21 12:41:14 -06:00
commit 3991eccf5c
No known key found for this signature in database
GPG Key ID: 16F4B0E1E9F61122
8 changed files with 475 additions and 14 deletions

View File

@@ -63,6 +63,7 @@ export const combineParallelResultsOutputSchema = z.object({
        'joinSelection',
        'metricAmbiguity',
        'dataStaticAssumption',
        'uniqueIdentifier',
      ])
      .describe('The type/category of assumption made'),
    explanation: z
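
For orientation, a minimal sketch of what the extended enum accepts after this change; the reduced schema below is hypothetical, not the actual production shape:

import { z } from 'zod';

// Hypothetical reduced schema; only the enum values mirror the diff above.
const assumptionSketchSchema = z.object({
  classification: z.enum([
    'joinSelection',
    'metricAmbiguity',
    'dataStaticAssumption',
    'uniqueIdentifier', // the newly added category
  ]),
  explanation: z.string(),
});

// 'uniqueIdentifier' now parses where it previously failed enum validation.
assumptionSketchSchema.parse({
  classification: 'uniqueIdentifier',
  explanation: 'Assumed customer names uniquely identify customers.',
});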

View File

@@ -72,6 +72,7 @@ export const identifyAssumptionsOutputSchema = z.object({
        'joinSelection',
        'metricAmbiguity',
        'dataStaticAssumption',
        'uniqueIdentifier',
      ])
      .describe('The type/category of assumption made'),
    explanation: z
@@ -186,7 +187,7 @@ When identifying assumptions, use the following classification types to categori
12. **metricDefinition**: Assumptions about how a metric is defined or calculated, due to missing documentation.
    - *Example*: Assuming \`FIRST_CLOSED_WON_DEAL_AMOUNT\` is the total deal value.
    - *Available labels*: major, minor
    - *Label decision guidelines*: If the metric is undocumented, defining it introduces a new metric and is "major." If partial documentation exists and the assumption is a standard tweak (e.g., summing a documented total), it's "minor."
    - *Label decision guidelines*: If the metric is undocumented, defining it introduces a new metric and is "major." If you are using a documented precomputed metric that is clearly connected to the user request, it's "minor." If partial documentation exists and the assumption is a standard tweak (e.g., summing a documented total), it's "minor."
13. **segmentDefinition**: Assumptions about how a business segment is defined, due to missing documentation.
    - *Example*: Assuming all \`TEAMS\` entries are Redo customers.
@@ -212,7 +213,7 @@ When identifying assumptions, use the following classification types to categori
17. **aggregation**: Assumptions about how to aggregate data (e.g., sum, average). Every time the SQL query uses aggregation, it is an assumption.
    - *Example*: Assuming revenue is summed, not averaged.
    - *Available labels*: major, minor
    - *Label decision guidelines*: If the aggregation is undocumented and introduces a new calculation or if the aggregation selection is not stated in the response message, it's "major." If it's based on a documented or standard method, it's "minor."
    - *Label decision guidelines*: If the aggregation is undocumented and introduces a new calculation or if the aggregation selection is not stated in the response message, it's "major." It's "minor" only if it is the only obvious aggregation method, it is a documented preference, or it will produce the same result as other aggregation methods.
18. **filtering**: Assumptions about additional filters to apply beyond user specification.
    - *Example*: Assuming to exclude inactive records.
@@ -259,6 +260,11 @@ When identifying assumptions, use the following classification types to categori
    - *Example*: Assuming departmental budgets remain constant year over year without considering potential changes due to economic conditions or strategic shifts.
    - *Available labels*: major, minor
    - *Label decision guidelines*: If the assumption of static data could significantly impact the analysis or decision-making process, it's "major." If the assumption is based on standard practices or if the impact of the assumption is minimal, it's "minor."
27. **uniqueIdentifier**: Assumptions about uniqueness of an identifier.
    - *Example*: Assuming that someone can be identified by their name.
    - *Available labels*: major, minor
    - *Label decision guidelines*: If the assumption of uniqueness could significantly impact the analysis or decision-making process, or cause different entities to be grouped together incorrectly, it's "major." If the assumption is based on standard practices or if the impact of the assumption is minimal, it's "minor."
</classification_types>
<identification_guidelines>
@@ -284,6 +290,14 @@ When identifying assumptions, use the following classification types to categori
- For vagueness of user request:
  - Identify terms with multiple meanings; classify assumptions about their interpretation under "metricInterpretation," "segmentInterpretation," etc.
  - Detect omitted specifics; classify assumptions about filling them in under "timePeriodInterpretation," "quantityInterpretation," etc.
- For uniqueIdentifier assumptions:
  - If the identifier is the ID of a table, it is not a \`uniqueIdentifier\` assumption.
  - If the identifier is an ID from a different table, it is a \`uniqueIdentifier\` assumption.
  - If the identifier is not an ID (e.g., a name), it is a \`uniqueIdentifier\` assumption.
  - If the identifier is being used to purposely group distinct entities together (e.g., grouping customers into premium and non-premium groups), it is not a \`uniqueIdentifier\` assumption.
- For filtering and segmentDefinition assumptions:
  - If the filter or segment definition is not documented, it should be flagged as a major assumption, even if it is validated using the executeSQL tool, unless it is a standard filter or segment definition.
  - Data exploration cannot prove that you capture all the data you need; it can only show that you are not capturing data you want to avoid.
</identification_guidelines>
<scoring_framework>
@@ -320,6 +334,27 @@ For assumptions where the classification type is not pre-assigned to \`timeRelat
- Whenever there are multiple possible ways to aggregate something, it is a \`metricAmbiguity\` assumption.
- Whenever your analysis requires a numeric value to be static, a \`dataStaticAssumption\` was made.
- Whenever filters are used, a \`filtering\` or \`segmentDefinition\` assumption was made.
- Data is only considered documented if it is explicitly stated in the user input message or if it is stated in the \`dataset_context\`.
- When using precomputed metrics:
  - If the metric is not documented, it is a \`metricDefinition\` assumption.
  - If the metric is documented but not obviously connected to the user request, it is a \`metricDefinition\` assumption.
  - If the metric is documented and obviously connected to the user request (a total_shipped metric is clearly connected to the user request of "number of orders shipped"), it is only a minor \`metricDefinition\` assumption.
- When interpreting a user request:
  - Basic, clearly defined interpretations of a user request are not assumptions as long as they are explained in the response message.
    - Example: Assuming former employees are employees that are not active.
    - Example: Assuming profit represents revenue minus cost.
  - Basic definitions built from clearly defined interpretations of a user request are not assumptions as long as the definition is explained in the response message.
    - Example: Assuming former employees are defined as employees where \`is_active\` is \`false\`.
    - Example: Assuming "profit" is computed as the sum of \`revenue - cost\`.
  - An interpretation that is not immediately obvious is an assumption.
    - Example: Assuming "most popular coffee" means the coffee with the most orders instead of the coffee with the most oz sold is a \`metricAmbiguity\` or \`aggregation\` assumption.
    - Example: Assuming "churned customers" means customers who have not made a purchase in the last 6 months is a \`segmentDefinition\` assumption.
    - Example: Assuming you can filter for clothes by using \`WHERE material IN ('cotton', 'wool')\` is a major \`filtering\` or \`segmentDefinition\` assumption.
  - If the interpretation is critical to the analysis, it is a major assumption. If it is not critical, it is a minor assumption.
- When looking at numeric columns:
  - Validate whether you are making a \`valueScale\` assumption.
  - Validate whether you are making a \`dataStaticAssumption\` assumption.
- When there are multiple relationships/entities that you can join on, validate whether you are making a \`joinSelection\` assumption.
</evaluation_guidelines>
<output_format>
@@ -334,18 +369,6 @@ For assumptions where the classification type is not pre-assigned to \`timeRelat
- Use the \`noAssumptionsIdentified\` tool to indicate that no assumptions were made.
</output_format>
<output_format>
- Identified assumptions:
- Use the \`listAssumptionsResponse\` tool to list all assumptions found.
- Each assumption should include:
- **descriptive_title**: Clear title summarizing the assumption.
- **classification**: The classification type from the list (e.g., "fieldMapping").
- **label**: The assigned label (\`timeRelated\`, \`vagueRequest\`, \`major\`, or \`minor\`).
- **explanation**: Detailed explanation of the assumption, including query context, documentation gaps, potential issues, and contributing factors. Ensure that all references to database tables, fields, and calculations are enclosed in backticks for clarity (e.g., \`sales.revenue\` or \`(# of orders delivered on or before due date) / (Total number of orders) * 100\`). For assumptions with label \`major\` or \`minor\`, include the reasoning for the significance assessment. For \`timeRelated\` or \`vagueRequest\`, explain why the assumption fits that category.
- No assumptions identified:
- Use the \`noAssumptionsIdentified\` tool to indicate that no assumptions were made.
</output_format>
<dataset_context_guidelines>
- Proper joins can be identified as either relationships or entities in the dataset context.
</dataset_context_guidelines>
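
To make the new \`uniqueIdentifier\` category concrete, here is a hedged sketch of an assumption entry an agent might emit when a query groups by a non-unique column; the object shape is inferred from the schema diffs and output format above, not taken from the codebase:

// Hypothetical entry; field names follow the output format described above.
const exampleAssumption = {
  descriptive_title: 'Customer name treated as a unique identifier',
  classification: 'uniqueIdentifier',
  label: 'major',
  explanation:
    'The query groups revenue by `customers.name`. Names are not guaranteed to be unique, ' +
    'so distinct customers who share a name would be merged into a single row.',
};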

View File

@@ -74,6 +74,7 @@ export const postProcessingWorkflowOutputSchema = z.object({
        'joinSelection',
        'metricAmbiguity',
        'dataStaticAssumption',
        'uniqueIdentifier',
      ])
      .describe('The type/category of assumption made'),
    explanation: z

View File

@@ -0,0 +1,94 @@
import * as fs from 'node:fs/promises';
import * as path from 'node:path';

export interface FileDeleteResult {
  success: boolean;
  filePath: string;
  error?: string;
}

export interface FileDeleteParams {
  path: string;
}

async function deleteSingleFile(fileParams: FileDeleteParams): Promise<FileDeleteResult> {
  try {
    const { path: filePath } = fileParams;
    const resolvedPath = path.isAbsolute(filePath) ? filePath : path.join(process.cwd(), filePath);

    try {
      await fs.access(resolvedPath);
    } catch {
      return {
        success: false,
        filePath,
        error: 'File not found',
      };
    }

    await fs.unlink(resolvedPath);

    return {
      success: true,
      filePath,
    };
  } catch (error) {
    return {
      success: false,
      filePath: fileParams.path,
      error: error instanceof Error ? error.message : 'Unknown error occurred',
    };
  }
}

export async function deleteFilesSafely(
  fileParams: FileDeleteParams[]
): Promise<FileDeleteResult[]> {
  const fileDeletePromises = fileParams.map((params) => deleteSingleFile(params));
  return Promise.all(fileDeletePromises);
}

export function generateFileDeleteCode(fileParams: FileDeleteParams[]): string {
  return `
const fs = require('fs');
const path = require('path');

function deleteSingleFile(fileParams) {
  try {
    const { path: filePath } = fileParams;
    const resolvedPath = path.isAbsolute(filePath) ? filePath : path.join(process.cwd(), filePath);

    try {
      fs.accessSync(resolvedPath);
    } catch {
      return {
        success: false,
        filePath,
        error: 'File not found',
      };
    }

    fs.unlinkSync(resolvedPath);

    return {
      success: true,
      filePath,
    };
  } catch (error) {
    return {
      success: false,
      filePath: fileParams.path,
      error: error instanceof Error ? error.message : 'Unknown error occurred',
    };
  }
}

function deleteFilesConcurrently(fileParams) {
  return fileParams.map((params) => deleteSingleFile(params));
}

const fileParams = ${JSON.stringify(fileParams)};
const results = deleteFilesConcurrently(fileParams);
console.log(JSON.stringify(results));
`.trim();
}
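
A brief usage sketch of the local helper above, assuming an ESM context where top-level await is available; the paths are hypothetical:

import { deleteFilesSafely } from './delete-files-functions';

// Each result reports per-file success or failure; one bad path does not abort the batch.
const results = await deleteFilesSafely([
  { path: '/tmp/report-draft.txt' },
  { path: 'notes/old-notes.md' },
]);

for (const result of results) {
  console.info(
    result.success ? `deleted ${result.filePath}` : `failed ${result.filePath}: ${result.error}`
  );
}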

View File

@@ -0,0 +1,210 @@
import { RuntimeContext } from '@mastra/core/runtime-context';
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { z } from 'zod';
import { type SandboxContext, SandboxContextKey } from '../../../context/sandbox-context';
import { deleteFiles } from './delete-files-tool';

vi.mock('@buster/sandbox', () => ({
  runTypescript: vi.fn(),
}));

vi.mock('./delete-files-functions', () => ({
  generateFileDeleteCode: vi.fn(),
  deleteFilesSafely: vi.fn(),
}));

import { runTypescript } from '@buster/sandbox';
import { deleteFilesSafely, generateFileDeleteCode } from './delete-files-functions';

const mockRunTypescript = vi.mocked(runTypescript);
const mockGenerateFileDeleteCode = vi.mocked(generateFileDeleteCode);
const mockDeleteFilesSafely = vi.mocked(deleteFilesSafely);

describe('delete-files-tool', () => {
  let runtimeContext: RuntimeContext<SandboxContext>;

  beforeEach(() => {
    vi.clearAllMocks();
    runtimeContext = new RuntimeContext<SandboxContext>();
  });

  afterEach(() => {
    vi.restoreAllMocks();
  });

  describe('deleteFiles tool', () => {
    it('should have correct tool configuration', () => {
      expect(deleteFiles.id).toBe('delete_files');
      expect(deleteFiles.description).toContain('Deletes files at the specified paths');
      expect(deleteFiles.inputSchema).toBeDefined();
      expect(deleteFiles.outputSchema).toBeDefined();
    });

    it('should validate input schema correctly', () => {
      const validInput = {
        files: [
          { path: '/test/file1.txt' },
          { path: '/test/file2.txt' },
        ],
      };

      expect(() => deleteFiles.inputSchema.parse(validInput)).not.toThrow();
    });

    it('should reject invalid input schema', () => {
      const invalidInput = {
        files: [
          {},
        ],
      };

      expect(() => deleteFiles.inputSchema.parse(invalidInput)).toThrow();
    });

    it('should execute with sandbox when available', async () => {
      const mockSandbox = { process: { codeRun: vi.fn() } };
      runtimeContext.set(SandboxContextKey.Sandbox, mockSandbox as any);

      const input = {
        files: [{ path: '/test/file.txt' }],
      };
      const mockCode = 'generated typescript code';
      const mockSandboxResult = {
        result: JSON.stringify([{ success: true, filePath: '/test/file.txt' }]),
        exitCode: 0,
        stderr: '',
      };

      mockGenerateFileDeleteCode.mockReturnValue(mockCode);
      mockRunTypescript.mockResolvedValue(mockSandboxResult);

      const result = await deleteFiles.execute({
        context: input,
        runtimeContext,
      });

      expect(mockGenerateFileDeleteCode).toHaveBeenCalledWith(input.files);
      expect(mockRunTypescript).toHaveBeenCalledWith(mockSandbox, mockCode);
      expect(result.successes).toEqual(['/test/file.txt']);
      expect(result.failures).toEqual([]);
    });

    it('should fallback to local execution when sandbox not available', async () => {
      const input = {
        files: [{ path: '/test/file.txt' }],
      };
      const mockLocalResult = [{ success: true, filePath: '/test/file.txt' }];

      mockDeleteFilesSafely.mockResolvedValue(mockLocalResult);

      const result = await deleteFiles.execute({
        context: input,
        runtimeContext,
      });

      expect(mockDeleteFilesSafely).toHaveBeenCalledWith(input.files);
      expect(result.successes).toEqual(['/test/file.txt']);
      expect(result.failures).toEqual([]);
    });

    it('should handle sandbox execution errors', async () => {
      const mockSandbox = { process: { codeRun: vi.fn() } };
      runtimeContext.set(SandboxContextKey.Sandbox, mockSandbox as any);

      const input = {
        files: [{ path: '/test/file.txt' }],
      };
      const mockCode = 'generated typescript code';
      const mockSandboxResult = {
        result: 'error output',
        exitCode: 1,
        stderr: 'Execution failed',
      };

      mockGenerateFileDeleteCode.mockReturnValue(mockCode);
      mockRunTypescript.mockResolvedValue(mockSandboxResult);

      const result = await deleteFiles.execute({
        context: input,
        runtimeContext,
      });

      expect(result.successes).toEqual([]);
      expect(result.failures).toHaveLength(1);
      expect(result.failures[0]).toEqual({
        path: '/test/file.txt',
        error: 'Execution error: Sandbox execution failed: Execution failed',
      });
    });

    it('should handle mixed success and error results', async () => {
      const input = {
        files: [
          { path: '/test/file1.txt' },
          { path: '/test/file2.txt' },
        ],
      };
      const mockLocalResult = [
        { success: true, filePath: '/test/file1.txt' },
        { success: false, filePath: '/test/file2.txt', error: 'Permission denied' },
      ];

      mockDeleteFilesSafely.mockResolvedValue(mockLocalResult);

      const result = await deleteFiles.execute({
        context: input,
        runtimeContext,
      });

      expect(result.successes).toEqual(['/test/file1.txt']);
      expect(result.failures).toEqual([{
        path: '/test/file2.txt',
        error: 'Permission denied',
      }]);
    });

    it('should handle empty files array', async () => {
      const input = { files: [] };

      const result = await deleteFiles.execute({
        context: input,
        runtimeContext,
      });

      expect(result.successes).toEqual([]);
      expect(result.failures).toEqual([]);
    });

    it('should handle JSON parse errors from sandbox', async () => {
      const mockSandbox = { process: { codeRun: vi.fn() } };
      runtimeContext.set(SandboxContextKey.Sandbox, mockSandbox as any);

      const input = {
        files: [{ path: '/test/file.txt' }],
      };
      const mockCode = 'generated typescript code';
      const mockSandboxResult = {
        result: 'invalid json output',
        exitCode: 0,
        stderr: '',
      };

      mockGenerateFileDeleteCode.mockReturnValue(mockCode);
      mockRunTypescript.mockResolvedValue(mockSandboxResult);

      const result = await deleteFiles.execute({
        context: input,
        runtimeContext,
      });

      expect(result.successes).toEqual([]);
      expect(result.failures).toHaveLength(1);
      expect(result.failures[0]?.error).toContain('Failed to parse sandbox output');
    });
  });
});

View File

@@ -0,0 +1,130 @@
import { runTypescript } from '@buster/sandbox';
import type { RuntimeContext } from '@mastra/core/runtime-context';
import { createTool } from '@mastra/core/tools';
import { wrapTraced } from 'braintrust';
import { z } from 'zod';
import { type SandboxContext, SandboxContextKey } from '../../../context/sandbox-context';

const deleteFilesInputSchema = z.object({
  files: z
    .array(
      z.object({
        path: z.string().describe('File path to delete (absolute or relative)'),
      })
    )
    .describe('Array of file deletion operations to perform'),
});

const deleteFilesOutputSchema = z.object({
  successes: z.array(z.string()),
  failures: z.array(
    z.object({
      path: z.string(),
      error: z.string(),
    })
  ),
});

const deleteFilesExecution = wrapTraced(
  async (
    params: z.infer<typeof deleteFilesInputSchema>,
    runtimeContext: RuntimeContext<SandboxContext>
  ): Promise<z.infer<typeof deleteFilesOutputSchema>> => {
    const { files } = params;

    if (!files || files.length === 0) {
      return { successes: [], failures: [] };
    }

    try {
      const sandbox = runtimeContext.get(SandboxContextKey.Sandbox);

      if (sandbox) {
        const { generateFileDeleteCode } = await import('./delete-files-functions');
        const code = generateFileDeleteCode(files);
        const result = await runTypescript(sandbox, code);

        if (result.exitCode !== 0) {
          console.error('Sandbox execution failed. Exit code:', result.exitCode);
          console.error('Stderr:', result.stderr);
          console.error('Stdout:', result.result);
          throw new Error(`Sandbox execution failed: ${result.stderr || 'Unknown error'}`);
        }

        let fileResults: Array<{
          success: boolean;
          filePath: string;
          error?: string;
        }>;

        try {
          fileResults = JSON.parse(result.result.trim());
        } catch (parseError) {
          console.error('Failed to parse sandbox output:', result.result);
          throw new Error(
            `Failed to parse sandbox output: ${parseError instanceof Error ? parseError.message : 'Unknown parse error'}`
          );
        }

        const successes: string[] = [];
        const failures: Array<{ path: string; error: string }> = [];

        for (const fileResult of fileResults) {
          if (fileResult.success) {
            successes.push(fileResult.filePath);
          } else {
            failures.push({
              path: fileResult.filePath,
              error: fileResult.error || 'Unknown error',
            });
          }
        }

        return { successes, failures };
      }

      const { deleteFilesSafely } = await import('./delete-files-functions');
      const fileResults = await deleteFilesSafely(files);

      const successes: string[] = [];
      const failures: Array<{ path: string; error: string }> = [];

      for (const fileResult of fileResults) {
        if (fileResult.success) {
          successes.push(fileResult.filePath);
        } else {
          failures.push({
            path: fileResult.filePath,
            error: fileResult.error || 'Unknown error',
          });
        }
      }

      return { successes, failures };
    } catch (error) {
      return {
        successes: [],
        failures: files.map((file) => ({
          path: file.path,
          error: `Execution error: ${error instanceof Error ? error.message : 'Unknown error'}`,
        })),
      };
    }
  },
  { name: 'delete-files' }
);

export const deleteFiles = createTool({
  id: 'delete_files',
  description: `Deletes files at the specified paths. Supports both absolute and relative file paths. Handles errors gracefully by continuing to process other files even if some fail. Returns both successful deletions and failed operations with detailed error messages. Does not fail the entire operation when individual file deletions fail.`,
  inputSchema: deleteFilesInputSchema,
  outputSchema: deleteFilesOutputSchema,
  execute: async ({
    context,
    runtimeContext,
  }: {
    context: z.infer<typeof deleteFilesInputSchema>;
    runtimeContext: RuntimeContext<SandboxContext>;
  }) => {
    return await deleteFilesExecution(context, runtimeContext);
  },
});

export default deleteFiles;
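
A minimal invocation sketch mirroring the test setup above, assuming no sandbox is registered on the runtime context so the tool falls back to local deletion; the path is hypothetical:

import { RuntimeContext } from '@mastra/core/runtime-context';
import type { SandboxContext } from '../../../context/sandbox-context';
import { deleteFiles } from './delete-files-tool';

const runtimeContext = new RuntimeContext<SandboxContext>();

// No sandbox set, so execution routes through deleteFilesSafely locally.
const result = await deleteFiles.execute({
  context: { files: [{ path: '/tmp/scratch.json' }] },
  runtimeContext,
});

console.info(result.successes, result.failures);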

View File

@@ -12,3 +12,4 @@ export { createTodoList } from './planning-thinking-tools/create-todo-item-tool'
export { editFiles } from './file-tools/edit-files-tool/edit-files-tool';
export { readFiles } from './file-tools/read-files-tool/read-files-tool';
export { createFiles } from './file-tools/create-files-tool/create-file-tool';
export { deleteFiles } from './file-tools/delete-files-tool/delete-files-tool';

View File

@@ -39,6 +39,7 @@ export const assumptionItemSchema = z
      'joinSelection',
      'metricAmbiguity',
      'dataStaticAssumption',
      'uniqueIdentifier',
    ])
    .describe('The classification type of the assumption'),
  explanation: z
explanation: z explanation: z