// Mirror of https://github.com/buster-so/buster.git (213 lines, 6.9 KiB, TypeScript)
import { closePool, getDb, initializePool, sql } from '@buster/database';
|
|
import { afterAll, beforeAll, describe, expect, it } from 'vitest';
|
|
import {
|
|
type SearchTarget,
|
|
generateEmbedding,
|
|
searchValuesAcrossTargets,
|
|
searchValuesByEmbedding,
|
|
searchValuesByEmbeddingWithFilters,
|
|
} from '../src/search';
|
|
|
|
// Integration tests that run against data which already exists in the database.
// The suite needs both environment variables below; when either one is
// missing or empty the whole suite is registered as skipped.
const DATABASE_URL = process.env.DATABASE_URL;
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;

const hasRequiredEnv = Boolean(DATABASE_URL && OPENAI_API_KEY);
const describeFn = hasRequiredEnv ? describe : describe.skip;
|
|
|
|
describeFn('search.ts - Focused Integration Tests with Real Data', () => {
|
|
// Identifier of the already-existing data source that every test queries.
const testDataSourceId = 'cc3ef3bc-44ec-4a43-8dc4-681cae5c996a';
// Schema name: "ds_" followed by the id with each dash turned into an underscore.
const testSchemaName = `ds_${testDataSourceId.split('-').join('_')}`;
|
|
|
|
beforeAll(async () => {
  // Bring up the database pool before any test runs.
  initializePool();

  // Aggregate query over the data source's searchable values table. The rows
  // it returns are not used; setup only waits for the query to complete.
  const statsQuery = `
    SELECT
      COUNT(*) as total_rows,
      COUNT(DISTINCT database_name) as databases,
      COUNT(DISTINCT table_name) as tables,
      COUNT(DISTINCT column_name) as columns,
      COUNT(embedding) as embeddings
    FROM "${testSchemaName}"."searchable_column_values"
  `;
  await getDb().execute(sql.raw(statsQuery));
}, 30000);
|
|
|
|
// Close the database pool once the whole suite has finished.
afterAll(async function shutDownPool() {
  await closePool();
});
|
|
|
|
describe('searchValuesByEmbedding with real data', () => {
  it('should find file extensions when searching for document types', async () => {
    // Embed a few document-related terms and use the vector as the query.
    const searchEmbedding = await generateEmbedding(['file', 'document', 'extension', 'format']);

    const results = await searchValuesByEmbedding(testDataSourceId, searchEmbedding, {
      limit: 5,
    });

    // NOTE(review): only a non-empty result set is asserted. Whether the hits
    // really are file extensions (value starting with "." or a column named
    // like "extension") is not checked — confirm against the existing data
    // before tightening this test.
    expect(results.length).toBeGreaterThan(0);
  }, 30000);
});
|
|
|
|
describe('searchValuesByEmbeddingWithFilters with real data', () => {
  // Each test awaits embedding generation and then a search, so an explicit
  // 30s timeout is passed (the same value already used elsewhere in this file)
  // instead of relying on the runner's default.
  it('should filter results by table name', async () => {
    const searchEmbedding = await generateEmbedding(['territory', 'region', 'area']);

    // Only the sixth argument is set; the assertions below show it restricts
    // results by table_name.
    const results = await searchValuesByEmbeddingWithFilters(
      testDataSourceId,
      searchEmbedding,
      { limit: 10 },
      undefined,
      undefined,
      'sales_territory',
      undefined
    );

    // Check non-emptiness first so the per-row check cannot pass vacuously.
    expect(results.length).toBeGreaterThan(0);
    for (const row of results) {
      expect(row.table_name).toBe('sales_territory');
    }
  }, 30000);

  it('should filter by database and schema', async () => {
    const searchEmbedding = await generateEmbedding(['data', 'value']);

    // Fourth and fifth arguments carry the database and schema filters, as
    // the assertions on database_name / schema_name below rely on.
    const results = await searchValuesByEmbeddingWithFilters(
      testDataSourceId,
      searchEmbedding,
      { limit: 5 },
      'postgres',
      'ont_ont',
      undefined,
      undefined
    );

    // Without this, an empty result would satisfy the per-row checks trivially.
    // NOTE(review): assumes postgres.ont_ont holds searchable values, as the
    // other tests in this file do — confirm against the existing data.
    expect(results.length).toBeGreaterThan(0);
    for (const row of results) {
      expect(row.database_name).toBe('postgres');
      expect(row.schema_name).toBe('ont_ont');
    }
  }, 30000);
});
|
|
|
|
describe('searchValuesAcrossTargets with real data', () => {
  it('should search multiple targets in parallel', async () => {
    // Targets chosen from columns present in the existing data.
    const targets: SearchTarget[] = [
      {
        database_name: 'postgres',
        schema_name: 'ont_ont',
        table_name: 'sales_territory',
        column_name: 'name',
      },
      {
        database_name: 'postgres',
        schema_name: 'ont_ont',
        table_name: 'document',
        column_name: 'fileextension',
      },
    ];
    // Maximum number of hits requested for each target.
    const limitPerTarget = 3;

    const searchEmbedding = await generateEmbedding(['name', 'identifier']);
    const results = await searchValuesAcrossTargets(
      testDataSourceId,
      searchEmbedding,
      targets,
      limitPerTarget
    );

    // At least one hit overall, and never more than the per-target cap allows.
    expect(results.length).toBeGreaterThan(0);
    expect(results.length).toBeLessThanOrEqual(targets.length * limitPerTarget);

    // Every hit has to come from one of the targeted tables.
    // NOTE(review): this does not require hits from *both* targets; confirm
    // against the existing data before requiring that.
    const targetTables = [...new Set(targets.map((target) => target.table_name))];
    for (const row of results) {
      expect(targetTables).toContain(row.table_name);
    }
  }, 30000);
});
|
|
|
|
describe('Performance with real data', () => {
  it('should handle large result sets efficiently', async () => {
    // Time budget for embedding generation plus the search, in milliseconds.
    const maxDurationMs = 5000;
    const resultLimit = 100;

    const startTime = Date.now();
    const searchEmbedding = await generateEmbedding(['value']);
    const results = await searchValuesByEmbedding(testDataSourceId, searchEmbedding, {
      limit: resultLimit,
    });
    const elapsedMs = Date.now() - startTime;

    expect(results.length).toBeLessThanOrEqual(resultLimit);
    expect(elapsedMs).toBeLessThan(maxDurationMs);
    // The test timeout is deliberately larger than the budget above so that a
    // slow run fails on the timing assertion rather than on a runner timeout.
  }, 30000);
});
|
|
|
|
describe('Edge cases with real data', () => {
  // Each test awaits embedding generation and then a search, so an explicit
  // 30s timeout is passed (the same value already used elsewhere in this file)
  // instead of relying on the runner's default.
  it('should handle searches that return no results', async () => {
    const searchEmbedding = await generateEmbedding(['xyzabc123notfound']);

    // Filtering on a database name that is not expected to exist should
    // produce an empty array rather than an error.
    const results = await searchValuesByEmbeddingWithFilters(
      testDataSourceId,
      searchEmbedding,
      { limit: 10 },
      'nonexistent_db',
      undefined,
      undefined,
      undefined
    );

    expect(results).toEqual([]);
  }, 30000);

  it('should show similarity differences in results', async () => {
    const searchEmbedding = await generateEmbedding(['United States regions']);
    const results = await searchValuesByEmbedding(testDataSourceId, searchEmbedding, {
      limit: 10,
    });

    // NOTE(review): only a non-empty result set is asserted. Neither the
    // ordering by similarity nor the presence of territory names near the top
    // is checked — confirm which result field carries the score before adding
    // an ordering assertion.
    expect(results.length).toBeGreaterThan(0);
  }, 30000);
});
|
|
});
|