diff --git a/packages/access-controls/src/sql-permissions/parser-helpers.test.ts b/packages/access-controls/src/sql-permissions/parser-helpers.test.ts new file mode 100644 index 000000000..c6b1059e6 --- /dev/null +++ b/packages/access-controls/src/sql-permissions/parser-helpers.test.ts @@ -0,0 +1,383 @@ +import { describe, it, expect } from 'vitest'; +import { + extractPhysicalTables, + extractColumnReferences, + tablesMatch, + checkQueryIsReadOnly, + validateWildcardUsage, + extractTablesFromYml, + extractDatasetsFromYml, +} from './parser-helpers'; + +describe('extractPhysicalTables', () => { + it('should handle BigQuery queries with backtick-quoted project names', () => { + const sql = "SELECT COUNT(DISTINCT u.user_id) as total_users FROM `buster-381916`.analytics.user u"; + const tables = extractPhysicalTables(sql, 'bigquery'); + + expect(tables).toHaveLength(1); + expect(tables[0]).toMatchObject({ + database: 'buster-381916', + schema: 'analytics', + table: 'user', + fullName: 'buster-381916.analytics.user' + }); + }); + + it('should handle standard BigQuery queries without backticks', () => { + const sql = 'SELECT * FROM project.dataset.table'; + const tables = extractPhysicalTables(sql, 'bigquery'); + + expect(tables).toHaveLength(1); + expect(tables[0]).toMatchObject({ + database: 'project', + schema: 'dataset', + table: 'table', + fullName: 'project.dataset.table' + }); + }); + + it('should handle PostgreSQL queries', () => { + const sql = 'SELECT * FROM public.users'; + const tables = extractPhysicalTables(sql, 'postgresql'); + + expect(tables).toHaveLength(1); + expect(tables[0]).toMatchObject({ + schema: 'public', + table: 'users', + fullName: 'public.users' + }); + }); + + it('should handle Snowflake queries with quoted identifiers', () => { + const sql = 'SELECT * FROM "DATABASE"."SCHEMA"."TABLE"'; + const tables = extractPhysicalTables(sql, 'snowflake'); + + expect(tables).toHaveLength(1); + expect(tables[0]).toMatchObject({ + database: 'DATABASE', + 
schema: 'SCHEMA', + table: 'TABLE', + fullName: 'DATABASE.SCHEMA.TABLE' + }); + }); + + it('should handle multiple tables in joins', () => { + const sql = ` + SELECT u.*, o.name + FROM users u + JOIN orders o ON u.id = o.user_id + `; + const tables = extractPhysicalTables(sql); + + expect(tables).toHaveLength(2); + expect(tables.map(t => t.table)).toEqual(['users', 'orders']); + }); + + it('should ignore CTEs and subqueries', () => { + const sql = ` + WITH user_stats AS ( + SELECT user_id, COUNT(*) as count + FROM orders + GROUP BY user_id + ) + SELECT * FROM users u JOIN user_stats s ON u.id = s.user_id + `; + const tables = extractPhysicalTables(sql); + + expect(tables).toHaveLength(2); + expect(tables.map(t => t.table)).toEqual(['orders', 'users']); + }); +}); + +describe('extractColumnReferences', () => { + it('should extract columns from WHERE clause', () => { + const sql = "SELECT id FROM users WHERE status = 'active' AND created_at > '2024-01-01'"; + const columns = extractColumnReferences(sql); + + expect(columns.has('users')).toBe(true); + const userColumns = Array.from(columns.get('users')!); + expect(userColumns).toContain('id'); + expect(userColumns).toContain('status'); + expect(userColumns).toContain('created_at'); + }); + + it('should extract columns from JOIN conditions', () => { + const sql = ` + SELECT u.id, o.total + FROM users u + JOIN orders o ON u.id = o.user_id + WHERE o.status = 'completed' + `; + const columns = extractColumnReferences(sql); + + expect(columns.has('users')).toBe(true); + expect(Array.from(columns.get('users')!)).toEqual(['id']); + + expect(columns.has('orders')).toBe(true); + expect(Array.from(columns.get('orders')!)).toContain('total'); + expect(Array.from(columns.get('orders')!)).toContain('user_id'); + expect(Array.from(columns.get('orders')!)).toContain('status'); + }); + + it('should handle SELECT * queries', () => { + const sql = 'SELECT * FROM users'; + const columns = extractColumnReferences(sql); + + // SELECT * 
doesn't extract individual columns, it returns an empty map + expect(columns.size).toBe(0); + }); +}); + +describe('checkQueryIsReadOnly', () => { + it('should allow SELECT queries', () => { + const result = checkQueryIsReadOnly('SELECT * FROM users'); + expect(result.isReadOnly).toBe(true); + expect(result.error).toBeUndefined(); + }); + + it('should allow BigQuery SELECT with backticks', () => { + const result = checkQueryIsReadOnly( + "SELECT COUNT(DISTINCT u.user_id) FROM `buster-381916`.analytics.user u", + 'bigquery' + ); + expect(result.isReadOnly).toBe(true); + expect(result.error).toBeUndefined(); + }); + + it('should reject INSERT queries', () => { + const result = checkQueryIsReadOnly("INSERT INTO users VALUES ('test')"); + expect(result.isReadOnly).toBe(false); + expect(result.error).toContain("Query type 'INSERT' is not allowed"); + }); + + it('should reject UPDATE queries', () => { + const result = checkQueryIsReadOnly("UPDATE users SET name = 'test'"); + expect(result.isReadOnly).toBe(false); + expect(result.error).toContain("Query type 'UPDATE' is not allowed"); + }); + + it('should reject DELETE queries', () => { + const result = checkQueryIsReadOnly('DELETE FROM users'); + expect(result.isReadOnly).toBe(false); + expect(result.error).toContain("Query type 'DELETE' is not allowed"); + }); + + it('should reject DDL queries', () => { + const result = checkQueryIsReadOnly('CREATE TABLE test (id INT)'); + expect(result.isReadOnly).toBe(false); + expect(result.error).toContain("Query type 'CREATE' is not allowed"); + }); + + it('should allow SELECT with CTEs', () => { + const sql = ` + WITH stats AS (SELECT COUNT(*) FROM orders) + SELECT * FROM stats + `; + const result = checkQueryIsReadOnly(sql); + expect(result.isReadOnly).toBe(true); + }); +}); + +describe('validateWildcardUsage', () => { + it('should reject SELECT * on physical tables', () => { + const result = validateWildcardUsage('SELECT * FROM users'); + expect(result.isValid).toBe(false); + 
expect(result.error).toContain('SELECT * is not allowed on physical table'); + }); + + it('should reject SELECT * on BigQuery tables with backticks', () => { + const result = validateWildcardUsage( + 'SELECT * FROM `buster-381916`.analytics.user', + 'bigquery' + ); + expect(result.isValid).toBe(false); + expect(result.error).toContain('SELECT * is not allowed on physical table'); + }); + + it('should allow specific column selection', () => { + const result = validateWildcardUsage('SELECT id, name FROM users'); + expect(result.isValid).toBe(true); + expect(result.error).toBeUndefined(); + }); + + it('should allow specific columns on BigQuery tables', () => { + const result = validateWildcardUsage( + "SELECT u.user_id, u.name FROM `buster-381916`.analytics.user u", + 'bigquery' + ); + expect(result.isValid).toBe(true); + expect(result.error).toBeUndefined(); + }); + + it('should allow SELECT * on CTEs', () => { + const sql = ` + WITH user_stats AS ( + SELECT user_id, COUNT(*) as count + FROM orders + GROUP BY user_id + ) + SELECT * FROM user_stats + `; + const result = validateWildcardUsage(sql); + expect(result.isValid).toBe(true); + }); +}); + +describe('tablesMatch', () => { + it('should match exact table names', () => { + const queryTable = { table: 'users', fullName: 'users' }; + const allowedTable = { table: 'users', fullName: 'users' }; + expect(tablesMatch(queryTable, allowedTable)).toBe(true); + }); + + it('should match with schema qualification', () => { + const queryTable = { schema: 'public', table: 'users', fullName: 'public.users' }; + const allowedTable = { table: 'users', fullName: 'users' }; + expect(tablesMatch(queryTable, allowedTable)).toBe(true); + }); + + it('should match BigQuery three-part names', () => { + const queryTable = { + database: 'buster-381916', + schema: 'analytics', + table: 'user', + fullName: 'buster-381916.analytics.user' + }; + const allowedTable = { + database: 'buster-381916', + schema: 'analytics', + table: 'user', + 
fullName: 'buster-381916.analytics.user' + }; + expect(tablesMatch(queryTable, allowedTable)).toBe(true); + }); + + it('should be case-insensitive', () => { + const queryTable = { table: 'USERS', fullName: 'USERS' }; + const allowedTable = { table: 'users', fullName: 'users' }; + expect(tablesMatch(queryTable, allowedTable)).toBe(true); + }); + + it('should not match different tables', () => { + const queryTable = { table: 'orders', fullName: 'orders' }; + const allowedTable = { table: 'users', fullName: 'users' }; + expect(tablesMatch(queryTable, allowedTable)).toBe(false); + }); +}); + +describe('extractTablesFromYml', () => { + it('should extract tables from YML content with models array', () => { + const yml = ` +models: + - name: users + database: prod + schema: public + - name: orders + schema: sales +`; + + const tables = extractTablesFromYml(yml); + expect(tables).toHaveLength(2); + expect(tables[0]).toMatchObject({ + database: 'prod', + schema: 'public', + table: 'users', + fullName: 'prod.public.users' + }); + expect(tables[1]).toMatchObject({ + schema: 'sales', + table: 'orders', + fullName: 'sales.orders' + }); + }); + + it('should handle flat format YML', () => { + const yml = ` +name: user +database: buster-381916 +schema: analytics +`; + + const tables = extractTablesFromYml(yml); + expect(tables).toHaveLength(1); + expect(tables[0]).toMatchObject({ + database: 'buster-381916', + schema: 'analytics', + table: 'user', + fullName: 'buster-381916.analytics.user' + }); + }); + + it('should handle BigQuery project names in models array', () => { + const yml = ` +models: + - name: user + database: buster-381916 + schema: analytics +`; + + const tables = extractTablesFromYml(yml); + expect(tables).toHaveLength(1); + expect(tables[0]).toMatchObject({ + database: 'buster-381916', + schema: 'analytics', + table: 'user', + fullName: 'buster-381916.analytics.user' + }); + }); +}); + +describe('extractDatasetsFromYml', () => { + it('should extract datasets with 
dimensions and measures', () => { + const yml = ` +name: users +database: prod +schema: public +dimensions: + - name: id + - name: name + - name: email +measures: + - name: count +`; + + const datasets = extractDatasetsFromYml(yml); + expect(datasets).toHaveLength(1); + expect(datasets[0].table).toBe('users'); + expect(datasets[0].allowedColumns.size).toBe(4); + expect(datasets[0].allowedColumns.has('id')).toBe(true); + expect(datasets[0].allowedColumns.has('name')).toBe(true); + expect(datasets[0].allowedColumns.has('email')).toBe(true); + expect(datasets[0].allowedColumns.has('count')).toBe(true); + }); + + it('should handle models array with dimensions', () => { + const yml = ` +models: + - name: orders + schema: sales + dimensions: + - name: id + - name: status +`; + + const datasets = extractDatasetsFromYml(yml); + expect(datasets).toHaveLength(1); + expect(datasets[0].table).toBe('orders'); + expect(datasets[0].allowedColumns.size).toBe(2); + expect(datasets[0].allowedColumns.has('id')).toBe(true); + expect(datasets[0].allowedColumns.has('status')).toBe(true); + }); + + it('should handle datasets without column restrictions', () => { + const yml = ` +name: products +schema: inventory +`; + + const datasets = extractDatasetsFromYml(yml); + expect(datasets).toHaveLength(1); + expect(datasets[0].table).toBe('products'); + expect(datasets[0].allowedColumns.size).toBe(0); + }); +}); \ No newline at end of file diff --git a/packages/ai/src/utils/sql-permissions/execute-with-permission-check.ts b/packages/ai/src/utils/sql-permissions/execute-with-permission-check.ts deleted file mode 100644 index b228966b1..000000000 --- a/packages/ai/src/utils/sql-permissions/execute-with-permission-check.ts +++ /dev/null @@ -1,52 +0,0 @@ -import { createPermissionErrorMessage, validateSqlPermissions } from './permission-validator'; - -export interface ExecuteWithPermissionResult { - success: boolean; - data?: T; - error?: string; -} - -/** - * Wraps SQL execution with permission 
validation - * Ensures user has access to all tables referenced in the query - */ -export async function executeWithPermissionCheck( - sql: string, - userId: string, - executeFn: () => Promise, - dataSourceSyntax?: string -): Promise> { - if (!userId) { - return { - success: false, - error: 'User authentication required for SQL execution', - }; - } - - // Validate permissions - const permissionResult = await validateSqlPermissions(sql, userId, dataSourceSyntax); - - if (!permissionResult.isAuthorized) { - return { - success: false, - error: createPermissionErrorMessage( - permissionResult.unauthorizedTables, - permissionResult.unauthorizedColumns - ), - }; - } - - // Execute if authorized - try { - const result = await executeFn(); - return { - success: true, - data: result, - }; - } catch (error) { - return { - success: false, - error: error instanceof Error ? error.message : 'SQL execution failed', - }; - } -} diff --git a/packages/ai/src/utils/sql-permissions/index.ts b/packages/ai/src/utils/sql-permissions/index.ts deleted file mode 100644 index 0628f8c39..000000000 --- a/packages/ai/src/utils/sql-permissions/index.ts +++ /dev/null @@ -1,3 +0,0 @@ -export * from './sql-parser-helpers'; -export * from './permission-validator'; -export * from './execute-with-permission-check'; diff --git a/packages/ai/src/utils/sql-permissions/permission-validator.test.ts b/packages/ai/src/utils/sql-permissions/permission-validator.test.ts deleted file mode 100644 index a2f4391a7..000000000 --- a/packages/ai/src/utils/sql-permissions/permission-validator.test.ts +++ /dev/null @@ -1,1263 +0,0 @@ -import * as accessControls from '@buster/access-controls'; -import { beforeEach, describe, expect, it, vi } from 'vitest'; -import { createPermissionErrorMessage, validateSqlPermissions } from './permission-validator'; - -// Mock the access controls module -vi.mock('@buster/access-controls', () => ({ - getPermissionedDatasets: vi.fn(), -})); - -describe('Permission Validator', () => { - 
describe('validateSqlPermissions', () => { - beforeEach(() => { - vi.resetAllMocks(); - }); - - it('should allow queries with no tables', async () => { - const result = await validateSqlPermissions('SELECT 1', 'user123'); - expect(result).toEqual({ - isAuthorized: true, - unauthorizedTables: [], - }); - }); - - it('should allow access to permitted table', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - models: - - name: users - schema: public - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const result = await validateSqlPermissions('SELECT id, name FROM public.users', 'user123'); - - expect(result).toEqual({ - isAuthorized: true, - unauthorizedTables: [], - }); - }); - - it('should deny access to unpermitted table', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - models: - - name: users - schema: public - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const result = await validateSqlPermissions( - 'SELECT id, user_id FROM public.orders', - 'user123' - ); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedTables).toEqual(['public.orders']); - // May also have unauthorizedColumns for the unauthorized table - }); - - it('should check multiple tables in JOIN', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - models: - - name: users - schema: public - - name: orders - schema: public - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const result = await validateSqlPermissions( - 'SELECT u.id, u.name, o.id, o.total FROM public.users u JOIN public.orders o ON u.id = o.user_id', - 'user123' - ); - - expect(result).toEqual({ - isAuthorized: true, - 
unauthorizedTables: [], - }); - }); - - it('should deny when one table in JOIN is unpermitted', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - models: - - name: users - schema: public - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const result = await validateSqlPermissions( - 'SELECT u.id, u.name, o.id, o.total FROM public.users u JOIN sales.orders o ON u.id = o.user_id', - 'user123' - ); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedTables).toEqual(['sales.orders']); - // May also have unauthorizedColumns for the unauthorized table - }); - - it('should handle complex query with CTEs', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - models: - - name: product_total_revenue - schema: ont_ont - - name: product_quarterly_sales - schema: ont_ont - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const sql = ` - WITH top5 AS ( - SELECT ptr.product_name, SUM(ptr.metric_producttotalrevenue) AS total_revenue - FROM ont_ont.product_total_revenue AS ptr - GROUP BY ptr.product_name - ) - SELECT pqs.product_name, pqs.quarter, t.total_revenue - FROM ont_ont.product_quarterly_sales AS pqs - JOIN top5 t ON pqs.product_name = t.product_name - `; - - const result = await validateSqlPermissions(sql, 'user123'); - - expect(result).toEqual({ - isAuthorized: true, - unauthorizedTables: [], - }); - }); - - it('should handle subqueries', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - models: - - name: users - schema: public - - name: orders - schema: public - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const sql = ` - SELECT u.id, u.name FROM public.users u - WHERE 
u.id IN ( - SELECT user_id FROM public.orders WHERE total > 100 - ) - `; - - const result = await validateSqlPermissions(sql, 'user123'); - - expect(result).toEqual({ - isAuthorized: true, - unauthorizedTables: [], - }); - }); - - it('should match tables with different qualification levels', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - models: - - name: users - schema: public - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - // Query has full qualification, permission has partial - // Note: Parser may not support database.schema.table in FROM clause - const result = await validateSqlPermissions('SELECT id, name FROM public.users', 'user123'); - - expect(result).toEqual({ - isAuthorized: true, - unauthorizedTables: [], - }); - }); - - it('should require schema match when permission specifies schema', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - models: - - name: users - table_name: public.users - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - // Query missing schema that permission requires - const result = await validateSqlPermissions('SELECT id, name FROM users', 'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedTables).toContain('users'); - }); - - it('should handle permission check errors gracefully', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockRejectedValueOnce( - new Error('Database connection failed') - ); - - const result = await validateSqlPermissions('SELECT id, name FROM users', 'user123'); - - expect(result).toEqual({ - isAuthorized: false, - unauthorizedTables: [], - error: - 'Permission validation failed: Database connection failed. 
Please verify your SQL query syntax and ensure you have access to the requested resources.', - }); - }); - - it('should handle SQL parse errors gracefully', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [], - total: 0, - page: 0, - pageSize: 1000, - }); - - const result = await validateSqlPermissions('INVALID SQL SYNTAX HERE', 'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.error).toContain('Failed to parse SQL query'); - }); - - it('should reject INSERT statements', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - models: - - name: users - schema: public - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const result = await validateSqlPermissions( - 'INSERT INTO public.users (name) VALUES ("test")', - 'user123' - ); - - expect(result.isAuthorized).toBe(false); - expect(result.error).toContain("Query type 'INSERT' is not allowed"); - expect(result.error).toContain('To read data, use SELECT statements instead of INSERT'); - expect(result.unauthorizedTables).toHaveLength(0); - }); - - it('should reject UPDATE statements', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - models: - - name: users - schema: public - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const result = await validateSqlPermissions( - 'UPDATE public.users SET name = "updated" WHERE id = 1', - 'user123' - ); - - expect(result.isAuthorized).toBe(false); - expect(result.error).toContain("Query type 'UPDATE' is not allowed"); - expect(result.error).toContain('To read data, use SELECT statements instead of UPDATE'); - }); - - it('should reject DELETE statements', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ 
- { - ymlContent: ` - models: - - name: users - schema: public - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const result = await validateSqlPermissions( - 'DELETE FROM public.users WHERE id = 1', - 'user123' - ); - - expect(result.isAuthorized).toBe(false); - expect(result.error).toContain("Query type 'DELETE' is not allowed"); - expect(result.error).toContain('To read data, use SELECT statements instead of DELETE'); - }); - - it('should reject CREATE TABLE statements', async () => { - const result = await validateSqlPermissions('CREATE TABLE new_table (id INT)', 'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.error).toContain("Query type 'CREATE' is not allowed"); - expect(result.error).toContain('DDL operations like CREATE are not permitted'); - }); - - it('should reject DROP TABLE statements', async () => { - const result = await validateSqlPermissions('DROP TABLE users', 'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.error).toContain("Query type 'DROP' is not allowed"); - expect(result.error).toContain('DDL operations like DROP are not permitted'); - }); - - it('should handle multiple datasets with overlapping tables', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - models: - - name: users - schema: public - `, - } as accessControls.PermissionedDataset, - { - ymlContent: ` - models: - - name: users - schema: public - - name: orders - schema: public - `, - } as accessControls.PermissionedDataset, - ], - total: 2, - page: 0, - pageSize: 1000, - }); - - const result = await validateSqlPermissions( - 'SELECT u.id, u.name, o.id, o.total FROM public.users u JOIN public.orders o ON u.id = o.user_id', - 'user123' - ); - - expect(result).toEqual({ - isAuthorized: true, - unauthorizedTables: [], - }); - }); - - it('should block queries with unauthorized columns', async () => { - 
vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - name: customer_feedback - database: reporting - schema: public - dimensions: - - name: product_id - - name: category - - name: region - measures: - - name: feedback_score - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const sql = ` - SELECT AVG(cf.customer_age) as avg_age - FROM reporting.public.customer_feedback cf - WHERE cf.category = 'Electronics' - AND cf.region IS NOT NULL - AND cf.customer_age IS NOT NULL - `; - - const result = await validateSqlPermissions(sql, 'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedColumns).toBeDefined(); - expect(result.unauthorizedColumns).toContainEqual( - expect.objectContaining({ - column: 'customer_age', - }) - ); - }); - - it('should allow queries with only authorized columns', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - name: users - schema: public - dimensions: - - name: id - - name: name - - name: email - measures: - - name: total_orders - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const result = await validateSqlPermissions( - 'SELECT id, name, email FROM public.users WHERE id = 1', - 'user123' - ); - - expect(result.isAuthorized).toBe(true); - expect(result.unauthorizedColumns).toBeUndefined(); - }); - - it('should handle columns in aggregate functions', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - name: orders - schema: sales - dimensions: - - name: order_id - - name: user_id - measures: - - name: total - - name: quantity - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const result = await validateSqlPermissions( - 'SELECT user_id, SUM(total) as 
revenue, AVG(quantity) FROM sales.orders GROUP BY user_id', - 'user123' - ); - - expect(result.isAuthorized).toBe(true); - }); - - it('should detect unauthorized columns in WHERE clause', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - name: products - schema: catalog - dimensions: - - name: product_id - - name: name - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const result = await validateSqlPermissions( - 'SELECT product_id FROM catalog.products WHERE price > 100', - 'user123' - ); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedColumns).toContainEqual( - expect.objectContaining({ - column: 'price', - }) - ); - }); - - it('should handle backward compatibility for tables without column definitions', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - models: - - name: legacy_table - schema: public - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - // Should allow any columns for backward compatibility - const result = await validateSqlPermissions( - 'SELECT any_column, another_column FROM public.legacy_table', - 'user123' - ); - - expect(result.isAuthorized).toBe(true); - }); - - it('should deny access when using unauthorized columns in complex CTE query', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - name: users - schema: public - dimensions: - - name: id - - name: name - - name: email - measures: - - name: user_count - `, - } as accessControls.PermissionedDataset, - { - ymlContent: ` - name: orders - schema: public - dimensions: - - name: id - - name: user_id - - name: total - measures: - - name: order_count - `, - } as accessControls.PermissionedDataset, - ], - total: 2, - page: 0, - pageSize: 1000, 
- }); - - const sql = ` - WITH user_stats AS ( - SELECT - u.id, - u.name, - u.salary, -- unauthorized column - COUNT(o.id) as order_count - FROM public.users u - LEFT JOIN public.orders o ON u.id = o.user_id - GROUP BY u.id, u.name, u.salary - ) - SELECT * FROM user_stats WHERE order_count > 5 - `; - - const result = await validateSqlPermissions(sql, 'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedColumns).toBeDefined(); - expect(result.unauthorizedColumns).toContainEqual( - expect.objectContaining({ - table: 'public.users', - column: 'salary', - }) - ); - }); - - it('should deny access to entire tables not in user permissions', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - models: - - name: customers - schema: public - - name: orders - schema: public - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const sql = ` - SELECT - c.customer_name, - o.total, - p.product_name, -- products table not authorized - i.invoice_number -- invoices table not authorized - FROM public.customers c - JOIN public.orders o ON c.id = o.customer_id - JOIN public.products p ON o.product_id = p.id - JOIN public.invoices i ON o.id = i.order_id - `; - - const result = await validateSqlPermissions(sql, 'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedTables).toContain('public.products'); - expect(result.unauthorizedTables).toContain('public.invoices'); - expect(result.unauthorizedTables).not.toContain('public.customers'); - expect(result.unauthorizedTables).not.toContain('public.orders'); - }); - - it('should validate columns across nested subqueries', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - name: employees - database: hr - schema: public - dimensions: - - name: employee_id - - name: first_name - - name: 
last_name - - name: department_id - measures: - - name: employee_count - `, - } as accessControls.PermissionedDataset, - { - ymlContent: ` - name: departments - database: hr - schema: public - dimensions: - - name: department_id - - name: department_name - `, - } as accessControls.PermissionedDataset, - ], - total: 2, - page: 0, - pageSize: 1000, - }); - - const sql = ` - SELECT - e.first_name, - e.last_name, - e.salary, -- unauthorized column - ( - SELECT d.department_name - FROM hr.public.departments d - WHERE d.department_id = e.department_id - ) as dept_name, - ( - SELECT d.budget -- unauthorized column in subquery - FROM hr.public.departments d - WHERE d.department_id = e.department_id - ) as dept_budget - FROM hr.public.employees e - WHERE e.employee_id IN ( - SELECT employee_id - FROM hr.public.employees - WHERE hire_date > '2024-01-01' -- unauthorized column in WHERE - ) - `; - - const result = await validateSqlPermissions(sql, 'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedColumns).toBeDefined(); - expect(result.unauthorizedColumns?.length).toBeGreaterThan(0); - - const unauthorizedCols = result.unauthorizedColumns?.map((c) => c.column) || []; - expect(unauthorizedCols).toContain('salary'); - expect(unauthorizedCols).toContain('budget'); - expect(unauthorizedCols).toContain('hire_date'); - }); - - it('should handle complex UNION queries with mixed permissions', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - name: employees - schema: public - dimensions: - - name: id - - name: name - - name: department - `, - } as accessControls.PermissionedDataset, - { - ymlContent: ` - name: contractors - schema: public - dimensions: - - name: id - - name: name - - name: agency - `, - } as accessControls.PermissionedDataset, - ], - total: 2, - page: 0, - pageSize: 1000, - }); - - const sql = ` - SELECT - id, - name, - department as org, - salary -- 
unauthorized column - FROM public.employees - WHERE status = 'active' -- unauthorized column - - UNION ALL - - SELECT - id, - name, - agency as org, - hourly_rate -- unauthorized column - FROM public.contractors - WHERE end_date > CURRENT_DATE -- unauthorized column - - UNION ALL - - SELECT - id, - name, - school as org, -- entire interns table is unauthorized - stipend - FROM public.interns - `; - - const result = await validateSqlPermissions(sql, 'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedTables).toContain('public.interns'); - expect(result.unauthorizedColumns).toBeDefined(); - - const unauthorizedByTable = new Map(); - result.unauthorizedColumns?.forEach(({ table, column }) => { - if (!unauthorizedByTable.has(table)) { - unauthorizedByTable.set(table, []); - } - unauthorizedByTable.get(table)?.push(column); - }); - - expect(unauthorizedByTable.get('public.employees')).toContain('salary'); - expect(unauthorizedByTable.get('public.employees')).toContain('status'); - expect(unauthorizedByTable.get('public.contractors')).toContain('hourly_rate'); - expect(unauthorizedByTable.get('public.contractors')).toContain('end_date'); - }); - - it('should validate fully qualified table names correctly', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - name: sales_data - schema: reporting - dimensions: - - name: sale_id - - name: product_id - - name: quantity - - name: revenue - `, - } as accessControls.PermissionedDataset, - { - ymlContent: ` - name: products - schema: public - dimensions: - - name: product_id - - name: product_name - - name: category - `, - } as accessControls.PermissionedDataset, - ], - total: 2, - page: 0, - pageSize: 1000, - }); - - const sql = ` - SELECT - s.sale_id, - s.quantity, - s.revenue, - s.profit, -- unauthorized column - p.product_name, - p.category, - p.cost, -- unauthorized column - c.customer_name -- entire customers table is 
unauthorized - FROM reporting.sales_data s - JOIN public.products p ON s.product_id = p.product_id - JOIN reporting.customers c ON s.customer_id = c.customer_id - WHERE s.sale_date > '2024-01-01' -- unauthorized column - `; - - const result = await validateSqlPermissions(sql, 'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedTables).toContain('reporting.customers'); - expect(result.unauthorizedColumns).toBeDefined(); - - expect(result.unauthorizedColumns).toContainEqual( - expect.objectContaining({ - table: 'reporting.sales_data', - column: 'profit', - }) - ); - expect(result.unauthorizedColumns).toContainEqual( - expect.objectContaining({ - table: 'reporting.sales_data', - column: 'sale_date', - }) - ); - expect(result.unauthorizedColumns).toContainEqual( - expect.objectContaining({ - table: 'public.products', - column: 'cost', - }) - ); - }); - - it('should validate window functions and complex aggregations correctly', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - name: sales_transactions - schema: public - dimensions: - - name: transaction_id - - name: store_id - - name: product_id - - name: quantity - - name: revenue - measures: - - name: total_revenue - - name: total_quantity - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const sql = ` - SELECT - store_id, - product_id, - SUM(quantity) as total_qty, - SUM(revenue) as total_rev, - AVG(profit) as avg_profit, -- unauthorized column - ROW_NUMBER() OVER (PARTITION BY store_id ORDER BY SUM(revenue) DESC) as rank, - LAG(SUM(cost), 1) OVER (ORDER BY transaction_id) as prev_cost -- unauthorized column - FROM public.sales_transactions - WHERE transaction_date > '2024-01-01' -- unauthorized column - GROUP BY store_id, product_id, transaction_id - HAVING SUM(margin) > 1000 -- unauthorized column - `; - - const result = await validateSqlPermissions(sql, 
'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedColumns).toBeDefined(); - - const unauthorizedCols = result.unauthorizedColumns?.map((c) => c.column) || []; - expect(unauthorizedCols).toContain('profit'); - expect(unauthorizedCols).toContain('cost'); - expect(unauthorizedCols).toContain('transaction_date'); - expect(unauthorizedCols).toContain('margin'); - }); - - it('should handle recursive CTEs with proper table validation', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - name: organizational_units - schema: hr - dimensions: - - name: unit_id - - name: parent_unit_id - - name: unit_name - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - const sql = ` - WITH RECURSIVE org_hierarchy AS ( - SELECT - unit_id, - parent_unit_id, - unit_name, - budget, -- unauthorized column - 0 as level - FROM hr.organizational_units - WHERE parent_unit_id IS NULL - - UNION ALL - - SELECT - o.unit_id, - o.parent_unit_id, - o.unit_name, - o.headcount, -- unauthorized column - oh.level + 1 - FROM hr.organizational_units o - JOIN org_hierarchy oh ON o.parent_unit_id = oh.unit_id - ) - SELECT - oh.unit_name, - oh.level, - e.employee_count -- employees table not authorized - FROM org_hierarchy oh - LEFT JOIN hr.employee_stats e ON oh.unit_id = e.unit_id - `; - - const result = await validateSqlPermissions(sql, 'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedTables).toContain('hr.employee_stats'); - expect(result.unauthorizedColumns).toBeDefined(); - - const unauthorizedCols = result.unauthorizedColumns?.map((c) => c.column) || []; - expect(unauthorizedCols).toContain('budget'); - expect(unauthorizedCols).toContain('headcount'); - }); - - it('should validate EXISTS subqueries correctly', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ 
- { - ymlContent: ` - name: accounts - schema: finance - dimensions: - - name: account_id - - name: account_name - - name: account_type - `, - } as accessControls.PermissionedDataset, - { - ymlContent: ` - name: transactions - schema: finance - dimensions: - - name: transaction_id - - name: account_id - - name: amount - `, - } as accessControls.PermissionedDataset, - ], - total: 2, - page: 0, - pageSize: 1000, - }); - - const sql = ` - SELECT - a.account_id, - a.account_name, - a.balance -- unauthorized column - FROM finance.accounts a - WHERE EXISTS ( - SELECT 1 - FROM finance.transactions t - WHERE t.account_id = a.account_id - AND t.transaction_date > '2024-01-01' -- unauthorized column - AND t.status = 'completed' -- unauthorized column - ) - AND NOT EXISTS ( - SELECT 1 - FROM finance.audit_logs al -- entire table unauthorized - WHERE al.account_id = a.account_id - AND al.flag_type = 'suspicious' - ) - `; - - const result = await validateSqlPermissions(sql, 'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedTables).toContain('finance.audit_logs'); - expect(result.unauthorizedColumns).toBeDefined(); - - expect(result.unauthorizedColumns).toContainEqual( - expect.objectContaining({ - table: 'finance.accounts', - column: 'balance', - }) - ); - expect(result.unauthorizedColumns).toContainEqual( - expect.objectContaining({ - table: 'finance.transactions', - column: 'transaction_date', - }) - ); - expect(result.unauthorizedColumns).toContainEqual( - expect.objectContaining({ - table: 'finance.transactions', - column: 'status', - }) - ); - }); - - it('should handle SELECT * and still validate columns in other clauses', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - name: orders - schema: sales - dimensions: - - name: order_id - - name: customer_id - - name: total - `, - } as accessControls.PermissionedDataset, - { - ymlContent: ` - name: customers - schema: 
sales - dimensions: - - name: customer_id - - name: customer_name - `, - } as accessControls.PermissionedDataset, - ], - total: 2, - page: 0, - pageSize: 1000, - }); - - const sql = ` - SELECT * - FROM sales.orders o - JOIN sales.customers c ON o.customer_id = c.customer_id - WHERE o.status = 'shipped' -- unauthorized column - AND c.credit_score > 700 -- unauthorized column - AND o.shipping_cost < 50 -- unauthorized column - ORDER BY o.priority DESC -- unauthorized column - `; - - const result = await validateSqlPermissions(sql, 'user123'); - - expect(result.isAuthorized).toBe(false); - expect(result.unauthorizedColumns).toBeDefined(); - - const unauthorizedCols = - result.unauthorizedColumns?.map((c) => ({ table: c.table, column: c.column })) || []; - expect(unauthorizedCols).toContainEqual({ table: 'sales.orders', column: 'status' }); - expect(unauthorizedCols).toContainEqual({ table: 'sales.orders', column: 'shipping_cost' }); - expect(unauthorizedCols).toContainEqual({ table: 'sales.orders', column: 'priority' }); - expect(unauthorizedCols).toContainEqual({ table: 'sales.customers', column: 'credit_score' }); - }); - - it('should not treat column aliases as physical columns requiring permissions', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - name: sales_order_header - database: postgres - schema: ont_ont - dimensions: - - name: orderdate - - name: subtotal - - name: salesorderid - `, - } as accessControls.PermissionedDataset, - ], - total: 1, - page: 0, - pageSize: 1000, - }); - - // This is the exact query from the user's example - const sql = ` - SELECT - DATE_TRUNC('month', soh.orderdate) as order_month, - SUM(soh.subtotal) as monthly_revenue - FROM postgres.ont_ont.sales_order_header soh - WHERE soh.orderdate >= CURRENT_DATE - INTERVAL '12 months' - GROUP BY DATE_TRUNC('month', soh.orderdate) - ORDER BY order_month DESC - LIMIT 12 - `; - - const result = await 
validateSqlPermissions(sql, 'user123'); - - // Should be authorized - order_month is just an alias, not a physical column - expect(result.isAuthorized).toBe(true); - expect(result.unauthorizedColumns).toBeUndefined(); - expect(result.error).toBeUndefined(); - }); - - it('should not treat aliases in complex JOIN queries as physical columns', async () => { - vi.mocked(accessControls.getPermissionedDatasets).mockResolvedValueOnce({ - datasets: [ - { - ymlContent: ` - name: sales_order_detail - database: postgres - schema: ont_ont - dimensions: - - name: linetotal - - name: productid - - name: salesorderid - `, - } as accessControls.PermissionedDataset, - { - ymlContent: ` - name: product - database: postgres - schema: ont_ont - dimensions: - - name: productid - - name: name - `, - } as accessControls.PermissionedDataset, - { - ymlContent: ` - name: sales_order_header - database: postgres - schema: ont_ont - dimensions: - - name: salesorderid - - name: orderdate - `, - } as accessControls.PermissionedDataset, - ], - total: 3, - page: 0, - pageSize: 1000, - }); - - // This is the second query from the user's example - const sql = ` - SELECT - p.name as product_name, - SUM(sod.linetotal) as product_revenue - FROM postgres.ont_ont.sales_order_detail sod - JOIN postgres.ont_ont.product p ON sod.productid = p.productid - JOIN postgres.ont_ont.sales_order_header soh ON sod.salesorderid = soh.salesorderid - WHERE soh.orderdate >= CURRENT_DATE - INTERVAL '12 months' - GROUP BY p.productid, p.name - ORDER BY product_revenue DESC - LIMIT 15 - `; - - const result = await validateSqlPermissions(sql, 'user123'); - - // Should be authorized - product_name and product_revenue are just aliases - expect(result.isAuthorized).toBe(true); - expect(result.unauthorizedColumns).toBeUndefined(); - expect(result.error).toBeUndefined(); - }); - }); - - describe('createPermissionErrorMessage', () => { - it('should handle empty arrays', () => { - expect(createPermissionErrorMessage([], 
[])).toBe(''); - }); - - it('should handle single table', () => { - expect(createPermissionErrorMessage(['public.users'])).toBe( - 'Insufficient permissions: You do not have access to table: public.users. Please request access to this table or use a different table that you have permissions for.' - ); - }); - - it('should handle multiple tables', () => { - expect(createPermissionErrorMessage(['public.users', 'sales.orders'])).toBe( - 'Insufficient permissions: You do not have access to the following tables: public.users, sales.orders. Please request access to these tables or modify your query to use only authorized tables.' - ); - }); - - it('should handle unauthorized columns', () => { - const message = createPermissionErrorMessage( - [], - [ - { table: 'users', column: 'salary' }, - { table: 'users', column: 'ssn' }, - ] - ); - expect(message).toContain('Unauthorized column access'); - expect(message).toContain('salary'); - expect(message).toContain('ssn'); - }); - - it('should handle both tables and columns', () => { - const message = createPermissionErrorMessage( - ['private.secrets'], - [{ table: 'users', column: 'password' }] - ); - expect(message).toContain( - 'You do not have access to table: private.secrets. 
Please request access' - ); - expect(message).toContain('Unauthorized column access'); - expect(message).toContain('password'); - }); - }); -}); diff --git a/packages/ai/src/utils/sql-permissions/permission-validator.ts b/packages/ai/src/utils/sql-permissions/permission-validator.ts deleted file mode 100644 index 15fd4e34a..000000000 --- a/packages/ai/src/utils/sql-permissions/permission-validator.ts +++ /dev/null @@ -1,276 +0,0 @@ -import { getPermissionedDatasets } from '@buster/access-controls'; -import { - type ParsedDataset, - type ParsedTable, - checkQueryIsReadOnly, - extractColumnReferences, - extractDatasetsFromYml, - extractPhysicalTables, - extractTablesFromYml, - tablesMatch, - validateWildcardUsage, -} from './sql-parser-helpers.js'; - -export interface UnauthorizedColumn { - table: string; - column: string; -} - -export interface PermissionValidationResult { - isAuthorized: boolean; - unauthorizedTables: string[]; - unauthorizedColumns?: UnauthorizedColumn[]; - error?: string; -} - -/** - * Validates SQL query against user's permissioned datasets - * Checks that all tables and columns referenced in the query are accessible to the user - */ -export async function validateSqlPermissions( - sql: string, - userId: string, - dataSourceSyntax?: string -): Promise { - try { - // First check if query is read-only - const readOnlyCheck = checkQueryIsReadOnly(sql, dataSourceSyntax); - if (!readOnlyCheck.isReadOnly) { - return { - isAuthorized: false, - unauthorizedTables: [], - error: - readOnlyCheck.error || - 'Only SELECT statements are allowed for read-only access. 
Please modify your query to use SELECT instead of write operations like INSERT, UPDATE, DELETE, or DDL statements.', - }; - } - - const wildcardCheck = validateWildcardUsage(sql, dataSourceSyntax); - // Store the wildcard error but continue to validate columns to provide comprehensive feedback - let wildcardError: string | undefined; - if (!wildcardCheck.isValid) { - wildcardError = - wildcardCheck.error || - 'SELECT * is not allowed on physical tables. Please explicitly list the column names you need (e.g., SELECT id, name, email FROM users). This helps prevent unintended data exposure and improves query performance.'; - } - - // Extract physical tables from SQL - const tablesInQuery = extractPhysicalTables(sql, dataSourceSyntax); - - if (tablesInQuery.length === 0) { - // No tables referenced (might be a function call or constant select) - return { isAuthorized: true, unauthorizedTables: [] }; - } - - // Get user's permissioned datasets - const permissionedDatasets = await getPermissionedDatasets({ - userId, - page: 0, - pageSize: 1000, - }); - - // Extract all allowed tables and datasets from permissions - const allowedTables: ParsedTable[] = []; - const allowedDatasets: ParsedDataset[] = []; - - for (const dataset of permissionedDatasets.datasets) { - if (dataset.ymlContent) { - const tables = extractTablesFromYml(dataset.ymlContent); - allowedTables.push(...tables); - - const datasetsWithColumns = extractDatasetsFromYml(dataset.ymlContent); - allowedDatasets.push(...datasetsWithColumns); - } - } - - // Check each table in query against permissions - const unauthorizedTables: string[] = []; - - for (const queryTable of tablesInQuery) { - let isAuthorized = false; - - // Check if query table matches any allowed table - for (const allowedTable of allowedTables) { - const matches = tablesMatch(queryTable, allowedTable); - if (matches) { - isAuthorized = true; - break; - } - } - - if (!isAuthorized) { - unauthorizedTables.push(queryTable.fullName); - } - } - - // 
Continue to validate column-level permissions even if tables are unauthorized - // This allows us to report both unauthorized tables AND their columns - const columnReferences = extractColumnReferences(sql, dataSourceSyntax); - const unauthorizedColumns: UnauthorizedColumn[] = []; - - for (const [tableName, columns] of columnReferences) { - // Find the matching allowed dataset for this table - let matchingDataset: ParsedDataset | undefined; - - for (const dataset of allowedDatasets) { - // Check if table names match (case-insensitive) - const tableNameLower = tableName.toLowerCase(); - const datasetFullNameLower = dataset.fullName.toLowerCase(); - const datasetTableLower = dataset.table.toLowerCase(); - - // Handle different qualification levels - check both fullName and table - if ( - tableNameLower === datasetFullNameLower || - tableNameLower === datasetTableLower || - tableNameLower.endsWith(`.${datasetTableLower}`) || - datasetFullNameLower === tableNameLower - ) { - matchingDataset = dataset; - break; - } - } - - if (matchingDataset) { - // Found a matching dataset - validate columns if it has restrictions - if (matchingDataset.allowedColumns.size > 0) { - for (const column of columns) { - if (!matchingDataset.allowedColumns.has(column.toLowerCase())) { - unauthorizedColumns.push({ - table: tableName, - column: column, - }); - } - } - } - // If dataset has no column restrictions, it's backward compatibility mode (allow all columns) - } else { - // No matching dataset found - this table is completely unauthorized - // Check if this table was already marked as unauthorized - const isTableUnauthorized = unauthorizedTables.some((t) => { - const tLower = t.toLowerCase(); - const tableNameLower = tableName.toLowerCase(); - return ( - tLower === tableNameLower || - tLower.endsWith(`.${tableNameLower.split('.').pop()}`) || - tableNameLower.endsWith(`.${tLower.split('.').pop()}`) - ); - }); - - if (isTableUnauthorized) { - // Table is unauthorized, so all its columns 
are also unauthorized - for (const column of columns) { - unauthorizedColumns.push({ - table: tableName, - column: column, - }); - } - } - } - } - - const result: PermissionValidationResult = { - isAuthorized: - unauthorizedTables.length === 0 && unauthorizedColumns.length === 0 && !wildcardError, - unauthorizedTables, - }; - - if (unauthorizedColumns.length > 0) { - result.unauthorizedColumns = unauthorizedColumns; - } - - if (wildcardError) { - result.error = wildcardError; - } - - return result; - } catch (error) { - const errorMessage = error instanceof Error ? error.message : String(error); - - // Provide more specific guidance based on the error - if (errorMessage.includes('parse')) { - return { - isAuthorized: false, - unauthorizedTables: [], - error: `Failed to validate SQL permissions due to parsing error: ${errorMessage}. Please check your SQL syntax and ensure it's valid.`, - }; - } - - if (errorMessage.includes('permission')) { - return { - isAuthorized: false, - unauthorizedTables: [], - error: `Permission check failed: ${errorMessage}. Please ensure you have the necessary access rights for the requested tables and columns.`, - }; - } - - return { - isAuthorized: false, - unauthorizedTables: [], - error: `Permission validation failed: ${errorMessage}. Please verify your SQL query syntax and ensure you have access to the requested resources.`, - }; - } -} - -/** - * Creates a detailed error message for unauthorized table or column access - */ -export function createPermissionErrorMessage( - unauthorizedTables: string[], - unauthorizedColumns?: UnauthorizedColumn[] -): string { - const messages: string[] = []; - - // Handle unauthorized tables with actionable guidance - if (unauthorizedTables.length > 0) { - const tableList = unauthorizedTables.join(', '); - if (unauthorizedTables.length === 1) { - messages.push( - `You do not have access to table: ${tableList}. 
Please request access to this table or use a different table that you have permissions for.` - ); - } else { - messages.push( - `You do not have access to the following tables: ${tableList}. Please request access to these tables or modify your query to use only authorized tables.` - ); - } - } - - // Handle unauthorized columns - if (unauthorizedColumns && unauthorizedColumns.length > 0) { - // Group columns by table for better error messages - const columnsByTable = new Map(); - - for (const { table, column } of unauthorizedColumns) { - if (!columnsByTable.has(table)) { - columnsByTable.set(table, []); - } - const tableColumns = columnsByTable.get(table); - if (tableColumns) { - tableColumns.push(column); - } - } - - const columnMessages: string[] = []; - for (const [table, columns] of columnsByTable) { - const columnList = columns.join(', '); - columnMessages.push( - `Table '${table}': columns [${columnList}] are not available in your permitted dataset` - ); - } - - if (columnMessages.length === 1) { - messages.push( - `Unauthorized column access - ${columnMessages[0]}. Please use only the columns that are available in your permitted datasets, or request access to additional columns.` - ); - } else { - messages.push( - `Unauthorized column access:\n${columnMessages.map((m) => ` - ${m}`).join('\n')}\n\nPlease modify your query to use only the columns available in your permitted datasets, or request access to the additional columns you need.` - ); - } - } - - if (messages.length === 0) { - return ''; - } - - return `Insufficient permissions: ${messages.join('. 
')}`; -} diff --git a/packages/ai/src/utils/sql-permissions/sql-parser-helpers.test.ts b/packages/ai/src/utils/sql-permissions/sql-parser-helpers.test.ts deleted file mode 100644 index c48c023c4..000000000 --- a/packages/ai/src/utils/sql-permissions/sql-parser-helpers.test.ts +++ /dev/null @@ -1,1254 +0,0 @@ -import { describe, expect, it } from 'vitest'; -import { - type ParsedTable, - checkQueryIsReadOnly, - extractColumnReferences, - extractDatasetsFromYml, - extractPhysicalTables, - extractTablesFromYml, - normalizeTableIdentifier, - parseTableReference, - tablesMatch, - validateWildcardUsage, -} from './sql-parser-helpers'; - -describe('SQL Parser Helpers', () => { - describe('parseTableReference', () => { - it('should parse simple table name', () => { - const result = parseTableReference('users'); - expect(result).toEqual({ - table: 'users', - fullName: 'users', - }); - }); - - it('should parse schema.table format', () => { - const result = parseTableReference('public.users'); - expect(result).toEqual({ - schema: 'public', - table: 'users', - fullName: 'public.users', - }); - }); - - it('should parse database.schema.table format', () => { - const result = parseTableReference('mydb.public.users'); - expect(result).toEqual({ - database: 'mydb', - schema: 'public', - table: 'users', - fullName: 'mydb.public.users', - }); - }); - - it('should handle quoted identifiers', () => { - const result = parseTableReference('"my schema"."my table"'); - expect(result).toEqual({ - schema: 'my schema', - table: 'my table', - fullName: 'my schema.my table', - }); - }); - - it('should handle PostgreSQL :: separator', () => { - const result = parseTableReference('catalog::schema.table'); - expect(result).toEqual({ - database: 'catalog', - schema: 'schema', - table: 'table', - fullName: 'catalog.schema.table', - }); - }); - }); - - describe('extractPhysicalTables', () => { - it('should extract simple table from SELECT', () => { - const sql = 'SELECT * FROM users'; - const tables 
= extractPhysicalTables(sql); - expect(tables).toHaveLength(1); - expect(tables[0]).toMatchObject({ table: 'users' }); - }); - - it('should extract multiple tables from JOIN', () => { - const sql = 'SELECT u.id, o.order_id FROM users u JOIN orders o ON u.id = o.user_id'; - const tables = extractPhysicalTables(sql); - expect(tables).toHaveLength(2); - expect(tables.map((t) => t.table)).toEqual(['users', 'orders']); - }); - - it('should extract schema-qualified tables', () => { - const sql = 'SELECT * FROM public.users u JOIN sales.orders o ON u.id = o.user_id'; - const tables = extractPhysicalTables(sql); - expect(tables).toHaveLength(2); - expect(tables[0]).toMatchObject({ schema: 'public', table: 'users' }); - expect(tables[1]).toMatchObject({ schema: 'sales', table: 'orders' }); - }); - - it('should exclude CTEs from physical tables', () => { - const sql = ` - WITH user_stats AS ( - SELECT user_id, COUNT(*) as count FROM orders GROUP BY user_id - ) - SELECT u.name, us.count - FROM users u - JOIN user_stats us ON u.id = us.user_id - `; - const tables = extractPhysicalTables(sql); - expect(tables).toHaveLength(2); - expect(tables.map((t) => t.table)).toEqual(['orders', 'users']); - // user_stats is a CTE and should not be included - }); - - it('should handle complex query with multiple CTEs', () => { - const sql = ` - WITH top5 AS ( - SELECT ptr.product_name, SUM(ptr.metric_producttotalrevenue) AS total_revenue - FROM ont_ont.product_total_revenue AS ptr - GROUP BY ptr.product_name - ORDER BY total_revenue DESC - LIMIT 5 - ), - quarterly_data AS ( - SELECT * FROM ont_ont.product_quarterly_sales - ) - SELECT q.*, t.total_revenue - FROM quarterly_data q - JOIN top5 t ON q.product_name = t.product_name - `; - const tables = extractPhysicalTables(sql); - expect(tables).toHaveLength(2); - expect(tables[0]).toMatchObject({ schema: 'ont_ont', table: 'product_total_revenue' }); - expect(tables[1]).toMatchObject({ schema: 'ont_ont', table: 'product_quarterly_sales' }); - 
}); - - it('should handle subqueries', () => { - const sql = ` - SELECT * FROM users u - WHERE u.id IN ( - SELECT user_id FROM orders WHERE total > 100 - ) - `; - const tables = extractPhysicalTables(sql); - expect(tables).toHaveLength(2); - expect(tables.map((t) => t.table).sort()).toEqual(['orders', 'users']); - }); - - it('should handle UNION queries', () => { - const sql = ` - SELECT id, name FROM employees - UNION - SELECT id, name FROM contractors - `; - const tables = extractPhysicalTables(sql); - expect(tables).toHaveLength(2); - expect(tables.map((t) => t.table).sort()).toEqual(['contractors', 'employees']); - }); - - it('should deduplicate tables', () => { - const sql = ` - SELECT * FROM users u1 - JOIN users u2 ON u1.manager_id = u2.id - `; - const tables = extractPhysicalTables(sql); - expect(tables).toHaveLength(1); - expect(tables[0]).toMatchObject({ table: 'users' }); - }); - }); - - describe('normalizeTableIdentifier', () => { - it('should normalize simple table name', () => { - const table: ParsedTable = { table: 'Users', fullName: 'Users' }; - expect(normalizeTableIdentifier(table)).toBe('users'); - }); - - it('should normalize schema.table', () => { - const table: ParsedTable = { schema: 'Public', table: 'Users', fullName: 'Public.Users' }; - expect(normalizeTableIdentifier(table)).toBe('public.users'); - }); - - it('should normalize database.schema.table', () => { - const table: ParsedTable = { - database: 'MyDB', - schema: 'Public', - table: 'Users', - fullName: 'MyDB.Public.Users', - }; - expect(normalizeTableIdentifier(table)).toBe('mydb.public.users'); - }); - }); - - describe('tablesMatch', () => { - it('should match exact table names', () => { - const query: ParsedTable = { table: 'users', fullName: 'users' }; - const permission: ParsedTable = { table: 'users', fullName: 'users' }; - expect(tablesMatch(query, permission)).toBe(true); - }); - - it('should match case-insensitive', () => { - const query: ParsedTable = { table: 'Users', 
fullName: 'Users' }; - const permission: ParsedTable = { table: 'users', fullName: 'users' }; - expect(tablesMatch(query, permission)).toBe(true); - }); - - it('should match when query has more qualification', () => { - const query: ParsedTable = { - database: 'mydb', - schema: 'public', - table: 'users', - fullName: 'mydb.public.users', - }; - const permission: ParsedTable = { - schema: 'public', - table: 'users', - fullName: 'public.users', - }; - expect(tablesMatch(query, permission)).toBe(true); - }); - - it('should not match different tables', () => { - const query: ParsedTable = { table: 'users', fullName: 'users' }; - const permission: ParsedTable = { table: 'orders', fullName: 'orders' }; - expect(tablesMatch(query, permission)).toBe(false); - }); - - it('should not match different schemas', () => { - const query: ParsedTable = { schema: 'public', table: 'users', fullName: 'public.users' }; - const permission: ParsedTable = { - schema: 'private', - table: 'users', - fullName: 'private.users', - }; - expect(tablesMatch(query, permission)).toBe(false); - }); - - it('should not match when query lacks required schema', () => { - const query: ParsedTable = { table: 'users', fullName: 'users' }; - const permission: ParsedTable = { - schema: 'public', - table: 'users', - fullName: 'public.users', - }; - expect(tablesMatch(query, permission)).toBe(false); - }); - }); - - describe('extractTablesFromYml', () => { - it('should handle flat YML format with separate schema and database fields', () => { - const yml = ` -name: customer -schema: ont_ont -database: postgres -`; - const tables = extractTablesFromYml(yml); - expect(tables).toHaveLength(1); - expect(tables[0]).toMatchObject({ - database: 'postgres', - schema: 'ont_ont', - table: 'customer', - fullName: 'postgres.ont_ont.customer', - }); - }); - - it('should handle flat YML format with only schema', () => { - const yml = ` -name: users -schema: public -description: User data table -`; - const tables = 
extractTablesFromYml(yml); - expect(tables).toHaveLength(1); - expect(tables[0]).toMatchObject({ - schema: 'public', - table: 'users', - fullName: 'public.users', - }); - }); - - it('should handle flat YML format with only database', () => { - const yml = ` -name: orders -database: analytics -version: 2 -`; - const tables = extractTablesFromYml(yml); - expect(tables).toHaveLength(1); - expect(tables[0]).toMatchObject({ - database: 'analytics', - table: 'orders', - fullName: 'analytics.orders', - }); - }); - - it('should handle models array with separate schema and database fields', () => { - const yml = ` -models: - - name: customer - schema: ont_ont - database: postgres - - name: currency - schema: ont_ont - database: postgres - - name: product - schema: catalog - database: postgres -`; - const tables = extractTablesFromYml(yml); - expect(tables).toHaveLength(3); - expect(tables[0]).toMatchObject({ - database: 'postgres', - schema: 'ont_ont', - table: 'customer', - fullName: 'postgres.ont_ont.customer', - }); - expect(tables[1]).toMatchObject({ - database: 'postgres', - schema: 'ont_ont', - table: 'currency', - fullName: 'postgres.ont_ont.currency', - }); - expect(tables[2]).toMatchObject({ - database: 'postgres', - schema: 'catalog', - table: 'product', - fullName: 'postgres.catalog.product', - }); - }); - - it('should handle models array with different schema/database combinations', () => { - const yml = ` -models: - - name: customer - schema: ont_ont - database: postgres - - name: users - schema: public - - name: analytics_fact - database: warehouse -`; - const tables = extractTablesFromYml(yml); - expect(tables).toHaveLength(3); - expect(tables[0]).toMatchObject({ - database: 'postgres', - schema: 'ont_ont', - table: 'customer', - fullName: 'postgres.ont_ont.customer', - }); - expect(tables[1]).toMatchObject({ - schema: 'public', - table: 'users', - fullName: 'public.users', - }); - expect(tables[2]).toMatchObject({ - database: 'warehouse', - table: 
'analytics_fact', - fullName: 'warehouse.analytics_fact', - }); - }); - - it('should handle models with both schema and database fields', () => { - const yml = ` -models: - - name: users - schema: public - database: primary - - name: orders - schema: sales - database: analytics -`; - const tables = extractTablesFromYml(yml); - expect(tables).toHaveLength(2); - expect(tables[0]).toMatchObject({ - database: 'primary', - schema: 'public', - table: 'users', - fullName: 'primary.public.users', - }); - expect(tables[1]).toMatchObject({ - database: 'analytics', - schema: 'sales', - table: 'orders', - fullName: 'analytics.sales.orders', - }); - }); - - it('should deduplicate tables with same name/schema/database', () => { - const yml = ` -models: - - name: users - schema: public - database: postgres - - name: users - schema: public - database: postgres - - name: users - schema: public - database: postgres -`; - const tables = extractTablesFromYml(yml); - expect(tables).toHaveLength(1); - expect(tables[0]).toMatchObject({ - database: 'postgres', - schema: 'public', - table: 'users', - fullName: 'postgres.public.users', - }); - }); - - it('should handle models with only name field (no schema/database)', () => { - const yml = ` -models: - - name: simple_table - description: A simple table without schema - - name: users - schema: public -`; - const tables = extractTablesFromYml(yml); - // Should only extract the one with schema - expect(tables).toHaveLength(1); - expect(tables[0]).toMatchObject({ - schema: 'public', - table: 'users', - fullName: 'public.users', - }); - }); - - it('should handle empty or invalid YML gracefully', () => { - const emptyYml = ''; - const tables1 = extractTablesFromYml(emptyYml); - expect(tables1).toHaveLength(0); - - const invalidYml = 'not valid yaml: [}'; - const tables2 = extractTablesFromYml(invalidYml); - expect(tables2).toHaveLength(0); - }); - }); - - describe('validateWildcardUsage', () => { - it('should block unqualified wildcard on 
physical table', () => { - const sql = 'SELECT * FROM users'; - const result = validateWildcardUsage(sql); - expect(result.isValid).toBe(false); - expect(result.error).toContain('SELECT * is not allowed on physical table: users'); - expect(result.blockedTables).toContain('users'); - }); - - it('should block qualified wildcard on physical table', () => { - const sql = 'SELECT u.* FROM users u'; - const result = validateWildcardUsage(sql); - expect(result.isValid).toBe(false); - expect(result.error).toContain('SELECT * is not allowed on physical table: users'); - expect(result.blockedTables).toContain('users'); - }); - - it('should allow wildcard on CTE', () => { - const sql = ` - WITH user_stats AS ( - SELECT user_id, COUNT(*) as count FROM orders GROUP BY user_id - ) - SELECT * FROM user_stats - `; - const result = validateWildcardUsage(sql); - expect(result.isValid).toBe(true); - expect(result.error).toBeUndefined(); - }); - - it('should allow qualified wildcard on CTE', () => { - const sql = ` - WITH user_stats AS ( - SELECT user_id, COUNT(*) as count FROM orders GROUP BY user_id - ) - SELECT us.* FROM user_stats us - `; - const result = validateWildcardUsage(sql); - expect(result.isValid).toBe(true); - }); - - it('should block wildcard when CTE uses wildcard on physical table', () => { - const sql = ` - WITH user_cte AS ( - SELECT * FROM users - ) - SELECT * FROM user_cte - `; - const result = validateWildcardUsage(sql); - expect(result.isValid).toBe(false); - expect(result.error).toContain('SELECT * is not allowed on physical table: users'); - expect(result.blockedTables).toContain('users'); - }); - - it('should allow wildcard when CTE uses explicit columns', () => { - const sql = ` - WITH user_cte AS ( - SELECT id, name FROM users - ) - SELECT * FROM user_cte - `; - const result = validateWildcardUsage(sql); - expect(result.isValid).toBe(true); - }); - - it('should block wildcard on physical tables in JOIN', () => { - const sql = ` - WITH orders_cte AS ( - 
SELECT order_id FROM orders - ) - SELECT oc.*, u.* FROM orders_cte oc JOIN users u ON oc.order_id = u.id - `; - const result = validateWildcardUsage(sql); - expect(result.isValid).toBe(false); - expect(result.blockedTables).toContain('users'); - }); - - it('should allow explicit column selection', () => { - const sql = 'SELECT id, name, email FROM users'; - const result = validateWildcardUsage(sql); - expect(result.isValid).toBe(true); - }); - - it('should handle multiple physical tables with wildcards', () => { - const sql = 'SELECT u.*, o.* FROM users u JOIN orders o ON u.id = o.user_id'; - const result = validateWildcardUsage(sql); - expect(result.isValid).toBe(false); - expect(result.error).toContain('SELECT * is not allowed on physical tables: users, orders'); - expect(result.blockedTables).toEqual(expect.arrayContaining(['users', 'orders'])); - }); - - it('should handle schema-qualified tables', () => { - const sql = 'SELECT * FROM public.users'; - const result = validateWildcardUsage(sql); - expect(result.isValid).toBe(false); - expect(result.error).toContain('SELECT * is not allowed on physical table: users'); - }); - - it('should handle invalid SQL gracefully', () => { - const sql = 'NOT VALID SQL'; - const result = validateWildcardUsage(sql); - expect(result.isValid).toBe(false); - expect(result.error).toContain('Failed to validate wildcard usage in SQL query'); - }); - }); - - describe('extractDatasetsFromYml', () => { - it('should extract dataset with dimensions and measures', () => { - const yml = ` -name: customer_feedback -description: Customer feedback metrics -database: reporting -schema: public -dimensions: - - name: product_id - description: ID of the product - type: string - - name: category - description: Product category - type: string - - name: region - description: Geographic region - type: string -measures: - - name: feedback_score - description: Customer feedback score - type: number - - name: response_count - description: Number of 
responses - type: number -`; - const datasets = extractDatasetsFromYml(yml); - expect(datasets).toHaveLength(1); - const dataset = datasets[0]; - expect(dataset).toBeDefined(); - expect(dataset).toMatchObject({ - database: 'reporting', - schema: 'public', - table: 'customer_feedback', - fullName: 'reporting.public.customer_feedback', - }); - expect(dataset!.allowedColumns).toBeInstanceOf(Set); - expect(dataset!.allowedColumns.size).toBe(5); - expect(dataset!.allowedColumns.has('product_id')).toBe(true); - expect(dataset!.allowedColumns.has('category')).toBe(true); - expect(dataset!.allowedColumns.has('region')).toBe(true); - expect(dataset!.allowedColumns.has('feedback_score')).toBe(true); - expect(dataset!.allowedColumns.has('response_count')).toBe(true); - // Should not have non-existent column - expect(dataset!.allowedColumns.has('customer_age')).toBe(false); - }); - - it('should handle dataset without dimensions or measures', () => { - const yml = ` -name: simple_table -database: analytics -schema: public -`; - const datasets = extractDatasetsFromYml(yml); - expect(datasets).toHaveLength(1); - const dataset = datasets[0]; - expect(dataset).toBeDefined(); - expect(dataset!.allowedColumns.size).toBe(0); - }); - - it('should handle models array format with dimensions and measures', () => { - const yml = ` -models: - - name: users - schema: public - dimensions: - - name: id - - name: name - measures: - - name: total_orders - - name: products - schema: catalog - dimensions: - - name: product_id - - name: product_name -`; - const datasets = extractDatasetsFromYml(yml); - expect(datasets).toHaveLength(2); - const dataset1 = datasets[0]; - const dataset2 = datasets[1]; - expect(dataset1).toBeDefined(); - expect(dataset2).toBeDefined(); - expect(dataset1!.allowedColumns.has('id')).toBe(true); - expect(dataset1!.allowedColumns.has('name')).toBe(true); - expect(dataset1!.allowedColumns.has('total_orders')).toBe(true); - 
expect(dataset2!.allowedColumns.has('product_id')).toBe(true); - expect(dataset2!.allowedColumns.has('product_name')).toBe(true); - }); - }); - - describe('extractColumnReferences', () => { - it('should extract column references from simple SELECT', () => { - const sql = 'SELECT id, name FROM users'; - const columns = extractColumnReferences(sql); - expect(columns.has('users')).toBe(true); - expect(columns.get('users')).toEqual(new Set(['id', 'name'])); - }); - - it('should extract columns from WHERE clause', () => { - const sql = 'SELECT * FROM users WHERE age > 18 AND status = "active"'; - const columns = extractColumnReferences(sql); - expect(columns.has('users')).toBe(true); - expect(columns.get('users')).toContain('age'); - expect(columns.get('users')).toContain('status'); - }); - - it('should handle table aliases', () => { - const sql = 'SELECT u.id, u.name FROM users u WHERE u.age > 18'; - const columns = extractColumnReferences(sql); - expect(columns.has('users')).toBe(true); - expect(columns.get('users')).toEqual(new Set(['id', 'name', 'age'])); - }); - - it('should handle JOINs with aliases', () => { - const sql = ` - SELECT u.name, o.total - FROM users u - JOIN orders o ON u.id = o.user_id - WHERE o.status = "completed" - `; - const columns = extractColumnReferences(sql); - expect(columns.has('users')).toBe(true); - expect(columns.has('orders')).toBe(true); - expect(columns.get('users')).toEqual(new Set(['name', 'id'])); - expect(columns.get('orders')).toEqual(new Set(['total', 'user_id', 'status'])); - }); - - it('should handle aggregate functions', () => { - const sql = 'SELECT AVG(age) as avg_age, COUNT(id) as total FROM users'; - const columns = extractColumnReferences(sql); - expect(columns.has('users')).toBe(true); - expect(columns.get('users')).toEqual(new Set(['age', 'id'])); - }); - - it('should handle GROUP BY and HAVING', () => { - const sql = ` - SELECT department, AVG(salary) as avg_salary - FROM employees - GROUP BY department - HAVING 
AVG(salary) > 50000 - `; - const columns = extractColumnReferences(sql); - expect(columns.has('employees')).toBe(true); - expect(columns.get('employees')).toEqual(new Set(['department', 'salary'])); - }); - - it('should exclude CTE columns from validation', () => { - const sql = ` - WITH user_stats AS ( - SELECT user_id, COUNT(*) as order_count FROM orders GROUP BY user_id - ) - SELECT u.name, us.order_count - FROM users u - JOIN user_stats us ON u.id = us.user_id - `; - const columns = extractColumnReferences(sql); - expect(columns.has('users')).toBe(true); - expect(columns.has('orders')).toBe(true); - expect(columns.has('user_stats')).toBe(false); // CTE should not be tracked - expect(columns.get('users')).toEqual(new Set(['name', 'id'])); - expect(columns.get('orders')).toEqual(new Set(['user_id'])); - }); - - it('should extract columns from complex query with undocumented columns', () => { - const sql = ` - SELECT AVG(cf.customer_age) as avg_age - FROM public.customer_feedback cf - WHERE cf.category = 'Electronics' - AND cf.region IS NOT NULL - AND cf.customer_age IS NOT NULL - `; - const columns = extractColumnReferences(sql); - // Table name should be extracted as schema-qualified name - expect(columns.has('public.customer_feedback')).toBe(true); - - const tableColumns = columns.get('public.customer_feedback'); - expect(tableColumns).toBeDefined(); - expect(tableColumns).toContain('customer_age'); - expect(tableColumns).toContain('category'); - expect(tableColumns).toContain('region'); - }); - - it('should handle CASE expressions', () => { - const sql = ` - SELECT - CASE - WHEN age < 18 THEN 'minor' - WHEN age >= 65 THEN 'senior' - ELSE 'adult' - END as age_group, - name - FROM users - `; - const columns = extractColumnReferences(sql); - expect(columns.has('users')).toBe(true); - expect(columns.get('users')).toEqual(new Set(['age', 'name'])); - }); - - it('should handle subqueries', () => { - const sql = ` - SELECT name - FROM users - WHERE id IN ( - SELECT 
user_id FROM orders WHERE total > 100 - ) - `; - const columns = extractColumnReferences(sql); - expect(columns.has('users')).toBe(true); - expect(columns.has('orders')).toBe(true); - expect(columns.get('users')).toEqual(new Set(['name', 'id'])); - expect(columns.get('orders')).toEqual(new Set(['user_id', 'total'])); - }); - - it('should extract columns from complex query with multiple CTEs', () => { - const sql = ` - WITH - user_orders AS ( - SELECT u.id, u.name, u.email, COUNT(o.id) as order_count, SUM(o.total) as total_spent - FROM users u - LEFT JOIN orders o ON u.id = o.user_id - WHERE u.status = 'active' AND o.created_at > '2024-01-01' - GROUP BY u.id, u.name, u.email - ), - high_value_users AS ( - SELECT id, name, total_spent - FROM user_orders - WHERE total_spent > 1000 - ) - SELECT - hvu.name, - hvu.total_spent, - p.product_name, - p.category - FROM high_value_users hvu - JOIN orders o ON hvu.id = o.user_id - JOIN products p ON o.product_id = p.id - WHERE p.is_active = true - `; - const columns = extractColumnReferences(sql); - - // Should extract columns from physical tables only, not CTEs - expect(columns.has('users')).toBe(true); - expect(columns.has('orders')).toBe(true); - expect(columns.has('products')).toBe(true); - expect(columns.has('user_orders')).toBe(false); // CTE should not be included - expect(columns.has('high_value_users')).toBe(false); // CTE should not be included - - // Check extracted columns - expect(columns.get('users')).toEqual(new Set(['id', 'name', 'email', 'status'])); - expect(columns.get('orders')).toEqual( - new Set(['id', 'user_id', 'created_at', 'total', 'product_id']) - ); - expect(columns.get('products')).toEqual( - new Set(['product_name', 'category', 'id', 'is_active']) - ); - }); - - it('should handle nested subqueries with multiple table references', () => { - const sql = ` - SELECT - c.customer_name, - c.country, - ( - SELECT COUNT(*) - FROM orders o - WHERE o.customer_id = c.id - AND o.status = 'completed' - AND 
o.total > ( - SELECT AVG(total) - FROM orders - WHERE year = 2024 - ) - ) as high_value_orders, - ( - SELECT SUM(p.amount) - FROM payments p - JOIN invoices i ON p.invoice_id = i.id - WHERE i.customer_id = c.id - AND p.status = 'paid' - ) as total_paid - FROM customers c - WHERE c.region IN ('US', 'EU') - AND EXISTS ( - SELECT 1 - FROM subscriptions s - WHERE s.customer_id = c.id - AND s.status = 'active' - AND s.plan_type IN ('premium', 'enterprise') - ) - `; - const columns = extractColumnReferences(sql); - - expect(columns.has('customers')).toBe(true); - expect(columns.has('orders')).toBe(true); - expect(columns.has('payments')).toBe(true); - expect(columns.has('invoices')).toBe(true); - expect(columns.has('subscriptions')).toBe(true); - - expect(columns.get('customers')).toEqual( - new Set(['customer_name', 'country', 'id', 'region']) - ); - expect(columns.get('orders')).toEqual(new Set(['customer_id', 'status', 'total', 'year'])); - expect(columns.get('payments')).toEqual(new Set(['amount', 'invoice_id', 'status'])); - expect(columns.get('invoices')).toEqual(new Set(['id', 'customer_id'])); - expect(columns.get('subscriptions')).toEqual(new Set(['customer_id', 'status', 'plan_type'])); - }); - - it('should handle UNION queries with different tables', () => { - const sql = ` - SELECT - employee_id as person_id, - first_name, - last_name, - department, - 'employee' as person_type - FROM employees - WHERE status = 'active' - - UNION ALL - - SELECT - contractor_id as person_id, - first_name, - last_name, - agency as department, - 'contractor' as person_type - FROM contractors - WHERE end_date > CURRENT_DATE - - UNION ALL - - SELECT - intern_id as person_id, - first_name, - last_name, - school as department, - 'intern' as person_type - FROM interns - WHERE program_year = 2024 - `; - const columns = extractColumnReferences(sql); - - expect(columns.has('employees')).toBe(true); - expect(columns.has('contractors')).toBe(true); - 
expect(columns.has('interns')).toBe(true); - - expect(columns.get('employees')).toEqual( - new Set(['employee_id', 'first_name', 'last_name', 'department', 'status']) - ); - expect(columns.get('contractors')).toEqual( - new Set(['contractor_id', 'first_name', 'last_name', 'agency', 'end_date']) - ); - expect(columns.get('interns')).toEqual( - new Set(['intern_id', 'first_name', 'last_name', 'school', 'program_year']) - ); - }); - - it('should handle window functions and complex aggregations', () => { - const sql = ` - SELECT - s.store_id, - s.store_name, - s.region, - p.product_id, - p.product_name, - SUM(sd.quantity) as total_quantity, - SUM(sd.revenue) as total_revenue, - AVG(sd.price) as avg_price, - ROW_NUMBER() OVER (PARTITION BY s.region ORDER BY SUM(sd.revenue) DESC) as revenue_rank, - DENSE_RANK() OVER (ORDER BY SUM(sd.quantity) DESC) as quantity_rank, - LAG(SUM(sd.revenue), 1) OVER (PARTITION BY s.store_id ORDER BY p.product_id) as prev_product_revenue - FROM stores s - JOIN sales_data sd ON s.store_id = sd.store_id - JOIN products p ON sd.product_id = p.product_id - WHERE sd.sale_date BETWEEN '2024-01-01' AND '2024-12-31' - AND s.is_active = true - AND p.category IN ('electronics', 'appliances') - GROUP BY s.store_id, s.store_name, s.region, p.product_id, p.product_name - HAVING SUM(sd.revenue) > 10000 - `; - const columns = extractColumnReferences(sql); - - expect(columns.has('stores')).toBe(true); - expect(columns.has('sales_data')).toBe(true); - expect(columns.has('products')).toBe(true); - - expect(columns.get('stores')).toEqual( - new Set(['store_id', 'store_name', 'region', 'is_active']) - ); - expect(columns.get('sales_data')).toEqual( - new Set(['quantity', 'revenue', 'price', 'store_id', 'product_id', 'sale_date']) - ); - expect(columns.get('products')).toEqual(new Set(['product_id', 'product_name', 'category'])); - }); - - it('should handle complex JOIN conditions with multiple columns', () => { - const sql = ` - SELECT - t1.transaction_id, - 
t1.amount, - t2.balance, - a1.account_name as from_account, - a2.account_name as to_account - FROM transactions t1 - JOIN transactions t2 ON t1.transaction_id = t2.parent_transaction_id - AND t1.transaction_date = t2.transaction_date - AND t1.currency = t2.currency - JOIN accounts a1 ON t1.from_account_id = a1.account_id - AND t1.account_type = a1.account_type - JOIN accounts a2 ON t1.to_account_id = a2.account_id - AND t2.account_type = a2.account_type - WHERE t1.status = 'completed' - AND t2.status = 'completed' - AND a1.is_active = true - AND a2.is_active = true - `; - const columns = extractColumnReferences(sql); - - expect(columns.has('transactions')).toBe(true); - expect(columns.has('accounts')).toBe(true); - - expect(columns.get('transactions')).toEqual( - new Set([ - 'transaction_id', - 'amount', - 'balance', - 'parent_transaction_id', - 'transaction_date', - 'currency', - 'from_account_id', - 'account_type', - 'to_account_id', - 'status', - ]) - ); - expect(columns.get('accounts')).toEqual( - new Set(['account_name', 'account_id', 'account_type', 'is_active']) - ); - }); - - it('should handle fully qualified table names with database and schema', () => { - // Note: node-sql-parser has limitations with three-part naming (database.schema.table) - // Using two-part naming (schema.table) instead - const sql = ` - SELECT - u.user_id, - u.username, - p.profile_data, - l.last_login - FROM public.users u - JOIN public.user_profiles p ON u.user_id = p.user_id - JOIN reporting.login_history l ON u.user_id = l.user_id - WHERE u.created_at > '2024-01-01' - AND p.is_verified = true - AND l.login_count > 5 - `; - const columns = extractColumnReferences(sql); - - // Should handle schema-qualified names - expect(columns.has('public.users')).toBe(true); - expect(columns.has('public.user_profiles')).toBe(true); - expect(columns.has('reporting.login_history')).toBe(true); - - expect(columns.get('public.users')).toEqual(new Set(['user_id', 'username', 'created_at'])); - 
expect(columns.get('public.user_profiles')).toEqual( - new Set(['profile_data', 'user_id', 'is_verified']) - ); - expect(columns.get('reporting.login_history')).toEqual( - new Set(['last_login', 'user_id', 'login_count']) - ); - }); - - it('should handle SELECT * but still extract columns from WHERE/JOIN clauses', () => { - const sql = ` - SELECT * - FROM orders o - JOIN customers c ON o.customer_id = c.id - WHERE o.status = 'shipped' - AND c.country = 'USA' - AND o.total > 100 - `; - const columns = extractColumnReferences(sql); - - expect(columns.has('orders')).toBe(true); - expect(columns.has('customers')).toBe(true); - - // Even with SELECT *, we should extract columns from WHERE and JOIN - expect(columns.get('orders')).toEqual(new Set(['customer_id', 'status', 'total'])); - expect(columns.get('customers')).toEqual(new Set(['id', 'country'])); - }); - - it('should exclude column aliases from permission checks', () => { - const sql = ` - SELECT - COUNT(*) AS total_count, - SUM(amount) AS total_amount, - AVG(price) AS avg_price - FROM sales - `; - const columns = extractColumnReferences(sql); - - expect(columns.has('sales')).toBe(true); - // Should only have columns from aggregate functions, not the aliases - expect(columns.get('sales')).toEqual(new Set(['amount', 'price'])); - // Should NOT include 'total_count', 'total_amount', or 'avg_price' as they are aliases - }); - - it('should exclude aliases when referenced in ORDER BY', () => { - const sql = ` - SELECT - DATE_TRUNC('month', orderdate) AS order_month, - SUM(subtotal) AS monthly_revenue - FROM sales_order_header - WHERE orderdate >= '2024-01-01' - GROUP BY DATE_TRUNC('month', orderdate) - ORDER BY order_month DESC - `; - const columns = extractColumnReferences(sql); - - expect(columns.has('sales_order_header')).toBe(true); - // Should only extract actual columns, not aliases - expect(columns.get('sales_order_header')).toEqual(new Set(['orderdate', 'subtotal'])); - // Should NOT include 'order_month' or 
'monthly_revenue' as they are aliases - }); - - it('should exclude aliases in complex queries with JOINs', () => { - const sql = ` - SELECT - p.name AS product_name, - SUM(sod.linetotal) AS product_revenue - FROM sales_order_detail sod - JOIN product p ON sod.productid = p.productid - WHERE sod.orderdate >= '2024-01-01' - GROUP BY p.productid, p.name - ORDER BY product_revenue DESC - `; - const columns = extractColumnReferences(sql); - - expect(columns.has('sales_order_detail')).toBe(true); - expect(columns.has('product')).toBe(true); - - // Should only extract physical columns - expect(columns.get('sales_order_detail')).toEqual( - new Set(['linetotal', 'productid', 'orderdate']) - ); - expect(columns.get('product')).toEqual(new Set(['name', 'productid'])); - // Should NOT include 'product_name' or 'product_revenue' as they are aliases - }); - - it('should handle aliases referenced in GROUP BY and HAVING', () => { - const sql = ` - SELECT - customer_id, - COUNT(*) AS order_count, - SUM(total) AS total_spent - FROM orders - GROUP BY customer_id - HAVING COUNT(*) > 5 - ORDER BY total_spent DESC - `; - const columns = extractColumnReferences(sql); - - expect(columns.has('orders')).toBe(true); - // Should only extract physical columns - expect(columns.get('orders')).toEqual(new Set(['customer_id', 'total'])); - // Should NOT include 'order_count' or 'total_spent' as they are aliases - }); - - it('should handle aliases with table prefixes correctly', () => { - const sql = ` - SELECT - u.id AS user_id, - u.name AS user_name, - COUNT(o.id) AS order_count - FROM users u - LEFT JOIN orders o ON u.id = o.user_id - GROUP BY u.id, u.name - ORDER BY order_count DESC - `; - const columns = extractColumnReferences(sql); - - expect(columns.has('users')).toBe(true); - expect(columns.has('orders')).toBe(true); - - expect(columns.get('users')).toEqual(new Set(['id', 'name'])); - expect(columns.get('orders')).toEqual(new Set(['id', 'user_id'])); - // Should NOT include 'user_id', 
'user_name', or 'order_count' as they are aliases - }); - - it('should handle recursive CTEs', () => { - const sql = ` - WITH RECURSIVE category_tree AS ( - SELECT - category_id, - parent_category_id, - category_name, - 0 as level - FROM categories - WHERE parent_category_id IS NULL - - UNION ALL - - SELECT - c.category_id, - c.parent_category_id, - c.category_name, - ct.level + 1 - FROM categories c - JOIN category_tree ct ON c.parent_category_id = ct.category_id - WHERE ct.level < 5 - ) - SELECT - ct.category_name, - ct.level, - COUNT(p.product_id) as product_count, - SUM(p.price) as total_value - FROM category_tree ct - LEFT JOIN products p ON ct.category_id = p.category_id - WHERE p.is_active = true - GROUP BY ct.category_name, ct.level - `; - const columns = extractColumnReferences(sql); - - // Should only include physical tables, not the CTE - expect(columns.has('categories')).toBe(true); - expect(columns.has('products')).toBe(true); - expect(columns.has('category_tree')).toBe(false); // CTE should not be included - - expect(columns.get('categories')).toEqual( - new Set(['category_id', 'parent_category_id', 'category_name']) - ); - expect(columns.get('products')).toEqual( - new Set(['product_id', 'price', 'category_id', 'is_active']) - ); - }); - }); - - describe('checkQueryIsReadOnly', () => { - it('should allow SELECT statements', () => { - const result = checkQueryIsReadOnly('SELECT * FROM users'); - expect(result.isReadOnly).toBe(true); - expect(result.queryType).toBe('select'); - expect(result.error).toBeUndefined(); - }); - - it('should allow SELECT with JOIN', () => { - const result = checkQueryIsReadOnly( - 'SELECT u.id, o.total FROM users u JOIN orders o ON u.id = o.user_id' - ); - expect(result.isReadOnly).toBe(true); - }); - - it('should allow SELECT with CTEs', () => { - const sql = ` - WITH stats AS ( - SELECT user_id, COUNT(*) as count FROM orders GROUP BY user_id - ) - SELECT * FROM stats - `; - const result = checkQueryIsReadOnly(sql); - 
expect(result.isReadOnly).toBe(true); - }); - - it('should reject INSERT statements', () => { - const result = checkQueryIsReadOnly( - 'INSERT INTO users (name, email) VALUES ("John", "john@example.com")' - ); - expect(result.isReadOnly).toBe(false); - expect(result.queryType).toBe('insert'); - expect(result.error).toContain("Query type 'INSERT' is not allowed"); - expect(result.error).toContain('To read data, use SELECT statements instead of INSERT'); - }); - - it('should reject UPDATE statements', () => { - const result = checkQueryIsReadOnly('UPDATE users SET name = "Jane" WHERE id = 1'); - expect(result.isReadOnly).toBe(false); - expect(result.queryType).toBe('update'); - expect(result.error).toContain("Query type 'UPDATE' is not allowed"); - expect(result.error).toContain('To read data, use SELECT statements instead of UPDATE'); - }); - - it('should reject DELETE statements', () => { - const result = checkQueryIsReadOnly('DELETE FROM users WHERE id = 1'); - expect(result.isReadOnly).toBe(false); - expect(result.queryType).toBe('delete'); - expect(result.error).toContain("Query type 'DELETE' is not allowed"); - expect(result.error).toContain('To read data, use SELECT statements instead of DELETE'); - }); - - it('should reject CREATE statements', () => { - const result = checkQueryIsReadOnly('CREATE TABLE new_users (id INT, name VARCHAR(100))'); - expect(result.isReadOnly).toBe(false); - expect(result.queryType).toBe('create'); - expect(result.error).toContain("Query type 'CREATE' is not allowed"); - expect(result.error).toContain('DDL operations like CREATE are not permitted'); - }); - - it('should reject DROP statements', () => { - const result = checkQueryIsReadOnly('DROP TABLE users'); - expect(result.isReadOnly).toBe(false); - expect(result.queryType).toBe('drop'); - expect(result.error).toContain("Query type 'DROP' is not allowed"); - expect(result.error).toContain('DDL operations like DROP are not permitted'); - }); - - it('should reject ALTER 
statements', () => { - const result = checkQueryIsReadOnly('ALTER TABLE users ADD COLUMN age INT'); - expect(result.isReadOnly).toBe(false); - expect(result.queryType).toBe('alter'); - expect(result.error).toContain("Query type 'ALTER' is not allowed"); - expect(result.error).toContain('DDL operations like ALTER are not permitted'); - }); - - it('should handle PostgreSQL dialect', () => { - const result = checkQueryIsReadOnly('SELECT * FROM postgres.public.users', 'postgres'); - expect(result.isReadOnly).toBe(true); - }); - - it('should handle invalid SQL gracefully', () => { - const result = checkQueryIsReadOnly('NOT VALID SQL'); - expect(result.isReadOnly).toBe(false); - expect(result.error).toContain('Failed to parse SQL query for validation'); - }); - }); -}); diff --git a/packages/ai/src/utils/sql-permissions/sql-parser-helpers.ts b/packages/ai/src/utils/sql-permissions/sql-parser-helpers.ts deleted file mode 100644 index 6168fd01a..000000000 --- a/packages/ai/src/utils/sql-permissions/sql-parser-helpers.ts +++ /dev/null @@ -1,1637 +0,0 @@ -import pkg from 'node-sql-parser'; -const { Parser } = pkg; -import type { BaseFrom, ColumnRefItem, Join, Select } from 'node-sql-parser'; -import * as yaml from 'yaml'; -// Import checkQueryIsReadOnly from data-source package -export { checkQueryIsReadOnly } from '@buster/data-source'; -export type { QueryTypeCheckResult } from '@buster/data-source'; - -export interface ParsedTable { - database?: string; - schema?: string; - table: string; - fullName: string; - alias?: string; -} - -export interface ParsedDataset { - database?: string; - schema?: string; - table: string; - fullName: string; - allowedColumns: Set; // lowercase column names from dimensions and measures -} - -// Type for statements that may have UNION (_next property) -interface StatementWithNext extends Record { - _next?: StatementWithNext; - type?: string; -} - -export interface WildcardValidationResult { - isValid: boolean; - error?: string; - 
blockedTables?: string[]; -} - -// Map data source syntax to node-sql-parser dialect -const DIALECT_MAPPING: Record = { - // Direct mappings - mysql: 'mysql', - postgresql: 'postgresql', - sqlite: 'sqlite', - mariadb: 'mariadb', - bigquery: 'bigquery', - snowflake: 'snowflake', - redshift: 'postgresql', // Redshift uses PostgreSQL dialect - transactsql: 'transactsql', - flinksql: 'flinksql', - hive: 'hive', - - // Alternative names - postgres: 'postgresql', - mssql: 'transactsql', - sqlserver: 'transactsql', - athena: 'postgresql', // Athena uses Presto/PostgreSQL syntax - db2: 'db2', - noql: 'mysql', // Default fallback for NoQL -}; - -function getParserDialect(dataSourceSyntax?: string): string { - if (!dataSourceSyntax) { - return 'postgresql'; - } - - const dialect = DIALECT_MAPPING[dataSourceSyntax.toLowerCase()]; - if (!dialect) { - return 'postgresql'; - } - - return dialect; -} - -/** - * Extracts physical tables from SQL query, excluding CTEs - * Returns database.schema.table references with proper qualification - */ -export function extractPhysicalTables(sql: string, dataSourceSyntax?: string): ParsedTable[] { - const dialect = getParserDialect(dataSourceSyntax); - const parser = new Parser(); - - try { - // Parse SQL into AST with the appropriate dialect - const ast = parser.astify(sql, { database: dialect }); - - // Get all table references from parser with the appropriate dialect - const allTables = parser.tableList(sql, { database: dialect }); - - // Extract CTE names to exclude them - const cteNames = new Set(); - - // Handle single statement or array of statements - const statements = Array.isArray(ast) ? 
ast : [ast]; - - for (const statement of statements) { - // Type guard to check if statement has 'with' property - if ('with' in statement && statement.with && Array.isArray(statement.with)) { - for (const cte of statement.with) { - if (cte.name?.value) { - cteNames.add(cte.name.value.toLowerCase()); - } - } - } - } - - // Parse table references and filter out CTEs - const physicalTables: ParsedTable[] = []; - const processedTables = new Set(); - - for (const tableRef of allTables) { - const parsed = parseTableReference(tableRef); - - // Skip if it's a CTE - if (cteNames.has(parsed.table.toLowerCase())) { - continue; - } - - // Skip duplicates - const tableKey = `${parsed.database || ''}.${parsed.schema || ''}.${parsed.table}`; - if (processedTables.has(tableKey)) { - continue; - } - - processedTables.add(tableKey); - physicalTables.push(parsed); - } - - return physicalTables; - } catch (error) { - const errorMessage = error instanceof Error ? error.message : String(error); - // Provide more specific guidance based on common parsing errors - if (errorMessage.includes('Expected')) { - throw new Error( - `SQL syntax error: ${errorMessage}. Please check your SQL syntax and ensure it's valid for the ${dialect} dialect.` - ); - } - if (errorMessage.includes('Unexpected token')) { - throw new Error( - `SQL parsing error: ${errorMessage}. This may be due to unsupported SQL features or incorrect syntax.` - ); - } - throw new Error( - `Failed to parse SQL query: ${errorMessage}. 
Please ensure your SQL is valid and uses standard ${dialect} syntax.` - ); - } -} - -/** - * Parses a table reference string into its components - * Handles formats like: - * - table - * - schema.table - * - database.schema.table - * - type::database::table (node-sql-parser format) - * - type::schema.table (node-sql-parser format) - */ -export function parseTableReference(tableRef: string): ParsedTable { - // Remove any quotes and trim - let cleanRef = tableRef.replace(/["'`\[\]]/g, '').trim(); - - // Handle node-sql-parser format: "type::database::table" or "type::table" - if (cleanRef.includes('::')) { - const parts = cleanRef.split('::'); - // Remove the type prefix (select, insert, update, etc.) - const firstPart = parts[0]; - if ( - parts.length >= 2 && - firstPart && - ['select', 'insert', 'update', 'delete', 'create', 'drop', 'alter'].includes(firstPart) - ) { - parts.shift(); // Remove type - } - cleanRef = parts.join('.'); - } - - // Split by . for schema/table - const parts = cleanRef.split('.').filter((p) => p && p !== 'null'); - - if (parts.length === 3) { - const [database, schema, table] = parts; - if (!database || !schema || !table) { - return { - table: cleanRef, - fullName: cleanRef, - }; - } - return { - database, - schema, - table, - fullName: `${database}.${schema}.${table}`, - }; - } - - if (parts.length === 2) { - const [schema, table] = parts; - if (!schema || !table) { - return { - table: cleanRef, - fullName: cleanRef, - }; - } - return { - schema, - table, - fullName: `${schema}.${table}`, - }; - } - - if (parts.length === 1) { - const [table] = parts; - if (!table) { - return { - table: cleanRef, - fullName: cleanRef, - }; - } - return { - table, - fullName: table, - }; - } - - return { - table: cleanRef, - fullName: cleanRef, - }; -} - -/** - * Normalizes a table identifier for comparison - * Converts to lowercase and handles different qualification levels - */ -export function normalizeTableIdentifier(identifier: ParsedTable): string { 
- const parts = []; - - if (identifier.database) { - parts.push(identifier.database.toLowerCase()); - } - if (identifier.schema) { - parts.push(identifier.schema.toLowerCase()); - } - parts.push(identifier.table.toLowerCase()); - - return parts.join('.'); -} - -/** - * Checks if two table identifiers match, considering different qualification levels - * For example, "schema.table" matches "database.schema.table" if schema and table match - */ -export function tablesMatch(queryTable: ParsedTable, permissionTable: ParsedTable): boolean { - // Exact table name must match - if (queryTable.table.toLowerCase() !== permissionTable.table.toLowerCase()) { - return false; - } - - // If permission specifies schema, query must match - if (permissionTable.schema && queryTable.schema) { - if (permissionTable.schema.toLowerCase() !== queryTable.schema.toLowerCase()) { - return false; - } - } - - // If permission specifies database, query must match - if (permissionTable.database && queryTable.database) { - if (permissionTable.database.toLowerCase() !== queryTable.database.toLowerCase()) { - return false; - } - } - - // If permission has schema but query doesn't, it's not a match - // (we require explicit schema matching for security) - if (permissionTable.schema && !queryTable.schema) { - return false; - } - - return true; -} - -/** - * Extracts table references from dataset YML content - * Handles multiple formats: - * 1. Flat format (top-level fields): - * name: table_name - * schema: schema_name - * database: database_name - * 2. 
Models array with separate fields: - * models: - * - name: table_name - * schema: schema_name - * database: database_name - */ -export function extractTablesFromYml(ymlContent: string): ParsedTable[] { - const tables: ParsedTable[] = []; - const processedTables = new Set(); - - try { - // Parse YML content - const parsed = yaml.parse(ymlContent); - - // Check for flat format (top-level name, schema, database) - if (parsed?.name && !parsed?.models && (parsed?.schema || parsed?.database)) { - const parsedTable: ParsedTable = { - table: parsed.name, - fullName: parsed.name, - }; - - // Add schema if present - if (parsed.schema) { - parsedTable.schema = parsed.schema; - parsedTable.fullName = `${parsed.schema}.${parsed.name}`; - } - - // Add database if present - if (parsed.database) { - parsedTable.database = parsed.database; - if (parsed.schema) { - parsedTable.fullName = `${parsed.database}.${parsed.schema}.${parsed.name}`; - } else { - parsedTable.fullName = `${parsed.database}.${parsed.name}`; - } - } - - const key = normalizeTableIdentifier(parsedTable); - if (!processedTables.has(key)) { - processedTables.add(key); - tables.push(parsedTable); - } - } - - // Look for models array - if (parsed?.models && Array.isArray(parsed.models)) { - for (const model of parsed.models) { - // Process models that have name and at least schema or database - if (model.name && (model.schema || model.database)) { - const parsedTable: ParsedTable = { - table: model.name, - fullName: model.name, - }; - - // Add schema if present - if (model.schema) { - parsedTable.schema = model.schema; - parsedTable.fullName = `${model.schema}.${model.name}`; - } - - // Add database if present - if (model.database) { - parsedTable.database = model.database; - if (model.schema) { - parsedTable.fullName = `${model.database}.${model.schema}.${model.name}`; - } else { - parsedTable.fullName = `${model.database}.${model.name}`; - } - } - - const key = normalizeTableIdentifier(parsedTable); - if 
(!processedTables.has(key)) { - processedTables.add(key); - tables.push(parsedTable); - } - } - } - } - } catch (error) { - // Log the error for debugging but don't throw - return empty array - // This is expected behavior when YML content is invalid or not a dataset - const errorMessage = error instanceof Error ? error.message : String(error); - console.warn(`Failed to parse YML content for table extraction: ${errorMessage}`); - } - - return tables; -} - -/** - * Extracts datasets with allowed columns from YML content - * Handles dataset format with dimensions and measures - */ -export function extractDatasetsFromYml(ymlContent: string): ParsedDataset[] { - const datasets: ParsedDataset[] = []; - const processedDatasets = new Set(); - - try { - // Parse YML content - const parsed = yaml.parse(ymlContent); - - // Check for dataset format with dimensions and measures - if (parsed?.name) { - const parsedDataset: ParsedDataset = { - table: parsed.name, - fullName: parsed.name, - allowedColumns: new Set(), - }; - - // Add schema if present - if (parsed.schema) { - parsedDataset.schema = parsed.schema; - parsedDataset.fullName = `${parsed.schema}.${parsed.name}`; - } - - // Add database if present - if (parsed.database) { - parsedDataset.database = parsed.database; - if (parsed.schema) { - parsedDataset.fullName = `${parsed.database}.${parsed.schema}.${parsed.name}`; - } else { - parsedDataset.fullName = `${parsed.database}.${parsed.name}`; - } - } - - // Extract columns from dimensions - if (parsed.dimensions && Array.isArray(parsed.dimensions)) { - for (const dimension of parsed.dimensions) { - if (dimension.name && typeof dimension.name === 'string') { - parsedDataset.allowedColumns.add(dimension.name.toLowerCase()); - } - } - } - - // Extract columns from measures - if (parsed.measures && Array.isArray(parsed.measures)) { - for (const measure of parsed.measures) { - if (measure.name && typeof measure.name === 'string') { - 
parsedDataset.allowedColumns.add(measure.name.toLowerCase()); - } - } - } - - const key = normalizeTableIdentifier(parsedDataset); - if (!processedDatasets.has(key)) { - processedDatasets.add(key); - datasets.push(parsedDataset); - } - } - - // Also check for models array format with dimensions/measures - if (parsed?.models && Array.isArray(parsed.models)) { - for (const model of parsed.models) { - if (model.name) { - const parsedDataset: ParsedDataset = { - table: model.name, - fullName: model.name, - allowedColumns: new Set(), - }; - - // Add schema if present - if (model.schema) { - parsedDataset.schema = model.schema; - parsedDataset.fullName = `${model.schema}.${model.name}`; - } - - // Add database if present - if (model.database) { - parsedDataset.database = model.database; - if (model.schema) { - parsedDataset.fullName = `${model.database}.${model.schema}.${model.name}`; - } else { - parsedDataset.fullName = `${model.database}.${model.name}`; - } - } - - // Extract columns from dimensions - if (model.dimensions && Array.isArray(model.dimensions)) { - for (const dimension of model.dimensions) { - if (dimension.name && typeof dimension.name === 'string') { - parsedDataset.allowedColumns.add(dimension.name.toLowerCase()); - } - } - } - - // Extract columns from measures - if (model.measures && Array.isArray(model.measures)) { - for (const measure of model.measures) { - if (measure.name && typeof measure.name === 'string') { - parsedDataset.allowedColumns.add(measure.name.toLowerCase()); - } - } - } - - const key = normalizeTableIdentifier(parsedDataset); - if (!processedDatasets.has(key)) { - processedDatasets.add(key); - datasets.push(parsedDataset); - } - } - } - } - } catch (error) { - // Log the error for debugging but don't throw - return empty array - // This is expected behavior when YML content is invalid or not a dataset - const errorMessage = error instanceof Error ? 
error.message : String(error); - console.warn(`Failed to parse YML content for dataset extraction: ${errorMessage}`); - } - - return datasets; -} - -/** - * Validates that wildcards (SELECT *) are not used on physical tables - * Allows wildcards on CTEs but blocks them on physical database tables - */ -export function validateWildcardUsage( - sql: string, - dataSourceSyntax?: string -): WildcardValidationResult { - const dialect = getParserDialect(dataSourceSyntax); - const parser = new Parser(); - - try { - // Parse SQL into AST with the appropriate dialect - const ast = parser.astify(sql, { database: dialect }); - - // Handle single statement or array of statements - const statements = Array.isArray(ast) ? ast : [ast]; - - // Extract CTE names to allow wildcards on them - const cteNames = new Set(); - for (const statement of statements) { - if ('with' in statement && statement.with && Array.isArray(statement.with)) { - for (const cte of statement.with) { - if (cte.name?.value) { - cteNames.add(cte.name.value.toLowerCase()); - } - } - } - } - - const tableList = parser.tableList(sql, { database: dialect }); - const tableAliasMap = new Map(); // alias -> table name - - if (Array.isArray(tableList)) { - for (const tableRef of tableList) { - if (typeof tableRef === 'string') { - // Simple table name - tableAliasMap.set(tableRef.toLowerCase(), tableRef); - } else if (tableRef && typeof tableRef === 'object') { - const tableRefObj = tableRef as Record; - const tableName = tableRefObj.table || tableRefObj.name; - const alias = tableRefObj.as || tableRefObj.alias; - if (tableName && typeof tableName === 'string') { - if (alias && typeof alias === 'string') { - tableAliasMap.set(alias.toLowerCase(), tableName); - } - tableAliasMap.set(tableName.toLowerCase(), tableName); - } - } - } - } - - // Check each statement for wildcard usage - const blockedTables: string[] = []; - - for (const statement of statements) { - if ('type' in statement && statement.type === 'select') { - 
const wildcardTables = findWildcardUsageOnPhysicalTables( - statement as unknown as Record, - cteNames - ); - blockedTables.push(...wildcardTables); - } - } - - if (blockedTables.length > 0) { - // Create a more helpful error message with specific tables and guidance - const tableList = - blockedTables.length > 1 - ? `tables: ${blockedTables.join(', ')}` - : `table: ${blockedTables[0]}`; - - return { - isValid: false, - error: `SELECT * is not allowed on physical ${tableList}. Please explicitly specify the column names you need instead of using wildcards. For example, use 'SELECT column1, column2 FROM table' instead of 'SELECT * FROM table'. This restriction helps ensure data security and prevents unintended data exposure.`, - blockedTables, - }; - } - - return { isValid: true }; - } catch (error) { - const errorMessage = error instanceof Error ? error.message : String(error); - return { - isValid: false, - error: `Failed to validate wildcard usage in SQL query: ${errorMessage}. Please ensure your SQL syntax is correct and try specifying explicit column names instead of using SELECT *.`, - }; - } -} - -/** - * Recursively finds wildcard usage on physical tables in a SELECT statement - */ -function findWildcardUsageOnPhysicalTables( - selectStatement: Record, - cteNames: Set -): string[] { - const blockedTables: string[] = []; - - // Build alias mapping for this statement - const aliasToTableMap = new Map(); - if (selectStatement.from && Array.isArray(selectStatement.from)) { - for (const fromItem of selectStatement.from) { - const fromItemAny = fromItem as unknown as Record; - if (fromItemAny.table && fromItemAny.as) { - let tableName: string; - if (typeof fromItemAny.table === 'string') { - tableName = fromItemAny.table; - } else if (fromItemAny.table && typeof fromItemAny.table === 'object') { - const tableObj = fromItemAny.table as Record; - tableName = String( - tableObj.table || tableObj.name || tableObj.value || fromItemAny.table - ); - } else { - continue; - 
} - aliasToTableMap.set(String(fromItemAny.as).toLowerCase(), tableName.toLowerCase()); - } - - // Handle JOINs - if (fromItemAny.join && Array.isArray(fromItemAny.join)) { - for (const joinItem of fromItemAny.join) { - if (joinItem.table && joinItem.as) { - let tableName: string; - if (typeof joinItem.table === 'string') { - tableName = joinItem.table; - } else if (joinItem.table && typeof joinItem.table === 'object') { - const tableObj = joinItem.table as Record; - tableName = String( - tableObj.table || tableObj.name || tableObj.value || joinItem.table - ); - } else { - continue; - } - aliasToTableMap.set(String(joinItem.as).toLowerCase(), tableName.toLowerCase()); - } - } - } - } - } - - if (selectStatement.columns && Array.isArray(selectStatement.columns)) { - for (const column of selectStatement.columns) { - if (column.expr && column.expr.type === 'column_ref') { - // Check for unqualified wildcard (SELECT *) - if (column.expr.column === '*' && !column.expr.table) { - // Get all tables in FROM clause that are not CTEs - const physicalTables = getPhysicalTablesFromFrom( - selectStatement.from as unknown as Record[], - cteNames - ); - blockedTables.push(...physicalTables); - } - // Check for qualified wildcard (SELECT table.*) - else if (column.expr.column === '*' && column.expr.table) { - // Handle table reference - could be string or object - let tableName: string; - if (typeof column.expr.table === 'string') { - tableName = column.expr.table; - } else if (column.expr.table && typeof column.expr.table === 'object') { - // Handle object format - could have table property or be the table name itself - const tableRefObj = column.expr.table as Record; - tableName = String( - tableRefObj.table || tableRefObj.name || tableRefObj.value || column.expr.table - ); - } else { - continue; // Skip if we can't determine table name - } - - // Check if this is an alias that maps to a CTE - const actualTableName = aliasToTableMap.get(tableName.toLowerCase()); - const 
isAliasToCte = actualTableName && cteNames.has(actualTableName); - const isDirectCte = cteNames.has(tableName.toLowerCase()); - - if (!isAliasToCte && !isDirectCte) { - // Push the actual table name if it's an alias, otherwise push the table name itself - blockedTables.push(actualTableName || tableName); - } - } - } - } - } - - // Check CTEs for nested wildcard usage - if (selectStatement.with && Array.isArray(selectStatement.with)) { - for (const cte of selectStatement.with) { - const cteAny = cte as unknown as Record; - if (cteAny.stmt && typeof cteAny.stmt === 'object' && cteAny.stmt !== null) { - const stmt = cteAny.stmt as Record; - if (stmt.type === 'select') { - const subBlocked = findWildcardUsageOnPhysicalTables(stmt, cteNames); - blockedTables.push(...subBlocked); - } - } - } - } - - if (selectStatement.from && Array.isArray(selectStatement.from)) { - for (const fromItem of selectStatement.from) { - const fromItemAny = fromItem as unknown as Record; - if (fromItemAny.expr && typeof fromItemAny.expr === 'object' && fromItemAny.expr !== null) { - const expr = fromItemAny.expr as Record; - if (expr.type === 'select') { - const subBlocked = findWildcardUsageOnPhysicalTables(expr, cteNames); - blockedTables.push(...subBlocked); - } - } - } - } - - return blockedTables; -} - -/** - * Extracts physical table names from FROM clause, excluding CTEs - */ -function getPhysicalTablesFromFrom( - fromClause: Record[], - cteNames: Set -): string[] { - const tables: string[] = []; - - if (!fromClause || !Array.isArray(fromClause)) { - return tables; - } - - for (const fromItem of fromClause) { - // Extract table name from fromItem - if (fromItem.table) { - let tableName: string; - if (typeof fromItem.table === 'string') { - tableName = fromItem.table; - } else if (fromItem.table && typeof fromItem.table === 'object') { - const tableObj = fromItem.table as Record; - tableName = String(tableObj.table || tableObj.name || tableObj.value || fromItem.table); - } else { - 
continue; - } - - if (tableName && !cteNames.has(tableName.toLowerCase())) { - const aliasName = fromItem.as || tableName; - tables.push(String(aliasName)); - } - } - - // Handle JOINs - if (fromItem.join && Array.isArray(fromItem.join)) { - for (const joinItem of fromItem.join) { - if (joinItem.table) { - let tableName: string; - if (typeof joinItem.table === 'string') { - tableName = joinItem.table; - } else if (joinItem.table && typeof joinItem.table === 'object') { - const tableObj = joinItem.table as Record; - tableName = String(tableObj.table || tableObj.name || tableObj.value || joinItem.table); - } else { - continue; - } - - if (tableName && !cteNames.has(tableName.toLowerCase())) { - const aliasName = joinItem.as || tableName; - tables.push(String(aliasName)); - } - } - } - } - } - - return tables; -} - -/** - * Extracts column references from SQL query grouped by table - * Returns a map of table -> set of column names referenced - * Excludes CTE internal columns - */ -export function extractColumnReferences( - sql: string, - dataSourceSyntax?: string -): Map> { - const dialect = getParserDialect(dataSourceSyntax); - const parser = new Parser(); - const tableColumnMap = new Map>(); - - try { - // Parse SQL into AST - const ast = parser.astify(sql, { database: dialect }); - const statements = Array.isArray(ast) ? 
ast : [ast]; - - // Get CTEs to exclude from column validation - const cteNames = new Set(); - for (const statement of statements) { - if ('with' in statement && statement.with && Array.isArray(statement.with)) { - for (const cte of statement.with) { - if (cte.name?.value) { - cteNames.add(cte.name.value.toLowerCase()); - } - } - } - } - - // Process each statement - for (const statement of statements) { - // First process CTEs in the main statement - if ('with' in statement && statement.with && Array.isArray(statement.with)) { - for (const cte of statement.with) { - if (cte.stmt && typeof cte.stmt === 'object') { - const cteStmt = cte.stmt as Record; - // Handle CTEs with UNION/UNION ALL - they have an ast property - // Check for ast first since UNION CTEs don't have a direct type property - if (!cteStmt.type && cteStmt.ast && typeof cteStmt.ast === 'object') { - const ast = cteStmt.ast as Record; - if (ast.type === 'select') { - // Process the first SELECT - extractColumnsFromStatement(ast, tableColumnMap, cteNames); - - // Process UNION parts (_next chain) - let nextStmt = ast._next as Record | undefined; - while (nextStmt) { - if (nextStmt.type === 'select') { - extractColumnsFromStatement(nextStmt, tableColumnMap, cteNames); - } - nextStmt = nextStmt._next as Record | undefined; - } - } - } else if (cteStmt.type === 'select') { - // CTE with type 'select' - may have UNION via _next - extractColumnsFromStatement(cteStmt, tableColumnMap, cteNames); - - // Handle UNION parts (_next chain) for CTEs - const cteWithNext = cteStmt as StatementWithNext; - let nextStmt = cteWithNext._next; - while (nextStmt) { - if (nextStmt.type === 'select') { - extractColumnsFromStatement(nextStmt, tableColumnMap, cteNames); - } - nextStmt = nextStmt._next; - } - } - } - } - } - - if ('type' in statement && statement.type === 'select') { - // Process the main SELECT statement - extractColumnsFromStatement( - statement as unknown as Record, - tableColumnMap, - cteNames - ); - - // 
/**
 * Extracts column references from one SELECT statement into tableColumnMap.
 *
 * Walks the projection list, WHERE, GROUP BY, HAVING, ORDER BY, nested CTEs,
 * JOIN ON conditions, and FROM-clause subqueries. Column aliases declared in
 * the SELECT list are tracked so that references to them in GROUP BY /
 * ORDER BY are not mistaken for physical columns.
 *
 * @param statement      SELECT node from node-sql-parser's AST.
 * @param tableColumnMap Accumulator: table name -> set of column names.
 * @param cteNames       Lowercased CTE names to exclude from attribution.
 * @param parentAliasMap Alias map inherited from an enclosing statement, so
 *                       subqueries can resolve outer table aliases.
 */
function extractColumnsFromStatement(
  statement: Record<string, unknown>,
  tableColumnMap: Map<string, Set<string>>,
  cteNames: Set<string>,
  parentAliasMap?: Map<string, string>
): void {
  // Build table alias mapping and track all non-CTE tables.
  // Include parent aliases for subqueries to resolve outer references.
  const aliasToTableMap = new Map(parentAliasMap || []);
  const physicalTables: string[] = [];

  // Process FROM clause (populates aliasToTableMap and physicalTables).
  if (statement.from && Array.isArray(statement.from)) {
    for (const fromItem of statement.from) {
      processFromItem(fromItem, aliasToTableMap, cteNames, physicalTables);
    }
  }

  // Track column aliases defined in the SELECT clause. These must NOT be
  // treated as physical columns when referenced in ORDER BY, GROUP BY, etc.
  const columnAliases = new Set<string>();

  // Extract columns from the SELECT clause.
  // Important: only the expression (column.expr) is inspected, never the
  // alias (column.as) — so "AS total_count" is not treated as a column.
  if (statement.columns && Array.isArray(statement.columns)) {
    for (const column of statement.columns) {
      // Track the alias if it exists.
      if (column.as) {
        columnAliases.add(String(column.as).toLowerCase());
      }

      // Only process the expression part, which contains the actual column
      // references.
      if (column.expr && typeof column.expr === 'object') {
        extractColumnFromExpression(
          column.expr,
          aliasToTableMap,
          tableColumnMap,
          cteNames,
          physicalTables,
          columnAliases
        );
      }
    }
  }

  // Extract columns from the WHERE clause.
  if (statement.where) {
    extractColumnFromExpression(
      statement.where,
      aliasToTableMap,
      tableColumnMap,
      cteNames,
      physicalTables,
      columnAliases
    );
  }

  // Extract columns from the GROUP BY clause.
  if (statement.groupby && Array.isArray(statement.groupby)) {
    for (const groupItem of statement.groupby) {
      // Skip if this is a bare reference to a SELECT-list alias.
      if (groupItem.type === 'column_ref' && groupItem.column && !groupItem.table) {
        const columnName =
          typeof groupItem.column === 'string' ? groupItem.column : String(groupItem.column);
        if (columnAliases.has(columnName.toLowerCase())) {
          continue; // Skip column aliases
        }
      }
      extractColumnFromExpression(
        groupItem,
        aliasToTableMap,
        tableColumnMap,
        cteNames,
        physicalTables,
        columnAliases
      );
    }
  }

  // Extract columns from the HAVING clause.
  if (statement.having) {
    extractColumnFromExpression(
      statement.having,
      aliasToTableMap,
      tableColumnMap,
      cteNames,
      physicalTables,
      columnAliases
    );
  }

  // Extract columns from the ORDER BY clause.
  if (statement.orderby && Array.isArray(statement.orderby)) {
    for (const orderItem of statement.orderby) {
      if (orderItem.expr) {
        // Skip if this is a bare reference to a SELECT-list alias.
        if (
          orderItem.expr.type === 'column_ref' &&
          orderItem.expr.column &&
          !orderItem.expr.table
        ) {
          const columnName =
            typeof orderItem.expr.column === 'string'
              ? orderItem.expr.column
              : String(orderItem.expr.column);
          if (columnAliases.has(columnName.toLowerCase())) {
            continue; // Skip column aliases
          }
        }
        extractColumnFromExpression(
          orderItem.expr,
          aliasToTableMap,
          tableColumnMap,
          cteNames,
          physicalTables,
          columnAliases
        );
      }
    }
  }

  // Process nested CTEs.
  if (statement.with && Array.isArray(statement.with)) {
    for (const cte of statement.with) {
      if (cte.stmt && typeof cte.stmt === 'object') {
        const cteStmt = cte.stmt as Record<string, unknown>;

        // Handle CTEs with UNION/UNION ALL — they carry the select under an
        // 'ast' property and have no direct 'type', so check that shape first.
        if (!cteStmt.type && cteStmt.ast && typeof cteStmt.ast === 'object') {
          const ast = cteStmt.ast as Record<string, unknown>;
          if (ast.type === 'select') {
            // Process the first SELECT.
            extractColumnsFromStatement(ast, tableColumnMap, cteNames, aliasToTableMap);

            // Process UNION parts (_next chain).
            let nextStmt = ast._next as Record<string, unknown> | undefined;
            while (nextStmt) {
              if (nextStmt.type === 'select') {
                extractColumnsFromStatement(nextStmt, tableColumnMap, cteNames, aliasToTableMap);
              }
              nextStmt = nextStmt._next as Record<string, unknown> | undefined;
            }
          }
        } else if (cteStmt.type === 'select') {
          // Regular CTE without UNION.
          extractColumnsFromStatement(cteStmt, tableColumnMap, cteNames, aliasToTableMap);
        }
      }
    }
  }

  // Process JOIN conditions and FROM-clause subqueries.
  if (statement.from && Array.isArray(statement.from)) {
    for (const fromItem of statement.from) {
      // Process JOIN ON conditions (fromItem.join is the join type,
      // fromItem.on is the condition expression).
      if (fromItem.join && fromItem.on) {
        extractColumnFromExpression(
          fromItem.on,
          aliasToTableMap,
          tableColumnMap,
          cteNames,
          physicalTables,
          columnAliases
        );
      }

      // Process subqueries in the FROM clause, passing down our alias map.
      if (fromItem.expr && typeof fromItem.expr === 'object' && fromItem.expr.type === 'select') {
        extractColumnsFromStatement(
          fromItem.expr as Record<string, unknown>,
          tableColumnMap,
          cteNames,
          aliasToTableMap
        );
      }
    }
  }
}
tableColumnMap, cteNames, aliasToTableMap); - } - nextStmt = nextStmt._next as Record | undefined; - } - } - } else if (cteStmt.type === 'select') { - // Regular CTE without UNION - extractColumnsFromStatement(cteStmt, tableColumnMap, cteNames, aliasToTableMap); - } - } - } - } - - // Process JOIN conditions - if (statement.from && Array.isArray(statement.from)) { - for (const fromItem of statement.from) { - // Process JOIN ON conditions (fromItem.join is the join type, fromItem.on is the condition) - if (fromItem.join && fromItem.on) { - extractColumnFromExpression( - fromItem.on, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - - // Process subqueries in FROM clause - if (fromItem.expr && typeof fromItem.expr === 'object' && fromItem.expr.type === 'select') { - extractColumnsFromStatement( - fromItem.expr as Record, - tableColumnMap, - cteNames, - aliasToTableMap - ); - } - } - } -} - -/** - * Process FROM item to build alias mapping - */ -function processFromItem( - fromItem: unknown, - aliasToTableMap: Map, - cteNames: Set, - physicalTables?: string[] -): void { - const item = fromItem as Record; - - if (item.table) { - const tableName = extractTableName(item.table); - // Handle schema-qualified names (parser puts schema in 'db' field) - const fullTableName = item.db ? `${item.db}.${tableName}` : tableName; - const alias = item.as ? String(item.as) : tableName; - - if (!cteNames.has(tableName.toLowerCase())) { - aliasToTableMap.set(alias.toLowerCase(), fullTableName); - if (physicalTables) { - physicalTables.push(fullTableName); - } - } - } - - // Process JOINs - if (item.join && Array.isArray(item.join)) { - for (const joinItem of item.join) { - if (joinItem.table) { - const tableName = extractTableName(joinItem.table); - // Handle schema-qualified names (parser puts schema in 'db' field) - const fullTableName = joinItem.db ? `${joinItem.db}.${tableName}` : tableName; - const alias = joinItem.as ? 
String(joinItem.as) : tableName; - - if (!cteNames.has(tableName.toLowerCase())) { - aliasToTableMap.set(alias.toLowerCase(), fullTableName); - if (physicalTables) { - physicalTables.push(fullTableName); - } - } - - // Extract columns from JOIN conditions - if (joinItem.on) { - // JOIN conditions will be processed with the main extraction - } - } - } - } -} - -/** - * Extract table name from various formats - */ -function extractTableName(table: unknown): string { - if (typeof table === 'string') { - return table; - } - - if (table && typeof table === 'object') { - const tableObj = table as Record; - // Try different property names that might contain the table name - const tableName = tableObj.table || tableObj.name || tableObj.value; - return tableName ? String(tableName) : ''; - } - - return ''; -} - -/** - * Extract column references from an expression - */ -function extractColumnFromExpression( - expr: unknown, - aliasToTableMap: Map, - tableColumnMap: Map>, - cteNames: Set, - physicalTables?: string[], - columnAliases?: Set -): void { - if (!expr || typeof expr !== 'object') return; - - const expression = expr as Record; - - // Skip string literals and other non-column types - if ( - expression.type === 'single_quote_string' || - expression.type === 'double_quote_string' || - expression.type === 'number' || - expression.type === 'bool' || - expression.type === 'null' - ) { - return; - } - - // Handle column references - if (expression.type === 'column_ref') { - const columnName = expression.column; - const tableRef = expression.table; - - if (columnName && columnName !== '*') { - // Get the actual column name - handle both string and nested object formats - let actualColumn: string; - if (typeof columnName === 'string') { - actualColumn = columnName.toLowerCase(); - } else if (typeof columnName === 'object' && columnName !== null) { - // Handle nested object format from parser - const colObj = columnName as Record; - - // Check if it has the nested expr 
structure - if (colObj.expr && typeof colObj.expr === 'object') { - const exprObj = colObj.expr as Record; - // Skip string literals and non-column types - if ( - exprObj.type === 'single_quote_string' || - exprObj.type === 'double_quote_string' || - exprObj.type === 'number' || - exprObj.type === 'bool' || - exprObj.type === 'null' - ) { - return; - } - // Make sure this is actually a column reference - if (exprObj.type === 'default' || exprObj.type === 'column_ref') { - const colValue = exprObj.value || exprObj.name; - if (colValue !== undefined && colValue !== null) { - actualColumn = String(colValue).toLowerCase(); - } else { - // If we can't extract a value, skip this column - return; - } - } else { - // Unknown type, skip - return; - } - } else { - // Try direct properties as fallback - const colValue = colObj.value || colObj.name || colObj.column; - if (colValue !== undefined && colValue !== null) { - actualColumn = String(colValue).toLowerCase(); - } else { - // If we can't extract a value, skip this column - return; - } - } - } else { - // Unknown format, skip - return; - } - - // Skip if this column is actually a column alias (not a physical column) - // This handles cases where aliases are referenced in ORDER BY, GROUP BY, etc. 
- if (!tableRef && columnAliases && columnAliases.has(actualColumn)) { - return; // Skip column aliases - } - - if (tableRef) { - // Get table name from reference - const tableName = extractTableName(tableRef).toLowerCase(); - - // Check if it's an alias - const actualTable = aliasToTableMap.get(tableName) || tableName; - - // Only track if not a CTE - if (!cteNames.has(actualTable.toLowerCase())) { - if (!tableColumnMap.has(actualTable)) { - tableColumnMap.set(actualTable, new Set()); - } - const tableColumns = tableColumnMap.get(actualTable); - if (tableColumns) { - tableColumns.add(actualColumn); - } - } - } else if (physicalTables && physicalTables.length > 0) { - // If no table reference but we have physical tables, assign to first table - // This handles simple queries like SELECT id, name FROM users - const firstTable = physicalTables[0]; - if (firstTable && !cteNames.has(firstTable.toLowerCase())) { - if (!tableColumnMap.has(firstTable)) { - tableColumnMap.set(firstTable, new Set()); - } - const tableColumns = tableColumnMap.get(firstTable); - if (tableColumns) { - tableColumns.add(actualColumn); - } - } - } - } - } - - // Handle aggregate functions - if (expression.type === 'aggr_func' && expression.args) { - if (Array.isArray(expression.args)) { - for (const arg of expression.args) { - extractColumnFromExpression( - arg, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - } else if (typeof expression.args === 'object') { - const argsObj = expression.args as Record; - if (argsObj.expr) { - // Handle nested expr structure in args - extractColumnFromExpression( - argsObj.expr, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } else { - extractColumnFromExpression( - expression.args, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - } else { - extractColumnFromExpression( - expression.args, - aliasToTableMap, - tableColumnMap, - 
cteNames, - physicalTables, - columnAliases - ); - } - } - - // Handle binary expressions (e.g., col1 = col2) - if (expression.type === 'binary_expr') { - if (expression.left) { - extractColumnFromExpression( - expression.left, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - if (expression.right) { - extractColumnFromExpression( - expression.right, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - } - - // Handle expression lists (e.g., IN clauses with subqueries) - if (expression.type === 'expr_list' && expression.value && Array.isArray(expression.value)) { - for (const item of expression.value) { - if (item.ast && item.ast.type === 'select') { - // This is a subquery - extractColumnsFromStatement( - item.ast as Record, - tableColumnMap, - cteNames, - aliasToTableMap - ); - } else { - extractColumnFromExpression( - item, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - } - } - - // Handle direct subqueries - if (expression.type === 'select') { - extractColumnsFromStatement( - expression as Record, - tableColumnMap, - cteNames, - aliasToTableMap - ); - } - - // Handle subqueries with ast property (as in SELECT column subqueries) - if (expression.ast && typeof expression.ast === 'object') { - const astObj = expression.ast as Record; - if (astObj.type === 'select') { - extractColumnsFromStatement(astObj, tableColumnMap, cteNames, aliasToTableMap); - } - } - - // Handle window functions (e.g., LAG, LEAD, ROW_NUMBER with OVER clause) - if (expression.type === 'window_func') { - // Process the function arguments - if (expression.args) { - if (Array.isArray(expression.args)) { - for (const arg of expression.args) { - if (arg.value) { - extractColumnFromExpression( - arg.value, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } else { - extractColumnFromExpression( - arg, - aliasToTableMap, - 
tableColumnMap, - cteNames, - physicalTables - ); - } - } - } else if (typeof expression.args === 'object') { - const argsObj = expression.args as Record; - // Handle expr_list for window functions - if (argsObj.type === 'expr_list' && argsObj.value && Array.isArray(argsObj.value)) { - for (const item of argsObj.value) { - extractColumnFromExpression( - item, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - } else { - extractColumnFromExpression( - expression.args, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables - ); - } - } - } - - // Process the OVER clause - if (expression.over && typeof expression.over === 'object') { - const overObj = expression.over as Record; - - // Handle PARTITION BY - if (overObj.partitionby && Array.isArray(overObj.partitionby)) { - for (const partItem of overObj.partitionby) { - extractColumnFromExpression( - partItem, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - } - - // Handle ORDER BY - if (overObj.orderby && Array.isArray(overObj.orderby)) { - for (const orderItem of overObj.orderby) { - if (orderItem && typeof orderItem === 'object') { - const orderObj = orderItem as Record; - if (orderObj.expr) { - extractColumnFromExpression( - orderObj.expr, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } else { - extractColumnFromExpression( - orderItem, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - } - } - } - } - } - - // Handle function calls - if (expression.type === 'function' && expression.args) { - if (Array.isArray(expression.args)) { - for (const arg of expression.args) { - if (arg.value) { - extractColumnFromExpression( - arg.value, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } else { - extractColumnFromExpression( - arg, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables - 
); - } - } - } else if (typeof expression.args === 'object') { - const argsObj = expression.args as Record; - // Handle expr_list for functions like EXISTS, etc. - if (argsObj.type === 'expr_list' && argsObj.value && Array.isArray(argsObj.value)) { - for (const item of argsObj.value) { - // Check for subquery with ast property (EXISTS subqueries have this structure) - const itemObj = item as Record; - if (itemObj?.ast && typeof itemObj.ast === 'object') { - const astObj = itemObj.ast as Record; - if (astObj.type === 'select') { - extractColumnsFromStatement(astObj, tableColumnMap, cteNames, aliasToTableMap); - } - } else { - // Process any other expression type (including aggr_func, column_ref, etc.) - extractColumnFromExpression( - item, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - } - } else { - extractColumnFromExpression( - expression.args, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables - ); - } - } - - // Handle window functions (OVER clause) - if (expression.over && typeof expression.over === 'object') { - const overObj = expression.over as Record; - if (overObj.as_window_specification && typeof overObj.as_window_specification === 'object') { - const windowSpec = overObj.as_window_specification as Record; - if ( - windowSpec.window_specification && - typeof windowSpec.window_specification === 'object' - ) { - const spec = windowSpec.window_specification as Record; - - // Handle PARTITION BY - if (spec.partitionby && Array.isArray(spec.partitionby)) { - for (const partItem of spec.partitionby) { - extractColumnFromExpression( - partItem, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables - ); - } - } - - // Handle ORDER BY - if (spec.orderby && Array.isArray(spec.orderby)) { - for (const orderItem of spec.orderby) { - if (orderItem.expr) { - extractColumnFromExpression( - orderItem.expr, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables - ); - } else if 
(orderItem) { - // Sometimes the orderItem itself might be the expression - extractColumnFromExpression( - orderItem, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables - ); - } - } - } - } - } - } - } - - // Handle CASE expressions - if (expression.type === 'case') { - if (expression.expr) { - extractColumnFromExpression( - expression.expr, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - if (expression.args && Array.isArray(expression.args)) { - for (const arg of expression.args) { - // Handle WHEN conditions - if (arg.cond) { - extractColumnFromExpression( - arg.cond, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - // Also handle older format - if (arg.when) { - extractColumnFromExpression( - arg.when, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - // Handle THEN results (may contain columns) - if (arg.result) { - extractColumnFromExpression( - arg.result, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - if (arg.then) { - extractColumnFromExpression( - arg.then, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - } - } - if (expression.else) { - extractColumnFromExpression( - expression.else, - aliasToTableMap, - tableColumnMap, - cteNames, - physicalTables, - columnAliases - ); - } - } -} diff --git a/packages/data-source/src/adapters/bigquery.ts b/packages/data-source/src/adapters/bigquery.ts index 193aa3f1d..85e0a7c11 100644 --- a/packages/data-source/src/adapters/bigquery.ts +++ b/packages/data-source/src/adapters/bigquery.ts @@ -117,7 +117,9 @@ export class BigQueryAdapter extends BaseAdapter { resultRows = resultRows.slice(0, maxRows); } - // BigQuery doesn't provide detailed field metadata in the same way as other databases + // BigQuery field metadata extraction + // TODO: Extract schema from job metadata when 
available + // For now, return empty fields array to maintain compatibility const fields: FieldMetadata[] = []; return { diff --git a/packages/data-source/src/adapters/mysql.test.ts b/packages/data-source/src/adapters/mysql.test.ts index 94f16d034..83dd07fb4 100644 --- a/packages/data-source/src/adapters/mysql.test.ts +++ b/packages/data-source/src/adapters/mysql.test.ts @@ -169,8 +169,8 @@ describe('MySQLAdapter', () => { rows: [{ id: 1, name: 'Test' }], rowCount: 1, fields: [ - { name: 'id', type: 'mysql_type_LONG', nullable: true, length: 0, precision: 0 }, - { name: 'name', type: 'mysql_type_VAR_STRING', nullable: true, length: 0, precision: 0 }, + { name: 'id', type: 'integer', nullable: true, length: 0, precision: 0 }, + { name: 'name', type: 'varchar', nullable: true, length: 0, precision: 0 }, ], hasMoreRows: false, }); diff --git a/packages/data-source/src/adapters/mysql.ts b/packages/data-source/src/adapters/mysql.ts index b5072efc4..827173313 100644 --- a/packages/data-source/src/adapters/mysql.ts +++ b/packages/data-source/src/adapters/mysql.ts @@ -5,6 +5,7 @@ import { type Credentials, DataSourceType, type MySQLCredentials } from '../type import type { QueryParameter } from '../types/query'; import { type AdapterQueryResult, BaseAdapter, type FieldMetadata } from './base'; import { normalizeRowValues } from './helpers/normalize-values'; +import { mapMySQLType } from './type-mappings/mysql'; /** * MySQL database adapter @@ -116,7 +117,7 @@ export class MySQLAdapter extends BaseAdapter { const fieldMetadata: FieldMetadata[] = Array.isArray(fields) ? fields.map((field) => ({ name: field.name, - type: `mysql_type_${field.type}`, // MySQL field type + type: mapMySQLType(`mysql_type_${field.type}`), // Map type code to normalized type nullable: typeof field.flags === 'number' ? (field.flags & 1) === 0 : true, // NOT_NULL flag is bit 0 length: typeof field.length === 'number' && field.length > 0 ? 
field.length : 0, precision: diff --git a/packages/data-source/src/adapters/postgresql.test.ts b/packages/data-source/src/adapters/postgresql.test.ts index 6bfe6d36e..bde9e2b70 100644 --- a/packages/data-source/src/adapters/postgresql.test.ts +++ b/packages/data-source/src/adapters/postgresql.test.ts @@ -243,8 +243,8 @@ describe('PostgreSQLAdapter', () => { rows: [{ id: 1, name: 'Test' }], rowCount: 1, fields: [ - { name: 'id', type: 'pg_type_23', nullable: true, length: 4 }, - { name: 'name', type: 'pg_type_25', nullable: true, length: 0 }, + { name: 'id', type: 'integer', nullable: true, length: 4 }, + { name: 'name', type: 'text', nullable: true, length: 0 }, ], hasMoreRows: false, }); diff --git a/packages/data-source/src/adapters/postgresql.ts b/packages/data-source/src/adapters/postgresql.ts index 8b1aaf79a..a39aff933 100644 --- a/packages/data-source/src/adapters/postgresql.ts +++ b/packages/data-source/src/adapters/postgresql.ts @@ -6,6 +6,7 @@ import { type Credentials, DataSourceType, type PostgreSQLCredentials } from '.. import type { QueryParameter } from '../types/query'; import { type AdapterQueryResult, BaseAdapter, type FieldMetadata } from './base'; import { normalizeRowValues } from './helpers/normalize-values'; +import { mapPostgreSQLType } from './type-mappings/postgresql'; // Internal types for pg-cursor that aren't exported interface CursorResult { @@ -104,7 +105,7 @@ export class PostgreSQLAdapter extends BaseAdapter { const fields: FieldMetadata[] = result.fields?.map((field) => ({ name: field.name, - type: `pg_type_${field.dataTypeID}`, // PostgreSQL type ID + type: mapPostgreSQLType(`pg_type_${field.dataTypeID}`), // Map OID to normalized type nullable: true, // PostgreSQL doesn't provide this info directly length: field.dataTypeSize > 0 ? 
field.dataTypeSize : 0, })) || []; @@ -149,7 +150,7 @@ export class PostgreSQLAdapter extends BaseAdapter { if (fields.length === 0 && cursor._result?.fields) { fields = cursor._result.fields.map((field) => ({ name: field.name, - type: `pg_type_${field.dataTypeID}`, + type: mapPostgreSQLType(`pg_type_${field.dataTypeID}`), // Map OID to normalized type nullable: true, length: field.dataTypeSize > 0 ? field.dataTypeSize : 0, })); diff --git a/packages/data-source/src/adapters/redshift.test.ts b/packages/data-source/src/adapters/redshift.test.ts index ba36ac328..1a84642e1 100644 --- a/packages/data-source/src/adapters/redshift.test.ts +++ b/packages/data-source/src/adapters/redshift.test.ts @@ -201,8 +201,8 @@ describe('RedshiftAdapter', () => { rows: [{ id: 1, name: 'Test' }], rowCount: 1, fields: [ - { name: 'id', type: 'redshift_type_23', nullable: true, length: 4 }, - { name: 'name', type: 'redshift_type_25', nullable: true, length: 0 }, + { name: 'id', type: 'integer', nullable: true, length: 4 }, + { name: 'name', type: 'text', nullable: true, length: 0 }, ], hasMoreRows: false, }); diff --git a/packages/data-source/src/adapters/redshift.ts b/packages/data-source/src/adapters/redshift.ts index 045e3c798..c00feb15b 100644 --- a/packages/data-source/src/adapters/redshift.ts +++ b/packages/data-source/src/adapters/redshift.ts @@ -6,6 +6,7 @@ import { type Credentials, DataSourceType, type RedshiftCredentials } from '../t import type { QueryParameter } from '../types/query'; import { type AdapterQueryResult, BaseAdapter, type FieldMetadata } from './base'; import { normalizeRowValues } from './helpers/normalize-values'; +import { mapPostgreSQLType } from './type-mappings/postgresql'; // Internal types for pg-cursor that aren't exported interface CursorResult { @@ -95,7 +96,7 @@ export class RedshiftAdapter extends BaseAdapter { const fields: FieldMetadata[] = result.fields?.map((field) => ({ name: field.name, - type: `redshift_type_${field.dataTypeID}`, // Redshift 
type ID + type: mapPostgreSQLType(`pg_type_${field.dataTypeID}`), // Map OID to normalized type (Redshift uses PostgreSQL OIDs) nullable: true, // Redshift doesn't provide this info directly length: field.dataTypeSize > 0 ? field.dataTypeSize : 0, })) || []; @@ -140,7 +141,7 @@ export class RedshiftAdapter extends BaseAdapter { if (fields.length === 0 && cursor._result?.fields) { fields = cursor._result.fields.map((field) => ({ name: field.name, - type: `redshift_type_${field.dataTypeID}`, + type: mapPostgreSQLType(`pg_type_${field.dataTypeID}`), // Map OID to normalized type (Redshift uses PostgreSQL OIDs) nullable: true, length: field.dataTypeSize > 0 ? field.dataTypeSize : 0, })); diff --git a/packages/data-source/src/adapters/type-mappings/index.ts b/packages/data-source/src/adapters/type-mappings/index.ts new file mode 100644 index 000000000..cd16f1673 --- /dev/null +++ b/packages/data-source/src/adapters/type-mappings/index.ts @@ -0,0 +1,130 @@ +/** + * Unified type mapping utilities for all database adapters + * Provides consistent type normalization across different database systems + */ + +import { mapPostgreSQLType, getPostgreSQLSimpleType } from './postgresql'; +import { mapMySQLType, getMySQLSimpleType } from './mysql'; +import type { FieldMetadata } from '../base'; + +export * from './postgresql'; +export * from './mysql'; + +/** + * Database type identifiers for routing to correct mapper + */ +export type DatabaseType = + | 'postgresql' + | 'redshift' + | 'mysql' + | 'sqlserver' + | 'snowflake' + | 'bigquery'; + +/** + * Simple type categories used in metadata + */ +export type SimpleType = 'number' | 'text' | 'date'; + +/** + * Maps a database-specific type to a normalized type name + * @param dbType - The database type (postgresql, mysql, etc.) 
+ * @param typeValue - The type value (OID, code, or name) + * @returns Normalized type name + */ +export function mapDatabaseType(dbType: DatabaseType, typeValue: string | number): string { + switch (dbType) { + case 'postgresql': + case 'redshift': // Redshift uses PostgreSQL type OIDs + return mapPostgreSQLType(typeValue); + + case 'mysql': + return mapMySQLType(typeValue); + + case 'sqlserver': + case 'snowflake': + // These already return readable type names + return typeof typeValue === 'string' ? typeValue.toLowerCase() : 'text'; + + case 'bigquery': + // BigQuery types are usually already normalized + return typeof typeValue === 'string' ? typeValue.toLowerCase() : 'text'; + + default: + return typeof typeValue === 'string' ? typeValue.toLowerCase() : 'text'; + } +} + +/** + * Gets the simple type category for a normalized type + * @param dbType - The database type + * @param normalizedType - The normalized type name + * @returns Simple type category + */ +export function getSimpleType(dbType: DatabaseType, normalizedType: string): SimpleType { + switch (dbType) { + case 'postgresql': + case 'redshift': + return getPostgreSQLSimpleType(normalizedType); + + case 'mysql': + return getMySQLSimpleType(normalizedType); + + case 'sqlserver': + case 'snowflake': + case 'bigquery': + default: + return getGenericSimpleType(normalizedType); + } +} + +/** + * Generic simple type detection for databases without specific mappings + * @param normalizedType - The normalized type name + * @returns Simple type category + */ +export function getGenericSimpleType(normalizedType: string): SimpleType { + const lowerType = normalizedType.toLowerCase(); + + // Numeric types + if ( + lowerType.includes('int') || + lowerType.includes('float') || + lowerType.includes('double') || + lowerType.includes('decimal') || + lowerType.includes('numeric') || + lowerType.includes('number') || + lowerType.includes('real') || + lowerType === 'money' + ) { + return 'number'; + } + + // Date/time 
types + if ( + lowerType.includes('date') || + lowerType.includes('time') || + lowerType.includes('interval') + ) { + return 'date'; + } + + // Everything else is text + return 'text'; +} + +/** + * Normalizes field metadata with proper type mappings + * @param fields - Raw field metadata from database adapter + * @param dbType - The database type + * @returns Normalized field metadata + */ +export function normalizeFieldMetadata( + fields: FieldMetadata[], + dbType: DatabaseType +): FieldMetadata[] { + return fields.map((field) => ({ + ...field, + type: mapDatabaseType(dbType, field.type), + })); +} \ No newline at end of file diff --git a/packages/data-source/src/adapters/type-mappings/mysql.ts b/packages/data-source/src/adapters/type-mappings/mysql.ts new file mode 100644 index 000000000..defa0407e --- /dev/null +++ b/packages/data-source/src/adapters/type-mappings/mysql.ts @@ -0,0 +1,120 @@ +/** + * MySQL type code to normalized type name mappings + * These type codes are from the MySQL protocol + * Reference: https://dev.mysql.com/doc/dev/mysql-server/latest/field__types_8h.html + */ + +// MySQL type codes from mysql2 library +export const MYSQL_TYPE_CODE_MAP: Record = { + // Numeric types + 0: 'decimal', + 1: 'tinyint', + 2: 'smallint', + 3: 'integer', + 4: 'float', + 5: 'double', + 8: 'bigint', + 9: 'mediumint', + 246: 'decimal', // NEWDECIMAL + + // Date and time types + 7: 'timestamp', + 10: 'date', + 11: 'time', + 12: 'datetime', + 13: 'year', + + // String types + 15: 'varchar', // VARCHAR + 16: 'bit', + 247: 'enum', + 248: 'set', + 249: 'tinyblob', // TINY_BLOB + 250: 'mediumblob', // MEDIUM_BLOB + 251: 'longblob', // LONG_BLOB + 252: 'blob', // BLOB + 253: 'varchar', // VAR_STRING + 254: 'char', // STRING + 255: 'geometry', + + // JSON type + 245: 'json', +}; + +// Alternative mapping for text types that might appear differently +const TEXT_TYPE_ALIASES: Record = { + 'var_string': 'varchar', + 'string': 'char', + 'tiny_blob': 'tinytext', + 
'medium_blob': 'mediumtext', + 'long_blob': 'longtext', + 'blob': 'text', + 'long': 'integer', // LONG type is an integer type +}; + +/** + * Maps MySQL type code or type string to a normalized type name + * @param mysqlType - MySQL type as numeric code or string like "mysql_type_253" + * @returns Normalized type name + */ +export function mapMySQLType(mysqlType: string | number): string { + // Handle numeric type code + if (typeof mysqlType === 'number') { + return MYSQL_TYPE_CODE_MAP[mysqlType] || 'text'; + } + + // Handle string format "mysql_type_253" + if (typeof mysqlType === 'string') { + const match = mysqlType.match(/^mysql_type_(\d+)$/); + if (match && match[1]) { + const typeCode = parseInt(match[1], 10); + return MYSQL_TYPE_CODE_MAP[typeCode] || 'text'; + } + + // Check for text type aliases (handle both mysql_type_ prefix and plain types) + const lowerType = mysqlType.toLowerCase(); + // Remove mysql_type_ prefix if present + const cleanType = lowerType.replace(/^mysql_type_/, ''); + if (TEXT_TYPE_ALIASES[cleanType]) { + return TEXT_TYPE_ALIASES[cleanType]; + } + + // If it's already a type name, return it + return lowerType; + } + + return 'text'; +} + +/** + * Determines the simple type category for metadata + * @param normalizedType - The normalized MySQL type name + * @returns Simple type category: 'number', 'text', or 'date' + */ +export function getMySQLSimpleType(normalizedType: string): 'number' | 'text' | 'date' { + const lowerType = normalizedType.toLowerCase(); + + // Numeric types + if ( + lowerType.includes('int') || + lowerType.includes('float') || + lowerType.includes('double') || + lowerType.includes('decimal') || + lowerType.includes('numeric') || + lowerType === 'bit' + ) { + return 'number'; + } + + // Date/time types + if ( + lowerType.includes('date') || + lowerType.includes('time') || + lowerType === 'year' + ) { + return 'date'; + } + + // Everything else is text + return 'text'; +} \ No newline at end of file diff --git 
a/packages/data-source/src/adapters/type-mappings/postgresql.ts b/packages/data-source/src/adapters/type-mappings/postgresql.ts new file mode 100644 index 000000000..7bf591a44 --- /dev/null +++ b/packages/data-source/src/adapters/type-mappings/postgresql.ts @@ -0,0 +1,145 @@ +/** + * PostgreSQL OID to normalized type name mappings + * These OIDs are stable across PostgreSQL versions + * Reference: https://github.com/postgres/postgres/blob/master/src/include/catalog/pg_type.dat + */ + +// Common PostgreSQL type OIDs +export const POSTGRESQL_TYPE_OID_MAP: Record = { + // Boolean type + 16: 'boolean', + + // Numeric types + 20: 'bigint', // int8 + 21: 'smallint', // int2 + 23: 'integer', // int4 + 700: 'float4', // real + 701: 'float8', // double precision + 1700: 'numeric', // decimal + + // String types + 18: 'char', + 19: 'name', + 25: 'text', + 1042: 'char', // bpchar + 1043: 'varchar', + + // Date/time types + 1082: 'date', + 1083: 'time', + 1114: 'timestamp', // timestamp without timezone + 1184: 'timestamptz', // timestamp with timezone + 1186: 'interval', + 1266: 'timetz', // time with timezone + + // UUID + 2950: 'uuid', + + // JSON types + 114: 'json', + 3802: 'jsonb', + + // Binary + 17: 'bytea', + + // Money + 790: 'money', + + // Network types + 869: 'inet', + 650: 'cidr', + 774: 'macaddr', + 829: 'macaddr8', + + // Geometric types + 600: 'point', + 601: 'lseg', + 602: 'path', + 603: 'box', + 604: 'polygon', + 628: 'line', + 718: 'circle', + + // Array types (common ones) + 1000: '_bool', // boolean array + 1001: '_bytea', // bytea array + 1002: '_char', // char array + 1003: '_name', // name array + 1005: '_int2', // smallint array + 1007: '_int4', // integer array + 1009: '_text', // text array + 1014: '_bpchar', // char array + 1015: '_varchar', // varchar array + 1016: '_int8', // bigint array + 1021: '_float4', // float4 array + 1022: '_float8', // float8 array + 1115: '_timestamp', // timestamp array + 1182: '_date', // date array + 1183: '_time', 
// time array + 1185: '_timestamptz', // timestamptz array + 1231: '_numeric', // numeric array + 2951: '_uuid', // uuid array + 3807: '_jsonb', // jsonb array +}; + +/** + * Maps PostgreSQL type OID or type string to a normalized type name + * @param pgType - PostgreSQL type as OID number or string like "pg_type_1043" + * @returns Normalized type name + */ +export function mapPostgreSQLType(pgType: string | number): string { + // Handle numeric OID + if (typeof pgType === 'number') { + return POSTGRESQL_TYPE_OID_MAP[pgType] || 'text'; + } + + // Handle string format "pg_type_1043" + if (typeof pgType === 'string') { + const match = pgType.match(/^pg_type_(\d+)$/); + if (match && match[1]) { + const oid = parseInt(match[1], 10); + return POSTGRESQL_TYPE_OID_MAP[oid] || 'text'; + } + + // If it's already a type name, return it + return pgType.toLowerCase(); + } + + return 'text'; +} + +/** + * Determines the simple type category for metadata + * @param normalizedType - The normalized PostgreSQL type name + * @returns Simple type category: 'number', 'text', or 'date' + */ +export function getPostgreSQLSimpleType(normalizedType: string): 'number' | 'text' | 'date' { + const lowerType = normalizedType.toLowerCase(); + + // Numeric types + if ( + lowerType.includes('int') || + lowerType.includes('float') || + lowerType.includes('numeric') || + lowerType.includes('decimal') || + lowerType.includes('real') || + lowerType.includes('double') || + lowerType === 'money' || + lowerType === 'bigint' || + lowerType === 'smallint' + ) { + return 'number'; + } + + // Date/time types + if ( + lowerType.includes('date') || + lowerType.includes('time') || + lowerType.includes('interval') + ) { + return 'date'; + } + + // Everything else is text + return 'text'; +} \ No newline at end of file diff --git a/packages/data-source/src/utils/create-metadata-from-results.ts b/packages/data-source/src/utils/create-metadata-from-results.ts index 8f5df81c7..a7ef9bad2 100644 --- 
a/packages/data-source/src/utils/create-metadata-from-results.ts +++ b/packages/data-source/src/utils/create-metadata-from-results.ts @@ -1,5 +1,6 @@ import type { ColumnMetaData, DataMetadata } from '@buster/server-shared/metrics'; import type { FieldMetadata } from '../adapters/base'; +import { getGenericSimpleType } from '../adapters/type-mappings'; /** * Creates DataMetadata from query results and optional column metadata from adapters @@ -34,27 +35,70 @@ export function createMetadataFromResults( // Try to use adapter metadata if available const adapterColumn = columns?.find((col) => col.name === columnName); if (adapterColumn) { - // Map adapter types to our types (this is a simplified mapping) - const typeStr = adapterColumn.type.toLowerCase(); - if ( - typeStr.includes('int') || - typeStr.includes('float') || - typeStr.includes('numeric') || - typeStr.includes('decimal') || - typeStr.includes('number') - ) { - simpleType = 'number'; - columnType = typeStr.includes('int') ? 'int4' : 'float8'; - } else if (typeStr.includes('date') || typeStr.includes('time')) { - simpleType = 'date'; - columnType = typeStr.includes('timestamp') ? 
'timestamp' : 'date'; - } else if (typeStr.includes('bool')) { - // Booleans map to text in simple_type since 'boolean' isn't valid - simpleType = 'text'; - columnType = 'bool'; - } else { - simpleType = 'text'; - columnType = 'text'; + // Use the normalized type from the adapter + const normalizedType = adapterColumn.type.toLowerCase(); + + // Use our type mapping utility to determine simple type + simpleType = getGenericSimpleType(normalizedType); + + // Map to allowed ColumnMetaData types + switch (normalizedType) { + case 'integer': + case 'int': + columnType = 'int4'; + break; + case 'bigint': + columnType = 'int8'; + break; + case 'smallint': + case 'tinyint': + columnType = 'int2'; + break; + case 'double': + case 'double precision': + columnType = 'float8'; + break; + case 'real': + case 'float': + columnType = 'float4'; + break; + case 'boolean': + case 'bool': + columnType = 'bool'; + simpleType = 'text'; // Booleans map to text in simple_type + break; + case 'varchar': + case 'char': + case 'string': + columnType = 'text'; + break; + case 'timestamp': + case 'datetime': + columnType = 'timestamp'; + break; + case 'timestamptz': + columnType = 'timestamptz'; + break; + case 'date': + columnType = 'date'; + break; + case 'time': + columnType = 'time'; + break; + case 'json': + columnType = 'json'; + break; + case 'jsonb': + columnType = 'jsonb'; + break; + case 'decimal': + case 'numeric': + columnType = 'numeric'; + break; + default: + // Default to text for unknown types + columnType = 'text'; + break; } } else if (values.length > 0) { // Fallback: infer from data @@ -94,10 +138,23 @@ export function createMetadataFromResults( const uniqueValues = new Set(values); if (simpleType === 'number' && values.length > 0) { - const numericValues = values.filter((v): v is number => typeof v === 'number'); + // Try to convert values to numbers for proper comparison + const numericValues = values + .map((v) => { + if (typeof v === 'number') return v; + if (typeof v 
=== 'string' && !isNaN(Number(v))) return Number(v); + return null; + }) + .filter((v): v is number => v !== null); + if (numericValues.length > 0) { minValue = Math.min(...numericValues); maxValue = Math.max(...numericValues); + } else { + // Fallback to string comparison if no valid numbers found + const sortedValues = [...values].sort(); + minValue = String(sortedValues[0]); + maxValue = String(sortedValues[sortedValues.length - 1]); } } else if (simpleType === 'date' && values.length > 0) { const dateValues = values diff --git a/packages/data-source/src/utils/execute-metric-query.ts b/packages/data-source/src/utils/execute-metric-query.ts index 552efb92c..31482dc66 100644 --- a/packages/data-source/src/utils/execute-metric-query.ts +++ b/packages/data-source/src/utils/execute-metric-query.ts @@ -100,6 +100,9 @@ export async function executeMetricQuery( // Trim results to requested limit if we have more const data = hasMoreRecords ? allResults.slice(0, maxRows) : allResults; + // Create metadata from original results and column information BEFORE conversion + const dataMetadata = createMetadataFromResults(data, result.columns); + // Convert data to match expected type (string | number | null) const typedData = data.map((row) => { const typedRow: Record = {}; @@ -118,9 +121,6 @@ export async function executeMetricQuery( return typedRow; }); - // Create metadata from results and column information - const dataMetadata = createMetadataFromResults(typedData, result.columns); - // Validate and parse result metadata if available const validatedMetadata = resultMetadataSchema.safeParse(result.metadata); const parsedMetadata = validatedMetadata.success ? validatedMetadata.data : undefined;