From 8f3fb8732dda28dbbe6c0c7819895cba66fb8e89 Mon Sep 17 00:00:00 2001 From: dal Date: Tue, 1 Apr 2025 10:08:28 -0600 Subject: [PATCH] search data catalog change --- .../routes/helpers/search_data_catalog.rs | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/api/src/routes/rest/routes/helpers/search_data_catalog.rs b/api/src/routes/rest/routes/helpers/search_data_catalog.rs index ea5764ca9..b0629f4ae 100644 --- a/api/src/routes/rest/routes/helpers/search_data_catalog.rs +++ b/api/src/routes/rest/routes/helpers/search_data_catalog.rs @@ -68,14 +68,14 @@ struct FilteredDataset { } const LLM_FILTER_PROMPT: &str = r#" -You are a dataset relevance evaluator. Your task is to determine which datasets might contain information relevant to the user's query based on their structure and metadata. +You are a dataset relevance evaluator. Your task is to determine which datasets might contain information relevant to the user's query based on their structure and metadata. Be inclusive in your evaluation - if there's a reasonable chance the dataset could be useful, include it. USER REQUEST: {user_request} SEARCH QUERY: {query} Below is a list of datasets that were identified as potentially relevant by an initial semantic ranking system. -For each dataset, review its description in the YAML format and determine if its structure is suitable for the user's query. -ONLY include datasets that you determine are relevant in your response. +For each dataset, review its description in the YAML format and determine if its structure could potentially be suitable for the user's query. +Include datasets that have even a reasonable possibility of containing relevant information. DATASETS: {datasets_json} @@ -86,30 +86,33 @@ Return a JSON response with the following structure: "results": [ { "id": "dataset-uuid-here", - "reason": "Brief explanation of why this dataset's structure is relevant" + "reason": "Brief explanation of why this dataset's structure might be relevant" }, - // ... more relevant datasets only + // ... more potentially relevant datasets ] } ``` IMPORTANT GUIDELINES: -1. DO NOT make assumptions about what specific values exist in the datasets -2. Focus EXCLUSIVELY on identifying datasets with STRUCTURES that could reasonably contain the type of information requested -3. For example, if a user asks about "red bull sales", consider datasets about products, sales, inventory, etc. as potentially relevant - even if "red bull" is not explicitly mentioned -4. Evaluate based on whether the dataset's schema, fields, or description indicates it COULD contain the relevant information -5. Look for structural compatibility rather than exact matches in the content -6. ONLY include datasets you find relevant in your response - omit any that aren't relevant +1. Be inclusive - if there's a reasonable possibility the dataset could be useful, include it +2. Consider both direct and indirect relationships to the query +3. For example, if a user asks about "red bull sales", consider datasets about: + - Direct relevance: products, sales, inventory + - Indirect relevance: marketing campaigns, customer demographics, store locations +4. Evaluate based on whether the dataset's schema, fields, or description MIGHT contain or relate to the relevant information +5. Include datasets that could provide contextual or supporting information +6. When in doubt about relevance, lean towards including the dataset 7. Ensure the "id" field exactly matches the dataset's UUID -8. Use both the USER REQUEST and SEARCH QUERY to understand the user's information needs - the USER REQUEST provides broader context while the SEARCH QUERY represents specific search intent -9. Restrict your evaluation strictly to the defined elements in the dataset metadata: +8. Use both the USER REQUEST and SEARCH QUERY to understand the user's information needs broadly +9. Consider these elements in the dataset metadata: - Column names and their data types - Entity relationships - Predefined metrics - Table schemas - Dimension hierarchies -10. Do NOT make assumptions about what data might exist beyond what is explicitly defined in the metadata -11. A dataset is relevant ONLY if its documented structure supports answering the query, not because you assume it might contain certain data + - Related or connected data structures +10. While you shouldn't assume specific data exists, you can be optimistic about the potential usefulness of related data structures +11. A dataset is relevant if its structure could reasonably support or contribute to answering the query, either directly or indirectly "#; pub fn router() -> Router { @@ -238,7 +241,7 @@ async fn rerank_datasets( query, documents, model: ReRankModel::EnglishV3, - top_n: Some(25), // Get top 20 results per query + top_n: Some(30), ..Default::default() };