mirror of https://github.com/buster-so/buster.git
bugfix: data metadata processing
This commit is contained in:
parent
1819eb1859
commit
7e866c54b1
|
@ -99,98 +99,101 @@ fn compute_column_metadata(data: &[IndexMap<String, DataType>]) -> Vec<ColumnMet
|
||||||
|
|
||||||
columns.iter().map(|column_name| {
|
columns.iter().map(|column_name| {
|
||||||
let mut value_map = HashSet::new();
|
let mut value_map = HashSet::new();
|
||||||
let mut min_value = None;
|
let mut min_value_numeric: Option<f64> = None; // Use specific name
|
||||||
let mut max_value = None;
|
let mut max_value_numeric: Option<f64> = None; // Use specific name
|
||||||
let mut is_date_type = false;
|
|
||||||
let mut min_value_str: Option<String> = None;
|
let mut min_value_str: Option<String> = None;
|
||||||
let mut max_value_str: Option<String> = None;
|
let mut max_value_str: Option<String> = None;
|
||||||
|
let mut determined_type: Option<(SimpleType, ColumnType)> = None;
|
||||||
|
|
||||||
for row in data {
|
for row in data {
|
||||||
if let Some(value) = row.get(column_name) {
|
if let Some(value) = row.get(column_name) {
|
||||||
// Track unique values (up to a reasonable limit)
|
// Track unique values (up to a reasonable limit)
|
||||||
if value_map.len() < 100 {
|
if value_map.len() < 100 {
|
||||||
value_map.insert(format!("{:?}", value));
|
value_map.insert(format!("{:?}", value)); // format! handles nulls acceptably
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate min/max for appropriate types
|
// Determine type from first non-null value encountered
|
||||||
|
if determined_type.is_none() {
|
||||||
|
match value {
|
||||||
|
// Check for non-null variants using matches! for conciseness
|
||||||
|
DataType::Int2(Some(_)) | DataType::Int4(Some(_)) | DataType::Int8(Some(_)) |
|
||||||
|
DataType::Float4(Some(_)) | DataType::Float8(Some(_)) | DataType::Text(Some(_)) |
|
||||||
|
DataType::Bool(Some(_)) | DataType::Date(Some(_)) | DataType::Timestamp(Some(_)) |
|
||||||
|
DataType::Timestamptz(Some(_)) | DataType::Json(Some(_)) | DataType::Uuid(Some(_)) |
|
||||||
|
DataType::Decimal(Some(_)) | DataType::Time(Some(_)) => {
|
||||||
|
determined_type = Some(determine_types(value));
|
||||||
|
}
|
||||||
|
// If it's a Null variant or Unknown, keep looking
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate min/max based on value's actual type in this row
|
||||||
match value {
|
match value {
|
||||||
DataType::Int2(Some(v)) => {
|
DataType::Int2(Some(v)) => {
|
||||||
let n = *v as f64;
|
let n = *v as f64;
|
||||||
min_value = Some(min_value.map_or(n, |min: f64| min.min(n)));
|
min_value_numeric = Some(min_value_numeric.map_or(n, |min| min.min(n)));
|
||||||
max_value = Some(max_value.map_or(n, |max: f64| max.max(n)));
|
max_value_numeric = Some(max_value_numeric.map_or(n, |max| max.max(n)));
|
||||||
}
|
}
|
||||||
DataType::Int4(Some(v)) => {
|
DataType::Int4(Some(v)) => {
|
||||||
let n = *v as f64;
|
let n = *v as f64;
|
||||||
min_value = Some(min_value.map_or(n, |min: f64| min.min(n)));
|
min_value_numeric = Some(min_value_numeric.map_or(n, |min| min.min(n)));
|
||||||
max_value = Some(max_value.map_or(n, |max: f64| max.max(n)));
|
max_value_numeric = Some(max_value_numeric.map_or(n, |max| max.max(n)));
|
||||||
}
|
}
|
||||||
DataType::Int8(Some(v)) => {
|
DataType::Int8(Some(v)) => {
|
||||||
let n = *v as f64;
|
let n = *v as f64;
|
||||||
min_value = Some(min_value.map_or(n, |min: f64| min.min(n)));
|
min_value_numeric = Some(min_value_numeric.map_or(n, |min| min.min(n)));
|
||||||
max_value = Some(max_value.map_or(n, |max: f64| max.max(n)));
|
max_value_numeric = Some(max_value_numeric.map_or(n, |max| max.max(n)));
|
||||||
}
|
}
|
||||||
DataType::Float4(Some(v)) => {
|
DataType::Float4(Some(v)) => {
|
||||||
let n = *v as f64;
|
let n = *v as f64;
|
||||||
min_value = Some(min_value.map_or(n, |min: f64| min.min(n)));
|
min_value_numeric = Some(min_value_numeric.map_or(n, |min| min.min(n)));
|
||||||
max_value = Some(max_value.map_or(n, |max: f64| max.max(n)));
|
max_value_numeric = Some(max_value_numeric.map_or(n, |max| max.max(n)));
|
||||||
}
|
}
|
||||||
DataType::Float8(Some(v)) => {
|
DataType::Float8(Some(v)) => {
|
||||||
let n = *v as f64;
|
let n = *v as f64;
|
||||||
min_value = Some(min_value.map_or(n, |min: f64| min.min(n)));
|
min_value_numeric = Some(min_value_numeric.map_or(n, |min| min.min(n)));
|
||||||
max_value = Some(max_value.map_or(n, |max: f64| max.max(n)));
|
max_value_numeric = Some(max_value_numeric.map_or(n, |max| max.max(n)));
|
||||||
}
|
}
|
||||||
DataType::Date(Some(date)) => {
|
DataType::Date(Some(date)) => {
|
||||||
is_date_type = true;
|
update_date_min_max(&date.to_string(), &mut min_value_str, &mut max_value_str);
|
||||||
let date_str = date.to_string();
|
|
||||||
update_date_min_max(&date_str, &mut min_value_str, &mut max_value_str);
|
|
||||||
}
|
}
|
||||||
DataType::Timestamp(Some(ts)) => {
|
DataType::Timestamp(Some(ts)) => {
|
||||||
is_date_type = true;
|
update_date_min_max(&ts.to_string(), &mut min_value_str, &mut max_value_str);
|
||||||
let ts_str = ts.to_string();
|
|
||||||
update_date_min_max(&ts_str, &mut min_value_str, &mut max_value_str);
|
|
||||||
}
|
}
|
||||||
DataType::Timestamptz(Some(ts)) => {
|
DataType::Timestamptz(Some(ts)) => {
|
||||||
is_date_type = true;
|
update_date_min_max(&ts.to_string(), &mut min_value_str, &mut max_value_str);
|
||||||
let ts_str = ts.to_string();
|
|
||||||
update_date_min_max(&ts_str, &mut min_value_str, &mut max_value_str);
|
|
||||||
}
|
}
|
||||||
|
// Ignore nulls and non-comparable types for min/max calculation
|
||||||
_ => {}
|
_ => {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determine the column type and simple type
|
// Finalize types - default if no non-null value was found
|
||||||
let column_type = first_row.get(column_name).unwrap();
|
let (simple_type, column_type) = determined_type.unwrap_or((SimpleType::Other, ColumnType::Other));
|
||||||
let (simple_type, column_type) = determine_types(column_type);
|
|
||||||
|
|
||||||
// Format min/max values appropriately based on type
|
// Format min/max values appropriately based on determined simple_type
|
||||||
let (min_value, max_value) = if is_date_type {
|
let (min_value_json, max_value_json) = match simple_type {
|
||||||
(
|
SimpleType::Number => (
|
||||||
min_value_str.map_or(serde_json::Value::Null, |v| serde_json::Value::String(v)),
|
min_value_numeric.and_then(|v| serde_json::Number::from_f64(v).map(serde_json::Value::Number))
|
||||||
max_value_str.map_or(serde_json::Value::Null, |v| serde_json::Value::String(v)),
|
.unwrap_or(serde_json::Value::Null),
|
||||||
)
|
max_value_numeric.and_then(|v| serde_json::Number::from_f64(v).map(serde_json::Value::Number))
|
||||||
} else {
|
.unwrap_or(serde_json::Value::Null),
|
||||||
(
|
),
|
||||||
min_value.map_or(serde_json::Value::Null, |v| {
|
SimpleType::Date => (
|
||||||
match serde_json::Number::from_f64(v) {
|
min_value_str.map_or(serde_json::Value::Null, serde_json::Value::String),
|
||||||
Some(num) => serde_json::Value::Number(num),
|
max_value_str.map_or(serde_json::Value::Null, serde_json::Value::String),
|
||||||
None => serde_json::Value::Null,
|
),
|
||||||
}
|
// Don't provide min/max for other types
|
||||||
}),
|
_ => (serde_json::Value::Null, serde_json::Value::Null),
|
||||||
max_value.map_or(serde_json::Value::Null, |v| {
|
|
||||||
match serde_json::Number::from_f64(v) {
|
|
||||||
Some(num) => serde_json::Value::Number(num),
|
|
||||||
None => serde_json::Value::Null,
|
|
||||||
}
|
|
||||||
}),
|
|
||||||
)
|
|
||||||
};
|
};
|
||||||
|
|
||||||
ColumnMetaData {
|
ColumnMetaData {
|
||||||
name: column_name.to_lowercase(),
|
name: column_name.to_lowercase(),
|
||||||
min_value,
|
min_value: min_value_json,
|
||||||
max_value,
|
max_value: max_value_json,
|
||||||
unique_values: value_map.len() as i32,
|
unique_values: value_map.len() as i32, // Count includes distinct null representations
|
||||||
simple_type,
|
simple_type,
|
||||||
column_type,
|
column_type,
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue