Get stats of a column in a SQL table.
(
table_name: str, column_name: str, column_type: DataType
)
| 8 | |
| 9 | |
| 10 | def get_sql_stats( |
| 11 | table_name: str, column_name: str, column_type: DataType |
| 12 | ) -> ColumnStats: |
| 13 | """ |
| 14 | Get stats of a column in a SQL table. |
| 15 | """ |
| 16 | |
| 17 | # Prepare the stats query based on the column type |
| 18 | if column_type in ("integer", "number"): |
| 19 | stats_query = f""" |
| 20 | SELECT |
| 21 | COUNT(*) as count, |
| 22 | COUNT(DISTINCT "{column_name}") as unique, |
| 23 | SUM(CASE WHEN "{column_name}" IS NULL THEN 1 ELSE 0 END) as null_count, |
| 24 | MIN("{column_name}") as min, |
| 25 | MAX("{column_name}") as max, |
| 26 | AVG("{column_name}") as mean, |
| 27 | PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY "{column_name}") as median, |
| 28 | STDDEV("{column_name}") as std, |
| 29 | PERCENTILE_CONT(0.05) WITHIN GROUP (ORDER BY "{column_name}") as p5, |
| 30 | PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{column_name}") as p25, |
| 31 | PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{column_name}") as p75, |
| 32 | PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "{column_name}") as p95 |
| 33 | FROM {table_name} |
| 34 | """ |
| 35 | elif ( |
| 36 | column_type == "date" |
| 37 | or column_type == "datetime" |
| 38 | or column_type == "time" |
| 39 | ): |
| 40 | stats_query = f""" |
| 41 | SELECT |
| 42 | COUNT(*) as count, |
| 43 | COUNT(DISTINCT "{column_name}") as unique, |
| 44 | SUM(CASE WHEN "{column_name}" IS NULL THEN 1 ELSE 0 END) as null_count, |
| 45 | MIN("{column_name}") as min, |
| 46 | MAX("{column_name}") as max |
| 47 | FROM {table_name} |
| 48 | """ |
| 49 | elif column_type == "boolean": |
| 50 | stats_query = f""" |
| 51 | SELECT |
| 52 | COUNT(*) as count, |
| 53 | COUNT(DISTINCT "{column_name}") as unique, |
| 54 | SUM(CASE WHEN "{column_name}" IS NULL THEN 1 ELSE 0 END) as null_count, |
| 55 | SUM(CASE WHEN "{column_name}" = TRUE THEN 1 ELSE 0 END) as true_count, |
| 56 | SUM(CASE WHEN "{column_name}" = FALSE THEN 1 ELSE 0 END) as false_count |
| 57 | FROM {table_name} |
| 58 | """ |
| 59 | else: |
| 60 | stats_query = f""" |
| 61 | SELECT |
| 62 | COUNT(*) as count, |
| 63 | COUNT(DISTINCT "{column_name}") as unique, |
| 64 | SUM(CASE WHEN "{column_name}" IS NULL THEN 1 ELSE 0 END) as null_count |
| 65 | FROM {table_name} |
| 66 | """ |
| 67 |
searching dependent graphs…