Discover available columns from all CSV types. Returns dict: {csv_type: {column_name_lower: actual_column_name}}
(self)
| 51 | self.conn = None |
| 52 | |
| 53 | def discover_columns(self) -> Dict[str, Dict[str, str]]: |
| 54 | """ |
| 55 | Discover available columns from all CSV types. |
| 56 | Returns dict: {csv_type: {column_name_lower: actual_column_name}} |
| 57 | """ |
| 58 | if self.available_columns: |
| 59 | return self.available_columns |
| 60 | |
| 61 | conn = self.connect() |
| 62 | |
| 63 | # CSV file patterns in priority order |
| 64 | csv_patterns = { |
| 65 | 'samples': 'xcapture_samples_*.csv', |
| 66 | 'syscend': 'xcapture_syscend_*.csv', |
| 67 | 'iorqend': 'xcapture_iorqend_*.csv', |
| 68 | 'kstacks': 'xcapture_kstacks_*.csv', |
| 69 | 'ustacks': 'xcapture_ustacks_*.csv' |
| 70 | } |
| 71 | |
| 72 | for csv_type, pattern in csv_patterns.items(): |
| 73 | self.available_columns[csv_type] = {} |
| 74 | self.schema_info[csv_type] = [] |
| 75 | |
| 76 | describe_result = None |
| 77 | active_pattern = pattern |
| 78 | reader = 'read_csv_auto' |
| 79 | |
| 80 | csv_files = self.get_csv_files(pattern) |
| 81 | if csv_files: |
| 82 | describe_result = self._try_describe(conn, reader, pattern) |
| 83 | |
| 84 | if not describe_result: |
| 85 | parquet_pattern = pattern.replace('.csv', '.parquet') |
| 86 | parquet_files = self.get_csv_files(parquet_pattern) |
| 87 | if parquet_files: |
| 88 | reader = 'read_parquet' |
| 89 | active_pattern = parquet_pattern |
| 90 | describe_result = self._try_describe(conn, reader, parquet_pattern) |
| 91 | |
| 92 | if describe_result: |
| 93 | columns = describe_result |
| 94 | self.available_columns[csv_type] = { |
| 95 | col_name.lower(): col_name for col_name, *_ in columns |
| 96 | } |
| 97 | self.schema_info[csv_type] = [(col_name, col_type) for col_name, col_type, *_ in columns] |
| 98 | self.csv_metadata[csv_type] = { |
| 99 | 'pattern': active_pattern, |
| 100 | 'column_count': len(columns), |
| 101 | 'columns': [col[0] for col in columns], |
| 102 | 'format': reader.replace('read_', '') |
| 103 | } |
| 104 | else: |
| 105 | self.csv_metadata[csv_type] = { |
| 106 | 'pattern': pattern, |
| 107 | 'column_count': 0, |
| 108 | 'columns': [], |
| 109 | 'format': None |
| 110 | } |
no test coverage detected