Applies a larger test of the schema against the data held in the CustomTable .rows attribute - will test the number of samples passed as an optional parameter. If auto_remediate == True (default), then it will automatically update the data type used in the schema to the 'safest' among the types found in the sample set.
(self, samples=10, auto_remediate=True)
| 3414 | return dt |
| 3415 | |
| 3416 | def test_and_remediate_schema(self, samples=10, auto_remediate=True): |
| 3417 | |
| 3418 | """ Applies a larger test of the schema against the data held in CustomTable .rows attribute - will |
| 3419 | test the number of samples passed as an optional parameter. |
| 3420 | |
| 3421 | If auto_remediate == True (default), then it will automatically update the data type used in the schema to |
| 3422 | the 'safest' among the types found in the sample set. """ |
| 3423 | |
| 3424 | updated_schema = {} |
| 3425 | |
| 3426 | for key, data_type in self.schema.items(): |
| 3427 | |
| 3428 | if len(self.rows) < samples: |
| 3429 | samples = len(self.rows) |
| 3430 | |
| 3431 | samples_dt_list = [] |
| 3432 | |
| 3433 | for x in range(0,samples): |
| 3434 | |
| 3435 | if key not in self.rows[x]: |
| 3436 | logger.warning(f"warning: CustomTable - test_and_remediate_schema - unexpected error - " |
| 3437 | f"key not found in row - {x} - {key} - {self.rows[x]}") |
| 3438 | else: |
| 3439 | check_value = self.rows[x][key] |
| 3440 | samples_dt_list.append(self._get_best_guess_value_type(check_value)) |
| 3441 | |
| 3442 | # simple decision tree - will add more options over time |
| 3443 | if "text" in samples_dt_list or data_type=="text": |
| 3444 | dt = "text" |
| 3445 | elif "float" in samples_dt_list or data_type=="float": |
| 3446 | dt = "float" |
| 3447 | elif "integer" in samples_dt_list or "bigint" in samples_dt_list and data_type in ["integer", "bigint"]: |
| 3448 | dt = "integer" |
| 3449 | if self.db == "postgres": |
| 3450 | dt = "bigint" |
| 3451 | else: |
| 3452 | # when in doubt, assign 'text' as safest choice |
| 3453 | dt = "text" |
| 3454 | |
| 3455 | updated_schema.update({key: dt}) |
| 3456 | |
| 3457 | if auto_remediate: |
| 3458 | self.schema = updated_schema |
| 3459 | |
| 3460 | return updated_schema |
| 3461 | |
| 3462 | def load_json(self, fp, fn, selected_keys=None, data_type_map=None, schema=None): |
| 3463 |
no test coverage detected