Generates a schema based on a CSV file. The names of the columns are taken from the header of the CSV file. Types of columns are inferred from the values by checking whether they can be parsed. Currently supported types are str, int and float. Args: path: path to the CSV file.
(
path: str,
*,
name: str | None = None,
properties: SchemaProperties = SchemaProperties(),
delimiter: str = ",",
quote: str = '"',
comment_character: str | None = None,
escape: str | None = None,
double_quote_escapes: bool = True,
num_parsed_rows: int | None = None,
)
| 925 | |
| 926 | |
| 927 | def schema_from_csv( |
| 928 | path: str, |
| 929 | *, |
| 930 | name: str | None = None, |
| 931 | properties: SchemaProperties = SchemaProperties(), |
| 932 | delimiter: str = ",", |
| 933 | quote: str = '"', |
| 934 | comment_character: str | None = None, |
| 935 | escape: str | None = None, |
| 936 | double_quote_escapes: bool = True, |
| 937 | num_parsed_rows: int | None = None, |
| 938 | ): |
| 939 | """Allows to generate schema based on a CSV file. |
| 940 | The names of the columns are taken from the header of the CSV file. |
| 941 | Types of columns are inferred from the values, by checking if they can be parsed. |
| 942 | Currently supported types are str, int and float. |
| 943 | |
| 944 | Args: |
| 945 | path: path to the CSV file. |
| 946 | name: schema name. |
| 947 | properties: schema properties. |
| 948 | delimiter: delimiter used in CSV file. Defaults to ",". |
| 949 | quote: quote character used in CSV file. Defaults to '"'. |
| 950 | comment_character: character used in CSV file to denote comments. |
| 951 | Defaults to None |
| 952 | escape: escape character used in CSV file. Defaults to None. |
| 953 | double_quote_escapes: enable escapes of double quotes. Defaults to True. |
| 954 | num_parsed_rows: number of rows, which will be parsed when inferring types. When |
| 955 | set to None, all rows will be parsed. When set to 0, types of all columns |
| 956 | will be set to str. Defaults to None. |
| 957 | |
| 958 | Returns: |
| 959 | Schema |
| 960 | """ |
| 961 | |
| 962 | def remove_comments_from_file(f: Iterable[str], comment_char: str | None): |
| 963 | for line in f: |
| 964 | if line.lstrip()[0] != comment_char: |
| 965 | yield line |
| 966 | |
| 967 | with open(path) as f: |
| 968 | csv_reader = csv.DictReader( |
| 969 | remove_comments_from_file(f, comment_character), |
| 970 | delimiter=delimiter, |
| 971 | escapechar=escape, |
| 972 | quoting=csv.QUOTE_ALL, |
| 973 | quotechar=quote, |
| 974 | doublequote=double_quote_escapes, |
| 975 | ) |
| 976 | if csv_reader.fieldnames is None: |
| 977 | raise ValueError("can't generate Schema based on an empty CSV file") |
| 978 | column_names = csv_reader.fieldnames |
| 979 | if num_parsed_rows is None: |
| 980 | csv_data = list(csv_reader) |
| 981 | else: |
| 982 | csv_data = list(itertools.islice(csv_reader, num_parsed_rows)) |
| 983 | |
| 984 | def choose_type(entries: list[str]): |
nothing calls this directly
no test coverage detected