Writes the training data from the CSV file to a directory laid out according to the scikit-learn `sklearn.datasets.load_files` specification. Dataset source: https://www.kaggle.com/hetulmehta/website-classification. Example layout: container_folder/ category_1_folder/ file_1.txt file_2.txt ... category_2_folder/ file_43.txt file_44.txt ...
()
| 7 | |
| 8 | |
def write_data(csv_path="website_classification.csv", output_dir="training_data"):
    """
    Write each CSV row's text to ``<output_dir>/<category>/<id>.txt``.

    The resulting tree follows the layout expected by scikit-learn's
    ``sklearn.datasets.load_files``: one sub-folder per category, one text
    file per sample.

    dataset source: https://www.kaggle.com/hetulmehta/website-classification

    e.g.
        container_folder/
            category_1_folder/
                file_1.txt file_2.txt file_3.txt ... file_42.txt
            category_2_folder/
                file_43.txt file_44.txt ...

    Args:
        csv_path: Path of the CSV file to read. Every non-empty row must have
            exactly four fields: id, website, content, category.
        output_dir: Directory under which the category folders are created.

    Raises:
        ValueError: If a non-empty row does not have exactly four fields.
        OSError: If the CSV cannot be read or an output file cannot be written.
    """
    out_root = Path(output_dir)

    # newline="" is what the csv module requires so that newlines embedded in
    # quoted fields are parsed correctly; the explicit encoding keeps the
    # result independent of the platform's locale default.
    with open(csv_path, encoding="utf-8", newline="") as csvfile:
        for row in csv.reader(csvfile, delimiter=","):
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue

            row_id, _website, content, category = row

            # Skip the header row. The comparison is case-insensitive so a
            # header spelled "Category" is not written out as a bogus class.
            if category.strip().casefold() == "category":
                continue

            # "/" would be interpreted as a path separator, so a category such
            # as "a/b" is stored in a single folder named "a+b".
            category_dir = out_root / category.replace("/", "+")
            category_dir.mkdir(parents=True, exist_ok=True)

            sample_path = category_dir / f"{row_id}.txt"
            with open(sample_path, mode="w", encoding="utf-8") as txtfile:
                txtfile.write(content)
| 34 | |
| 35 | |
| 36 | if __name__ == "__main__": |
no outgoing calls
no test coverage detected