Classify URL specified by user
(data)
| 11 | |
| 12 | |
def classify(data):
    """
    Classify the page content supplied for a user-specified URL.

    The markup in ``data`` is reduced to its text, a
    CountVectorizer -> TfidfTransformer -> SGDClassifier pipeline is trained
    on a random split of the local ``training_data`` corpus, and the text is
    assigned to one of the corpus categories. The model is retrained on
    every call, so the reported accuracy can differ between calls.

    Args:
        data: HTML markup to classify (passed straight to ``BeautifulSoup``).
            Presumably the downloaded body of the URL -- confirm with caller.

    Returns:
        A two-element list ``[category_name, accuracy]``: the entry of
        ``dataset.target_names`` predicted for the text, and the fraction of
        held-out training samples the freshly trained classifier labels
        correctly.

    Side effects:
        Changes the process working directory to this module's directory.
        If the training corpus is missing, prints progress messages and
        calls ``write_data()`` to obtain it before loading again.
    """
    soup = BeautifulSoup(data, features='html.parser')
    html = soup.get_text()

    # create classifier
    clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])
    try:
        # 'training_data' is resolved relative to this module's directory.
        os.chdir(Path(__file__).parent)

        dataset = load_files('training_data')
    except FileNotFoundError:
        print("Training data not found. Obtaining training data...")
        print("This may take a while...")
        # Imported lazily: only needed when the corpus has to be built.
        # NOTE(review): write_data() presumably writes ./training_data
        # relative to the current directory -- confirm.
        from .gather_data import write_data
        write_data()
        print("Training data obtained.")
        dataset = load_files('training_data')
    x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target)
    clf.fit(x_train, y_train)

    # returns an array of target_name values
    predicted = clf.predict([html])

    # Accuracy must be measured on the held-out split. Comparing the single
    # prediction above against y_test would only give the share of test
    # samples that belong to the predicted class.
    accuracy = np.mean(clf.predict(x_test) == y_test)

    return [dataset.target_names[predicted[0]], accuracy]
no test coverage detected