A simple backtest, which splits the dataset into a train set and test set, then fits a Random Forest classifier to the train set. We print the precision and accuracy of the classifier on the test set, then run a backtest comparing this strategy's performance to passive investment in the S&P500.
()
| 8 | |
| 9 | |
| 10 | def backtest(): |
| 11 | """ |
| 12 | A simple backtest, which splits the dataset into a train set and test set, |
| 13 | then fits a Random Forest classifier to the train set. We print the precision and accuracy |
| 14 | of the classifier on the test set, then run a backtest comparing this strategy's performance |
| 15 | to passive investment in the S&P500. |
| 16 | Please note that there is a methodological flaw in this backtest which will give deceptively |
| 17 | good results, so the results here should not encourage you to live trade. |
| 18 | """ |
| 19 | # Build the dataset, and drop any rows with missing values |
| 20 | data_df = pd.read_csv("keystats.csv", index_col="Date") |
| 21 | data_df.dropna(axis=0, how="any", inplace=True) |
| 22 | |
| 23 | features = data_df.columns[6:] |
| 24 | X = data_df[features].values |
| 25 | |
| 26 | # The labels are generated by applying the status_calc to the dataframe. |
| 27 | # '1' if a stock beats the S&P500 by more than x%, else '0'. Here x is the |
| 28 | # outperformance parameter, which is set to 10 by default but can be redefined. |
| 29 | y = list( |
| 30 | status_calc( |
| 31 | data_df["stock_p_change"], data_df["SP500_p_change"], outperformance=10 |
| 32 | ) |
| 33 | ) |
| 34 | |
| 35 | # z is required for us to track returns |
| 36 | z = np.array(data_df[["stock_p_change", "SP500_p_change"]]) |
| 37 | |
| 38 | # Generate the train set and test set by randomly splitting the dataset |
| 39 | X_train, X_test, y_train, y_test, z_train, z_test = train_test_split( |
| 40 | X, y, z, test_size=0.2 |
| 41 | ) |
| 42 | |
| 43 | # Instantiate a RandomForestClassifier with 100 trees, then fit it to the training data |
| 44 | clf = RandomForestClassifier(n_estimators=100, random_state=0) |
| 45 | clf.fit(X_train, y_train) |
| 46 | |
| 47 | # Generate the predictions, then print test set accuracy and precision |
| 48 | y_pred = clf.predict(X_test) |
| 49 | print("Classifier performance\n", "=" * 20) |
| 50 | print(f"Accuracy score: {clf.score(X_test, y_test): .2f}") |
| 51 | print(f"Precision score: {precision_score(y_test, y_pred): .2f}") |
| 52 | |
| 53 | # Because y_pred is an array of 1s and 0s, the number of positive predictions |
| 54 | # is equal to the sum of the array |
| 55 | num_positive_predictions = sum(y_pred) |
| 56 | if num_positive_predictions < 0: |
| 57 | print("No stocks predicted!") |
| 58 | |
| 59 | # Recall that z_test stores the change in stock price in column 0, and the |
| 60 | # change in S&P500 price in column 1. |
| 61 | # Whenever a stock is predicted to outperform (y_pred = 1), we 'buy' that stock |
| 62 | # and simultaneously `buy` the index for comparison. |
| 63 | stock_returns = 1 + z_test[y_pred, 0] / 100 |
| 64 | market_returns = 1 + z_test[y_pred, 1] / 100 |
| 65 | |
| 66 | # Calculate the average growth for each stock we predicted 'buy' |
| 67 | # and the corresponding index growth |
no test coverage detected