Currently, the sp500 and stock price datasets we downloaded do not have any data for days when the market was closed (weekends and public holidays). We need to amend this so that every calendar day has a row. Doing this now saves a lot of effort when we actually create the keystats dataset, which requires that we have stock data for every day.
()
| 59 | |
| 60 | |
def preprocess_price_data():
    """
    Load the sp500 and stock price CSVs and expand them to one row per calendar day.

    The downloaded datasets have no rows for days when the market was closed
    (weekends and public holidays). Both frames are reindexed onto a daily range
    running from the earliest to the latest date found in the stock price data,
    and the newly created rows are forward-filled, so each missing day takes the
    values of the most recent earlier row (e.g. a weekend takes Friday's values).
    Doing this up front means later steps can look up any calendar day directly.

    Reads "sp500_index.csv" and "stock_prices.csv" from the current working
    directory; both must have a "Date" column, which becomes the index.

    Note: forward-filling cannot fill rows that precede a frame's first real
    row, so if the sp500 data starts later than the stock data its leading
    rows stay NaN.

    :return: (sp500, stock) dataframes indexed by consecutive calendar days.
    """
    # Read in SP500 data and stock data, parsing the dates.
    sp500_raw_data = pd.read_csv("sp500_index.csv", index_col="Date", parse_dates=True)
    stock_raw_data = pd.read_csv("stock_prices.csv", index_col="Date", parse_dates=True)

    # Take the range bounds from min/max rather than the first/last row: if the
    # CSV is stored newest-first (or unsorted), first/last would give start > end
    # and pd.date_range would silently produce an empty index.
    start_date = stock_raw_data.index.min()
    end_date = stock_raw_data.index.max()
    idx = pd.date_range(start_date, end_date)

    # Reindexing inserts all-NaN rows for the missing days; ffill then copies
    # the last known values forward into them.
    sp500_raw_data = sp500_raw_data.reindex(idx).ffill()
    stock_raw_data = stock_raw_data.reindex(idx).ffill()

    return sp500_raw_data, stock_raw_data
| 86 | |
| 87 | |
| 88 | def parse_keystats(sp500_df, stock_df): |