Creates the forward sample by parsing the current data html files that we downloaded in check_yahoo(). :return: a pandas dataframe containing all of the current data for each ticker.
()
| 90 | |
| 91 | |
| 92 | def forward(): |
| 93 | """ |
| 94 | Creates the forward sample by parsing the current data html files that we downloaded in check_yahoo(). |
| 95 | :return: a pandas dataframe containing all of the current data for each ticker. |
| 96 | """ |
| 97 | # Creating an empty dataframe which we will later fill. In addition to the features, we need some index variables |
| 98 | # (date, unix timestamp, ticker), and of course the dependent variables (prices). |
| 99 | df_columns = [ |
| 100 | "Date", |
| 101 | "Unix", |
| 102 | "Ticker", |
| 103 | "Price", |
| 104 | "stock_p_change", |
| 105 | "SP500", |
| 106 | "SP500_p_change", |
| 107 | ] + features |
| 108 | |
| 109 | df = pd.DataFrame(columns=df_columns) |
| 110 | |
| 111 | tickerfile_list = os.listdir("forward/") |
| 112 | |
| 113 | # Required in macOS to remove the hidden index file. |
| 114 | if ".DS_Store" in tickerfile_list: |
| 115 | tickerfile_list.remove(".DS_Store") |
| 116 | |
| 117 | # This is the actual parsing. This needs to be fixed every time yahoo changes their UI. |
| 118 | for tickerfile in tqdm(tickerfile_list, desc="Parsing progress:", unit="tickers"): |
| 119 | ticker = tickerfile.split(".html")[0].upper() |
| 120 | source = open(f"forward/{tickerfile}").read() |
| 121 | # Remove commas from the html to make parsing easier. |
| 122 | source = source.replace(",", "") |
| 123 | |
| 124 | # Regex search for the different variables in the html file, then append to value_list |
| 125 | value_list = [] |
| 126 | for variable in features: |
| 127 | try: |
| 128 | # Basically, look for the first number present after an occurrence of the variable |
| 129 | regex = ( |
| 130 | r">" |
| 131 | + re.escape(variable) |
| 132 | + r".*?(\-?\d+\.*\d*K?M?B?|N/A[\\n|\s]*|>0|NaN)%?" |
| 133 | r"(</td>|</span>)" |
| 134 | ) |
| 135 | value = re.search(regex, source, flags=re.DOTALL).group(1) |
| 136 | |
| 137 | # Dealing with number formatting |
| 138 | value_list.append(data_string_to_float(value)) |
| 139 | |
| 140 | # The data may not be present. Process accordingly. |
| 141 | except AttributeError: |
| 142 | value_list.append("N/A") |
| 143 | # print(ticker, variable) |
| 144 | |
| 145 | # Append the ticker and the features to the dataframe |
| 146 | new_df_row = [0, 0, ticker, 0, 0, 0, 0] + value_list |
| 147 | |
| 148 | df = df.append(dict(zip(df_columns, new_df_row)), ignore_index=True) |
| 149 |
no test coverage detected