We have downloaded a large number of HTML files, which are snapshots of a ticker at different times, containing the fundamental data (our features). To extract the key statistics, we use regex. For supervised machine learning, we also need the data that will form our dependent variable: the performance of the stock compared to the SP500.
(sp500_df, stock_df)
| 86 | |
| 87 | |
| 88 | def parse_keystats(sp500_df, stock_df): |
| 89 | """ |
| 90 | We have downloaded a large number of html files, which are snapshots of a ticker at different times, |
| 91 | containing the fundamental data (our features). To extract the key statistics, we use regex. |
| 92 | For supervised machine learning, we also need the data that will form our dependent variable, |
| 93 | the performance of the stock compared to the SP500. |
| 94 | :sp500_df: dataframe containing SP500 prices |
| 95 | :stock_df: dataframe containing stock prices |
| 96 | :return: a dataframe of training data (i.e features and the components of our dependent variable) |
| 97 | """ |
| 98 | # The tickers whose data is to be parsed. |
| 99 | stock_list = [x[0] for x in os.walk(statspath)] |
| 100 | stock_list = stock_list[1:] |
| 101 | |
| 102 | # Creating a new dataframe which we will later fill. |
| 103 | df_columns = [ |
| 104 | "Date", |
| 105 | "Unix", |
| 106 | "Ticker", |
| 107 | "Price", |
| 108 | "stock_p_change", |
| 109 | "SP500", |
| 110 | "SP500_p_change", |
| 111 | ] + features |
| 112 | |
| 113 | df = pd.DataFrame(columns=df_columns) |
| 114 | |
| 115 | # tqdm is a simple progress bar |
| 116 | for stock_directory in tqdm(stock_list, desc="Parsing progress:", unit="tickers"): |
| 117 | keystats_html_files = os.listdir(stock_directory) |
| 118 | |
| 119 | # Snippet to get rid of the .DS_Store file in macOS |
| 120 | if ".DS_Store" in keystats_html_files: |
| 121 | keystats_html_files.remove(".DS_Store") |
| 122 | |
| 123 | ticker = stock_directory.split(statspath)[1] |
| 124 | |
| 125 | for file in keystats_html_files: |
| 126 | # Convert the datetime format of our file to unix time |
| 127 | date_stamp = datetime.strptime(file, "%Y%m%d%H%M%S.html") |
| 128 | unix_time = time.mktime(date_stamp.timetuple()) |
| 129 | |
| 130 | # Read in the html file as a string. |
| 131 | full_file_path = stock_directory + "/" + file |
| 132 | |
| 133 | # This will store the parsed values |
| 134 | value_list = [] |
| 135 | |
| 136 | with open(full_file_path, "r") as source: |
| 137 | source = source.read() |
| 138 | # Remove commas from the html to make parsing easier. |
| 139 | source = source.replace(",", "") |
| 140 | |
| 141 | # Regex search for the different variables in the html file, then append to value_list |
| 142 | for variable in features: |
| 143 | # Search for the table entry adjacent to the variable name. |
| 144 | try: |
| 145 | regex = ( |
no test coverage detected