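Splits the dataset into train and test :param df_x: dataframe of x variables :type df_x: pd.DataFrame :param df_y: dataframe of y values :type df_y: pd.DataFrame :param window: the prediction window :type window: int :param has_y: whether df_y exists separately or is a column in df_x (must be 'target' column) :type has_y: boolean :return: train_x, train_y, test_x, test_y :rtype: pd.DataFrames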
(self, df_x, df_y, window, csv_path, has_y=False, binary_y=False, save_csv=False)
| 51 | return df |
| 52 | |
def make_train_test(self, df_x, df_y, window, csv_path, has_y=False, binary_y=False, save_csv=False):
    """
    Builds the target values and splits features/targets into train and test.

    The split is positional (no shuffling): the first
    ``int(len(y_values) * self.split)`` rows are the training set and every
    remaining row is the test set.

    :param df_x: dataframe of x variables; must contain a 'close' column when has_y is False
    :type df_x: pd.DataFrame
    :param df_y: dataframe of y values (single column); only read when has_y is True
    :type df_y: pd.DataFrame
    :param window: the prediction window; 0 uses the raw 'close' column as target,
        otherwise the target is log(close[t] / close[t + window])
    :type window: int
    :param csv_path: sub-directory of data/processed_data the CSV files are written to
    :type csv_path: str
    :param has_y: True when the targets are supplied separately in df_y, False when
        they are derived from the 'close' column of df_x
    :type has_y: boolean
    :param binary_y: replace every target by its sign (-1, 0 or 1)
    :type binary_y: boolean
    :param save_csv: write the full/train/test frames to CSV files
    :type save_csv: boolean
    :return: fulldata, y_values, train_x, train_y, test_x, test_y
    :rtype: tuple of pd.DataFrame / pd.Series
    """
    if has_y:
        y_values = df_y.copy()
        y_values.columns = ['y_values']
        fulldata = df_x.copy()
    elif window == 0:
        # A Series has no columns; name it instead of setting a stray attribute.
        y_values = df_x['close'].copy().rename('y_values')
        fulldata = df_x.copy()
    else:
        close = df_x['close']
        # NOTE(review): this is log(p_t / p_{t+window}), i.e. the negative of the
        # usual forward log-return -- kept as-is, confirm the sign is intended.
        log_ratio = np.log(close / close.shift(-window))
        # Keep exactly the rows that have a target so features and targets stay
        # aligned even when 'close' contains NaNs or window is negative.
        valid = log_ratio.notna().to_numpy()
        y_values = log_ratio.loc[valid].rename('y_values')
        fulldata = df_x.loc[valid, :].copy()

    if binary_y:
        # Sign maps <0 -> -1, >0 -> 1, ==0 -> 0 and works for both Series and DataFrame.
        y_values = np.sign(y_values)

    print(y_values.shape)
    print(fulldata.shape)

    # One shared boundary: the test set starts right where the training set
    # ends, so no row is dropped between the two.
    split_idx = int(len(y_values) * self.split)
    train_y = y_values.iloc[:split_idx]
    test_y = y_values.iloc[split_idx:]

    train_x = fulldata.iloc[:split_idx, :]
    test_x = fulldata.iloc[split_idx:len(y_values), :]

    print(train_y.shape)
    print(train_x.shape)

    if save_csv:
        import os

        out_dir = f'data/processed_data/{csv_path}'
        # to_csv does not create missing directories.
        os.makedirs(out_dir, exist_ok=True)
        train_x.to_csv(f'{out_dir}/train_x.csv')
        train_y.to_csv(f'{out_dir}/train_y.csv', header=['y_values'])
        test_x.to_csv(f'{out_dir}/test_x.csv')
        test_y.to_csv(f'{out_dir}/test_y.csv', header=['y_values'])
        fulldata.to_csv(f'{out_dir}/full_x.csv')
        y_values.to_csv(f'{out_dir}/full_y.csv', header=['y_values'])
    return fulldata, y_values, train_x, train_y, test_x, test_y
| 103 | |
def check_labels(self, y_values):
    """
    Prints how often each distinct label value occurs.

    :param y_values: targets, either a dataframe with a 'y_values' column or a
        one-dimensional series of labels
    :type y_values: pd.DataFrame or pd.Series
    """
    # A Series is already the label column; indexing it with 'y_values' would
    # be an index lookup and raise KeyError.
    labels = y_values if y_values.ndim == 1 else y_values['y_values']
    print(labels.value_counts())
no outgoing calls
no test coverage detected