| 4 | |
| 5 | |
| 6 | class DataProcessing: |
def __init__(self, split):
    """
    Keeps the split setting on the instance
    :param split: value stored unchanged as self.split (it is not interpreted here)
    """
    self.split = split
| 9 | |
def make_features(self, file_path, window, csv_path, make_y=True, verbose=True, save_csv=False):
    """
    Builds return, moving-average, volatility and liquidity features from a price csv
    :param file_path: csv file to load; its first column becomes the index
    :type file_path: str
    :param window: exclusive upper bound of the lookback lengths (even lags 2, 4, ... < window)
    :type window: int
    :param csv_path: directory that receives full_features.csv when save_csv is set
    :type csv_path: str
    :param make_y: currently unused by this method
    :type make_y: boolean
    :param verbose: whether to print progress and the shape/tail/head of the result
    :type verbose: boolean
    :param save_csv: whether to write the result to f'{csv_path}/full_features.csv'
    :type save_csv: boolean
    :return: the loaded columns plus the feature columns, with every row containing NaN removed
    :rtype: pd.DataFrame
    :raises KeyError: if the csv has no 'close' or 'volume' column
    """
    df = pd.read_csv(file_path, index_col=0)
    if verbose:
        print(df.shape)

    # Fail before doing any work: the liquidity features below need both columns.
    missing = [name for name in ('close', 'volume') if name not in df.columns]
    if missing:
        raise KeyError(f"missing required column(s): {missing}")

    even_lags = range(2, window, 2)
    # NOTE(review): the original volatility loop iterated range(10, window, 5) but
    # exited after its first pass, so only span 10 is ever produced. That behaviour
    # is kept here -- confirm whether spans 15, 20, ... were intended.
    vol_span = 10 if window > 10 else None

    # Snapshot the source columns so the features added below are not themselves
    # turned into inputs. The first column after the index is skipped.
    for col in list(df.columns[1:]):
        if verbose:
            print(col)
        ret1 = np.log(df[col] / df[col].shift(1))
        df[f'{col}_ret1'] = ret1
        for lag in even_lags:
            df[f'{col}_ret{lag}'] = np.log(df[col] / df[col].shift(lag))
            df[f'{col}_mavg{lag}'] = ret1.rolling(lag).mean()
            df[f'{col}_ewm{lag}'] = ret1.ewm(span=lag).mean()
        if vol_span is not None:
            df[f'{col}_vol{vol_span}'] = ret1.rolling(vol_span).std()
            df[f'{col}_ewmvol{vol_span}'] = ret1.ewm(span=vol_span).std()

    # Liquidity is close * volume. Unlike the per-column features above, its
    # "ret" columns are plain ratios (no log) and its averages are taken over
    # the liquidity level itself. Liquidity volatility features were disabled
    # (commented out) in the original and remain omitted.
    liq = df['close'] * df['volume']
    df['liq'] = liq
    df['liq_ret1'] = liq / liq.shift(1)
    for lag in even_lags:
        df[f'liq_ret{lag}'] = liq / liq.shift(lag)
        df[f'liq_mavg{lag}'] = liq.rolling(lag).mean()
        df[f'liq_ewm{lag}'] = liq.ewm(span=lag).mean()

    # Always drop the warm-up rows: previously this only happened when verbose
    # was set, so a logging flag changed the data that was returned.
    df = df.dropna()
    if verbose:
        print(df.shape)
        print(df.tail())
        print(df.head())
    if save_csv:
        df.to_csv(f'{csv_path}/full_features.csv')
    return df
| 52 | |
| 53 | def make_train_test(self, df_x, df_y, window, csv_path, has_y=False, binary_y=False, save_csv=False): |
| 54 | """ |
| 55 | Splits the dataset into train and test |
| 56 | :param df_x: dataframe of x variables |
| 57 | :type df_x: pd.DataFrame |
| 58 | :param df_y: dataframe of y values |
| 59 | :type df_y: pd.DataFrame |
| 60 | :param window: the prediction window |
| 61 | :type window: int |
| 62 | :param has_y: whether df_y exists separately or is a column in df_x (must be 'target' column) |
| 63 | :type has_y: boolean |
no outgoing calls
no test coverage detected