MCPcopy Index your code
hub / github.com/VivekPa/AIAlpha / DataProcessing

Class DataProcessing

data_processor/data_processing.py:6–105  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

4
5
6class DataProcessing:
7 def __init__(self, split):
8 self.split = split
9
def make_features(self, file_path, window, csv_path, make_y=True, verbose=True, save_csv=False):
    """
    Builds lagged-return, moving-average and volatility features from a CSV
    :param file_path: CSV to load (path or file-like object); its first
        column becomes the index
    :param window: exclusive upper bound for the lag lengths; even lags
        2, 4, ... below ``window`` are generated
    :type window: int
    :param csv_path: directory that receives ``full_features.csv`` when
        ``save_csv`` is true
    :param make_y: accepted for interface compatibility; not used by this
        method
    :type make_y: boolean
    :param verbose: print column names, shapes and the head/tail of the
        result; it no longer affects the returned data
    :type verbose: boolean
    :param save_csv: write the resulting frame to
        ``{csv_path}/full_features.csv``
    :type save_csv: boolean
    :return: the feature frame with every row containing a NaN removed
    :rtype: pd.DataFrame
    :raises KeyError: if the CSV has no ``close`` or ``volume`` column
    """
    # Hand the argument to pandas untouched so paths and buffers both work.
    df = pd.read_csv(file_path, index_col=0)
    if verbose:
        print(df.shape)

    # Snapshot of the input columns taken before any feature is appended;
    # the first remaining column is deliberately left out, as in the source.
    base_cols = df.columns[1:]
    lags = range(2, window, 2)

    for col in base_cols:
        if verbose:
            print(col)
        one_step = np.log(df[col] / df[col].shift(1))
        df[f'{col}_ret1'] = one_step
        for j in lags:
            df[f'{col}_ret{j}'] = np.log(df[col] / df[col].shift(j))
            df[f'{col}_mavg{j}'] = one_step.rolling(j).mean()
            df[f'{col}_ewm{j}'] = one_step.ewm(span=j).mean()
        if window > 10:
            # NOTE(review): the source loops `for j in range(10, window, 5)`
            # but breaks after the first pass, so only the span-10 columns
            # are ever produced. That behaviour is kept; confirm whether the
            # break was meant to be temporary.
            span = 10
            df[f'{col}_vol{span}'] = one_step.rolling(span).std()
            df[f'{col}_ewmvol{span}'] = one_step.ewm(span=span).std()

    # Liquidity features use plain ratios, not log ratios, as in the source.
    liq = df['close'] * df['volume']
    df['liq'] = liq
    df['liq_ret1'] = liq / liq.shift(1)
    for j in lags:
        df[f'liq_ret{j}'] = liq / liq.shift(j)
        df[f'liq_mavg{j}'] = liq.rolling(j).mean()
        df[f'liq_ewm{j}'] = liq.ewm(span=j).mean()

    # Lagged and rolling features leave NaN in the leading rows. Dropping
    # them used to happen only when verbose was on, which made the returned
    # data depend on a logging flag.
    df = df.dropna()
    if verbose:
        print(df.shape)
        print(df.tail())
        print(df.head())
    if save_csv:
        df.to_csv(f'{csv_path}/full_features.csv')
    return df
52
53 def make_train_test(self, df_x, df_y, window, csv_path, has_y=False, binary_y=False, save_csv=False):
54 """
55 Splits the dataset into train and test
56 :param df_x: dataframe of x variables
57 :type df_x: pd.DataFrame
58 :param df_y: dataframe of y values
59 :type df_y: pd.DataFrame
60 :param window: the prediction window
61 :type window: int
62 :param has_y: whether df_y exists separately or is a column in df_x (must be 'target' column)
63 :type has_y: boolean

Callers 3

run.pyFile · 0.90
pca_auto.pyFile · 0.90
data_processing.pyFile · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected