hub / github.com/VivekPa/AIAlpha / DataProcessing

Class DataProcessing

data_processor/data_processing.py:6–105 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

4
5
6	class DataProcessing:
7	def __init__(self, split):
8	self.split = split
9
def make_features(self, file_path, window, csv_path, make_y=True, verbose=True, save_csv=False):
    """
    Build return, moving-average, volatility and liquidity features from a CSV.

    The CSV is read with its first column as the index. Log-return based
    features are derived for one column (the second column of the loaded
    frame) and liquidity features are derived from ``close * volume``.
    Rows that contain NaN -- the warm-up rows produced by ``shift`` and
    ``rolling`` -- are always dropped before the frame is returned or saved;
    this no longer depends on ``verbose``.

    :param file_path: path of the input CSV
    :type file_path: str
    :param window: exclusive upper bound for the look-back lengths; lags
        ``2, 4, ...`` below it drive the return/average features and lags
        ``10, 15, ...`` below it drive the volatility features
    :type window: int
    :param csv_path: directory that receives ``full_features.csv`` when
        ``save_csv`` is set
    :type csv_path: str
    :param make_y: unused; kept so existing callers keep working
    :type make_y: boolean
    :param verbose: print the frame shape, the processed column name and the
        tail/head of the result
    :type verbose: boolean
    :param save_csv: write the result to ``full_features.csv`` in ``csv_path``
    :type save_csv: boolean
    :return: the input frame extended with the feature columns, NaN rows removed
    :rtype: pd.DataFrame
    :raises KeyError: if the input has no ``close`` or ``volume`` column
    """
    import os  # function-scope import: the file-level import block is not visible here

    df = pd.read_csv(file_path, index_col=0)

    # Fail early with a clear message instead of after all the feature work.
    missing = [name for name in ('close', 'volume') if name not in df.columns]
    if missing:
        raise KeyError(f"input CSV is missing required column(s): {missing}")

    if verbose:
        print(df.shape)

    # NOTE(review): the listing this was restored from had lost its
    # indentation. The `break` that followed the volatility loop is read as
    # ending the per-column loop after its first pass, so only the second
    # column of the frame gets return features. Confirm against upstream.
    for col in df.columns[1:2]:
        if verbose:
            print(col)
        ret1 = np.log(df[col] / df[col].shift(1))
        df[f'{col}_ret1'] = ret1
        for lag in range(2, window, 2):
            df[f'{col}_ret{lag}'] = np.log(df[col] / df[col].shift(lag))
            df[f'{col}_mavg{lag}'] = ret1.rolling(lag).mean()
            df[f'{col}_ewm{lag}'] = ret1.ewm(span=lag).mean()
        # range() is already empty when window <= 10, so no extra guard is needed.
        for lag in range(10, window, 5):
            df[f'{col}_vol{lag}'] = ret1.rolling(lag).std()
            df[f'{col}_ewmvol{lag}'] = ret1.ewm(span=lag).std()

    # Liquidity features use plain ratios and levels (not log returns).
    liq = df['close'] * df['volume']
    df['liq'] = liq
    df['liq_ret1'] = liq / liq.shift(1)
    for lag in range(2, window, 2):
        df[f'liq_ret{lag}'] = liq / liq.shift(lag)
        df[f'liq_mavg{lag}'] = liq.rolling(lag).mean()
        df[f'liq_ewm{lag}'] = liq.ewm(span=lag).mean()

    # Dropping the warm-up rows shapes the returned data, so it must not be
    # tied to the logging flag.
    df = df.dropna()
    if verbose:
        print(df.shape)
        print(df.tail())
        print(df.head())

    if save_csv:
        df.to_csv(os.path.join(csv_path, 'full_features.csv'))
    return df
52
53	def make_train_test(self, df_x, df_y, window, csv_path, has_y=False, binary_y=False, save_csv=False):
54	"""
55	Splits the dataset into train and test
56	:param df_x: dataframe of x variables
57	:type df_x: pd.DataFrame
58	:param df_y: dataframe of y values
59	:type df_y: pd.DataFrame
60	:param window: the prediction window
61	:type window: int
62	:param has_y: whether df_y exists separately or is a column in df_x (must be 'target' column)
63	:type has_y: boolean

Callers 3

run.py — File · 0.90

pca_auto.py — File · 0.90

data_processing.py — File · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected