()
| 115 | |
| 116 | |
def get_normalized_data(path='../large_files/train.csv', n_test=1000):
    """Load a label-first CSV, shuffle it, split it and standardize the features.

    The first column of the CSV is used as the label and every remaining
    column as a feature.  Rows are shuffled with ``np.random.shuffle`` (so the
    split depends on NumPy's global random state), the last ``n_test`` rows
    are held out as the test set, and both splits are standardized with the
    per-column mean and standard deviation of the *training* rows only.
    Columns whose training standard deviation is zero are divided by 1
    instead, so constant columns become all zeros rather than NaN.

    Args:
        path: Location of the CSV file.  Defaults to the path the original
            implementation hard-coded.
        n_test: Number of rows to hold out for testing.  Must satisfy
            ``0 <= n_test < number_of_rows`` so the training split is never
            empty.

    Returns:
        A tuple ``(Xtrain, Xtest, Ytrain, Ytest)`` of float32 NumPy arrays.

    Raises:
        SystemExit: with exit status 1 if ``path`` does not exist (after
            printing download instructions).
        ValueError: if ``n_test`` leaves no rows for training.
    """
    print("Reading in and transforming data...")

    if not os.path.exists(path):
        print('Looking for ' + path)
        print('You have not downloaded the data and/or not placed the files in the correct location.')
        print('Please get the data from: https://www.kaggle.com/c/digit-recognizer')
        print('Place train.csv in the folder large_files adjacent to the class folder')
        # The bare exit() helper is injected by the `site` module for
        # interactive use and exits with status 0; raise SystemExit directly
        # so this works everywhere and signals failure to the caller/shell.
        raise SystemExit(1)

    df = pd.read_csv(path)
    data = df.values.astype(np.float32)

    n_rows = len(data)
    if not 0 <= n_test < n_rows:
        # Without this guard the training split would be empty and the
        # normalization statistics below would silently become NaN.
        raise ValueError(
            'n_test must be in [0, %d) for a file with %d rows, got %r'
            % (n_rows, n_rows, n_test)
        )

    np.random.shuffle(data)  # in-place shuffle of the rows
    X = data[:, 1:]
    Y = data[:, 0]

    # Use an explicit split index: X[:-n_test] would be empty for n_test == 0.
    split = n_rows - n_test
    Xtrain = X[:split]
    Ytrain = Y[:split]
    Xtest = X[split:]
    Ytest = Y[split:]

    # normalize the data using training-set statistics only
    mu = Xtrain.mean(axis=0)
    std = Xtrain.std(axis=0)
    np.place(std, std == 0, 1)  # avoid division by zero on constant columns
    Xtrain = (Xtrain - mu) / std
    Xtest = (Xtest - mu) / std

    return Xtrain, Xtest, Ytrain, Ytest
| 146 | |
| 147 | |
| 148 | def plot_cumulative_variance(pca): |
no outgoing calls
no test coverage detected