Loads the dataset :type path: String :param path: The path to the dataset (here IMDB) :type n_words: int :param n_words: The number of words to keep in the vocabulary. All extra words are set to unknown (1). :type valid_portion: float :param valid_portion: The proporti
(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
sort_by_len=True)
| 80 | |
| 81 | |
| 82 | def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None, |
| 83 | sort_by_len=True): |
| 84 | '''Loads the dataset |
| 85 | |
| 86 | :type path: String |
| 87 | :param path: The path to the dataset (here IMDB) |
| 88 | :type n_words: int |
| 89 | :param n_words: The number of words to keep in the vocabulary. |
| 90 | All extra words are set to unknown (1). |
| 91 | :type valid_portion: float |
| 92 | :param valid_portion: The proportion of the full train set used for |
| 93 | the validation set. |
| 94 | :type maxlen: None or positive int |
| 95 | :param maxlen: the max sequence length we use in the train/valid set. |
| 96 | :type sort_by_len: bool |
| 97 | :param sort_by_len: Sort by the sequence length for the train, |
| 98 | valid and test set. This allows faster execution as it causes |
| 99 | less padding per minibatch. Another mechanism must be used to |
| 100 | shuffle the train set at each epoch. |
| 101 | |
| 102 | ''' |
| 103 | |
| 104 | ############# |
| 105 | # LOAD DATA # |
| 106 | ############# |
| 107 | |
| 108 | # Load the dataset |
| 109 | path = get_dataset_file( |
| 110 | path, "imdb.pkl", |
| 111 | "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl") |
| 112 | |
| 113 | if path.endswith(".gz"): |
| 114 | f = gzip.open(path, 'rb') |
| 115 | else: |
| 116 | f = open(path, 'rb') |
| 117 | |
| 118 | train_set = pickle.load(f) |
| 119 | test_set = pickle.load(f) |
| 120 | f.close() |
| 121 | if maxlen: |
| 122 | new_train_set_x = [] |
| 123 | new_train_set_y = [] |
| 124 | for x, y in zip(train_set[0], train_set[1]): |
| 125 | if len(x) < maxlen: |
| 126 | new_train_set_x.append(x) |
| 127 | new_train_set_y.append(y) |
| 128 | train_set = (new_train_set_x, new_train_set_y) |
| 129 | del new_train_set_x, new_train_set_y |
| 130 | |
| 131 | # split training set into validation set |
| 132 | train_set_x, train_set_y = train_set |
| 133 | n_samples = len(train_set_x) |
| 134 | sidx = numpy.random.permutation(n_samples) |
| 135 | n_train = int(numpy.round(n_samples * (1. - valid_portion))) |
| 136 | valid_set_x = [train_set_x[s] for s in sidx[n_train:]] |
| 137 | valid_set_y = [train_set_y[s] for s in sidx[n_train:]] |
| 138 | train_set_x = [train_set_x[s] for s in sidx[:n_train]] |
| 139 | train_set_y = [train_set_y[s] for s in sidx[:n_train]] |
no test coverage detected