| 159 | |
| 160 | |
def _read_trees(path, word2idx):
    """Parse each non-empty line of `path` with str2tree and return the results as a list."""
    trees = []
    # The context manager closes the file even if str2tree raises part-way through.
    with open(path) as f:
        for line in f:
            line = line.rstrip()
            if line:
                trees.append(str2tree(line, word2idx))
    return trees


def get_ptb_data():
    """Load the train and test tree files from ../large_files/trees.

    Each non-empty line of train.txt and test.txt is handed to `str2tree`
    together with a single shared `word2idx` dict (train first, then test).
    Presumably `str2tree` returns a Tree object and fills `word2idx` as it
    meets new words -- confirm against its definition.

    Note: the paths are relative, so they are resolved against the current
    working directory of the process.

    Returns:
        (train, test, word2idx): the list of parsed training trees, the list
        of parsed test trees, and the dict that was passed to `str2tree`.

    Raises:
        SystemExit: if the data folder, train.txt or test.txt is missing
        (after printing download instructions).
    """
    # Function-scope import so this block does not depend on a file-level `import sys`.
    import sys

    folder = '../large_files/trees'
    train_path = '../large_files/trees/train.txt'
    test_path = '../large_files/trees/test.txt'
    download_hint = "Please download the data from http://nlp.stanford.edu/sentiment/"

    # Fail fast with instructions; only the first missing item is reported.
    if not os.path.exists(folder):
        print("Please create ../large_files/trees relative to this file.")
        print("train.txt and test.txt should be stored in there.")
        print(download_hint)
        sys.exit()
    elif not os.path.exists(train_path):
        print("train.txt is not in ../large_files/trees/train.txt")
        print(download_hint)
        sys.exit()
    elif not os.path.exists(test_path):
        print("test.txt is not in ../large_files/trees/test.txt")
        print(download_hint)
        sys.exit()

    # One mapping shared by both splits; train must be parsed first so the
    # call order into str2tree matches the original behaviour.
    word2idx = {}
    train = _read_trees(train_path, word2idx)
    test = _read_trees(test_path, word2idx)
    return train, test, word2idx
| 203 | |
| 204 | # get_ptb_data() |