| 10 | |
| 11 | |
def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500,
                       GloVe_file="C:\\Users\\kamran\\Documents\\GitHub\\RMDL\\Examples\\Glove\\glove.6B.50d.txt"):
    """Tokenize train/test texts and load pre-trained word vectors from disk.

    The tokenizer is fitted on the concatenation of ``X_train`` and
    ``X_test``, so both splits share a single word index. Every text is
    converted to a sequence of word ids and passed through
    ``pad_sequences`` with ``maxlen=MAX_SEQUENCE_LENGTH``.

    NOTE(review): ``Tokenizer`` and ``pad_sequences`` are not defined in this
    block; presumably they are the Keras text-preprocessing utilities
    imported at the top of the file -- confirm.

    Args:
        X_train: Sequence of training texts.
        X_test: Sequence of test texts.
        MAX_NB_WORDS: Passed to the tokenizer as ``num_words``.
        MAX_SEQUENCE_LENGTH: Passed to ``pad_sequences`` as ``maxlen``.
        GloVe_file: Path to a whitespace-separated embedding text file, one
            ``word v1 v2 ...`` entry per line. Defaults to the location that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        A tuple ``(X_train, X_test, word_index, embeddings_index)`` where the
        first two items are the padded id matrices for each split,
        ``word_index`` is the tokenizer's word -> id mapping, and
        ``embeddings_index`` maps each word found in ``GloVe_file`` to a
        float32 NumPy vector.

    Raises:
        OSError: If ``GloVe_file`` cannot be opened.
    """
    # Seeds NumPy's global RNG; kept because callers may rely on this side
    # effect for reproducibility of later random operations.
    np.random.seed(7)

    # Remember the split point before X_train is rebound below.
    n_train = len(X_train)

    text = np.concatenate((X_train, X_test), axis=0)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    print(text.shape)

    # Rows keep their original order, so the first n_train rows are the
    # training texts and the remainder are the test texts.
    X_train = text[:n_train]
    X_test = text[n_train:]

    embeddings_index = {}
    # The context manager guarantees the file is closed even if parsing fails.
    with open(GloVe_file, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: there is no word to index.
                continue
            word = values[0]
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed entry (non-numeric component). Skip it rather
                # than storing the previous word's vector under this word.
                continue
            embeddings_index[word] = coefs

    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index, embeddings_index)
| 42 | |
| 43 | |
| 44 | |