| 12 | from keras.preprocessing.sequence import pad_sequences |
| 13 | |
def _load_glove_embeddings(path):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each line is expected to hold a token followed by its whitespace-separated
    vector components. Blank lines and lines whose components cannot be parsed
    as floats are skipped instead of being stored with a wrong vector.

    Args:
        path: Location of the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each token to a ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
    """
    embeddings_index = {}
    # ``with`` guarantees the handle is closed even if reading fails midway.
    with open(path, encoding="utf8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Blank line: there is no token to index.
                continue
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed vector: skip the entry rather than reuse the
                # coefficients parsed for the previous word.
                continue
            embeddings_index[values[0]] = coefs
    return embeddings_index


def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000,
                       MAX_SEQUENCE_LENGTH=500, glove_path=None):
    """Tokenize and pad the train/test texts and load GloVe word vectors.

    A single ``Tokenizer`` is fitted on the concatenation of ``X_train`` and
    ``X_test``, so both splits share one vocabulary. The texts are converted
    to integer sequences and passed through ``pad_sequences`` with
    ``maxlen=MAX_SEQUENCE_LENGTH``.

    Args:
        X_train: Sequence of training texts.
        X_test: Sequence of test texts.
        MAX_NB_WORDS: Passed to ``Tokenizer`` as ``num_words``.
        MAX_SEQUENCE_LENGTH: Passed to ``pad_sequences`` as ``maxlen``.
        glove_path: Optional path of the GloVe text file. Defaults to
            ``glove.6B.100d.txt`` in the current working directory.

    Returns:
        Tuple ``(X_train, X_test, word_index, embeddings_index)``: the padded
        training rows, the padded test rows, the tokenizer's ``word_index``
        and a dict mapping each GloVe token to its ``float32`` vector.

    Raises:
        OSError: If the GloVe file cannot be opened.
    """
    import os  # local import: the file's import header is not fully visible here

    # Seeds NumPy's global RNG. Nothing below draws random numbers (the
    # shuffle that used to follow is disabled), but the side effect is kept
    # in case callers rely on it -- NOTE(review): confirm and drop if unused.
    np.random.seed(7)

    # Remember the split point before X_train is rebound to the padded rows.
    n_train = len(X_train)

    text = np.concatenate((X_train, X_test), axis=0)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    print(text.shape)

    # Rows are kept in input order, so the first n_train rows are the
    # training samples and the remainder are the test samples.
    X_train = text[:n_train]
    X_test = text[n_train:]

    if glove_path is None:
        # os.path.join yields ".\glove.6B.100d.txt" on Windows (the original
        # hard-coded value) and "./glove.6B.100d.txt" elsewhere, and avoids
        # the invalid "\g" escape of the original literal.
        glove_path = os.path.join(os.curdir, "glove.6B.100d.txt")

    embeddings_index = _load_glove_embeddings(glove_path)
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index, embeddings_index)
| 43 | |
| 44 | |
| 45 | def Build_Model_RCNN_Text(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=100): |