MCPcopy
hub / github.com/kk7nc/Text_Classification / loadData_Tokenizer

Function loadData_Tokenizer

code/RCNN.py:14–42  ·  view source on GitHub ↗
(X_train, X_test,MAX_NB_WORDS=75000,MAX_SEQUENCE_LENGTH=500)

Source from the content-addressed store, hash-verified

12from keras.preprocessing.sequence import pad_sequences
13
def _load_glove_embeddings(path):
    """Parse a GloVe-style text file at *path* into a ``{word: vector}`` dict.

    Each non-empty line is split on whitespace; the first field is the word
    and the remaining fields are converted to a float32 NumPy array.
    Blank lines and lines whose numeric fields cannot be parsed are skipped.
    """
    embeddings_index = {}
    # ``with`` closes the handle even if reading or parsing raises.
    with open(path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: there is no word to index.
                continue
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed vector: skip the entry rather than storing the
                # previous word's coefficients under this word.
                continue
            embeddings_index[values[0]] = coefs
    return embeddings_index


def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500):
    """Tokenize train/test texts and load the GloVe word vectors.

    Args:
        X_train: Sequence of training texts.
        X_test: Sequence of test texts.
        MAX_NB_WORDS: Passed to ``Tokenizer`` as ``num_words``.
        MAX_SEQUENCE_LENGTH: Passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        A tuple ``(X_train, X_test, word_index, embeddings_index)`` where the
        first two items are the padded integer sequences for the train and
        test texts, ``word_index`` is the tokenizer's word index, and
        ``embeddings_index`` maps each word found in ``glove.6B.100d.txt``
        (read from the current working directory) to its float32 vector.

    Side effects:
        Seeds NumPy's global RNG with 7 and prints three progress lines.
    """
    # Function-scope import keeps this block self-contained.
    import os

    # Nothing below draws from the RNG; the seed is kept because code that
    # runs after this call may rely on the seeded global state.
    np.random.seed(7)

    n_train = len(X_train)
    text = np.concatenate((X_train, X_test), axis=0)

    # NOTE(review): the tokenizer is fitted on train and test texts together,
    # so the vocabulary includes test-set words — confirm this is intended.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    print(text.shape)

    # Rows keep the concatenation order, so the first n_train rows are the
    # training samples and the rest are the test samples.
    X_train = text[:n_train]
    X_test = text[n_train:]

    # os.path.join yields ".\glove.6B.100d.txt" on Windows (the original
    # literal) and "./glove.6B.100d.txt" elsewhere, without the invalid
    # "\g" escape sequence.
    glove_path = os.path.join(os.curdir, "glove.6B.100d.txt")
    embeddings_index = _load_glove_embeddings(glove_path)
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index, embeddings_index)
43
44
45def Build_Model_RCNN_Text(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=100):

Callers 1

RCNN.py · File · 0.70

Calls

no outgoing calls

Tested by

no test coverage detected