(sent_l,sent_r=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300)
| 54 | return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()] |
| 55 | |
| 56 | def process_data(sent_l,sent_r=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300): |
| 57 | sent1 = [] |
| 58 | sent1.extend(sent_l) |
| 59 | if sent_r: |
| 60 | sent1.extend(sent_r) |
| 61 | # sent1 = [' '.join(i) for i in sent1] |
| 62 | sentence = ["%s %s %s" % (START,x,END) for x in sent1] |
| 63 | tokenize_sent = [regexp_tokenize(x, |
| 64 | pattern = '\w+|$[\d\.]+|\S+') for x in sentence] |
| 65 | |
| 66 | |
| 67 | freq = FreqDist(chain(*tokenize_sent)) |
| 68 | print 'found ',len(freq),' unique words' |
| 69 | vocab = freq.most_common(vocab_size - 1) |
| 70 | index_to_word = [x[0] for x in vocab] |
| 71 | index_to_word.append(unk_token) |
| 72 | |
| 73 | word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)]) |
| 74 | |
| 75 | for i,sent in enumerate(tokenize_sent): |
| 76 | tokenize_sent[i] = [w if w in word_to_index else unk_token for w in sent] |
| 77 | |
| 78 | len_train = len(sent_l) |
| 79 | text=[] |
| 80 | for i in tokenize_sent: |
| 81 | text.extend(i) |
| 82 | |
| 83 | sentences_x = [] |
| 84 | sentences_y = [] |
| 85 | |
| 86 | for sent in tokenize_sent[0:len_train]: |
| 87 | temp = [START for i in range(dimx)] |
| 88 | for ind,word in enumerate(sent[0:dimx]): |
| 89 | temp[ind] = word |
| 90 | sentences_x.append(temp) |
| 91 | |
| 92 | X_data = [] |
| 93 | for i in sentences_x: |
| 94 | temp = [] |
| 95 | for j in i: |
| 96 | temp.append(word_to_index[j]) |
| 97 | temp = np.array(temp).T |
| 98 | X_data.append(temp) |
| 99 | |
| 100 | X_data = np.array(X_data) |
| 101 | |
| 102 | if sent_r: |
| 103 | for sent in tokenize_sent[len_train:]: |
| 104 | temp = [START for i in range(dimy)] |
| 105 | for ind,word in enumerate(sent[0:dimy]): |
| 106 | temp[ind] = word |
| 107 | sentences_y.append(temp) |
| 108 | |
| 109 | y_data=[] |
| 110 | for i in sentences_y: |
| 111 | temp = [] |
| 112 | for j in i: |
| 113 | temp.append(word_to_index[j]) |
nothing calls this directly
no outgoing calls
no test coverage detected