hub / github.com/GauravBh1010tt/DeepLearn / process_data

Function process_data

_deeplearn_utils/dl_text/dl.py:56–142 · view source on GitHub ↗

(sent_l,sent_r=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300)

Source from the content-addressed store, hash-verified

54	return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]
55
56	def process_data(sent_l,sent_r=None,wordVec_model=None,dimx=100,dimy=100,vocab_size=10000,embedding_dim=300):
57	sent1 = []
58	sent1.extend(sent_l)
59	if sent_r:
60	sent1.extend(sent_r)
61	# sent1 = [' '.join(i) for i in sent1]
62	sentence = ["%s %s %s" % (START,x,END) for x in sent1]
63	tokenize_sent = [regexp_tokenize(x,
64	pattern = '\w+\|$[\d\.]+\|\S+') for x in sentence]
65
66
67	freq = FreqDist(chain(*tokenize_sent))
68	print 'found ',len(freq),' unique words'
69	vocab = freq.most_common(vocab_size - 1)
70	index_to_word = [x[0] for x in vocab]
71	index_to_word.append(unk_token)
72
73	word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])
74
75	for i,sent in enumerate(tokenize_sent):
76	tokenize_sent[i] = [w if w in word_to_index else unk_token for w in sent]
77
78	len_train = len(sent_l)
79	text=[]
80	for i in tokenize_sent:
81	text.extend(i)
82
83	sentences_x = []
84	sentences_y = []
85
86	for sent in tokenize_sent[0:len_train]:
87	temp = [START for i in range(dimx)]
88	for ind,word in enumerate(sent[0:dimx]):
89	temp[ind] = word
90	sentences_x.append(temp)
91
92	X_data = []
93	for i in sentences_x:
94	temp = []
95	for j in i:
96	temp.append(word_to_index[j])
97	temp = np.array(temp).T
98	X_data.append(temp)
99
100	X_data = np.array(X_data)
101
102	if sent_r:
103	for sent in tokenize_sent[len_train:]:
104	temp = [START for i in range(dimy)]
105	for ind,word in enumerate(sent[0:dimy]):
106	temp[ind] = word
107	sentences_y.append(temp)
108
109	y_data=[]
110	for i in sentences_y:
111	temp = []
112	for j in i:
113	temp.append(word_to_index[j])

Callers

nothing calls this directly

Calls

no outgoing calls

Tested by

no test coverage detected