| 43 | |
| 44 | |
class WordVectorizer(object):
    """Map ``'word/POS'`` tokens to a (word vector, one-hot POS vector) pair.

    The vocabulary and its vectors are loaded once at construction time and
    kept in ``self.word2vec`` (word -> row of the loaded vector array).
    Lookups rely on the module-level ``POS_enumerator`` (POS tag -> index,
    must contain ``'OTHER'``) and ``VIP_dict`` (tag -> collection of words).
    """

    def __init__(self, meta_root, prefix):
        """Load the vector table and vocabulary files found under ``meta_root``.

        Three files are read:
        ``<prefix>_data.npy``  - array indexed by word index,
        ``<prefix>_words.pkl`` - iterable of vocabulary words,
        ``<prefix>_idx.pkl``   - mapping from word to its index in the array.

        Args:
            meta_root: Directory containing the three files.
            prefix: Common filename prefix of the three files.
        """
        vectors = np.load(pjoin(meta_root, '%s_data.npy' % prefix))
        # NOTE(security): unpickling can execute arbitrary code; these files
        # must come from a trusted source.
        # Context managers close the handles deterministically instead of
        # leaving them to the garbage collector.
        with open(pjoin(meta_root, '%s_words.pkl' % prefix), 'rb') as words_file:
            words = pickle.load(words_file)
        with open(pjoin(meta_root, '%s_idx.pkl' % prefix), 'rb') as idx_file:
            word2idx = pickle.load(idx_file)
        self.word2vec = {w: vectors[word2idx[w]] for w in words}

    def _get_pos_ohot(self, pos):
        """Return a one-hot vector of length ``len(POS_enumerator)`` for ``pos``.

        Tags missing from ``POS_enumerator`` are encoded as ``'OTHER'``.
        """
        pos_vec = np.zeros(len(POS_enumerator))
        if pos in POS_enumerator:
            pos_vec[POS_enumerator[pos]] = 1
        else:
            pos_vec[POS_enumerator['OTHER']] = 1
        return pos_vec

    def __len__(self):
        """Return the number of words in the vocabulary."""
        return len(self.word2vec)

    def __getitem__(self, item):
        """Return ``(word_vec, pos_vec)`` for a token of the form ``'word/POS'``.

        Args:
            item: String holding a word and its POS tag separated by ``'/'``.

        Returns:
            A tuple of the word's vector and its one-hot POS vector. Words
            not in the vocabulary yield the ``'unk'`` vector together with
            the ``'OTHER'`` POS vector.

        Raises:
            ValueError: If ``item`` contains no ``'/'`` separator.
            KeyError: If the word is unknown and the vocabulary has no
                ``'unk'`` entry.
        """
        # Split on the LAST slash only, so a word that itself contains '/'
        # (e.g. '1/2/NUM') is parsed instead of failing the unpack.
        word, pos = item.rsplit('/', 1)
        if word not in self.word2vec:
            return self.word2vec['unk'], self._get_pos_ohot('OTHER')

        # A word listed in VIP_dict takes that entry's key as its tag,
        # overriding the tag supplied in the token (first match wins).
        vip_pos = next(
            (key for key, values in VIP_dict.items() if word in values),
            None,
        )
        effective_pos = pos if vip_pos is None else vip_pos
        return self.word2vec[word], self._get_pos_ohot(effective_pos)