| 29 | # 40 files --> half right but 50 is better |
| 30 | |
| 31 | class Glove: |
def __init__(self, D, V, context_sz):
    """Record the model hyperparameters on the instance.

    D          -- stored as self.D (presumably the embedding
                  dimensionality; confirm against the training code).
    V          -- stored as self.V; fit() uses it as the side length of
                  the square co-occurrence matrix.
    context_sz -- stored as self.context_sz; fit() uses it as the number
                  of positions to look left/right of each token.
    """
    # Single unpacking assignment; targets are bound left to right.
    self.D, self.V, self.context_sz = D, V, context_sz
| 36 | |
| 37 | def fit(self, sentences, cc_matrix=None, learning_rate=1e-4, reg=0.1, xmax=100, alpha=0.75, epochs=10, gd=False): |
| 38 | # build co-occurrence matrix |
| 39 | # paper calls it X, so we will call it X, instead of calling |
| 40 | # the training data X |
| 41 | # TODO: would it be better to use a sparse matrix? |
| 42 | t0 = datetime.now() |
| 43 | V = self.V |
| 44 | D = self.D |
| 45 | |
| 46 | if not os.path.exists(cc_matrix): |
| 47 | X = np.zeros((V, V)) |
| 48 | N = len(sentences) |
| 49 | print("number of sentences to process:", N) |
| 50 | it = 0 |
| 51 | for sentence in sentences: |
| 52 | it += 1 |
| 53 | if it % 10000 == 0: |
| 54 | print("processed", it, "/", N) |
| 55 | n = len(sentence) |
| 56 | for i in range(n): |
| 57 | # i is not the word index!!! |
| 58 | # j is not the word index!!! |
| 59 | # i just points to which element of the sequence (sentence) we're looking at |
| 60 | wi = sentence[i] |
| 61 | |
| 62 | start = max(0, i - self.context_sz) |
| 63 | end = min(n, i + self.context_sz) |
| 64 | |
| 65 | # we can either choose only one side as context, or both |
| 66 | # here we are doing both |
| 67 | |
| 68 | # make sure "start" and "end" tokens are part of some context |
| 69 | # otherwise their f(X) will be 0 (denominator in bias update) |
| 70 | if i - self.context_sz < 0: |
| 71 | points = 1.0 / (i + 1) |
| 72 | X[wi,0] += points |
| 73 | X[0,wi] += points |
| 74 | if i + self.context_sz > n: |
| 75 | points = 1.0 / (n - i) |
| 76 | X[wi,1] += points |
| 77 | X[1,wi] += points |
| 78 | |
| 79 | # left side |
| 80 | for j in range(start, i): |
| 81 | wj = sentence[j] |
| 82 | points = 1.0 / (i - j) # this is +ve |
| 83 | X[wi,wj] += points |
| 84 | X[wj,wi] += points |
| 85 | |
| 86 | # right side |
| 87 | for j in range(i + 1, end): |
| 88 | wj = sentence[j] |