Wraps around if we split: make sure last passage isn't too short. This is meant to be similar to the DPR preprocessing.
(inp)
| 17 | |
| 18 | |
def process_page(inp):
    """
    Split one page into fixed-size, optionally overlapping word passages.

    Wraps around if we split: windows that run past the end of the page
    borrow words from its beginning, so the last passage isn't too short.
    This is meant to be similar to the DPR preprocessing.

    A page that fits in a single window (``len(words) <= nwords``) is
    returned as exactly one passage, even when it has no more words than
    ``overlap``. A page with no words at all yields no passages.

    Args:
        inp: A pair ``(settings, page)`` packed into one argument, where
            ``settings`` is ``(nwords, overlap, tokenizer)`` and ``page`` is
            ``(title_idx, docid, title, url, content)``.
            ``nwords`` is the passage length in words/tokens and ``overlap``
            is how many of them consecutive passages share. ``tokenizer`` is
            either ``None`` (split ``content`` on whitespace) or an object
            whose ``tokenize(content)`` returns a list of string tokens.

    Returns:
        The tuple ``(docid, title, url, passages)``, where ``passages`` is a
        list of strings.

    Raises:
        ValueError: If ``nwords <= overlap``; the window could not advance.

    Side effects:
        When ``title_idx`` is a multiple of 100000, the title and every
        passage of the page are printed to stdout as a progress sample.
    """

    (nwords, overlap, tokenizer), (title_idx, docid, title, url, content) = inp

    # A non-positive stride would make range() raise, or walk backwards.
    if nwords <= overlap:
        raise ValueError(
            "nwords ({}) must be greater than overlap ({})".format(nwords, overlap)
        )

    if tokenizer is None:
        words = content.split()
    else:
        words = tokenizer.tokenize(content)

    if len(words) <= nwords:
        # The whole page fits in one window. Emit it as a single passage
        # rather than dropping it when len(words) <= overlap.
        passages = [words] if words else []
    else:
        # Doubling the word list lets trailing windows wrap around to the
        # start of the page instead of coming out short.
        wrapped = words + words
        stride = nwords - overlap
        passages = [
            wrapped[offset:offset + nwords]
            for offset in range(0, len(words) - overlap, stride)
        ]

        assert all(len(psg) in (len(words), nwords) for psg in passages), \
            (list(map(len, passages)), len(words))

    passages = [' '.join(psg) for psg in passages]

    if tokenizer is not None:
        # Glue '##'-prefixed continuation pieces back onto the preceding
        # token (presumably WordPiece output -- confirm against the caller).
        passages = [psg.replace(' ##', '') for psg in passages]

    if title_idx % 100000 == 0:
        print("#> ", title_idx, '\t\t\t', title)

        for p in passages:
            print("$$$ ", '\t\t', p)
            print()

        print()
        print()
        print()

    return (docid, title, url, passages)
| 54 | |
| 55 | |
| 56 | def main(args): |