| 46 | |
| 47 | |
def text_to_chunks(texts, word_length=150, start_page=1):
    """Split per-page texts into page-labelled chunks of ``word_length`` words.

    Each entry of ``texts`` is treated as one page. A page is tokenised on
    single spaces and cut into consecutive runs of ``word_length`` tokens.
    When the final run of a page is shorter than ``word_length`` and another
    page follows, that run is not emitted; it is prepended to the next
    page's tokens instead, so it ends up labelled with the next page number.

    Args:
        texts: Iterable of page strings, in page order.
        word_length: Maximum number of space-separated tokens per chunk.
            Must be at least 1.
        start_page: Page number assigned to the first entry of ``texts``.

    Returns:
        A list of strings formatted as ``[Page no. N] "<chunk text>"``.
        Chunks that are empty after stripping (e.g. from a blank last page)
        are omitted.

    Raises:
        ValueError: If ``word_length`` is smaller than 1.
    """
    if word_length < 1:
        # A zero step would fail inside range() with an unrelated message and
        # a negative one would silently drop every page; reject both up front.
        raise ValueError(
            f'word_length must be a positive integer, got {word_length!r}'
        )

    # Fresh token lists: the carry-over below rewrites entries of this list,
    # never the caller's ``texts``.
    text_toks = [t.split(' ') for t in texts]
    last_idx = len(text_toks) - 1
    chunks = []

    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i : i + word_length]
            # Only the last slice of a page can be short. Unless this is the
            # final page, push it to the front of the next page so it gets
            # merged with the words that follow it.
            if len(chunk) < word_length and idx != last_idx:
                text_toks[idx + 1] = chunk + text_toks[idx + 1]
                continue
            text = ' '.join(chunk).strip()
            if not text:
                # ''.split(' ') yields [''], so blank pages would otherwise
                # produce a content-free '[Page no. N] ""' chunk.
                continue
            chunks.append(f'[Page no. {idx + start_page}] "{text}"')
    return chunks
| 66 | |
| 67 | |
| 68 | class SemanticSearch: |