| 272 | |
| 273 | |
| 274 | class HierachicalSummarizer(DataAugmenter): |
def __init__(self):
    """Set up the summarization pipeline, its generation settings and spaCy.

    Three attributes are created:

    * ``self.summarizer`` -- a ``"summarization"`` pipeline backed by the
      ``pszemraj/long-t5-tglobal-base-16384-book-summary`` checkpoint.
    * ``self.params`` -- keyword arguments for text generation; they are
      forwarded unchanged on every summarizer call.
    * ``self.nlp`` -- the ``en_core_web_sm`` spaCy model.
    """
    # Device index 0 when torch reports CUDA as available, otherwise -1
    # (the pipeline's CPU setting -- see the transformers pipeline docs).
    device_index = 0 if torch.cuda.is_available() else -1
    self.summarizer = pipeline(
        "summarization",
        "pszemraj/long-t5-tglobal-base-16384-book-summary",
        device=device_index,
    )

    # Parameters for text generation out of the model.
    self.params = dict(
        max_length=1024,
        min_length=8,
        no_repeat_ngram_size=3,
        early_stopping=False,
        repetition_penalty=3.5,
        length_penalty=0.3,
        encoder_no_repeat_ngram_size=3,
        num_beams=4,
    )

    self.nlp = spacy.load("en_core_web_sm")
| 294 | |
def cleanup_summary(self, out):
    """Strip boilerplate lead-in phrases from a generated summary.

    Every occurrence of each known phrase is removed, then spaces and
    commas are trimmed from both ends of the result.

    The previous implementation built the cleaned string but discarded it
    (``str.replace`` returns a new string; it never modifies in place) and
    returned ``out`` unchanged. The cleaned text is now returned.

    Args:
        out: Summary text produced by the summarizer.

    Returns:
        The summary with the boilerplate phrases removed and surrounding
        spaces/commas stripped.
    """
    # Order matters: longer phrases must be removed before the shorter
    # phrases they contain, otherwise fragments such as " with " are left.
    boilerplate_phrases = (
        "The novel begins with the description of",
        "the description of",
        "The novel begins",
        "This chapter introduces us to",
        "In this chapter, ",
        "This chapter",
    )
    cleaned = out
    for phrase in boilerplate_phrases:
        cleaned = cleaned.replace(phrase, "")
    return cleaned.strip(" ,")
| 306 | |
| 307 | def parse_single(self, essay): |
| 308 | final_summary = "" |
| 309 | new_summary = "" |
| 310 | level_2_summary = [] |
| 311 | level_1_summary = [] |
| 312 | entities = [] |
| 313 | essay_parts = essay.split("##") |
| 314 | for section_text in essay_parts: |
| 315 | result = self.summarizer(section_text, **self.params) |
| 316 | out = self.cleanup_summary(result[0]["summary_text"]) |
| 317 | level_2_summary.append(out) |
| 318 | result = self.summarizer(out, **self.params) |
| 319 | out = self.cleanup_summary(result[0]["summary_text"]) |
| 320 | new_summary += "\n" + out |
| 321 | level_1_summary.append(out) |
| 322 | |
| 323 | entity = recognize_entities(section_text, self.nlp, n=5, person="ignore") |
| 324 | entities.append(entity) |
| 325 | |
| 326 | result = self.summarizer(new_summary, **self.params) |
| 327 | final_summary = self.cleanup_summary(result[0]["summary_text"]) |
| 328 | |
| 329 | first_instruction = "Write a story about the following:\n" + final_summary |
| 330 | first_answer = "\n".join(level_1_summary) |
| 331 | instructions = [first_instruction] |