MCPcopy Index your code
hub / github.com/LAION-AI/Open-Assistant / HierachicalSummarizer

Class HierachicalSummarizer

scripts/data_augment/data_augment.py:274–342  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

272
273
274class HierachicalSummarizer(DataAugmenter):
def __init__(self):
    """Set up the summarization pipeline, generation settings and spaCy model.

    Three attributes are created:

    * ``self.summarizer`` -- a "summarization" pipeline built on the
      ``pszemraj/long-t5-tglobal-base-16384-book-summary`` checkpoint.
    * ``self.params`` -- keyword arguments that ``parse_single`` unpacks
      into each summarizer call.
    * ``self.nlp`` -- the ``en_core_web_sm`` spaCy model that
      ``parse_single`` passes to ``recognize_entities``.
    """
    # Device 0 when CUDA is available, otherwise -1.
    # NOTE(review): -1 is presumably the CPU sentinel of the Hugging Face
    # pipeline API -- confirm `pipeline` is `transformers.pipeline`.
    device_index = 0 if torch.cuda.is_available() else -1
    self.summarizer = pipeline(
        "summarization",
        "pszemraj/long-t5-tglobal-base-16384-book-summary",
        device=device_index,
    )

    # Parameters for text generation out of the model.
    self.params = dict(
        max_length=1024,
        min_length=8,
        no_repeat_ngram_size=3,
        early_stopping=False,
        repetition_penalty=3.5,
        length_penalty=0.3,
        encoder_no_repeat_ngram_size=3,
        num_beams=4,
    )

    self.nlp = spacy.load("en_core_web_sm")
294
def cleanup_summary(self, out):
    """Remove stock lead-in phrases from a generated summary.

    Every occurrence of each phrase is deleted, in the order listed, and
    the result is then trimmed of leading/trailing spaces and commas.

    Args:
        out: Summary text to clean.

    Returns:
        The cleaned summary string.
    """
    # Order matters: the longer phrases must be removed before the shorter
    # phrases they contain, otherwise fragments would be left behind.
    stock_phrases = (
        "The novel begins with the description of",
        "the description of",
        "The novel begins",
        "This chapter introduces us to",
        "In this chapter, ",
        "This chapter",
    )
    for phrase in stock_phrases:
        # str.replace returns a new string, so the result must be rebound.
        out = out.replace(phrase, "")
    return out.strip(" ,")
306
307 def parse_single(self, essay):
308 final_summary = ""
309 new_summary = ""
310 level_2_summary = []
311 level_1_summary = []
312 entities = []
313 essay_parts = essay.split("##")
314 for section_text in essay_parts:
315 result = self.summarizer(section_text, **self.params)
316 out = self.cleanup_summary(result[0]["summary_text"])
317 level_2_summary.append(out)
318 result = self.summarizer(out, **self.params)
319 out = self.cleanup_summary(result[0]["summary_text"])
320 new_summary += "\n" + out
321 level_1_summary.append(out)
322
323 entity = recognize_entities(section_text, self.nlp, n=5, person="ignore")
324 entities.append(entity)
325
326 result = self.summarizer(new_summary, **self.params)
327 final_summary = self.cleanup_summary(result[0]["summary_text"])
328
329 first_instruction = "Write a story about the following:\n" + final_summary
330 first_answer = "\n".join(level_1_summary)
331 instructions = [first_instruction]

Callers 1

get_augmenterFunction · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected