| 392 | |
| 393 | |
class CodeInstructor(DataAugmenter):
    """Build (instruction, code) pairs by summarizing code with a T5 model.

    Each code snippet is summarized by the ``Graverman/t5-code-summary``
    checkpoint; the summary is turned into an instruction-style question and
    the original snippet is used as the answer.
    """

    # Single source of truth for the checkpoint so the tokenizer and the
    # model can never be loaded from different names by accident.
    MODEL_NAME = "Graverman/t5-code-summary"

    def __init__(self):
        """Load the tokenizer and the seq2seq model for ``MODEL_NAME``."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.model = T5ForConditionalGeneration.from_pretrained(self.MODEL_NAME)

    def parse(self, codes):
        """Summarize code snippets and wrap them as question/answer pairs.

        Args:
            codes: A list of source-code strings. A single string is also
                accepted and treated as a one-element batch.

        Returns:
            A ``(questions, answers)`` tuple. ``questions[i]`` is the prompt
            ``"Write a script that does the following:\\n"`` followed by the
            generated summary of ``codes[i]``; ``answers`` holds the input
            snippets themselves (the same list object when a list was given).
        """
        # A bare string would produce one question but be returned as a
        # string of "answers"; wrap it so both outputs align pairwise.
        if isinstance(codes, str):
            codes = [codes]

        # Nothing to summarize: skip the tokenizer/model entirely rather
        # than handing them an empty batch.
        if not codes:
            return [], []

        # Inputs are padded/truncated to a fixed 300-token window.
        source_encoding = self.tokenizer(
            codes,
            max_length=300,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        # Summaries are capped at 100 tokens.
        outputs = self.model.generate(
            input_ids=source_encoding["input_ids"],
            attention_mask=source_encoding["attention_mask"],
            max_length=100,
            length_penalty=0.75,
            repetition_penalty=2.5,
            early_stopping=True,
            use_cache=True,
        )
        summaries = [
            self.tokenizer.decode(o, skip_special_tokens=True) for o in outputs
        ]

        questions = [
            "Write a script that does the following:\n" + s for s in summaries
        ]
        answers = codes

        return questions, answers
| 424 | |
| 425 | |
| 426 | def recognize_entities(text, model, n=4, person="ignore"): |