| 476 | await iterations.aclose() |
| 477 | |
| 478 | def SoundGeneration(self, request, context): |
| 479 | model_name = request.model |
| 480 | try: |
| 481 | if self.processor is None: |
| 482 | if model_name == "": |
| 483 | return backend_pb2.Result(success=False, message="request.model is required") |
| 484 | self.processor = AutoProcessor.from_pretrained(model_name) |
| 485 | if self.model is None: |
| 486 | if model_name == "": |
| 487 | return backend_pb2.Result(success=False, message="request.model is required") |
| 488 | # Dynamically resolve model class if configured, otherwise default to MusicgenForConditionalGeneration |
| 489 | model_type = self.options.get("model_type", "MusicgenForConditionalGeneration") |
| 490 | ModelClass = getattr(transformers_module, model_type) |
| 491 | self.model = ModelClass.from_pretrained(model_name) |
| 492 | inputs = None |
| 493 | if request.text == "": |
| 494 | inputs = self.model.get_unconditional_inputs(num_samples=1) |
| 495 | elif request.HasField('src'): |
| 496 | sample_rate, wsamples = wavfile.read('path_to_your_file.wav') |
| 497 | |
| 498 | if request.HasField('src_divisor'): |
| 499 | wsamples = wsamples[: len(wsamples) // request.src_divisor] |
| 500 | |
| 501 | inputs = self.processor( |
| 502 | audio=wsamples, |
| 503 | sampling_rate=sample_rate, |
| 504 | text=[request.text], |
| 505 | padding=True, |
| 506 | return_tensors="pt", |
| 507 | ) |
| 508 | else: |
| 509 | inputs = self.processor( |
| 510 | text=[request.text], |
| 511 | padding=True, |
| 512 | return_tensors="pt", |
| 513 | ) |
| 514 | |
| 515 | if request.HasField('duration'): |
| 516 | tokens = int(request.duration * 51.2) # 256 tokens = 5 seconds, therefore 51.2 tokens is one second |
| 517 | guidance = self.options.get("guidance_scale", 3.0) |
| 518 | if request.HasField('temperature'): |
| 519 | guidance = request.temperature |
| 520 | dosample = self.options.get("do_sample", True) |
| 521 | if request.HasField('sample'): |
| 522 | dosample = request.sample |
| 523 | audio_values = self.model.generate(**inputs, do_sample=dosample, guidance_scale=guidance, max_new_tokens=self.max_tokens) |
| 524 | print("[transformers] SoundGeneration generated!", file=sys.stderr) |
| 525 | |
| 526 | # Save audio output |
| 527 | if hasattr(self.processor, 'save_audio'): |
| 528 | if hasattr(self.processor, 'batch_decode'): |
| 529 | try: |
| 530 | audio_values = self.processor.batch_decode(audio_values) |
| 531 | except Exception: |
| 532 | pass |
| 533 | self.processor.save_audio(audio_values, request.dst) |
| 534 | else: |
| 535 | sampling_rate = self.model.config.audio_encoder.sampling_rate |