| 519 | |
| 520 | |
def AudioTranscription(self, request, context):
    """Transcribe the audio file at ``request.dst`` into a single segment.

    A short chat is built (system prompt "Perform ASR.", then one user turn
    holding the audio), the model generates sequentially, and the
    single-element tokens it emits are decoded and joined as the transcript.

    Args:
        request: Transcription request; only ``request.dst`` (path of the
            audio file to read) is used here.
        context: gRPC servicer context (unused).

    Returns:
        ``backend_pb2.TranscriptResult`` with one segment spanning the whole
        clip (``start=0``, ``end`` = clip duration in milliseconds). The
        result is empty (no segments, empty text) when the model/processor
        is not loaded, no path is given, the file holds no samples, or any
        exception is raised; failures are logged to stderr, never raised.
    """
    try:
        if self.model is None or self.processor is None:
            return backend_pb2.TranscriptResult(segments=[], text="")

        audio_path = request.dst
        if not audio_path:
            return backend_pb2.TranscriptResult(segments=[], text="")

        # Heavy dependencies are imported lazily, on first use.
        # soundfile in preference to torchaudio.load — the latter routes
        # through torchcodec which needs NVIDIA NPP libs we don't bundle.
        # (torchaudio is therefore intentionally not imported here at all.)
        import soundfile as _sf
        import torch
        from liquid_audio import ChatState

        audio_np, sr = _sf.read(audio_path, dtype="float32", always_2d=True)
        wav = torch.from_numpy(audio_np.T)  # (channels, samples)
        if wav.shape[1] == 0:
            # Nothing to transcribe; skip the model instead of feeding it
            # an empty tensor.
            print(
                f"AudioTranscription: no samples in {audio_path}",
                file=sys.stderr,
            )
            return backend_pb2.TranscriptResult(segments=[], text="")
        if wav.shape[0] > 1:
            # Down-mix to mono — the processor expects a single channel
            wav = wav.mean(dim=0, keepdim=True)

        chat = ChatState(self.processor)
        chat.new_turn("system")
        chat.add_text("Perform ASR.")
        chat.end_turn()
        chat.new_turn("user")
        chat.add_audio(wav, sr)
        chat.end_turn()
        chat.new_turn("assistant")

        max_new = int(self.options.get("max_new_tokens", 1024))

        pieces = []
        for tok in self.model.generate_sequential(**chat, max_new_tokens=max_new):
            # Only single-element tokens are decoded as text; anything else
            # is skipped (presumably non-text/audio frames — TODO confirm).
            if tok.numel() != 1:
                continue
            if tok.item() == IM_END_TOKEN:
                break
            pieces.append(self.processor.text.decode(tok))

        text = "".join(pieces).strip()
        duration_ms = int((wav.shape[1] / sr) * 1000)
        segment = backend_pb2.TranscriptSegment(
            id=0, start=0, end=duration_ms, text=text, tokens=[],
        )
        return backend_pb2.TranscriptResult(segments=[segment], text=text)
    except Exception as exc:
        # Best-effort boundary: log the failure and hand back an empty
        # transcript rather than propagating the error to the caller.
        print(f"AudioTranscription failed: {exc}", file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
        return backend_pb2.TranscriptResult(segments=[], text="")
| 570 | |
| 571 | |
| 572 | def StartFineTune(self, request, context): |