MCPcopy
hub / github.com/mudler/LocalAI / AudioTranscription

Method AudioTranscription

backend/python/liquid-audio/backend.py:521–569  ·  view source on GitHub ↗
(self, request, context)

Source from the content-addressed store, hash-verified

519
520
def AudioTranscription(self, request, context):
    """Transcribe the audio file at ``request.dst`` with the loaded model.

    A chat is built with a "Perform ASR." system turn followed by a user
    turn holding the audio (down-mixed to mono when it has several
    channels). Text tokens yielded by ``generate_sequential`` are decoded
    and concatenated until ``IM_END_TOKEN`` is produced or generation
    stops; the token budget comes from the ``max_new_tokens`` option
    (default 1024).

    Args:
        request: request message; only ``request.dst`` (path of the audio
            file to read) is used here.
        context: gRPC context (unused).

    Returns:
        A ``TranscriptResult`` with one segment covering the whole clip
        (``start=0``, ``end`` = clip duration in milliseconds). An empty
        result is returned when no model/processor is loaded, when
        ``request.dst`` is empty, or when any exception occurs (the error
        and traceback are printed to stderr).
    """
    try:
        if self.model is None or self.processor is None:
            return backend_pb2.TranscriptResult(segments=[], text="")

        audio_path = request.dst
        if not audio_path:
            return backend_pb2.TranscriptResult(segments=[], text="")

        # Heavy dependencies are imported lazily, inside the try, so that an
        # import failure is logged and reported as an empty transcript.
        # soundfile in preference to torchaudio.load — the latter routes
        # through torchcodec which needs NVIDIA NPP libs we don't bundle.
        import soundfile as _sf
        import torch
        from liquid_audio import ChatState

        audio_np, sr = _sf.read(audio_path, dtype="float32", always_2d=True)
        wav = torch.from_numpy(audio_np.T)  # (channels, samples)
        if wav.shape[0] > 1:
            # Down-mix to mono — the processor expects a single channel
            wav = wav.mean(dim=0, keepdim=True)

        chat = ChatState(self.processor)
        chat.new_turn("system")
        chat.add_text("Perform ASR.")
        chat.end_turn()
        chat.new_turn("user")
        chat.add_audio(wav, sr)
        chat.end_turn()
        # Leave the assistant turn open so the model generates into it.
        chat.new_turn("assistant")

        max_new = int(self.options.get("max_new_tokens", 1024))

        pieces = []
        for tok in self.model.generate_sequential(**chat, max_new_tokens=max_new):
            # Only single-element tensors are handled as text tokens.
            # NOTE(review): multi-element outputs are presumably audio
            # frames and are skipped — confirm against liquid_audio.
            if tok.numel() == 1:
                if tok.item() == IM_END_TOKEN:
                    break
                pieces.append(self.processor.text.decode(tok))

        text = "".join(pieces).strip()
        # samples / sample-rate gives seconds; the segment end is in ms.
        duration_ms = int((wav.shape[1] / sr) * 1000)
        segment = backend_pb2.TranscriptSegment(
            id=0, start=0, end=duration_ms, text=text, tokens=[],
        )
        return backend_pb2.TranscriptResult(segments=[segment], text=text)
    except Exception as exc:
        # Best-effort RPC: log the failure and return an empty transcript
        # instead of propagating the exception to the caller.
        print(f"AudioTranscription failed: {exc}", file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
        return backend_pb2.TranscriptResult(segments=[], text="")
570
571
572 def StartFineTune(self, request, context):

Callers

nothing calls this directly

Calls 4

appendMethod · 0.80
decodeMethod · 0.80
readMethod · 0.45
getMethod · 0.45

Tested by

no test coverage detected