commitUtteranceWithTranscript commits one user turn. live carries the transcript semantic_vad's live stream already produced (its caption deltas were streamed to the client during the turn, so only the completed event is emitted here); gated carries the retranscribe gate's batch decode (the authorit
(ctx context.Context, utt []byte, live *liveUtterance, gated *schema.TranscriptionResult, itemID string, session *Session, conv *Conversation, t Transport)
| 1502 | // itemID is the turn's conversation item id ("" mints a fresh one); it must |
| 1503 | // match the id any live deltas were sent under. |
| 1504 | func commitUtteranceWithTranscript(ctx context.Context, utt []byte, live *liveUtterance, gated *schema.TranscriptionResult, itemID string, session *Session, conv *Conversation, t Transport) { |
| 1505 | if len(utt) == 0 { |
| 1506 | return |
| 1507 | } |
| 1508 | |
| 1509 | f, err := os.CreateTemp("", "realtime-audio-chunk-*.wav") |
| 1510 | if err != nil { |
| 1511 | xlog.Error("failed to create temp file", "error", err) |
| 1512 | return |
| 1513 | } |
| 1514 | defer f.Close() |
| 1515 | defer os.Remove(f.Name()) |
| 1516 | xlog.Debug("Writing to file", "file", f.Name()) |
| 1517 | |
| 1518 | hdr := laudio.NewWAVHeader(uint32(len(utt))) |
| 1519 | if err := hdr.Write(f); err != nil { |
| 1520 | xlog.Error("Failed to write WAV header", "error", err) |
| 1521 | return |
| 1522 | } |
| 1523 | |
| 1524 | if _, err := f.Write(utt); err != nil { |
| 1525 | xlog.Error("Failed to write audio data", "error", err) |
| 1526 | return |
| 1527 | } |
| 1528 | |
| 1529 | f.Sync() |
| 1530 | |
| 1531 | // Start speaker verification concurrently with transcription. This is a |
| 1532 | // latency optimization only: there is a hard join below before the LLM, so |
| 1533 | // an unauthorized utterance never reaches generateResponse (no LLM, no |
| 1534 | // tools, no TTS) regardless of how fast transcription finishes. A rejected |
| 1535 | // turn wastes only transcription compute, which has no side effects. The |
| 1536 | // transcript is still emitted to the same peer that sent the audio, which |
| 1537 | // reveals nothing new to them. |
| 1538 | // Resolve the speaker when the gate must authorize this turn, or when identity |
| 1539 | // surfacing/personalization needs a fresh identity. Identity resolution |
| 1540 | // ignores the when:first short-circuit (that only skips re-authorization). |
| 1541 | type resolveOutcome struct { |
| 1542 | res resolution |
| 1543 | err error |
| 1544 | } |
| 1545 | var resolveCh chan resolveOutcome |
| 1546 | runResolve := false |
| 1547 | if session.voiceGate != nil && session.InputAudioTranscription != nil { |
| 1548 | enforce := session.voiceGate.cfg.EnforceGate() |
| 1549 | gateNeedsAuth := enforce |
| 1550 | if enforce && session.voiceGate.cfg.When == config.VoiceGateWhenFirst { |
| 1551 | session.gateMu.Lock() |
| 1552 | if session.voiceVerified { |
| 1553 | gateNeedsAuth = false |
| 1554 | } |
| 1555 | session.gateMu.Unlock() |
| 1556 | } |
| 1557 | if gateNeedsAuth || session.voiceGate.cfg.IdentityEnabled() { |
| 1558 | runResolve = true |
| 1559 | resolveCh = make(chan resolveOutcome, 1) |
| 1560 | wavPath := f.Name() |
| 1561 | go func() { |
no test coverage detected