| 146 | |
| 147 | class DAC(BaseModel, CodecMixin): |
def __init__(
    self,
    encoder_dim: int = 64,
    encoder_rates: List[int] = [2, 4, 8, 8],
    latent_dim: Union[int, None] = None,
    decoder_dim: int = 1536,
    decoder_rates: List[int] = [8, 8, 4, 2],
    n_codebooks: int = 9,
    codebook_size: int = 1024,
    codebook_dim: Union[int, list] = 8,
    quantizer_dropout: bool = False,
    sample_rate: int = 44100,
):
    """Build the encoder, residual vector quantizer and decoder.

    Args:
        encoder_dim: First argument forwarded to ``Encoder``; also the
            base of the derived ``latent_dim`` when that is omitted.
        encoder_rates: Forwarded to ``Encoder``. The product of its
            entries is stored as ``self.hop_length``.
        latent_dim: Size handed to the encoder, quantizer (as
            ``input_dim``) and decoder. When ``None`` it defaults to
            ``encoder_dim * 2 ** len(encoder_rates)``.
        decoder_dim: Forwarded to ``Decoder``.
        decoder_rates: Forwarded to ``Decoder``.
        n_codebooks: Forwarded to ``ResidualVectorQuantize``.
        codebook_size: Forwarded to ``ResidualVectorQuantize``.
        codebook_dim: Forwarded to ``ResidualVectorQuantize``.
        quantizer_dropout: Forwarded to ``ResidualVectorQuantize``.
        sample_rate: Stored unchanged as ``self.sample_rate``.
    """
    super().__init__()

    # The list defaults in the signature are kept for interface
    # compatibility, but they are single objects shared by every call.
    # Work on private copies so that mutating ``model.encoder_rates`` (or
    # the caller's own list) can never leak into other instances.
    encoder_rates = list(encoder_rates)
    decoder_rates = list(decoder_rates)

    self.encoder_dim = encoder_dim
    self.encoder_rates = encoder_rates
    self.decoder_dim = decoder_dim
    self.decoder_rates = decoder_rates
    self.sample_rate = sample_rate

    if latent_dim is None:
        # Derived default: encoder_dim scaled by 2 for every entry in
        # encoder_rates.
        latent_dim = encoder_dim * (2 ** len(encoder_rates))

    self.latent_dim = latent_dim

    self.hop_length = np.prod(encoder_rates)
    self.encoder = Encoder(encoder_dim, encoder_rates, latent_dim)

    self.n_codebooks = n_codebooks
    self.codebook_size = codebook_size
    self.codebook_dim = codebook_dim
    self.quantizer = ResidualVectorQuantize(
        input_dim=latent_dim,
        n_codebooks=n_codebooks,
        codebook_size=codebook_size,
        codebook_dim=codebook_dim,
        quantizer_dropout=quantizer_dropout,
    )

    self.decoder = Decoder(
        latent_dim,
        decoder_dim,
        decoder_rates,
    )

    # Initialise weights only after every submodule has been registered.
    self.apply(init_weights)

    # Computed last: get_delay() runs on the fully constructed model.
    self.delay = self.get_delay()
| 197 | |
| 198 | def preprocess(self, audio_data, sample_rate): |
| 199 | if sample_rate is None: |