(dim: int, end: int, theta: float = 10000.0, device:str|None=None)
| 7 | |
@functools.cache
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, device:str|None=None) -> Tensor:
    """Build a cached table of cosines and sines of position-scaled frequencies.

    The per-channel inverse frequencies are ``1 / theta ** (k / dim)`` for
    ``k = 0, 2, 4, ...`` (at most ``dim // 2`` of them). Each position in
    ``0 .. end - 1`` is multiplied by every inverse frequency to form a
    ``(end, dim // 2)`` grid of angles.

    Args:
        dim: Channel count the frequencies are derived from.
        end: Number of positions (rows) to generate.
        theta: Base of the geometric frequency progression.
        device: Device passed through to ``Tensor.arange``; ``None`` leaves
            the choice to ``Tensor.arange``.

    Returns:
        A contiguous tensor holding ``cos(angles)`` followed by
        ``sin(angles)`` along the last axis, i.e. ``2 * (dim // 2)`` columns.

    Note:
        Results are memoized by ``functools.cache``, so repeated calls with
        the same arguments return the same tensor object.
        NOTE(review): presumably consumed by rotary position embeddings;
        confirm against callers.
    """
    # Even channel indices normalised by dim give the exponent of theta.
    exponents = Tensor.arange(0, dim, 2, device=device)[:(dim // 2)] / dim
    inv_freq = 1.0 / (theta ** exponents)

    # Column of positions times row of frequencies -> (end, dim // 2) angles.
    positions = Tensor.arange(end, device=device).unsqueeze(dim=1)
    angles = positions * inv_freq.unsqueeze(dim=0)

    return angles.cos().cat(angles.sin(), dim=-1).contiguous()
| 13 | |
| 14 | class ExpertWeights: |
| 15 | """Like nn.Linear but with num_experts dimension. Weight shape: (num_experts, out_features, in_features).""" |
searching dependent graphs…