Create pipeline with quant_policy=QuantPolicy.TURBO_QUANT. This fixture has class scope so large model instances are released before later FP8 accuracy tests allocate their own pipelines.
(model_id)
| 56 | |
@pytest.fixture(scope='class')
def pipe_quant_42(model_id):
    """Create pipeline with quant_policy=QuantPolicy.TURBO_QUANT.

    This fixture has class scope so large model instances are released before later FP8 accuracy tests allocate their
    own pipelines.

    Args:
        model_id: Model identifier, forwarded unchanged as the first argument of ``pipeline``.

    Yields:
        The object returned by ``pipeline`` for the configured engine.
    """
    engine_config = PytorchEngineConfig(
        tp=1,
        cache_max_entry_count=0.05,
        quant_policy=QuantPolicy.TURBO_QUANT,  # K=4bit, V=2bit mixed precision
    )
    pipe = pipeline(model_id, backend_config=engine_config, log_level='INFO')
    yield pipe
    # Cleanup: the memory release below must run even when close() raises,
    # otherwise the model stays resident for every test that follows.
    try:
        pipe.close()
    finally:
        # Drop the local reference before collecting so the pipeline object
        # itself is eligible for collection.
        del pipe
        gc.collect()
        if torch.cuda.is_available() and torch.cuda.device_count() > 0:
            torch.cuda.empty_cache()
| 77 | |
| 78 | |
| 79 | # ============================================================================= |
nothing calls this directly
no test coverage detected