(args)
| 83 | |
| 84 | |
def start_server(args):
    """Launch an SGLang server subprocess, or reuse one that is already up.

    If ``server_ready(SERVER_URL)`` already succeeds, nothing is started and
    ``None`` is returned. Otherwise ``sglang.launch_server`` is spawned with
    the current interpreter, its stdout/stderr are redirected to
    ``args.server_log``, and this function polls every 3 seconds until the
    server reports ready, the child exits, or ``SERVER_TIMEOUT`` seconds
    have elapsed.

    Args:
        args: Object providing ``server_log`` (path of the log file to
            write), ``gpu`` (value exported as ``CUDA_VISIBLE_DEVICES`` for
            the child process) and ``model_dir`` (passed to ``--model``).

    Returns:
        The ``subprocess.Popen`` handle of the newly started server, or
        ``None`` when an existing server was reused.

    Raises:
        RuntimeError: The server process exited before becoming ready.
        TimeoutError: The server did not become ready within
            ``SERVER_TIMEOUT`` seconds; ``stop_server`` is called first.
    """
    if server_ready(SERVER_URL):
        print(f"Reuse existing SGLang server: {SERVER_URL}")
        return None

    # The dirname of an absolute path is never empty, so no fallback is needed.
    log_dir = os.path.dirname(os.path.abspath(args.server_log))
    os.makedirs(log_dir, exist_ok=True)

    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = args.gpu

    cmd = [
        sys.executable,
        "-m",
        "sglang.launch_server",
        "--model",
        args.model_dir,
        "--served-model-name",
        SERVED_MODEL_NAME,
        "--attention-backend",
        ATTENTION_BACKEND,
        "--page-size",
        str(PAGE_SIZE),
        "--mem-fraction-static",
        str(MEM_FRACTION_STATIC),
        "--context-length",
        str(CONTEXT_LENGTH),
        "--enable-custom-logit-processor",
        "--disable-overlap-schedule",
        "--skip-server-warmup",
        "--host",
        HOST,
        "--port",
        str(PORT),
    ]

    print(f"Starting SGLang server on GPU {args.gpu}, port {PORT} ...")
    log_file = open(args.server_log, "w", encoding="utf-8")
    try:
        process = subprocess.Popen(
            cmd, env=env, stdout=log_file, stderr=subprocess.STDOUT
        )
    except BaseException:
        # Spawning failed: nothing else owns the handle, so close it here.
        log_file.close()
        raise
    # Keep the log handle reachable from the process object.
    # NOTE(review): presumably stop_server closes it -- confirm.
    process._log_file = log_file
    print(f"Server PID: {process.pid}")

    # Monotonic clock: the timeout must not be affected by wall-clock jumps.
    start = time.monotonic()
    while time.monotonic() - start < SERVER_TIMEOUT:
        if process.poll() is not None:
            # The child is gone and the process handle is never returned,
            # so release the log file (close() also flushes it).
            log_file.close()
            raise RuntimeError(f"SGLang server exited early. Check {args.server_log}")
        if server_ready(SERVER_URL):
            print(f"Server ready ({time.monotonic() - start:.0f}s)")
            return process
        time.sleep(3)

    stop_server(process)
    raise TimeoutError(f"Timed out waiting for SGLang server. Check {args.server_log}")
| 137 | |
| 138 | |
| 139 | def stop_server(process): |
no test coverage detected