```python
# Initialize the subcommand parser for each command module
for cmd_module in CMD_MODULES:
    new_cmds = cmd_module.cmd_init()
    for cmd in new_cmds:
        cmd.subparser_init(subparsers).set_defaults(dispatch_function=cmd.cmd)
```
```python
class CLISubcommand:
    """Base class for CLI argument handlers."""

    name: str

    # Entry point of the subcommand; invoked via dispatch_function=cmd.cmd
    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        raise NotImplementedError("Subclasses should implement this method")

    def validate(self, args: argparse.Namespace) -> None:
        # No validation by default
        pass

    # Builds the argument parser used to parse this subcommand's arguments
    def subparser_init(
        self, subparsers: argparse._SubParsersAction
    ) -> FlexibleArgumentParser:
        raise NotImplementedError("Subclasses should implement this method")
```
In the end, each feature module implements the `CLISubcommand` base class to provide its own functionality.
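As a rough illustration of that pattern, a hypothetical feature module might look like the sketch below. `HelloSubcommand`, the `hello` subcommand, and this module's `cmd_init` are made-up examples, not vLLM code; only the `CLISubcommand` contract and the `dispatch_function` mechanism come from the snippets above. The real modules return vLLM's `FlexibleArgumentParser` from `subparser_init` rather than a plain `argparse` parser, and the import path of the base class may differ between vLLM versions.

```python
# Hypothetical example module -- not part of vLLM -- showing how a feature
# module could satisfy the CLISubcommand contract sketched above.
import argparse

# Base class shown above; exact module path may vary by vLLM version.
from vllm.entrypoints.cli.types import CLISubcommand


class HelloSubcommand(CLISubcommand):
    name = "hello"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        # The work the subcommand actually performs when dispatched.
        print(f"Hello, {args.who}!")

    def subparser_init(
        self, subparsers: argparse._SubParsersAction
    ) -> argparse.ArgumentParser:
        # Register `hello` under the top-level parser and declare its arguments.
        parser = subparsers.add_parser(self.name, help="Print a greeting")
        parser.add_argument("who", help="Name to greet")
        return parser


def cmd_init() -> list[CLISubcommand]:
    # main.py collects subcommands from each module via cmd_module.cmd_init().
    return [HelloSubcommand()]
```

Because the registration loop calls `set_defaults(dispatch_function=cmd.cmd)`, after `parser.parse_args()` the top-level entry point only needs to call `args.dispatch_function(args)` to reach the selected subcommand.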
## 2. CLI serve logic
The serve entry point is implemented in vllm/entrypoints/cli/serve.py; the core code is as follows:
```python
class ServeSubcommand(CLISubcommand):
    """The `serve` subcommand for the vLLM CLI."""

    # `vllm serve` on the command line dispatches to this object
    name = "serve"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        # If model is specified in CLI (as positional arg), it takes precedence
        if hasattr(args, "model_tag") and args.model_tag is not None:
            args.model = args.model_tag

        if args.headless or args.api_server_count < 1:
            run_headless(args)
        else:
            if args.api_server_count > 1:
                run_multi_api_server(args)
            else:
                # Single API server (this process).
                uvloop.run(run_server(args))
```
For the single-server path, `run_server` goes through `setup_server` in vllm/entrypoints/openai/api_server.py (which binds the listening socket) and then `run_server_worker`, which launches the HTTP server via `serve_http` from vllm/entrypoints/launcher.py:

```python
    logger.info(
        "Starting vLLM API server %d on %s",
        engine_client.vllm_config.parallel_config._api_process_rank,
        listen_address,
    )
    shutdown_task = await serve_http(
        app,
        sock=sock,
        enable_ssl_refresh=args.enable_ssl_refresh,
        host=args.host,
        port=args.port,
        log_level=args.uvicorn_log_level,
        # NOTE: When the 'disable_uvicorn_access_log' value is True,
        # no access log will be output.
        access_log=not args.disable_uvicorn_access_log,
        timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
        ssl_keyfile=args.ssl_keyfile,
        ssl_certfile=args.ssl_certfile,
        ssl_ca_certs=args.ssl_ca_certs,
        ssl_cert_reqs=args.ssl_cert_reqs,
        h11_max_incomplete_event_size=args.h11_max_incomplete_event_size,
        h11_max_header_count=args.h11_max_header_count,
        **uvicorn_kwargs,
    )

    # NB: Await server shutdown only after the backend context is exited
    try:
        await shutdown_task
    finally:
        sock.close()
```
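The socket is created before the server starts and closed only after `shutdown_task` completes, presumably so the port stays reserved while the (potentially slow) engine setup runs. A minimal standalone sketch of that pattern is shown below; it uses plain FastAPI + uvicorn rather than vLLM's launcher, and the host, port, and `/health` route are arbitrary choices for illustration.

```python
# Minimal sketch (not vLLM code) of the "pre-bound socket + awaited shutdown"
# pattern used by serve_http above.
import asyncio
import socket

import uvicorn
from fastapi import FastAPI

app = FastAPI()


@app.get("/health")
async def health() -> dict:
    return {"status": "ok"}


async def serve_with_prebound_socket(host: str = "0.0.0.0", port: int = 8000) -> None:
    # Bind the listening socket ourselves so the port is already reserved
    # before the server (or any other setup) starts.
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    sock.bind((host, port))

    config = uvicorn.Config(app, log_level="info")
    server = uvicorn.Server(config)

    # uvicorn's Server.serve() accepts pre-bound sockets; run it as a task so
    # the caller can do setup/teardown around it, mirroring shutdown_task.
    shutdown_task = asyncio.create_task(server.serve(sockets=[sock]))
    try:
        await shutdown_task
    finally:
        sock.close()


if __name__ == "__main__":
    asyncio.run(serve_with_prebound_socket())
```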
```mermaid
graph TD
    A[CLI command<br/>vllm serve] --> B[serve.py<br/>ServeSubcommand.cmd]
    B --> C{Run-mode decision}
    C -->|headless mode| D[run_headless]
    C -->|multiple API servers| E[run_multi_api_server]
    C -->|single API server| F[run_server]
    F --> G[api_server.py<br/>setup_server]
    G --> H[Port binding<br/>socket creation]
    H --> I[run_server_worker]
    I --> J[launcher.py<br/>serve_http]
    J --> K[uvicorn server startup]
    K --> L[Route registration<br/>build_app]
    L --> M[api_router.py<br/>attach_router]
    M --> N[POST /v1/chat/completions]
    N --> O[serving.py<br/>OpenAIServingChat.create_chat_completion]
    O --> P[engine_client.generate]
    P --> Q[AsyncLLM.generate]
    Q --> R[Core engine<br/>model inference]
    E --> S[Multi-process management<br/>APIServerProcessManager]
    S --> T[Multiple run_api_server_worker_proc]
    T --> I
    D --> U[Headless mode<br/>pure inference engine]
```
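Once the server is up, the lower part of the diagram (POST /v1/chat/completions → OpenAIServingChat → AsyncLLM.generate) can be exercised with any OpenAI-compatible client. A small illustrative client snippet follows; the base URL, port, and model name all depend on how `vllm serve` was actually launched.

```python
# Hypothetical client call hitting the /v1/chat/completions route from the
# diagram. Assumes a server started with `vllm serve <model>` on localhost:8000.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="your-model-name",  # must match the model the server was launched with
    messages=[{"role": "user", "content": "Hello!"}],
)
print(resp.choices[0].message.content)
```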