
    h<                     .   S SK r S SKrS SKrS SKrS SKrS SKJr  S SKJrJ	r	  S SK
Jr  S SKrS SKJr  SSKJrJrJr  \" 5       (       a
  S SKrS SKJr  \" 5       (       a  S S	KJr  S S
KJr  \" 5       (       a  S SKJr  \R4                  " \5      r " S S5      r\S:X  aw  S SKJr  \" 5       r \ RC                  SS9  \ RE                  SS/SS\" 5       S9r#\$" S\#5        S SK%J&r&  \&RN                  " S5      RQ                  S5      r)\ RU                  \)5        gg)    N)BytesIO)OptionalUnion)urlparse)nn   )is_requests_availableis_vllm_ascend_availableis_vllm_available)ConnectionError)PyNcclCommunicator)StatelessProcessGroup)PyHcclCommunicatorc                   \   \ rS rSrSr     S$S\\   S\S\S\S\4
S	 jjr	S%S
\S\4S jjr
          S&S\\   S\\   S\S\S\S\S\S\S\S\\   S\\   S\\\      4S jjrS'S\\R                   \\4   4S jjrS\S\R$                  4S jrS\R*                  4S  jrS! rS" rS#rg)(
VLLMClient.   a  
A client class to interact with a vLLM server.

This class provides methods to generate completions, initialize and manage weight update groups, and update model
weights in a distributed setting. Before using it, start the vLLM server with `trl vllm-serve`.

Args:
    base_url (`str` or `None`, *optional*, defaults to `None`):
        Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `host` and `server_port` are
        ignored.
    host (`str`, *optional*, defaults to `"0.0.0.0"`):
        IP address of the vLLM server. Ignored if `base_url` is provided.
    server_port (`int`, *optional*, defaults to `8000`):
        Port number of the vLLM server. Ignored if `base_url` is provided.
    group_port (`int`, *optional*, defaults to `51216`):
        Port number for the weight update group.
    connection_timeout (`float`, *optional*, defaults to `0.0`):
        Total timeout duration in seconds to wait for the server to be up. If the server is not up after the
        timeout, a `ConnectionError` is raised.

Examples:
    Run the vLLM server with the model `Qwen/Qwen2.5-7B`:

    ```
    $ trl vllm-serve --model Qwen/Qwen2.5-7B
    ...
    INFO:     Application startup complete.
    INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
    ```

    Use the client to generate completions and update model weights:

    ```python
    >>> from trl.extras.vllm_client import VLLMClient

    >>> client = VLLMClient()
    >>> client.generate(["Hello, AI!", "Tell me a joke"])
    [[2980, 498, 1492, 752, 448, 264, 13027, 8645, 30, 358, 2776, 4460, 311, 3270, 264, 2025],
     [911, 7988, 1251, 382, 3838, 653, 498, 1618, 4325, 879, 2581, 20027, 264, 21428, 30, 362]]

    >>> from transformers import AutoModelForCausalLM

    >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-7B", device_map="cuda")
    >>> client.init_communicator(device="cuda")
    >>> client.update_model_params(model)
    ```

    There are several ways to initialize the client:

    ```python
    VLLMClient(base_url="http://localhost:8000")
    VLLMClient(base_url="http://192.168.1.100:8000")
    VLLMClient(host="localhost", server_port=8000)
    VLLMClient(host="192.168.1.100", server_port=8000)
    ```
Nbase_urlhostserver_port
group_portconnection_timeoutc                    [        5       (       d  [        S5      e[        5       (       d  [        S5      e[        R                  " 5       U l        Ubh  [        U5      n[        R                  " UR                  5      U l
        UR                  =(       d    SnU SUR                   UR                   3U l        O-X l
        X0l        SU R                   SU R                   3U l        X@l        U R#                  U5        g )NzIrequests is not installed. Please install it with `pip install requests`.zAvLLM is not installed. Please install it with `pip install vllm`.httpz://zhttp://:)r	   ImportErrorr   requestsSessionsessionr   socketgethostbynamehostnamer   schemenetlocpathr   r   r   check_server)selfr   r   r   r   r   
parsed_urlr"   s           P/home/james-whalen/.local/lib/python3.13/site-packages/trl/extras/vllm_client.py__init__VLLMClient.__init__h   s     %&&ijj ""abb'')!(+J,,Z-@-@ADI&&0&F%hc**;*;)<Z__<MNDMI*%dii[$2B2B1CDDM$,-    total_timeoutretry_intervalc                 <   U R                    S3n[        R                  " 5       n  [        R                  " U5      nUR                  S:X  a:  SUR
                  ;   a  UR
                  S   U l        [        R                  S5        g [        R                  S	U S
35        [        R                  " U5        M  ! [        R                  R                   aD  n[        R                  " 5       U-
  nXq:  a  [        SU R                    SU S35      Ue SnANSnAff = f)a  
Check server availability with retries on failure, within a total timeout duration. If the server is not up
after the total timeout duration, raise a `ConnectionError`.

Args:
    retry_interval (`float`, *optional*, defaults to `2.0`):
        Interval in seconds between retries.
    total_timeout (`float`, *optional*, defaults to `0.0`):
        Total timeout duration in seconds.
z/health/   zX-Forwarded-ForzServer is up!Nz$The vLLM server can't be reached at z after zF seconds. Make sure the server is running by running `trl vllm-serve`.z"Server is not up yet. Retrying in z seconds...)r   timer   getstatus_codeheadersr   loggerinfo
exceptionsRequestExceptionr   sleep)r&   r,   r-   url
start_timeresponseexcelapsed_times           r(   r%   VLLMClient.check_server   s    x(YY[
 #<<, ''3.(H,<,<<$,$4$45F$G	KK0	 / KK<^<LKXYJJ~&)  &&77 #yy{Z70)>t}}oWUbTc dR R  1s   B9 9D:DDpromptsimagesnrepetition_penaltytemperaturetop_ptop_kmin_p
max_tokensguided_decoding_regexgeneration_kwargsreturnc                 p   U R                    S3nS nU(       a  U Vs/ s H
  o" U5      PM     snOSnU R                  R                  UUUUUUUUUU	U
U=(       d    0 S.S9nUR                  S:X  a  UR	                  5       nUS   US   S	.$ [        S
UR                   SUR                   35      es  snf )a<  
Generates model completions for the provided prompts.

Args:
    prompts (`list[str]`):
        List of text prompts for which the model will generate completions.
    images (`list[PIL.Image]` or `None`, *optional*, defaults to `None`):
        List of PIL Images to send along with the prompts.
    n (`int`, *optional*, defaults to `1`):
        Number of completions to generate for each prompt.
    repetition_penalty (`float`, *optional*, defaults to `1.0`):
        Parameter for repetition penalty. 1.0 means no penalty.
    temperature (`float`, *optional*, defaults to `1.0`):
        Temperature parameter for sampling. Higher values increase diversity.
    top_p (`float`, *optional*, defaults to `1.0`):
        Top-p sampling parameter.`1.0` means no truncation.
    top_k (`int`, *optional*, defaults to `-1`):
        Top-k sampling parameter. `-1` means no truncation.
    min_p (`float`, *optional*, defaults to `0.0`):
        Minimum probability for sampling.
    max_tokens (`int`, *optional*, defaults to `16`):
        Maximum number of tokens to generate for each prompt.
    guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`):
        Regular expression to guide the decoding process.
    generation_kwargs (`dict` or `None`, *optional*, defaults to `None`):
        Additional generation parameters to pass to the vLLM `SamplingParams`. This can include parameters like
        `seed`, `frequency_penalty`, etc. If it contains keys that conflict with the other parameters, they
        will override them.

Returns:
    `dict` with keys:
        - `completion_ids` (`list[list[int]]`):
            List of lists of token IDs representing the model-generated completions for each prompt.
        - `logprobs` (`list[list[float]]`):
            List of lists of log probabilities for each generated token.
z
/generate/c                     [        5       nU R                  USS9  UR                  5       n[        R                  " U5      R                  S5      $ )NPNG)formatzutf-8)r   savegetvaluebase64	b64encodedecode)imagebuffer	img_bytess      r(   pil_to_base64*VLLMClient.generate.<locals>.pil_to_base64   sA    YFJJveJ,)I##I.55g>>r+   N)r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   jsonr/   completion_idslogprobs)r[   r\   Request failed: , )r   r   postr2   rZ   	Exceptiontext)r&   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   r9   rW   imgr;   json_responses                    r(   generateVLLMClient.generate   s    d z*	? <B7-$7t<<$$" &8*()>%6%<" % 
  3&$MMOM&34D&ES`akSlmm.x/C/C.DBx}}oVWW- 8s   B3devicec                 &   U R                    S3n[        R                  " U5      nUR                  S:X  a  UR	                  5       S   nO%[        SUR                   SUR                   35      eUS-   nX@l        U R                    S3n[        [        R                  R                  U5      R                  5      nU R                  R                  USU R                  UUS	.S
9nUR                  S:w  a%  [        SUR                   SUR                   35      e[         R"                  " S5        [$        R&                  " U R(                  U R                  U R                  US9n[+        XqS9U l        [.        R0                  " U R2                  5        g)ar  
Initializes the weight update group in a distributed setup for model synchronization.

Args:
    device (`torch.device`, `str`, or `int`, *optional*, defaults to `0`):
        Device of trainer main process. It's the device that will be used for the weights synchronization. Can
        be a `torch.device` object, a string like `'cuda:0'`, or an integer device index.
z/get_world_size/r/   
world_sizer]   r^      z/init_communicator/0.0.0.0)r   portrh   client_device_uuidrY   g?)r   rk   rankrh   rf   N)r   r   r1   r2   rZ   r`   ra   rm   strtorchcudaget_device_propertiesuuidr   r_   r   r0   r8   r   creater   r   pynccl_commatexitregisterclose_communicator)r&   rf   r9   r;   vllm_world_sizerh   rl   pgs           r(   init_communicatorVLLMClient.init_communicator   sd    /0<<$3&&mmol;O.x/C/C.DBx}}oVWW$q(
#	 23 !A!A&!I!N!NO <<$$!(&8	 % 
 3&.x/C/C.DBx}}oVWW
 	

3 #))tyytUYU^U^kuv-b@ 	//0r+   nameweightsc                    [        UR                  5      [        UR                  5      pCU R                   S3nU R
                  R                  XQX4S.S9nUR                  S:w  a%  [        SUR                   SUR                   35      eU R                  R                  X R                  S9  U R                  R                  R                  5         g)	z
Updates a specific named parameter in the model and broadcasts it to other processes.

Args:
    name (`str`):
        Name of the layer whose weights are being updated.
    weights (`torch.Tensor`):
        Tensor containing the updated weights.
z/update_named_param/)r}   dtypeshaperY   r/   r]   r^   )srcN)ro   r   tupler   r   r   r_   r2   r`   ra   ru   	broadcastrm   groupbarrier)r&   r}   r~   r   r   r9   r;   s          r(   update_named_paramVLLMClient.update_named_param,  s     7==)5+?u34<<$$Su/]$^3&.x/C/C.DBx}}oVWW 	""7		":&&(r+   modelc                 l    UR                  5        H   u  p#U R                  X#R                  5        M"     g)z
Updates all parameters of the given model by calling `update_named_param` for each parameter in the model.

Args:
    model (`nn.Module`):
        Model whose parameters (weights/biases) are to be updated.
N)named_parametersr   data)r&   r   r}   params       r(   update_model_paramsVLLMClient.update_model_params@  s+     !113KD##D**5 4r+   c                     U R                    S3nU R                  R                  U5      nUR                  S:w  a%  [	        SUR                   SUR
                   35      eg)z(
Resets the prefix cache for the model.
z/reset_prefix_cache/r/   r]   r^   N)r   r   r_   r2   r`   ra   r&   r9   r;   s      r(   reset_prefix_cacheVLLMClient.reset_prefix_cacheL  s`     34<<$$S)3&.x/C/C.DBx}}oVWW 'r+   c                     U R                    S3n U R                  R                  U5      nUR                  S:w  a%  [	        SUR                   SUR
                   35      eg! [         a     gf = f)zG
Closes the weight update group and cleans up the communication group.
z/close_communicator/r/   r]   r^   N)r   r   r_   r2   r`   ra   r   r   s      r(   rx   VLLMClient.close_communicatorU  s|     34	\||((-H
 ##s*"283G3G2H8==/ Z[[ +	  		s   A" "
A/.A/)r   r   r   ru   rm   r   r   )Nrj   i@  i          )r   g       @)
Nri         ?r   r   r      NN)r   )__name__
__module____qualname____firstlineno____doc__r   ro   intfloatr)   r%   listdictrd   r   rp   rf   r{   Tensorr   r   Moduler   r   rx   __static_attributes__ r+   r(   r   r   .   s   7v #'$'.3-. . 	.
 . ".8"'% "'u "'N "&$' /3,0QXcQX QX 	QX
 "QX QX QX QX QX QX  (}QX $D>QX 
d3iQXf/1ellC.D(E /1b)s )U\\ )(
6 
6X\r+   r   __main__)SamplingParamsrq   rn   z
Hello, AI!zTell me a joke       )rA   rG   sampling_paramsz
Responses:)AutoModelForCausalLMzQwen/Qwen2.5-7B)+rv   rQ   loggingr   r0   ior   typingr   r   urllib.parser   rp   r   import_utilsr	   r
   r   r   r   ,vllm.distributed.device_communicators.pyncclr   vllm.distributed.utilsr   3vllm_ascend.distributed.device_communicators.pyhcclr   	getLoggerr   r4   r   vllmr   clientr{   rd   	responsesprinttransformersr   from_pretrainedtor   r   r   r+   r(   <module>r      s          " !   ] ] ( O<!!p 
		8	$t\ t\p	 z#\F
F+ /? @ARTftfvwI	,	" 2 001BCFFvNE
u% r+   