ó
    ²—ýhÇ<  ã                   ó.  • S SK r S SKrS SKrS SKrS SKrS SKJr  S SKJrJ	r	  S SK
Jr  S SKrS SKJr  SSKJrJrJr  \" 5       (       a
  S SKrS SKJr  \" 5       (       a  S S	KJr  S S
KJr  \" 5       (       a  S SKJr  \R4                  " \5      r " S S5      r\S:X  aw  S SKJr  \" 5       r \ RC                  SS9  \ RE                  SS/SS\" 5       S9r#\$" S\#5        S SK%J&r&  \&RN                  " S5      RQ                  S5      r)\ RU                  \)5        gg)é    N)ÚBytesIO)ÚOptionalÚUnion)Úurlparse)Únné   )Úis_requests_availableÚis_vllm_ascend_availableÚis_vllm_available)ÚConnectionError)ÚPyNcclCommunicator)ÚStatelessProcessGroup)ÚPyHcclCommunicatorc                   ó\  • \ rS rSrSr     S$S\\   S\S\S\S\4
S	 jjr	S%S
\S\4S jjr
          S&S\\   S\\   S\S\S\S\S\S\S\S\\   S\\   S\\\      4S jjrS'S\\R                   \\4   4S jjrS\S\R$                  4S jrS\R*                  4S  jrS! rS" rS#rg)(Ú
VLLMClienté.   aÒ  
A client class to interact with a vLLM server.

This class provides methods to generate completions, initialize and manage weight update groups, and update model
weights in a distributed setting. Before using it, start the vLLM server with `trl vllm-serve`.

Args:
    base_url (`str` or `None`, *optional*, defaults to `None`):
        Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `host` and `server_port` are
        ignored.
    host (`str`, *optional*, defaults to `"0.0.0.0"`):
        IP address of the vLLM server. Ignored if `base_url` is provided.
    server_port (`int`, *optional*, defaults to `8000`):
        Port number of the vLLM server. Ignored if `base_url` is provided.
    group_port (`int`, *optional*, defaults to `51216`):
        Port number for the weight update group.
    connection_timeout (`float`, *optional*, defaults to `0.0`):
        Total timeout duration in seconds to wait for the server to be up. If the server is not up after the
        timeout, a `ConnectionError` is raised.

Examples:
    Run the vLLM server with the model `Qwen/Qwen2.5-7B`:

    ```
    $ trl vllm-serve --model Qwen/Qwen2.5-7B
    ...
    INFO:     Application startup complete.
    INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
    ```

    Use the client to generate completions and update model weights:

    ```python
    >>> from trl.extras.vllm_client import VLLMClient

    >>> client = VLLMClient()
    >>> client.generate(["Hello, AI!", "Tell me a joke"])
    [[2980, 498, 1492, 752, 448, 264, 13027, 8645, 30, 358, 2776, 4460, 311, 3270, 264, 2025],
     [911, 7988, 1251, 382, 3838, 653, 498, 1618, 4325, 879, 2581, 20027, 264, 21428, 30, 362]]

    >>> from transformers import AutoModelForCausalLM

    >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-7B", device_map="cuda")
    >>> client.init_communicator(device="cuda")
    >>> client.update_model_params(model)
    ```

    There are several ways to initialize the client:

    ```python
    VLLMClient(base_url="http://localhost:8000")
    VLLMClient(base_url="http://192.168.1.100:8000")
    VLLMClient(host="localhost", server_port=8000)
    VLLMClient(host="192.168.1.100", server_port=8000)
    ```
NÚbase_urlÚhostÚserver_portÚ
group_portÚconnection_timeoutc                 óþ  • [        5       (       d  [        S5      e[        5       (       d  [        S5      e[        R                  " 5       U l        Ubh  [        U5      n[        R                  " UR                  5      U l
        UR                  =(       d    SnU SUR                   UR                   3U l        O-X l
        X0l        SU R                   SU R                   3U l        X@l        U R#                  U5        g )NzIrequests is not installed. Please install it with `pip install requests`.zAvLLM is not installed. Please install it with `pip install vllm`.Úhttpz://zhttp://Ú:)r	   ÚImportErrorr   ÚrequestsÚSessionÚsessionr   ÚsocketÚgethostbynameÚhostnamer   ÚschemeÚnetlocÚpathr   r   r   Úcheck_server)Úselfr   r   r   r   r   Ú
parsed_urlr"   s           ÚP/home/james-whalen/.local/lib/python3.13/site-packages/trl/extras/vllm_client.pyÚ__init__ÚVLLMClient.__init__h   sÖ   € ô %×&Ñ&ÜÐiÓjÐjÜ ×"Ñ"ÜÐaÓbÐbä×'Ò'Ó)ˆŒàÑä! (Ó+ˆJÜ×,Ò,¨Z×-@Ñ-@ÓAˆDŒIØ×&Ñ&×0¨&ˆFØ%˜h c¨*×*;Ñ*;Ð)<¸Z¿_¹_Ð<MÐNˆDMàŒIØ*ÔØ% d§i¡i [°°$×2BÑ2BÐ1CÐDˆDŒMØ$ŒØ×ÑÐ,Õ-ó    Útotal_timeoutÚretry_intervalc                 ó<  • U R                    S3n[        R                  " 5       n  [        R                  " U5      nUR                  S:X  a:  SUR
                  ;   a  UR
                  S   U l        [        R                  S5        g [        R                  S	U S
35        [        R                  " U5        M“  ! [        R                  R                   aD  n[        R                  " 5       U-
  nXq:¼  a  [        SU R                    SU S35      Ue SnANŽSnAff = f)a‚  
Check server availability with retries on failure, within a total timeout duration. If the server is not up
after the total timeout duration, raise a `ConnectionError`.

Args:
    retry_interval (`float`, *optional*, defaults to `2.0`):
        Interval in seconds between retries.
    total_timeout (`float`, *optional*, defaults to `0.0`):
        Total timeout duration in seconds.
z/health/éÈ   zX-Forwarded-ForzServer is up!Nz$The vLLM server can't be reached at z after zF seconds. Make sure the server is running by running `trl vllm-serve`.z"Server is not up yet. Retrying in z seconds...)r   Útimer   ÚgetÚstatus_codeÚheadersr   ÚloggerÚinfoÚ
exceptionsÚRequestExceptionr   Úsleep)r&   r,   r-   ÚurlÚ
start_timeÚresponseÚexcÚelapsed_times           r(   r%   ÚVLLMClient.check_server„   s  € ð —‘˜xÐ(ˆÜ—Y’Y“[ˆ
àð Ü#Ÿ<š<¨Ó,ð ×'Ñ'¨3Ó.Ø(¨H×,<Ñ,<Ó<Ø$,×$4Ñ$4Ð5FÑ$G˜œ	Ü—K‘K Ô0Øð	 /ô K‰KÐ<¸^Ð<LÈKÐXÔYÜJŠJ~Ô&ñ) øô ×&Ñ&×7Ñ7ó ä#Ÿyšy›{¨ZÑ7ØÓ0Ü)Ø>¸t¿}¹}¸oÈWÐUbÐTcð dRð Róð ðô 1ûðús   §B9 Â9DÃ:DÄDÚpromptsÚimagesÚnÚrepetition_penaltyÚtemperatureÚtop_pÚtop_kÚmin_pÚ
max_tokensÚguided_decoding_regexÚgeneration_kwargsÚreturnc                 óp  • U R                    S3nS nU(       a  U Vs/ s H
  oí" U5      PM     snOSnU R                  R                  UUUUUUUUUU	U
U=(       d    0 S.S9nUR                  S:X  a  UR	                  5       nUS   US   S	.$ [        S
UR                   SUR                   35      es  snf )a<  
Generates model completions for the provided prompts.

Args:
    prompts (`list[str]`):
        List of text prompts for which the model will generate completions.
    images (`list[PIL.Image]` or `None`, *optional*, defaults to `None`):
        List of PIL Images to send along with the prompts.
    n (`int`, *optional*, defaults to `1`):
        Number of completions to generate for each prompt.
    repetition_penalty (`float`, *optional*, defaults to `1.0`):
        Parameter for repetition penalty. 1.0 means no penalty.
    temperature (`float`, *optional*, defaults to `1.0`):
        Temperature parameter for sampling. Higher values increase diversity.
    top_p (`float`, *optional*, defaults to `1.0`):
        Top-p sampling parameter.`1.0` means no truncation.
    top_k (`int`, *optional*, defaults to `-1`):
        Top-k sampling parameter. `-1` means no truncation.
    min_p (`float`, *optional*, defaults to `0.0`):
        Minimum probability for sampling.
    max_tokens (`int`, *optional*, defaults to `16`):
        Maximum number of tokens to generate for each prompt.
    guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`):
        Regular expression to guide the decoding process.
    generation_kwargs (`dict` or `None`, *optional*, defaults to `None`):
        Additional generation parameters to pass to the vLLM `SamplingParams`. This can include parameters like
        `seed`, `frequency_penalty`, etc. If it contains keys that conflict with the other parameters, they
        will override them.

Returns:
    `dict` with keys:
        - `completion_ids` (`list[list[int]]`):
            List of lists of token IDs representing the model-generated completions for each prompt.
        - `logprobs` (`list[list[float]]`):
            List of lists of log probabilities for each generated token.
z
/generate/c                 ó    • [        5       nU R                  USS9  UR                  5       n[        R                  " U5      R                  S5      $ )NÚPNG)Úformatzutf-8)r   ÚsaveÚgetvalueÚbase64Ú	b64encodeÚdecode)ÚimageÚbufferÚ	img_bytess      r(   Úpil_to_base64Ú*VLLMClient.generate.<locals>.pil_to_base64Ü   sA   € Ü“YˆFØJ‰Jv eˆJÑ,ØŸ™Ó)ˆIÜ×#Ò# IÓ.×5Ñ5°gÓ>Ð>r+   N)r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   ©Újsonr/   Úcompletion_idsÚlogprobs)r[   r\   úRequest failed: ú, )r   r   Úpostr2   rZ   Ú	ExceptionÚtext)r&   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   r9   rW   Úimgr;   Újson_responses                    r(   ÚgenerateÚVLLMClient.generate¨   sÝ   € ðd —‘˜zÐ*ˆò	?ö <B±Ó7²¨- Ö$±Ò7Àtˆà—<‘<×$Ñ$Øà"Ø ØØ&8Ø*ØØØØ(Ø)>Ø%6×%<¸"ñð %ð 
ˆð  ×Ñ 3Ó&Ø$ŸM™M›OˆMØ&3Ð4DÑ&EÐS`ÐakÑSlÑmÐmäÐ.¨x×/CÑ/CÐ.DÀBÀxÇ}Á}ÀoÐVÓWÐWùò- 8s   žB3Údevicec                 ó&  • U R                    S3n[        R                  " U5      nUR                  S:X  a  UR	                  5       S   nO%[        SUR                   SUR                   35      eUS-   nX@l        U R                    S3n[        [        R                  R                  U5      R                  5      nU R                  R                  USU R                  UUS	.S
9nUR                  S:w  a%  [        SUR                   SUR                   35      e[         R"                  " S5        [$        R&                  " U R(                  U R                  U R                  US9n[+        XqS9U l        [.        R0                  " U R2                  5        g)ar  
Initializes the weight update group in a distributed setup for model synchronization.

Args:
    device (`torch.device`, `str`, or `int`, *optional*, defaults to `0`):
        Device of trainer main process. It's the device that will be used for the weights synchronization. Can
        be a `torch.device` object, a string like `'cuda:0'`, or an integer device index.
z/get_world_size/r/   Ú
world_sizer]   r^   é   z/init_communicator/ú0.0.0.0)r   Úportrh   Úclient_device_uuidrY   gš™™™™™¹?)r   rk   Úrankrh   ©rf   N)r   r   r1   r2   rZ   r`   ra   rm   ÚstrÚtorchÚcudaÚget_device_propertiesÚuuidr   r_   r   r0   r8   r   Úcreater   r   Úpynccl_commÚatexitÚregisterÚclose_communicator)r&   rf   r9   r;   Úvllm_world_sizerh   rl   Úpgs           r(   Úinit_communicatorÚVLLMClient.init_communicatorû   sd  € ð —‘Ð/Ð0ˆÜ—<’< Ó$ˆØ×Ñ 3Ó&Ø&Ÿm™m›o¨lÑ;‰OäÐ.¨x×/CÑ/CÐ.DÀBÀxÇ}Á}ÀoÐVÓWÐWà$ qÑ(ˆ
Ø#Œ	ð —‘Ð2Ð3ˆÜ ¤§¡×!AÑ!AÀ&Ó!I×!NÑ!NÓOÐð —<‘<×$Ñ$Øà!ØŸ™Ø(Ø&8ñ	ð %ð 
ˆð ×Ñ 3Ó&ÜÐ.¨x×/CÑ/CÐ.DÀBÀxÇ}Á}ÀoÐVÓWÐWô
 	
Š
3Œô #×)Ò)¨t¯y©y¸t¿¹ÐUY×U^ÑU^ÐkuÑvˆÜ-¨bÑ@ˆÔô 	Š˜×/Ñ/Õ0r+   ÚnameÚweightsc                 ó¤  • [        UR                  5      [        UR                  5      pCU R                   S3nU R
                  R                  XQX4S.S9nUR                  S:w  a%  [        SUR                   SUR                   35      eU R                  R                  X R                  S9  U R                  R                  R                  5         g)	zø
Updates a specific named parameter in the model and broadcasts it to other processes.

Args:
    name (`str`):
        Name of the layer whose weights are being updated.
    weights (`torch.Tensor`):
        Tensor containing the updated weights.
z/update_named_param/)r}   ÚdtypeÚshaperY   r/   r]   r^   )ÚsrcN)ro   r€   Útupler   r   r   r_   r2   r`   ra   ru   Ú	broadcastrm   ÚgroupÚbarrier)r&   r}   r~   r€   r   r9   r;   s          r(   Úupdate_named_paramÚVLLMClient.update_named_param,  s±   € ô ˜7Ÿ=™=Ó)¬5°·±Ó+?ˆuØ—‘Ð3Ð4ˆØ—<‘<×$Ñ$ SÀuÑ/]Ð$Ð^ˆØ×Ñ 3Ó&ÜÐ.¨x×/CÑ/CÐ.DÀBÀxÇ}Á}ÀoÐVÓWÐWð 	×Ñ×"Ñ" 7·	±	Ð"Ñ:Ø×Ñ×Ñ×&Ñ&Õ(r+   Úmodelc                 ól   • UR                  5        H   u  p#U R                  X#R                  5        M"     g)zÏ
Updates all parameters of the given model by calling `update_named_param` for each parameter in the model.

Args:
    model (`nn.Module`):
        Model whose parameters (weights/biases) are to be updated.
N)Únamed_parametersr‡   Údata)r&   r‰   r}   Úparams       r(   Úupdate_model_paramsÚVLLMClient.update_model_params@  s+   € ð !×1Ñ1Ö3‰KˆDà×#Ñ# D¯*©*Ö5ò 4r+   c                 óÂ   • U R                    S3nU R                  R                  U5      nUR                  S:w  a%  [	        SUR                   SUR
                   35      eg)z(
Resets the prefix cache for the model.
z/reset_prefix_cache/r/   r]   r^   N)r   r   r_   r2   r`   ra   ©r&   r9   r;   s      r(   Úreset_prefix_cacheÚVLLMClient.reset_prefix_cacheL  s`   € ð —‘Ð3Ð4ˆØ—<‘<×$Ñ$ SÓ)ˆØ×Ñ 3Ó&ÜÐ.¨x×/CÑ/CÐ.DÀBÀxÇ}Á}ÀoÐVÓWÐWð 'r+   c                 óä   • U R                    S3n U R                  R                  U5      nUR                  S:w  a%  [	        SUR                   SUR
                   35      eg! [         a     gf = f)zG
Closes the weight update group and cleans up the communication group.
z/close_communicator/r/   r]   r^   N)r   r   r_   r2   r`   ra   r   r‘   s      r(   rx   ÚVLLMClient.close_communicatorU  s|   € ð —‘Ð3Ð4ˆð	\Ø—|‘|×(Ñ(¨Ó-ˆHð
 ×#Ñ# sÓ*ÜÐ"2°8×3GÑ3GÐ2HÈÈ8Ï=É=È/Ð ZÓ[Ð[ð +øô	 ó 	áð	ús   ‘A" Á"
A/Á.A/)r   r   r   ru   rm   r   r   )Nrj   i@  iÈ  ç        )r–   g       @)
Nri   ç      ð?r—   r—   éÿÿÿÿr–   é   NN)r   )Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú__doc__r   ro   ÚintÚfloatr)   r%   ÚlistÚdictrd   r   rp   rf   r{   ÚTensorr‡   r   ÚModulerŽ   r’   rx   Ú__static_attributes__© r+   r(   r   r   .   s„  † ñ7ðv #'ØØØØ$'ñ.à˜3‘-ð.ð ð.ð ð	.ð
 ð.ð "õ.ñ8"'¨%ð "'Àuõ "'ðN "&ØØ$'Ø ØØØØØ/3Ø,0ñQXàc‘ðQXð ˜‘ðQXð ð	QXð
 "ðQXð ðQXð ðQXð ðQXð ðQXð ðQXð  (¨™}ðQXð $ D™>ðQXð 
ˆd3‰i‰õQXñf/1¨¨e¯l©l¸CÀÐ.DÑ(Eõ /1ðb) sð )°U·\±\ô )ð(
6¨¯©ô 
6òXõ\r+   r   Ú__main__)ÚSamplingParamsrq   rn   z
Hello, AI!zTell me a jokeé   é    )rA   rG   Úsampling_paramsz
Responses:)ÚAutoModelForCausalLMzQwen/Qwen2.5-7B)+rv   rQ   Úloggingr   r0   Úior   Útypingr   r   Úurllib.parser   rp   r   Úimport_utilsr	   r
   r   r   r   Ú,vllm.distributed.device_communicators.pyncclr   Úvllm.distributed.utilsr   Ú3vllm_ascend.distributed.device_communicators.pyhcclr   Ú	getLoggerrš   r4   r   Úvllmr¨   Úclientr{   rd   Ú	responsesÚprintÚtransformersr¬   Úfrom_pretrainedÚtor‰   rŽ   r¦   r+   r(   Ú<module>r½      s   ðó Û Û Û Û Ý ß "Ý !ã Ý ç ]Ñ ]ñ ×ÑÛÝ(ñ ×ÑÝOÝ<á×!Ñ!Ýpð 
×	Ò	˜8Ó	$€÷t\ñ t\ðp	 ˆzÓÝ#á‹\€FØ
×Ñ FÐÑ+ð —‘ Ð/?Ð @ÀAÐRTÑftÓfvÐw€IÙ	ˆ,˜	Ô"õ 2à ×0Ò0Ð1BÓC×FÑFÀvÓN€EØ
×Ñ˜uÕ%ð r+   