
    cCi                        S r SSKrSSKJrJr  SSKrSSKJr  SSKJr  SSK	J
r
JrJr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJr  SSKJr  SSKJr  \R@                  " \!5      r" " S S\RF                  5      r$ " S S\RJ                  5      r& " S S\RJ                  5      r' " S S\5      r(\ " S S\5      5       r)\ " S S\)5      5       r*\" SS9 " S S \)\5      5       r+/ S!Qr,g)"zPyTorch XGLM model.    N)OptionalUnion)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)auto_docstringlogging)deprecate_kwarg   )
XGLMConfigc            
       r   ^  \ rS rSrSrSS\S\S\S\\   4U 4S jjjrS\	R                  4U 4S	 jjrS
rU =r$ )XGLMScaledWordEmbedding'   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
num_embeddingsembedding_dimpadding_idxembed_scalec                 2   > [         TU ]  XU5        X@l        g N)super__init__r   )selfr   r   r   r   	__class__s        `/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/xglm/modeling_xglm.pyr!    XGLMScaledWordEmbedding.__init__,   s    D&    	input_idsc                 <   > [         TU ]  U5      U R                  -  $ r   )r    forwardr   )r"   r'   r#   s     r$   r)   XGLMScaledWordEmbedding.forward0   s    wy)D,<,<<<r&   r   )      ?)__name__
__module____qualname____firstlineno____doc__intr   floatr!   torchTensorr)   __static_attributes____classcell__r#   s   @r$   r   r   '   sJ    's '3 'S '_ghm_n ' '= = =r&   r   c            	          ^  \ rS rSrSrSS\S\S\\   4U 4S jjjrSS\S\S\\   4S jjr\	SS\S\S\\   4S	 jj5       r
\R                  " 5       SS
\\R                     S\4S jj5       rSrU =r$ )!XGLMSinusoidalPositionalEmbedding4   zDThis module produces sinusoidal positional embeddings of any length.num_positionsr   r   c                    > [         TU ]  5         SU l        X l        X0l        U R                  XR                  -   X#5        g )N   )r    r!   offsetr   r   make_weights)r"   r<   r   r   r#   s       r$   r!   *XGLMSinusoidalPositionalEmbedding.__init__7   s8    *&-++5}Rr&   r   c                     U R                  XU5      n[        U S5      (       a8  UR                  U R                  R                  U R                  R
                  S9nU R                  SUSS9  g )NweightsdtypedeviceF)
persistent)get_embeddinghasattrtorC   rE   rF   register_buffer)r"   r   r   r   emb_weightss        r$   r@   .XGLMSinusoidalPositionalEmbedding.make_weights>   s\    ((T4##%..t||/A/A$,,J]J].^KYFr&   c                    US-  n[         R                  " S5      US-
  -  n[        R                  " [        R                  " U[        R
                  S9R                  5       U* -  5      n[        R                  " U [        R
                  S9R                  5       R                  S5      UR                  S5      -  n[        R                  " [        R                  " U5      [        R                  " U5      /SS9R                  U S5      nUS-  S:X  a,  [        R                  " U[        R                  " U S5      /SS9nUb  SXBSS24'   UR                  [        R                  " 5       5      $ )	z
Build sinusoidal embeddings.

This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
"Attention Is All You Need".
r>   i'  r   )rE   r   dimN)mathlogr4   exparangeint64r3   	unsqueezecatsincosviewzerosrJ   get_default_dtype)r   r   r   half_dimembs        r$   rH   /XGLMSinusoidalPositionalEmbedding.get_embeddingF   s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r&   position_idspast_key_values_lengthc                    UR                  5       u  p4XR                  -  nSU-   U-   nXPR                  R                  S5      :  a&  U R                  XPR                  U R
                  5        U R                  R                  SUR                  S5      5      R                  X4U R                  R                  S   5      R                  5       $ )Nr>   r   rQ   )
sizer?   rC   r@   r   r   index_selectr[   shapedetach)r"   ra   rb   bszseq_lenmax_poss         r$   r)   )XGLMSinusoidalPositionalEmbedding.forward[   s    #((*# g+ 66\\&&q))g'9'94;K;KL||((L,=,=b,ABGGVZVbVbVhVhikVlmttvvr&   )r   r?   r   r   )Nr   )r-   r.   r/   r0   r1   r2   r   r!   r@   staticmethodrH   r4   no_gradr5   r)   r6   r7   r8   s   @r$   r:   r:   4   s    NSc S# SHUXM S SG3 Gs GQYZ]Q^ G 1c 1# 1HUXM 1 1( ]]_	wHU\\$: 	w[^ 	w 	wr&   r:   c                     ^  \ rS rSrSr    SS\S\S\\   S\\   S\\   S\\   4U 4S	 jjjr	\
" S
SSS9      SS\R                  S\\R                     S\\   S\\R                     S\\R                     S\S\\R                     S\\R                  \\R                     \\\R                        4   4S jj5       rSrU =r$ )XGLMAttentionh   z=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsdropout
is_decoderbias	layer_idxc                   > [         TU ]  5         Xl        X l        X0l        X-  U l        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l        X`l	        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩ru   )r    r!   rq   rr   rs   head_dim
ValueErrorscalingrt   rv   r   Lineark_projv_projq_projout_proj)r"   rq   rr   rs   rt   ru   rv   r#   s          r$   r!   XGLMAttention.__init__k   s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TBr&   past_key_valuepast_key_values4.58new_nameversionhidden_stateskey_value_statesattention_masklayer_head_maskoutput_attentionscache_positionreturnc                    USLnUR                  5       u  pnU(       a  UR                  S   OU
nU R                  U5      U R                  -  nSnUb]  [	        U[
        5      (       aF  UR                  R                  U R                  5      nU(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R                  U5      nUR!                  XSU R"                  5      R%                  SS5      nUR!                  XSU R"                  5      R%                  SS5      nUbc  U(       d  UOSnWR'                  UUU R                  SU05      u  nnU(       a.  [	        U[
        5      (       a  SUR                  U R                  '   XR(                  -  SU R"                  4nUR!                  XU R(                  U R"                  5      R%                  SS5      nUR*                  " U6 nUR*                  " U6 nUR*                  " U6 nUR                  S5      n[,        R.                  " UUR%                  SS5      5      nUR                  5       XR(                  -  X4:w  a-  [1        SXR(                  -  X4 S	UR                  5        35      eUb  UR                  5       U	SX4:w  a"  [1        S
U	SX4 S	UR                  5        35      eUR!                  XR(                  X5      U-   n[,        R2                  " U[,        R4                  " [,        R6                  " UR8                  5      R:                  UR<                  S95      nUR!                  XR(                  -  X5      nUR8                  [,        R>                  :X  aK  [@        RB                  RE                  US[,        RF                  S9RI                  [,        R>                  5      nO[@        RB                  RE                  USS9nUb  UR                  5       U R(                  4:w  a*  [1        SU R(                  4 S	UR                  5        35      eUR!                  SSSS5      UR!                  XR(                  X5      -  nUR!                  XR(                  -  X5      nU(       a;  UR!                  XR(                  X5      nUR!                  XR(                  -  X5      nOSn[@        RB                  RK                  UU RJ                  U RL                  S9n[,        R.                  " UU5      nUR                  5       XR(                  -  XR"                  4:w  a5  [1        SXR(                  XR"                  4 S	UR                  5        35      eUR!                  XR(                  XR"                  5      nUR%                  SS5      nUR+                  XU RN                  5      nU RQ                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelNr   FrQ   r>   r   Tz$Attention weights should be of size z	, but is z!Attention mask should be of size )rF   )rP   rE   rO   z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size ))rd   rf   r   r{   
isinstancer
   
is_updatedgetrv   cross_attention_cacheself_attention_cachelayerskeysvaluesr}   r~   r[   ry   	transposeupdaterr   reshaper4   bmmrz   maxtensorfinforE   minrF   float16r   
functionalsoftmaxfloat32rJ   rs   r   rq   r   )r"   r   r   r   r   r   r   r   is_cross_attentionrh   tgt_len_src_lenquery_statesr   curr_past_key_valuecurrent_states
key_statesvalue_states
proj_shapeattn_weightsattn_weights_reshaped
attn_probsattn_outputs                           r$   r)   XGLMAttention.forward   s    .T9',,.a/A"((+w {{=1DLL@
&/+>??,77;;DNNK
%*9*O*O'*9*N*N'&5#-?)]/"=*,33DNNCHHJ.55dnnELLL^4J;;~6L#r4==ISSTUWXYJ',,S2t}}MWWXY[\]L*7It+>+E+Ednn?OQ_>`,(
L &*_FY*Z*ZAEO..t~~>NN*B>
#((t~~t}}U__`acde#++Z8''4
#++Z8//!$yyz/C/CAq/IJ3#7"JJ6nn8Lg7_6` a %%'(* 
 %""$a(BB 7a8R7SS\]k]p]p]r\st  (,,S..'SVddL 99ell5;;|7I7I+J+N+NWcWjWjkL (,,S>>-A7TL .==002U]]0[^^_d_l_lmL==0020FL&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfm?wwL',,S>>-A7TL
 %1$5$5c>>7$\!055cNN6JG]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1 "))#GmmK0111r&   )rs   rq   ry   rt   r}   rv   rr   r   r   r{   r~   )        FTN)NNNNFN)r-   r.   r/   r0   r1   r2   r   r3   boolr!   r   r4   r5   r   tupler)   r6   r7   r8   s   @r$   ro   ro   h   sO   G $'%*#$(CC C %	C
 TNC tnC D>C C: %0A6R 48+/1526"'15}2||}2 #5<<0}2 "%	}2
 !.}2 "%,,/}2  }2 !.}2 
u||Xell3XeELL>Q5RR	S}2 S}2r&   ro   c                   l  ^  \ rS rSrSS\4U 4S jjjr\" SSSS9         SS\R                  S	\	\R                     S
\	\R                     S\	\R                     S\	\R                     S\	\R                     S\	\
   S\	\   S\	\   S\	\R                     S\R                  4S jj5       rSrU =r$ )XGLMDecoderLayeri	  configc                 8  > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  SUS9U l        UR                  U l        [        UR                     U l        UR                  U l        UR                  (       aU  [	        U R                  UR
                  UR                  SUS9U l        [        R                   " U R                  5      U l        [        R                   " U R                  5      U l        [        R&                  " U R                  UR(                  5      U l        [        R&                  " UR(                  U R                  5      U l        [        R                   " U R                  5      U l        g )NT)rq   rr   rs   rt   rv   )r    r!   d_modelrq   ro   attention_headsattention_dropout	self_attnrs   r   activation_functionactivation_fnactivation_dropoutadd_cross_attentionencoder_attnr   	LayerNormencoder_attn_layer_normself_attn_layer_normr|   ffn_dimfc1fc2final_layer_norm)r"   r   rv   r#   s      r$   r!   XGLMDecoderLayer.__init__
  s   &nn,,,,
 ~~#F$>$>?"(";";%% -.. 0000#!D ,.<<+GD($&LL$@!99T^^V^^<99V^^T^^< "T^^ <r&   r   r   r   r   r   r   encoder_hidden_statesencoder_attention_maskr   cross_attn_layer_head_maskr   	use_cacher   r   c           
         UnU R                  U5      nU R                  UUUUUU
S9u  p[        R                  R	                  XR                  U R
                  S9nX-   nSnUb`  UnU R                  U5      nU R                  UUUUUUU
S9u  p[        R                  R	                  XR                  U R
                  S9nX-   nUnU R                  U5      nU R                  U R                  U5      5      n[        R                  R	                  XR                  U R
                  S9nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nX-   nU4nU(       a  XU4-  nU$ )au  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(encoder_attention_heads,)`.
    cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
        size `(decoder_attention_heads,)`.
    past_key_values (`Cache`): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r   r   r   r   r   r   N)r   r   r   r   r   r   r   )r   r   r   r   rs   r   r   r   r   r   r   r   r   )r"   r   r   r   r   r   r   r   r   r   r   residualself_attn_weightscross_attn_weightsoutputss                  r$   r)   XGLMDecoderLayer.forward(  s   B !11-@ ,0>>'+)+/) ,: ,
( --m||VZVcVc-d 0 " ,$H 88GM040A0A+!65 : /"3- 1B 1-M MM11-<<Z^ZgZg1hM$4M !--m<**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0 "+=>>Gr&   )r   r   rs   rq   r   r   r   r   r   r   r   r   )	NNNNNNFTN)r-   r.   r/   r0   r   r!   r   r4   r5   r   r   r   r)   r6   r7   r8   s   @r$   r   r   	  s   =z = =< %0A6R
 268<9=26=A+/,1$(15N||N !.N  (5	N
 !) 6N "%,,/N %-U\\$:N "%N $D>N D>N !.N 
N SNr&   r   c                   4    \ rS rSr% \\S'   SrSrS/rS r	Sr
g)	XGLMPreTrainedModeli{  r   modelTr   c                 "   U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         g g g )Nr   )meanstd)r   init_stdr   r   r|   weightdatanormal_ru   zero_	Embeddingr   )r"   moduler   s      r$   _init_weights!XGLMPreTrainedModel._init_weights  s    kk""fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> . .r&    N)r-   r.   r/   r0   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r6   r   r&   r$   r   r   {  s"    &*#+,	?r&   r   c            "         ^  \ rS rSrSS\S\\R                     4U 4S jjjr\	              SS\\
R                     S\\
R                     S\\
R                     S\\
R                     S	\\
R                     S
\\
R                     S\\
R                     S\\   S\\
R                     S\\   S\\   S\\   S\\   S\\
R                     S\\\
R                     \4   4S jj5       rSrU =r$ )	XGLMModeli  r   embed_tokensc           
        > [         TU ]  U5        UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR                  (       a   [        R                  " UR                  5      OSnUb  X l        O/[        UR                  UR                  U R
                  US9U l        [        UR                  UR                  UR                  5      U l        ["        R$                  " ['        UR(                  5       Vs/ s H  n[+        XS9PM     sn5      U l        ["        R.                  " UR                  5      U l        SU l        U R5                  5         gs  snf )zB
embed_tokens (`nn.Embedding`, *optional*):
    output embeddings
r,   Nr+   )rv   F)r    r!   rs   	layerdroppad_token_idr   max_position_embeddingsmax_target_positionsscale_embeddingrR   sqrtr   r   r   
vocab_sizer:   embed_positionsr   
ModuleListrange
num_layersr   r   r   
layer_normgradient_checkpointing	post_init)r"   r   r   r   ir#   s        r$   r!   XGLMModel.__init__  s    
 	 ~~))!..$*$B$B!393I3Idii/s# , 7!!6>>43C3CQ\!D  A**NN 

 mmTYZ`ZkZkTl$mTlq%5f%JTl$mn,,v~~6&+# %ns   E4r'   r   ra   r   r   	head_maskcross_attn_head_maskr   inputs_embedsr   r   output_hidden_statesreturn_dictr   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  U	b  [        S5      eUb7  U R                  X5        UR                  5       nUR                  SUS   5      nO"U	b  U	R                  5       SS nO[        S5      eU	c  U R                  U5      n	U R                  (       a/  U R                  (       a  U
(       a  [        R                  S5        Sn
U
(       aG  UcD  Ub.  [        [        U R                   S9[        U R                   S95      O[        U R                   S9nU
(       a@  [!        U["        5      (       a+  [        R                  S5        [        R$                  " U5      nUb  UR'                  5       OS	n[)        X/U	U5      nUcU  [*        R,                  " UUS   U-   [*        R.                  Ub  UR0                  OU	R0                  S
9nUR3                  S	5      nUb  Ub  [5        XYR6                  US   S9nXR9                  UU5      R;                  U	R0                  5      -   n[<        R>                  RA                  U[C        U R@                  5      U R                  S9nU(       a  SOSnU(       a  SOSnU(       a  Ub  SOSn[E        Xg/SS/5       Hn  u  nnUc  M  UR                  5       S	   [G        U RH                  5      :w  d  M7  [        SU S[G        U RH                  5       SUR                  5       S	    S35      e   [K        U RH                  5       H  u  nnU(       a  UU4-  nU R                  (       a(  [*        RL                  " / 5      nUU RN                  :  a  ML  U" UUUUUb  UU   OSUb  UU   OSUUU
US9
nUS	   nU(       d  My  UUS   4-  nUc  M  UUS   4-  nM     U RQ                  U5      nU(       a  UU4-  nU(       d  [#        S UUUUU4 5       5      $ [S        UUUUUS9$ )a  
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
    the decoder.
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
    Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
    selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
NzDYou cannot specify both input_ids and inputs_embeds at the same timerQ   z5You have to specify either input_ids or inputs_embedsz_`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache = False`...F)r   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   rD   )r   r   r   r   r  zThe `z` should be specified for z layers, but it is for .)r   r   r   r   r   r   r   r   r>   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r   ).0vs     r$   	<genexpr>$XGLMModel.forward.<locals>.<genexpr>J  s      rA rs   	)last_hidden_stater   r   
attentionscross_attentions)*r   r   r  r   use_return_dictrz   %warn_if_padding_and_no_attention_maskrd   r[   r   r   r   loggerwarning_oncer
   r	   r   r   from_legacy_cacheget_seq_lengthr   r4   rU   longrF   rW   r   rE   r   rJ   r   r   rs   r3   ziplenr   	enumeraterandr   r   r   )r"   r'   r   ra   r   r   r   r  r   r  r   r   r  r  r   input_shaperb   r   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_nameidxdecoder_layerdropout_probabilitylayer_outputss                              r$   r)   XGLMModel.forward  s   H 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]  ]%>cdd"66yQ#..*K!r;r?;I&',,.s3KTUU  --i8M&&4==##u "	 0 )4 $L$DlZ^ZeZeFfg!5 
 OU;;\
 2CCOTOETE`!?!?!Afg:8N
  <<&B"88jj+4+@y''mFZFZ	L (11!4L !,1G1S%?&(;(;[QS_&" &(<(<\Ka(b(e(e  )
 
 --muT\\?R]a]j]j-k #7BD0d&7<Q<]rdh %((IKYoKp$q Iy$>>#A&#dkk*::$	{*DSEUDV W%NN,Q/03  %r #,DKK"8C#!m%55!}}&+jjn#&7)%'=3<3H3dI]Ii,@,Eos /"3#-M *!,M  =#3"55(4(]1-=,??(7 #9: 6  -!11 ':K^]qr  
 9+++%1
 	
r&   )	rs   r   r   r   r   r   r   r   r   r   )NNNNNNNNNNNNNN)r-   r.   r/   r0   r   r   r   r   r!   r   r4   r5   r   r   r   r   r   r)   r6   r7   r8   s   @r$   r   r     s   z ",,9O  >  -115/38<9=,07;+/04$(,0/3&*15e
ELL)e
 !.e
 u||,	e

  (5e
 !) 6e
 ELL)e
 'u||4e
 "%e
  -e
 D>e
 $D>e
 'tne
 d^e
 !.e
  
uU\\"$MM	N!e
 e
r&   r   z
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc            $         ^  \ rS rSrSrS/rU 4S jr\               SS\\	R                     S\\	R                     S\\	R                     S\\	R                     S	\\	R                     S
\\	R                     S\\	R                     S\\   S\\	R                     S\\	R                     S\\   S\\   S\\   S\\   S\\	R                     S\\\	R                     \4   4 S jj5       rSrU =r$ )XGLMForCausalLMiX  r   zlm_head.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NFrx   )
r    r!   r   r   r   r|   hidden_sizer   lm_headr   )r"   r   r#   s     r$   r!   XGLMForCausalLM.__init__b  sH     v&
yy!3!3V5F5FUS 	r&   r'   r   ra   r   r   r   r  r   r  labelsr   r   r  r  r   r   c                 .   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  UUUUUUUUU	UUUUUS9nU R                  US   5      nSnU
b?  U R                  " UU
4U R                   R                  U R                   R                  S.UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a  
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
    the decoder.
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
    Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
    selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
cross_attn_head_mask (`torch.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
N)r'   r   ra   r   r   r   r  r   r  r   r   r  r  r   r   )r   r   r   )losslogitsr   r   r  r  )r   r   r  r  r   r*  loss_functionr   r   r   r   r   r  r  )r"   r'   r   ra   r   r   r   r  r   r  r,  r   r   r  r  r   kwargsr   r/  r.  outputs                        r$   r)   XGLMForCausalLM.forwardj  sW   V 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] **)%"7#9!5+'/!5#)  
" gaj)%%  ;;11![[55	
 D Y,F'+'7D7V#CVC0#33!//))$55
 	
r&   )r*  r   )NNNNNNNNNNNNNNN)r-   r.   r/   r0   r   _tied_weights_keysr!   r   r   r4   r5   r   r   r   r   r   r)   r6   r7   r8   s   @r$   r'  r'  X  s     *+  -115/38<9=,07;+/04)-$(,0/3&*15!Y
ELL)Y
 !.Y
 u||,	Y

  (5Y
 !) 6Y
 ELL)Y
 'u||4Y
 "%Y
  -Y
 &Y
 D>Y
 $D>Y
 'tnY
 d^Y
  !.!Y
$ 
uU\\"$EE	F%Y
 Y
r&   r'  )r'  r   r   )-r1   rR   typingr   r   r4   r   activationsr   cache_utilsr   r	   r
   
generationr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   utilsr   r   utils.deprecationr   configuration_xglmr   
get_loggerr-   r  r   r   Moduler:   ro   r   r   r   r'  __all__r   r&   r$   <module>rC     s      "   ! C C ) e 9 l - , 0 * 
		H	%
=bll 
=1w		 1wh^2BII ^2Bo1 od ?/ ? ?$ F
# F
 F
R f
)? f
f
R Br&   