
    bCi                     T   S SK r S SKJrJrJr  S SKrS SKJr  S SKJrJ	r	J
r
  SSKJr  SSKJrJrJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJr  SSKJrJ r   SSK!J"r"  SSK#J$r$J%r%J&r&J'r'  SSK(J)r)  SSK*J+r+  \&" 5       (       a  SSK,J-r-J.r.  \'R^                  " \05      r1 " S S\Rd                  5      r3 " S S\Rd                  5      r4   S2S\Rj                  S\Rl                  S\Rl                  S\Rl                  S\\Rl                     S\\7   S\7S\\Rl                     4S jjr8 " S  S!\Rj                  5      r9 " S" S#\5      r:\% " S$ S%\ 5      5       r;\% " S& S'\;5      5       r<\%" S(S)9 " S* S+\;\5      5       r=\% " S, S-\;5      5       r>\%" S.S)9 " S/ S0\;5      5       r?/ S1Qr@g)3    N)CallableOptionalUnion)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPastTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringis_torch_flex_attn_availablelogging)deprecate_kwarg   )BioGptConfig)	BlockMaskmake_flex_block_causal_maskc                      ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\S\	\R                     4U 4S	 jjjr
S
rU =r$ ) BioGptLearnedPositionalEmbedding7   zF
This module learns positional embeddings up to a fixed maximum size.
num_embeddingsembedding_dimc                 L   > SU l         [        TU ]	  XR                   -   U5        g )N   )offsetsuper__init__)selfr%   r&   	__class__s      d/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/biogpt/modeling_biogpt.pyr+   )BioGptLearnedPositionalEmbedding.__init__<   s"     ++5}E    attention_maskpast_key_values_lengthposition_idsc                    > Uc5  [         R                  " USS9nX1-  S-
  R                  5       nUSS2US24   n[        TU ]  X0R
                  -   5      $ )z3`input_ids_shape` is expected to be [bsz x seqlen].Nr   dim)torchcumsumlongr*   forwardr)   )r,   r1   r2   r3   r-   s       r.   r:   (BioGptLearnedPositionalEmbedding.forwardB   sZ      <<A>L(9A=CCEL'+A+B(BCLw|kk9::r0   )r)   )r   N)__name__
__module____qualname____firstlineno____doc__intr+   r7   
LongTensorr   r:   __static_attributes____classcell__r-   s   @r.   r#   r#   7   s]    Fs F3 F '(37	;((; !$; u//0	; ;r0   r#   c            
       r   ^  \ rS rSrSrSS\S\S\S\\   4U 4S jjjrS\	R                  4U 4S	 jjrS
rU =r$ )BioGptScaledWordEmbeddingS   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
r%   r&   padding_idxembed_scalec                 2   > [         TU ]  XU5        X@l        g N)r*   r+   rJ   )r,   r%   r&   rI   rJ   r-   s        r.   r+   "BioGptScaledWordEmbedding.__init__X   s    D&r0   	input_idsc                 <   > [         TU ]  U5      U R                  -  $ rL   )r*   r:   rJ   )r,   rN   r-   s     r.   r:   !BioGptScaledWordEmbedding.forward\   s    wy)D,<,<<<r0   rJ   )      ?)r<   r=   r>   r?   r@   rA   r   floatr+   r7   Tensorr:   rC   rD   rE   s   @r.   rG   rG   S   sJ    's '3 'S '_ghm_n ' '= = =r0   rG   modulequerykeyvaluer1   scalingdropout	head_maskc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  n	Ub  X-   n	[        R
                  R                  U	SS9n	Ub  XR                  SSSS5      -  n	[        R
                  R                  XU R                  S9n	[        R                  " X5      n
U
R                  SS5      R                  5       n
X4$ )N      r(   r	   r5   r   ptraining)sizer7   matmul	transposenn
functionalsoftmaxviewrZ   ra   
contiguous)rU   rV   rW   rX   r1   rY   rZ   r[   kwargsattn_weightsattn_outputs              r.   eager_attention_forwardrm   `   s     **R.D(<<}}Q':;gEL!#4==((2(>L#nnQAq&AA==((6??([L,,|3K''1-88:K$$r0   c                     ^  \ rS rSrSr      SS\S\S\S\S\S\S	\\	   S
\\   4U 4S jjjr
\" SSSS9      SS\R                  S\\R                     S\\   S\\R                     S\\R                     S\S\\R                     S\\   S\\R                  \\R                     \\\R                        4   4S jj5       rSrU =r$ )BioGptAttention~   z=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsrZ   
is_decoderbias	is_causalconfig	layer_idxc	                 t  > [         T	U ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        Xl        Uc>  U R                  (       a-  [        R                  SU R                  R                   S35        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        [         R"                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r^   zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.rt   )r*   r+   rq   rr   rZ   head_dimrv   
ValueErrorrY   rs   ru   rw   loggerwarning_oncer-   r<   re   Lineark_projv_projq_projout_proj)
r,   rq   rr   rZ   rs   rt   ru   rv   rw   r-   s
            r.   r+   BioGptAttention.__init__   s    	""!.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	4@ii	4@ii	4@		)TBr0   past_key_valuepast_key_values4.58new_nameversionhidden_stateskey_value_statesr1   layer_head_maskoutput_attentionscache_positionrj   returnc                 f   USLn	UR                   SS u  pU	(       a  UR                   S   OUnXSU R                  4nXSU R                  4nU R                  U5      R                  " U6 R	                  SS5      nSnUb]  [        U[        5      (       aF  UR                  R                  U R                  5      nU	(       a  UR                  nOUR                  nOUnU	(       a  UOUnU	(       aQ  UbN  U(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R!                  U5      nUR                  " U6 R	                  SS5      nUR                  " U6 R	                  SS5      nUbc  U	(       d  UOSnWR#                  UUU R                  SU05      u  nnU	(       a.  [        U[        5      (       a  SUR                  U R                  '   [$        nU R&                  R(                  S:w  a  [*        U R&                  R(                     nU" U UUUU4U R,                  (       d  S	OU R.                  U R0                  UUS
.UD6u  nnUR3                  XS5      R5                  5       nU R7                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelNr]   r   r(   Fr   Teager        )rZ   rY   r   r[   )shaperz   r   rh   rd   
isinstancer   
is_updatedgetrw   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   updaterm   rv   _attn_implementationr   ra   rZ   rY   reshaperi   r   )r,   r   r   r   r1   r   r   r   rj   is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statesr   curr_past_key_valuecurrent_states
key_statesvalue_statesattention_interfacerl   rk   s                           r.   r:   BioGptAttention.forward   s   $ .T9 %**3B//A"((+wr4==9DMM: {{=166FPPQRTUV
&/+>??,77;;DNNK
%*9*O*O'*9*N*N'&5#-?)]/"=*,33DNNCHHJ.55dnnELLL^4J;;~6L#.9CCAqIJ',,n=GG1ML*7It+>+E+Ednn?OQ_>`,(
L &*_FY*Z*ZAEO..t~~>(?;;++w6"9$++:Z:Z"[$7%
  $}}C$,,LL/%%
 %
!\ "))#;FFHmmK0L((r0   )rv   rZ   rq   rz   ru   rs   r   rw   rr   r   r   rY   r   )r   FTFNN)NNNNFN)r<   r=   r>   r?   r@   rA   rS   boolr   r   r+   r   r7   rT   r   r   r   tupler:   rC   rD   rE   s   @r.   ro   ro   ~   sv   G  )-#'%C%C %C 	%C
 %C %C %C &%C C=%C %CN %0A6R 48+/1526"'15R)||R) #5<<0R) "%	R)
 !.R) "%,,/R)  R) !.R) -.R) 
u||Xell3XeELL>Q5RR	SR) SR)r0   ro   c                     ^  \ rS rSrSS\S\\   4U 4S jjjr\" SSSS9       SS	\	R                  S
\\	R                     S\\	R                     S\\   S\\   S\\   S\\	R                     S\\	R                     S\\   S\\	R"                  \\\	R"                  \	R"                  4      4   4S jj5       rSrU =r$ )BioGptDecoderLayer   rv   rw   c           
      p  > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  SSUUS9U l        UR                  U l	        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        [        R"                  " U R                  UR$                  5      U l        [        R"                  " UR$                  U R                  5      U l        [        R                  " U R                  5      U l        g )NT)rq   rr   rZ   rs   ru   rv   rw   )r*   r+   hidden_sizerq   ro   num_attention_headsattention_probs_dropout_prob	self_attnhidden_dropout_probrZ   r
   
hidden_actactivation_fnactivation_dropoutre   	LayerNormself_attn_layer_normr~   intermediate_sizefc1fc2final_layer_norm)r,   rv   rw   r-   s      r.   r+   BioGptDecoderLayer.__init__   s    ++(nn0077
 11#F$5$56"(";";$&LL$@!99T^^V-E-EF99V55t~~F "T^^ <r0   r   r   r   r   r   r1   r   r   	use_cacher3   r   rj   r   c	                 J   Un
U R                  U5      nU R                  " SUUUUUUUS.U	D6u  p[        R                  R	                  XR                  U R
                  S9nX-   nUn
U R                  U5      nU R                  U5      nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nX-   nU4nU(       a  X4-  nU$ )ag  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(encoder_attention_heads,)`.
    past_key_values (`Cache`): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
        cache in the correct position and to infer the complete sequence length.
)r   r   r1   r   r   r3   r   r_    )r   r   re   rf   rZ   ra   r   r   r   r   r   )r,   r   r1   r   r   r   r   r3   r   rj   residualself_attn_weightsoutputss                r.   r:   BioGptDecoderLayer.forward  s.   > !11-@ ,0>> 	,
'+)+/%)	,
 	,
( --m||VZVcVc-d 0 !--m</**=9--m?V?Vaeanan-o/--m||VZVcVc-d 0 "++Gr0   )	r   r   rZ   rq   r   r   r   r   r   rL   )NNNFTNN)r<   r=   r>   r?   r   r   rA   r+   r   r7   rT   r   r   rB   r   r   r   FloatTensorr:   rC   rD   rE   s   @r.   r   r      s   =| = = =. %0A6R 2626+/,1$(3715?||? !.? "%,,/	?
 "%? $D>? D>? u//0? !.? +,? 
u  (51B1BEDUDU1U+V"WW	X? S?r0   r   c                      \ rS rSr% \\S'   SrSrSrSr	Sr
SrS\\\R                  S4      S\R                  S\R                  S	\4S
 jr\S\R                  S\S\S\R(                  S\R                  S\4S j5       rSrg)BioGptPreTrainedModeliY  rv   biogptTr1   r    input_tensorr   r   c           	      \   U R                   R                  S:X  au  [        U[        R                  5      (       a  [        U5      nU$ UcD  [        [        R                  " UR                  S   UR                  S   4UR                  S95      nU$ U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a.  U(       d'  [        R                  " UUUU R                  S	9(       a  g UR                  nUR                  S   nU(       a  UR!                  5       n	O5[        U[        R                  5      (       a  UR                  S
   OXX-   S-   n	U R#                  UUU	UUUR                  S   S9n
U R                   R                  S:X  aS  UbP  UR                  R$                  S;   a6  [        R&                  " U5      R(                  n[        R*                  " X5      n
U
$ )Nflex_attentionr   r   )rb   deviceflash_attention_2r   Fsdpa)inputs_embedsr2   is_trainingr]   )sequence_lengthtarget_lengthdtyper   
batch_size)cudaxpunpu)rv   r   r   r7   rT   r!   onesr   r   anyget_seq_lengthis_compileabler   _ignore_causal_mask_sdpara   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positiontypefinfomin_unmask_unattended)r,   r1   r   r   r   past_seen_tokensusing_compilable_cacher   r   r   causal_mask	min_dtypes               r.   _update_causal_mask)BioGptPreTrainedModel._update_causal_maske  s    ;;++/??.%,,77!<^!L "!  '!<JJ*003\5G5G5JK-44" "!;;++/BB)~/D.I.I.K.K%%
 @O?Z?99;`aCRC^!?!?di ;;++v5>T%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD
 E*..I0CCK[Kr0   r   r   r   r   c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
N   )
fill_valuer   r   r   )diagonalr   r]   r   )r6   r7   r   r   fullr   triuaranger   expandcloner   tomasked_fill)r1   r   r   r   r   r   rj   r   r   mask_lengthpadding_masks              r.   r   KBioGptPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r0   r   N)r<   r=   r>   r?   r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphr   r   r7   rT   r   r   staticmethodrA   r   r   rC   r   r0   r.   r   r   Y  s     &*#N!J u||['@!ABJ llJ 	J
 JX 444 4 {{	4
 4 4 4r0   r   c                   \  ^  \ rS rSrS\4U 4S jjr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\   S	\\   S
\\R                     S\\   S\\   S\\   S\\R                     S\\   S\\\4   4S jj5       rSrU =r$ )BioGptModeli  rv   c           
        > [         TU ]  U5        Xl        UR                  U l        UR                  U l        UR                  U l        UR                  U l	        UR                  (       a   [        R                  " UR                  5      OSn[        UR                  U R                  U R                  US9U l        [!        UR"                  U R                  5      U l        [&        R(                  " [+        UR,                  5       Vs/ s H  n[/        XS9PM     sn5      U l        [&        R2                  " U R                  5      U l        SU l        U R9                  5         g s  snf )NrR   rQ   )rw   F)r*   r+   rv   	layerdropr   rZ   r   rq   pad_token_idrI   scale_embeddingmathsqrtrG   
vocab_sizeembed_tokensr#   max_position_embeddingsembed_positionsre   
ModuleListrangenum_hidden_layersr   r   r   
layer_normgradient_checkpointing	post_init)r,   rv   rJ   ir-   s       r.   r+   BioGptModel.__init__  s    ))11++!..7=7M7Mdii 2 23SV5t~~t/?/?[
  @@^@^`d`n`nommV[\b\t\tVu$vVuQR%7%LVu$vw,,t~~6&+# %ws   	E%rN   r1   r[   r   r   r   r3   r   output_hidden_statesreturn_dictr   rj   r   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
US L US L-  (       a  [        S5      eUb$  UnUR                  nUR                  SUS   5      nO.Ub   UR                  5       S S nUS S 2S S 2S4   nO[        S5      eUc  U R                  U5      nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a  Uc  [        U R                   S9nU(       a@  [        U[         5      (       a+  [        R                  S5        [        R"                  " U5      nUR                  5       S S u  nnUb  UR%                  5       OSnUc#  [&        R(                  " UUU-   UR*                  S	9nUc%  UU-   n[&        R,                  " UUUR*                  S	9nUnU R/                  UUUU5      nUc5  [&        R0                  " US
S9nXr-  S
-
  R3                  5       nUS S 2US 24   nU R5                  UUUS9nUU-   n[6        R8                  R;                  UU R:                  U R                  S9nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU	(       a  SOS nU(       a  SOS nS n[=        U R>                  5       H|  u  nnU	(       a  UU4-  nU R                  (       a(  [&        R@                  " / 5      nUU RB                  :  a  ML  U" U4UUb  UU   OS UUUUUS.UD6nUS   nU(       d  Ms  UUS
   4-  nM~     U	(       a  UU4-  nU RE                  U5      nU
(       d  [!        S UUUUU4 5       5      $ [G        UUUUUS9$ )NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer]   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz[`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`...F)rv   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.r   r   r   r5   )r3   r_   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...r   )r1   r   r   r   r   r3   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7frL   r   ).0vs     r.   	<genexpr>&BioGptModel.forward.<locals>.<genexpr>  s      rA rs   	)last_hidden_stater   r   
attentionscross_attentions)$rv   r   r  r   use_return_dictr{   r   rh   rb   r  r  ra   r|   r}   r   r   r   from_legacy_cacher   r7   r   r   r   r   r8   r9   r  re   rf   rZ   	enumerater   randr  r  r   )r,   rN   r1   r[   r   r   r   r3   r   r  r  r   rj   inputinput_shaper   
seq_lengthr2   mask_seq_lengthself_attn_cacher   	positionsr   all_hidden_statesall_self_attnsall_cross_attentionsidxdecoder_layerdropout_probabilitylayer_outputss                                 r.   r:   BioGptModel.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] -t";<stt"E++K!r;r?;I&',,.s3K!!Q(+Edee  --e4M&&4==##q "	 0*$++>OOU;;U
 +<<_MO!.!3!3!5cr!:
JETE`!?!?!Afg!"\\&(>(KTaThThN !4zAO"ZZ
OML`L`aN)..	
  <<A>L(9A=CCEL'+A+B(BCL((9O^j(k	%	1--mt||VZVcVc-d&&4==##p "	"6BD0d#"+DKK"8C#!m%55!}}&+jjn#&7)
*3<3H3d /"3#)-
 
M *!,M  =#3"551 #96  -!116 ':K^]qr  
 9+++%1
 	
r0   )
rv   rZ   rq   r  r  r  r  r  r   rI   )NNNNNNNNNNN)r<   r=   r>   r?   r   r+   r   r   r7   rB   r   r   r   rT   r   r   r   r   r   r:   rC   rD   rE   s   @r.   r  r    s:   | *  156:1559+/$(37,0/3&*15P
E,,-P
 !!2!23P
 E--.	P

   1 12P
 "%P
 D>P
 u//0P
 $D>P
 'tnP
 d^P
 !.P
 +,P
 
u??	@P
 P
r0   r  zR
    BioGPT Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc                      ^  \ rS rSrS/rU 4S jrS rS r\            SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\   S\	\
R                     S\	\   S\	\
R                     S\	\   S\	\   S\	\   S\	\
R                     S\\   S\\\4   4S jj5       rSrU =r$ )BioGptForCausalLMi  zoutput_projection.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g NFry   )
r*   r+   r  r   re   r~   r   r  output_projectionr  r,   rv   r-   s     r.   r+   BioGptForCausalLM.__init__  sJ     !&)!#6+=+=v?P?PW\!] 	r0   c                     U R                   $ rL   r9  r,   s    r.   get_output_embeddings'BioGptForCausalLM.get_output_embeddings  s    %%%r0   c                     Xl         g rL   r=  )r,   new_embeddingss     r.   set_output_embeddings'BioGptForCausalLM.set_output_embeddings  s    !/r0   rN   r1   r[   r   r   labelsr   r3   r   r  r  r   rj   r   c                    Ub  UOU R                   R                  nU R                  " U4UUUUUUU	U
UUS.
UD6nUS   nU R                  U5      nSnUb*  U R                  " UU4SU R                   R
                  0UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
N)
r1   r[   r   r   r   r3   r   r  r  r   r   r  r   )losslogitsr   r   r   r!  )rv   r"  r   r9  loss_functionr  r   r   r   r   r!  )r,   rN   r1   r[   r   r   rE  r   r3   r   r  r  r   rj   r   sequence_outputprediction_scoreslm_lossoutputs                      r.   r:   BioGptForCausalLM.forward  s   . &1%<k$++B]B]++
)'+%/!5#)
 
 "!* 22?C((!  ;;11 	G ')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
r0   )r   r9  )NNNNNNNNNNNN)r<   r=   r>   r?   _tied_weights_keysr+   r?  rC  r   r   r7   rB   r   r   r   rT   r   r   r   r   r   r:   rC   rD   rE   s   @r.   r6  r6    sQ    55&0  156:1559+/-1$(37,0/3&*15>
E,,->
 !!2!23>
 E--.	>

   1 12>
 "%>
 ))*>
 D>>
 u//0>
 $D>>
 'tn>
 d^>
 !.>
 +,>
 
u77	8>
 >
r0   r6  c                      ^  \ rS rSrU 4S jr\             SS\\R                     S\\R                     S\\R                     S\\R                     S\\
   S\\R                     S	\\R                     S
\\   S\\R                     S\\   S\\   S\\   S\\R                     S\\\4   4S jj5       rSrU =r$ )BioGptForTokenClassificationi  c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        US5      (       a  UR                  b  UR                  nOUR                  n[        R                  " U5      U l
        [        R                  " UR                  UR                  5      U l        U R                  5         g )Nclassifier_dropout)r*   r+   
num_labelsr  r   hasattrrS  r   re   DropoutrZ   r~   r   
classifierr  )r,   rv   rS  r-   s      r.   r+   %BioGptForTokenClassification.__init__  s      ++!&)6/00V5N5N5Z!'!:!:!'!;!;zz"45))F$6$68I8IJr0   rN   token_type_idsr1   r[   r   r   rE  r   r3   r   r  r  r   r   c                    Ub  UOU R                   R                  nU R                  UUUUUUU	U
UUUS9nUS   nU R                  U5      nU R	                  U5      nSnUb  [        5       nUb  UR                  S5      S:H  nUR                  SU R                  5      n[        R                  " UUR                  S5      [        R                  " UR                  5      R                  U5      5      nU" UU5      nO2U" UR                  SU R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )e  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N
r   r1   r[   r   r   r3   r   r  r  r   r   r]   r   r(   )rG  rH  r   r   )rv   r"  r   rZ   rW  r   rh   rT  r7   wheretensorignore_indextype_asr   r   r   )r,   rN   rY  r1   r[   r   r   rE  r   r3   r   r  r  r   transformer_outputsr   rH  rG  loss_fctactive_lossactive_logitsactive_labelsrM  s                          r.   r:   $BioGptForTokenClassification.forward  su   . &1%<k$++B]B]"kk+)'%/!5#) * 
 ,A.]3/')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/RY!4QR!88F)-)9TGf$EvE$-;;*55	
 	
r0   )r   rW  rZ   rT  )NNNNNNNNNNNNN)r<   r=   r>   r?   r+   r   r   r7   rB   r   r   r   rT   r   r   r   r:   rC   rD   rE   s   @r.   rQ  rQ    sT     15596:15+/59-1$(37,0/3&*15A
E,,-A
 !!1!12A
 !!2!23	A

 E--.A
 "%A
   1 12A
 ))*A
 D>A
 u//0A
 $D>A
 'tnA
 d^A
 !.A
 
u++	,A
 A
r0   rQ  a  
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                      ^  \ rS rSrS\4U 4S jjr\             SS\\R                     S\\R                     S\\R                     S\\   S\\R                     S	\\R                     S
\\   S\\R                     S\\   S\\   S\\   S\\R                     S\\\R                  4   S\\\4   4S jj5       rS rS rSrU =r$ )BioGptForSequenceClassificationiC  rv   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g r8  )
r*   r+   rT  r  r   re   r~   r   scorer  r:  s     r.   r+   (BioGptForSequenceClassification.__init__R  sS      ++!&)YYv114??O
 	r0   rN   r1   r[   r   r   rE  r   r3   r   r  r  r   logits_to_keepr   c                    Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUS9nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nUb  UR                  SS u  nnOUR                  SS u  nnU R                   R                  c  SnOUbV  [        R                  " XR                   R                  5      R                  S5      S-
  R                  UR                  5      nO.Sn[        R                  U R                   R"                   S35        U[        R$                  " UUR                  S9U4   nSnUGb  U R                   R&                  c  U R(                  S:X  a  S	U R                   l        OoU R(                  S:  aN  UR*                  [        R,                  :X  d  UR*                  [        R                  :X  a  S
U R                   l        OSU R                   l        U R                   R&                  S	:X  aJ  [/        5       nU R(                  S:X  a&  U" UR1                  5       UR1                  5       5      nOU" UU5      nOU R                   R&                  S
:X  a=  [3        5       nU" UR5                  SU R(                  5      UR5                  S5      5      nO-U R                   R&                  S:X  a  [7        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [9        UUUR:                  UR<                  UR>                  S9$ )r[  Nr\  r   r(   r]   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classification)rG  rH  r   r   r   ) rv   r"  r   r   rA   slicerj  r   r  r7   nesumr   r   r|   r}   r-   r<   r   problem_typerT  r   r9   r   squeezer   rh   r   r   r   r   r   )r,   rN   r1   r[   r   r   rE  r   r3   r   r  r  r   rl  ra  r   slice_indicesrH  r   r   pooled_logitsrG  rb  rM  s                           r.   r:   'BioGptForSequenceClassification.forward[  s   . &1%<k$++B]B]"kk+)'%/!5#) * 
 ,A.8B>SV8W8W~ot4]kM!]A*=>? *3//"1*='J*7*=*=bq*A'J;;##+ O$#(88I{{7O7O#P#T#TUW#X[\#\"`"`aganan"o"$##~~../ 0^ ^
 u||Jv}}M^_{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r0   c                 .    U R                   R                  $ rL   r   r  r>  s    r.   get_input_embeddings4BioGptForSequenceClassification.get_input_embeddings  s    {{'''r0   c                 $    XR                   l        g rL   rz  )r,   rX   s     r.   set_input_embeddings4BioGptForSequenceClassification.set_input_embeddings  s    #( r0   )r   rT  rj  )NNNNNNNNNNNNr   )r<   r=   r>   r?   r   r+   r   r   r7   rB   r   r   r   rT   r   rA   r   r   r:   r{  r~  rC   rD   rE   s   @r.   rh  rh  C  sn   |   156:15+/59-1$(37,0/3&*1534\
E,,-\
 !!2!23\
 E--.	\

 "%\
   1 12\
 ))*\
 D>\
 u//0\
 $D>\
 'tn\
 d^\
 !.\
 c5<</0\
 
u66	7\
 \
|() )r0   rh  )r6  rQ  rh  r  r   )Nr   N)Ar	  typingr   r   r   r7   torch.nnre   r   r   r   activationsr
   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   configuration_biogptr   integrations.flex_attentionr    r!   
get_loggerr<   r|   	Embeddingr#   rG   ModulerT   rS   rm   ro   r   r   r  r6  rQ  rh  __all__r   r0   r.   <module>r     s  ,  , ,   A A ! C C ) > B 9  G & ^ ^ 0 .  !!U 
		H	%;r|| ;8
= 
=&  $(,%II%<<% 
% <<	%
 U\\*% e_% % %%<})bii })@X3 Xv MO M M` g
' g
 g
T 
Q
- Q

Q
h Q
#8 Q
 Q
h m)&; m)m)`r0   