
    cCik                        S r SSKrSSKJrJrJr  SSKrSSKJr  SSKJ	r	J
r
Jr  SSKJr  SSKJrJrJrJrJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJrJrJrJ r   SSK!J"r"J#r#  SSK$J%r%  \ RL                  " \'5      r(S r)S r*S r+S r,S r- " S S\R                  R\                  5      r/ " S S\R\                  5      r0 " S S\R\                  5      r1  SGS\R\                  S\Rd                  S\Rd                  S\Rd                  S\\Rd                     S \3S!\3S"\\Rd                     S#\\   4S$ jjr4 " S% S&\R\                  5      r5 " S' S(\R\                  5      r6 " S) S*\R\                  5      r7 " S+ S,\R\                  5      r8 " S- S.\R\                  5      r9 " S/ S0\5      r: " S1 S2\R\                  5      r; " S3 S4\R\                  5      r<\ " S5 S6\5      5       r=\ " S7 S8\=5      5       r>\ " S9 S:\=5      5       r? " S; S<\R\                  5      r@\" S=S>9 " S? S@\=5      5       rA\ " SA SB\=5      5       rB " SC SD\R\                  5      rCSE rD/ SFQrEg)HzPyTorch ESM model.    N)CallableOptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)TransformersKwargsauto_docstringcan_return_tuplelogging)OutputRecordercheck_model_inputs   )	EsmConfigc                 V    U R                  SSS9u  p[        R                  " U* U4SS9$ )N   dim)chunktorchcat)xx1x2s      ^/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/esm/modeling_esm.pyrotate_halfr*   ,   s-    WWQBWFB99rc2YB''    c                     US S 2S S 2S U R                   S   2S S 24   nUS S 2S S 2S U R                   S   2S S 24   nX-  [        U 5      U-  -   $ )N)shaper*   )r&   cossins      r)   apply_rotary_pos_embr1   1   sV    
aMaggbkM1$
%C
aMaggbkM1$
%CGA,--r+   c                 n    U S-  S[         R                  " U [        R                  " S5      -  5      -   -  $ )zg
This is the gelu implementation from the original ESM repo. Using F.gelu yields subtly wrong results.
g      ?      ?g       @)r$   erfmathsqrtr&   s    r)   gelur8   8   s.     s7cEIIa$))C.&899::r+   c                 *    X R                  SS5      -   $ )zJMake layer symmetric in final two dimensions, used for contact prediction.r    r-   )	transposer7   s    r)   
symmetrizer;   ?   s    {{2r"""r+   c                     U R                  SSS9nU R                  SSS9nU R                  SSS9nX-  nUR                  U5        X-
  nU$ )z=Perform average product correct, used for contact prediction.r    T)keepdimsr-   )r    r-   )sumdiv_)r&   a1a2a12avg
normalizeds         r)   average_product_correctrE   D   sW    	
rD	!B	
rD	!B
%%4%
(C
'CHHSMJr+   c                      ^  \ rS rSr% Sr\R                  \S'   S\4U 4S jjr	SS jr
S\R                  S\R                  S	\\R                  \R                  4   4S
 jrSrU =r$ )RotaryEmbeddingP   z
Rotary position embeddings based on those in
[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
matrices which depend on their relative positions.
inv_freqr"   c           	         > [         TU ]  5         SS[        R                  " SUS[        R                  S9R                  5       U-  -  -  nU R                  SU5        S U l        S U l        S U l	        g )Nr3   i'  r   r   dtyperI   )
super__init__r$   arangeint64floatregister_buffer_seq_len_cached_cos_cached_sin_cached)selfr"   rI   	__class__s      r)   rN   RotaryEmbedding.__init__Y   sg    %ELLC%++$N$T$T$VY\$\]^Z2#r+   c                 j   UR                   U   nX0R                  :w  d$  U R                  R                  UR                  :w  a  X0l        [        R
                  " UR                   U   UR                  S9R                  U R                  5      n[        R                  " X@R                  5      n[        R                  " XU4SS9R                  UR                  5      nUR                  5       S S S S 2S S 24   U l        UR                  5       S S S S 2S S 24   U l        U R                  U R                  4$ )Ndevicer    r!   )r.   rS   rT   r[   r$   rO   type_asrI   outerr%   tor/   r0   rU   )rV   r&   seq_dimensionseq_lentfreqsembs          r)   _update_cos_sin_tables&RotaryEmbedding._update_cos_sin_tablesc   s    ''-( ***d.>.>.E.E.Q#* QWW]3AHHEMMdmm\AKK==1E))UN366qxx@C"wwytQ)9:D"wwytQ)9:D!1!111r+   qkreturnc                    U R                  USS9u  U l        U l        [        XR                  U R                  5      R	                  UR
                  S9[        X R                  U R                  5      R	                  UR
                  S94$ )Nr-   )r_   rK   )rd   rT   rU   r1   r^   rL   )rV   rf   rg   s      r)   forwardRotaryEmbedding.forwards   s    -1-H-HZ\-H-]*$* !$4$4d6F6FGJJQRQXQXJY $4$4d6F6FGJJQRQXQXJY
 	
r+   )rT   rS   rU   )r   )__name__
__module____qualname____firstlineno____doc__r$   Tensor__annotations__intrN   rd   tuplerj   __static_attributes____classcell__rW   s   @r)   rG   rG   P   s^     ll C  2 
 
%,, 
5u||A[;\ 
 
r+   rG   c                   F   ^  \ rS rSrSr  SS\S\4U 4S jjjrS rSrU =r	$ )	EsmContactPredictionHead|   zWPerforms symmetrization, apc, and computes a logistic regression on the output featuresin_featureseos_idxc                    > [         TU ]  5         Xl        X0l        [        R
                  " USU5      U l        [        R                  " 5       U l        g )Nr   )	rM   rN   r{   r|   r   Linear
regressionSigmoid
activation)rV   r{   biasr|   rW   s       r)   rN   !EsmContactPredictionHead.__init__   s<     	&))KD9**,r+   c                 N   UR                  U R                  5      R                  U5      nUR                  S5      UR                  S5      -  nX#S S 2S S S S 2S S 24   -  nUSS S2S S24   nUSSS 2SS 24   nUR	                  5       u  pEpgnUR                  XEU-  Xw5      nUR                  U R                  R                  R                  5      n[        [        U5      5      nUR                  SSSS5      nU R                  U R                  U5      R                  S5      5      $ )Nr   r   .r    r   r
   )ner|   r^   	unsqueezesizeviewr   weightr[   rE   r;   permuter   squeeze)	rV   tokens
attentionseos_mask
batch_sizelayersheadsseqlen_s	            r)   rj    EsmContactPredictionHead.forward   s   99T\\*--j9%%a(8+=+=a+@@1dD!Q+>"??
SbS#2#.
QR,
/9/@,
E1__Z%P
  ]]OO""))

 -Z
-CD
''1a3
tz:BB1EFFr+   )r   r|   r{   r   )Tr   )
rl   rm   rn   ro   rp   rs   rN   rj   ru   rv   rw   s   @r)   ry   ry   |   s6    a
 	
'
' 	
' 
'G Gr+   ry   c                   D   ^  \ rS rSrSrU 4S jr    SS jrS rSrU =r	$ )EsmEmbeddings   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        UR                  (       a/  [        R                  " UR
                  UR                  S9U l        OS U l        [        R                  " UR                  5      U l        [        USS5      U l        U R#                  S[$        R&                  " UR(                  5      R+                  S5      SS9  UR                  U l        U R                   S:X  a9  [        R                  " UR(                  UR
                  U R,                  S9U l        UR0                  U l        UR2                  U l        g )	N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r   r    F)
persistent)rM   rN   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsemb_layer_norm_before	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutgetattrr   rR   r$   rO   max_position_embeddingsexpandr   position_embeddingstoken_dropoutmask_token_idrV   configrW   s     r)   rN   EsmEmbeddings.__init__   s*   !||F,=,=v?Q?Q_e_r_rs'' ll6+=+=6CXCXYDO"DOzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 "..'':5')||..0B0BPTP`P`(D$ $11#11r+   c                    Uc*  Ub  [        XR                  5      nOU R                  U5      nUc  U R                  U5      nUnU R                  (       a  Ub  UR                  XR                  :H  R                  S5      S5      nSnUb  UR                  S5      OUR                  S   nXR                  :H  R                  S5      R                  5       U-  nUSU-
  -  SU-
  S S 2S S 4   -  R                  UR                  5      nU R                  S:X  a  U R                  U5      n	XY-   nU R                  b  U R                  U5      nUb,  XRR                  S5      -  R                  UR                  5      nU$ )Nr            gQ?r   r   )"create_position_ids_from_input_idsr   &create_position_ids_from_inputs_embedsr   r   masked_fillr   r   r>   r.   rQ   r^   rL   r   r   r   )
rV   	input_idsattention_maskr   inputs_embeds
embeddingsmask_ratio_trainsrc_lengthsmask_ratio_observedr   s
             r)   rj   EsmEmbeddings.forward   s    $A)M]M]^#JJ=Y  00;M #
 )"7#//>P>P1P0[0[\^0_adeJ)4B4N.,,R0T]TcTcdeTfK#,0B0B#B"G"G"K"Q"Q"SVa"a$,<(<=EXAXZ[]acgZg@hhll  J '':5"&":":<"H#9J??&4J%$'?'?'CCGG
HXHXYJ r+   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr    r   rL   r[   r   )r   r$   rO   r   longr[   r   r   )rV   r   input_shapesequence_lengthr   s        r)   r   4EsmEmbeddings.create_position_ids_from_inputs_embeds   s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r+   )r   r   r   r   r   r   r   r   NNNN)
rl   rm   rn   ro   rp   rN   rj   r   ru   rv   rw   s   @r)   r   r      s+    22 /b= =r+   r   modulequerykeyvaluer   scalingr   	head_maskkwargsc                    [         R                  " XR                  SS5      5      U-  n	[        U S5      (       GaI  U R                  S;   Ga8  UR
                  S   n
[         R                  " U
[         R                  U	R                  S9R                  SS5      n[         R                  " U
[         R                  U	R                  S9R                  SS5      nX-
  nU R                  XR                  -   S-
  5      nUR                  UR                  S9nU R                  S	:X  a  [         R                  " S
X5      nOCU R                  S:X  a3  [         R                  " S
X5      n[         R                  " SX.5      nUU-   nU	W-   n	Ub#  US S 2S S 2S S 2S UR
                  S   24   nU	U-   n	[        R                   R#                  U	S[         R$                  S9R                  UR                  5      n	[        R                   R'                  XU R(                  S9n	Ub  X-  n	[         R                  " X5      nUR                  SS5      R+                  5       nUU	4$ )Nr   r
   r   relative_keyrelative_key_queryr   r    r   rK   r   zbhld,lrd->bhlrr   zbhrd,lrd->bhlrr-   )r"   rL   )ptraining)r$   matmulr:   hasattrr   r.   rO   r   r[   r   distance_embeddingr   r^   rL   einsumr   
functionalsoftmaxfloat32r   r   
contiguous)r   r   r   r   r   r   r   r   r   attn_weights
seq_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keycausal_maskattn_outputs                       r)   eager_attention_forwardr      s    <<}}Q':;gELv011f6T6T Y 7 [[^
j

<K^K^_ddegijkj

<K^K^_ddefhjk!2%88DbDb9bef9fg366U[[6I))^;',||4De'b$++/CC-2\\:JE-h*+0<<8H#+d('EHd'd$#&>>!$Q1o		"o%=>#k1==((2U]](SVVW\WbWbcL==((6??([L#/,,|3K''1-88:K$$r+   c                      ^  \ rS rSrSU 4S jjr    SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\	\
   S	\\R                     4S
 jjrSrU =r$ )EsmSelfAttentioni0  c                   > [         TU ]  5         Xl        UR                  UR                  -  S:w  a7  [        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        UR                  U l        U=(       d    [#        USS5      U l        S U l        U R$                  S:X  d  U R$                  S	:X  aH  UR(                  U l        [        R*                  " S
UR(                  -  S-
  U R                  5      U l        O(U R$                  S:X  a  [/        U R                  S9U l        SU l        UR2                  U l        X0l        U R2                  =(       a    U(       + U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   r   r   r   r   rotaryr!   r3   )rM   rN   r   r   num_attention_headsr   
ValueErrorrs   attention_head_sizeall_head_sizer   r~   r   r   r   attention_probs_dropout_probr   r   r   rotary_embeddingsr   r   r   rG   r   
is_decoder	layer_idx	is_causal)rV   r   r   r   is_cross_attentionrW   s        r)   rN   EsmSelfAttention.__init__1  s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
::'> (
'-zC
$ "&''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD#))X5%49Q9Q%RD" ++"C1C-Cr+   hidden_statesr   r   encoder_hidden_statesencoder_attention_maskr   rh   c                    UR                   S S u  pxXxSU R                  4n	U R                  U5      R                  U	5      R	                  SS5      n
US LnU(       a  UOUnU(       a  UOUnU R                  U5      R                  U	5      R	                  SS5      nU R                  U5      R                  U	5      R	                  SS5      nXR                  S-  -  n
U R                  S:X  a  U R                  X5      u  p[        nU R                  R                  S:w  a]  U R                  S;   a0  [        SU R                  R                   S	U R                   S
35      e[        U R                  R                     nU" U U
UUU4U R                  (       d  SOU R                  U R                   US.UD6u  nnUR#                  XxS5      R%                  5       nUU4$ )Nr    r   r   g      r   eagerr   zESM z attention does not support z^ embeddings. Set attention explicitly to 'eager' with `model.set_attn_implementation('eager')`r   )r   r   r   )r.   r   r   r   r:   r   r   r   r   r   r   _attn_implementationr   r   r   r   r   reshaper   )rV   r   r   r   r   r   r   r   r   hidden_shapequery_layerr   current_states	key_layervalue_layerattention_interfacer   r   s                     r)   rj   EsmSelfAttention.forwardS  s    "/!4!4Sb!9
"D4L4LMjj/44\BLLQPQR2$>2D.-3E/>HH^,11,?II!QO	jj055lCMMaQRS "$<$<d$BB''83%)%;%;K%S"K(?;;++w6++/UU 4;;;;<<XY]YuYuXv wh h  #:$++:Z:Z"[$7
%
  $}}C$,,LL
%
 
%
!\ "))*"EPPRL((r+   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )NNFr   )rl   rm   rn   ro   rN   r$   rq   r   FloatTensorr   r   rt   rj   ru   rv   rw   s   @r)   r   r   0  s     DJ 7;15=A>B3)||3) !!2!233) E--.	3)
  ((9(9:3) !)):): ;3) +,3) 
u||	3) 3)r+   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )EsmSelfOutputi  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        g N)	rM   rN   r   r~   r   denser   r   r   r   s     r)   rN   EsmSelfOutput.__init__  sB    YYv1163E3EF
zz&"<"<=r+   c                 R    U R                  U5      nU R                  U5      nX-   nU$ r  r  r   rV   r   input_tensors      r)   rj   EsmSelfOutput.forward  ,    

=1]3%4r+   r  rl   rm   rn   ro   rN   rj   ru   rv   rw   s   @r)   r  r        >
 r+   r  c                   R   ^  \ rS rSrSU 4S jjrS r    SS\\   4S jjrSr	U =r
$ )	EsmAttentioni  c                    > [         TU ]  5         [        XUS9U l        [	        U5      U l        [        5       U l        [        R                  " UR                  UR                  S9U l	        g )N)r   r   r   )rM   rN   r   rV   r  outputsetpruned_headsr   r   r   r   )rV   r   r   r   rW   s       r)   rN   EsmAttention.__init__  sQ    $VUgh	#F+Ef&8&8f>S>STr+   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r!   )lenr   rV   r   r   r  r   r   r   r   r  r  r   union)rV   r   indexs      r)   prune_headsEsmAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r+   r   c                 ~    U R                  U5      nU R                  " U4UUUUS.UD6u  pU R                  X5      nU$ )Nr   r   r   r   )r   rV   r  )
rV   r   r   r   r   r   r   hidden_states_lnr   r   s
             r)   rj   EsmAttention.forward  sW      >>-8
)"7#9
 
 kk+=r+   )r   r  r  rV   )NFr   )rl   rm   rn   ro   rN   r#  r   r   rj   ru   rv   rw   s   @r)   r  r    s6    U;* "# +, r+   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )EsmIntermediatei  c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        g r  )rM   rN   r   r~   r   intermediate_sizer  r   s     r)   rN   EsmIntermediate.__init__  s,    YYv1163K3KL
r+   r   rh   c                 >    U R                  U5      n[        U5      nU$ r  )r  r8   )rV   r   s     r)   rj   EsmIntermediate.forward  s     

=1]+r+   )r  
rl   rm   rn   ro   rN   r$   rq   rj   ru   rv   rw   s   @r)   r*  r*    s)    MU\\ ell  r+   r*  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	EsmOutputi  c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  5      U l	        g r  )
rM   rN   r   r~   r,  r   r  r   r   r   r   s     r)   rN   EsmOutput.__init__  sB    YYv779K9KL
zz&"<"<=r+   c                 R    U R                  U5      nU R                  U5      nX-   nU$ r  r  r  s      r)   rj   EsmOutput.forward  r  r+   r  r  rw   s   @r)   r2  r2    r  r+   r2  c                   N   ^  \ rS rSrU 4S jr    SS\\   4S jjrS rSr	U =r
$ )EsmLayeri  c                   > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a.  U R                  (       d  [        U  S35      e[	        USS9U l	        [        U5      U l        [        U5      U l        [        R                  " UR                   UR"                  S9U l        g )Nr   z> should be used as a decoder model if cross attention is addedT)r   r   )rM   rN   chunk_size_feed_forwardseq_len_dimr  	attentionr   add_cross_attentionRuntimeErrorcrossattentionr*  intermediater2  r  r   r   r   r   r   s     r)   rN   EsmLayer.__init__  s    '-'E'E$%f- ++#)#=#= ##??"dV+i#jkk".v$"OD+F3'f&8&8f>S>STr+   r   c                     U R                   " U4UUS.UD6nU R                  (       a;  Ub8  [        U S5      (       d  [        SU  S35      eU R                  " U4UUUUS.UD6nU R                  U5      nU$ )N)r   r   r?  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r&  )r<  r   r   AttributeErrorr?  feed_forward_chunk)	rV   r   r   r   r   r   r   attention_outputlayer_outputs	            r)   rj   EsmLayer.forward  s      >>
)
 	
 ??4@4!122$=dV D` ` 
  $22  -#&;'=    ../?@r+   c                 l    U R                  U5      nU R                  U5      nU R                  X15      nU$ r  )r   r@  r  )rV   rE  attention_output_lnintermediate_outputrF  s        r)   rD  EsmLayer.feed_forward_chunk  s9    "nn-=>"//0CD{{#6Ir+   )	r   r=  r<  r:  r?  r@  r   r  r;  r   )rl   rm   rn   ro   rN   r   r   rj   rD  ru   rv   rw   s   @r)   r8  r8    s7    U$ "#! +,!F r+   r8  c                   R   ^  \ rS rSrU 4S jr\    SS\\   4S jj5       rSr	U =r
$ )
EsmEncoderi  c                 2  > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        [        R                  " UR                  UR                  S9U l        SU l        g s  snf )Nr   F)rM   rN   r   r   
ModuleListrangenum_hidden_layersr8  layerr   r   r   emb_layer_norm_aftergradient_checkpointing)rV   r   r   rW   s      r)   rN   EsmEncoder.__init__  sq    ]]eFD\D\>]#^>]HV$4>]#^_
$&LL1C1CI^I^$_!&+# $_s   Br   c           	          [        U R                  5       H  u  pxUb  X7   OS n	U" U4UU	UUS.UD6nM     U R                  (       a  U R                  U5      n[        US9$ )Nr&  )last_hidden_state)	enumeraterR  rS  r   )
rV   r   r   r   r   r   r   ilayer_modulelayer_head_masks
             r)   rj   EsmEncoder.forward   su      )4OA.7.CilO(-)&;'= M  5 $$ 55mDM1MRRr+   )r   rS  rT  rR  r   )rl   rm   rn   ro   rN   r   r   r   rj   ru   rv   rw   s   @r)   rM  rM    s=    ,  "#S +,S Sr+   rM  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )	EsmPooleri<  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r  )rM   rN   r   r~   r   r  Tanhr   r   s     r)   rN   EsmPooler.__init__=  s9    YYv1163E3EF
'')r+   r   rh   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ Nr   )r  r   )rV   r   first_token_tensorpooled_outputs       r)   rj   EsmPooler.forwardB  s6     +1a40

#566r+   )r   r  r0  rw   s   @r)   r^  r^  <  s(    $
U\\ ell  r+   r^  c                   ~    \ rS rSr% \\S'   SrSrSr/ SQr	S/r
SrSrSrSr\\" \SS	S
9/\" \SSS
9/S.rS rS rSrg)EsmPreTrainedModeliK  r   esmTF)r8  #EsmFoldTriangularSelfAttentionBlockr   zposition_embeddings.weightr   r<  )r"  
layer_namer?  )r   r   cross_attentionsc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[        5      (       a%  UR                  R                  R                  5         gg)zInitialize the weightsr   )meanstdNr3   )
isinstancer   r~   r   datanormal_r   initializer_ranger   zero_r   r   r   fill_	EsmLMHead)rV   r   s     r)   _init_weights EsmPreTrainedModel._init_weightsa  s2   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S)	**KK""$ +r+   c                     g r   rV   s    r)   get_output_embeddings(EsmPreTrainedModel.get_output_embeddingss  s     r+   rz  N)rl   rm   rn   ro   r   rr   base_model_prefixsupports_gradient_checkpointingaccepts_loss_kwargs_no_split_modules"_keys_to_ignore_on_load_unexpected_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr8  r   r   _can_record_outputsrw  r|  ru   rz  r+   r)   rh  rh  K  sz    &*#\*F)G&N"& "%&6aKXY+1AQR
%$r+   rh  c                     ^  \ rS rSrSrSU 4S jjrS rS rS r\	" 5       \
       SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\\R                     \4   4S jj5       5       rS rSrU =r$ )EsmModeliy  a  

The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        [        UR                  UR                  -  SS9U l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
NT)r{   r   )rM   rN   r   r   r   rM  encoderr^  poolerry   rQ  r   contact_head	post_init)rV   r   add_pooling_layerrW   s      r)   rN   EsmModel.__init__  so    
 	 '/!&)+<i'$40063M3MMTX

 	r+   c                 .    U R                   R                  $ r  r   r   r{  s    r)   get_input_embeddingsEsmModel.get_input_embeddings  s    ...r+   c                 $    XR                   l        g r  r  )rV   r   s     r)   set_input_embeddingsEsmModel.set_input_embeddings  s    */'r+   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  rR  r<  r#  )rV   heads_to_prunerR  r   s       r)   _prune_headsEsmModel._prune_heads  s<    
 +002LELLu%//;;EB 3r+   r   r   r   r   r   r   r   r   rh   c                    USL USL-  (       a  [        S5      eUc  U R                  UUS9nU R                  R                  S:w  aE  UR                  SS u  pUc   [
        R                  " X4UR                  S9nU R                  X)U
4S9nU R                  R                  (       aL  UbI  UR                  5       u  pnX4nUc  [
        R                  " XR                  S9nU R                  U5      nOSnU R                  X@R                  R                  5      nU R                  " U4UUUUS.UD6nUS	   nU R                  b  U R                  U5      OSn[!        UUS
9$ )a  
input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
position_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `((batch_size, sequence_length), hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   flash_attention_2r    rZ   )r   r&  r   )rW  pooler_output)r   r   r   r   r.   r$   onesr[   get_extended_attention_maskr   r   invert_attention_maskget_head_maskrQ  r  r  r   )rV   r   r   r   r   r   r   r   r   r   r   encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskencoder_outputssequence_outputre  s                      r)   rj   EsmModel.forward  s   > -t";<YZZ  OO#) , M
 ;;++/BB%2%8%8"%="J%!&j-E}OcOc!d+/+K+K,D ,L ,N ;;!!&;&G=R=W=W=Y:$6#P %-).4HQeQe)f&.2.H.HI_.`+.2+ &&y++2O2OP	,,
)"7#B
 
 *!,8<8OO4UY;-'
 	
r+   c                 6   U " XSSS9R                   n[        R                  " USS9nX2R                  S5      R                  S5      R                  S5      -  nX2R                  S5      R                  S5      R                  S5      -  nU R	                  X5      $ )NT)r   return_dictoutput_attentionsr   r!   r   r
      )r   r$   stackr   r  )rV   r   r   attnss       r)   predict_contactsEsmModel.predict_contacts  s    V`deppEq)
 	))!,66q9CCAFF))!,66q9CCAFF  //r+   )r   r  r   r  r  )T)NNNNNNN)rl   rm   rn   ro   rp   rN   r  r  r  r   r   r   r$   rq   r   r   r   rt   r   rj   r  ru   rv   rw   s   @r)   r  r  y  s   
(/0C  -115/3,0048<9=O
ELL)O
 !.O
 u||,	O

 ELL)O
  -O
  (5O
 !) 6O
 +,O
 
uU\\"$PP	QO
  O
b	0 	0r+   r  c                   z  ^  \ rS rSrS/rU 4S jrS rS r\\	        SS\
\R                     S\
\R                     S\
\R                     S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\R                     S\\   S\\\4   4S jj5       5       rS rSrU =r$ )EsmForMaskedLMi  zlm_head.decoder.weightc                    > [         TU ]  U5        UR                  (       a  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         U R                  5         g )NzjIf you want to use `EsmForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr  )rM   rN   r   loggerwarningr  ri  rv  lm_headinit_weightsr  r   s     r)   rN   EsmForMaskedLM.__init__  s\     NN1
 Fe< (r+   c                 .    U R                   R                  $ r  r  decoderr{  s    r)   r|  $EsmForMaskedLM.get_output_embeddings  s    ||###r+   c                 $    XR                   l        g r  r  )rV   new_embeddingss     r)   set_output_embeddings$EsmForMaskedLM.set_output_embeddings  s    -r+   r   r   r   r   r   r   r   labelsr   rh   c	           
      n   U R                   " U4UUUUUUS.U	D6n
U
S   nU R                  U5      nSnUba  [        5       nUR                  UR                  5      nU" UR                  SU R                  R                  5      UR                  S5      5      n[        UUU
R                  U
R                  S9$ )az  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
)r   r   r   r   r   r   r   Nr    losslogitsr   r   )ri  r  r   r^   r[   r   r   r   r   r   r   )rV   r   r   r   r   r   r   r   r  r   outputsr  prediction_scoresmasked_lm_lossloss_fcts                  r)   rj   EsmForMaskedLM.forward"  s    * ((	
)%'"7#9	
 	
 "!* LL9')HYY0778F%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
r+   c                 4    U R                   R                  XS9$ )N)r   )ri  r  )rV   r   r   s      r)   r  EsmForMaskedLM.predict_contactsR  s    xx(((OOr+   )ri  r  )NNNNNNNN)rl   rm   rn   ro   _tied_weights_keysrN   r|  r  r   r   r   r$   
LongTensorrq   r	  r   r   r   rt   r   rj   r  ru   rv   rw   s   @r)   r  r    s"   23 $.  151537,059=A9=-1,
E,,-,
 !.,
 u//0	,

 ELL),
   1 12,
  ((9(9:,
 !) 6,
 ))*,
 +,,
 
un$	%,
  ,
\P Pr+   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )rv  iV  z&ESM Head for masked language modeling.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  SS9U l
        [        R                  " [        R                  " UR                  5      5      U l        g )Nr   F)r   )rM   rN   r   r~   r   r  r   r   r   r   r  	Parameterr$   zerosr   r   s     r)   rN   EsmLMHead.__init__Y  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FUSLLV->->!?@	r+   c                     U R                  U5      n[        U5      nU R                  U5      nU R                  U5      U R                  -   nU$ r  )r  r8   r   r  r   rV   featuresr   r&   s       r)   rj   EsmLMHead.forwarda  sD    JJx GOOA LLOdii'r+   )r   r  r  r   	rl   rm   rn   ro   rp   rN   rj   ru   rv   rw   s   @r)   rv  rv  V  s    0A r+   rv  z
    ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                   "  ^  \ rS rSrU 4S jr\\      SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\   S
\\\4   4S jj5       5       rSrU =r$ )EsmForSequenceClassificationik  c                    > [         TU ]  U5        UR                  U l        Xl        [	        USS9U l        [        U5      U l        U R                  5         U R                  5         g NFr  )
rM   rN   
num_labelsr   r  ri  EsmClassificationHead
classifierr  r  r   s     r)   rN   %EsmForSequenceClassification.__init__r  sR      ++Fe</7r+   r   r   r   r   r   r  r   rh   c                    U R                   " U4UUUUS.UD6nUS   n	U R                  U	5      n
SnUGb  UR                  U
R                  5      nU R                  R
                  c  U R                  S:X  a  SU R                  l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                  l        OSU R                  l        U R                  R
                  S:X  aI  [        5       nU R                  S:X  a&  U" U
R                  5       UR                  5       5      nOU" X5      nOU R                  R
                  S:X  a=  [        5       nU" U
R                  SU R                  5      UR                  S5      5      nO,U R                  R
                  S:X  a  [        5       nU" X5      n[!        UU
UR"                  UR$                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
r   r   r   r   r   Nr   r   single_label_classificationmulti_label_classificationr    r  )ri  r  r^   r[   r   problem_typer  rL   r$   r   rs   r	   r   r   r   r   r   r   r   rV   r   r   r   r   r   r  r   r  r  r  r  r  s                r)   rj   $EsmForSequenceClassification.forward~  s   & ((
)%'
 
 "!*1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r+   )r  r   ri  r  NNNNNN)rl   rm   rn   ro   rN   r   r   r   r$   r  rq   r	  r   r   r   rt   r   rj   ru   rv   rw   s   @r)   r  r  k  s    
  151537,059-1:
E,,-:
 !.:
 u//0	:

 ELL):
   1 12:
 ))*:
 +,:
 
u..	/:
  :
r+   r  c                   "  ^  \ rS rSrU 4S jr\\      SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\   S
\\\4   4S jj5       5       rSrU =r$ )EsmForTokenClassificationi  c                 N  > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         U R                  5         g r  )rM   rN   r  r  ri  r   r   r   r   r~   r   r  r  r  r   s     r)   rN   "EsmForTokenClassification.__init__  su      ++Fe<zz&"<"<=))F$6$68I8IJr+   r   r   r   r   r   r  r   rh   c                 x   U R                   " U4UUUUS.UD6nUS   n	U R                  U	5      n	U R                  U	5      n
SnUbW  [        5       nUR	                  U
R
                  5      nU" U
R                  SU R                  5      UR                  S5      5      n[        UU
UR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
r  r   Nr    r  )ri  r   r  r   r^   r[   r   r  r   r   r   r  s                r)   rj   !EsmForTokenClassification.forward  s    " ((
)%'
 
 "!*,,71')HYYv}}-FFKKDOO<fkk"oND$!//))	
 	
r+   )r  r   ri  r  r  )rl   rm   rn   ro   rN   r   r   r   r$   r  rq   r	  r   r   r   rt   r   rj   ru   rv   rw   s   @r)   r  r    s    
  151537,059-1)
E,,-)
 !.)
 u//0	)

 ELL))
   1 12)
 ))*)
 +,)
 
u++	,)
  )
r+   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r  i  z-Head for sentence-level classification tasks.c                 ,  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l
        g r  )rM   rN   r   r~   r   r  r   r   r   r  out_projr   s     r)   rN   EsmClassificationHead.__init__  s`    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr+   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ rc  )r   r  r$   tanhr  r  s       r)   rj   EsmClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!r+   )r  r   r  r  rw   s   @r)   r  r    s    7I r+   r  c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r   r!   )r   rs   r$   cumsumr\   r   )r   r   maskincremental_indicess       r)   r   r     sP     <<$((*D,,t3;;DADH##%33r+   )r  r  r  r  rh  )r   N)Frp   r5   typingr   r   r   r$   r   torch.nnr   r   r	   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   utils.genericr   r   configuration_esmr   
get_loggerrl   r  r*   r1   r8   r;   rE   ModulerG   ry   r   rq   rQ   r   r   r  r  r*  r2  r8  rM  r^  rh  r  r  rv  r  r  r  r   __all__rz  r+   r)   <module>r     s      , ,   A A 9  G & Q R R ? ( 
		H	%(
.;#
	)
ehhoo )
X Gryy  GF\=BII \=L (,/%II/%<</% 
/% <<	/%
 U\\*/% /% /% %/% '(/%dV)ryy V)r
BII 
-299 -`bii 
		 
7) 7t S  SH		  * * *Z K0! K0 K0\ JP' JP JPZ		 * I
#5 I
I
X 8
 2 8
 8
vBII &4 r+   