
    bCi                        S r SSKJrJrJr  SSKrSSKrSSKJr  SSK	Js  J
r  SSKJrJr  SSKJr  SSKJrJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJrJr  SSKJ r J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(J)r)J*r*  SSK+J,r,J-r-J.r.J/r/J0r0  \Rb                  " \25      r3SDS\44S jjr5 SES\Rl                  S\4S\4S\44S jjr7S\Rl                  S\Rp                  S\4S\9S\4S\Rl                  4S jr:SS\Rv                  4S\Rl                  S \4S!\4S"\<S#\4S$\Rz                  S\>\Rl                  \Rl                  4   4S% jjr?S&\Rl                  S'\\4   S\Rl                  4S( jr@ " S) S*\'5      rA " S+ S,\(5      rB " S- S.\ 5      rC " S/ S0\%5      rD " S1 S2\)5      rE " S3 S4\&5      rF\ " S5 S6\$5      5       rG " S7 S8\G5      rH " S9 S:\G5      rI " S; S<\G5      rJ " S= S>\G5      rK " S? S@\G5      rL " SA SB\#5      rM/ SCQrNg)Fz<Blt modular model, inheriting from Mllama where appropriate.    )CallableOptionalUnionN   )CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)OutputRecordercheck_model_inputs   )Cohere2RotaryEmbeddingrotate_half)MllamaForCausalLMMllamaPreTrainedModelMllamaSelfAttentionDecoderLayerMllamaTextCrossAttentionMllamaTextMLPMllamaTextRMSNormMllamaTextSelfAttentioneager_attention_forward   )	BltConfigBltGlobalTransformerConfigBltLocalDecoderConfigBltLocalEncoderConfigBltPatcherConfigprimec                     [         R                  " U[         R                  U R                  S9n[         R                  " U R
                  S   U R                  S9nX#-  n[         R                  " X-  SS9$ )aX  
A polynomial rolling hash algorithm that converts sequences
of tokens into hash values. The hash is computed as:
    hash = (token_0 * prime^0 + token_1 * prime^1 + ... + token_n * prime^n)

The rolling hash allows the model to efficiently
identify and encode recurring byte-level patterns in the input text.

Args:
    token_tensor (torch.Tensor): [batch_size, seq_len, group_size] containing token IDs to hash
    prime (int): Prime number used as the base for the polynomial hash.

Returns:
    torch.Tensor: Hash values of shape [batch_size, seq_len] where each value
                 represents the hash of the corresponding token group

Example:
    >>> tokens = torch.tensor([[1, 2, 3], [4, 5, 6]])
    >>> hashes = rolling_polynomial_hash(tokens, prime=31)
    >>> # hash[0] = 1*31^0 + 2*31^1 + 3*31^2
    >>> # hash[1] = 4*31^0 + 5*31^1 + 6*31^2
dtypedevicer(   dim)torchtensorint64r(   arangeshapesum)token_tensorr$   prime_tensorpowersprime_powerss        ]/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/blt/modular_blt.pyrolling_polynomial_hashr8   9   s]    . <<U[[ATATUL\\,,,R09L9LMF'L99\0b99    	token_ids
group_sizemax_hashc                 T   [         R                  " 5          U R                  u  pE[         R                  " XAS-
  [         R                  U R
                  S9n[         R                  " X`/SS9nUR                  SUS5      n[        X5      n	X-  n
SSS5        U
$ ! , (       d  f       W
$ = f)z1Hash token groups and map to range [0, max_hash].r   r&   r+   N)	r-   no_gradr1   zerosr/   r(   catunfoldr8   )r:   r;   r$   r<   
batch_sizeseq_lenpaddingpadded_tokenswindowshasheshash_valuess              r7   byte_group_hash_functionrI   V   s     
'oo
++jq.T]TdTde		7"6A>  &&q*a8(8' 
  
 s   A8B
B'local_encoder_tokensencoder_hash_tok_embedding$encoder_hash_byte_group_nb_functionsencoder_hash_byte_group_sizeencoder_hash_byte_group_vocabreturnc                     / SQnUR                  U 5      nSn[        U5       H>  n	Xi[        U5      -     n
U H%  n[        XX5      nXU-  -   nXr" U5      -  nUS-  nM'     M@     U$ )z=Compute token embeddings enhanced with hash-based embeddings.)ʚ;l   21A ioYl   vt l   . l   }g l   Au l   0 l   T l   AK l   | r   r   )embed_tokensrangelenrI   )rJ   local_encoderrK   rL   rM   rN   primes
embeddingsembedding_idxfunc_nbr$   r;   hash_idsoffset_hash_idss                 r7   compute_hash_embeddingsr\   h   s    F ++,@AJM=>V,-6J/0DRWwH&9V)VVO4_EEJQM 7 ? r9   F	patch_idsnum_patchessequence_lengthpatches_as_queriescross_attn_kr'   c                 l   U R                   u  pgU R                  nU(       aj  X-  n	Un
[        R                  " XS9R	                  S5      R	                  S5      R                  XaU5      nU R	                  S5      R                  XaU5      nOiUn	X-  n
U R	                  S5      R                  XgU5      n[        R                  " XS9R	                  S5      R	                  S5      R                  XgU5      nX:H  nU(       a  SOSnUR                  XNS9nXiU
4nUR                   U:w  a  [        SUR                    SU 35      eUR	                  S5      nSUR                  U5      -
  nUR                  UR                  [        R                  5      [        R                  " U5      R                  5      nU$ )	a  
Prepare cross-attention mask for patch-based attention, following mllama's robust approach.

This function creates masks that control which patches can attend to which other patches,
with support for query/key role swapping and cross-attention multipliers.

Args:
    patch_ids (torch.Tensor): Tensor of shape [batch_size, seq_len] containing patch ids.
    num_patches (int): Total number of patches.
    sequence_length (int): Length of the sequence.
    patches_as_queries (bool): If True, patches are used as queries, otherwise as keys.
    cross_attn_k (int): Cross-attention multiplier for repeating patches.
    dtype (torch.dtype): Data type for the output mask.

Returns:
    Tuple[torch.Tensor, torch.Tensor]:
        - cross_attention_mask: 4D tensor [batch_size, 1, q_len, kv_len]
r*   r   r)   r   r+   zCross attention mask shape z doesn't match expected g      ?)r1   r(   r-   r0   	unsqueezeexpandrepeat_interleave
ValueErrortomasked_fillboolfinfomin)r]   r^   r_   r`   ra   r'   rB   rC   r(   q_lenkv_lenq_patch_idskv_patch_idscross_attention_mask
repeat_dimexpected_shapeinverted_cross_attn_masks                    r7   #_prepare_patch_cross_attention_maskrt      s   4 $//JF *  LL4Yq\Yr]VJW5	 	 !**1-44ZgV+))"-44Z+VLL4>>qAKKANUUV`kvw 	 '6 )bJ/AA,A_ !0N!!^3)*>*D*D)EE]^l]mn
 	

 099!<  #%9%<%<U%CC3?? ##EJJ/U1C1G1G  r9   patch_lengthsmax_patch_lengthc                 <   Uc  U $ U R                  S5      n/ nU  Hp  n/ nXDS:      HO  nUR                  5       n[        Xa5      u  pxUR                  U/U-  5        U(       d  M>  UR	                  U5        MQ     UR	                  U5        Mr     [        S U 5       5      n	[        R                  " X)4U R                  U R                  S9n
[        U5       HF  u  pU(       d  M  [        R                  " XPR                  U R                  S9XS[        U5      24'   MH     U
S:g  R                  SS9R                  5       U
R                  S   :  aJ  U
S:g  R                  SS9R!                  5       R                  5       R                  5       S-   nU
SS2SU24   n
U
$ )a  
Splits patch lengths into smaller segments if they exceed `max_patch_length`.
Pads the result to uniform length across the batch.

Args:
    patch_lengths (torch.Tensor): [batch_size, num_patches] tensor of patch lengths.
    max_patch_length (int, optional): Maximum allowed length per patch.

Returns:
    torch.Tensor: [batch_size, max_len] tensor of split and padded patch lengths.
Nr   c              3   8   #    U  H  n[        U5      v   M     g 7fN)rT   ).0splitss     r7   	<genexpr>(process_patch_lengths.<locals>.<genexpr>   s     6I&#f++Is   r&   r+   r   )sizeitemdivmodextendappendmaxr-   r?   r'   r(   	enumerater.   rT   anyr2   r1   nonzero)ru   rv   rB   	processedseqr{   lengthfull_chunks	remaindermax_lenpaddedilast_nonzeros                r7   process_patch_lengthsr      s    ##A&JI'lF[[]F%+F%E"KMM+,{:;yi( # 	   6I66G[[*.m6I6IR_RfRfgFy)	6',||FBUBU^k^r^r'sFmFm#$ *
 	!Q##%Q7!((Q(/779==?DDFJ=L=()Mr9   c                       \ rS rSrSrg)BltMLPi   N__name__
__module____qualname____firstlineno____static_attributes__r   r9   r7   r   r         r9   r   c                       \ rS rSrSrg)
BltRMSNormi  r   Nr   r   r9   r7   r   r     r   r9   r   c                       \ rS rSrSrg)BltRotaryEmbeddingi  r   Nr   r   r9   r7   r   r     r   r9   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )BltTransformerLayeri  	layer_idxc                    > [         TU ]  5         [        XS9U l        [	        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        g )N)configr   eps)super__init__BltSelfAttention	self_attnr   mlpr   hidden_sizerms_norm_epsinput_layernormpost_attention_layernormselfr   r   	__class__s      r7   r   BltTransformerLayer.__init__  s[    )M&>)&*<*<&BUBUV(263E3E6K^K^(_%r9   )r   r   r   r   )r   r   r   r   intr   r   __classcell__r   s   @r7   r   r     s    `# ` `r9   r   c            	          ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\R                  S\R                  S\	4U 4S	 jjjr
S
rU =r$ )r   i  r   r   c                 2   > [         TU ]  X5        SU l        g )NT)r   r   	is_causalr   s      r7   r   BltSelfAttention.__init__  s    +r9   hidden_statesattention_maskposition_embeddings	use_cachec           
      2   > [         TU ]  " SUUUUUUS.UD6$ )N)r   r   r   r   past_key_valuescache_positionr   )r   forward)	r   r   r   r   r   r   r   kwargsr   s	           r7   r   BltSelfAttention.forward#  s7     w 
') 3+)
 
 	
r9   )r   )FNN)r   r   r   r   r   r   r   r-   Tensorri   r   r   r   r   s   @r7   r   r     s]    y S   
||
 
 #\\	

 
 
r9   r   c                      ^  \ rS rSrSrSS\S\S\\   4U 4S jjjr    SS\	R                  S\\	R                     S	\\   S
\\	R                     S\\	R                     S\\   4S jjrSrU =r$ )BltCrossAttentioni8  z<Cross-attention module for Blt, following transformers styler   r   r   c                    > [         TU ]  5         SU l        [        U R                  UR
                  S9U l        [        U R                  UR
                  S9U l        g )NFr   )r   r   r   r   r   r   q_normk_norm)r   r   r   r   r   s       r7   r   BltCrossAttention.__init__;  sI     !1!1v7J7JK !1!1v7J7JKr9   r   cross_attention_statesr   r   r   r   c                    UR                  5       u  pxn	U R                  U5      n
U R                  U
5      n
U
R                  XxU R                  U R
                  5      R                  SS5      n
Ub  U R                  U5      nU R                  U5      nU R                  U5      nUR                  USU R                  U R
                  5      R                  SS5      nUR                  USU R                  U R
                  5      R                  SS5      nUb!  UR                  XU R                  SU05      u  pOZUS   S:w  aF  UR                  U R                     R                  UR                  U R                     R                  pO[!        S5      e["        nU R$                  R&                  S:w  a  [(        U R$                  R&                     nU" U U
UUU4U R*                  (       d  SOU R,                  U R.                  S	.UD6u  pUR1                  XxS5      R3                  5       nU R5                  U5      nX-   nX4$ )
Nr   r   r)   r   r   z^Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!eagerg        )dropoutscaling)r~   r   q_projview	num_headshead_dim	transposer   k_projv_projnum_key_value_headsupdater   layerskeysvaluesrf   r   r   _attn_implementationr   trainingr   r   reshape
contiguouso_proj)r   r   r   r   r   r   r   bszrl   _query_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightss                   r7   r   BltCrossAttention.forwardA  s!    &**,A{{=1{{<0#((T^^T]]S]]^_abc!-%)[[1G%H"%;<J;;'=>L#b$2J2JDMMZddefhijJ',,S"d6N6NPTP]P]^hhijlmnL*+:+A+Adnn?OQ_>`,(
 A!#&&t~~6;;&&t~~6== %
 p  )@;;++w6"9$++:Z:Z"[$7	%
  $}}C$,,LL	%
 	%
! "))#b9DDFkk+.!1((r9   )r   r   r   ry   NNNN)r   r   r   r   __doc__r   r   r   r   r-   r   r   
LongTensorr   r   r   r   r   r   s   @r7   r   r   8  s    FLy LS LxPS} L L :>+/15593)||3) !) 63) "%	3)
 !.3) !!1!123) +,3) 3)r9   r   c                   f    \ rS rSr% \\S'   SrSrSrS/r	\
" \SSS9\
" \SSS9S	.rS
 rS rS rSrg)BltPreTrainedModeliw  r   Fr   r   local_decoderindex
layer_namer   )r   
attentionsc                     [        S5      eNzNo need to inherit it!AttributeErrorr   modules     r7   _init_weights BltPreTrainedModel._init_weights      566r9   c                     [        S5      er   r   r   s     r7   _update_causal_mask&BltPreTrainedModel._update_causal_mask  r   r9   c                     [        S5      er   r   r   s     r7   5_prepare_4d_causal_attention_mask_with_cache_positionHBltPreTrainedModel._prepare_4d_causal_attention_mask_with_cache_position  r   r9   r   N)r   r   r   r   r   __annotations___supports_attention_backend_supports_flash_attn_supports_flex_attn_no_split_modulesr   r   r   _can_record_outputsr   r   r  r   r   r9   r7   r   r   w  sU    "' ./'(;1Q`a$%5Q?[
777r9   r   c                     ^  \ rS rSr% \\S'   S\" \SSS90rS\4U 4S jjr	          SS\
\R                     S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\   S\
\R                     S\
\R                     S\
\   S\
\R                     S\\   4S jjrS rSrU =r$ )BltLocalEncoderi  r   encoder_attentionsr   rU   r   c           	        > [         TU ]  U5        SU l        Xl        [        R
                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l	        [        US9U l        [        R                  " UR                  UR                  UR                  -  SS9U l        [        R                   " UR"                  UR                  5      U l        [        R
                  " 5       U l        UR(                  (       a  UR                  OSn[        U5       H0  nU R&                  R+                  [-        XUR                  S95        M2     U R/                  5         g s  snf )NFr   in_featuresout_featuresbiasr   r   r   r   )r   r   gradient_checkpointingr   nn
ModuleListrS   num_hidden_layersr   r   r   
rotary_embLinearr   ra   patch_embedding_projection	Embedding
vocab_sizerR   cross_attn_layerscross_attn_all_layersr   r   	post_initr   r   r   layers_to_addr   s       r7   r   BltLocalEncoder.__init__  s    &+#mmEJ6KcKcEdeEd	 3Ede
 -F;*,))**++f.A.AA+
'
 LL):):F<N<NO!#4:4P4P00VW}-I""))!RXRdRde .
 	! fs   E)	input_idsinputs_embedspatch_embedsr   position_idsr   r   encoder_attention_maskr^   r]   r   c           	         Uc  U R                  U5      nUR                  S   n[        R                  " X R                  R                  U R
                  S9nUcK  [        R                  " UR                  S   UR                  S9R                  S5      R                  US5      nU R                  X5      n[        R                  " XR                  R                  U R
                  S9n[        U R                  5       H  u  nnU" U4UUUUS.UD6nU[        U R                  5      S-
  :X  d  U R                  R                  (       d  MM  U R!                  XU
5      nU R#                  U5      nUR%                  XR                  S   U R                  R&                  -  U R                  R(                  5      nU R                  R                  (       a  UOSnU R*                  U   " SUUUS.UD6u  nnUU-   nM     UnUU4$ )	Nr   pr   r   r*   r)   r   r   r   r   r   r   r   r   )rR   r1   Fr   r   r   r-   r0   r(   rc   rd   r  r   r   rT   r  patch_reducer  r   ra   r   r  )r   r"  r#  r$  r   r%  r   r   r&  r^   r]   r   rB   r   r   idxlayerr   cross_attention_outputr   encoder_cross_statess                        r7   r   BltLocalEncoder.forward  s      --i8M"((+
		-;;3F3FQUQ^Q^_]003M<P<PQ[[\]^eefprtu  #oomJ		-;;3F3FQUQ^Q^_#DKK0JC!$7- /- M c$++&**dkk.O.O.O#00YW#>>|L+33 2 21 58P8P PRVR]R]RiRi  $(;;#D#DC!	,0,B,B9,M -".+8#9- 	-)&  ,.DD- 1.  ,222r9   c                 :   UR                   S   nUR                   S   nUR                  S5      R                  SSUR                   S   5      n[        R                  " XBU4UR
                  UR                  S9nUR                  USUSSS9nUSS2SU2SS24   nU$ )	a>  
Reduce variable length patches to single embedding per patch
Note: this works with variable number of patches for different sequences in the batch
It handles variable length patches by assuming that patch_lengths will be 0 for any
extra patches on the *right*. Since there can be a variable number of patches
this function also return the number of patches for each sequence in the batch.
Any embeddings on the right that are not allocated to a patch
(i.e. if the sum(patch_lengths[i]) < seq_len for any i)
will be sent to a dummy patch, which is trimmed before returning.
r   r)   r&   r   amaxF)srcr,   r   reduceinclude_selfN)r1   rc   rd   r-   r?   r'   r(   scatter_reduce)r   r   max_num_patchesr]   rB   embedding_dimreduced_embeddingss          r7   r-  BltLocalEncoder.patch_reduce  s     #((+
%++B/''+222r=;N;Nr;RS	"[[-8@S@S\i\p\p
 0>> ? 
 03CO3CQ0FG!!r9   )r   r  rR   r  r   r  r  
NNNNNNNNNN)r   r   r   r   r"   r  r   r   r  r   r   r-   r   r   r   r   r   r   r   r-  r   r   r   s   @r7   r
  r
    s&   !!n-=QSbc4 2 1504/31537+/599=%),043E,,-43  -43 u||,	43
 !.43 u//043 "%43 !!1!1243 !) 643 c]43 ELL)43 +,43l" "r9   r
  c                   T  ^  \ rS rSr% \\S'   S\4U 4S jjr\" 5               SS\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S	\\   S
\\	R                     S\\	R                     S\\   4S jj5       rSrU =r$ )BltLocalDecoderi   r   c           	        > [         TU ]  U5        SU l        Xl        SU l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l
        [        US9U l        [
        R                  " UR                  UR                  UR                   -  SS9U l        [%        UR                  UR&                  S9U l        [
        R                  " 5       U l        UR,                  (       a  UR                  OSn[        U5       H0  nU R*                  R/                  [1        XUR                  S95        M2     U R3                  5         g s  snf )NFTr  r  r   r   r  )r   r   r  r   cross_attn_decoderr  r  rS   r  r   r   r   r  r  hidden_size_globalr   ra   r  r   r   normr  r  r   r   r  r  s       r7   r   BltLocalDecoder.__init__  s    &+#"&mmEJ6KcKcEdeEd	 3Ede
 -F;*,))11++f.A.AA+
'
 v11v7J7JK	!#4:4P4P00VW}-I""))!RXRdRde .
 	! fs   E#r"  r#  r$  r   r%  r   r   r&  r   c	           	      .   UR                   S   n
UnU R                  U5      nUR                  XR                   S   U R                  R                  -  U R                  R
                  5      nUb  U R                  (       d  X-   nUcK  [        R                  " UR                   S   UR                  S9R                  S5      R                  U
S5      nU R                  X5      n[        R                  " XR                  R                  U R                  S9n[!        U R"                  5       HT  u  pUS:X  d  U R                  R$                  (       a   U R&                  U   " SUUUS.U	D6u  nnX-   nU" U4UUUUS.U	D6nMV     U R)                  U5      nU$ )	Nr   r   r*   r)   r(  r+  r*  r   )r1   r  r   r   ra   r   rA  r-   r0   r(   rc   rd   r  r,  r   r   r   r   r  r  rC  )r   r"  r#  r$  r   r%  r   r   r&  r   rB   r   r   r   r/  r0  r   logitss                     r7   r   BltLocalDecoder.forward  s    #((+
%66|D#++**1-0H0HH$++JaJa
 #D,C,C)8M]003M<P<PQ[[\]^eefprtu  #oomJ		-;;3F3FQUQ^Q^_!$++.HAAv::,0,B,B1,E -"/+7#9- 	-)& !. F!$7- /- M /" =)r9   )r   rA  r  r  r   rC  r  r  NNNNNNNN)r   r   r   r   r!   r  r   r   r   r-   r   r   r   r   r   r   r   r   r   s   @r7   r?  r?     s    !!4 0  1504/31537+/599=0E,,-0  -0 u||,	0
 !.0 u//00 "%0 !!1!120 !) 60 +,0 0r9   r?  c                      ^  \ rS rSr% \\S'   S\" \SSS90rS\4U 4S jjr	    SS\
R                  S	\\
R                     S
\\
R                     S\\   S\\
R                     S\\   4S jjrSrU =r$ )BltGlobalTransformeriO  r   global_attentionsr   global_transformerr   c                   > [         TU ]  U5        Xl        [        R                  " 5       U l        [        UR                  5       H'  nU R
                  R                  [        X5      5        M)     [        US9U l        [        USS 5      b0  [        R                  " UR                  UR                  SS9U l        O[        R"                  " 5       U l        U R%                  5         g )Nr  encoder_cross_output_sizeFr  )r   r   r   r  r  r   rS   r  r   r   r   r  getattrr  rN  r   token_embedding_projectionIdentityr  r   s      r7   r   BltGlobalTransformer.__init__U  s     mmov778IKK26EF 9,F; 66=I.0ii00&2D2D5/D+ /1kkmD+r9   input_embedsr   r%  r   r   r   c           	         UR                   u  pxn	U R                  U5      n
[        R                  " XR                  R                  U R
                  S9n
UcK  [        R                  " UR                   S   UR                  S9R                  S5      R                  US5      nU R                  X5      n[        U R                  5       H  u  pU" U
4UUUUS.UD6n
M     U
$ )Nr(  r   r*   r   r)   r*  )r1   rQ  r,  r   r   r   r-   r0   r(   rc   rd   r  r   r   )r   rT  r   r%  r   r   r   rB   rC   r   r   r   r   r/  s                 r7   r   BltGlobalTransformer.forwardg  s     ".!3!3
Q77E		-;;3F3FQUQ^Q^_\//2<;N;NOYYZ[\ccdnprs  #oomJ!$++.HA!$7- /- M / r9   )r   r   r  rQ  r   )r   r   r   r   r    r  r   r   r  r   r-   r   r   r   r   r   r   r   r   r   r   s   @r7   rJ  rJ  O  s    &&^,<ARfg9 * 2637+/59ll !. u//0	
 "% !!1!12 +, r9   rJ  c                   H  ^  \ rS rSr% \\S'   S\4U 4S jjr          SS\\R                     S\\R                     S\\R                     S\\   S\\R                     S	\\   S
\\R                     S\\   S\\   S\\   S\\   4S jjr\  SS j5       rSrU =r$ )
BltPatcheri  r   c                   > [         TU ]  U5        [        U R                  S9U l        [
        R                  " 5       U l        [        U R                  R                  5       H2  nU R                  R                  [        U R                  U5      5        M4     [
        R                  " U R                  R                  U R                  R                  5      U l        [!        U R                  R                  U R                  R"                  S9U l        [
        R&                  " U R                  R                  U R                  R                  SS9U l        g )Nr  r   FrO  )r   r   r   r   r  r  r  r   rS   r  r   r   r  r  r   rR   r   r   rC  r  lm_headr   s      r7   r   BltPatcher.__init__  s     ,DKK@mmot{{<<=IKK24;;	JK >LL)?)?AXAXYt{{66DKK<T<TU	yyKK##KK""
r9   r"  r   r%  r   r#  r   r   
patch_size	thresholdrv   r   c           	      $   US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc
  [        5       nUcD  Ub  UR                  5       OSn[        R
                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U R                  UUUUUS9nUnU R                  X5      nU R                   H
  nU" XUS9nM     U R                  U R                  U5      5      n[        R                  R!                  US9R#                  5       nUR                  S S u  nnUb  U R%                  UUUU	S	9nO,[        R&                  " UU4UR(                  UR                  S
9n[+        UU
5      nUUU4$ )N:You must specify exactly one of input_ids or inputs_embedsr   r   r*   r   rT  r   r   r   r%  )r   r   )rF  r   )	entropiesr_   r\  r]  r&   )rf   rR   r   get_seq_lengthr-   r0   r1   r(   rc   r	   r   r  r   rZ  rC  distributionsCategoricalentropypatch_lengths_from_entropiesonesr'   r   )r   r"  r   r%  r   r#  r   r   r\  r]  rv   r   past_seen_tokenscausal_maskr   r   r/  rF  prediction_entropiesrB   r_   ru   s                         r7   r   BltPatcher.forward  s    -t";<YZZ  --i8M0*nO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L(;;&))+%
 &"oomJ[[E!-ituM ! dii67$22>>f>MUUW&3&9&9"1&=#
O! ==. /%#	 > M "JJ_-]5H5HQ^QeQeM .m=MN#]F::r9   c                 `   U R                   S   n[        R                  " SS/[        R                  U R                  S9R                  S5      R                  US5      nUR                   S   nU SS2SS24   n X:  nUR                   S   n[        R                  " XR                  S9R                  S5      R                  US5      n	[        R                  " X5      n
[        R                  " X/SS9n[        R                  " Xw) /SS9nX   R                  XH5      nUR                  SS9R                  5       nUSS2SU24   n[        R                  " X]U-   4SS9n[        R                  " USS2SS24   US-
  5      n[        R                  " USS2SS24   S-
  U4SS9nUU-
  S-   nU$ )z
Computes patch lengths from token entropies.

Depending on whether a threshold is provided, the function uses either:
- Thresholding the entropy values (when `threshold` is set).
r   r   r&   Nr*   r)   r+   )r1   r-   r.   longr(   rc   repeatr0   rd   	full_liker@   r   r2   r   )ra  r_   r\  r]  rB   init_tokensoffset
patch_maskrC   token_indicessentinelpadded_indicespadded_maskpatch_startsmax_valid_patchespatch_start_ids
last_token
patch_endsru   s                      r7   rf  'BltPatcher.patch_lengths_from_entropies  s    __Q'
 LL!Quzz):J:JKUUVWX__`jlmn 	 ""1% ae$	 *
""1% W5E5EFPPQRSZZ[egij??=:M#<!D ii[ 9qA &2:::O&NNqN1557#A'9(9'9$9:  ))[2G$HaP ___QU%;_q=PQ
YY12 6 :JGQO
"_4q8r9   )rR   r   rZ  rC  r  r=  )NN)r   r   r   r   r#   r  r   r   r-   r   r   r   FloatTensorri   r   floatr   r   r   staticmethodrf  r   r   r   s   @r7   rX  rX    s   
/ 
  151537+/59$(59$(%)*.?;E,,-?; !.?; u//0	?;
 "%?;   1 12?; D>?; !!1!12?; SM?; E??; #3-?; +,?;B  	3 3r9   rX  c                     ^  \ rS rSrS\4U 4S jjr\" 5               SS\\R                     S\\R                     S\\R                     S\\R                     S\\   S	\\R                     S
\\   S\\R                     S\\   S\4S jj5       rS rS rS\R                  S\S\R                  4S jrSrU =r$ )BltModeli  r   c                   > [         TU ]  U5        SU l        Xl        [	        UR
                  5      U l        [        UR                  5      U l	        [        UR                  5      U l        UR                  [        UR                  5      -  nUR                   U-  n["        R$                  " X1R
                  R&                  5      U l        U R                  R*                  (       a]  [-        UR.                  5      U l        U R0                  R3                  5         U R0                  R5                  5        H
  nSUl        M     OS U l        U R9                  5         g )NF)r   r   r  r   r
  encoder_configrU   rJ  global_configrL  r?  decoder_configr   rL   rT   rM   rN   r  r  r   rK   patch_in_forwardrX  patcher_configpatchereval
parametersrequires_gradr  )r   r   num_embeddingstotal_vocab_sizeparamr   s        r7   r   BltModel.__init__  s    &+#,V-B-BC"6v7K7K"L,V-B-BCDDs6KnKnGoo!??.P*,,,7GI^I^IjIj*k';;''%f&;&;<DLLL002&+# 3  DLr9   r"  ru   r   r%  r   r#  r   r   r   rO   c	                 B   US L US L-  (       a  [        S5      eUb  Un
UR                  u  pnOnUR                  u  p[        UU R                  U R                  U R
                  R                  U R
                  R                  U R
                  R                  5      n
UGc  U R
                  R                  S:X  a  U R                  b  Uc  [        S5      eU R                  UU R
                  R                  U R
                  R                  U R
                  R                  U R
                  R                  UR                  S9u  pnOmUb  UR                  OUR                  nUb  UR                   OUR                   n[#        [$        R&                  " XS-   4XS9U R
                  R                  5      nU R)                  X,5      nUcE  Ub  UR+                  5       OSn[$        R,                  " UUU
R                  S   -   U
R                  S9nUc  UR/                  S5      n[1        U R
                  U
UUUUS	9n[3        UUR                  S   US
U R
                  R4                  U
R                   S9nU R                  " SUU
UUUUR                  S   US.U	D6u  nnUR7                  XR                  S   S5      n[$        R,                  " SUR                  S   UR                  S9nUR/                  S5      n[1        U R
                  US US S S	9nU R8                  " SUUUS.U	D6nU R)                  US S 2SS 24   U5      n[3        UUR                  S   USU R
                  R4                  U
R                   S9nU R:                  " SUUUUUUUUS.U	D6n[=        UUS9$ )Nr_  re  z0input_ids is required for entropy-based patching)r\  r]  rv   patching_batch_sizer(   r   r&   r   r*   r`  T)r]   r^   r_   r`   ra   r'   )r"  r#  r   r%  r&  r^   r]   r)   )rT  r   r%  F)r"  r#  r$  r   r%  r   r   r&  )last_hidden_stater   r   )rf   r1   r\   rU   rK   r   rL   rM   rN   patching_moder  r\  patching_thresholdrv   r  r(   r'   r   r-   rg  _patch_ids_from_lengthsrb  r0   rc   r	   rt   ra   r   rL  r   r
   )r   r"  ru   r   r%  r   r#  r   r   r   encoder_embedsrB   r_   r   r(   r'   r]   rh  ri  cross_attn_mask_encencoder_hidden_statesr1  global_cache_positionglobal_position_idsglobal_causal_maskglobal_hidden_statesdecoder_patch_idscross_attn_mask_decoutputs                                r7   r   BltModel.forward"  s    -t";<YZZ $*N-:-@-@*J*3//'J4""//@@8899N  {{((I5$,,:R$$%WXX&*ll#{{55"kk<<%)[[%A%A(,(G(G$++ '3 '#! .7-B))H\H\+4+@	mFYFY 5JJ
a,?@]KK00! 00P	!CRC^==?de"\\ "2^5I5I!5L"LUcUjUjN )33A6L(;;'))+%
 B%++A.+#11 &&
 7;6H6H 	7
(&%#6%++A.	7
 	7
33  488EXEXYZE[]_` %Q0D0J0J10MVjVqVq r3==a@/;;-0 
  $66  
--, 
 	 
 !88q!"u9M_A'%++A.+$11 &&
 ## 

/-&%+)#6

 

 '$+
 	
r9   c                 .    U R                   R                  $ ry   rU   rR   )r   s    r7   get_input_embeddingsBltModel.get_input_embeddings  s    !!...r9   c                 $    XR                   l        g ry   r  )r   values     r7   set_input_embeddingsBltModel.set_input_embeddings  s    */'r9   rC   c           	         UR                   S   n[        R                  " [        R                  " USUR                  UR
                  S9UR                  SS9S S 2S S24   /SS9n[        R                  " X!R
                  S9nUR                  S5      UR                  S5      R                  S5      :*  R                  SS9S-
  $ )Nr   r   r&   r)   r+   r*   )
r1   r-   r@   r?   r'   r(   cumsumr0   rc   r2   )r   ru   rC   rB   rw  token_positionss         r7   r   BltModel._patch_ids_from_lengths  s    "((+
yyJ1D1D]MaMab$$$,QV4 
  ,,w7K7KL&&q)_-F-Fq-I-S-STV-WW\\ac\dghhhr9   )r   rK   rL  r  r   rU   r  rH  )r   r   r   r   r   r   r   r   r-   r   r   r   r}  ri   r   r   r
   r   r  r  r   r  r   r   r   s   @r7   r  r    s4   y (  15041537+/59$(59~
E,,-~
  -~
 !.	~

 u//0~
 "%~
   1 12~
 D>~
 !!1!12~
 +,~
 
!~
 ~
@/0
iU\\ 
iC 
iTYT`T` 
i 
ir9   r  c                     ^  \ rS rSr% \\S'   SrSrS/rS\4U 4S jjr	            SS\
\R                     S\
\R                     S	\
\R                     S
\
\R                     S\
\R                     S\
\\R                  \R                  4      S\
\\\\R$                     4      S\
\R$                     S\
\R                     S\
\   S\
\R                     S\\\R                  4   S\\   S\\\4   4S jjrSrU =r$ )BltForCausalLMi  r   Fmodelzlm_head.weightc                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  R                  UR                  SS9U l	        U R                  5         g )NFrO  )r   r   r  r  r  r  r  r  r   rZ  r  )r   r   r   s     r7   r   BltForCausalLM.__init__  sZ      ++f%
yy!6!6!B!BFDUDU\abr9   r"  r   r%  r   rp   full_text_row_masked_out_maskr   r#  labelsr   r   logits_to_keepr   rO   c                    U R                   " SUUUUUUUU
US.	UD6nUR                  n[        U[        5      (       a  [	        U* S 5      OUnU R                  US S 2US S 24   5      R                  5       nS nU	b  U R                  " UXR                  40 UD6n[        UUUR                  UR                  UR                  S9$ )N)	r"  r   r%  rp   r  r   r#  r   r   )lossrF  r   r   r   r   )r  r  
isinstancer   slicerZ  r~  loss_functionr  r   r   r   r   )r   r"  r   r%  r   rp   r  r   r#  r  r   r   r  r   outputsr   slice_indicesrF  r  s                      r7   r   BltForCausalLM.forward  s    " ** 
)%!5*G+')
 
  118B>SV8W8W~ot4]kmA}a,?@AGGI%%ffooPPD%#33!//))
 	
r9   )rZ  r  r  )NNNNNNNNNNNr   )r   r   r   r   r   r  _can_compile_fullgraphbase_model_prefix_tied_weights_keysr   r   r-   r   r   tupler   r   listr}  ri   r   r   r   r   r   r   r   r   s   @r7   r  r    s   "*+y  151537=A;?UYKO59-1$(5934,
E,,-,
 !.,
 u//0	,

 !))9)9 :,
 'u'7'78,
 (0ellELL6P0Q'R,
 "%tE4E4E/F(F"GH,
   1 12,
 ))*,
 D>,
 !!1!12,
 c5<</0,
 +,,
 
u,,	-,
 ,
r9   r  )r   r  rX  r  )rQ   )r   rQ   i0u  )Or   typingr   r   r   r-   torch.distributionstorch.nnr  torch.nn.functional
functionalr,  cache_utilsr   r   masking_utilsr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   r   cohere2.modeling_cohere2r   r   mllama.modeling_mllamar   r   r   r   r   r   r   r   configuration_bltr   r    r!   r"   r#   
get_loggerr   loggerr   r8   r   rI   r  r  r\   float32ri   r'   r  rt   r   r   r   r   r   r   r   r   r
  r?  rJ  rX  r  r  __all__r   r9   r7   <module>r     se   C , ,      . / O 5 & @ @ ?	 	 	  
		H	%: :< \a||),9<UX$#,,# !## +.	#
 #'# $'# \\#T  %K ||K K  K  	K 
 K  ;;K  5<<%&K \) )RU )[`[g[g )X	] 		" 		/ 	`9 `
. 
4<)0 <)~ 7. 7 7*p"( p"fL( L^2- 2jF# FRfi! fiR9
& 9
xr9   