
    cCi                        S SK JrJr  S SKrS SKJr  S SKJs  Jr  SSK	J
r
  SSKJr  SSKJr  SSKJr  SS	KJrJrJrJrJrJrJrJr  S
SKJr  \R8                  " \5      r " S S\R>                  5      r  " S S\5      r!SS jr" " S S\5      r# " S S\5      r$ " S S\5      r% " S S\5      r& " S S\5      r'/ SQr(g)    )CallableOptionalN   )Cache)ALL_ATTENTION_FUNCTIONS)logging)deprecate_kwarg   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaMLP
LlamaModelLlamaRotaryEmbeddingeager_attention_forwardrotate_half   )
OlmoConfigc                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
OlmoLayerNorm   z/LayerNorm but with no learnable weight or bias.hidden_sizereturnNc                 2   > [         TU ]  5         U4U l        g N)super__init__normalized_shape)selfr   	__class__s     _/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/olmo/modular_olmo.pyr   OlmoLayerNorm.__init__   s    !,    hidden_statesc                     UR                   n[        R                  " UR                  [        R
                  S9U R                  S S SS9R                  U5      $ )N)dtypegh㈵>)eps)r&   F
layer_normtotorchfloat32r   )r   r$   
orig_dtypes      r!   forwardOlmoLayerNorm.forward"   sO    "((
||M,,5==,A4CXCXZ^`djnorr
 	
r#   )r   )__name__
__module____qualname____firstlineno____doc__intr   r+   Tensorr.   __static_attributes____classcell__r    s   @r!   r   r      s9    9/C /D /
U\\ 
ell 
 
r#   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )OlmoMLP)   c                 >  > [         TU ]  U5        [        R                  " U R                  U R
                  SS9U l        [        R                  " U R                  U R
                  SS9U l        [        R                  " U R
                  U R                  SS9U l        g )NF)bias)	r   r   nnLinearr   intermediate_size	gate_projup_proj	down_proj)r   configr    s     r!   r   OlmoMLP.__init__*   ss     4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXr#   )rD   rB   rC   )r0   r1   r2   r3   r   r7   r8   r9   s   @r!   r;   r;   )   s    Y Yr#   r;   c                    U R                   UR                   pvUR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   n	UR                  U5      U	R                  U5      4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)r&   	unsqueezer   r*   )
qkcossinposition_idsunsqueeze_dimq_typek_typeq_embedk_embeds
             r!   apply_rotary_pos_embrS   1   sv    ( WWaggF
--
&C
--
&Cw;q>C/0Gw;q>C/0G::fwzz&111r#   c                      \ rS rSr\" SSSS9  SS\R                  S\\R                  \R                  4   S	\\R                     S\\	   S
\\R                     S\\R                  \\R                     4   4S jj5       rSrg)OlmoAttentionM   past_key_valuepast_key_valuesz4.58)new_nameversionNr$   position_embeddingsattention_maskcache_positionr   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      n	U R                  U5      n
U R	                  U5      nU R
                  R                  b  U	R                  U R
                  R                  * U R
                  R                  S9  U
R                  U R
                  R                  * U R
                  R                  S9  UR                  U R
                  R                  * U R
                  R                  S9  U	R                  U5      R                  SS5      n	U
R                  U5      R                  SS5      n
UR                  U5      R                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R
                  R                  S:w  a  [        U R
                  R                     nU" U U	U
UU4U R                   (       d  SOU R"                  U R$                  S.UD6u  nnUR&                  " / UQSP76 R)                  5       nU R+                  U5      nUU4$ )	N)minmaxr   r
   )rL   rK   r]   eagerg        )dropoutscaling)shapehead_dimq_projk_projv_projrE   clip_qkvclamp_view	transposerS   update	layer_idxr   _attn_implementationr   trainingattention_dropoutrd   reshape
contiguouso_proj)r   r$   r[   r\   rX   r]   kwargsinput_shapehidden_shapequery_states
key_statesvalue_statesrK   rL   cache_kwargsattention_interfaceattn_outputattn_weightss                     r!   r.   OlmoAttention.forwardN   s1    $))#2.88b8$--8{{=1[[/
{{=1;;+T[[%9%9$9t{{?S?ST4;;#7#7"7T[[=Q=QRT[[%9%9$9t{{?S?ST#((6@@AF__\2<<QB
#((6@@AF&#7RU#[ &#&nUL'6'='=jX\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r#    )NN)r0   r1   r2   r3   r	   r+   r6   tupler   r   
LongTensorr.   r7   r   r#   r!   rU   rU   M   s    %0A6R ,0592)||2) #5<<#=>2) !.	2)
 "%2) !!1!122) 
u||Xell33	42) S2)r#   rU   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )OlmoDecoderLayer   rE   ro   c                    > [         TU ]  X5        [        UR                  5      U l        [        UR                  5      U l        [        XS9U l        g )N)rE   ro   )r   r   r   r   input_layernormpost_attention_layernormrU   	self_attnr   rE   ro   r    s      r!   r   OlmoDecoderLayer.__init__   sB    +,V-?-?@(5f6H6H(I%&fJr#   )r   r   r   )	r0   r1   r2   r3   r   r5   r   r7   r8   r9   s   @r!   r   r      s    Kz Kc K Kr#   r   c                       \ rS rSrS rSrg)OlmoRotaryEmbedding   c                    U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	X4sS S S 5        $ ! , (       d  f       g = f)
Nr   r_   r   mpscpuF)device_typeenabledr
   )dim)inv_freqfloatexpandre   r*   device
isinstancetypestrr+   autocastrm   catrK   attention_scalingrL   )
r   xrM   inv_freq_expandedposition_ids_expandedr   freqsembrK   rL   s
             r!   r.   OlmoRotaryEmbedding.forward   s'    MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C8 DCCs   $BE22
F r   N)r0   r1   r2   r3   r.   r7   r   r#   r!   r   r      s    
r#   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )	OlmoModel   rE   c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  5      U l
        g s  snf r   )r   r   r?   
ModuleListrangenum_hidden_layersr   layersr   r   normr   s      r!   r   OlmoModel.__init__   s_     mmBGH`H`BabBaYf0Bab
 "&"4"45	 cs   A4)r   r   )r0   r1   r2   r3   r   r   r7   r8   r9   s   @r!   r   r      s    6z 6 6r#   r   c                       \ rS rSrSrg)OlmoForCausalLM   r   N)r0   r1   r2   r3   r7   r   r#   r!   r   r      s    r#   r   )r   r   OlmoPreTrainedModel)Nr   ))typingr   r   r+   torch.nnr?   torch.nn.functional
functionalr(   cache_utilsr   modeling_utilsr   utilsr   utils.deprecationr	   llama.modeling_llamar   r   r   r   r   r   r   r   configuration_olmor   
get_loggerr0   loggerModuler   r;   rS   rU   r   r   r   r   __all__r   r#   r!   <module>r      s    %       5  0	 	 	 + 
		H	%
BII 
Yh Y284)N 4)nK( K. 6
 6	& 	r#   