
    cCi7                        S r SSKJrJr  SSKrSSKJr  SSKJr  SSK	J
r
  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJr  SSKJrJrJrJrJrJrJrJ r J!r!J"r"  SSK#J$r$  \RJ                  " \&5      r'Sr( " S S\ 5      r) " S S\5      r* " S S\5      r+ " S S\5      r, " S S\5      r- " S S\5      r. " S S \5      r/ " S! S"\5      r0 " S# S$\5      r1 " S% S&\5      r2/ S'Qr3g)(zPyTorch Qwen3 model.    )CallableOptionalN   )Cache)FlashAttentionKwargs)CausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)deprecate_kwarg   )GemmaMLP)LlamaAttention)
Qwen2DecoderLayerQwen2ForCausalLMQwen2ForQuestionAnsweringQwen2ForSequenceClassificationQwen2ForTokenClassification
Qwen2ModelQwen2PreTrainedModelQwen2RMSNormapply_rotary_pos_embeager_attention_forward   )Qwen3ConfigzQwen/Qwen3-8Bc                       \ rS rSrSrg)Qwen3RMSNorm4    N__name__
__module____qualname____firstlineno____static_attributes__r        a/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/qwen3/modular_qwen3.pyr   r   4       r'   r   c                       \ rS rSrSrg)Qwen3MLP8   r    Nr!   r    r'   r(   r+   r+   8   r)   r'   r+   c                   6  ^  \ rS rSrS\S\4U 4S jjr\" SSSS9  SS	\R                  S
\
\R                  \R                  4   S\\R                     S\\   S\\R                     S\\   S\
\R                  \\R                     4   4S jj5       rSrU =r$ )Qwen3Attention<   config	layer_idxc                   > [         TU ]  X5        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        UR                  U   S:X  a  UR                  U l        g S U l        g )N)epssliding_attention)	super__init__r   head_dimrms_norm_epsq_normk_normlayer_typessliding_window)selfr0   r1   	__class__s      r(   r6   Qwen3Attention.__init__=   si    +"4==f6I6IJ"4==f6I6IJ7=7I7I)7TXk7kf33qur'   past_key_valuepast_key_valuesz4.58)new_nameversionhidden_statesposition_embeddingsattention_maskcache_positionkwargsreturnc                    UR                   S S n/ UQSPU R                  P7nU R                  U R                  U5      R	                  U5      5      R                  SS5      n	U R                  U R                  U5      R	                  U5      5      R                  SS5      n
U R                  U5      R	                  U5      R                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                   (       d  SOU R"                  U R$                  U R&                  S.UD6u  nnUR(                  " / UQSP76 R+                  5       nU R-                  U5      nUU4$ )Nr   r   )sincosrG   eagerg        )dropoutscalingr<   )shaper7   r9   q_projview	transposer:   k_projv_projr   updater1   r   r0   _attn_implementationr	   trainingattention_dropoutrP   r<   reshape
contiguouso_proj)r=   rD   rE   rF   rA   rG   rH   input_shapehidden_shapequery_states
key_statesvalue_statesrM   rL   cache_kwargsattention_interfaceattn_outputattn_weightss                     r(   forwardQwen3Attention.forwardC   s    $))#2.88b8$--8{{4;;}#=#B#B<#PQ[[\]_`a[[]!;!@!@!NOYYZ[]^_
{{=166|DNNqRST&#7RU#[ &#&nUL'6'='=jX\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL..
%
 
%
!\ "));;;;FFHkk+.L((r'   )r:   r9   r<   )NN)r"   r#   r$   r%   r   intr6   r   torchTensortupler   r   
LongTensorr
   r   rg   r&   __classcell__r>   s   @r(   r.   r.   <   s    v{ vs v %0A6R ,059*)||*) #5<<#=>*) !.	*)
 "%*) !!1!12*) -.*) 
u||Xell33	4*) S*)r'   r.   c                       \ rS rSrSrg)Qwen3DecoderLayerq   r    Nr!   r    r'   r(   rq   rq   q   r)   r'   rq   c                       \ rS rSrSrg)Qwen3PreTrainedModelu   r    Nr!   r    r'   r(   rt   rt   u   r)   r'   rt   c                       \ rS rSrSrg)
Qwen3Modely   r    Nr!   r    r'   r(   rw   rw   y   r)   r'   rw   c                   :   ^  \ rS rSrS\\   S\4U 4S jjrSrU =r	$ )Qwen3ForCausalLM}   super_kwargsrI   c                 $   > [         TU ]  " S0 UD6$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, Qwen3ForCausalLM

>>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```r    )r5   rg   )r=   r|   r>   s     r(   rg   Qwen3ForCausalLM.forward~   s    4 w...r'   r    )
r"   r#   r$   r%   r
   r   r   rg   r&   rn   ro   s   @r(   rz   rz   }   s%    /12/ 
 / /r'   rz   c                       \ rS rSrSrg)Qwen3ForSequenceClassification   r    Nr!   r    r'   r(   r   r      r)   r'   r   c                       \ rS rSrSrg)Qwen3ForTokenClassification   r    Nr!   r    r'   r(   r   r      r)   r'   r   c                       \ rS rSrSrg)Qwen3ForQuestionAnswering   r    Nr!   r    r'   r(   r   r      r)   r'   r   )rz   r   rt   rw   r   r   )4__doc__typingr   r   rj   cache_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr	   processing_utilsr
   utilsr   r   utils.deprecationr   gemma.modeling_gemmar   llama.modeling_llamar   qwen2.modeling_qwen2r   r   r   r   r   r   r   r   r   r   configuration_qwen3r   
get_loggerr"   logger_CHECKPOINT_FOR_DOCr   r+   r.   rq   rt   rw   rz   r   r   r   __all__r    r'   r(   <module>r      s     %    B 6 5 & 0 0 +   - 
		H	%% 	< 		x 	2)^ 2)j	) 		/ 		 	/' /<	%C 		"= 		 9 	r'   