
"""Declares specification of the Transformer model."""

from typing import Optional, Tuple, Union

import numpy as np

from ctranslate2.specs import attention_spec, common_spec, model_spec


class TransformerEncoderSpec(model_spec.LayerSpec):
    def __init__(
        self,
        num_layers: int,
        num_heads: int,
        pre_norm: bool = True,
        no_final_norm: bool = False,
        activation: common_spec.Activation = common_spec.Activation.RELU,
        num_source_embeddings: int = 1,
        embeddings_merge: common_spec.EmbeddingsMerge = common_spec.EmbeddingsMerge.CONCAT,
        layernorm_embedding: bool = False,
        relative_position: bool = False,
        relative_attention_bias: bool = False,
        ffn_glu: bool = False,
        rms_norm: bool = False,
        multi_query_attention: bool = False,
        num_heads_kv: Optional[int] = None,
        head_dim: Optional[int] = None,
        rotary_dim: Optional[int] = None,
        rotary_interleave: bool = True,
        rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
        rotary_scaling_factor: float = 1,
        rotary_base: float = 10000,
        sliding_window: Optional[int] = None,
        qk_norm: bool = False,
        pre_post_layer_norm: bool = False,
    ):
        """Initializes a Transformer encoder specification.

Args:
  num_layers: Number of layers.
  num_heads: Number of attention heads.
  pre_norm: Enable the pre-norm Transformer architecture.
  no_final_norm: Disable the final layer norm in the pre-norm architecture.
  activation: Activation to apply in the feed-forward network.
  num_source_embeddings: Number of source embeddings.
  embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
    embeddings are merged.
  layernorm_embedding: Apply layer normalization after the embedding layer.
  relative_position: Use relative position representations in the self-attention
    layers as described in https://arxiv.org/abs/1803.02155.
  relative_attention_bias: Use relative attention bias in the self-attention
    layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
  ffn_glu: Use gated linear units in the FFN layers as described in
    https://arxiv.org/abs/2002.05202.
  rms_norm: Use the root mean square layer normalization.
  multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
  num_heads_kv: Number of attention heads for the key and value.
  head_dim: Number of dimensions per attention head.
  rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
    embeddings are applied to all dimensions.
  rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
    Otherwise the head dimensions are sliced in half.
  rotary_scaling_type: Type of RoPE scaling.
  rotary_scaling_factor: Factor used in the RoPE scaling.
  rotary_base: The base period of the rotary embeddings.
  sliding_window: Max sequence length to retain in KV Cache.
  qk_norm: Apply layer normalization to the query and key projections.
  pre_post_layer_norm: Add post layer norm for each pre norm layer.
        """
        if multi_query_attention:
            if num_heads_kv is not None and num_heads_kv != 1:
                raise ValueError(
                    "Enabling multi_query_attention implies num_heads_kv=1"
                )
            num_heads_kv = 1

        self.num_heads = np.dtype("int16").type(num_heads)
        self.pre_norm = pre_norm
        self.activation = np.dtype("int8").type(activation)
        self.embeddings_merge = np.dtype("int8").type(embeddings_merge)
        self.embeddings = [
            common_spec.EmbeddingsSpec() for _ in range(num_source_embeddings)
        ]
        self.scale_embeddings = True
        if not relative_position and not relative_attention_bias:
            self.position_encodings = PositionEncoderSpec()
        if pre_norm and not no_final_norm:
            self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
        if layernorm_embedding:
            self.layernorm_embedding = common_spec.LayerNormSpec(rms_norm=rms_norm)
        if sliding_window is not None:
            self.sliding_window = np.dtype("int32").type(sliding_window)
        self.layer = [
            TransformerEncoderLayerSpec(
                relative_position=relative_position,
                relative_attention_bias=relative_attention_bias,
                ffn_glu=ffn_glu,
                rms_norm=rms_norm,
                num_heads_kv=num_heads_kv,
                head_dim=head_dim,
                rotary_dim=rotary_dim,
                rotary_interleave=rotary_interleave,
                rotary_scaling_type=rotary_scaling_type,
                rotary_scaling_factor=rotary_scaling_factor,
                rotary_base=rotary_base,
                sliding_window=sliding_window,
                qk_norm=qk_norm,
                pre_post_layer_norm=pre_post_layer_norm,
            )
            for _ in range(num_layers)
        ]


class TransformerDecoderSpec(model_spec.LayerSpec):
    def __init__(
        self,
        num_layers: int,
        num_heads: int,
        pre_norm: bool = True,
        activation: common_spec.Activation = common_spec.Activation.RELU,
        layernorm_embedding: bool = False,
        with_encoder_attention: bool = True,
        no_final_norm: bool = False,
        project_in_out: bool = False,
        relative_position: bool = False,
        relative_attention_bias: bool = False,
        alignment_layer: int = -1,
        alignment_heads: int = 1,
        ffn_glu: bool = False,
        rms_norm: bool = False,
        alibi: bool = False,
        alibi_use_positive_positions: bool = False,
        scale_alibi: bool = False,
        rotary_dim: Optional[int] = None,
        rotary_interleave: bool = True,
        rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
        rotary_scaling_factor: float = 1,
        rotary_base: float = 10000,
        original_max_position_embeddings: Optional[int] = None,
        max_position_embeddings: Optional[int] = None,
        parallel_residual: bool = False,
        shared_layer_norm: bool = False,
        pre_post_layer_norm: bool = False,
        multi_query_attention: bool = False,
        num_heads_kv: Optional[int] = None,
        head_dim: Optional[int] = None,
        sliding_window: Optional[int] = None,
        quant_type: Optional[common_spec.Quantization] = None,
        quant_group_size: Optional[int] = None,
        quant_bits: Optional[int] = None,
        qk_norm: bool = False,
        external_pre_post_encoder_layers: Optional[bool] = False,
    ):
        """Initializes a Transformer decoder specification.

Args:
  num_layers: Number of layers.
  num_heads: Number of attention heads.
  pre_norm: Enable the pre-norm Transformer architecture.
  activation: Activation to apply in the feed-forward network.
  layernorm_embedding: Apply layer normalization after the embedding layer.
  with_encoder_attention: Enable the encoder attention sublayers.
  no_final_norm: Disable the final layer norm in the pre-norm architecture.
  project_in_out: Add linear transformations after the embedding layer and before
    the final layer.
  relative_position: Use relative position representations in the self-attention
    layers as described in https://arxiv.org/abs/1803.02155.
  relative_attention_bias: Use relative attention bias in the self-attention
    layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
  alignment_layer: Layer index selected for alignment.
  alignment_heads: Number of attention heads selected for alignment.
  ffn_glu: Use gated linear units in the FFN layers as described in
    https://arxiv.org/abs/2002.05202.
  rms_norm: Use the root mean square layer normalization.
  alibi: Use attention with linear biases.
  alibi_use_positive_positions: Use positive positions in the ALiBi definition.
  scale_alibi: Apply the dot product scale factor to ALiBi.
  rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
    embeddings are applied to all dimensions.
  rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
    Otherwise the head dimensions are sliced in half.
  rotary_scaling_type: Type of RoPE scaling.
  rotary_scaling_factor: Factor used in the RoPE scaling.
  rotary_base: The base period of the rotary embeddings.
  original_max_position_embeddings: The original max position embeddings
    for Su rope embeddings
  max_position_embeddings: The max position embeddings for Su rope embeddings
  parallel_residual: Use parallel residual connections in each layer block, as used
    by the GPT-J and GPT-NeoX models.
  shared_layer_norm: When using parallel residual, share the input and post
    attention layer norms.
  pre_post_layer_norm: Add post layer norm for each pre norm layer
  multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
  num_heads_kv: Number of attention heads for the key and value.
  sliding_window: Max sequence length to retain in KV Cache.
  quant_type: quantization type used (like awq... for lower bit quantization)
  quant_group_size: group size of the lower bit quantization
  quant_bits: number of bit of the quantization (ex: 4bit)
  external_pre_post_encoder_layers: if the encoder attention pre and processing
    is done outside the attention.
z/The GPT-J block expects a pre-norm architecturez-The GPT-J block does not have cross attentionNr   r&   r'   r(   Tr*   r)   rQ   r   r   r   r   r   r   r   r    r!   rX   rY   rZ   r[   r$   r   r   r"   r#   r_   Fr   quantization_typequantization_bitsquantization_group_sizerK   )#dict_configr+   r,   r-   r.   r   r   r   rS   rT   r   r0   r1   r2   r   OPTIONALscale_outputsrU   rV   rW   r"   r3   r4   r5   r6   r   
LinearSpec
projectionr/   TransformerDecoderLayerSpecr8   start_from_zero_embedding
project_inproject_out)&r9   r   r   r   r   r   rQ   r   rR   r   r   rS   rT   r   r   rU   rV   rW   r   r   r   r    r!   rX   rY   rZ   r[   r$   r   r   r   r"   r\   r]   r^   r#   r_   r:   s&                                         r;   r<   TransformerDecoderSpec.__init__u   s   p v !RSS% !PQQ 'LA,= K  L'*//	: ((6*//
;!xx055oF!xx055oF%446 $'00
,H)&%"$((7"3"8"8"HD!+"&9&;D#M)77JDO'2'@'@('SD$%0020 :&/
. '- ( '="3 )@  	
 " & #4 %8 '< ( 2R )@ #4 #4 %8  *!" "#$  .%&  '( 2R), '/

2 */&0E 1
I% 	,- )446DO*557D0:DLL,-0:DLL,-6FDLL23 E
s   AK<c                     U R                   $ N)re   r9   s    r;   configTransformerDecoderSpec.config  s    ||rL   )re   r   rU   rV   rT   rS   r1   r8   r6   r   r   r4   r   rl   rm   ri   rW   r2   rg   r"   rk   )r>   r?   r@   rA   r   rB   rC   rF   rG   r   r   rH   rI   Quantizationr<   propertyrr   rJ   rK   rL   r;   rN   rN   t   s   
 -8-C-C-H-H$)'+#$"'(-! -2!$("&JN'("01'("'"'$)&+&*"&(,9=*.$(;@KdGdG dG 	dG
  **dG "dG !%dG dG dG  dG "&dG dG dG dG dG  !dG" '+#dG$ %dG& SM'dG(  )dG* &n&F&FG+dG,  %-dG. /dG0 +.1dG2 "%3dG4  5dG6  7dG8 "9dG:  $;dG< sm=dG> 3-?dG@ !AdGB [556CdGD #3-EdGF SMGdGH IdGJ +34.KdGL  rL   rN   c                   v    \ rS rSr              SS\\   S\S\\R                     S\	S\	S\4S	 jjr
S
rg)r7   i   Nr   r   r   r    r!   r$   c                    [         R                  " SUUUUUUUU	U
UUUS9U l        [        X4S9U l        U(       a  [
        R                  " US9U l        [
        R                  " US9U l        [
        R                  " US9U l	        [
        R                  " US9U l
        [        U R                  S5        [        U R                  S5        g g )NT)self_attentionr   r   r   r   r   r"   r   r   r   r    r!   r#   glur   r)   r6   )r   MultiHeadAttentionSpecrx   FeedForwardSpecffnr   r5   input_layer_normpost_attention_layer_normpre_feedforward_layer_normpost_feedforward_layer_normdelattr)r9   r   r   r   r   r   r   r"   r   r   r   r    r!   r#   r$   s                  r;   r<   $TransformerEncoderLayerSpec.__init__!  s    " -CC/$;%)!/ 3"7#
 #wB$/$=$=x$PD!-8-F-F!.D* /:.G.G!/D+ 0;/H/H!0D, D''6DHHl+ rL   )r}   r~   r   r   r   rx   )FFFFNNNNTNr   r   FF)r>   r?   r@   rA   r   rF   rG   r   rH   rI   r<   rJ   rK   rL   r;   r7   r7      s       %$("&JN'("$)/, SM/,  /, &n&F&FG/,  %/, /, "/, /,rL   r7   c                   F    \ rS rSr                    SS jrSrg)rj   iS  Nc                 ~   [         R                  " SUUUUUUU	U
UUUUUUS9U l        U(       a   [         R                  " UUUUUUSL S9U l        [	        XES9U l        U(       a  U(       a  [        R                  " 5       U l        O4[        R                  " 5       U l	        [        R                  " 5       U l
        [        U R                  S5        [        U R
                  S5        U(       a  [        R                  " US9U l	        [        R                  " US9U l
        U(       a9  U(       a2  [        R                  " US9U l        [        R                  " US9U l        [        R                  " US9U l        [        R                  " US9U l        [        U R                  S5        [        U R
                  S5        g g )NT)rx   r   r   r   r   r   r   r    r!   rX   rY   r   r   r"   r#   F)r   r   r   r"   r#   has_normry   r6   r)   )r   r{   rx   	attentionr|   r}   r   r5   r[   r~   r   r   *external_post_encoder_attention_layer_norm)external_pre_encoder_attention_layer_normr   r   )r9   rQ   r   r   r   r   r   r   r   r    r!   rX   rY   rZ   r[   r$   r   r   r"   r#   r_   s                        r;   r<   $TransformerDecoderLayerSpec.__init__T  s   . -CC/$;!/ 3"7#-M$;%)
$ "+BB!)!-9UBDN #wB )4)B)B)D&(3(A(A(C%1<1J1J1L.D''6DHHl+$/$=$=x$PD!-8-F-F!.D* &*J--x@ ?  --x@ >
 /:.G.G!/D+ 0;/H/H!0D, D''6DHHl+1 rL   )
r   r   r   r}   r~   r   r   r   rx   r[   )TFFFFNTNr   r   r   r   FFFNNNFFr>   r?   r@   rA   r<   rJ   rK   rL   r;   rj   rj   S  sF      $ % )* !!).+W,rL   rj   c                       \ rS rSrSS jrSrg)r|   i  c                     [         R                  " US9U l        [         R                  " 5       U l        [         R                  " 5       U l        U(       a  [         R                  " 5       U l        g g )Nr)   )r   r5   r6   rh   linear_0linear_1linear_0_noact)r9   rz   r   s      r;   r<   FeedForwardSpec.__init__  sM    %33XF#..0#..0"-"8"8":D rL   )r6   r   r   r   N)FFr   rK   rL   r;   r|   r|     s    ;rL   r|   c                       \ rS rSrS rSrg)r3   i  c                 .    [         R                  U l        g rp   )r   rf   	encodingsrq   s    r;   r<   PositionEncoderSpec.__init__  s    #,,rL   )r   Nr   rK   rL   r;   r3   r3     s    -rL   r3   c                   >   ^  \ rS rSrSrSS\\   4U 4S jjjrSrU =r	$ )TransformerConfigi  z%Configuration for Transformer models.layer_norm_epsilonc                 *   > [         TU ]  " SSU0UD6  g)zInitializes the configuration for Transformer models.

Args:
  layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        """
        super().__init__(layer_norm_epsilon=layer_norm_epsilon, **kwargs)


class TransformerSpec(model_spec.SequenceToSequenceModelSpec):
    """Describes a Transformer model.

    The specification is invariant to hidden dimensions but requires to
    explicitly set the number of layers and attention heads.
    """

    def __init__(
        self, encoder: TransformerEncoderSpec, decoder: TransformerDecoderSpec
    ):
        """Initializes a Transformer model specification.

        Args:
          encoder: The encoder specification.
          decoder: The decoder specification.
        """
        if not isinstance(encoder, TransformerEncoderSpec):
            raise TypeError("encoder argument must be a TransformerEncoderSpec")
        if not isinstance(decoder, TransformerDecoderSpec):
            raise TypeError("decoder argument must be a TransformerDecoderSpec")

        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self._config.add_attribute(
            "multi_query_attention", self.decoder.config["multi_query_attention"]
        )

    @classmethod
    def from_config(
        cls,
        num_layers: Union[int, Tuple[int, int]],
        num_heads: int,
        with_relative_position: bool = False,
        pre_norm: bool = True,
        no_final_norm: bool = False,
        activation: common_spec.Activation = common_spec.Activation.RELU,
        alignment_layer: int = -1,
        alignment_heads: int = 1,
        num_source_embeddings: int = 1,
        embeddings_merge: common_spec.EmbeddingsMerge = common_spec.EmbeddingsMerge.CONCAT,
        layernorm_embedding: bool = False,
        relative_attention_bias: bool = False,
        ffn_glu: bool = False,
        rms_norm: bool = False,
        multi_query_attention: bool = False,
    ):
        """Creates a Transformer model specification.
Args:
  num_layers: Number of encoder and decoder layers, or a 2-tuple if the
    number is different.
  num_heads: Number of attention heads.
  with_relative_position: Use relative position representations in the self-attention
    layers as described in https://arxiv.org/abs/1803.02155.
  pre_norm: Enable the pre-norm Transformer architecture.
  no_final_norm: Disable the final layer norm in the pre-norm architecture.
  activation: Activation to apply in the feed-forward network.
  alignment_layer: Layer index selected for alignment.
  alignment_heads: Number of attention heads selected for alignment.
  num_source_embeddings: Number of source embeddings.
  embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
    embeddings are merged.
  layernorm_embedding: Apply layer normalization after the embedding layer.
  relative_attention_bias: Use relative attention bias in the self-attention
    layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
  ffn_glu: Use gated linear units in the FFN layer as described in
    https://arxiv.org/abs/2002.05202.
  rms_norm: Use the root mean square layer normalization.
  multi_query_attention: Use multi-query attention.
)r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   rS   rT   r   r   r   )r   listtupler
   rN   )clsr   r   r   r   r   r   rS   rT   r   r   r   r   r   r   r   num_encoder_layersnum_decoder_layersr   r   s                       r;   from_configTransformerSpec.from_config  s    V j4-005?2 25? 2('!"7- 34$;"7
  )'! 34$;++"7
  7G$$rL   c                     g)Nr   rK   rq   s    r;   nameTransformerSpec.name7  s     rL   c                     g)N   rK   rq   s    r;   revisionTransformerSpec.revision;      rL   c                     [        5       $ rp   )r   rq   s    r;   get_default_config"TransformerSpec.get_default_config?  s     ""rL   c                     U R                   R                   Vs/ s H  oR                  R                  S   PM     sn$ s  snf Nr   r   r1   weightshape)r9   specs     r;   get_source_vocabulary_size*TransformerSpec.get_source_vocabulary_sizeB  s3    151H1HI1H!!!$1HIIIs   ">c                 \    U R                   R                  R                  R                  S   $ r   r   r1   r   r   rq   s    r;   get_target_vocabulary_size*TransformerSpec.get_target_vocabulary_sizeE  #    ||&&--33A66rL   )r   r   )r>   r?   r@   rA   r   r
   rN   r<   classmethodr   rB   rC   rD   rE   r   rF   r   rG   r   ru   r   r   r   r   r   rJ   r   r   s   @r;   r   r     ss   
-
8N
* 
 (-#-8-C-C-H-H! %&8C8S8S8Z8Z$)(-&+!O%#uS#X./O% O% !%	O%
 O% O%  **O% O% O%  #O% &55O% "O% "&O% O% O%   $!O% O%b ! !  #J7 7rL   r   c                   >   ^  \ rS rSrSrSS\\   4U 4S jjjrSrU =r	$ )TransformerDecoderModelConfigiI  z-Configuration for Transformer decoder models.r   c                 *   > [         TU ]  " SSU0UD6  g)zInitializes the configuration for Transformer decoder models.

Args:
  layer_norm_epsilon: The layer norm epsilon value.
  **kwargs: Additional configuration.
r   NrK   r   r   s      r;   r<   &TransformerDecoderModelConfig.__init__L  r   rL   rK   rp   r   r   s   @r;   r   r   I      7J8E? J JrL   r   c            @         ^  \ rS rSrSrS\4U 4S jjr\S\R                  R                  SSSSSSSSSSSSSS	S
S
SSSSSSSSSSS4S\S\S\S\R                  S\S\S\S\S\S\S\S\S\S\\   S\S\\R                     S\S\S\S\S\S \S!\S"\S#\\   S$\\   S%\\   S&\\R"                     S'\\   S(\\   S)\4>S* jj5       r\S+ 5       r\S, 5       rS- rS. rS/rU =r$ )0TransformerDecoderModelSpeciV  z3Describes a Transformer decoder model (e.g. GPT-2).r   c                    > [        U[        5      (       d  [        S5      e[        TU ]  5         Xl        U R
                  R                  R                  5        H   u  p#U R                  R                  X#5        M"     g)zdInitializes a Transformer decoder model specification.

Args:
  decoder: The decoder specification.
r   N)
r   rN   r   r   r<   r   rr   itemsre   r   )r9   r   keyvaluer   s       r;   r<   $TransformerDecoderModelSpec.__init__Y  s`     '#9::OPP,,--335JCLL&&s2 6rL   TFNr   r   r   r   r   r   r   r   r   rR   r   r   r   rU   rV   rW   r   r   r   r    r!   rX   rY   rZ   r[   r$   r   r   r   r"   r\   r]   r^   r#   c                      [        UU40 SU_SU_SU_SS_SU_SU_SU_S	U	_S
U
_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_6n U " U 5      $ ) a  Creates a Transformer decoder model specification.

Args:
  num_layers: Number of decoder layers.
  num_heads: Number of attention heads.
  pre_norm: Enable the pre-norm Transformer architecture.
  activation: Activation to apply in the feed-forward network.
  layernorm_embedding: Apply layer normalization after the embedding layer.
  no_final_norm: Do not apply layer normalization after the last decoder block.
  project_in_out: Add a linear layer after the embedding layer and another one
    before the final output projection.
  with_relative_position: Enable relative position representations modules.
  ffn_glu: Use gated linear units in the FFN layers as described in
    https://arxiv.org/abs/2002.05202.
  rms_norm: Use the root mean square layer normalization.
  alibi: Use attention with linear biases.
  alibi_use_positive_positions: Use positive positions in the ALiBi definition.
  scale_alibi: Apply the dot product scale factor to ALiBi.
  rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
    embeddings are applied to all dimensions.
  rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
    Otherwise the head dimensions are sliced in half.
  rotary_scaling_type: Type of RoPE scaling.
  rotary_scaling_factor: Factor used in the RoPE scaling.
  rotary_base: The base period of the rotary embeddings.
  original_max_position_embeddings: The original max position embeddings
    for Su rope embeddings
  max_position_embeddings: The max position embeddings for Su rope embeddings
  parallel_residual: Use parallel residual connections in each layer block, as used
    by the GPT-J and GPT-NeoX models.
  shared_layer_norm: When using parallel residual, share the input and post
    attention layer norms.
  pre_post_layer_norm: add post layer norm for each pre norm layer
  multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
  num_heads_kv: Number of attention heads for the key and value.
  head_dim: Number of head
  sliding_window: max sequence length to retain KV cache
  quant_type: quantization type used (like awq... for lower bit quantization)
  quant_group_size: group size of the lower bit quantization
  quant_bits: number of bit of the quantization (ex: 4bit)
r   r   r   rQ   Fr   rR   r   r   r   rU   rV   rW   r   r   r   r    r!   rX   rY   rZ   r[   r$   r   r   r   r"   r\   r]   r^   r#   )rN   )!r   r   r   r   r   r   r   rR   r   r   r   rU   rV   rW   r   r   r   r    r!   rX   rY   rZ   r[   r$   r   r   r   r"   r\   r]   r^   r#   r   s!                                    r;   r   'TransformerDecoderModelSpec.from_configg  sR   X )!
 !
 "	!

 !4!
 $)!
 (!
 *!
 5!
 !
 !
 !
 *F!
 $!
 "!
  0!!
" !4#!
$ #8%!
& $'!
( .N)!
* %<+!
, 0-!
. 0/!
0 !41!
2 #83!
4 &5!
6 7!
8 *9!
: ";!
< .=!
> "?!
@ A!
F 7|rL   c                     g)NrN   rK   rq   s    r;   r    TransformerDecoderModelSpec.name      'rL   c                     g)N   rK   rq   s    r;   r   $TransformerDecoderModelSpec.revision  r   rL   c                     [        5       $ rp   )r   rq   s    r;   r   .TransformerDecoderModelSpec.get_default_config      ,..rL   c                 \    U R                   R                  R                  R                  S   $ r   r   rq   s    r;   get_vocabulary_size/TransformerDecoderModelSpec.get_vocabulary_size  r   rL   )r   )r>   r?   r@   rA   r   rN   r<   r   r   rB   rC   rF   rG   r   r   rH   rI   rt   r   ru   r   r   r   r   rJ   r   r   s   @r;   r   r   V  sP   =3 6 3 
 -8-C-C-H-H$)#$',-2!$("&JN'("01'("'"'$)&+&*"&(,9=*.$(Ann n 	n
  **n "n n n !%n n n n '+n n SMn   !n" &n&F&FG#n$  %%n& 'n( +.)n* "%+n,  -n.  /n0 "1n2  $3n4 sm5n6 3-7n8 !9n: [556;n< #3-=n> SM?n@ An n` ( (  /7 7rL   r   c                   >   ^  \ rS rSrSrSS\\   4U 4S jjjrSrU =r	$ )TransformerEncoderModelConfigi  z-Configuration for Transformer encoder models.r   c                 *   > [         TU ]  " SSU0UD6  g)zInitializes the configuration for Transformer encoder models.

Args:
  layer_norm_epsilon: The layer norm epsilon value.
  **kwargs: Additional configuration.
r   NrK   r   r   s      r;   r<   &TransformerEncoderModelConfig.__init__  r   rL   rK   rp   r   r   s   @r;   r   r     r   rL   r   c                      ^  \ rS rSrSrS\R                  R                  4S\S\	S\R                  4U 4S jjjr
\S 5       r\S	 5       rS
 rS rSrU =r$ )TransformerEncoderModelSpeci  z2Describes a Transformer encoder model (e.g. BERT).Fr   pooling_layerpooling_activationc                 f  > [        U[        5      (       d  [        S5      e[        TU ]  5         Xl        U R                  R                  SU R
                  R                  5        U(       aE  [        R                  " 5       U l        [        R                  " S5      R                  U5      U l        gg)zInitializes a Transformer encoder model specification.

Args:
  encoder: The encoder specification.
  pooling_layer: Add the pooling layer.
  pooling_activation: The activation to apply after the pooling layer.
r   r   r(   N)r   r
   r   r   r<   r   re   r   r   r   rh   pooler_denser,   r-   r.   pooler_activation)r9   r   r   r   r   s       r;   r<   $TransformerEncoderModelSpec.__init__  s     '#9::OPP""#T\\%G%G	
  + 6 6 8D%'XXf%5%:%:;M%ND" rL   c                     g)Nr
   rK   rq   s    r;   r    TransformerEncoderModelSpec.name  r   rL   c                     g)Nr   rK   rq   s    r;   r   $TransformerEncoderModelSpec.revision  r   rL   c                     [        5       $ rp   )r   rq   s    r;   r   .TransformerEncoderModelSpec.get_default_config  r   rL   c                 b    U R                   R                  S   R                  R                  S   $ r   r   rq   s    r;   r   /TransformerEncoderModelSpec.get_vocabulary_size  s(    ||&&q)0066q99rL   )r   r   r   )r>   r?   r@   rA   r   r   rB   Tanhr
   rG   r<   ru   r   r   r   r   rJ   r   r   s   @r;   r   r     s    <
 $5@5K5K5P5P	O'O O (22	O O4 ( (  /: :rL   r   )r   typingr   r   r   numpyr,   ctranslate2.specsr   r   r   	LayerSpecr
   rN   r7   rj   r|   r3   SequenceToSequenceModelConfigr   SequenceToSequenceModelSpecr   LanguageModelConfigr   LanguageModelSpecr   r   r   rK   rL   r;   <module>r	     s   6 ) )  E Eg
Z11 g
TiZ11 iX0,*"6"6 0,fX,*"6"6 X,v;j** ;-*.. -

J
@@ 
J}7j<< }7@
JJ$B$B 
JN7*">"> N7b
JJ$B$B 
J):*">"> ):rL   