
    yj                        d Z ddlmZmZmZ ddlZddlmZm	Z	m
Z
  G d de
j                  Z G d de
j                  Z G d	 d
e
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  ZdS )z0Declares specification of the Transformer model.    )OptionalTupleUnionN)attention_speccommon_spec
model_specc            /       ,   e Zd Zddej        j        dej        j        ddddddddddddddddfdedede	d	e	d
ej        dedej        de	de	de	de	de	de	de
e         de
e         de
e         de	de
ej                 dedede
e         de
e	         de	f.dZdS )TransformerEncoderSpecTF   N'  
num_layers	num_headspre_normno_final_norm
activationnum_source_embeddingsembeddings_mergelayernorm_embeddingrelative_positionrelative_attention_biasffn_glurms_normmulti_query_attentionnum_heads_kvhead_dim
rotary_dimrotary_interleaverotary_scaling_typerotary_scaling_factorrotary_basesliding_windowqk_normpre_post_layer_normc                   	
 |rdk    rt          d          d|| _        t          j        d                              |          | _        || _        t          j        d                              |          | _        t          j        d                              |          | _        d t          |          D             | _
        d| _        	s
st                      | _        |r|st          j                  | _        |rt          j                  | _        |,t          j        d	                              |          | _        
	fd
t          |          D             | _        dS )a'  Initializes a Transformer encoder specification.

        Args:
          num_layers: Number of layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          activation: Activation to apply in the feed-forward network.
          num_source_embeddings: Number of source embeddings.
          embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
            embeddings are merged.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          head_dim: Number of dimensions per attention head.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          sliding_window: Max sequence length to retain in KV Cache.
          qk_norm: Apply layer normalization to the query and key projections.
          pre_post_layer_norm: Add post layer norm for each pre norm layer.
        Nr   5Enabling multi_query_attention implies num_heads_kv=1int16int8c                 4    g | ]}t          j                    S  )r   EmbeddingsSpec).0_s     c/usr/local/lib/hermes-agent/venv/lib/python3.11/site-packages/ctranslate2/specs/transformer_spec.py
<listcomp>z3TransformerEncoderSpec.__init__.<locals>.<listcomp>S   s.     
 
 
-.K&((
 
 
    Tr   int32c                 H    g | ]}t          	
           S ))r   r   r   r   r   r   r   r   r   r   r    r"   r#   )TransformerEncoderLayerSpec)r+   r,   r   r   r   r#   r"   r   r   r   r    r   r   r   r   s     r-   r.   z3TransformerEncoderSpec.__init__.<locals>.<listcomp>`   sa     
 
 
   ("3(?!)!%"3$7&;'$7  
 
 
r/   )
ValueErrorr   npdtypetyper   r   r   r   range
embeddingsscale_embeddingsPositionEncoderSpecposition_encodingsr   LayerNormSpec
layer_normr   r!   layer)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   s            ```` ``````` ``r-   __init__zTransformerEncoderSpec.__init__   s   x ! 	'LA,=,= K   L%:"'**//	:: (6**//
;; " 0 0 5 56F G G
 
278M2N2N
 
 
 !%  	<)@ 	<&9&;&;D# 	KM 	K)7JJJDO 	T'2'@('S'S'SD$%"$(7"3"3"8"8"H"HD
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  :&&!
 
 



r/   )__name__
__module____qualname__r   
ActivationRELUEmbeddingsMergeCONCATintboolr   r   RotaryScalingTypefloatrA   r)   r/   r-   r
   r
   
   s       
 #-8-C-H%&8C8S8Z$)"'(-&+&*"&$("&JN'("(,"'$)1f
 f
f
 f
 	f

 f
  *f
  #f
 &5f
 "f
  f
 "&f
 f
 f
  $f
 smf
  3-!f
" SM#f
$  %f
& &n&FG'f
(  %)f
* +f
, !-f
. $/f
0 "1f
 f
 f
 f
 f
 f
r/   r
   c            L          e Zd Zdej        j        dddddddddddddddddddddddddddddddddf#ded	ed
edej        dedededededededededededededee         dedee	j
                 dedededed ed!ed"ed#ed$ee         d%ee         d&ee         d'eej                 d(ee         d)ee         d*ed+ed,ee         fJd-Zed.             ZdS )/TransformerDecoderSpecTFr   Nr   r   r   r   r   r   r   with_encoder_attentionr   project_in_outr   r   alignment_layeralignment_headsr   r   alibialibi_use_positive_positionsscale_alibir   r   r   r   r     original_max_position_embeddingsmax_position_embeddingsparallel_residualshared_layer_normr#   r   r   r   r!   
quant_typequant_group_size
quant_bitsr"   v_norm external_pre_post_encoder_layersc&                 b  	
#$% t                      | _        r"|st          d          rt          d          |rdk    rt          d          dt          j        d                              |          | _        || _        t          j        d                              |          | _        t          j        d                              |          | _	        t          j        d                              |          | _
        t          j                    | _        d| _        t          j        | _        || _        || _        || _        ,t          j        d	                                        | _        	s
s|st-                      | _        |r|st          j        
          | _        |rt          j        
          | _        t          j                    | _        %#
	$fdt;          |          D             | _        d| _        t          j        | _         |p|k    | j        d<   |r0t          j                    | _!        t          j                    | _"        | r | | j        d<   |"| j        d<   |!| j        d<   dS dS )a.  Initializes a Transformer decoder specification.

        Args:
          num_layers: Number of layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          activation: Activation to apply in the feed-forward network.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          with_encoder_attention: Enable the encoder attention sublayers.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          project_in_out: Add linear transformations after the embedding layer and before
            the final layer.
          relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          alignment_layer: Layer index selected for alignment.
          alignment_heads: Number of attention heads selected for alignment.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          alibi: Use attention with linear biases.
          alibi_use_positive_positions: Use positive positions in the ALiBi definition.
          scale_alibi: Apply the dot product scale factor to ALiBi.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          original_max_position_embeddings: The original max position embeddings
            for Su rope embeddings
          max_position_embeddings: The max position embeddings for Su rope embeddings
          parallel_residual: Use parallel residual connections in each layer block, as used
            by the GPT-J and GPT-NeoX models.
          shared_layer_norm: When using parallel residual, share the input and post
            attention layer norms.
          pre_post_layer_norm: Add post layer norm for each pre norm layer
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          sliding_window: Max sequence length to retain in KV Cache.
          quant_type: quantization type used (like awq... for lower bit quantization)
          quant_group_size: group size of the lower bit quantization
          quant_bits: number of bit of the quantization (ex: 4bit)
          external_pre_post_encoder_layers: if the encoder attention pre and processing
            is done outside the attention.
        z/The GPT-J block expects a pre-norm architecturez-The GPT-J block does not have cross attentionNr   r%   r&   r'   Tr1   r0   c                     g | ]K}t          di d ddddddddd	d
dddd	dddd
ddLS )rP   r   r   r   r   r   r   r   r   r    rW   rX   rY   rZ   r#   r   r   r!   r"   r^   r_   r)   )TransformerDecoderLayerSpec)r+   r,   r_   r   r   rX   r   rW   rY   r#   r"   r   r   r   r    r   r   r   r   rZ   r!   r^   rP   s     r-   r.   z3TransformerDecoderSpec.__init__.<locals>.<listcomp>   sB    
 
 
0 / (   '='="3"3 )@(?  	
 " &: #4"3 %8$7 '<&; (K 2R1Q )@(? #4"3 #4"3 %8$7  *\!" "#$  .~%&  '( v)* 2R1Q+
 
 
r/   Fr   quantization_typequantization_bitsquantization_group_size)#dict_configr4   r5   r6   r7   r   r   r   rR   rS   r   r*   r9   r:   r   OPTIONALscale_outputsrT   rU   rV   r!   r;   r<   r=   r>   r   
LinearSpec
projectionr8   r?   start_from_zero_embeddingfinal_logit_softcapping
project_inproject_out)&r@   r   r   r   r   r   rP   r   rQ   r   r   rR   rS   r   r   rT   rU   rV   r   r   r   r   r    rW   rX   rY   rZ   r#   r   r   r   r!   r[   r\   r]   r"   r^   r_   s&         `  ``  ``   `````````` ```   ```r-   rA   zTransformerDecoderSpec.__init__u   s3   r vv 	R T !RSSS% R !PQQQ  	'LA,=,= K   L'**//	:: (6**//
;;!x0055oFF!x0055oFF%466 $'0
,H)&%"$(7"3"3"8"8"H"HD!	<+	< 	< "&9&;&;D# 	KM 	K)7JJJDO 	T'2'@('S'S'SD$%022
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0 :&&1
 
 

6 */&'1':$0E 1
I% 	,-  	8)466DO*577D 	G0:DL,-0:DL,-6FDL2333	G 	Gr/   c                     | j         S N)rg   r@   s    r-   configzTransformerDecoderSpec.config  s
    |r/   )rB   rC   rD   r   rE   rF   rI   rJ   r   r   rK   rL   QuantizationrA   propertyrs   r)   r/   r-   rN   rN   t   s       
 -8-C-H$)'+#$"'(-! -2!$("&JN'("01'("'"'$)&+&*"&(,9=*.$(;@MhG hGhG hG 	hG
  *hG "hG !%hG hG hG  hG "&hG hG hG hG hG  !hG" '+#hG$ %hG& SM'hG(  )hG* &n&FG+hG,  %-hG. /hG0 +.1hG2 "%3hG4  5hG6  7hG8 "9hG:  $;hG< sm=hG> 3-?hG@ !AhGB [56ChGD #3-EhGF SMGhGH IhGJ KhGL +34.MhG hG hG hGT   X  r/   rN   c                   n    e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddee         dedeej                 d	ed
edefdZ	dS )r3   FNTr   r   r   r   r   r   r    r#   c                    t          j        d||||||||	|
|||          | _        t          ||          | _        |rt          j        |          | _        t          j        |          | _        t          j        |          | _	        t          j        |          | _
        t          | j        d           t          | j        d           d S d S )NT)self_attentionr   r   r   r   r   r!   r   r   r   r   r    r"   glur   r0   r>   )r   MultiHeadAttentionSpecrx   FeedForwardSpecffnr   r=   input_layer_normpost_attention_layer_normpre_feedforward_layer_normpost_feedforward_layer_normdelattr)r@   r   r   r   r   r   r   r!   r   r   r   r   r    r"   r#   s                  r-   rA   z$TransformerEncoderLayerSpec.__init__%  s   " -C/$;%)!/ 3"7#
 
 
 #wBBB 	,$/$=x$P$P$PD!-8-F!. . .D* /:.G!/ / /D+ 0;/H!0 0 0D, D'666DHl+++++	, 	,r/   )FFFFNNNNTNr   r   FF)
rB   rC   rD   r   rI   rJ   r   rK   rL   rA   r)   r/   r-   r3   r3   $  s           %$("&JN'("$)/, /, SM/,  /, &n&FG/,  %/, /, "/, /, /, /, /, /,r/   r3   c                   @    e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZdS )	rb   TFNr   r   r   c           	         t          j        di ddd|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|| _        |r!t          j        ||||||du           | _        t	          ||          | _        |ru|rt          j                    | _        n0t          j                    | _	        t          j                    | _
        t          | j        d           t          | j        d           |rt          j        |          | _	        t          j        |          | _
        |r6|r4t          j        |          | _        t          j        |          | _        t          j        |          | _        t          j        |          | _        t          | j        d           t          | j        d           t           j        | _        d S )Nrx   Tr   r   r   r   r   r   r   r    rW   rX   r   r   r!   r"   r^   F)r   r   r   r!   r"   has_normry   r>   r0   r)   )r   r{   rx   	attentionr|   r}   r   r=   rZ   r~   r   r   *external_post_encoder_attention_layer_norm)external_pre_encoder_attention_layer_normr   r   r   rh   layer_scalar)r@   rP   r   r   r   r   r   r   r   r   r    rW   rX   rY   rZ   r#   r   r   r!   r"   r^   r_   s                         r-   rA   z$TransformerDecoderLayerSpec.__init__X  s   0 -C 
 
 
4
//
 %<$;
 X	

 "z
 0/
 !4 3
 #8"7
 $
 .N-M
 %<$;
 &
 X
 *>
 G
  6!
& " 	+B!)!-9UB  DN #wBBB 	,  M)4)B)D)D&&(3(A(C(C%1<1J1L1L.D'666DHl+++ 	,$/$=x$P$P$PD!-8-F!. . .D* & *J -x@@@ ?  -x@@@ >
 /:.G!/ / /D+ 0;/H!0 0 0D, D'666DHl+++&/r/   )TFFFFNTNr   r   r   r   FFFNNNFFFrB   rC   rD   rA   r)   r/   r-   rb   rb   W  so          $ % )* !!).-[0 [0 [0 [0 [0 [0r/   rb   c                       e Zd ZddZdS )r|   Fc                     t          j        |          | _        t          j                    | _        t          j                    | _        |rt          j                    | _        d S d S )Nr0   )r   r=   r>   rj   linear_0linear_1linear_0_noact)r@   rz   r   s      r-   rA   zFeedForwardSpec.__init__  s`    %3XFFF#.00#.00 	;"-"8":":D	; 	;r/   N)FFr   r)   r/   r-   r|   r|     s(        ; ; ; ; ; ;r/   r|   c                       e Zd Zd ZdS )r;   c                 (    t           j        | _        d S rq   )r   rh   	encodingsrr   s    r-   rA   zPositionEncoderSpec.__init__  s    #,r/   Nr   r)   r/   r-   r;   r;     s#        - - - - -r/   r;   c                   6     e Zd ZdZddee         f fdZ xZS )TransformerConfigz%Configuration for Transformer models.Nlayer_norm_epsilonc                 >     t                      j        dd|i| dS )zInitializes the configuration for Transformer models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r   Nr)   superrA   r@   r   kwargs	__class__s      r-   rA   zTransformerConfig.__init__  .     	II,>I&IIIIIr/   rq   rB   rC   rD   __doc__r   rL   rA   __classcell__r   s   @r-   r   r     s_        //J J8E? J J J J J J J J J Jr/   r   c                    6    e Zd ZdZdedef fdZedddej	        j
        dddej        j        dddddfd	eeeeef         f         d
ededededej	        dedededej        dededededefd            Zed             Zed             Zd Zd Zd Z xZS )TransformerSpeczDescribes a Transformer model.

    The specification is invariant to hidden dimensions but requires to
    explicitly set the number of layers and attention heads.
    encoderdecoderc                 >   t          |t                    st          d          t          |t                    st          d          t	                                                       || _        || _        | j        	                    d| j        j
                   dS )zInitializes a Transformer model specification.

        Args:
          encoder: The encoder specification.
          decoder: The decoder specification.
        1encoder argument must be a TransformerEncoderSpec1decoder argument must be a TransformerDecoderSpecr   N)
isinstancer
   	TypeErrorrN   r   rA   r   r   rg   add_attributer   )r@   r   r   r   s      r-   rA   zTransformerSpec.__init__  s     '#9:: 	QOPPP'#9:: 	QOPPP""#T\%G	
 	
 	
 	
 	
r/   FTrO   r   r   r   with_relative_positionr   r   r   rR   rS   r   r   r   r   r   r   r   c                     t          |t          t          f          r|\  }}n||}}t          ||||||	|
||||||          }t	          |||||||||||||          } | ||          S )a  Creates a Transformer model specification.

        Args:
          num_layers: Number of encoder and decoder layers, or a 2-tuple if the
            number is different.
          num_heads: Number of attention heads.
          with_relative_position: Use relative position representations in the self-attention
            layers as described in https://arxiv.org/abs/1803.02155.
          pre_norm: Enable the pre-norm Transformer architecture.
          no_final_norm: Disable the final layer norm in the pre-norm architecture.
          activation: Activation to apply in the feed-forward network.
          alignment_layer: Layer index selected for alignment.
          alignment_heads: Number of attention heads selected for alignment.
          num_source_embeddings: Number of source embeddings.
          embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
            embeddings are merged.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          relative_attention_bias: Use relative attention bias in the self-attention
            layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
          ffn_glu: Use gated linear units in the FFN layer as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          multi_query_attention: Use multi-query attention.
        )r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   rR   rS   r   r   r   )r   listtupler
   rN   )clsr   r   r   r   r   r   rR   rS   r   r   r   r   r   r   r   num_encoder_layersnum_decoder_layersr   r   s                       r-   from_configzTransformerSpec.from_config  s    V j4-00 	L5?2 2 25? 2('!"7- 34$;"7
 
 
  )'! 34$;++"7
 
 
  s7G$$$r/   c                     dS )Nr   r)   rr   s    r-   namezTransformerSpec.name?  s      r/   c                     dS )N   r)   rr   s    r-   revisionzTransformerSpec.revisionC      qr/   c                     t                      S rq   )r   rr   s    r-   get_default_configz"TransformerSpec.get_default_configG  s     """r/   c                 .    d | j         j        D             S )Nc                 2    g | ]}|j         j        d          S )r   )weightshape)r+   specs     r-   r.   z>TransformerSpec.get_source_vocabulary_size.<locals>.<listcomp>K  s"    III!!$IIIr/   )r   r9   rr   s    r-   get_source_vocabulary_sizez*TransformerSpec.get_source_vocabulary_sizeJ  s    II1HIIIIr/   c                 :    | j         j        j        j        d         S Nr   r   r9   r   r   rr   s    r-   get_target_vocabulary_sizez*TransformerSpec.get_target_vocabulary_sizeM      |&-3A66r/   )rB   rC   rD   r   r
   rN   rA   classmethodr   rE   rF   rG   rH   r   rI   r   rJ   r   ru   r   r   r   r   r   r   r   s   @r-   r   r     s        
-
8N
 
 
 
 
 
* 
 (-#-8-C-H! %&8C8S8Z$)(-&+!O% O%#uS#X./O% O% !%	O%
 O% O%  *O% O% O%  #O% &5O% "O% "&O% O% O%   $!O% O% O% [O%b ! ! X!   X# # #J J J7 7 7 7 7 7 7r/   r   c                   6     e Zd ZdZddee         f fdZ xZS )TransformerDecoderModelConfigz-Configuration for Transformer decoder models.Nr   c                 >     t                      j        dd|i| dS )zInitializes the configuration for Transformer decoder models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r   Nr)   r   r   s      r-   rA   z&TransformerDecoderModelConfig.__init__T  r   r/   rq   r   r   s   @r-   r   r   Q  _        77J J8E? J J J J J J J J J Jr/   r   c            B           e Zd ZdZdef fdZedej        j	        ddddddddddddddd	d	ddddddddddddfd
e
de
dedej        dedededededededededee
         dedeej                 dedede
de
deded ed!ed"ee
         d#ee
         d$ee
         d%eej                 d&ee
         d'ee
         d(ed)ef@d*            Zed+             Zed,             Zd- Zd. Z xZS )/TransformerDecoderModelSpecz3Describes a Transformer decoder model (e.g. GPT-2).r   c                    t          |t                    st          d          t                                                       || _        | j        j                                        D ] \  }}| j        	                    ||           !dS )z|Initializes a Transformer decoder model specification.

        Args:
          decoder: The decoder specification.
        r   N)
r   rN   r   r   rA   r   rs   itemsrg   r   )r@   r   keyvaluer   s       r-   rA   z$TransformerDecoderModelSpec.__init__a  s     '#9:: 	QOPPP,-3355 	3 	3JCL&&sE2222	3 	3r/   TFNr   r   r   r   r   r   r   r   r   rQ   r   r   r   rT   rU   rV   r   r   r   r   r    rW   rX   rY   rZ   r#   r   r   r   r!   r[   r\   r]   r"   r^   c!                     t          ||fi d|d|d|ddd|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d|d | }! | |!          S )!a!
  Creates a Transformer decoder model specification.

        Args:
          num_layers: Number of decoder layers.
          num_heads: Number of attention heads.
          pre_norm: Enable the pre-norm Transformer architecture.
          activation: Activation to apply in the feed-forward network.
          layernorm_embedding: Apply layer normalization after the embedding layer.
          no_final_norm: Do not apply layer normalization after the last decoder block.
          project_in_out: Add a linear layer after the embedding layer and another one
            before the final output projection.
          with_relative_position: Enable relative position representations modules.
          ffn_glu: Use gated linear units in the FFN layers as described in
            https://arxiv.org/abs/2002.05202.
          rms_norm: Use the root mean square layer normalization.
          alibi: Use attention with linear biases.
          alibi_use_positive_positions: Use positive positions in the ALiBi definition.
          scale_alibi: Apply the dot product scale factor to ALiBi.
          rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
            embeddings are applied to all dimensions.
          rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
            Otherwise the head dimensions are sliced in half.
          rotary_scaling_type: Type of RoPE scaling.
          rotary_scaling_factor: Factor used in the RoPE scaling.
          rotary_base: The base period of the rotary embeddings.
          original_max_position_embeddings: The original max position embeddings
            for Su rope embeddings
          max_position_embeddings: The max position embeddings for Su rope embeddings
          parallel_residual: Use parallel residual connections in each layer block, as used
            by the GPT-J and GPT-NeoX models.
          shared_layer_norm: When using parallel residual, share the input and post
            attention layer norms.
          pre_post_layer_norm: add post layer norm for each pre norm layer
          multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
          num_heads_kv: Number of attention heads for the key and value.
          head_dim: Number of head
          sliding_window: max sequence length to retain KV cache
          quant_type: quantization type used (like awq... for lower bit quantization)
          quant_group_size: group size of the lower bit quantization
          quant_bits: number of bit of the quantization (ex: 4bit)
        r   r   r   rP   Fr   rQ   r   r   r   rT   rU   rV   r   r   r   r   r    rW   rX   rY   rZ   r#   r   r   r   r!   r[   r\   r]   r"   r^   )rN   )"r   r   r   r   r   r   r   rQ   r   r   r   rT   rU   rV   r   r   r   r   r    rW   rX   rY   rZ   r#   r   r   r   r!   r[   r\   r]   r"   r^   r   s"                                     r-   r   z'TransformerDecoderModelSpec.from_configo  s   Z )"
 "
 "
 X"
 "z	"

 !4 3"
 $)5"
 (-"
 *>"
 54"
 G"
 X"
 %"
 *F)E"
 $"
 "z"
  0/!"
" !4 3#"
$ #8"7%"
& $'"
( .N-M)"
* %<$;+"
, 0/-"
. 0//"
0 !4 31"
2 #8"73"
4 &5"
6 X7"
8 *>9"
: "z;"
< .-="
> "z?"
@ GA"
B 6C"
H s7||r/   c                     dS )NrN   r)   rr   s    r-   r   z TransformerDecoderModelSpec.name      ''r/   c                     dS )N   r)   rr   s    r-   r   z$TransformerDecoderModelSpec.revision  r   r/   c                     t                      S rq   )r   rr   s    r-   r   z.TransformerDecoderModelSpec.get_default_config      ,...r/   c                 :    | j         j        j        j        d         S r   r   rr   s    r-   get_vocabulary_sizez/TransformerDecoderModelSpec.get_vocabulary_size  r   r/   )rB   rC   rD   r   rN   rA   r   r   rE   rF   rI   rJ   r   r   rK   rL   rt   r   ru   r   r   r   r   r   r   s   @r-   r   r   ^  s       ==3 6 3 3 3 3 3 3 
 -8-C-H$)#$',-2!$("&JN'("01'("'"'$)&+&*"&(,9=*.$(Cp pp p 	p
  *p "p p p !%p p p p '+p p SMp   !p" &n&FG#p$  %%p& 'p( +.)p* "%+p,  -p.  /p0 "1p2  $3p4 sm5p6 3-7p8 !9p: [56;p< #3-=p> SM?p@ ApB Cp p p [pd ( ( X(   X/ / /7 7 7 7 7 7 7r/   r   c                   6     e Zd ZdZddee         f fdZ xZS )TransformerEncoderModelConfigz-Configuration for Transformer encoder models.Nr   c                 >     t                      j        dd|i| dS )zInitializes the configuration for Transformer encoder models.

        Args:
          layer_norm_epsilon: The layer norm epsilon value.
          **kwargs: Additional configuration.
        r   Nr)   r   r   s      r-   rA   z&TransformerEncoderModelConfig.__init__  r   r/   rq   r   r   s   @r-   r   r     r   r/   r   c                        e Zd ZdZdej        j        fdededej        f fdZ	e
d             Ze
d             Zd	 Zd
 Z xZS )TransformerEncoderModelSpecz2Describes a Transformer encoder model (e.g. BERT).Fr   pooling_layerpooling_activationc                 x   t          |t                    st          d          t                                                       || _        | j                            d| j        j                   |rFt          j
                    | _        t          j        d                              |          | _        dS dS )zInitializes a Transformer encoder model specification.

        Args:
          encoder: The encoder specification.
          pooling_layer: Add the pooling layer.
          pooling_activation: The activation to apply after the pooling layer.
        r   r   r'   N)r   r
   r   r   rA   r   rg   r   r   r   rj   pooler_denser5   r6   r7   pooler_activation)r@   r   r   r   r   s       r-   rA   z$TransformerEncoderModelSpec.__init__  s     '#9:: 	QOPPP""#T\%G	
 	
 	
  	O + 6 8 8D%'Xf%5%5%:%:;M%N%ND"""	O 	Or/   c                     dS )Nr
   r)   rr   s    r-   r   z TransformerEncoderModelSpec.name  r   r/   c                     dS )Nr   r)   rr   s    r-   r   z$TransformerEncoderModelSpec.revision  r   r/   c                     t                      S rq   )r   rr   s    r-   r   z.TransformerEncoderModelSpec.get_default_config#  r   r/   c                 F    | j         j        d         j        j        d         S r   )r   r9   r   r   rr   s    r-   r   z/TransformerEncoderModelSpec.get_vocabulary_size&  s    |&q)06q99r/   )rB   rC   rD   r   r   rE   Tanhr
   rJ   rA   ru   r   r   r   r   r   r   s   @r-   r   r     s        <<
 $5@5K5P	O O'O O (2	O O O O O O4 ( ( X(   X/ / /: : : : : : :r/   r   )r   typingr   r   r   numpyr5   ctranslate2.specsr   r   r   	LayerSpecr
   rN   r3   rb   r|   r;   SequenceToSequenceModelConfigr   SequenceToSequenceModelSpecr   LanguageModelConfigr   LanguageModelSpecr   r   r   r)   r/   r-   <module>r      s   6 6 ) ) ) ) ) ) ) ) ) )     E E E E E E E E E Eg
 g
 g
 g
 g
Z1 g
 g
 g
Tm m m m mZ1 m m m`0, 0, 0, 0, 0,*"6 0, 0, 0,f\0 \0 \0 \0 \0*"6 \0 \0 \0~; ; ; ; ;j* ; ; ;- - - - -*. - - -

J 
J 
J 
J 
J
@ 
J 
J 
J}7 }7 }7 }7 }7j< }7 }7 }7@
J 
J 
J 
J 
JJ$B 
J 
J 
JP7 P7 P7 P7 P7*"> P7 P7 P7f
J 
J 
J 
J 
JJ$B 
J 
J 
J): ): ): ): ):*"> ): ): ): ): ):r/   