o
    gw                     @   s|  d dl mZ d dlmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZmZ d dlmZmZ d dlmZ d d	lmZmZmZmZmZ d d
lmZ eeZG dd dZG dd dZG dd dZG dd dZ G dd dZ!G dd dZ"G dd dZ#G dd dZ$G dd de
Z%G dd deZ&G dd  d e%Z'G d!d" d"e%Z(G d#d$ d$e%Z)G d%d& d&e%Z*G d'd( d(eZ+dS ))    )	getLogger)ListOptionalN)DynamoOnnxHelper)Fusion)AttentionOpTypeFusionOptions) FusionBiasSkipLayerNormalizationFusionSkipLayerNormalization)NumpyHelper)
ModelProto	NodeProtoTensorProtohelpernumpy_helper)	OnnxModelc                   @      e Zd Zdd ZdS )ProcessGemmWFuncc                 C   s   t |dS )N   r   )np	transposeselfx r   ^/var/www/visachat/venv/lib/python3.10/site-packages/onnxruntime/transformers/onnx_model_phi.py__call__      zProcessGemmWFunc.__call__N__name__
__module____qualname__r   r   r   r   r   r          r   c                   @   r   )ProcessMatMulQFuncc                 C   s   t t |ddd dS )N   r   r   r   r   splitr   r   r   r   r         zProcessMatMulQFunc.__call__Nr   r   r   r   r   r$      r#   r$   c                   @   r   )ProcessMatMulKFuncc                 C      t t |ddd dS )Nr%   r   r   r   r&   r   r   r   r   r       r(   zProcessMatMulKFunc.__call__Nr   r   r   r   r   r)      r#   r)   c                   @   r   )ProcessMatMulVFuncc                 C   r*   )Nr%   r      r   r&   r   r   r   r   r   %   r(   zProcessMatMulVFunc.__call__Nr   r   r   r   r   r+   $   r#   r+   c                   @   r   )ProcessBiasQFuncc                 C      t |ddd }|S )Nr%   r   r   r'   r   r   r   r   r   *      zProcessBiasQFunc.__call__Nr   r   r   r   r   r-   )   r#   r-   c                   @   r   )ProcessBiasKFuncc                 C   r.   )Nr%   r/   r   r0   r   r   r   r   r   0   r1   zProcessBiasKFunc.__call__Nr   r   r   r   r   r2   /   r#   r2   c                   @   r   )ProcessBiasVFuncc                 C   r.   )Nr%   r/   r,   r0   r   r   r   r   r   6   r1   zProcessBiasVFunc.__call__Nr   r   r   r   r   r3   5   r#   r3   c                   @   r   )ProcessRotCacheFuncc                 C   s8   t |jdks	J |jd dkr|d d ddf S |S )Nr,   r       r      )lenshaper   r   r   r   r   <   s   zProcessRotCacheFunc.__call__Nr   r   r   r   r   r4   ;   r#   r4   c                       s  e Zd Zdedee f fddZdefddZdd	 Z	d
d Z
dd Zdd Zd7ddZdd Zdd Zdd Zdee dedee fddZd8dee d ee d!efd"d#Zd8dee d ee d!efd$d%Zd9dee d ee d!efd'd(Zd8dee d ee d!efd)d*Zd8dee d ee d!efd+d,Zd:dee d ee d!efd-d.Zd:dee d ee d!efd/d0Zd:dee d ee d!efd1d2Z		&	3	4d;dee d ee d!efd5d6Z  ZS )<Fissionmodelnodes_to_findc                    s   t  |d| d S )NDONOTUSEsuper__init__)r   r:   r;   	__class__r   r   r?   F   s   zFission.__init__attn_op_typec                 C   s
   || _ d S N)rB   )r   rB   r   r   r   set_attention_op_typeM   s   
zFission.set_attention_op_typec                 C   s   |d t | S )N_)str)r   layer_idnamer   r   r   	get_unameP   s   zFission.get_unamec                 C   s>   |D ]}||ks| |s||r|  S qtd| d)NzEdge z
 not found)endswith
startswith
ValueError)r   edgesrH   edger   r   r   get_edge_by_nameS   s
   zFission.get_edge_by_namec                 C      |  |j|S rC   )rO   inputr   noderH   r   r   r   get_input_by_nameY      zFission.get_input_by_namec                 C   rP   rC   )rO   outputrR   r   r   r   get_output_by_name\   rU   zFission.get_output_by_nameNc                 C   sd   | j |}t|}||}tj|d u r|d n|tj|j|	 
 dd}| j || j |jS )N
_processedT	data_typedimsvalsraw)r:   get_initializerr   to_arrayr   make_tensorr   FLOATr8   flattentobytesadd_initializerthis_graph_namerH   )r   initializer_namefunctorcustom_namei
i_np_arrayprocessed_i_np_array
new_tensorr   r   r   process_initializer_   s   

zFission.process_initializerc                 C   &   | j  j }||_tj|jj_	d S rC   )
r:   graph
value_infoaddrH   r   ra   typetensor_type	elem_typer   rH   new_value_infor   r   r   add_fp32_value_infom      zFission.add_fp32_value_infoc                 C   rn   rC   )
r:   ro   rp   rq   rH   r   INT64rr   rs   rt   ru   r   r   r   add_int64_value_infor   rx   zFission.add_int64_value_infoc                 C   s\   | j  jD ]}|j|kr| j  j|  nqtj|tj|d}| j  j	|g d S )Nrt   r8   )
r:   ro   rp   rH   remover   make_tensor_value_infor   ra   extend)r   rH   r8   rp   rv   r   r   r   replace_fp32_value_infow   s   
zFission.replace_fp32_value_infosubgraph_nodesrG   layer_known_edges_namesc                 C   s   |D ]_}t |jD ]\}}|dkrq	||vr'| |||j|< | |j|  q	t |jD ]\}}|dkr6q-||vrK| |||j|< | |j|  q-| ||j|_| j| | j| j	|j< qd S )N )
	enumeraterQ   rI   rw   rV   rH   nodes_to_addappendre   node_name_to_graph_name)r   r   rG   r   new_noderi   rH   r   r   r   set_unique_name_and_add_nodes   s&   z%Fission.set_unique_name_and_add_nodesr   inputsoutputsprefixc                 C   s>   t |dksJ t |dksJ tjd|||d dd}|gS )Nr%   r   LayerNormalization_LayerNormalizationg   >)r   r   rH   epsilonr7   r   	make_noder   r   r   r   rS   r   r   r   	layernorm      zFission.layernormc                 C   sr   t |dksJ t |dksJ tjd|d |d g|d g|d d}tjd|d |d g||d	 d}||gS )
Nr%   r   MatMulr   
matmul_outr   r   rH   Addr,   Biasr   )r   r   r   r   matmulrq   r   r   r   gemm   s   zFission.gemmr5   c              	   C   sB   t |dksJ t |dksJ tjd|||d d||d}|gS )N   r   RotaryEmbeddingcom.microsoft)r   r   rH   domainrotary_embedding_dim	num_headsr   )r   r   r   r   rot_dimr   rS   r   r   r   rotary      	zFission.rotaryc                 C   s>   t |dksJ t |dksJ tjd|||d dd}|gS )Nr   FastGelur   )r   r   rH   r   r   r   r   r   r   fastgelu   r   zFission.fastgeluc                 C   s<   t |dksJ t |dksJ tjd|||d d}|gS )Nr,   r   r   r   r   r   r   r   r   rq      s   zFission.addc              	   C   sB   t |dksJ t |dksJ tjd|||d d|dd}|gS )N   r%   MultiHeadAttentionr   r   )r   r   rH   r   r   unidirectionalr   r   r   r   r   r   rS   r   r   r   mha   r   zFission.mhac              	   C   sB   t |dksJ t |dksJ tjd|||d d||d}|gS )N   r%   GroupQueryAttentionr   )r   r   rH   r   r   kv_num_headsr   r   r   r   r   gqa   r   zFission.gqac                 C   sF   t |dksJ t |dksJ tjd|||d d|dddd	}|gS )N   r,   	Attentionr   r   r5   )r   r   rH   r   r   r   	do_rotaryr   r   r   r   r   r   	attention   s   zFission.attentionP      %?c                 C   sF   t |dksJ t |dksJ tjd|||d d||||d	}|gS )N   r   PagedAttentionzvllm.ort.ext)r   r   rH   r   r   num_kv_heads	head_sizescaler   )r   r   r   r   r   r   r   rS   r   r   r   
paged_attn  s   	zFission.paged_attnrC   )r   )r   r5   r5   )r   r5   )r   r5   r   r   )r    r!   r"   r   r   rF   r?   r   rD   rI   rO   rT   rW   rm   rw   rz   r   r   intr   r   r   r   r   rq   r   r   r   r   __classcell__r   r   r@   r   r9   E   sR    

        r9   c                       s\   e Zd Zdededef fddZdefddZd	d
 Zde	fddZ
de	fddZ  ZS )Phi2PreProcessorr:   r   hidden_sizec                    s(   t  | d| _|| _|| _d| _d S )Nr5   modeling_phi_PhiModel_model_1)r>   r?   num_hidden_layersnum_attention_headsr   	func_namer   r:   r   r   r@   r   r   r?     s
   
zPhi2PreProcessor.__init__returnc                 C   s   i }d|d< d|d< d|d< d|d< t d	| jd	D ],}d
| |d| < d| |d| < d| |d| d< d| |d| d< qdd | jjjD }d|v rbd|v rbd|d< d|d< |S d|v rjd|v slJ d|d< d|d< |S )Nlogits	lm_head_1	input_idsl_input_ids_
past_key_0
key_statespast_value_0value_statesr   	past_key_key_states_past_value_value_states_present_key_model_layers__1present_value__1_1c                 S   s   g | ]}|j qS r   rH   ).0or   r   r   
<listcomp>3  s    z7Phi2PreProcessor.get_phi2_edge_dict.<locals>.<listcomp>model_layers_0_1_1model_layers_0_1_2present_key_0present_value_0model_layers_0_1)ranger   r:   ro   rV   )r   	edge_dictri   r   r   r   r   get_phi2_edge_dict'  s&   z#Phi2PreProcessor.get_phi2_edge_dictc                 C   s<   d}| j jjD ]}|j|}|dkr|j|d  |_qd S )N)modeling_phi_PhiDecoderLayer_model_layersr/   )r:   ro   rS   op_typefind)r   phi2_transformer_layer_namerS   indexr   r   r   simplify_phi2_op_type=  s   z&Phi2PreProcessor.simplify_phi2_op_typerB   c              
   C   s  |t jk| _|t jk| _| jj}g }|jD ]}d|jv rkt	j
|j| js&tjntjddgd}t	j
dtjdgd}t	j
dtjddgd}t	j
dtjddgd}t	j
d	tjdgd}	| jsc||||gn||||	g | jrd
|jv rt	j
|jd
d|jjjdd| jd| j| j gd}
||
g q| jrd
|jv rt	j
|j|jjjg dd}
||
g d|jv rt	j
|j|jjjg dd}
||
g qd
|jv sd|jv rt	j
|j|jjjd| jd| j| j gd}
||
g q|d |j| g }t|jD ]_\}}|dkr||g q| jr?d|jv r=t	j
|jdd|jjjdd| jd| j| j gd}
||
g q| jrEqt	j
|j|jjjd| jd| j| j gd}
||
g q|d |j| d S )Nr   
batch_sizeseq_lenr{   stepr   position_idsattention_maskinput_metadatapast_keypastr,   past_seq_len)
num_blocksr   head_size_x
block_sizeblock_x
past_value)r   r   r   r   rQ   r   present_keypresenttotal_seq_lenrV   )r   r   use_attnr   use_vllmr:   ro   rQ   rH   r   r}   r   INT32ry   r~   replacerr   rs   rt   r   r   
ClearFieldr   rV   )r   rB   ro   
new_inputsvivi_iidvi_stepvi_pidvi_maskvi_metavi_cachenew_outputsri   r   r   r   process_graph_ioD  s   














z!Phi2PreProcessor.process_graph_ioc                 C   s~   d }| j jD ]}|j| jr|j} nq|d usJ | | | |   |   | 	  |t
jkr8|   | | d S rC   )r:   	functionsrH   rJ   r   unroll_functionupdate_edgesr   r   remove_dropout_layerr   r   remove_lm_head_layerr  )r   rB   function_namefuncr   r   r   preprocess_onnx  s   

z Phi2PreProcessor.preprocess_onnx)r    r!   r"   r   r   r?   dictr   r   r   r  r  r   r   r   r@   r   r     s    }r   c                       *   e Zd Zdef fddZdd Z  ZS )FissionTransformerEmbeddingPhir:   c                       t  |dg d S )N6torch_nn_modules_sparse_Embedding_model_embed_tokens_1r=   r   r:   r@   r   r   r?        z'FissionTransformerEmbeddingPhi.__init__c           	      C   s   t d|j t|jdksJ t|jdksJ |jd }|jd }| |d}|||g}tjd||g|gddg}| 	|d| | j
| d	| _d S )
NOptimizing %s...r,   r   r   zembed_tokens.weightGatherEmbedding_Gatherr   T)loggerinforH   r7   rQ   rV   rT   r   r   r   nodes_to_remover   prune_graph)	r   rS   input_name_to_nodesoutput_name_to_noderQ   rV   	embeddingr   r   r   r   r   fuse  s"   


	
z#FissionTransformerEmbeddingPhi.fuser    r!   r"   r   r?   r!  r   r   r   r@   r   r    
    r  c                       r  )FissionTransformerLayerNormPhir:   c                    r  )N@torch_nn_modules_normalization_LayerNorm_model_final_layernorm_1r=   r  r@   r   r   r?     r  z'FissionTransformerLayerNormPhi.__init__c           
      C   s   t d|j t|jdksJ t|jdksJ |jd }|jd }| |d}| |d}||||g}g }	|	| |||g|gd | 	|	d| | 
|g d	 | 
|g d	 | j| d
| _d S )Nr  r%   r   r   zfinal_layernorm.weightzfinal_layernorm.biasFinalc   r   r   r   T)r  r  rH   r7   rQ   rV   rT   r~   r   r   r   r  r   r  )
r   rS   r  r  rQ   rV   	ln_weightln_biasr   r   r   r   r   r!    s   


z#FissionTransformerLayerNormPhi.fuser"  r   r   r@   r   r$    r#  r$  c                       r  )!FissionTransformerCausalLMHeadPhir:   c                    r  )N(torch_nn_modules_linear_Linear_lm_head_1r=   r  r@   r   r   r?     r  z*FissionTransformerCausalLMHeadPhi.__init__c           
      C   s   t d|j t|jdksJ t|jdksJ |jd }|jd }| | |dt }| |d}||||g}g }	|		| 
|||g|gd | |	d	| | |g d
 | |g d | j| d| _d S )Nr  r   r   r,   r   zlm_head.weightzlm_head.biasLMHead_r'  r(  )r   r   i   T)r  r  rH   r7   rQ   rV   rm   rT   r   r~   r   r   r   r  r   r  )
r   rS   r  r  rQ   rV   	fc_weightfc_biasr   r   r   r   r   r!    s   


z&FissionTransformerCausalLMHeadPhi.fuser"  r   r   r@   r   r+    r#  r+  c                       sF   e Zd Zdedef fddZdd Zdd Zd	d
 Zdd Z	  Z
S )FissionTransformerBlockPhir:   r   c                    sT   || _ d}i | _g }t|D ]}d| d}|| || j|< qt || d S )Nr5   *modeling_phi_PhiDecoderLayer_model_layers_r   )r   func_to_layer_idr   r   r>   r?   )r   r:   r   max_num_layersr;   layerr   r@   r   r   r?   6  s   
z#FissionTransformerBlockPhi.__init__c                 C   s   | j |j S rC   )r2  r   )r   rS   r   r   r   get_layer_idF  r   z'FissionTransformerBlockPhi.get_layer_idc                 C   s   t jddgdgdtjdt jdddgdgd	d
t jdddgdgdd
t jddgdgdtjdt jddgdgdd
t jdddgdgdddt jddgdgdtjdg}|S )NCastr   
mask_int64Cast_gqa_aux_0)r   r   rH   to	ReduceSumonemask_row_sumsReduceSum_gqa_auxr   Subseqlens_k_int64Sub_gqa_aux	seqlens_kCast_gqa_aux_1Shape
mask_shapeShape_gqa_aux_0r  total_seq_len_int64Gather_gqa_aux_0r   )r   r   rH   axistotal_sequence_lengthCast_gqa_aux_2)r   r   r   ry   r   )r   gqa_aux_nodesr   r   r   get_gqa_aux_nodesI  sV   +z,FissionTransformerBlockPhi.get_gqa_aux_nodesc	                 C   sX  | j |}	| j |}
| j |}tt|	d}tt|
d}tt|d}tj|||fdd}| j |}| j |}| j |}t|}t|}t|}tj|||fdd}|jd }tj	|t
j||d g|  dd}| j || j tj	|t
j|d g|  dd}| j || j | |j | |j ||fS )Nr   r   )rH  r   r%   TrY   )r:   r^   r   r   r   r_   stackr8   r   r`   r   ra   rb   rc   rd   re   rw   rH   )r   q_wk_wv_wq_bk_bv_bweight_name	bias_nameq_weightk_weightv_weightqwkwvw
qkv_weightq_biask_biasv_biasqbkbvbqkv_biasr   weightbiasr   r   r   pack_qkv_gemmw  sD   






z(FissionTransformerBlockPhi.pack_qkv_gemmc           $      C   s  t d|j t d| j  | |}|jd }| |d}| |d}|jd }| |d}	| |d}
| |d	}| |d
}d\}}}}}}d\}}d\}}| jt	j
kr| | |dt }| | |dt }| | |dt }| |d}| |d}| |d}| | |dt }| | |dt }n.| | |d| |d| |d| |d| |d| |d| |d| |d\}}| | |dt }| |d}| | |dt }| | |dt }| |d}| |d}g }||||g |||	|
g |||g | jt	j
kr2|||||||||g n|||g |||||||g |g d g }|| |||gdg || d||gd gd! || d||gd"gd# || d"gd$g || d$||gd%gd& || d d%gd'gd( || |d'g|gd) | jt	j
kr|| d||gd*gd+ || d||gd,gd- || d||gd.gd/ | jt	jkrd0nd1}|| d*|||gd2gd+ || d,|||gd3gd- | jt	jkr|| d2d3d.d4d5d4||gd|	|
g n| jt	jkri|| d2d3d.||d6d7gd|	|
g |dkrh|  } | D ]}!| j|! | j| j|!j< qD| j !t"j#t$j%d8gd9d:d;d<| j n9| jt	jkr|| &d2d3d.||d=gdg n!d>| }"d?| }#||"|#g || 'd||d5|"gd|#g | (||| | )|g d@ | )|g d@ | j*| dA| _+d S )BNr  zAttentionOpType: r   r   r   r/   r   present_valuezinput_layernorm.weightzinput_layernorm.bias)NNNNNN)NNzself_attn.q_proj.weightzself_attn.k_proj.weightzself_attn.v_proj.weightzself_attn.q_proj.biaszself_attn.k_proj.biaszself_attn.v_proj.biaszrotary_emb.cos_cachedzrotary_emb.sin_cachedattn_qkv_weightattn_qkv_biaszself_attn.dense.weightzself_attn.dense.biaszmlp.fc1.weightzmlp.fc2.weightzmlp.fc1.biaszmlp.fc2.bias)r   r   rA  rI  r   r   ln_outattn_outattn_add_outOutProj_fc1_outFC1_gelu_outfc2_outFC2_residual_1_out
Residual_1
Residual_2queryQ_keyK_valueV_r   r   	query_rotkey_rotr   r   rA  rI  r   int64)dtyper;  r   r   past_present_r(  T),r  r  rH   rB   r5  rQ   rT   rV   rW   r   r   rm   r   r4   rf  rI   r~   r   r   r   rq   r   r   r   r   r   r   rL  r   r   re   r   r:   rd   r   
from_arrayr   arrayr   r   r   r   r  r  )$r   rS   r  r  rG   i_hidden_statesi_key_cachei_value_cacheo_hidden_stateso_key_cacheo_value_cacher)  r*  attn_q_weightattn_q_biasattn_k_weightattn_k_biasattn_v_weightattn_v_biasrh  ri  	cos_cache	sin_cacheattn_out_weightattn_out_biasmlp_fc1_weightmlp_fc2_weightmlp_fc1_biasmlp_fc2_biasr   r   pos_ids_namerK  r   	past_namepresent_namer   r   r   r!    s  










	



zFissionTransformerBlockPhi.fuse)r    r!   r"   r   r   r?   r5  rL  rf  r!  r   r   r   r@   r   r0  5  s    .*r0  c                       sX   e Zd Zdededef fddZddee d	ef fd
dZ	dd Z
dddZ  ZS )PhiOnnxModelr:   r   r   c                    sJ   t  | t| j||| _t| || _t| | _t	| | _
t| | _d S rC   )r>   r?   r   r:   phi2_preprocessorr0  fission_transformer_blockr+  fission_causal_lm_headr$  fission_transformer_layernormr  fission_transformer_embeddingr   r@   r   r   r?   P  s   

zPhiOnnxModel.__init__NFoptionsadd_dynamic_axesc                    s   |d usJ |j }| j| | j| | j  | j  | j  | j  t	 
  t| | _t| | _| j  | j  d S rC   )attention_op_typer  rD   r  r  applyr  r  r  r>   r  r
   fuse_slnr	   fuse_bias_sln)r   r  r  rB   r@   r   r   optimizeX  s   







zPhiOnnxModel.optimizec                 C   s@   i }g d}|D ]}|  |}t|||< qtd|  |S )z8
        Returns node count of fused operators.
        )	r   r   r   r   GeluBiasGelur   r   SkipLayerNormalizationzOptimized operators: )get_nodes_by_op_typer7   r  r  )r   op_countopsopnodesr   r   r   get_fused_operator_statisticsm  s   
z*PhiOnnxModel.get_fused_operator_statisticsc                    s    du r|    dtf fdd}|d|d |d |d }|d	|d
 |d }|d|d }|dko@||ko@||k}|dkrJtd |dkrStd |dkr\td |S )zA
        Returns True when the model is fully optimized.
        Nop_namec                    s     | pdS )Nr   )get)r  fused_op_countr   r   r    rU   z1PhiOnnxModel.is_fully_optimized.<locals>.op_countr   r   r   r   r  r  r   r   r  r   zLayer Normalization not fusedzGelu (or FastGelu) not fusedz+Attention (or MultiHeadAttention) not fused)r  rF   r  debugwarning)r   r  r  r   gelu
layer_norm
is_perfectr   r  r   is_fully_optimized  s*   


zPhiOnnxModel.is_fully_optimized)NFrC   )r    r!   r"   r   r   r?   r   r   boolr  r  r  r   r   r   r@   r   r  O  s
    r  ),loggingr   typingr   r   numpyr   dynamo_onnx_helperr   fusion_baser   fusion_optionsr   r   fusion_skiplayernormr	   r
   fusion_utilsr   onnxr   r   r   r   r   
onnx_modelr   r    r  r   r$   r)   r+   r-   r2   r3   r4   r9   r   r  r$  r+  r0  r  r   r   r   r   <module>   s<   
 Z 4"!!  