o
    g                     @   s   d dl mZ d dlmZmZmZmZ d dlZd dl	m
Z
 d dlmZ d dlmZmZ d dlmZmZmZmZ d dlmZ eeZG d	d
 d
ZG dd de
ZdS )    )	getLogger)ListOptionalTupleUnionN)Fusion)AttentionMaskFormat)FusionUtilsNumpyHelper)	NodeProtoTensorProtohelpernumpy_helper)	OnnxModelc                   @   sN   e Zd ZdZdefddZdefddZdd	 Zd
d Z	de
de
fddZdS )AttentionMask:
    Fuse Attention subgraph into one Attention node.
    modelc                 C   s2   || _ i | _i | _t|| _tj| _| | _	d S N)
r   mask_indicemask_castedr	   utilsr   MaskIndexEndmask_formatget_opset_versionopset_version)selfr    r   `/var/www/visachat/venv/lib/python3.10/site-packages/onnxruntime/transformers/fusion_attention.py__init__   s   
zAttentionMask.__init__r   c                 C   s
   || _ d S r   )r   )r   r   r   r   r   set_mask_format!   s   
zAttentionMask.set_mask_formatc                 C   s*   || j v r|| j | ksJ || j |< d S r   )r   )r   mask
mask_indexr   r   r   set_mask_indice$   s   
zAttentionMask.set_mask_indicec                 C   s    t | jdks	J tt| jS Nr   )lenr   nextiter)r   r   r   r   get_first_mask)   s   zAttentionMask.get_first_maskinputreturnc              	   C   s`  | j tjkrd S || jv r| j| S | j|r!| j|\}}n
| j|\}}d}|r2|| j	|< | j tj
kr?|| j|< |S | jd}| jdk rltjd|g|g| jddd}|jtddgtd	d
g n7d}| j|d u r| jtj|tjdgdgdd tjd||g|g| jddd}|jtd	d
g | j| || j|< |S )NTr!      	ReduceSumMaskReduceSuminputsoutputsnameaxes   keepdimsr   ort_const_1_reduce_sum_axesFr0   	data_typedimsvalsraw)r   r   NoMaskr   r   find_graph_inputr   cast_graph_input_to_int32cast_input_to_int32r   r   create_node_namer   r   	make_node	attributeextendmake_attributeget_initializeradd_initializermake_tensorr   INT64add_node)r   r(   casted
input_name	cast_nodeoutput_namemask_index_node	axes_namer   r   r   process_mask-   sV   




$	
zAttentionMask.process_maskN)__name__
__module____qualname____doc__r   r   r   r   r"   r'   strrN   r   r   r   r   r      s    
r   c                )       sx  e Zd ZdZdddddgfdededed	ee d
edede	e
 f fddZdedeeef fddZdedeeef fddZdefddZde
fddZde
de
de
fddZde
de
de
e
ffdd Zd!e
d"e
d#e
fd$d%Zde
de
fd&d'Zd(ed)eedf d*eedf d+e
deedf f
d,d-Zd.ed/ed0ed(ed)eedf d*eedf dedeedf fd1d2Z	3	3	3	3	3	3	dDd.ed/eee
df d0eee
df d(ed)eedf d*eedf deded4e
d5e
de
de
de
d6e
d7e
d8edeedf f"d9d:Z	3	3	3	3	3		dEd;e
d.ed/ed0ed(ed)ed*ededed<e
d4e
d=e
de
de
d6e
d7e
d>ee d?edeedf f&d@dAZdBdC Z  ZS )FFusionAttentionr   NFSkipLayerNormalizationLayerNormalizationr   hidden_size	num_headsattention_maskuse_multi_head_attention!disable_multi_head_attention_biassearch_op_typesc           	         sh   |rdnd}t  ||| || _|| _|r|nt|| _|| _|| _d | _d| _	d| _
d | _d| _d S )NMultiHeadAttention	AttentionT)superr   rW   rX   r   rY   rZ   r[   mask_filter_valuenum_heads_warninghidden_size_warningshape_infershape_infer_done)	r   r   rW   rX   rY   rZ   r[   r\   attention_op_name	__class__r   r   r   m   s   

zFusionAttention.__init__concatr)   c                 C   s   t |jdkr;| j|jd }| j|jd }t|tjr;|jdkr;t|tjr;|jdkr;|d |d |d  fS | j| j	fS )aU  
        Detect num_heads and hidden_size from Concat node in the following subgraph:

        SkipLayerNormalization or EmbedLayerNormalization
                        /        |
                     MatMul    Shape
                        |        |
                       Add     Gather(indices=0)
                        |        |
                        |      Unsqueeze
                        |        |
                        |     Concat (*, -1, 12, 64)
                        |     /
                       Reshape
                          |
                       Transpose
                 r2   r   )
r$   r(   r   get_constant_value
isinstancenpndarraysizerX   rW   )r   rh   rX   	head_sizer   r   r   )get_num_heads_and_hidden_size_from_concat   s   



z9FusionAttention.get_num_heads_and_hidden_size_from_concat	reshape_qc                 C   sF  | j |jd }|du r3| j |d}|dur"|jdkr"| |S t|jd  d | j| j	fS t
|}t|dksJ|d dksJ|d dkrYtd	| d
 | j| j	fS |d }|d }|| }| jdkr|| jkr| jrtd| j d| d d| _| j	dkr|| j	kr| jrtd| j	 d| d d| _||fS )zDetect num_heads and hidden_size from a reshape node.

        Args:
            reshape_q (NodeProto): reshape node for Q

        Returns:
            Tuple[int, int]: num_heads and hidden_size
        r2   NConcatz is not initializer.ri   rj   r   rk   zq_shape_value=z7. Expected value are like [0, 0, num_heads, head_size].z--num_heads is z. Detected value is z. Using detected value.Fz--hidden_size is )r   rC   r(   
get_parentop_typerr   loggerdebugrX   rW   r
   to_arrayr$   ra   warningrb   )r   rs   q_shaperh   q_shape_valuerX   rq   rW   r   r   r   get_num_heads_and_hidden_size   s2   


$z-FusionAttention.get_num_heads_and_hidden_sizeadd_qkc                 C   s   | j s| jjdd| _d| _ | jd u rd S | j|jd }| j|jd }|d u s/|d u r:td| d d S ||krItd| d d S |jd S )	NT)updater   r2   zone of the inputs of z is Nonezthe shape of two inputs of z is not same)rd   r   infer_runtime_shaperc   get_edge_shaper(   rw   rx   )r   r~   input_0_shapeinput_1_shaper   r   r   get_add_qk_str   s   

zFusionAttention.get_add_qk_strc                    s    d t tfdd| j}t|dkrS t|dks J | jd}tjd fddt| j	D g|dd	}| j
| | j| j|< S )
N_maskc                    s   | j d  kS r#   )output)node)mask_output_namer   r   <lambda>   s    z0FusionAttention.reshape_add_qk.<locals>.<lambda>r2   r   rt   c                    s   g | ]} qS r   r   ).0_)r~   r   r   
<listcomp>   s    z2FusionAttention.reshape_add_qk.<locals>.<listcomp>r.   r/   r0   axis)listfilternodes_to_addr$   r   r>   r   r?   rangerX   appendthis_graph_namenode_name_to_graph_name)r   r~   concat_nodeconcat_node_nameconcat_add_qk_fp32r   )r~   r   r   reshape_add_qk   s    zFusionAttention.reshape_add_qkpast_kpast_vc                 C   s   | j d}| j d}|d dd}|d dd}tjd|g|g|dgd}tjd|g|g|dgd}| j| | j| | j| j|< | j| j|< | j d}	|dd	ddd
d}
tjd||g|
g|	dd}| j| | j| j|	< |
S )zConcatenate past_k and past_v inputs to create past_kv input.

        Args:
            past_k (str): name of past K value
            past_v (str): name of past V value

        Returns:
            kv_output_name (str): name of past KV value
        	Unsqueeze_5d.r   r   )r.   r/   r0   r1   rt   z.valuez.kv_value_kvr   )	r   r>   replacer   r?   r   r   r   r   )r   r   r   unsqueeze_k_nameunsqueeze_v_name	k_5d_name	v_5d_namek_5dv_5dr   kv_output_name	concat_kvr   r   r   r      sD   		zFusionAttention.concat_kvc                 C   s   d}| j |}|du r%tjtjdd| j jgdd|d}| j || j | j 	d}| j 	d}|d	 
d
d}|d	 
d
d}tjd||g|g|d}	tjd||g|g|d}
| j|	 | j|
 | j| j|< | j| j|< ||fS )ah  Reshape past_k and past_v from 4D to 3D to use as inputs for multihead attention node.

        Args:
            past_k (str): name of past K value of shape 4D
            past_v (str): name of past V value of shape 4D

        Returns:
            k_3d (str): name of past K value of shape 3D
            v_3d (str): name of past V value of shape 3D
        kv_4d_to_3dNr   int64dtyper0   Reshape_3dr   r   r-   )r   rC   r   
from_arrayrn   arrayrW   rD   r   r>   r   r   r?   r   r   r   )r   r   r   new_dims_namenew_dimsreshape_k_namereshape_v_name	k_3d_name	v_3d_namek_3dv_3dr   r   r   
reshape_kv4  s8   zFusionAttention.reshape_kvpresent_k_namepresent_v_namekv_nodec                 C   s   d\}}| j |}| j |}|du r(tjtjddd|d}| j || j |du r@tjtjddd|d}| j || j | j d}| j d}	t	j
d||g|g|dd	}
t	j
d||g|g|	dd	}| j|
 | j| | j| j|< | j| j|	< dS )
a?  Split kv_node containing present KV values into separate present K and present V values.

        Args:
            present_k_name (str): name of output to store present K value in
            present_v_name (str): name of output to store present V value in
            kv_node (str): name of present KV values
        )index_0index_1Nr   r   r   r   r2   Gatherr   )r   rC   r   r   rn   r   rD   r   r>   r   r?   r   r   r   )r   r   r   r   k_indexv_indexk_dimv_dimgather_k_namegather_v_name	present_k	present_vr   r   r   split_kve  s:   	zFusionAttention.split_kvc           	      C   s   |d  dd}|d  dd}| jd}| jd}tjd|g|g|g dd}tjd|g|g|g dd}| j| | j| | j| j|< | j| j|< ||fS )a}  Transpose past_k and past_v from (B,N,P,H) to (B,P,N,H)

        Args:
            past_k (str): name of past K value of shape (B,N,P,H)
            past_v (str): name of past V value of shape (B,N,P,H)

        Returns:
            past_k_transpose (str): name of past K value of shape (B,P,N,H)
            past_v_transpose (str): name of past V value of shape (B,P,N,H)
        _transposedr   r   	Transpose)r   rj   r2   rk   )r.   r/   r0   perm)	r   r   r>   r   r?   r   r   r   r   )	r   r   r   past_k_transposepast_v_transposetranspose_k_nametranspose_v_nametranspose_ktranspose_vr   r   r   transpose_kv  s.   	zFusionAttention.transpose_kvq_addk_addv_addname_prefixc                 C   s   | j |jd p| j |jd }t|}t|}t|}|d ur<| j |jd p6| j |jd }	t|	}|d urW| j |jd pQ| j |jd }
t|
}tj|||fdd}dt|j	 }|d }| j
||j|g|d |S )Nr2   r   r   rk   	_qkv_biasr0   r6   r7   r8   )r   rC   r(   r
   ry   rn   
zeros_likestackprodshaperD   r6   )r   r   r   r   r   q_biasqbkbvbk_biasv_biasqkv_biasqkv_bias_dim	bias_namer   r   r   create_combined_qkv_bias  s(   $


$
$
z(FusionAttention.create_combined_qkv_biasq_matmulk_matmulv_matmulc           $      C   s"  | j d}|jd |jd kr|jd |jd ksJ | j |jd }	| j |jd }
| j |jd }t|	}t|
}t|}|j|jkrR|j|jksTJ |jd }tj|||fdd	|d| f}|d }| j
||	j|jd |jd g|d |d }tjd|jd |g|g|d	}| j| j|< |g}|d
 }| j
|tjdgdgdd |d }| j
|tjdg|gdd |d }| j
|tjdgd| gdd |d }| j
|tjdgd| gdd |d }| j
|tjdgdgdd |d }tjd||||g|g| j dd	}| j| j|j< |d }tjd||||g|g| j dd	}| j| j|j< |d }tjd||||g|g| j dd	}| j| j|j< |} |}!|}"||||g | jr|dur| j |jd rodnd}#tt| j |j|# r||jd|# < |} || | j| j|j< |dur| j |jd rdnd}#tt| j |j|# r||jd|# < |}!|| | j| j|j< |dur| j |jd rdnd}#tt| j |j|# r||jd|# < |}"|| | j| j|j< | j| | |!|"fS )a  Create packed QKV MatMul node before MultiHeadAttention node.
           This is for the scenario where an Attention node should be created but cannot be created
           because past_key and past_value are separate inputs and not one concatenated input.

        Args:
            q_matmul (NodeProto): name of MatMul from Q path - (batch_size, sequence_length, hidden_size)
            k_matmul (NodeProto): name of MatMul from K path - (batch_size, sequence_length, hidden_size)
            v_matmul (NodeProto): name of MatMul from V path - (batch_size, sequence_length, hidden_size)
            q_add (NodeProto): name of Add from Q path
            k_add (NodeProto): name of Add from K path
            v_add (NodeProto): name of Add from V path
            num_heads (int): number of heads

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        MatMulr   r2   r   rk   _qkv_weightr   _qkv_outr-   _q_start_indexFr5   _k_start_index_v_start_indexrj   _end_of_qkv_index_qkv_last_axisr   _q_outSlice_k_out_v_outN)r   r>   r(   rC   r
   ry   r   rn   r   reshaperD   r6   r   r?   r   r   r   rF   r0   rA   r[   anyr   r   )$r   r   r   r   r   r   r   rX   matmul_node_nameq_weightk_weightv_weightqwkwvwd
qkv_weightqkv_weight_nameqkv_matmul_output
qkv_matmul	qkv_nodesq_slice_namek_slice_namev_slice_nameend_of_qkv_nameqkv_last_axis_nameq_slice_outputq_slicek_slice_outputk_slicev_slice_outputv_sliceq_outputk_outputv_outputinitializer_inputr   r   r   create_packed_qkv_matmul_node  s   ,



"






 

 

 

z-FusionAttention.create_packed_qkv_matmul_node r   key_padding_maskr   r   
packed_qkvc              	   C   s  |dksJ |dkr|| dkrt d| d|  dS tdd | j jD }| jd}g }|rT| |||||||\}}}||j	d |j	d |j	d g ndt
|tu rt
|tu r| jru||j	d |j	d |j	d g nC||j	d |j	d |j	d g n1t
|tkrt
|tkr||v r||v r| jr||j	d ||g n||j	d ||g ndS | js| ||||}|| n|d |r|r||
|||g n|
s|r||
|g |	g}|r|r|||g tjd	|||d
}d|_|jtd|g |S )a[  Create a MultiHeadAttention node.

        Args:
            q_matmul (NodeProto): name of MatMul from Q path - (batch_size, sequence_length, hidden_size)
            k_matmul (NodeProto): name of MatMul from K path - (batch_size, sequence_length, hidden_size) or (batch_size, num_heads, past_sequence_length, head_size)
            v_matmul (NodeProto): name of MatMul from V path - (batch_size, sequence_length, hidden_size) or (batch_size, num_heads, past_sequence_length, head_size)
            q_add (NodeProto): name of Add from Q path
            k_add (NodeProto): name of Add from K path
            v_add (NodeProto): name of Add from V path
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            output (str): output name of MHA
            key_padding_mask (str): name of key padding mask
            add_qk (str): name of add after Q x K'
            past_k (str): name of past K value - (batch_size, num_heads, past_sequence_length, head_size)
            past_v (str): name of past V value - (batch_size, num_heads, past_sequence_length, head_size)
            present_k (str): name of present K value - (batch_size, num_heads, sequence_length, head_size)
            present_v (str): name of present V value - (batch_size, num_heads, sequence_length, head_size)
            packed_qkv (bool): whether to combine MatMuls from Q, K, V paths
                               Note: This is for the scenario where an Attention node should be created but cannot be created
                               because past_key and past_value are separate inputs and not one concatenated input.

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        r   input hidden size # is not a multiple of num of heads Nc                 S      g | ]}|j qS r   r   )r   r   r   r   r   r         zCFusionAttention.create_multihead_attention_node.<locals>.<listcomp>r^   r  r]   r-   com.microsoftrX   )rw   rx   setr   graphr(   r>   r  rA   r   typer   r[   rS   r   r   r   r?   domainr@   rB   )r   r   r   r   r   r   r   rX   rW   r   r  r~   r   r   r   r   r  graph_input_namesmha_node_name
mha_inputsr  r  r  r   mha_outputsmha_noder   r   r   create_multihead_attention_nodef  sX   -
$$$
z/FusionAttention.create_multihead_attention_noder!   r(   
add_qk_strscalecausalc           6      C   sf  |dksJ |	dkr|	| dkrt d|	 d|  dS d}|du r-|du r-|du r-d}| j|jd }| j|jd }| j|jd }d\}}}|r| j|jd p`| j|jd }| j|jd pr| j|jd }| j|jd p| j|jd }|r|r|r|sdS |du rt|jd  d	 dS t|}t|}t|}|j|jksJ |jd }|jd }|jd }||  kr|ksJ  J |	dkr|	|krt 	d
|	 d| d d} |j|jkrd} t
|jdd }!t
|jdd }"t
|jdd }#d}$| r%t
j|||fdd}%|!|" |# }$nt
j|||fdd}%d|! }$|rt|}&t|}'t|}(t
|&j})t
|'j}*t
|(j}+|)|*  krd|!ksgJ  J |+|#ksnJ | rt
j|&|'|(fdd},|)|* |+ }-nt
j|&|'|(fdd},d|) }-| jd}.| js| j|.d |j||$g|%d |r| j|.d |j|-g|,d | jr|rt d dS |jd |jd |jd |.d g}/|dur|/| tjd|/|g|.d}0nt|
|.d |r|.d ndg}/|dur|/| n|/d |o|}1|1r| ||}2|/|2 |dur4| |}3|1s/|/d |/|3 |g}4|rW|rW|dddddd}5|4|5 | |||5 tjd|/|4|.d}0d|0_|0jtd|g |r||0jtddg |dur|0jtd|g | r|0jtd|!|"|#gg | jdur|0jtd t| jg |0S )!a+  Create an Attention node.

        Args:
            mask_index (str): mask input
            q_matmul (NodeProto): MatMul node in fully connection for Q
            k_matmul (NodeProto): MatMul node in fully connection for K
            v_matmul (NodeProto): MatMul node in fully connection for V
            q_add (NodeProto): Add bias node in fully connection for Q
            k_add (NodeProto): Add bias node in fully connection for K
            v_add (NodeProto): Add bias node in fully connection for V
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            input (str): input name
            output (str): output name
            add_qk_str (str): name of Add node after Q x K'
            past_k (str): name of input for past K value
            past_v (str): name of input for past V value
            present_k (str): name of output to store present K value
            present_v (str): name of output to store present V value
            scale: scale before softmax
            causal: whether it is uni-directional mask.

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        r   r  r  NTFr2   )NNNzl is not an initializer. Please set do_constant_folding=True in torch.onnx.export to unblock attention fusionzInput hidden size (z3) is not same as weight matrix dimension of q,k,v (z:). Please provide a correct input hidden size or pass in 0r   rk   r^   r   r   r   zVMultiHeadAttention does not support relative_position_bias: cannot fuse the attention.r]   r-   r  z.key_keyr   r   r   rX   unidirectionalr,  qkv_hidden_sizesr`   ) rw   rx   r   rC   r(   printr
   ry   r   rz   rn   r   concatenater   r>   rZ   rD   r6   r   r   r   r?   r   r   r   r   r$  r@   rA   rB   r`   float)6r   r!   r   r   r   r   r   r   rX   rW   r(   r   r+  r   r   r   r   r,  r-  has_biasr   r   r   r   r   r   r   r  r  
qw_in_size
kw_in_size
vw_in_sizeis_qkv_diff_dimsqw_out_sizekw_out_sizevw_out_sizeqkv_weight_dimr  r   r   r   q_bias_shapek_bias_shapev_bias_shaper   r   attention_node_nameattention_inputsattention_nodepast_existspast_kvr   attention_outputs
present_kvr   r   r   create_attention_node  s  .
$$$








 












z%FusionAttention.create_attention_nodec           6      C   sz  |}|j dkr| j|dd}|d ur|}nd S | j|g dg d}d }|d ur2|\}}}	}
}n| j|g dg d}|d urI|\}}}
}nd S g }t|jD ]\}}||vr[qR||d jd kreqR|| qRt|dkrsd S |d }	 | j|d	d}|d ur||jd  }|d urt|d
kr|d }|j dkr|jd }n,d S |d urt|dkr|jd }nd S |j dkr|| }|D ]}|j dkr|jd }q	 || }|j dkrt|jdkr|jd }|| }dd |D }|	ddkrd S | j|g dg d}|d u rt
d d S |\}}}}d}d}d}g dg dfg dg dfg dg dfg dg dfg dg dfd}d }| D ],\}} | j|| d | d }|d u raqJ|d krhd!}|d"krod!}|d#krvd!} |d u rt
d$ d S d }!d }"d }#|r|\}}#}"}n|r|\}}!}#}"n|r|\}}}"n|\}}!}}"| j|"g dg d%}$|$d u r| j|"g d&g d'}$|$d u rt
d( d S |$d) }%|$d* }&|$d+ }'| j|"g dg d}(|(d u r
| j|"g d,g d-}(|(d u r
t
d. d S |(d* })|(d+ }*d }+d },|r:| j|#g d/g dfg d0g dfg d1g d2fg|\}}+}nS|rp| j|#g d3g d2fg d0g dfg|\}}+}|!d uro| |!},|,d u rot
d4|!  d S n|rtn| j|!g d5g d6fg d7g d8fg|\}}+}|s|+d u rt
d9 d S |st|+dkr|+d j d	kr| j|+d \}}-|-d:kr|-| _|jd |kr|'jd |kr|*jd |kr|s| j|+d+ jd nd }.|d u r|	n|
}/| |%\}0}1|0dks|1dkr	t
d; d S | |.|'|*||&|)||0|1||/jd |,}2|2d u r#d S | j|2 | j| j|2j< |d urs|jd }3d<|3 }4| jd=|3 tjdgtdd|0t|1|0 gdd>}5| j t!"d?|/jd |5jg|4gd@|3 | j |4|jd< | j#$|/|
|g | j#$| | j#$| j%s|$n|$d d+  | j#$| j%s|(n|(d d+  | j#$| j%s|n|d d+  d!| _&d S d S d S d S )ANrV   Addr   )rH  r   r   r   r   )NNr   r   r   )rH  Einsumr   r   )r2   Nr   r   r2   Mulrj      rU   ri   c                 S   r  r   )rv   )r   childr   r   r   r     r  z(FusionAttention.fuse.<locals>.<listcomp>r   rk   )r   r   rH  r   )r2   r   r   Nz&fuse_attention: failed to match v pathF)SoftmaxrH  Divr   )r   r   Nr   )rM  rH  rJ  r   )rM  Wherer   rN  )r   r   rj   r   )rM  rH  rO  r   )r   r   r   rj   )rM  rN  r   )r   r   r   )path1path2path3path4path5rR  TrS  rT  z'fuse_attention: failed to match qk path)r   r   r   N)rN  r   r   rH  r   )r   r   r   r   Nz&fuse_attention: failed to match q pathr   )r   r   r   rH  r   )r2   r   r   r   Nz&fuse_attention: failed to match k path)Expandr   Equal)rX  r   r   )CastrW  r   rX  )r   r   r   r   )rY  rX  r   r   z4fuse_attention: failed to verify shape inference of )rJ  SubrY  r   r   )Nr   r2   r   r   )rJ  rZ  r   r   )Nr   r2   r   z)fuse_attention: failed to match mask pathizmFailed to detect num_heads and hidden_size for Attention fusion. Please specify those parameters in argument.edge_modified_shape_modified_tensorr5   r   reshape_modified_)'rv   r   match_parentmatch_parent_path	enumerater(   r   r   r$   countrw   rx   itemsmatch_parent_pathsr   get_constant_inputr`   rY   rN   r}   rz   rG  r   r   r   r0   rD   r   rF   rn   r   intrG   r   r?   nodes_to_removerA   rZ   prune_graph)6r   normalize_nodeinput_name_to_nodesoutput_name_to_node
start_nodeadd_before_layernormr  einsum_noder   reshape_qkvtranspose_qkv
matmul_qkvother_inputs_ir(   
root_inputmul_before_layernormmul_childrenlayernorm_nodechildrenrL  parent_nodechildren_typesv_nodesadd_vmatmul_v
is_distillis_distill_addis_no_mask_attentionqk_pathsqk_nodeskvr~   	matmul_qkwhere_qkq_nodesrs   add_qmatmul_qk_nodesadd_kmatmul_k
mask_nodesr+  mul_valr!   attention_last_nodeq_num_headsq_hidden_sizenew_nodeunique_indexnew_edgeshape_tensorr   r   r   fuse  s  



















	





$
0 



	   
zFusionAttention.fuse)r  r  r  r  r  r  F)r  r  r  r  r  NF)rO   rP   rQ   rR   r   re  r   r   boolr   rS   r   r   r   rr   r}   r   r   r   r   r   r   r   r   r  r*  r3  rG  r  __classcell__r   r   rf   r   rT   h   s(   	)71-'






	
 

	


|	


 frT   )loggingr   typingr   r   r   r   numpyrn   fusion_baser   fusion_optionsr   fusion_utilsr	   r
   onnxr   r   r   r   
onnx_modelr   rO   rw   r   rT   r   r   r   r   <module>   s   V