o
    gc                     @   s   d dl mZ d dlmZmZmZmZmZ d dlm	Z	 d dl
mZ d dlmZmZmZ d dlmZ eeZG dd de	ZG d	d
 d
eZdS )    )	getLogger)DictListOptionalTupleUnion)Fusion)FusionUtils)	NodeProtoTensorProtohelper)	OnnxModelc                       s(  e Zd ZdZd2dedef fddZdeded	e	eef f fd
dZ
dedeeee f dedefddZdd Zdd Zdd Zdd Zdd Zdede	eed	ef f fddZ					d3deded ed!ed"ed	ef d#ee fd$d%Zd&d' Zd(d) Z		d4d*d+Zd,d- Zd.d/ Zd0d1 Z  ZS )5FusionEmbedLayerNoMaskz
    Fuse embedding layer into one node (EmbedLayerNormalization).
    It supports the following model types: BERT, DistilBert, ALBert.
    no maskmodeldescriptionc                    s<   t  |dddg| t|| _d | _d| _d | _d | _d S )NEmbedLayerNormalizationLayerNormalizationSkipLayerNormalizationF)super__init__r	   utilsshape_infershape_infer_done	attention
embed_node)selfr   r   	__class__ a/var/www/visachat/venv/lib/python3.10/site-packages/onnxruntime/transformers/fusion_embedlayer.pyr      s   

zFusionEmbedLayerNoMask.__init__addreturnNc                 C   sP   | j |dgdg}|d u rd S | j |dgdg}|d u r d S |d |d fS )NGatherr      )r   match_parent_path)r   r!   gather_0_pathgather_1_pathr   r   r    match_two_gather&   s   z'FusionEmbedLayerNoMask.match_two_gather	layernorminput_name_to_nodesis_distil_bertc           
      C   s  | j j|d|dd| _| jdurdS |jd |vrdS ||jd  }tdd |D }|g d	kr_|D ]+}|jd
kr^| j |g dg d}|dur^|d jd |jd kr^|d | _ dS q3t	|dkr|d jdkr|d jd |v r||d jd  }t	|dkr|d jdkr|d jd |v r||d jd  }	|	D ]}|jdkr|| _ dS qtdd |	D }|r|g dkr|g dkr|g dkrt
d dS dS |g dkr|g d	krt
d dS dS )a  Check that LayerNormalization has a child of Attention node or subgraph like Attention.

        Args:
            layernorm (NodeProto): LayerNormalization node
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            is_distil_bert (bool): whether it is DistilBert or not

        Returns:
            bool: whether there is Attention node or subgraph like Attention
        	AttentionF)	recursiveNTr   c                 S      g | ]}|j qS r   op_type.0childr   r   r    
<listcomp>K       zCFusionEmbedLayerNoMask.check_attention_subgraph.<locals>.<listcomp>)MatMulr6   r6   r   r   )Addr6   MultiHeadAttentionr6   )NNr   r      r$   r6   r7   c                 S   r.   r   r/   r1   r   r   r    r4   h   r5   )r6   r6   r6   Shaper   )r7   r6   r6   r6   r;   r;   )r7   r6   r6   r6   r;   z<No Attention like subgraph in children of LayerNormalization)r7   r6   r6   r6   )r   find_first_child_by_typer   outputsortedr0   r%   inputcross_attentionlenloggerdebug)
r   r)   r*   r+   childrenchildren_typesnodepath1grandchildrennodesr   r   r    check_attention_subgraph1   sZ   

 
,


z/FusionEmbedLayerNoMask.check_attention_subgraphc                 C   s  | j |ddgddg}|du r"| j |g dg d}|du r"dS |d |d	 }}|jd |kr4dS | j |g d
g dfg dg dfg|\}}}|du rSdS |d }	| j|	ddrg| j|	ddsidS |d }
| j|
ddswdS |d	 }|jd |krdS dS )az    Match position embedding path from input_ids to Gather for DistilBert.

        Pattern is like the following:
                 (input_ids)
                      |
                     Shape
                       |                          |    Gather (indices=1)
                       |       |
                       |      Cast (optional)
                       |       |
                       |      Range (start=0, end=*, delta=1)
                       |       |
                       |    Unsqueeze
                       |    /
                      Expand
                        |
                      Gather
        Expandr;   r$   N)rK   WhereReshaper;   )r$   r$   r:   r   Fr   r9   )	UnsqueezeRangeCastr#   r;   )r   r   r$   r   r   )rN   rO   r#   r;   )r   r   r$   r   r:   T)r   r%   r?   match_parent_pathsr   check_node_input_value)r   position_embedding_gather	input_idsoutput_name_to_noderG   expandshape_path2
range_nodegather_node
shape_noder   r   r    #match_position_embedding_distilbert   sD   
z:FusionEmbedLayerNoMask.match_position_embedding_distilbertc                 C   s   dS )aY  Match position embedding path from input_ids to Gather for Roberta.

        Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id):
          (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Mul -- Cast(to=7) -- Add(B=1) -- Cast(to=7)* --> Gather
                                                |                              ^
                                                V                              |
                                                +------------------------------+

        Roberta new pattern from transformers v4.9:
           (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Add(B=0) -- Mul -- Cast(to=7) -- Add(B=1) --> Gather
                                                |                                           ^
                                                V                                           |
                                                +-------------------------------------------+

        start_node = position_embedding_gather
        start_index = 1

        # match optional Cast node.
        parent = self.model.get_parent(start_node, start_index, output_name_to_node)
        if parent is None:
            return
        if parent.op_type == "Cast":
            if OnnxModel.get_node_attribute(parent, "to") != 7:
                return
            start_node = parent
            start_index = 0

        i, path, return_indices = self.model.match_parent_paths(
            start_node,
            [ (['Add', 'Cast', 'Mul', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0]),
              (['Add', 'Cast', 'Mul', 'Add', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0, 0])],
            output_name_to_node)

        if path is not None:
            # constant input of Add shall be 1.
            i, value = self.model.get_constant_input(path[0])
            if value != 1:
                return False

            _, self.padding_word_id = self.model.get_constant_input(path[-1])

            return input_ids == path[-1].input[0]
        Fr   r   rT   rU   rV   r   r   r     match_position_embedding_roberta   s   -z7FusionEmbedLayerNoMask.match_position_embedding_robertac                 C   s  | j |ddgddg|}|du rdS |\}}| j |jd }|durTt|jdkrT|jd dkrT| j|ddgrT| j|ddgrTt|jd	ksV| j|d	dgsVdS | j  }|d
k rjt	
|ddgsidS n| j|ddgsudS | j |d|}	|	du rdS |	jdkr| j|	ddsdS | j |	d|}
n|	}
|
du s|
jdkrdS | j|
ddsdS | j |
d|}|du s|jdkrdS ||jd kS )a	    Match position embedding path from input_ids to Gather for BERT.

        BERT Embedding Layer Pattern:
                                    (input_ids)
                                   /                                          /          Shape
                                /              |
                              /              Gather (indices=1)
                             /                  |
                            /                  Add (optional, B=0)
                           /                    |
                        Gather (segment_ids) Unsqueeze (axes=0)
                           \        |           |
                            \     Gather      Slice (data[1,512], starts=0, ends=*, axes=1, steps=1)
                              \    /            |
                                Add          Gather
                                   \       /
                                      Add
                                       |
                                LayerNormalization
        SlicerN   r$   r:   NFr            axesr7   r#   r;   )r   r%   get_constant_valuer?   rA   rX   r   rS   get_opset_versionr	   check_node_attribute
get_parentr0   )r   rT   rU   rV   pathslice	unsqueezeslice_weightopset_versionrF   gatherrX   r   r   r    match_position_embedding_bert   sT    

z4FusionEmbedLayerNoMask.match_position_embedding_bertc                 C   s(   |  |||r	dS | |||rdS dS )NTF)rp   r^   r_   r   r   r    match_position_embedding:  s
   z/FusionEmbedLayerNoMask.match_position_embeddingc                 C   s  |j d }|r|j d nd}|j d }| js!| jjdd| _d| _| jdurs| j|}| j|}|r6|s8J t|dkrLt|dkrL|d |d ksYtd| d|  dS |rs| j	||sstd	| d
| j|  dS | j
|j d }	|	du st|	jdkrtd dS | j
|j d }
|
du st|
jdks|	jd |
jd krtd dS |r| j
|j d }|du st|jdks|	jd |jd krtd dS |	jd |
jd krtd|j d  d|	jd  d|j d  d|
jd   |rU|	jd |jd kr-td|j d  d|	jd  d|j d  d|jd   |
jd |jd krUtd|j d  d|
jd  d|j d  d|jd   dS )zXSanity check of embedding weights, and match hidden_size of weights and shape of inputs.r$   NT)updater:   z^Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: z vs FzYCannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: z != r   zICannot fuse EmbedLayerNormalization: word embedding table is not expectedzMCannot fuse EmbedLayerNormalization: position embedding table is not expectedzLCannot fuse EmbedLayerNormalization: segment embedding table is not expectedzword_embedding_table (z) size z <= position_embedding_table (z <= segment_embedding_table (zposition_embedding_table ()r?   r   r   infer_runtime_shaper   get_edge_shaperA   rB   infocompare_shaperf   rX   warning)r   word_embedding_gathersegment_embedding_gatherrT   rU   segment_idsposition_idsinput_ids_shapeposition_ids_shapeword_embedding_tableposition_embedding_tablesegment_embedding_tabler   r   r    check_embeddingH  sj   





222z&FusionEmbedLayerNoMask.check_embedding
input_namec                 C   sd   d}| j |}|dur&|jjjtjkr | j|\}}||fS |}||fS | j|\}}||fS )a  Cast a graph input or node input to int32.

        Args:
            input_name (str): name of graph input or node input

        Returns:
            A tuple of casted input name and the cast node.
            int32_output (str): If input is int32, it is the input name, Otherwise it is output name of Cast node.
            input_cast_node (Union[None, NodeProto]): Cast node. It could be None if input is int32.
        N)	r   find_graph_inputtypetensor_type	elem_typer   INT32r   cast_input_to_int32)r   r   input_cast_nodegraph_inputint32_outputr   r   r    cast_to_int32  s   z$FusionEmbedLayerNoMask.cast_to_int32FrU   rx   rT   ry   r{   c	                 C   s  g }	|  |\}}
| jd}|jdkr|jd }|jd }n
|jd }|jd }d}|durL|  |jd \}}
|||jd |jd |jd ||g}n|d|jd |jd d||g}|durp|d |  |\}}
|| |d	 |d
 g}|r|dur|n|d }|| tjd|||d}d|_|j	D ]}|j
dkr|j	|g qt|j	dkr|j	tddg |	| |	D ]	}| j| j|j
< q| j|	 || _|S )ag  Create an EmbedLayerNormalization node. Note that segment embedding is optional.

        Args:
            input_ids (str): input_ids for word embeddings
            layernorm (NodeProto): LayerNormalization or SkipLayerNormalization node.
            word_embedding_gather (NodeProto): the Gather node for word embedding
            position_embedding_gather (NodeProto): the Gather node for position embedding
            segment_embedding_gather (Union[None, NodeProto]): the Gather node for segment embedding, or None.

        Returns:
            NodeProto: the EmbedLayerNormalization node created.
        r   r   r$   r:   rb   Nr    _output_dummy_mask_index_embedding_sum)outputsnamezcom.microsoftepsilong-q=)r   r   create_node_namer0   r?   appendr   	make_nodedomain	attributer   extendrA   make_attributethis_graph_namenode_name_to_graph_namenodes_to_addr   )r   rU   r)   rx   rT   ry   r{   embedding_sum_outputembedding_sum_namer   rY   	node_namegammabetaembed_node_inputsrz   embed_node_outputsr   r   attrF   r   r   r    create_fused_node  sl   










z(FusionEmbedLayerNoMask.create_fused_nodec                 C   s$   | j |jd |jd  d| _d S )Nr   T)r   replace_input_of_all_nodesr=   prune_graph)r   r)   r   r   r   r    finish_fusion  s   
z$FusionEmbedLayerNoMask.finish_fusionc                 C   s*   |j dkot|jdkot|jd dkS )Nr   rb   r   )r0   rA   r=   )r   rF   r   r   r    "is_skip_layer_norm_with_sum_output  s   *z9FusionEmbedLayerNoMask.is_skip_layer_norm_with_sum_outputc              
   C   sx  |  |}|d u rdS |\}}|jd }	|jd }
| j||dds#dS | |d |s,dS |jdkrP| |}d}|}|rA|jd nd }|d uoN| j|d u}n@|}|jdkrYdnd}t	|j|krg|j| nd }|d uot| j|d u}|o||v ot	|| dk}|d uo|jdkp|p|}| j
|	|||||
||r|nd d}|rd	|j|< |s| j||jd
  | || dS )NFr$   r+   r   rb   r7   r   )r   r   _no_use__to_be_removed_r:   T)r(   r?   rJ   r   r0   r   r=   r   find_graph_outputrA   r   r   r   )r   r)   add_before_layernormr*   rV   optional_segment_gather
two_gatherrx   rT   rU   r{   need_embedding_sum_outputsum_output_indexnode_with_sum_output
sum_outputis_sum_graph_outputis_sum_used_by_multiple_nodesr   r   r   r    	fuse_gpt2  sX   







z FusionEmbedLayerNoMask.fuse_gpt2c           
      C   s   |  |}|du rdS |\}}|jd }| j||ddsdS | |||s'dS | |d|s0dS | ||||d}	| ||	 dS )a  Fuse embedding layer for DistilBert
        Args:
            layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
            add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
        NFr$   Tr   )r(   r?   rJ   rq   r   r   r   )
r   r)   r   r*   rV   r   rx   rT   rU   r   r   r   r    fuse_distilbertd  s    


z&FusionEmbedLayerNoMask.fuse_distilbertc                 C   s   | j |dgdg}|du rdS | |d }|du rdS |\}}|jd }	| j||dds0dS | j |dgdg}
|
du r@dS |
d }| ||	|sZ| ||	|sTdS |}|}|}| |||scdS | |	||||}| || dS )	a  Fuse embedding layer for Bert
        Args:
            layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization
            add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself
            input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes
            output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes
        r7   r   NFr$   r   r#   T)	r   r%   r(   r?   rJ   rq   r   r   r   )r   r)   r   r*   rV   add_2_gatherr   rx   ry   rU   position_embedding_pathrT   tempr   r   r   r    	fuse_bert  s>   	
z FusionEmbedLayerNoMask.fuse_bertc           	      C   s  | j |dgdg}|jdkr|d u rd S |d }d }nP| j |dgdg}| j |dgdg}|d u rG|d urG|d u r>d S |d }|d }n%|d urh|d u rh| j |dgdg}|d u r_d S |d }|d }n|}d }| |||||rwd S | ||||rd S | ||||rd S d S )Nr7   r   r   r#   r$   )r   r%   r0   r   r   r   )	r   rF   r*   rV   first_add_pathr   r   r&   r'   r   r   r    fuse  s<   



zFusionEmbedLayerNoMask.fuse)r   )NFN)N)__name__
__module____qualname____doc__r   strr   r
   r   r   r(   r   r   boolrJ   r^   r`   rp   rq   r   r   r   r   r   r   r   r   r   r   __classcell__r   r   r   r    r      sT    "
T>/H"J

b
Q)2r   c                       s8   e Zd Zd	def fddZdd Z fddZ  ZS )
FusionEmbedLayerNormalizationFr   c                    s   t  |d || _d S )Nz	with mask)r   r   use_mask_index)r   r   r   r   r   r    r     s   
z&FusionEmbedLayerNormalization.__init__c                 C   s   | j }t|jdkr|j| td|j n"t|jdkr1|jd s1||jd< td|j n	td|j d S |D ]$}td|j |jdkrS|jd |jd< q<|jd	kr`|jd |jd
< q<d S )N   zappend mask to %szreplace mask in %szskip mask in %szupdate mask_index in %sr,   r$   rb   r8   rc   )	r   rA   r?   r   rB   rC   r   r0   r=   )r   
mask_int32attention_nodesr   attention_noder   r   r    replace_mask  s"   


z*FusionEmbedLayerNormalization.replace_maskc                    sf  d | _ d | _d | _t ||| | jd u rd S | js'td | d d S | j d u r=| jd u r=td | d d S | j rG| j j	d }n| jj	d }|| }| j
|rkdd |D }| || | d d S ||vr|td	| | d d S || }|jd
v rdd |D }|jdkr|j	d }t|t|kr| j| | || | d d S d S )NzG--use_mask_index is not set: EmbedLayerNormalization will not have maskz EmbedLayerNormalization(no mask)zLEmbedLayerNormalization will not have mask since attention node is not foundrb   rc   c                 S      g | ]	}|j d v r|qS )r,   r8   r/   r2   rF   r   r   r    r4         z6FusionEmbedLayerNormalization.fuse.<locals>.<listcomp>z"EmbedLayerNormalization(with mask)zHEmbedLayerNormalization will not have mask since %s is not a node output)	ReduceSumrP   c                 S   r   r   r/   r   r   r   r    r4   %  r   r   r   )r   r@   r   r   r   r   rB   rC   increase_counterr?   r   r   r   r0   rA   nodes_to_remover   )r   rF   r*   rV   r   children_nodesr   r   r   r    r     sJ   









z"FusionEmbedLayerNormalization.fuse)F)r   r   r   r   r   r   r   r   r   r   r   r    r     s    r   N)loggingr   typingr   r   r   r   r   fusion_baser   fusion_utilsr	   onnxr
   r   r   
onnx_modelr   r   rB   r   r   r   r   r   r    <module>   s        X