# Profiling tool for ONNX models with ONNX Runtime: run a model with dummy inputs and
# profiling enabled, then parse the generated profile JSON into per-kernel, per-node
# and per-operator summaries.

import argparse
import json
import os

import numpy
import psutil
from onnx import TensorProto

NODES_TYPE_CONTAINING_SUBGRAPH = ["Scan", "Loop", "If"]


def parse_arguments(argv=None):
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-i", "--input", required=False, type=str, help="Set the input file for reading the profile results"
    )

    parser.add_argument(
        "-m",
        "--model",
        required=False,
        type=str,
        help="onnx model path to run profiling. Required when --input is not specified.",
    )

    parser.add_argument("-b", "--batch_size", required=False, type=int, default=1, help="batch size of input")

    parser.add_argument(
        "-s", "--sequence_length", required=False, type=int, default=32, help="sequence length of input"
    )

    parser.add_argument(
        "--past_sequence_length", required=False, type=int, default=1, help="past sequence length for gpt2"
    )

    parser.add_argument(
        "--global_length", required=False, type=int, default=1, help="number of global tokens for longformer"
    )

    parser.add_argument(
        "--samples",
        required=False,
        type=int,
        default=1000,
        help="number of samples to test. Set it large enough to reduce the variance of performance result.",
    )

    parser.add_argument(
        "--threshold",
        required=False,
        type=float,
        default=0.01,
        help="Threshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.",
    )

    parser.add_argument("--thread_num", required=False, type=int, default=-1, help="number of threads to use")

    parser.add_argument(
        "--input_ids_name", required=False, type=str, default=None, help="input name for input IDs, for bert"
    )

    parser.add_argument(
        "--segment_ids_name", required=False, type=str, default=None, help="input name for segment IDs, for bert"
    )

    parser.add_argument(
        "--input_mask_name", required=False, type=str, default=None, help="input name for attention mask, for bert"
    )

    parser.add_argument(
        "--dummy_inputs",
        required=False,
        default="default",
        choices=["bert", "gpt2", "longformer", "default"],
        help="Type of model inputs. The default will create dummy inputs with ones.",
    )

    parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="use GPU")
    parser.set_defaults(use_gpu=False)

    parser.add_argument("--provider", required=False, type=str, default="cuda", help="Execution provider to use")

    parser.add_argument(
        "--basic_optimization",
        required=False,
        action="store_true",
        help="Enable only basic graph optimizations. By default, all optimizations are enabled in OnnxRuntime",
    )
    parser.set_defaults(basic_optimization=False)

    parser.add_argument(
        "--kernel_time_only",
        required=False,
        action="store_true",
        help="Only include the kernel time and no fence time",
    )
    parser.set_defaults(kernel_time_only=False)

    parser.add_argument("-v", "--verbose", required=False, action="store_true")
    parser.set_defaults(verbose=False)

    return parser.parse_args(argv)


def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_num, all_inputs):
    from benchmark_helper import create_onnxruntime_session

    session = create_onnxruntime_session(
        onnx_model_path,
        use_gpu,
        provider,
        enable_all_optimization=not basic_optimization,
        num_threads=thread_num,
        enable_profiling=True,
    )

    for inputs in all_inputs:
        _ = session.run(None, inputs)

    profile_file = session.end_profiling()
    return profile_file


def load_profile_json(profile_file):
    print(f"loading profile output {profile_file} ...")

    with open(profile_file, "r") as opened_file:
        sess_time = json.load(opened_file)

    assert isinstance(sess_time, list)
    return sess_time
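# Note: the example below is illustrative and not part of the original script. A record in the
# profile JSON returned by load_profile_json looks roughly like the following; only the keys
# ("cat", "name", "dur", "args", "op_name", "provider") are taken from the parsing code below,
# the values are made up:
#
#   {"cat": "Node", "name": "Add_1_kernel_time", "dur": 15,
#    "args": {"op_name": "Add", "provider": "CPUExecutionProvider"}}
#
# parse_kernel_results uses records with cat "Kernel", while parse_node_results and
# group_node_results use records with cat "Node" whose names end with "_kernel_time",
# "_fence_before" or "_fence_after".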
def parse_kernel_results(sess_time, threshold=0):
    """Parse profile data and output kernel timing - top expensive kernels, and kernel time grouped by operator.

    Args:
        sess_time (List[Dict]): profile data
        threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.

    Returns:
        List[str]: lines of string for output.
    """
    kernel_name_to_op_name = {}
    kernel_time = {}
    kernel_freq = {}
    total = 0
    session_init = False
    for item in sess_time:
        # Skip records before the session_initialization event.
        if item["cat"] == "Session" and item["name"] == "session_initialization":
            session_init = True
        if not session_init:
            continue

        if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]:
            kernel_name = item["name"]

            op_name = item["args"]["op_name"]
            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
                continue

            # Kernels without op_name (like memory copy) are grouped by kernel name.
            if not op_name:
                op_name = f"({kernel_name})"

            if kernel_name in kernel_time:
                kernel_time[kernel_name] += item["dur"]
                kernel_freq[kernel_name] += 1
            else:
                kernel_time[kernel_name] = item["dur"]
                kernel_freq[kernel_name] = 1

            kernel_name_to_op_name[kernel_name] = op_name
            total += item["dur"]

    if not kernel_time:
        return ["No kernel record found!"]

    # Output kernels with run time ratio >= threshold, sorted by duration in descending order.
    lines = []
    lines.append(f"\nTop expensive kernels with Time% >= {threshold * 100:.2f}:")
    lines.append("-" * 64)
    lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
    for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
        ratio = duration / total
        if ratio < threshold:
            continue

        calls = kernel_freq[kernel_name]
        avg_time = duration / float(calls)
        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}")

    # Group kernel time by operator.
    op_time = {}
    for kernel_name, op_name in kernel_name_to_op_name.items():
        duration = kernel_time[kernel_name]
        if op_name in op_time:
            op_time[op_name] += duration
        else:
            op_time[op_name] = duration

    lines.append("\nGroup kernel time by operator:")
    lines.append("-" * 64)
    lines.append("Total(μs)\tTime%\tOperator")
    for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True):
        ratio = duration / total
        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}")

    return lines


def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
    """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.

    Args:
        sess_time (List[Dict]): profile data
        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
        threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.

    Returns:
        List[str]: lines of string for output.
    """
    node_name_list = []
    node_time = {}
    node_freq = {}
    node_provider = {}
    total = 0
    for item in sess_time:
        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
            node_name = (
                item["name"]
                .replace("_kernel_time", "")
                .replace("_fence_before", "")
                .replace("_fence_after", "")
            )

            if "provider" in item["args"]:
                if item["args"]["provider"] == "CPUExecutionProvider":
                    device = "CPU"
                elif item["args"]["provider"] == "CUDAExecutionProvider":
                    device = "CUDA"
                elif item["args"]["provider"] == "DmlExecutionProvider":
                    device = "DML"

                if node_name not in node_provider:
                    node_provider[node_name] = device
                else:
                    assert node_provider[node_name] == device
            elif kernel_time_only:
                continue

            op_name = item["args"]["op_name"]
            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
                continue

            if node_name in node_time:
                node_time[node_name] += item["dur"]
                node_freq[node_name] += 1
            else:
                node_time[node_name] = item["dur"]
                node_freq[node_name] = 1
                node_name_list.append(node_name)

            total += item["dur"]

    # Output nodes in the original order.
    lines = [
        "\nNodes in the original order:",
        "-" * 64,
        "Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode",
    ]
    before_percentage = 0.0
    for node_name in node_name_list:
        duration = node_time[node_name]
        calls = node_freq[node_name]
        avg_time = duration / float(calls)
        percentage = (duration / total) * 100.0
        provider = node_provider.get(node_name, "")
        before_percentage += percentage
        lines.append(
            f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t"
            f"{provider:8s}\t{node_name}"
        )

    # Output nodes with run time ratio >= threshold, sorted by duration in descending order.
    lines.append(f"\nTop expensive nodes with Time% >= {threshold * 100:.2f}:")
    lines.append("-" * 64)
    lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
    for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
        ratio = duration / total
        if ratio < threshold:
            continue

        calls = node_freq[node_name]
        avg_time = duration / float(calls)
        percentage = (duration / total) * 100.0
        provider = node_provider.get(node_name, "")
        lines.append(
            f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}"
        )

    return lines


def group_node_results(sess_time, kernel_time_only, use_gpu):
    """Group results by operator name.

    Args:
        sess_time (List[Dict]): profile data
        kernel_time_only (bool): Only include items for kernel time.
        use_gpu (bool): GPU is used in profiling or not.

    Returns:
        List[str]: lines of string for output.
    """
    op_kernel_time = {}
    op_kernel_records = {}
    total_kernel_time = 0

    provider_op_kernel_time = {}
    provider_op_kernel_records = {}
    provider_kernel_time = {}

    op_fence_time = {}
    total_fence_time = 0

    provider_counter = {}
    for item in sess_time:
        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
            op_name = item["args"]["op_name"]

            # Nodes with subgraph are not grouped here.
            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
                continue

            if "provider" not in item["args"]:
                # Records without a provider are fence events.
                if "fence" in item["name"]:
                    if op_name in op_fence_time:
                        op_fence_time[op_name] += item["dur"]
                    else:
                        op_fence_time[op_name] = item["dur"]
                    total_fence_time += item["dur"]
                continue

            provider = item["args"]["provider"]
            if provider in provider_counter:
                provider_counter[provider] += 1
            else:
                provider_counter[provider] = 1

            key = f"{provider}:{op_name}"
            if key in provider_op_kernel_time:
                provider_op_kernel_time[key] += item["dur"]
                provider_op_kernel_records[key] += 1
            else:
                provider_op_kernel_time[key] = item["dur"]
                provider_op_kernel_records[key] = 1

            if provider in provider_kernel_time:
                provider_kernel_time[provider] += item["dur"]
            else:
                provider_kernel_time[provider] = item["dur"]

            if op_name in op_kernel_time:
                op_kernel_time[op_name] += item["dur"]
                op_kernel_records[op_name] += 1
            else:
                op_kernel_time[op_name] = item["dur"]
                op_kernel_records[op_name] = 1

            total_kernel_time += item["dur"]

    lines = ["", "Grouped by operator"]
    lines.append("-" * 64)
    lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator")
    for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True):
        fence_time = op_fence_time.get(op_name, 0)
        kernel_time_ratio = kernel_time / total_kernel_time
        total_time = kernel_time + fence_time
        time_ratio = total_time / (total_kernel_time + total_fence_time)
        kernel_calls = op_kernel_records[op_name]
        avg_kernel_time = kernel_time / kernel_calls
        lines.append(
            f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t"
            f"{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}"
        )

    lines += ["", "Grouped by provider + operator"]
    lines.append("-" * 64)
    lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator")
    for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True):
        parts = key.split(":")
        provider = parts[0]
        op_name = parts[1]
        short_ep = provider.replace("ExecutionProvider", "")
        calls = provider_op_kernel_records[key]
        avg_kernel_time = kernel_time / calls
        provider_time_ratio = kernel_time / provider_kernel_time[provider]
        lines.append(
            f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t"
            f"{short_ep:8s}\t{op_name}"
        )

    return lines


def get_dim_from_type_proto(dim):
    return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) == str else None


def get_shape_from_type_proto(type_proto):
    return [get_dim_from_type_proto(d) for d in type_proto.tensor_type.shape.dim]


def create_dummy_inputs(onnx_model, batch_size, sequence_length, samples):
    """Create dummy inputs for ONNX model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        samples (int): number of samples

    Returns:
        List[Dict]: list of inputs
    """
    dummy_inputs = {}
    for graph_input in onnx_model.get_graph_inputs_excluding_initializers():
        shape = get_shape_from_type_proto(graph_input.type)
        symbol_dims = []
        for i, dim in enumerate(shape):
            if isinstance(dim, str):
                symbol_dims.append(i)

        # At most two symbolic dimensions are supported: batch_size and sequence_length.
        if len(symbol_dims) > 2:
            return None
        if len(symbol_dims) > 0:
            shape[symbol_dims[0]] = batch_size
        if len(symbol_dims) > 1:
            shape[symbol_dims[1]] = sequence_length

        elem_type = graph_input.type.tensor_type.elem_type
        assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
        data_type = (
            numpy.float32
            if elem_type == TensorProto.FLOAT
            else (numpy.int64 if elem_type == TensorProto.INT64 else numpy.int32)
        )
        data = numpy.ones(shape, dtype=data_type)
        dummy_inputs[graph_input.name] = data

    all_inputs = [dummy_inputs for _ in range(samples)]
    return all_inputs


def create_bert_inputs(
    onnx_model,
    batch_size,
    sequence_length,
    samples,
    input_ids_name=None,
    segment_ids_name=None,
    input_mask_name=None,
):
    """Create dummy inputs for BERT model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        samples (int): number of samples
        input_ids_name (str, optional): Name of graph input for input IDs. Defaults to None.
        segment_ids_name (str, optional): Name of graph input for segment IDs. Defaults to None.
        input_mask_name (str, optional): Name of graph input for attention mask. Defaults to None.

    Returns:
        List[Dict]: list of inputs
    """
    from bert_test_data import find_bert_inputs, generate_test_data

    input_ids, segment_ids, input_mask = find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name)

    all_inputs = generate_test_data(
        batch_size,
        sequence_length,
        test_cases=samples,
        seed=123,
        verbose=False,
        input_ids=input_ids,
        segment_ids=segment_ids,
        input_mask=input_mask,
        random_mask_length=False,
    )

    return all_inputs


def create_gpt2_inputs(onnx_model, batch_size, sequence_length, past_sequence_length, samples):
    """Create dummy inputs for GPT-2 model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        past_sequence_length (int): past sequence length
        samples (int): number of samples

    Raises:
        RuntimeError: symbolic is not supported. Use the tool convert_to_onnx.py to export ONNX model instead.

    Returns:
        List[Dict]: list of inputs
    """
    # Symbolic dimension names that are supported for GPT-2 inputs.
    symbols = {
        "batch_size": batch_size,
        "seq_len": sequence_length,
        "past_seq_len": past_sequence_length,
        "total_seq_len": sequence_length + past_sequence_length,
    }

    dummy_inputs = {}
    for graph_input in onnx_model.get_graph_inputs_excluding_initializers():
        shape = get_shape_from_type_proto(graph_input.type)
        for i, dim in enumerate(shape):
            if isinstance(dim, str):
                if dim not in symbols:
                    raise RuntimeError(f"symbol is not supported: {dim}")
                shape[i] = symbols[dim]

        elem_type = graph_input.type.tensor_type.elem_type
        assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
        data_type = (
            numpy.float32
            if elem_type == TensorProto.FLOAT
            else (numpy.int64 if elem_type == TensorProto.INT64 else numpy.int32)
        )
        data = numpy.ones(shape, dtype=data_type)
        dummy_inputs[graph_input.name] = data

    all_inputs = [dummy_inputs for _ in range(samples)]
    return all_inputs


def create_longformer_inputs(onnx_model, batch_size, sequence_length, global_length, samples):
    """Create dummy inputs for Longformer model.

    Args:
        onnx_model (OnnxModel): ONNX model
        batch_size (int): batch size
        sequence_length (int): sequence length
        global_length (int): number of global tokens
        samples (int): number of samples

    Raises:
        RuntimeError: symbolic is not supported. Use the tool convert_longformer_to_onnx.py to export ONNX model instead.

    Returns:
        List[Dict]: list of inputs
    """
    symbols = {"batch_size": batch_size, "sequence_length": sequence_length}

    dummy_inputs = {}
    for graph_input in onnx_model.get_graph_inputs_excluding_initializers():
        shape = get_shape_from_type_proto(graph_input.type)
        for i, dim in enumerate(shape):
            if isinstance(dim, str):
                if dim not in symbols:
                    raise RuntimeError(f"symbol is not supported: {dim}")
                shape[i] = symbols[dim]

        elem_type = graph_input.type.tensor_type.elem_type
        assert elem_type in [TensorProto.FLOAT, TensorProto.INT32, TensorProto.INT64]
        data_type = (
            numpy.float32
            if elem_type == TensorProto.FLOAT
            else (numpy.int64 if elem_type == TensorProto.INT64 else numpy.int32)
        )

        if "global" in graph_input.name:
            # The global attention input gets ones for the first global_length tokens.
            data = numpy.zeros(shape, dtype=data_type)
            data[:, :global_length] = 1
        else:
            data = numpy.ones(shape, dtype=data_type)
        dummy_inputs[graph_input.name] = data

    all_inputs = [dummy_inputs for _ in range(samples)]
    return all_inputs


def process_results(profile_file, args):
    profile_records = load_profile_json(profile_file)

    lines = parse_kernel_results(profile_records, args.threshold)

    lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold)

    lines += group_node_results(profile_records, args.kernel_time_only, args.use_gpu)

    return lines


def run(args):
    num_threads = args.thread_num if args.thread_num > 0 else psutil.cpu_count(logical=False)

    # Set the OMP thread count before onnxruntime gets imported (via benchmark_helper).
    if "OMP_NUM_THREADS" not in os.environ:
        os.environ["OMP_NUM_THREADS"] = str(num_threads)

    from onnx import load
    from onnx_model import OnnxModel

    onnx_model = OnnxModel(load(args.model))

    all_inputs = None
    if args.dummy_inputs == "bert":
        all_inputs = create_bert_inputs(
            onnx_model,
            args.batch_size,
            args.sequence_length,
            args.samples,
            args.input_ids_name,
            args.segment_ids_name,
            args.input_mask_name,
        )
    elif args.dummy_inputs == "gpt2":
        all_inputs = create_gpt2_inputs(
            onnx_model, args.batch_size, args.sequence_length, args.past_sequence_length, args.samples
        )
    elif args.dummy_inputs == "longformer":
        all_inputs = create_longformer_inputs(
            onnx_model, args.batch_size, args.sequence_length, args.global_length, args.samples
        )
    else:  # default
        all_inputs = create_dummy_inputs(onnx_model, args.batch_size, args.sequence_length, args.samples)

    profile_file = run_profile(
        args.model,
        args.use_gpu,
        args.provider,
        args.basic_optimization,
        args.thread_num,
        all_inputs,
    )

    return profile_file


if __name__ == "__main__":
    arguments = parse_arguments()
    print("Arguments", arguments)

    from benchmark_helper import setup_logger

    setup_logger(arguments.verbose)

    if not arguments.input:
        assert arguments.model, "requires either --model to run profiling or --input to read profiling results"
        profile_file = run(arguments)
    else:
        profile_file = arguments.input

    results = process_results(profile_file, arguments)

    for line in results:
        print(line)