"""Benchmarking the inference of pretrained transformer models.
    PyTorch/TorchScript benchmark is based on https://github.com/huggingface/transformers/blob/master/examples/benchmarks.py.
    One difference is that random input_ids is generated in this benchmark.

    For onnxruntime, this script will convert a pretrained model to ONNX, and optimize it when -o parameter is used.

    Example commands:
        Export all models to ONNX, optimize and validate them:
            python benchmark.py -b 0 -o -v -i 1 2 3
        Run OnnxRuntime on GPU for all models:
            python benchmark.py -g
        Run OnnxRuntime on GPU for all models with fp32 optimization:
            python benchmark.py -g -o
        Run OnnxRuntime on GPU with fp16 optimization:
            python benchmark.py -g -o -p "fp16"
        Run TorchScript on GPU for all models:
            python benchmark.py -e torchscript -g
        Run TorchScript on GPU for all models with fp16:
            python benchmark.py -e torchscript -g -p "fp16"
        Run OnnxRuntime and TorchScript on CPU for all models with quantization:
            python benchmark.py -e torchscript onnxruntime -p "int8" -o
        Run OnnxRuntime with the ROCM provider and graph optimization script:
            python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm
        Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support:
            python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm

    It is recommended to use run_benchmark.sh to launch benchmark.
"""

import argparse
import logging
import os
import timeit
from datetime import datetime

import numpy
import psutil
from benchmark_helper import (
    ConfigModifier,
    OptimizerInfo,
    Precision,
    create_onnxruntime_session,
    get_latency_result,
    inference_ort,
    inference_ort_with_io_binding,
    output_details,
    output_fusion_statistics,
    output_summary,
    setup_logger,
)
from fusion_options import FusionOptions
from huggingface_models import MODEL_CLASSES, MODELS
from onnx_exporter import (
    create_onnxruntime_input,
    export_onnx_model_from_pt,
    export_onnx_model_from_tf,
    load_pretrained_model,
)
from packaging import version
from quantize_helper import QuantizeHelper

logger = logging.getLogger("")

cpu_count = psutil.cpu_count(logical=False)

# Set the OMP environment variable before importing onnxruntime or torch.
if "OMP_NUM_THREADS" not in os.environ:
    os.environ["OMP_NUM_THREADS"] = str(cpu_count)

import torch  # noqa: E402
from transformers import AutoConfig, AutoTokenizer, LxmertConfig  # noqa: E402

def run_onnxruntime(
    use_gpu,
    provider,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    input_counts,
    optimizer_info,
    validate_onnx,
    cache_dir,
    onnx_dir,
    verbose,
    overwrite,
    disable_ort_io_binding,
    use_raw_attention_mask,
    model_fusion_statistics,
    model_source,
    enable_arm64_bfloat16_fastmath_mlas_gemm,
    args,
):
    import onnxruntime

    results = []
    if (
        use_gpu
        and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers())
        and ("ROCMExecutionProvider" not in onnxruntime.get_available_providers())
        and ("DmlExecutionProvider" not in onnxruntime.get_available_providers())
    ):
        logger.error(
            "Please install onnxruntime-gpu or onnxruntime-directml package instead of onnxruntime, "
            "and use a machine with GPU for testing gpu performance."
        )
        return results

    warm_up_repeat = 0
    if provider == "tensorrt":
        optimizer_info = OptimizerInfo.NOOPT
        warm_up_repeat = 5
        if "TensorrtExecutionProvider" not in onnxruntime.get_available_providers():
            logger.error(
                "Please install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance."
            )
            return results

    if optimizer_info == OptimizerInfo.NOOPT:
        logger.warning(
            f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied."
        )

    for model_name in model_names:
        all_input_names = MODELS[model_name][0]
        for num_inputs in input_counts:
            if num_inputs > len(all_input_names):
                break
            input_names = all_input_names[:num_inputs]
            args.model_type = MODELS[model_name][3]
            fusion_options = FusionOptions.parse(args)

            # Export the pretrained model to ONNX. Both exporters share the same
            # argument list; "pt" exports from a PyTorch checkpoint (under no_grad),
            # "tf" from a Tensorflow checkpoint.
            export_model_func = export_onnx_model_from_pt if "pt" in model_source else export_onnx_model_from_tf
            with torch.no_grad():
                onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_model_func(
                    model_name,
                    MODELS[model_name][1],
                    MODELS[model_name][2],
                    MODELS[model_name][3],
                    model_class,
                    config_modifier,
                    cache_dir,
                    onnx_dir,
                    input_names,
                    use_gpu,
                    precision,
                    optimizer_info,
                    validate_onnx,
                    use_raw_attention_mask,
                    overwrite,
                    model_fusion_statistics,
                    fusion_options,
                )
            if not is_valid_onnx_model:
                continue

            ort_session = create_onnxruntime_session(
                onnx_model_file,
                use_gpu,
                provider,
                enable_all_optimization=True,
                num_threads=num_threads,
                verbose=verbose,
                enable_mlas_gemm_fastmath_arm64_bfloat16=enable_arm64_bfloat16_fastmath_mlas_gemm,
            )
            if ort_session is None:
                continue

            ort_output_names = [node_arg.name for node_arg in ort_session.get_outputs()]
            output_buffers = []
            device = "cuda" if use_gpu else "cpu"
            config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
            max_last_state_size = numpy.prod(
                [max(batch_sizes), max(sequence_lengths), max(vocab_size, config.hidden_size)]
            )
            max_pooler_size = numpy.prod([max(batch_sizes), config.hidden_size])

            for batch_size in batch_sizes:
                if batch_size <= 0:  # batch_size 0 means export only
                    continue
                for sequence_length in sequence_lengths:
                    if max_sequence_length is not None and sequence_length > max_sequence_length:
                        continue

                    input_value_type = numpy.int64 if "pt" in model_source else numpy.int32
                    ort_inputs = create_onnxruntime_input(
                        vocab_size, batch_size, sequence_length, input_names, config, input_value_type
                    )
                    result_template = {
                        "engine": "onnxruntime",
                        "version": onnxruntime.__version__,
                        "providers": provider,
                        "device": device,
                        "optimizer": optimizer_info,
                        "precision": precision,
                        "io_binding": not disable_ort_io_binding,
                        "model_name": model_name,
                        "inputs": num_inputs,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }

                    if config.model_type in ["vit", "swin"]:
                        logger.info(
                            f"Run onnxruntime on {model_name} with input shape "
                            f"{[batch_size, 3, config.image_size, config.image_size]}"
                        )
                    else:
                        logger.info(
                            f"Run onnxruntime on {model_name} with input shape {[batch_size, sequence_length]}"
                        )

                    if disable_ort_io_binding:
                        result = inference_ort(
                            ort_session, ort_inputs, result_template, repeat_times, batch_size, warm_up_repeat
                        )
                    else:
                        # Get output sizes from a dummy ort run, then bind pre-allocated output buffers.
                        ort_outputs = ort_session.run(ort_output_names, ort_inputs)
                        output_buffer_max_sizes = [max_last_state_size]
                        for i in range(len(ort_outputs)):
                            if i == 2 and MODELS[model_name][3] == "gpt":
                                # past state output max size
                                output_buffer_max_sizes.append(max_pooler_size)
                            else:
                                output_buffer_max_sizes.append(max_last_state_size)

                        data_type = numpy.longlong if "pt" in model_source else numpy.intc
                        result = inference_ort_with_io_binding(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            ort_output_names,
                            ort_outputs,
                            output_buffers,
                            output_buffer_max_sizes,
                            batch_size,
                            device,
                            data_type,
                            warm_up_repeat,
                        )
                    logger.info(result)
                    results.append(result)

    return results
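
# Note: the per-run latency statistics merged into each result above come from the
# imported benchmark_helper.get_latency_result. A minimal sketch of that derivation,
# for orientation only (the authoritative field names and percentiles live in
# benchmark_helper.py):
#
#   latency_ms = sum(runtimes) / len(runtimes) * 1000.0
#   throughput_qps = batch_size * (1000.0 / latency_ms)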

def run_pytorch(
    use_gpu,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    torchscript,
    torch2,
    cache_dir,
    verbose,
):
    results = []
    if use_gpu and not torch.cuda.is_available():
        logger.error("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.")
        return results

    torch.set_grad_enabled(False)

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript, cache_dir=cache_dir)
        config_modifier.modify(config)
        model = load_pretrained_model(model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class)

        max_input_size = None
        if config.model_type in ["vit", "swin"]:
            # Image models take pixel inputs; sequence length is not used.
            sequence_lengths = [""]
        else:
            tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
            max_input_size = tokenizer.max_model_input_sizes.get(model_name, 1024)

        logger.debug(f"Model {model}")
        logger.debug(f"Number of parameters {model.num_parameters()}")

        if precision == Precision.FLOAT16:
            model.half()

        device = torch.device("cuda:0" if use_gpu else "cpu")
        model.to(device)

        if precision == Precision.INT8:
            model = QuantizeHelper.quantize_torch_model(model)

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if config.model_type in ["vit", "swin"]:
                    logger.info(
                        f"Run PyTorch on {model_name} with input shape "
                        f"{[batch_size, 3, config.image_size, config.image_size]}"
                    )
                    input_ids = torch.randn(
                        size=(batch_size, 3, config.image_size, config.image_size),
                        dtype=torch.float16 if precision == Precision.FLOAT16 else torch.float32,
                        device=device,
                    )
                else:
                    if max_input_size is not None and sequence_length > max_input_size:
                        continue
                    logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, sequence_length]}")
                    input_ids = torch.randint(
                        low=0,
                        high=config.vocab_size - 1,
                        size=(batch_size, sequence_length),
                        dtype=torch.long,
                        device=device,
                    )
                try:
                    inference = (
                        torch.jit.trace(model, input_ids) if torchscript else torch.compile(model) if torch2 else model
                    )
                    inference(input_ids)  # warm up

                    runtimes = timeit.repeat(lambda: inference(input_ids), repeat=repeat_times, number=1)

                    result = {
                        "engine": "torchscript" if torchscript else "torch2" if torch2 else "torch",
                        "version": torch.__version__,
                        "providers": "NA",
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    torch.cuda.empty_cache()

    return results


def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
    from functools import wraps

    import tensorflow as tf

    def run_func(func):
        @wraps(func)
        def run_in_eager_mode(*args, **kwargs):
            return func(*args, **kwargs)

        @wraps(func)
        @tf.function(experimental_compile=use_xla)
        def run_in_graph_mode(*args, **kwargs):
            return func(*args, **kwargs)

        if do_eager_mode is True:
            assert use_xla is False, (
                "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`."
            )
            return run_in_eager_mode
        else:
            return run_in_graph_mode

    return run_func

def run_tensorflow(
    use_gpu,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    cache_dir,
    verbose,
):
    results = []

    import tensorflow as tf

    tf.config.threading.set_intra_op_parallelism_threads(num_threads)

    if not use_gpu:
        tf.config.set_visible_devices([], "GPU")

    if use_gpu and not tf.test.is_built_with_cuda():
        logger.error("Please install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.")
        return results

    if use_gpu:  # restrict Tensorflow to only use the first GPU
        physical_devices = tf.config.list_physical_devices("GPU")
        try:
            tf.config.set_visible_devices(physical_devices[0], "GPU")
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
            tf.distribute.OneDeviceStrategy(device="/gpu:0")
        except RuntimeError as e:
            logger.exception(e)

    if precision == Precision.FLOAT16 or precision == Precision.INT8:
        raise NotImplementedError("Mixed precision is currently not supported.")

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
        config_modifier.modify(config)

        model = load_pretrained_model(
            model_name, config=config, cache_dir=cache_dir, custom_model_class=model_class, is_tf_model=True
        )

        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        max_input_size = tokenizer.max_model_input_sizes.get(model_name, 1024)

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if max_input_size is not None and sequence_length > max_input_size:
                    continue

                logger.info(f"Run Tensorflow on {model_name} with input shape {[batch_size, sequence_length]}")

                import random

                rng = random.Random()
                values = [rng.randint(0, config.vocab_size - 1) for i in range(batch_size * sequence_length)]
                input_ids = tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)

                try:
                    # Disable eager mode and XLA for better inference performance.
                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def encoder_forward():
                        return model(input_ids, training=False)

                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def encoder_decoder_forward():
                        return model(input_ids, decoder_input_ids=input_ids, training=False)

                    @run_with_tf_optimizations(do_eager_mode=False, use_xla=False)
                    def lxmert_forward():
                        feats = tf.random.normal([1, 1, config.visual_feat_dim])
                        pos = tf.random.normal([1, 1, config.visual_pos_dim])
                        return model(input_ids, visual_feats=feats, visual_pos=pos, training=False)

                    inference = encoder_forward
                    if config.is_encoder_decoder:
                        inference = encoder_decoder_forward
                    if isinstance(config, LxmertConfig):
                        inference = lxmert_forward

                    inference()  # warm up

                    runtimes = timeit.repeat(lambda: inference(), repeat=repeat_times, number=1)

                    result = {
                        "engine": "tensorflow",
                        "version": tf.__version__,
                        "providers": "NA",
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    from numba import cuda

                    device = cuda.get_current_device()
                    device.reset()

    return results
Ir   c                  C   s  t  } | jddddtg dtt ddt  d | jd	dd
tdddgdd | jddtd ttddt d | jddddtdgg ddd | jdddtt	j
dddd | jddtt	j
dddd | jdd dd!d"d# | jd$dtd d%d | jd&d'ttjttd(d) | jd*dd!d+d# | jd,dd!d-d# | jd.d/ttjttd0d) | jd1d2dd!d3d# | jd4d5dd d6d7 | jd8d9dd d:d7 | jd;d<dd d=d7 | jd>d?ddd
gtg d@dAdB | jdCdDddEtdFdG | jdHdIdtd
gdJ | jdKdLdtg dMdJ | jdNdd!dOd# | jddP | jdQdRddtdSgdTdU | jdVdtd dWd | jdXdd!dYd# | jddZ t|  |  }|S )[Nz-mz--modelsF+)zbert-base-casedzroberta-basegpt2z Pre-trained models in the list: z, )requirednargstypedefaultchoiceshelpz--model_sourcer%   r$   r'   zExport onnx from pt or tfz--model_classz!Model type selected in the list: )r   r   r   r  r  z-ez	--enginesr6   )r6   rQ   r   r   r   zEngines to benchmarkz-cz--cache_dir.cache_modelsz%Directory to cache pre-trained models)r   r   r   r  z
--onnx_dironnx_modelszDirectory to store onnx modelsz-gz	--use_gpu
store_truezRun on gpu device)r   actionr  z
--providerzExecution provider to usez-pz--precisionzfPrecision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization)r   r   r  r  z	--verbosezPrint more informationz--overwritezOverwrite existing modelsz-oz--optimizer_infozjOptimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_optz-vz--validate_onnxzValidate ONNX modelz-fz--fusion_csvz:CSV file for saving summary results of graph optimization.)r   r   r  z-dz--detail_csvz#CSV file for saving detail results.z-rz--result_csvz$CSV file for saving summary results.z-iz--input_counts)r%   r&   r#   zXNumber of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.)r   r   r   r   r  r  z-tz--test_timesd   z8Number of repeat times to get average inference latency.)r   r   r   r  z-bz--batch_sizes)r   r   r   z-sz--sequence_lengths)             @         z--disable_ort_io_bindingz=Disable running ONNX Runtime with binded inputs and outputs. )rs   z-nz--num_threadsr   zThreads to use)r   r   r   r   r  z--force_num_layersz%Manually set the model's layer numberz*--enable_arm64_bfloat16_fastmath_mlas_gemmzHEnable bfloat16 mlas gemm kernels on aarch64. Supported only for CPU EP )rw   )argparseArgumentParseradd_argumentr]   listr   keysjoinr   ospathr   FLOAT32r   BYSCRIPTintset_defaultsr   add_arguments
parse_args)parserrx   r,   r,   r0   parse_argumentsE  sV  

					

r  c                  C   s  t  } t| j | jtjkr| jstd d S | jtj	kr-| jr-| j
dvr-td d S t| jdkrCt| jd  d dv rCdg| _td	d
 | jD | _td|   tj| jsvzt| j W n tyu   td| j Y nw d| jv }d| jv }d| jv }d| jv }d| jv }|rttjtdk rtdtj  d S t| j}g }| jD ]}t| t tj!"  |s|s|r+| j#dgkrt$d |r|t%| j| j| j&|| j|| j'| j| j(dd| j| j7 }|r|t%| j| j| j&|| j|| j'| j| j(dd| j| j7 }|r+|t%| j| j| j&|| j|| j'| j| j(dd| j| j7 }|rG|t)| j| j| j&|| j|| j'| j| j(| j| j7 }i }	|rz4| j* }
|t+| j| j
| j| j&|| j|| j'| j| j(| j#| j,| j-| j| j.| j| j/| j0|
|	| j1| j2| 7 }W q t3y   t4d Y qw qt56 7d}|	r| j8pd| d}t9|	| t|dkr| j'dgkrt$d d S | j:pd| d}t;|| | j<pd| d}t=|||  d S )Nzfp16 is for GPU only)migraphxrocmzint8 is for CPU onlyr%   r   r#   )rE   swimr   c                 S   s   h | ]
}|d kr
t n|qS )r   )	cpu_count)r.   xr,   r,   r0   	<setcomp>  s    zmain.<locals>.<setcomp>zArguments: z#Creation of the directory %s failedrQ   r   r   r6   r   z2.0.0z2PyTorch version must be >=2.0.0 and you are using zB--input_counts is not implemented for torch or torchscript engine.TF	Exceptionz%Y%m%d-%H%M%Sbenchmark_fusion_z.csvzNo any result available.benchmark_detail_benchmark_summary_)>r  r   r*   r<   r   r   rf   rJ   rK   r   rg   rN   modelsr   rl   sortedr)   r_   r  r  existsr5   mkdirOSErrorenginesr   rP   rQ   r[   r   force_num_layersset_num_threadsr   
__config__parallel_inforn   rM   r   ri   rk   
test_timesr   use_mask_indexr   ro   rp   rq   rr   rs   rv   rw   r&  r   r   r^   strftime
fusion_csvr   
detail_csvr
   
result_csvr   )rx   enable_torchenable_torch2enable_torchscriptenable_onnxruntimeenable_tensorflowrj   ry   r)   ru   rt   
time_stampcsv_filenamer,   r,   r0   main  s  


$












rA  __main__)4__doc__r  loggingr  r   r   rU   psutilbenchmark_helperr   r   r   r   r   r   r	   r
   r   r   r   r~   r   huggingface_modelsr   r   onnx_exporterr   r   r   r   	packagingr   quantize_helperr   	getLoggerrJ   r#  environr]   rQ   transformersr   r   r   r   r   boolr   r   r  rA  __name__r,   r,   r,   r0   <module>   sB   4

 ap  I #

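
# Usage sketch (an assumption, not part of the original script): parse_arguments()
# reads sys.argv, so the benchmark can also be driven programmatically by
# populating sys.argv before calling main(). The flag combination below is only
# an example:
#
#   import sys
#   sys.argv = ["benchmark.py", "-m", "bert-base-cased", "-b", "1", "-s", "8", "-t", "10"]
#   main()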