o
    gGX                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d dlZd dlZd dlZd dlZd dlmZ d dlZeeZG d	d
 d
eZG dd deZ G dd dZ!dej"iZ#ddddddi fddZ$d8ddZ%d9ddZ&dd Z'dd Z(dd Z)dd  Z*d:d!d"Z+ej,d fd#d$Z-d%d& Z.d;d(d)Z/d*eeee0ef   fd+d,Z1G d-d. d.eZ2G d/d0 d0e2Z3G d1d2 d2e2Z4d<d4d5Z5d6d7 Z6dS )=    N)ABCabstractmethod)ThreadPoolExecutor)datetime)Enum)sleep)AnyDictListOptional)versionc                   @   s$   e Zd ZdZdZdZdZdd ZdS )	Precisionfp32fp16int8int4c                 C      | j S Nvalueself r   `/var/www/visachat/venv/lib/python3.10/site-packages/onnxruntime/transformers/benchmark_helper.py__str__&      zPrecision.__str__N)__name__
__module____qualname__FLOAT32FLOAT16INT8INT4r   r   r   r   r   r       s    r   c                   @   s    e Zd ZdZdZdZdd ZdS )OptimizerInfono_optby_ort	by_scriptc                 C   r   r   r   r   r   r   r   r   1   r   zOptimizerInfo.__str__N)r   r   r   NOOPTBYORTBYSCRIPTr   r   r   r   r   r#   *   s
    r#   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )ConfigModifierc                 C   
   || _ d S r   
num_layers)r   r-   r   r   r   __init__6      
zConfigModifier.__init__c                 C   s   | j d u rd S t|dr| j |_td| j   t|dr+| j |_td| j   t|dr?| j |_td| j   d S d S )Nnum_hidden_layersz6Modifying pytorch model's number of hidden layers to: encoder_layersz7Modifying pytorch model's number of encoder layers to: zdecoder_layers z7Modifying pytorch model's number of decoder layers to: )r-   hasattrr0   loggerinfor1   decoder_layers)r   configr   r   r   modify9   s   



zConfigModifier.modifyc                 C   r   r   r,   r   r   r   r   get_layer_numF   r   zConfigModifier.get_layer_numN)r   r   r   r.   r7   r8   r   r   r   r   r*   5   s    r*   float32TFc	                    s@  d }	zt  }
|rt jj|
_nt jj|
_|rd|
_|dkr)||
_t	d|
j  |r/d|
_
nd|
_
t	d|   |rn|dkrEddg}n,|d	krNd
dg}n#|dkrWg d}n|dkr`ddg}n|dkrig d}nddg}ndg} r| fdd|D }|r|
dd t j| |
|d}	W |	S  ty   tjddd Y |	S w )NTr   z%Session option: intra_op_num_threads=   zCreate session for onnx model: dmlDmlExecutionProviderCPUExecutionProviderrocmROCMExecutionProvidermigraphx)MIGraphXExecutionProviderr@   r>   cudaCUDAExecutionProvidertensorrt)TensorrtExecutionProviderrD   r>   c                    s$   g | ]}| v r| | fn|qS r   r   ).0nameprovider_optionsr   r   
<listcomp>   s   $ z.create_onnxruntime_session.<locals>.<listcomp>z(mlas.enable_gemm_fastmath_arm64_bfloat161)	providers	Exception)exc_info)onnxruntimeSessionOptionsGraphOptimizationLevelORT_ENABLE_ALLgraph_optimization_levelORT_ENABLE_BASICenable_profilingintra_op_num_threadsr3   debuglog_severity_leveladd_session_config_entryInferenceSessionrN   error)onnx_model_pathuse_gpuproviderenable_all_optimizationnum_threadsrV   verbose(enable_mlas_gemm_fastmath_arm64_bfloat16rJ   sessionsess_optionsrM   r   rI   r   create_onnxruntime_sessionP   sL   






rf   c                 C   s8   | rt jddd d S t jdd tdtj d S )NDEBUGz8[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s)levelfmtz%(message)s)ri   transformers)coloredlogsinstalllogging	getLoggersetLevelWARNING)rb   r   r   r   setup_logger   s   
rq   c                 C   s   | rt j| st |  |rt j|st | |r:|dkr+dt v s*J dntt g dr:J dt	dt
j  t	dtj  t	dtj  tt
jtd	ksbJ ttjtd
ksoJ ttjtd	ks|J d S )Nr<   r=   zBPlease install onnxruntime-directml package to test GPU inference.)rD   r@   rB   zWPlease install onnxruntime-gpu package, or install ROCm support, to test GPU inference.zPyTorch Version:zTransformers Version:zOnnxRuntime Version:z1.10.0z4.12.0)ospathexistsmakedirsrP   get_available_providersset
isdisjointr3   r4   torch__version__rj   r   parse)	cache_dir
output_dirr^   r_   r   r   r   prepare_environment   s(   

r~   c                 C   s   t | tt|  d }tj| tjdd }|d|  }t| |dt| dd dt| dd dt| dd d|d|ddS )Ng     @@)dtypez.2fZ   _   c   )
test_timeslatency_variancelatency_90_percentilelatency_95_percentilelatency_99_percentileaverage_latency_msQPS)sumfloatlennumpyvarfloat64
percentile)latency_list
batch_size
latency_msr   
throughputr   r   r   get_latency_result   s   r   c                 C   sv   t |dddd!}g d}tj||d}|  | D ]}|| qW d    n1 s,w   Y  td|  d S )Na asciimodenewlineencoding)enginer   rM   device	precision	optimizer
io_binding
model_nameinputsthreadsr   sequence_lengthcustom_layer_numr   r   r   r   r   r   r   r   
fieldnamesz&Detail results are saved to csv file: )opencsv
DictWriterwriteheaderwriterowr3   r4   )resultscsv_filenamecsv_filecolumn_names
csv_writerresultr   r   r   output_details   s   r   c                    s  t |dddd}g d g }|jD ]"}|jdgkr#|d|  q|jD ]}|d| d|  q&qtj| | d}|  |jD ]}d	D ]}	|jD ]}
d
D ]}|j	D ]z}i }| D ]l}|d |kr|d |	kr|d |
kr|d |kr|d |krɇ fdd|
 D }|s|| |dd |D  n D ]}|| || ksJ q|d }|d }|r|d |d| d| < q]|d |d| < q]|r|| qWqRqNqIqEW d    n1 sw   Y  td|  d S )Nr   r   r   r   )r   r   r   r   r   rM   r   r   r   r   r   b_sr   )         )TFr   r   r   r   r   r   c                    s   i | ]\}}| v r||qS r   r   )rG   kvheader_namesr   r   
<dictcomp>  s    z"output_summary.<locals>.<dictcomp>c                 S   s   i | ]}|d qS )r   r   )rG   r   r   r   r   r     s    r   r   r   z'Summary results are saved to csv file: )r   batch_sizessequence_lengthsappendr   r   r   modelsenginesra   itemsupdater   r3   r4   )r   r   argsr   
data_namesr   r   r   r   input_countengine_namer   r   rowr   headersr   r   sr   r   r   output_summary   sZ   






6r   c                 C   s   t |ddddO}ddddgttt|   }tj||d	}|  | D ]'}t	t
 | | d< tj| | d< tj| | d< || | d< || |  q(W d    n1 sZw   Y  td
|  d S )Nr   r   r   r   model_filenamer   rj   ry   r   z(Fusion statistics is saved to csv file: )r   listnextitervalueskeysr   r   r   strr   nowrj   rz   ry   r   r3   r4   )model_fusion_statisticsr   r   r   r   keyr   r   r   output_fusion_statistics)  s&   r   c                    sd   i }t j fddd|d t j fddd|d}|| |ddi |t|| |S )Nc                          d  S r   runr   
ort_inputsort_sessionr   r   <lambda>?      zinference_ort.<locals>.<lambda>r   numberrepeatc                      r   r   r   r   r   r   r   r   @  r   r   F)timeitr   r   r   )r   r   result_templaterepeat_timesr   warm_up_repeatr   r   r   r   r   inference_ort=  s   
r   c              
      s  i }   |D ]&}t|| |	}tt|| j|
} ||j	j
d||j|  qt|dkr;t|||	 t|D ]\}} ||| j	j
dtj|| j||   q?tj fddd|d tj fddd|d}|| |ddi |t|| |S )	Nr   c                      
     S r   run_with_iobindingr   r   r   r   r   r   t     
 z/inference_ort_with_io_binding.<locals>.<lambda>r   r   c                      r   r   r   r   r   r   r   r   z  r   r   T)r   ry   
from_numpytoIO_BINDING_DATA_TYPE_MAPgetr   r   
bind_inputr   typeshapedata_ptrr   allocateOutputBuffers	enumeratebind_outputr   r9   r   r   r   r   )r   r   r   r   ort_output_namesort_outputsoutput_buffersoutput_buffer_max_sizesr   r   	data_typer   r   rH   np_input
input_typeiort_output_namer   r   r   r   inference_ort_with_io_bindingG  sL   	

	
r  c                 C   s&   |D ]}|  tj|tj|d qd S )N)r   r   )r   ry   emptyr9   )r   r   r   r   r   r   r   r     s   r   {   c                 C   s<   t |  tj |  t|  tj|  tj|  dS )z5Set random seed manually to get deterministic resultsN)randomseedr   ry   manual_seedrC   manual_seed_all)r  r   r   r   set_random_seed  s
   

r	  returnc               
   C   s   ddl m} m}m}m}m}m}m} z>|  g }| }t|t	s#W d S t
|D ]#}	|||	}
t|
tr8 W d S ||	|||	|
j|
j|
jd q'|  |W S  | yh } ztd| W Y d }~d S d }~ww )Nr   	NVMLErrornvmlDeviceGetCountnvmlDeviceGetHandleByIndexnvmlDeviceGetMemoryInfonvmlDeviceGetNamenvmlInitnvmlShutdown)idrH   totalfreeused-Error fetching GPU information using nvml: %s)py3nvml.py3nvmlr  r  r  r  r  r  r  
isinstanceintranger   r   r  r  r  print)r  r  r  r  r  r  r  r   device_countr   r4   r\   r   r   r   get_gpu_info  s4   $



	
r  c                   @   s@   e Zd Zd
ddZdd Zedeeee	e
f   fddZd	S )MemoryMonitorTc                 C   r+   r   )keep_measuringr   r   r   r   r   r.     r/   zMemoryMonitor.__init__c                 C   sB   dd l }d}	 t||t  jd }td | js 	 |S q)Nr   T   {Gzt?)	psutilmaxProcessrr   getpidmemory_inforssr   r   )r   r$  	max_usager   r   r   measure_cpu_usage  s   zMemoryMonitor.measure_cpu_usager
  c                 C   s   t  r   )NotImplementedErrorr   r   r   r   measure_gpu_usage  s   zMemoryMonitor.measure_gpu_usageNT)r   r   r   r.   r+  r   r   r
   r	   r   r   r-  r   r   r   r   r    s
    
$r  c                       s<   e Zd Zd fdd	Zdeeeeef   fddZ	  Z
S )CudaMemoryMonitorTc                    s   t  | d S r   )superr.   r!  	__class__r   r   r.     s   zCudaMemoryMonitor.__init__r
  c           
   
      s>  ddl m}m}mm}mm}m} g g  zo|  | }t|t	s-t
d|  W d S dd t|D fddt|D  	 t|D ]%}||}t|tr`t
d|   W d S t| |jd	 |< qGtd
 | jsunqC|   fddt|D W S  |y }	 zt
d|	 W Y d }	~	d S d }	~	ww )Nr   r  z*nvmlDeviceGetCount result is not integer: c                 S      g | ]}d qS r   r   rG   r   r   r   r   rK         z7CudaMemoryMonitor.measure_gpu_usage.<locals>.<listcomp>c                    s   g | ]} |qS r   r   r5  )r  r  r   r   rK     s    Tz%nvmlDeviceGetMemoryInfo returns str: r"  r#  c                        g | ]}| | | d qS )	device_idrH   max_used_MBr   r5  gpu_namemax_gpu_usager   r   rK         r  )r  r  r  r  r  r  r  r  r  r  r3   r\   r  r   r%  r  r   r   )
r   r  r  r  r  r  r  r   r4   r\   r   )r<  r=  r  r  r   r-    s>   $



z#CudaMemoryMonitor.measure_gpu_usager.  )r   r   r   r.   r   r
   r	   r   r   r-  __classcell__r   r   r1  r   r/    s    &r/  c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	RocmMemoryMonitorTc                    sl   t  | d}tj|r|tjvrtj| zdd l}|| _| j  W d S  t	y5   d | _Y d S w )Nz/opt/rocm/libexec/rocm_smir   )
r0  r.   rr   rs   rt   sysr   rocm_smiinitializeRsmiImportError)r   r   rocm_smi_pathrB  r1  r   r   r.     s   
zRocmMemoryMonitor.__init__c                 C   s(   | j d u rdS | j |dd d d S )Nr:   VRAMr   i   )rB  
getMemInfo)r   devr   r   r   get_used_memory  s   
z!RocmMemoryMonitor.get_used_memoryc                    s   | j d u rd S | j d urt| j  nd}dd t|D dd t|D  	 t|D ]}t| | ||< q,td | jsDnq( fddt|D S )Nr   c                 S   r3  r4  r   r5  r   r   r   rK     r6  z7RocmMemoryMonitor.measure_gpu_usage.<locals>.<listcomp>c                 S   s   g | ]}d | qS )GPUr   r5  r   r   r   rK     s    Tr#  c                    r7  r8  r   r5  r;  r   r   rK   %  r>  )	rB  r   listDevicesr  r%  rI  timer   r   )r   r  r   r   r;  r   r-    s   

z#RocmMemoryMonitor.measure_gpu_usager.  )r   r   r   r.   rI  r-  r?  r   r   r1  r   r@    s    r@  rC   c              	   C   sD  d }|dkr	t }nt}|d}| r|d ur|}n| }|d u r"d S |d u r(|S t }| }||j}z||}	|	 }
W d|_| }nd|_| }w |d u r]	 W d    d S td| d|  t	|dkrt	|dkrt	|t	|krd}t
|D ]\}}|d }|| d }|| }t||}q|W  d    S W d    d S W d    d S W d    d S 1 sw   Y  d S |d ur|}n| }|d u r|S t >}| }||j}z||}	|	 }
W d|_| }nd|_| }w td|d	d
|d	d || W  d    S 1 sw   Y  d S )Nr?   FzGPU memory usage: before=z  peak=r   r   r:  zCPU memory usage: before=z.1fz
 MB, peak=z MB)r@  r/  r-  r   submitr   r   r3   r4   r   r   r%  r+  )is_gpufuncmonitor_typestart_memorymemory_monitor_typemonitormemory_before_testexecutor
mem_thread	fn_thread_r*  max_usedr   memory_beforebeforeafterr  r   r   r   measure_memory/  s   



(






&r]  c                  C   sL   g d} d}| D ]}t |}|d u rq|r|d7 }|| d| 7 }q|S )N)ORT_DISABLE_FUSED_ATTENTION!ORT_ENABLE_FUSED_CAUSAL_ATTENTION!ORT_DISABLE_FUSED_CROSS_ATTENTIONORT_DISABLE_TRT_FLASH_ATTENTION&ORT_DISABLE_MEMORY_EFFICIENT_ATTENTIONORT_TRANSFORMER_OPTIONSORT_CUDA_GEMM_OPTIONSr   ,=)rr   getenv)	env_namesenvrH   r   r   r   r   get_ort_environment_variabless  s   	
rj  r.  r   r4  )r  )rC   N)7r   rm   rr   r  rA  rL  r   abcr   r   concurrent.futuresr   r   enumr   r   typingr   r	   r
   r   rk   r   ry   rj   	packagingr   rP   rn   r   r3   r   r#   r*   r9   r   rf   rq   r~   r   r   r   r   r   longlongr  r   r	  r   r  r  r/  r@  r]  rj  r   r   r   r   <module>   sf   

	

F
":

=
&2
+D