o
    gX                     @   sD  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZ	d dl
Z
d dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZmZmZ d dlZe e!Z"de j#fd	d
Z$de j#fddZ%dd Z&dd Z'dd Z(dd Z)dd Z*dd Z+dd Z,dd Z-e!dkre-  dS dS )    N)measure_memorysetup_logger)get_library_path)ORTModelForSpeechSeq2Seq)ProfilerActivityprofilerecord_function)trange)AutoModelForSpeechSeq2SeqWhisperConfigWhisperProcessorargsc                    s   j dvr	td fdd fdd j j j j j jd} j dkrq| D ]\}}t	j
|gd	|v r<t	jnt	jd
||< q- jrSt	j
 jgt	jd
|d<  jrbt	j
 jgt	jd
|d<  jrqt	j
 jgt	jd
|d< td j  fdd}t | j | j} jr||d< |S td  j dkrdnd fdd}t || ||} j dkr||d< |S |j jrtjntj jd|d<  j|d< d|d< d|d<  jr j|d< |S )N>   orthf-orthf-pt-eagerhf-pt-compilez/Unable to auto-detect inputs for provided modelc                     s   t  j} t | } | S N)whisper
load_audio
audio_pathpad_or_trimaudior    h/var/www/visachat/venv/lib/python3.10/site-packages/onnxruntime/transformers/models/whisper/benchmark.pyload_via_ffmpeg$   s   
z#get_inputs.<locals>.load_via_ffmpegc                     sV   t  jd} tjt|  tjd}t|g}W d    |S 1 s$w   Y  |S )Nrbdtype)openr   npasarraylistreaduint8array)fr   r   r   r   load_via_numpy)   s   
z"get_inputs.<locals>.load_via_numpy)
max_length
min_length	num_beamsnum_return_sequenceslength_penaltyrepetition_penaltyr   penaltyr   decoder_input_idslogits_processortemperaturezLoad audio: c                    s   | r S   S r   r   )onnx_e2e)r   r(   r   r   <lambda>D   s    zget_inputs.<locals>.<lambda>audio_streamzFeature extraction: r!   ptc                    s    j j| g jdjS )N)return_tensorssampling_rate)	processorfeature_extractorr8   input_featuresr   )r   return_typer   r   r4   P   s    
r;   )r   deviceinputsno_repeat_ngram_sizeTearly_stopping	use_cacheforced_decoder_ids)benchmark_type	Exceptionr)   r*   r+   r,   r-   r.   itemsr!   r&   float32int32has_decoder_input_idsr0   has_logits_processorr1   has_temperaturer2   loggerinfor   time_fnhas_audio_streamtouse_fp16torchfloat16target_devicer?   )r   r>   kvload_audio_fn
audio_dataprocessor_fnr;   r   )r   r   r(   r<   r   
get_inputs    sX   

&





rY   c                 C   s  d\}}d\}}| j dv r<| jr| jn| j}t }tj|| jr#tjntj	dd
| j}t }| j dkr;t|}n2| j dv rft }| j|_|t  | jred|_d|_| jretd td ntd	| j  | j d
krt| jtu r| jd n| j}t| jtu r| jd nd }t }tj| j|||dd}t }| j dkrt d| j!  t }tj"| j!|| jgd}t }t d||  d |S )N)NN   r   r   T)torch_dtyperA   r   >   r   r      r   Cannot recognize r   )providerprovider_optionssession_optionsuse_io_bindingr   zLoading model from )	providerszLoaded model in  s)#rC   hf_pt_model_path
model_nametimer
   from_pretrainedrP   rQ   rR   rF   rO   rS   compiler   SessionOptionsr   enable_profilingregister_custom_ops_libraryr   verboselog_verbosity_levellog_severity_leveltuneset_default_logger_severityset_default_logger_verbosityrD   typeexecution_providertupler   hf_ort_dir_pathrK   rL   ort_model_pathInferenceSession)r   modelsess_options
start_timeend_timesourcer^   r_   r   r   r   	get_modelh   sf   







r}   c                 C   sV  t |tu r
|d n|}t |tu r|d n|}t| j}| jdkr(t| jnt| jt	j
dd}| jr=||}t| |D ]}|| q?| jdkrQtj| t }	| jdkr_t| jnt| jt	j
dd}
|
D ]}|| qj| jdkr|tj| t }| jdkrtd d}||	 | j }|| }td	| d
 td| d d S )Nr   r\   r   zWarm up)filedesccpu	Benchmark z	Latency: rc   zThroughput: z qps)rr   rt   rQ   r=   rS   rC   rangewarmup_runsr	   sysstdoutrl   rK   rL   cudasynchronizerf   num_runs)r   fnr>   warmup_inputsbenchmark_inputstorch_devicewarmup_rangeoutputs_rz   bench_ranger{   
batch_sizelatency
throughputr   r   r   rM      s@   










rM   c           	      C   s6  | j   d| j d| j d|jdd d| dtj d}d }| j dv rtt	j
t	jgddd }td || W d    n1 sGw   Y  W d    n1 sVw   Y  |jdd	j| j| jd
}tj| j| d}t|d}|| W d    |S 1 sw   Y  |S || | d}|S )N-r   z%Y-%m-%d_%H:%M:%SrZ   T)
activitiesrecord_shapesprofile_memorymodel_inference   )group_by_stack_n)sort_by	row_limitz.logw.json)rC   lower	precisionr=   __name__replacedatetimenowr   r   CPUCUDAr   key_averagestablept_filter_bypt_num_rowsospathjoin
log_folderr    write)	r   r   r>   inputs_typeprefixfilenameprof	prof_datar'   r   r   r   
profile_fn   s.   B






r   c                    s   t  }t|}|jdd   td|jd d d t  t	j
  t| jdk fdd| jd tj  d S )	Ng?)intervalzCPU usage: %r   c                      s    S r   r   r   r   r>   r   r   r4     s    zmeasure_fn.<locals>.<lambda>)is_gpufuncmonitor_type)r   getpidpsutilProcesscpu_percentrK   rL   gccollectrQ   r   empty_cacher   r=   r   r   r   flush)r   r   r>   pidprocessr   r   r   
measure_fn   s   

 r   c           
         s  fdd fdd}|} j dkr||  jrt ||d} j dkr|d td  }jj }|d	 }tj	|rWt
d
| d|  t|tj j| jj }|d }tj	|r~t
d
| d|  t|tj j| jj }|d }tj	|rt
d
| d|  t|tj j| d S t
d t || ||\}}	t
dt|d  d t
d|	d   t || d S )Nc                    s    j di | }|S )Nr   )generate)r>   predicted_idsrx   r   r   get_pred_ids  s   z&run_hf_inference.<locals>.get_pred_idsc                    s>   | }g }t  jD ]}| jj|ddd  q||fS )NTskip_special_tokensr   )r   r,   appendr9   batch_decode)r>   r   transcriptionr   )r   r   r   r   gen_and_dec  s
   z%run_hf_inference.<locals>.gen_and_decr   zgen-and-decr   r   z-encoder.json	Renaming  to z-decoder.jsonz-decoder-with-past.jsonz
Evaluating PyTorch...Generated token length: r    tokensTranscription: )rC   r   r   lenencodersessionend_profilingr   r   isfilerK   warningrenamer   r   decoderdecoder_with_pastrL   rM   r   )
r   r>   rx   r   generate_fnnew_logname
new_prefixold_lognamer   r   r   )r   r   rx   r   run_hf_inference  s>   


r   c                    sj  d fdd	}fdd}fdd} fdd	} j d
kr!|n|}||} jrNt ||d}	 }
td|
 d|	  t|
tj	 j
|	 d S td |} jrb||dd}||f}t || ||} j d
kru| }|d } jrtd|d d   n$||d d }tdt| d  jj|d ddd }t|  t || d S )NFc                    s   t tdd  }t |  }|| }t|r%td|  td|r0 jr0| d | d< || }t|rI|D ]}t	d| d | |= q: j
d	kru }|  D ]
\}}	|||	 qV D ]}
|j|
j j
 jd
 qe|S | S )Nc                 S      | j S r   namemodel_inputr   r   r   r4   N      z?run_ort_inference.<locals>.prepare_ort_inputs.<locals>.<lambda>z(The following model inputs are missing: zEThere are missing inputs to the model. Please add them and try again.r)   r*   zRemoving unnecessary input 'z' from user provided inputsr   )device_type	device_id)setmaprY   keysr   rK   errorrD   ro   rL   r=   
io_bindingrE   bind_cpu_inputget_outputsbind_outputr   r   )r>   warmupmodel_inputsuser_inputsmissing_inputsunnecessary_inputsunnecessary_inputr   rT   rU   outputr   rx   r   r   prepare_ort_inputsL  s*   

z-run_ort_inference.<locals>.prepare_ort_inputsc                    s     |  | S r   )run_with_iobinding)r   r   r   r   with_io_bindingj  s   
z*run_ort_inference.<locals>.with_io_bindingc                    s     d | }|S r   )run)r>   r   r   r   r   without_io_bindingo  s   z-run_ort_inference.<locals>.without_io_bindingc                    s6    j | v rt|  j kd d }| d |d  S | S )Nr   r\   )eos_token_idr!   where)r   	first_endr   r   r   handle_outputt  s   
z(run_ort_inference.<locals>.handle_outputr   e2er   r   z
Evaluating ONNX Runtime...T)r   r   r   r   r   r   )F)r=   r   r   r   rK   r   r   r   r   r   r   rL   ro   rM   copy_outputs_to_cpurN   r   r9   r   printr   )r   r>   rx   r   r   r  r  r   
ort_inputsr   r   ort_evaluate_inputsort_warmup_inputsort_outputsactual_outputr   r   r   r   run_ort_inferenceK  s:   


r  c                 C   sD   | j dv rt| || d S | j dkrt| || d S td| j  )N>   r   r   r   r   r]   )rC   r   r  rD   )r   r>   rx   r   r   r   run_inference  s
   

r  c               	   C   s  t  } | jddtdg dd | jddtddd	 | jd
dtddg ddd | jdtddd | jdtddd | jdtddd | jddtddd	 | jddttj rYdndg dd  | jd!d"td#d$ | jd%d&td'd$ | jd(d)td*d$ | jd+td,d$ | jd-td.d/d | jd0td1d$ | jd2td#d$ | jd3td4d$ | jd5td4d$ | jd6td7d$ | jd8td7d$ | jd9td:d$ | jd;td<d=d | jd>td4d?d | jd@td7dAd | jdBdCdDdE | jdFtdGdHd | jdItdJdKd | jdLdCdDdE | jdMtt	j
dNdOd | jdPdCdDdQdR |  }tj|j t|j |j|_dS|jv re|j  dT|_|jdUkrK|jdV|jif|_n|jdWkre|j|jd4|jr\d4nd#dXf|_d|_|jdYkrs|jssJ dZ|jdSkr|jsJ d[t|j|_|S )\Nz-btz--benchmark-typeT)r   r   r   r   )rr   requiredchoicesz-mz--model-namez;Hugging Face name of model (e.g. 'openai/whisper-large-v2'))rr   r  helpz-pz--precisionfp32)int8fp16r  zePrecision for model. For ONNX models, the model's precision should be set before running this script.)rr   r  defaultr  r  z--hf-pt-model-pathr   zNPath to directory containing all PyTorch files (e.g. tokenizer, PyTorch model))rr   r  r  z--hf-ort-dir-pathzaPath to directory containing all ONNX files (e.g. tokenizer, encoder, decoder, decoder_with_past)z--ort-model-pathzPath to ONNX modelz-az--audio-pathz%Path to audio file for E2E evaluationz-dz--devicer   r   )r   r   rocm)rr   r  r  z-idz--device-idr   )rr   r  z-wz--warmup-runsr   z-nz
--num-runs
   z--seed   z--sampling-ratei>  zSampling rate for audio (in Hz)z--max-lengthi  z--min-lengthz--num-beamsr\   z--num-return-sequencesz--length-penaltyg      ?z--repetition-penaltyz--no-repeat-ngram-size   z--decoder-input-idsz[]zThe forced decoder ids for generation. Format is [start token, timestamp token, language token, task token]. Default is [start token]. See `decoder_input_ids` in https://github.com/microsoft/Olive/tree/main/examples/whisper for details.z--logits-processorzLWhether to use timestamps logits processor or not (0 for false, 1 for true).z--temperaturez!Temperature value for generation.z	--profileF
store_true)r  actionz--pt-filter-byself_cpu_time_totalz"What to filter PyTorch profiler byz--pt-num-rowsi  z.Number of rows for PyTorch profiler to displayz	--verbosez--log-folder.zFolder to cache log filesz--tunezFOnly used by ROCm EP, enable TunableOp tuning to select fastest kernel)r  r  r  r   ExecutionProviderCUDAExecutionProviderr   ROCMExecutionProvider)r   tunable_op_enabletunable_op_tuning_enabler   z,Please specify a path to `--hf-ort-dir-path`z+Please specify a path to `--ort-model-path`)argparseArgumentParseradd_argumentstrrQ   r   is_availableintfloatr   r   r   
parse_argsr!   randomseedmanual_seedr=   r   rC   upperrs   r   ro   ru   rv   astliteral_evalr0   )parserr   r   r   r   r+    s   
r+  c                  C   sB  t  } t| j t| j dtjj_	t
| j}t| j}| jdkr*d| j n| j}| jdk}t| d| t| d| t| d| t| dd	 t| d
|j td| j  t| }| jdkrttdd | }d|v | _t| dd|v  t| dd|v  t| dd|v  | jg kr|jg| _t| }t| || d S )NTr   zcuda:r  r9   rS   rP   rN   Fr  zForced decoder prompt ids: r   c                 S   r   r   r   r   r   r   r   r4   T  r   zmain.<locals>.<lambda>r5   rH   r0   rI   r1   rJ   r2   )r+  r   rl   rK   rL   __dict__rQ   backendscudnn	benchmarkr   rg   re   r   r=   r   r   setattrr  r0   r}   rC   r   r   rY   rN   decoder_start_token_idr  )r   configr9   rS   rP   rx   ort_model_inputsr>   r   r   r   main=  s2   






r;  __main__).r$  r0  r   r   loggingr   r   rf   numpyr!   r   rQ   r   benchmark_helperr   r   onnxruntime_extensionsr   optimum.onnxruntimer   torch.profilerr   r   r   tqdmr	   transformersr
   r   r   onnxruntimer   	getLoggerr   rK   	NamespacerY   r}   rM   r   r   r   r  r  r+  r;  r   r   r   r   <module>   sD   
HC1@X	 $
