"""
Benchmark a LLaMA-2 style model across several backends: PyTorch eager (hf-pt-eager),
torch.compile (hf-pt-compile), Optimum ONNX Runtime (hf-ort), the Microsoft LLaMA-2 ONNX
export (ort-msft), and the convert_to_onnx export (ort-convert-to-onnx).

Readable sketch recovered from a compiled module: structure, names, and log messages follow
the recoverable source, while some defaults and keyword values are best-effort reconstructions.
"""

import argparse
import datetime
import gc
import itertools
import logging
import os
import sys
import time

import numpy as np
import onnx
import psutil
import torch
from benchmark_helper import measure_memory, setup_logger
from dist_settings import get_rank, get_size
from llama_inputs import (
    add_io_bindings_as_ortvalues,
    get_merged_sample_with_past_kv_inputs,
    get_msft_sample_inputs,
    get_sample_inputs,
    get_sample_with_past_kv_inputs,
    verify_ort_inputs,
)
from optimum.onnxruntime import ORTModelForCausalLM
from torch.profiler import ProfilerActivity, profile, record_function
from tqdm import trange
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

import onnxruntime as ort

logger = logging.getLogger(__name__)


def get_ort_model_inputs_len(args: argparse.Namespace, model):
    # The number of graph inputs determines whether a merged decoder or split decoders are used.
    if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile"}:
        return 0
    if args.benchmark_type == "hf-ort":
        try:
            return len(model.inputs_names)
        except Exception:
            return len(model.decoder.input_names)
    return len(model.get_inputs())


def get_inputs(args: argparse.Namespace, ort_model_inputs_len: int):
    """Build prompt-processing (init) and token-generation (iter) inputs for the chosen backend."""
    init_inputs, iter_inputs = None, None

    # The Microsoft export uses a fixed 2048-token KV buffer; other models use their config value.
    max_seq_len = 2048 if args.benchmark_type == "ort-msft" else args.config.max_position_embeddings

    if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile"} or (
        args.benchmark_type == "hf-ort" and ort_model_inputs_len == 3
    ):
        # PyTorch-style inputs, also used for Optimum's split decoder / decoder_with_past models.
        init_inputs = get_sample_inputs(
            args.config, args.target_device, args.batch_size, args.sequence_length, return_dict=True
        )
        iter_inputs = get_sample_with_past_kv_inputs(
            args.config, args.target_device, args.batch_size, args.sequence_length,
            use_fp16=args.use_fp16, return_dict=True,
        )
    elif args.benchmark_type in {"hf-ort", "ort-convert-to-onnx"}:
        # Merged decoder: one model handles both the prompt pass and the per-token passes.
        engine = "pt" if args.benchmark_type == "hf-ort" else "ort"
        common = {
            "max_seq_len": max_seq_len,
            "use_fp16": args.use_fp16,
            "use_buffer_share": args.use_buffer_share,
            "engine": engine,
            "return_dict": True,
        }
        if engine == "ort":
            common["world_size"] = args.world_size
        init_inputs = get_merged_sample_with_past_kv_inputs(
            args.config, args.target_device, args.batch_size,
            seq_len=args.sequence_length, past_seq_len=0, **common,
        )
        iter_inputs = get_merged_sample_with_past_kv_inputs(
            args.config, args.target_device, args.batch_size,
            seq_len=1, past_seq_len=args.sequence_length, **common,
        )
    elif args.benchmark_type == "ort-msft":
        # Microsoft LLaMA-2 ONNX export; newer exports split the KV cache into separate inputs.
        split_kv = ort_model_inputs_len > 5
        common = {
            "max_seq_len": max_seq_len,
            "use_fp16": args.use_fp16,
            "use_buffer_share": args.use_buffer_share,
            "split_kv": split_kv,
        }
        init_inputs = get_msft_sample_inputs(
            args.config, args.batch_size, past_seq_len=0, seq_len=args.sequence_length, **common
        )
        iter_inputs = get_msft_sample_inputs(
            args.config, args.batch_size, past_seq_len=args.sequence_length, seq_len=1, **common
        )
    else:
        raise Exception("Unable to auto-detect inputs for provided model")

    return init_inputs, iter_inputs


def get_model(args: argparse.Namespace):
    model, sess_options = None, None
    start_time, end_time = None, None

    if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile"}:
        source = args.hf_pt_dir_path if args.hf_pt_dir_path else args.model_name
        start_time = time.time()
        model = AutoModelForCausalLM.from_pretrained(
            source,
            torch_dtype=torch.float16 if args.use_fp16 else torch.float32,
            use_auth_token=args.auth,
            trust_remote_code=args.auth,
            use_cache=True,
            cache_dir=args.cache_dir,
        ).to(args.target_device)
        end_time = time.time()
        if args.benchmark_type == "hf-pt-compile":
            model = torch.compile(model)

    elif args.benchmark_type in {"hf-ort", "ort-msft", "ort-convert-to-onnx"}:
        sess_options = ort.SessionOptions()
        sess_options.enable_profiling = args.profile
        if args.verbose:
            sess_options.log_verbosity_level = 1
            sess_options.log_severity_level = 1

    else:
        raise Exception(f"Cannot recognize {args.benchmark_type}")

    if args.benchmark_type == "hf-ort":
        provider = args.execution_provider[0] if type(args.execution_provider) is tuple else args.execution_provider
        provider_options = args.execution_provider[1] if type(args.execution_provider) is tuple else None

        # Pick up the exported decoder ONNX files from the Optimum directory, skipping external-data files.
        decoder_file_name, decoder_with_past_file_name, use_merged = None, None, None
        for filename in os.listdir(args.hf_ort_dir_path):
            if ".onnx" not in filename or ".onnx_data" in filename or ".onnx.data" in filename:
                continue
            if "decoder_model" in filename or filename == "model.onnx":
                decoder_file_name = filename
            if "decoder_with_past_model" in filename:
                decoder_with_past_file_name = filename
            if "decoder_merged_model" in filename:
                decoder_file_name = filename
                decoder_with_past_file_name = filename
                use_merged = True  # merged export detected by file name (heuristic)

        start_time = time.time()
        model = ORTModelForCausalLM.from_pretrained(
            args.hf_ort_dir_path,
            decoder_file_name=decoder_file_name,
            decoder_with_past_file_name=decoder_with_past_file_name,
            use_auth_token=args.auth,
            trust_remote_code=args.auth,
            use_io_binding=True,
            use_merged=use_merged,
            provider=provider,
            provider_options=provider_options,
            session_options=sess_options,
        )
        end_time = time.time()

    if args.benchmark_type in {"ort-msft", "ort-convert-to-onnx"}:
        logger.info(f"Loading model from {args.ort_model_path.format(args.rank)}")
        start_time = time.time()
        model = ort.InferenceSession(
            args.ort_model_path.format(args.rank), sess_options, providers=[args.execution_provider]
        )
        end_time = time.time()

    logger.info(f"Loaded model in {end_time - start_time} s")
    return model


def time_fn(args, fn, inputs):
    warmup_range = trange(args.warmup_runs, file=sys.stdout, desc="Warm up") if args.verbose else range(args.warmup_runs)

    if args.verbose:
        outputs = fn(inputs)
        logger.info(outputs)

    use_ort_io_binding = args.device != "cpu" and args.benchmark_type in {"ort-msft", "ort-convert-to-onnx"}

    def input_sync():
        # Make sure bound inputs (ORT IO binding) or queued CUDA work are ready before timing starts.
        if use_ort_io_binding:
            args.io_binding.synchronize_inputs()
        elif args.device != "cpu" and torch.cuda.is_available():
            torch.cuda.synchronize()

    def output_sync():
        # Wait for outputs so the measured interval covers the full inference call.
        if use_ort_io_binding:
            args.io_binding.synchronize_outputs()
        elif args.device != "cpu" and torch.cuda.is_available():
            torch.cuda.synchronize()

    for _ in warmup_range:
        input_sync()
        fn(inputs)
        output_sync()

    total_time = 0
    bench_range = trange(args.num_runs, file=sys.stdout, desc="Benchmark") if args.verbose else range(args.num_runs)
    for _ in bench_range:
        input_sync()
        start_time = time.time()
        fn(inputs)
        output_sync()
        end_time = time.time()
        total_time += end_time - start_time

    latency = total_time / args.num_runs
    throughput = args.batch_size / latency

    if args.rank == 0:
        logger.info(f"Batch Size: {args.batch_size}")
        logger.info(f"Sequence Length: {args.sequence_length}")
        logger.info(f"Latency: {latency} s")
        logger.info(f"Throughput: {throughput} tps")


def profile_fn(args, fn, inputs, inputs_type):
    # The file name prefix encodes the benchmark configuration and a timestamp.
    prefix = (
        f"b{args.batch_size}_s{args.sequence_length}_{args.benchmark_type.lower()}-"
        f"{args.precision}-{args.device}_{fn.__name__.replace('_', '-')}_{inputs_type}_"
        f"{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}"
    )
    filename = None

    if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile"}:
        # PyTorch runs are profiled with the torch profiler and summarized into a text table.
        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=True
        ) as prof:
            with record_function("model_inference"):
                fn(inputs)
        prof_data = prof.key_averages(group_by_stack_n=5).table(sort_by=args.pt_filter_by, row_limit=args.pt_num_rows)
        filename = os.path.join(args.log_folder, f"{prefix}.log")
        with open(filename, "w") as f:
            f.write(prof_data)
    else:
        # ONNX Runtime writes its own JSON profile; run once and return the target file name.
        fn(inputs)
        filename = os.path.join(args.log_folder, f"{prefix}.json")

    return filename


def measure_fn(args, fn, inputs):
    # Sample CPU utilization around the call; memory is measured by benchmark_helper.measure_memory.
    pid = os.getpid()
    process = psutil.Process(pid)
    process.cpu_percent(interval=0.1)

    fn(inputs)

    if args.rank == 0:
        logger.info(f"CPU usage: {process.cpu_percent(interval=None) / psutil.cpu_count(logical=False)}%")

    gc.collect()
    torch.cuda.empty_cache()
    measure_memory(is_gpu=(args.device != "cpu"), func=lambda: fn(inputs))

    # Flush so memory usage is printed before the next iteration starts.
    sys.stdout.flush()


def run_hf_inference(args, init_inputs, iter_inputs, model):
    # A single forward pass produces the logits; no sampling loop is benchmarked here.
    def get_logits(inputs):
        outputs = model(**inputs)
        return outputs

    generate_fn = get_logits
    if args.benchmark_type == "hf-pt-compile":
        generate_fn = torch.compile(generate_fn)

    if args.profile:
        new_logname = profile_fn(args, generate_fn, init_inputs, "prompt")
        if args.benchmark_type == "hf-ort":
            # Rename ORT's profile file for the prompt-processing decoder session.
            old_logname = model.decoder.session.end_profiling()
            logger.warning(f"Renaming {old_logname} to {new_logname}")
            os.rename(old_logname, os.path.join(args.log_folder, new_logname))

        new_logname = profile_fn(args, generate_fn, iter_inputs, "token")
        if args.benchmark_type == "hf-ort":
            # Rename ORT's profile file for the token-generation decoder session.
            old_logname = model.decoder_with_past.session.end_profiling()
            logger.warning(f"Renaming {old_logname} to {new_logname}")
            os.rename(old_logname, os.path.join(args.log_folder, new_logname))
        return

    logger.info("\nEvaluating `model(inputs)` step to get past_key_values")
    time_fn(args, generate_fn, init_inputs)
    measure_fn(args, generate_fn, init_inputs)

    logger.info("\nEvaluating `model(inputs)` step with past_key_values")
    time_fn(args, generate_fn, iter_inputs)
    measure_fn(args, generate_fn, iter_inputs)


def run_ort_inference(args, init_inputs, iter_inputs, model):
    def prepare_ort_inputs(inputs, kv_cache_ortvalues):
        # Verify that all model inputs are provided, then bind them on GPU through IO binding.
        inputs = verify_ort_inputs(model, inputs)
        if args.device != "cpu":
            io_binding, kv_cache_ortvalues = add_io_bindings_as_ortvalues(
                model, inputs, args.device, int(args.rank), args.use_buffer_share, kv_cache_ortvalues
            )
            setattr(args, "io_binding", io_binding)
            return io_binding, kv_cache_ortvalues
        return inputs, kv_cache_ortvalues

    def with_io_binding(io_binding):
        # Inference pass with IO binding (GPU).
        model.run_with_iobinding(io_binding)

    def without_io_binding(inputs):
        # Inference pass without IO binding (CPU).
        return model.run(None, inputs)

    generate_fn = with_io_binding if args.device != "cpu" else without_io_binding
    kv_cache_ortvalues = {}

    if args.profile:
        ort_init_inputs, kv_cache_ortvalues = prepare_ort_inputs(init_inputs, kv_cache_ortvalues)
        new_logname = profile_fn(args, generate_fn, ort_init_inputs, "prompt")
        old_logname = model.end_profiling()
        logger.warning(f"Renaming {old_logname} to {new_logname}")
        os.rename(old_logname, os.path.join(args.log_folder, new_logname))

        # Re-create the session so token-generation profiling starts from a fresh profile file.
        model = get_model(args)
        ort_iter_inputs, kv_cache_ortvalues = prepare_ort_inputs(iter_inputs, kv_cache_ortvalues)
        new_logname = profile_fn(args, generate_fn, ort_iter_inputs, "token")
        old_logname = model.end_profiling()
        logger.warning(f"Renaming {old_logname} to {new_logname}")
        os.rename(old_logname, os.path.join(args.log_folder, new_logname))
        return

    logger.info("\nEvaluating `model(inputs)` step to get past_key_values")
    ort_init_inputs, kv_cache_ortvalues = prepare_ort_inputs(init_inputs, kv_cache_ortvalues)
    time_fn(args, generate_fn, ort_init_inputs)
    measure_fn(args, generate_fn, ort_init_inputs)

    logger.info("\nEvaluating `model(inputs)` step with past_key_values")
    ort_iter_inputs, kv_cache_ortvalues = prepare_ort_inputs(iter_inputs, kv_cache_ortvalues)
    time_fn(args, generate_fn, ort_iter_inputs)
    measure_fn(args, generate_fn, ort_iter_inputs)


def run_inference(args, init_inputs, iter_inputs, model):
    if args.benchmark_type in {"hf-pt-eager", "hf-pt-compile", "hf-ort"}:
        run_hf_inference(args, init_inputs, iter_inputs, model)
    elif args.benchmark_type in {"ort-msft", "ort-convert-to-onnx"}:
        run_ort_inference(args, init_inputs, iter_inputs, model)
    else:
        raise Exception(f"Cannot recognize {args.benchmark_type}")


def get_args(rank=0):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-bt", "--benchmark-type", type=str, required=True,
        choices=["hf-pt-eager", "hf-pt-compile", "hf-ort", "ort-msft", "ort-convert-to-onnx"],
    )
    parser.add_argument(
        "-m", "--model-name", type=str, required=True,
        help="Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf')",
    )
    parser.add_argument(
        "-a", "--auth", default=False, action="store_true",
        help="Use Hugging Face authentication token to access model",
    )
    parser.add_argument(
        "-p", "--precision", required=True, type=str, choices=["int4", "int8", "fp16", "fp32"],
        help="Precision for model. For ONNX models, the model's precision should be set before running this script.",
    )
    parser.add_argument(
        "--hf-pt-dir-path", type=str, default="",
        help="Path to directory containing all PyTorch files (e.g. tokenizer, PyTorch model)",
    )
    parser.add_argument(
        "--hf-ort-dir-path", type=str, default="",
        help="Path to directory containing all ONNX files (e.g. tokenizer, decoder_merged, decoder, decoder_with_past)",
    )
    parser.add_argument("--ort-model-path", type=str, default="", help="Path to ONNX model")
    parser.add_argument("-b", "--batch-sizes", default="1 2")
    parser.add_argument("-s", "--sequence-lengths", default="32 64 128 256 512")
    parser.add_argument(
        "-d", "--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
        choices=["cpu", "cuda", "rocm"],
    )
    # Numeric defaults below are representative values, not necessarily the shipped ones.
    parser.add_argument("-id", "--device-id", type=int, default=0)
    parser.add_argument("-w", "--warmup-runs", type=int, default=5)
    parser.add_argument("-n", "--num-runs", type=int, default=100)
    parser.add_argument("--seed", type=int, default=2)
    parser.add_argument("--max-length", type=int, default=32)
    parser.add_argument("--num-return-sequences", type=int, default=1)
    parser.add_argument("--profile", default=False, action="store_true")
    parser.add_argument("--pt-filter-by", type=str, default="self_cpu_time_total", help="What to filter PyTorch profiler by")
    parser.add_argument("--pt-num-rows", type=int, default=1000, help="Number of rows for PyTorch profiler to display")
    parser.add_argument("--verbose", default=False, action="store_true")
    parser.add_argument("--log-folder", type=str, default=".", help="Folder to cache log files")
    parser.add_argument("--cache-dir", type=str, default="./model_cache", help="Cache dir where Hugging Face files are stored")

    args = parser.parse_args()

    # Seed RNGs so that sample inputs are reproducible across runs.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Map the target device onto an ONNX Runtime execution provider; GPU providers bind to the process rank.
    if "ort" in args.benchmark_type:
        setattr(args, "execution_provider", f"{args.device.upper()}ExecutionProvider")
        if args.execution_provider == "CUDAExecutionProvider":
            args.execution_provider = (args.execution_provider, {"device_id": rank})
        elif args.execution_provider == "ROCMExecutionProvider":
            args.execution_provider = (args.execution_provider, {"device_id": rank})
            args.device = "cuda"

    # Check that paths have been specified for ONNX Runtime benchmarking.
    if args.benchmark_type == "hf-ort":
        assert args.hf_ort_dir_path, "Please specify a path to `--hf-ort-dir-path`"
    if args.benchmark_type in {"ort-msft", "ort-convert-to-onnx"}:
        assert args.ort_model_path, "Please specify a path to `--ort-model-path`"

    args.batch_sizes = args.batch_sizes.split(" ")
    args.sequence_lengths = args.sequence_lengths.split(" ")

    # Quantized and FP32 CPU runs use FP32 sample inputs; FP16/INT4 GPU runs use FP16 sample inputs.
    args.precision = (
        "fp32" if args.precision in {"int8", "fp32"} or (args.precision == "int4" and args.device == "cpu") else "fp16"
    )

    # Profiling expects exactly one (batch_size, sequence_length) combination.
    if args.profile:
        assert (
            len(args.batch_sizes) == 1 and len(args.sequence_lengths) == 1
        ), "Please provide only one (batch_size, sequence_length) combination for profiling"

    return args


def main():
    rank = get_rank()
    world_size = get_size()

    args = get_args(rank)
    setup_logger(args.verbose)
    logger.info(args.__dict__)
    torch.backends.cudnn.benchmark = True

    args.rank = rank
    args.world_size = world_size

    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name, cache_dir=args.cache_dir, use_auth_token=args.auth, trust_remote_code=args.auth
    )
    config = AutoConfig.from_pretrained(
        args.model_name, cache_dir=args.cache_dir, use_auth_token=args.auth, trust_remote_code=args.auth
    )
    target_device = f"cuda:{args.rank}" if args.device != "cpu" else args.device
    use_fp16 = args.precision == "fp16"

    setattr(args, "tokenizer", tokenizer)
    setattr(args, "config", config)
    setattr(args, "target_device", target_device)
    setattr(args, "use_fp16", use_fp16)

    # Load the model and determine how many graph inputs it exposes.
    model = get_model(args)
    ort_model_inputs_len = get_ort_model_inputs_len(args, model)

    if args.benchmark_type in {"ort-msft", "ort-convert-to-onnx"}:
        # Buffer sharing (past/present KV in one buffer) needs FP16 GroupQueryAttention on GPU.
        onnx_model = onnx.load_model(args.ort_model_path.format(args.rank), load_external_data=False)
        gqa_nodes = list(filter(lambda node: node.op_type == "GroupQueryAttention", onnx_model.graph.node))
        use_buffer_share = use_fp16 and len(gqa_nodes) > 0 and args.device != "cpu"
        setattr(args, "use_buffer_share", use_buffer_share)
    else:
        setattr(args, "use_buffer_share", False)

    # Benchmark each (batch size, sequence length) combination: prompt cost, then per-token cost.
    for batch_size, sequence_length in itertools.product(args.batch_sizes, args.sequence_lengths):
        if args.rank == 0:
            logger.info(f"\nBatch size = {batch_size} and sequence length = {sequence_length}...")
        setattr(args, "batch_size", int(batch_size))
        setattr(args, "sequence_length", int(sequence_length))

        init_inputs, iter_inputs = get_inputs(args, ort_model_inputs_len)
        run_inference(args, init_inputs, iter_inputs, model)


if __name__ == "__main__":
    main()
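# Example invocation (illustrative values only; the model name, flag values, and paths
# below are placeholders to adapt to your setup):
#
#   python benchmark.py \
#       --benchmark-type hf-pt-eager \
#       --model-name meta-llama/Llama-2-7b-hf \
#       --precision fp32 \
#       --batch-sizes "1 2" \
#       --sequence-lengths "32 128" \
#       --device cpu \
#       --auth
#
# ONNX Runtime runs additionally require --ort-model-path (ort-msft, ort-convert-to-onnx)
# or --hf-ort-dir-path (hf-ort), matching the assertions in get_args above.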