o
    g                    @   s   d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZmZmZmZmZ ddlZddlZddlZddlmZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z. ddl/m0Z0 ddl1m2Z3 ddl4m5Z5m6Z6 e7dZ8G dd deZ9d}deee:  dej;fddZ<dej;fddZ=dej;fddZ>d~de:de?fdd Z@d~de:de?de?fd!d"ZAd#e:d$e?d%e?de)fd&d'ZBd(ejd)efd*d+ZCd(ejd)efd,d-ZDd(ejd)efd.d/ZE	0	1		dd2ed3ed4e:d5eFd6eeG d7eeG fd8d9ZHd:ed;efd<d=ZI	1dd(ed5eFdee fd>d?ZJd@dA ZKdBdC ZLdDdE ZMdFefdGdHZNdFedIe?dJe?de?fdKdLZOdFefdMdNZP	PddQedRe:dSeFdTeFdUeFf
dVdWZQdFefdXdYZRdFefdZd[ZSd\efd]d^ZTd~d_e:de?fd`daZU	d~d_e:dbe:de?de?fdcddZVdedf ZWe9jXfdej;dge9fdhdiZYdej;dQee e%f djejZdkejZdleFdmeFdneeeF  dee:ef fdodpZ[dqdr Z\ddej;dteee:  due?fdvdwZ]d}dej;dteee:  fdxdyZ^ddeee:  dteee:  fdzd{Z-e_d|kre-  dS dS )a  
This converts GPT2 or T5 model to onnx with beam search operator.

Example 1: convert gpt2 model with beam search:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx

Example 2: convert gpt2 model with beam search containing specific cuda optimizations:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx --use_gpu                       --past_present_share_buffer --use_decoder_masked_attention

Example 3: convert gpt2 model with beam search with mixed precision and enable SkipLayerNorm strict mode:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx --use_gpu -p fp16 --use_sln_strict_mode

Example 4: convert T5 model with beam search in two steps:
    cd ./models/t5
    python convert_to_onnx.py -m t5-small
    cd ../..
    python convert_generation.py -m t5-small --model_type t5                                            --decoder_onnx ./models/t5/onnx_models/t5-small_decoder.onnx                                    --encoder_decoder_init_onnx ./models/t5/onnx_models/t5-small_encoder_decoder_init.onnx          --output ./models/t5/onnx_models/t5_small_beam_search.onnx

Example 5: convert T5 model with beam search. All in one step:
    python convert_generation.py -m t5-small --model_type t5 --output ./models/t5/onnx_models/t5_small_beam_search.onnx

Example 6: convert T5 model with beam search containing specific cuda optimizations. All in one step:
    python convert_generation.py -m t5-small --model_type t5 --output ./models/t5/onnx_models/t5_small_beam_search.onnx           --use_gpu --past_present_share_buffer --use_decoder_masked_attention

Example 7: convert MT5 model with external data file like mt5-base-beamsearch.onnx.data in below example.
    python convert_generation.py -m google/mt5-base --model_type mt5 --output mt5-base-beamsearch.onnx -e

Example 8: convert gpt2 model with greedy search:
    python convert_generation.py -m gpt2 --output gpt2_greedy_search.onnx --num_beams 1 --num_return_sequences 1

Example 9: convert gpt2 model with sampling:
    python convert_generation.py -m gpt2 --output gpt2_sampling.onnx --num_beams 1 --num_return_sequences 1 --top_p 0.6
    N)Enum)Path)AnyDictListOptionalUnion)	Precisionsetup_logger)NumpyHelper)
GraphProto
ModelProtoTensorProto)	OnnxModel)
GPT2ConfigGPT2LMHeadModelGPT2Tokenizer	MT5ConfigMT5ForConditionalGenerationT5ConfigT5ForConditionalGenerationT5Tokenizer)GraphOptimizationLevelInferenceSessionSessionOptionsget_available_providers)main)PRETRAINED_GPT2_MODELS)export_onnx_models)PRETRAINED_MT5_MODELSPRETRAINED_T5_MODELS c                   @   s    e Zd ZdZdZdZdd ZdS )GenerationTypebeam_searchgreedy_searchsamplingc                 C   s   | j S N)value)self r)   b/var/www/visachat/venv/lib/python3.10/site-packages/onnxruntime/transformers/convert_generation.py__str__U   s   zGenerationType.__str__N)__name__
__module____qualname__
BEAMSEARCHGREEDYSEARCHSAMPLINGr+   r)   r)   r)   r*   r"   P   s
    r"   argvreturnc                 C   s  t  }|d}|jdddtddtt t  d |jdd	td
g dddg d d |jdd	tt	j
dddd |jdd	tddd |jdd	tddd |jdd	ddd |jd	d |d}|jddtdd |jd d!d	ttjtjtjgd"d |jd#d$d	d%d&gd'd( |jd)d*d	dd+d |jd	d, |jd-d.d	dd/d |jd	d0 |jd1d2d	dd3d |jd	d4 |jd5d6d	dd7d |jd	d8 |jd9d:d	dd;d |jd	d< |d=}|jd>d	dd?d |jd	d@ |jdAd	ddBd |jd	dC |jdDd	ddE |jd	dF |jdGtd	dHdIdJ |jdKd	ddLd |jd	dM |jdNd	ddOd |jd	dP |jdQd	ddRd |jd	dS |jdTd	ddUd |jd	dV |jdWd	ddXd |jd	dY |jdZd	dd[d |jd	d\ |jd]d	dd^d |jd	d_ |d`}|jdatd	dbdcdJ |jddtd	dedfdJ |jdgtd	dhdidJ |jdjtd	dbdkdJ |jdltd	dbdmdJ |jdntd	dbdodJ |jdptd	dqdrdJ |jdstd	dqdtdJ |jdutd	tdv dwdJ |jdxtd	dbdydJ |jdztd	d{d|dJ |jd}td	dHd~dJ |jdtd	dddJ |jdtd	dddJ |jdtd	dddJ |d}|jdd	ddd |jd	d |jdd	ddd |jd	d |jdd	ddd |jd	d |jdd	ddd |jd	d |jdd	ddd |jd	d |jdd	tdbdd |jdd	ddd |jd	d || }|S )zParse arguments

    Args:
        argv (Optional[List[str]], optional): _description_. Defaults to None.

    Returns:
        argparse.Namespace: Parsed arguments.
    zInput optionsz-m--model_name_or_pathTzEPytorch model checkpoint path, or pretrained model name in the list: , )requiredtypehelpz--model_typeFgpt2)r9   t5mt5z*Model type (default is gpt2) in the list: )r6   r7   defaultchoicesr8   --cache_dir.cache_modelsz%Directory to cache pre-trained models)r6   r7   r<   r8   z--decoder_onnxr!   zLPath of onnx model for decoder. Specify it when you have exported the model.z--encoder_decoder_init_onnxzgPath of ONNX model for encoder and decoder initialization. Specify it when you have exported the model.z	--verbose
store_truezPrint more information)r6   actionr8   )verbosezOutput options--outputz,Output path for onnx model with beam search.z-p--precisionzTPrecision of model to run. fp32 for full precision, fp16 for half or mixed precisionz-b--op_block_list*autozDisable certain onnx operators when exporting model to onnx format. When using defaultvalue for gpt2 type of model fp16 precision, it will be set to ["Add", "LayerNormalization", "SkipLayerNormalization", "FastGelu"]. Other situation, it will be set to [])r6   nargsr<   r8   z-e--use_external_data_formatz!save external data for model > 2G)use_external_data_formatz-sz--run_shape_inferencezrun shape inference)run_shape_inferencez-dpvsz--disable_pad_vocab_sizezDo not pad logits MatMul weight to be a multiple of 8 along the dimension where dim value is the vocab size. The logits MatMul may hence be of poor performance for fp16 precision.)disable_pad_vocab_sizez-dsgdz,--disable_separate_gpt2_decoder_for_init_runzDo not create separate decoder subgraphs for initial and remaining runs. This does not allow for optimizations based on sequence lengths in each subgraph)*disable_separate_gpt2_decoder_for_init_runz-iz--disable_shared_initializerszdo not share initializers in encoder and decoder for T5 or in the init decoder and decoder for GPT2. It will increase memory usage of t5/mt5/gpt2 models.)disable_shared_initializersz6Beam search parameters that stored in the output modelz--output_sequences_scoreszoutput sequences scores)output_sequences_scoresz--output_token_scoreszoutput token scores)output_token_scoresz--early_stopping)r6   rB   )early_stoppingz--no_repeat_ngram_sizer   zNo repeat ngram size)r7   r6   r<   r8   z--vocab_maskz\Enable vocab_mask. This mask applies only to every generated token to filter some bad words.)
vocab_maskz--past_present_share_bufferzWUse shared buffer for past and present, currently work for gpt2 greedy/sampling search.)past_present_share_bufferz--use_decoder_masked_attentionzUses `DecoderMaskedSelfAttention` or `DecoderMaskedMultiHeadAttention` to optimize the decoding Attention computation. Must be used with `past_present_share_buffer`. Currently, only Attention head sizes of 32, 64 and 128 are supported.)use_decoder_masked_attentionz--prefix_vocab_maskzeEnable prefix_vocab_mask. This mask can be used to filter bad words in the first generated token only)prefix_vocab_maskz--custom_attention_maskz]Enable custom_attention_mask. This mask can be used to replace default encoder attention mask)custom_attention_maskz--presence_maskz!Presence mask for custom sampling)presence_maskz--seedzRandom seed for sampling op)seedzYBeam search parameters not stored in the output model, for testing parity and performancez--min_length   zMin sequence lengthz--max_length2   zMax sequence lengthz--num_beams   z	Beam sizez--num_return_sequencesz&Number of return sequence <= num_beamsz--length_penaltyz<Positive. >1 to penalize and <1 to encourage short sentence.z--repetition_penaltyz-Positive. >1 to penalize and <1 to encourage.z--temperature      ?z6The value used to module the next token probabilities.z--top_pzTop P for samplingz--filter_valueInfzFilter value for Top P samplingz--min_tokens_to_keepzAMinimum number of tokens we keep per batch example in the output.z--presence_penalty        z%presence penalty for custom sampling.z--customz&If 1 customized top P logic is appliedz--vocab_sizezIVocab_size of the underlying model used to decide the shape of vocab maskz--eos_token_idzKcustom eos_token_id for generating model with existing onnx encoder/decoderz--pad_token_idzKcustom pad_token_id for generating model with existing onnx encoder/decoderz0Other options for testing parity and performancez--use_sln_strict_modez_Enable strict mode for SLN in CUDA provider. This ensures a better accuracy but will be slower.)use_sln_strict_mode	--use_gpuz)use GPU for inference. Required for fp16.)use_gpuz--disable_parityzdo not run parity test)disable_parityz--disable_perf_testzdo not run perf test)disable_perf_testz--torch_performanceztest PyTorch performance)torch_performancez--total_runsz4Number of times of inference for latency measurementz--save_test_dataz-save test data for onnxruntime_perf_test tool)save_test_data)argparseArgumentParseradd_argument_groupadd_argumentstrjoinr   r    r   ospathset_defaultsr	   FLOAT32FLOAT16intfloat
parse_args)r2   parserinput_groupoutput_groupmodel_groupbeam_parameters_group
test_groupargsr)   r)   r*   parse_argumentsY   s~  	
		






r}   r|   c                 C   s   | j }d|d| jdd| jtjkrdndddd	d
dg}| jr%|d| jg | jr-|d | j	r5|d t
| jrF|dg || j | jtjkrS| jsSJ d| jr^td|  t|d dS )zqConvert GPT-2 model to onnx

    Args:
        args (argparse.Namespace): arguments parsed from command line
    r4   rD   z--optimize_onnxrE   fp32fp16z--test_runs1z--test_cases10z--overwriter>   rb   rJ   rF   zEfp16 or mixed precision model cannot run in CPU. Please add --use_gpuzarguments for convert_to_onnx:)r2   N)model_name_or_pathdecoder_onnx	precisionr	   rq   	cache_dirextendrc   appendrK   lenop_block_listrr   rC   loggerinfoconvert_gpt2_to_onnx)r|   
model_name	argumentsr)   r)   r*   gpt2_to_onnx  s8   


r   c                 C   s   t | j| jt| jj| j| j| jt	j
k| jdddddd| jd}td|d   td|d   |d | _|d | _dS )	znConvert T5 model to onnx

    Args:
        args (argparse.Namespace): arguments parsed from command line
    FT)rc   rK   optimize_onnxr   rC   use_decoder_start_tokenmerge_encoder_and_decoder_init	overwritedisable_auto_mixed_precisionuse_int32_inputs
model_typezonnx model for encoder: r   zonnx model for decoder: rZ   N)export_t5_onnx_modelsr   r   r   outputparentrc   rK   r   r	   rr   r   r   debugencoder_decoder_init_onnxr   )r|   pathsr)   r)   r*   
t5_to_onnx  s(   


r   T	onnx_pathrK   c                 C   sP   ddl m} tj| dd}|j|ddd}|r!tj|| |d d	S td d	S )
zShape inference on an onnx file, which will be overwritten.

    Args:
        onnx_path (str): Path of onnx model
        use_external_data_format(bool): output tensors to external data or not.
    r   )SymbolicShapeInferenceTload_external_dataF)
auto_mergeguess_output_ranksave_as_external_dataz4Failed to run symbolic shape inference on the model.N)	&onnxruntime.tools.symbolic_shape_inferr   onnx
load_modelinfer_shapesr   saver   warning)r   rK   r   modeloutr)   r)   r*   shape_inference(  s   r   c                 C   s  t j| dd}|jjd j}t|}| }||v sJ || }|jdkr'dS d}||j	d }|du rR|
|dd}	|	du rBdS ||	j	d }|du rPdS d}|jtjjkr[dS t|jd	krddS |jd }
|
d
 dkrqdS t|
d
 d
 }||
 }|jr|rtj|jd |ftjd}tjt||fdd}||jd< ntj||jd ftjd}tjt||fdd}||jd< | |_ndS tj|| |d dS )zPad the logits MatMul weight in the provided decoder model, which will be overwritten.

    Args:
        onnx_path (str): Path of onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   MatMulFrZ   N	Transpose      dtypeaxisr   )r   r   graphr   namer   output_name_to_nodeop_typeget_initializerinputmatch_parent	data_typer   DataTyperr   r   dimsmathceilraw_datanpzerosfloat16concatenater   to_arraytobytesr   )r   rK   decoder_model_protologits_output_namedecoder_modelr   matmul_nodepad_along_axis_1logits_weighttranspose_before_matmulactual_vocab_sizepadded_vocab_sizepaddingpadding_dataweight_with_paddingr)   r)   r*   pad_weights_of_logits_matmul:  sN   


r   
model_pathrc   ra   c                    sx   t  }tj|_|rddgndg}|r3dt vrtdtd |r3ddi}d|i  fdd|D }t| ||d	}|S )
a  Create OnnxRuntime session.

    Args:
        model_path (str): onnx model path
        use_gpu (bool): use GPU or not
        use_sln_strict_mode (bool): use strict mode for skip layer normalization or not

    Raises:
        RuntimeError: CUDAExecutionProvider is not available when --use_gpu is specified.

    Returns:
        onnxruntime.InferenceSession: The created session.
    CUDAExecutionProviderCPUExecutionProviderz5CUDAExecutionProvider is not available for --use_gpu!zuse CUDAExecutionProvider"enable_skip_layer_norm_strict_modeTc                    s$   g | ]}| v r| | fn|qS r)   r)   ).0r   provider_optionsr)   r*   
<listcomp>  s    z&create_ort_session.<locals>.<listcomp>)	providers)	r   r   ORT_DISABLE_ALLgraph_optimization_levelr   RuntimeErrorr   r   r   )r   rc   ra   sess_optionsexecution_providerscuda_provider_optionsort_sessionr)   r   r*   create_ort_session  s   


r   r   r   c              	   C   s  |t jk}t| j}|d }|dksJ g ddd t|D  }t| jt|kr9tdt| dt| j t|D ]E\}}| j| j|krZtd| d	| d| j| j tj	}|dkri|rftjntj
}| j| jjj}	|	|krtd| d
| d|	 q=td dgdd t|D  }
t| jt|
krtdt|
 dt| j t|
D ]>\}}| j| j|krtd| d	| d| j| j |rtjntj
}| j| jjj}||krtd| d
| d| qtd dS )a  Verify GPT-2 subgraph

    Args:
        graph (onnx.GraphProto): onnx graph of GPT-2
        precision (Precision): Precision (FLOAT16 or FLOAT32) of the model.

    Raises:
        ValueError: Number of inputs not expected.
        ValueError: Input name is not expected.
        ValueError: Input data type is not expected.
        ValueError: Number of outputs not expected.
        ValueError: Output name is not expected.
        ValueError: Output data type is not expected.
       rZ   )	input_idsposition_idsattention_maskc                 S      g | ]}d | qS )past_r)   r   ir)   r)   r*   r         z(verify_gpt2_subgraph.<locals>.<listcomp> Number of inputs expected to be . Got Input  is expected to be $ is expected to have onnx data type z:Verifying GPT-2 graph inputs: name and data type are good.logitsc                 S   r   )present_r)   r   r)   r)   r*   r     r   !Number of outputs expected to be Output z;Verifying GPT-2 graph outputs: name and data type are good.N)r	   rr   r   r   range
ValueError	enumerater   r   INT32FLOATr7   tensor_type	elem_typer   r   r   )r   r   
is_float16input_countlayer_countexpected_inputsr   expected_inputexpected_type
input_typeexpected_outputsexpected_outputoutput_typer)   r)   r*   verify_gpt2_subgraph  s>   

"
"
r  c              	   C   s8  |t jk}|r
tjntj}t| j}|d d }|dksJ ddg}t|D ]}|d|  |d|  q&t|D ]}|d|  |d	|  q=t| jt|krhtd
t| dt| j t	|D ]?\}}| j| j
|krtd| d| d| j| j
 |dk rtjn|}	| j| jjj}
|
|	krtd| d|	 d|
 qldg}t|D ]}|d|  |d|  qt| jt|krtdt| dt| j t	|D ]7\}}| j| j
|krtd| d| d| j| j
 | j| jjj}||krtd| d| d| qdS )  Verify T5 decoder subgraph

    Args:
        graph (onnx.GraphProto): onnx graph of T5 decoder
        precision (Precision): Precision (FLOAT16 or FLOAT32) of the model.

    Raises:
        ValueError: Number of inputs not expected.
        ValueError: Input name is not expected.
        ValueError: Input data type is not expected.
        ValueError: Number of outputs not expected.
        ValueError: Output name is not expected.
        ValueError: Output data type is not expected.
    r   r\   rZ   r   encoder_attention_maskpast_key_self_past_value_self_past_key_cross_past_value_cross_r   r   r   r   r   r   present_key_self_present_value_self_r   r   N)r	   rr   r   r   r   r   r   r   r   r   r   r   r7   r   r   r   )r   r   r  
float_typer  r  r  r   r  r  r  r  r	  r
  r)   r)   r*   verify_t5_decoder_subgraph  sH   

""
r  c              	   C   s  |t jk}t| jd d }|dksJ g d}t| jt|kr0tdt| dt| j t|D ]9\}}| j| j|krQtd| d| d| j| j tj	}| j| j
jj}||krmtd| d	| d| q4d
dg}	t|D ]}|	d|  |	d|  qvt|D ]}|	d|  |	d|  qt| jt|	krtdt|	 dt| j t|	D ]>\}}
| j| j|
krtd| d|
 d| j| j |rtjntj}| j| j
jj}||krtd| d	| d| qtd dS )r  r   r\   rZ   )encoder_input_idsr  decoder_input_idsr   r   r   r   r   r   encoder_hidden_statesr  r  present_key_cross_present_value_cross_r   r   zMT5 encoder graph verified: name and data type of inputs and outputs are good.N)r	   rr   r   r   r   r   r   r   r   r   r7   r   r   r   r   r   r   r   )r   r   r  r  r  r   r  r  r  r  r	  r
  r)   r)   r*   'verify_t5_encoder_decoder_init_subgraph-  s@   
""r  shared_   graph1graph2shared_prefixmin_elementssignature_cache1signature_cache2c                 C   s  i }i }g }g }	g }
| j D ]N}|jrt|j|ksq|j D ]=}|jr)t|j|ks*qt||||rZ||j ||j< || |j|vrX||j }|||j< |	| |
|  nqqtd|
  | j	D ]}t
t|jD ]}|j| |
v rtd|j|  qpqg|j	D ]}t
t|jD ]}|j| |
v rtd|j|  qq|	D ]}|j | q|jD ]}|j|v r||j |_q|j	D ]4}t
t|jD ]*}|j| |v r||j|  }td|j d| d|j|  d|  ||j|< qq|D ]}| j | q| jD ]}|j|v r||j |_q| j	D ]7}t
t|jD ],}|j| |v rO||j|  }td|j d| d|j|  d|  ||j|< q$q|	D ]	}||j |_qU|	D ] }tj|j}tj|j|j|}| j| |j| qa|	S )	a  Remove initializers with same value from two graphs.

    Args:
        graph1 (GraphProto): the first graph to process
        graph2 (GraphProto): the second graph to process
        shared_prefix (str): add prefix to the shared initializers among two graphs
        min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.
        signature_cache1 (dict): Optional dictionary to store data signatures of tensors in graph1 in order to speed up comparison
        signature_cache2 (dict): Optional dictionary to store data signatures of tensors in graph2 in order to speed up comparison
    zshared initializers:zname is found in graph 1: zname is found in graph 2: zgraph 2 rename node z input z from z to zgraph 1 rename node )initializerr   sumr   has_same_valuer   r   r   r   noder   r   r   r   remove
value_infor   numpy_helperr   shapehelpermake_tensor_value_infor   )r  r  r   r!  r"  r#  mapping_initializers_1mapping_initializers_2shared_initializers_1shared_initializers_2shared_initializers_namesinitializer1initializer2shared_namer'  jr$  r)  new_namer+  r)   r)   r*   remove_shared_initializersq  s   












*


*
r8  encoder_modelr   c                 C   s`   t | }t |}|d |d i i }}|| || t|jj|jjd||d}|S )Ne_d_s_)r   r"  r#  )r   add_prefix_to_namesremove_duplicated_initializerr8  r   r   )r9  r   encoderdecoderr"  r#  initializersr)   r)   r*   get_shared_initializers  s   




rB  c                 C   s   g }| j D ]}|jrt|j|ksq|| q|D ]}| j | q|D ]}tj|j}tj	
|j|j|}| j| q%|S )a^  Remove initializers of a graph, when they have number of elements larger than a threshold.

    Args:
        graph (GraphProto): the graph.
        min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.

    Returns:
        List[TensorProto]: initializers that are removed from the graph.
    )r$  r   r%  r   r(  r   r*  r   r+  r,  r-  r   r   r)  )r   r!  moved_initializerstensorr$  r+  r)  r)   r)   r*   move_initializers  s   
rE  c                 C   s   | j dkrtd| j d| j dkr| j}n^| j dkr | j}nU| j dkr)| j}nL| j dkr2| j}nC| j dkr;| j}n:| j d	krD| j}n1| j d
krM| j	}n(| j dkrV| j
}n| j dkr_| j}n| j dkrh| j}ntd| j d| j  d| j|fS )z
    Convert attribute to kwarg format for use with onnx.helper.make_node.
        :parameter attribute: attribute in AttributeProto format.
        :return: attribute in {key: value} format.
    r   z
attribute z does not have type specified.rZ   r   r   r\            r   	   
   z has unsupported type r?   )r7   r   r   fr   stgfloatsintsstringstensorsgraphs)	attributer'   r)   r)   r*   _attribute_to_pair  s0   











rU  c                 C   sD   i }| j D ]}t|\}}|||i q| jr |d| ji |S )Ndomain)rT  rU  updaterV  )r'  kwargsattrkeyr'   r)   r)   r*   	kwargs_of1  s   
r[  c                 C   s   t dd | jjjjD S )Nc                 S   s   g | ]}|j r
|j n|jqS r)   )	dim_param	dim_value)r   dr)   r)   r*   r   <  s    zshape_of.<locals>.<listcomp>)tupler7   r   r+  dim)vir)   r)   r*   shape_of;  s   rb  subgc              
   C   s  d}d}g }t | jD ],\}}||kr1t|}tjj|j|jjj	|d |d |d d|d gd}|
|g q|
tjjdtjjdgd	g | d
 | j
| g }t | jD ],\}}||krt|}tjj|j|jjj	|d |d |d d|d gd}|
|g qZ| d | j
| g }| jD ]N}	|	jdkrt|	}
|
ddi g }|
|	j t|dk r|
dg t|dk st|dk r|
dg tjjd||	jfd|	ji|
}	|
|	g q| d | j
| | S )Nr   rZ   r   r   max_seq_lenr\   r   r+  past_sequence_lengthr+  r   r   	AttentionrT   rG  r!   rH  r   r'  )r   r   rb  r   r,  r-  r   r7   r   r   r   r   r   
ClearFieldr   r'  r   r[  rW  r   	make_node)rc  input_past_0output_past_0
new_inputsr   ra  r+  new_outputs	new_nodesr'  rX  nisr)   r)   r*   1update_decoder_subgraph_past_present_share_buffer?  sX    



 
rq  is_beam_searchswitch_attentionc                 C   s  |r@g }t | jD ]
\}}||g q	|tjjdtjjdgdg |tjjdtjjg ddg | d | j| |rg d}g }| j	D ]x}|j
dkrt|}	|	 D ]}
|
d	krd  d
S |
|vrx|
dkrutd|
 d |	|
= qZg }||j |rt|dk r|dg t|dk st|dk r|dg t|dk r|dg tjjd||jfd|ji|	}||g qK| d | j	| dS )aS  Update the Attention nodes to DecoderMaskedSelfAttention.

    Args:
        subg (GraphProto): GraphProto of the decoder subgraph
        is_beam_search (bool): Boolean specifying if the sampling algo is BeamSearch
        switch_attention (bool): Boolean specifying if `Attention` is to be switched with `DecoderMaskedSelfAttention`
    
beam_widthrZ   rg  cache_indirection
batch_sizert  rd  r   rT   	num_headsscalemask_filter_valuerV  rh  qkv_hidden_sizesFunidirectionalzRemoving attribute: zB from Attention node while switching to DecoderMaskedSelfAttentionrH  r!   r   rI  DecoderMaskedSelfAttentionr   r'  T)r   r   r   r   r,  r-  r   r   ri  r'  r   r[  copyr   r   r   rj  r   r   )rc  rr  rs  rm  _ira  'decoder_masked_attention_supported_attrro  r'  rX  krp  r)   r)   r*   4update_decoder_subgraph_use_decoder_masked_attentiono  sd   
 




r  c                 C   s  t  }g }dd t| jD }i }i }| jD ]'}|jD ]}|r0||vr)|g||< q|| | q|jD ]}|r<|||< q4q| jD ]}|jdkr|jd rR|jd sSqA|jd |jd }	}
d}| jD ]}|j|
krn|} nqc|du rtqAt	j
|}|jdkr| dkr|jd |v r||	 }|jdkr|jd r|jd |v r|jd d	s|jd d
r||jd  || t||jd  dkr|| qA||fS )az  Correct graph which originally use dim of past_seq_len from input_ids's shape which is fixed to max_seq_len after
       shared past/present buffer

    Args:
        subg (GraphProto): GraphProto of the decoder subgraph
    return:
        tensor_names_to_rename : set of tensor names which is equal to past_sequence_length
        nodes_to_remove : list of node to remove
    c                 S   s   i | ]\}}|j |qS r)   r   )r   indexinpr)   r)   r*   
<dictcomp>  s    z+find_past_seq_len_usage.<locals>.<dictcomp>GatherrZ   r   Nr   Shaper  r  )setr   r   r'  r   r   r   r$  r   r   r*  r   sizeitem
startswithaddr   )rc  tensor_names_to_renamenodes_to_removegraph_input_namesinput_name_to_nodesr   r'  
input_nameoutput_nameshape_tensor_nameshape_index_nameini_gather_indicesrD  gather_indices_arr
shape_noder)   r)   r*   find_past_seq_len_usage  s^   







$


r  rZ   r`   r   	attn_maskkv_num_heads
world_sizewindow_sizec           1      C   sP  |  tjjdtjdgdgd tjjd|dg|d g| dd}tjjd|d dgdg| dd}tjjd	dgd
g| d	tjd}tjjd|g|d g| dd}tjjd|d dgdg| ddd}	tjjd	dgdg| d	tjd}
| j	j
j|||||	|
g ttdd | j	j
j}t|D ]\}}| |g dg d}| |ddgddg}d\}}}|d ur|\}}}n|d ur|\}}| |g dg d}| |ddgddg}d\}}}|d ur|\}}}n|d ur|\}}| |ddgddg}| |dgdg}d\}}|d ur|\}}n	|d ur"|d }d}|d ur>|d ur>|jD ]}|jdkr<|j}q1d}|jD ]}|jdkrN|j}qC|jd |jd kod|jd |jd k}|d uor|d uor|d u} |d u o|d u o|d u }!d \}"}#}$|r| s|!rt| |jd }%t| |jd }&t| |jd }'|%jd! }(tj|%|&|'fdd"|(d#|( })tjj|)d$| d%})|  |) tjjd|jd |)jg|)j d&g| dd}*| j	j
j|*g | j	j
j| | j	j
j| | j	j
j| |*jd }"| rt| |jd }+t| |jd },t| |jd }-|+jd! }(tj|+|,|-fdd"d#|( }.tjj|.d'| d%}.|  |. tjjd|*jd |.jg|.j d&gd(}/| j	j
j|/g | j	j
j| | j	j
j| | j	j
j| |/jd }"n|jd }"|jd }#|jd }$tjjd)|"|#|$|jd* |jd+ |jd |
jd |d ur|jd nd,|d ur|jd# nd,g	|j|j d-d)d.|| |dkr|| n|| |t!|d uo|d u|d/
}0| j	j
j| | j	j
j|0g |d ur| j	j
j| |d ur%| j	j
j| q| S )0NonerZ   r   r   r   vals	ReduceSum	_row_sumsinputsoutputsr   Subseqlens_k_int64Cast	seqlens_k)r  r  r   tor  _shaper  total_seq_len_int64r   )r  r  r   r   total_seq_lenc                 S   s
   | j dkS )NMultiHeadAttention)r   )r'  r)   r)   r*   <lambda>V  s   
 z&replace_mha_with_gqa.<locals>.<lambda>)RotaryEmbeddingAddr   )r   r   r   r  r   )NNN)rZ   r   r   r  r   NNinterleavedry  )r!   r!   r!   r`   r   r   QKV_Weight_r  _output	QKV_Bias_)r  r  GroupQueryAttentionrG  rH  r!   r  com.microsoft)	r  r  r   rV  ry  r  local_window_size	do_rotaryrotary_interleaved)"add_initializerr   r,  make_tensorr   INT64rj  create_node_namer   r   r   r'  r   listfilterr   match_parent_pathrT  r   r   r   r   r   r   r+  r   stackreshaper*  
from_arrayr(  r   replacers   )1r   r  r  r  r  reduce_sum_nodesub_nodeseqlen_k_cast_noder  gather_nodetotal_seqlen_cast_node	mha_nodesidxr'  q_path_1q_path_2q_rotaryq_addq_matmulk_path_1k_path_2k_rotaryk_addk_matmulv_path_1v_path_2v_addv_matmulr  attry  root_input_is_sameall_paths_have_biasall_paths_have_no_biasq_input_to_attentionk_input_to_attentionv_input_to_attentionqwkwvwr`  
qkv_weightpacked_matmul_nodeqbkbvbqkv_biaspacked_add_nodegqa_noder)   r)   r*   replace_mha_with_gqa  s<  









*

 









r  c              	      s  d}dd j D }|dk r$|| ds$|d7 }|dk r$|| drd}tj| d }d| |   fddt|D }td	|  tj   }td
|  |d }|d }|d }	d}
jD ]_}|jdkr|j d |v rtd|j	 d|j  |
d7 }
||j d  }d| }dgdt|j  }|
| |j| |jtjddg tj|tj||d|	g}j|g qg|
|krtd| d|
 d S )NrZ   c                 S      g | ]}|j qS r)   r  r   gir)   r)   r*   r         zBupdate_decoder_subgraph_output_cross_attention.<locals>.<listcomp>r   pastr   c                    s"   i | ]}j |d     j|qS )r   )r   r   )r   layerinput_cross_past_0rc  r)   r*   r    s   " zBupdate_decoder_subgraph_output_cross_attention.<locals>.<dictcomp>z    --past_key_cross_inputs=zpast_key_cross_0_shape is r   DecoderMaskedMultiHeadAttentionz'    -- add cross QK output from: node: z with output: output_cross_qk_r!   	output_qkz#Did not add cross QK for all layersz vs )r   r  r   r   r   printrb  r'  r   r   r   r   rT  r   r,  make_attributer-  r   r   r   )rc  input_self_past_0r  output_self_present_0
num_layerspast_key_cross_inputsinput_past_key_cross_0_shapebatch_size_dimnum_heads_dimcross_seq_len_dimnum_layer_output_qkr'  r  cross_attention_out_nameappended_namescross_attentionr)   r  r*   .update_decoder_subgraph_output_cross_attention  sD   


r	  c              	   C   s"  d}dd | j D }|dk r$|| ds$|d7 }|dk r$|| drd}tt| j | d }d| | }g }g }| jD ]}|jdkrK||g q>t||k rTd	S d }	| jD ]}|jd
krd|}	 nqYg d}
d}t| \}}t|dkr|D ]}td| d|  qy|D ]}td|j d|j	  qt
jjddgdgdd}t
jjddg|gdtjd}|||g | jD ]}t|jdkr|	d ur|jd |	j d krt
jjddgdgdtjd}|jd |j d< ||g |jdkrt|}| D ]
}||
vr||= q|j d |j d |j d g}|t|j dkr$|j d ndg |t|j dkr7|j d ndg |t|j dkrJ|j d ndg |t|j dkr]|j d ndg |dg |d g |d!g |t|j dkr|j d ndg d|d"< t
jjd#||jfd$|j	i|}||vrt|j D ]\}}||v r||j |< q||g q| d% | j| d&d | j D }g }t| j D ]0\}}||kr||k rt|}t
jj|j	|jjj|d |d d'|d gd(}||g qd|vr|t
jjdt
jjdgd)g d |vr0|t
jjd t
jjdgd)g d!|vrF|t
jjd!t
jjg d*d)g | d+ | j | g }t| jD ]+\}}||kr|t|}t
jj|j	|jjj|d |d d'|d gd(}||g qX| d, | j| d-S ).NrZ   c                 S   r  r)   r  r  r)   r)   r*   r     r  zSupdate_decoder_subgraph_share_buffer_and_use_decoder_masked_mha.<locals>.<listcomp>r   r  r\   r   r  FRelativePositionBiasrx  #past_sequence_length_squeezed_int64r   zFound tensor name z to be renamed to zFound node to removed: type:z, name:Squeezerf  past_sequence_length_squeezed!node_past_sequence_length_squeezer  r  &node_past_sequence_length_squeeze_cast)r   r  past_sequence_length_int64past_sequence_length_castr!   rF  rG  rH  rt  ru  rT   r  r   r'  c                 S   r  r)   r  )r   r  r)   r)   r*   r   q  r  rd  re  rg  rv  r   r   T)r   r  rs   r   r'  r   r   r  r  r   r   r,  rj  r   r  r   r[  r  r   ri  rb  r-  r7   r   r   r   )rc  r  r  output_self_past_0r  r  ro  	old_nodesr'  rel_pos_bias_noder  target_squeezed_past_seq_namer  r  name_to_renamenrsqueeze_node	cast_noderX  r  rp  r  r   orig_input_namesrm  r   ra  r+  rn  r)   r)   r*   ?update_decoder_subgraph_share_buffer_and_use_decoder_masked_mha  s  




*
&&&&&





 



r  model_protoc                 C   s  t | }| }g }g }| D ]}|jdkrd|jd v r&d|jd v r&q||jd  }||jd  }||jd  }||jd }	||jd }
||jd }|	rY|
rY|s\ dS t|	}t|
}t|}tj	|||gdd}|j
d	d
d}tjj|d |	jdkrtjntj|jd |jd g|  d}| jj|g tjjd	|jd |d g|d g|d}|jd |jd< d|jd< d|jd< ||g ||||g q|| || |  |  dS )Nr  past_key_crossrZ   past_value_crossr   r   Fr   r   
MatMul_QKV)name_prefix_weightr  _outr  r!   T)r   r   nodesr   r   r   r   r   r   r   r  r   r,  r  r   r   r   rr   r+  flattentolistr   r$  r   rj  r   	add_nodesremove_nodesupdate_graphtopological_sort)r  
onnx_modelr   nodes_to_addr  r'  r  r  r  q_weightk_weightv_weightr  r  r  r  matmul_node_nameweightr   r)   r)   r*   pack_qkv_for_decoder_masked_mha  sZ   








r1  decoder_onnx_pathc                 C   s   t j| dd}tt|jjD ],}|jj| jdks#|jj| jdkr;|jj| jjj	j
d }|dr8|  d|_qtj|| |d dS )aQ  Update the input shapes for the inputs "input_ids" and "position_ids" and make the sequence length dim value 1 for each of them.
       The decoder model will be over-written.

    Args:
        decoder_onnx_path (str): Path of GPT-2 decoder onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   r   rZ   r\  r   )r   r   r   r   r   r   r   r7   r   r+  r`  HasFieldClearr]  r   r   )r2  rK   r   r   shape_dim_protor)   r)   r*   *update_input_shapes_for_gpt2_decoder_model  s   	
r6  init_decoder_onnx_pathc                 C   s  t j| dd}|jjd j}t|}| }||v sJ || }|jdkr'dS ||g dg d}|du rA||g d	g d
}|du r_||g dg d}|du r_||g dg d}|du redS |d }	|	jdk}
|
sd}||	g d|dddg}|du rd}||	g d|dddg}|du rd}||	g d|ddg}|du rd}||	g d|ddg}nBd}||	g d|ddg}|du rd}||	g d|ddg}|du rd}||	ddg|dg}|du rd}||	ddg|dg}|du rdS |dkrdnd}|
s|	|	d|}n|	|	d|}|du r!dS |d }|d }t j
jdtjdgdgd}t j
jdtjdgdgd}t j
jdtjdgdgd}t j
jdtjdgdgd}|| || || || d|jd  }t j
jd|jd ddddg|g|ddd }|
s|jd n|jd! }d|jd  }t j
jd|ddddg|g|dd"d }|| || |||jd | ||	|| |  tj|||d# dS )$a  Generates the initial decoder GPT2 subgraph and saves it for downstream use.
       The initial decoder model will be saved to init_decoder_onnx_path.

    Args:
        decoder_onnx_path (str): Path of GPT-2 decoder onnx model
        init_decoder_onnx_path (str): Path of GPT-2 init decoder onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   r   F)r  LayerNormalizationr  r  r  r   r  FastGelur  r   r  r8  r  )r   r   r   rZ   r   r   r   r   r   r   r   r   r   N)
r  SkipLayerNormalizationr  r   r  r9  r  r   r  r:  )
r   r   rZ   r   r   r   r   r   r   r   )r8  r  r  r   r9  r   r8  r  )r   r   rZ   r   r   r   r   r   )r:  r   r9  r   r:  )r   rZ   r   r   r   r`   r:  )r  r  r   rh  rZ   )r  r   rh  )r  r   rh  rh  r  SliceLastTokenStartsr  SliceLastTokenEndsSliceLastTokenAxesSliceLastTokenStepsedge_modified_SliceGatherLastToken_0_r  r   GatherLastToken_1_r   )r   r   r   r   r   r   r   r   r  r   r,  r  r   r   r  rj  r  add_nodereplace_node_inputr)  r   )r2  r7  rK   init_decoder_model_protor   gpt2_init_decoder_modelr   logits_matmul_node"logits_matmul_to_residual_add_pathresidual_add_nodeis_skiplayernorm_path&residual_add_to_attention_parent_indexresidual_add_to_attention_path residual_add_to_add_parent_indexadd_before_residual_add	attentionmatmul_after_attentionslice_starts
slice_ends
slice_axesslice_stepsslice_0_output_nameslice_node_0add_before_residual_add_outputslice_1_output_nameslice_node_1r)   r)   r*   generate_gpt2_init_decoder  s0  











r[  c           	      C   s   t d}t |j}t |j}t |j}| jjD ]%}|jjjj	D ]}|
dr;|j||||fv r;t|j}|  ||_qq| jjD ]%}|jjjj	D ]}|
dre|j||||fv ret|j}|  ||_qIqAdS )zoMake dim_proto numeric.

    Args:
        model: T5 encoder and decoder model.
        config: T5 config.
    rZ   r\  N)rl   ry  d_modeld_kvr   r   r7   r   r+  r`  r3  r\  rs   r4  r]  r   )	r   configsequence_lengthry  hidden_size	head_sizerD  	dim_protor]  r)   r)   r*   make_dim_proto_numeric_t5  s>   




rc  generation_typec           ,      C   s  | j dk}|tjk}|tjk}|tjk}| j}td|  t| j	dkrL| j	d dkrL|rI| j
tjkrIg d| _	td| j	  td ng | _	|sP|rd|sVtd	| jr]td
| jrdtd|ro|ro| jsotd| jrx|sxtd| jr| jstd|r| jrtj| jrtd| j  nS| jsd| j| j
tjkrdnd}tt| jj| | _td| j d| j d t|  n"| jr| jrtd| j d| j  ntd| j d t|  d}| j s| j
tjkr|r|s|s|rtd| j d t!| j| j"}|st#d d}	d}
| j$ss|rs|s1|s1|rstd| j d  d!| j
tjkrFdnd}tt| jj| }
t%| j|
| j"}	|	sdt#d" |	rst&| j| j"sstd#|s}| j's}|	rtd$| j d t(| j| j" |	rtd$|
 d t(|
| j" |rt)j*| j| j+d%}n| j d&krt,j*| j| j+d%}n	t-j*| j| j+d%}| j.rtd'|  |j/}|r|j/n|j0}|j1}| j1d(kr| j1}| j/d(kr| j/}| j0d(kr| j0}t2j3| jd)d*}| j  d+|j4_5d}| j dkr5t6|j4| j
 |	r4t2j3|
d)d*}| j  d,|j4_5t6|j4| j
 nt7|j4| j
 d}|rFg d-}n
|sL|rPg d.}| j8rZ|9d/ n|9d0 | j:ri|9d1 n|9d0 | j;rx|9d2 n|9d0 |r| j<r| j=r|9d3 n|9d0 | j>r|9d4 d5g}| jr|9d6 | jr| jsJ d7|9d8 d}|rt2j?j@d9||d:| j  d;}n#|rt2j?j@d<||d=| j  d;}n|rt2j?j@d>||d?| j  d;}d@|_Ad}|r(t2j?BdA|t2j?BdB|t2j?BdC| jCt2j?BdD| jDrdndt2j?BdE| j dkr#dndg}nw|rOt2j?BdA|t2j?BdB|t2j?BdE| j dkrCdndt2j?BdC| jCg}nP|rt2j?BdA|t2j?BdB|t2j?BdE| j dkrjdndt2j?BdC| jCt2j?BdF| jEt2j?BdG| jFt2j?BdH| jGt2j?BdI| jHt2j?BdJ| j<t2j?BdK| jIg
}|r|Jt2j?BdL|g |jKJ| g }| j dMv rd| j'rtdN| j d t(| j| j" t2j3| jd)d*}| j  dO|j4_5tL|j4| j
 tM|| tM|| |r"| jstdPtdQ tN|j4rtdR ntdS tO|rtdT ntdU | jPs=tQ||}tt| dVdWdX |D  dY |jKJt2j?BdZ|j4t2j?Bd[|j4t2j?Bd\t|j4jRd]kr^|jSnd(g n|	r| jPstQ||}tt| dVd^dX |D  d_ |rtd` tT|j4 | jrtU|j4|dstda|jK9t2j?Bdb|j4 ntV|j4}tt| dc |rtdd tT|j4 | jrtU|j4|d)stde|jK9t2j?Bd[|j4 t2j?WdftXjYdgdhg}t2j?WditXjYdg}t2j?WdjtXjYdg}t2j?WdktXjYdg}t2j?WdltXjYdg}t2j?WdmtXjZdg}t2j?WdntXjZdg}d}|r:|||||||g}n|s@|rF||||g}| j8rYt2j?Wd/tXjY|g} |9|  | j:rmt2j?Wd1tXjYdg|g}!|9|! | j;rt2j?Wd2tXjYdgdhg}"|9|" | j<r| j=rt2j?Wd3tXjYdg|g}#|9|# |r| j>rt2j?Wd4tXjYdg}$|9|$ d}%|rt2j?Wd5tXjYg do}%n|s|rt2j?Wd5tXjYdgdig}%|%g}&| jrt2j?Wd6tXjZdgdlg}'|&9|' | jrt2j?Wd8tXjZdpdgdk|g}(|&9|( t2j?[|g|s| j  dqn| j  dr||&|})t2j?j\|)ds|j]dt}*| j"rFddul^m_}+ |+`t2ja|+`dvk r;t#dw tbjc|*| jd)d)dx nt2c|*| j tdy| j  dS )zzConvert model according to command line arguments.

    Args:
        args (argparse.Namespace): arguments parsed from command line
    r9   z**** past_present_share_buffer=rZ   r   rH   )r  r8  r:  r9  z**** Setting op_block_list to zI**** use --op_block_list if you want to override the block operator list.z<Currently only gpt2 with greedy search/sampling is supportedzLoutput_sequences_scores currently is not supported in greedy search/samplingzHoutput_token_scores currently is not supported in greedy search/samplingzi`use_decoder_masked_attention` MUST be turned on to use `past_present_share_buffer` in case of BeamSearchzS`past_present_share_buffer` MUST be turned on to use `use_decoder_masked_attention`z?`use_decoder_masked_attention` option is only supported on GPUsz)skip convert_to_onnx since path existed: z{}_past_{}.onnxr   r~   zConvert GPT model z	 to onnx z ...z,skip convert_to_onnx since paths specified: z and zConvert model z to onnx ...Fz=Pad logits MatMul weights for optimal MatMul perf in fp16 on z. The file will be overwritten.z]Tried and failed to pad logits MatMul weights. Performance may be sub-optimal for this MatMulNz*Creating an initial run GPT2 decoder from z. zgpt2_init_past_{}.onnxzuTried and failed to generate the init decoder GPT2 model. Performance may be sub-optimal for the initial decoding runzGCould not update the input shapes for the non-initial decoder subgraph.z Run symbolic shape inference on r   r:   zConfig=r`   Tr   z decoderz init decoderr   
max_length
min_length	num_beamsnum_return_sequenceslength_penaltyrepetition_penaltyr   rg  rh  rl  rS   r!   rV   r   rX   rY   	sequencessequences_scoresz8--output_token_scores requires --output_sequences_scoresscores
BeamSearchBeamSearch_r  GreedySearchGreedySearch_Sampling	Sampling_r  eos_token_idpad_token_idno_repeat_ngram_sizerR   r   temperaturetop_pfilter_valuemin_tokens_to_keepcustompresence_penalty
vocab_sizer:   r;   zSymbolic shape inference on z encoder and decoder initzMpast_present_share_buffer is only supported with use_decoder_masked_attentionzl*****update t5 decoder subgraph to share past/present buffer and use decoder_masked_multihead_attention*****z4*****update t5 decoder subgraph successfully!!!*****zF*****DecoderMaskedMultiHeadAttention is not applied to T5 decoder*****z9*****pack qkv for decoder masked mha successfully!!!*****z3*****pack qkv for decoder masked mha failed!!!*****z shared initializers (c                 S   r  r)   r  r   r)   r)   r*   r   c	  r  z,convert_generation_model.<locals>.<listcomp>z>) in encoder and decoder subgraphs are moved to the main graphr?  r@  decoder_start_token_idr   c                 S   r  r)   r  r   r)   r)   r*   r   	  r  zC) in decoder and init decoder subgraphs are moved to the main graphzY*****update init decoder subgraph to make past and present share buffer******************zLCould not update the init decoder subgraph to use DecoderMaskedSelfAttentioninit_decoderz: initializers from the decoder are moved to the main graphzT*****update decoder subgraph to make past and present share buffer******************zGCould not update the decoder subgraph to use DecoderMaskedSelfAttentionr   rw  r_  rg  rh  ri  rj  rk  rl  )rw  rj  rg  zmax_length - sequence_lengthz beam searchz greedy searchzonnxruntime.transformers)producer_nameopset_imports)versionz1.12.0z0Require onnx >= 1.12 to save large (>2GB) model!)r   all_tensors_to_one_filezmodel save to )dr   r"   r/   r0   r1   rT   r   r   r   r   r   r	   rr   NotImplementedErrorrP   rQ   rU   r   rc   r   rn   ro   existsformatr   r   r   r   as_posixr   r   r   rM   r   rK   r   rN   r[  r6  rL   r   r   from_pretrainedr   r   r   rC   rw  rx  r  r   r   r   r   r  r  rS   r   rV   rW   r~  rX   rY   r,  rj  rV  r  ry  rR   rz  r{  r|  r}  r  r   rT  r  rc  r  r1  rO   rB  r   r  rq  r  rE  r-  r   r   r   
make_graph
make_modelopset_import	packagingr  parse__version__r   r   ),r|   rd  is_gpt2is_beamsearchis_greedysearchis_samplingrT   onnx_filenamelogits_matmul_weight_paddedgpt2_init_decoder_generatedgpt2_init_decoder_onnx_pathgpt2_init_decoder_onnx_filenamer^  rw  rx  r  r   rG  r  r  r'  attr_to_extendrA  r9  r   rg  rh  ri  rj  rk  rl  graph_inputsrS   rV   r   rX   rY   rn  graph_outputsro  rp  	new_graph	new_modelr  r)   r)   r*   convert_generation_model&  s  








	























	







	
r  r   r   rw  rx  bad_words_idsc                 C   s   | j rtj std| jtjkr|  t	| j rdnd}|
| td |
|}|
|}g }t| jD ]3}	t }
|j||| j| j| j| j| j||| j| j| j|r[|ndd| jpb| jd}	|t |
  q;|jd }dd	lm} |||S )
a  Test PyTorch performance of text generation.

    Args:
        args (argparse.Namespace): arguments parsed from command line
        model (Union[GPT2LMHeadModel, T5ForConditionalGeneration]): PyTorch model
        input_ids (torch.Tensor): input_ids
        attention_mask (torch.Tensor): Attention mask
        eos_token_id (int): EOS token ID
        pad_token_id (int): Padding token ID
        bad_words_ids (List[List[int]]): Words shall not be generated.

    Raises:
        RuntimeError: PyTorch with CUDA is not available for --use_gpu

    Returns:
        Dict[str, Any]: A dictionary with string with metric name, and value can be integer or string.
    z=Please install PyTorch with Cuda for testing gpu performance.zcuda:0cpuFNTr   r   rg  rh  ri  rR   ry  rw  rx  rj  rk  rl  r  return_dict_in_generateoutput_scoresr   get_latency_result)rc   torchcudais_availabler   r   r	   rr   halfdevicer  set_grad_enabledr   
total_runstimegeneraterg  rh  ri  rR   ry  rj  rk  rl  rP   rQ   r   r+  benchmark_helperr  )r|   r   r   r   rw  rx  r  r  torch_latency_startrw  r  r)   r)   r*   test_torch_performance
  sB   







r  c                 C   sp   t j| jt jd}t| jd D ]%}d}t| jd D ]}| | | |kr0|dkr0d|| |< q|d7 }qq|S )Nr   r   rZ   )r   onesr+  int32r   )r   rx  r   r   abs_posr6  r)   r)   r*   create_attention_mask\
  s   
r  F	sentences	is_greedyc           +      C   s  | j dksJ tj| j| jd}d|_|j|_tj| j| j|j	d}|du r*g d}||ddd	}|d
 }|d }d}|j
|dd}	dd |	D }	| jrStd|	 ng }	|j}
|
j	}|
j	}|
j}g }d}| jstd td |j||| j| j| j| j| j||| j| j| j|	r|	ndd| jp| jd}td
| td td|j | jrtd|j | jrtd|j t |jD ]\}}|j!|dd}|"| t| d|  qtd td |r|# $ %t&j't&j(| jgt&j'dt&j(| jgt&j'dt&j(| jgt&j)dd}nB|# $ %t&j't&j(| jgt&j'dt&j(| jgt&j'dt&j(| jgt&j'dt&j(| jgt&j'dt&j(| jgt&j)dt&j(| jgt&j)dd}| jrgt&j*|t&j'd}| jrc|	D ]}d||< q[||d< | j+rrt,|||d< |j-d }| j.rt/d  t&j*||ft&j'd}||d!< | j0rt1| j2j34 }td"| dd#l5m6} t/d$| d% |g}t |D ]\}}t7j89|d&t:| }||| qtd'| | j;rdS td( t<| j2| j=| j>}td) |?d|}g }t@| jAD ]}tBB }|?d|}|"tBB |  qdd*lCmD}  |j-d }| ||}!td+ |d }"td|" | jr7td|d,  | jrBtd|d-  |rm|"j-\}}#g }$t@|D ]}|j!|"| dd}|$"| td.| d/|  qPn5|"j-\}}%}#g }$t@|D ](}t@|%D ] }&|j!|"| |& dd}|$"| td.| d0|& d|  qqy|r|jE|| jd1}'tFG|"}(td td2 t|' t| td td3 t|( t|$ td ||$k})td4|)rd5nd6 |)|!d7< | jHrtI| ||||||	}*td8|* td9|! |!S ):a9  Test GPT-2 model

    Args:
        args (argparse.Namespace): arguments parsed from command line
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r9   re  left)r   rx  N)zThe product is releasedzI enjoy walking in the parkzTest best way to investptTreturn_tensorsr   r   r   walk in park)add_prefix_spacec                 S      g | ]}|gqS r)   r)   r   word_idr)   r)   r*   r   
  r  z"test_gpt_model.<locals>.<listcomp>r  2--------------------------------------------------CTest PyTorch model and beam search with huggingface transformers...r  !huggingface transformers outputs:rn  ro  rp  skip_special_tokens: 'Testing beam search with onnxruntime...r   rm  rf  r   rS   zYUse prefix vocab mask with all ones in ORT, but no corresponding setting for Torch model.rV   test_data_diroutput_test_datazSaving test_data to z/test_data_set_* ...test_data_set_
ORT inputszCreating ort session......zRun ort session......r  ORT outputs:rZ   r   batch z sequence: 
 sequence r`   Torch Sequences:ORT Sequences:Torch and ORT result is same	differentparityTorch LatencyORT)Jr   r   r  r   r   padding_side	eos_token	pad_tokenr   rw  encoderS   r   r   r^  r  rd   r  r  rg  rh  ri  rR   ry  rj  rk  rl  rP   rQ   rn  ro  rp  r   decoder   r  numpyastyper   r  arrayfloat32r  rW   r  r+  rV   r   rg   r   r   r   r  bert_test_datar  rn   ro   rm   rl   re   r   rc   ra   runr   r  r  r  r  r  r  
LongTensorrf   r  )+r|   r  r  	tokenizerr   r  r   r   	bad_wordsr  r^  rw  rx  r  torch_decoded_sequencesbeam_outputsr   sequencedecoded_sequencerS   bad_word_idrw  rV   r  r  
all_inputsdirr   resultlatencyr  r  r  r   rn  rg  ort_decoded_sequencesnum_sequencesr6  torch_sequencesort_sequencesis_sametorch_latency_outputr)   r)   r*   test_gpt_modelh
  sB  

















	
r  c           )      C   s(  | j dv sJ | jrtd dS tj| j| jd}d|_| j dkr,t	j| j| jd}n	t
j| j| jd}|du r=ddg}||d	d
d}|d }|d }d}||dd }dd |D }| jrhtd| ng }|j}	|	j}
|	j}|	j}td|
 d| d|  g }| jstd td |j||| j| j| j| j| j|
|| j| j| j|r|ndd
| jp| jd}td| td td|j | jrtd|j | jrtd|j  t!|jD ]\}}|j"|d
d}|#| t| d|  qtd td t$j%|t$j&d }| jr|D ]}d!||< q|' ( )t$j&t$j*| jgt$j&d t$j*| jgt$j&d t$j*| jgt$j&d t$j*| jgt$j&d t$j*| jgt$j+d t$j*| jgt$j+d d"}| jr]||d#< | j,rht-|||d< | j.rt/| j0j12 }td$| d!d%l3m4} |g}t!|D ]\}}t5j67|d&t8| }||| qtd'| t9| j0| j:| j;}g }t<| j=D ]}t>> }|?d|}|#t>> |  q|j@d! }d!d(lAmB} |||}td) |d! } td|  | jrtd|d*  | jrtd|d+  | j@\}}!}"g }#t<|D ](}t<|!D ] }$|j"| | |$ d
d}|##| td,| d-|$ d|  qq	| jsz|jC|| jd}%tDE| }&td td. t|% t| td td/ t|& t|# td ||#k}'td0|'rsd1nd2 |'|d3< | jFrtG| ||||
||}(td4|( td5| |S )6a=  Test T5 or MT5 model

    Args:
        args (argparse.Namespace): arguments parsed from command line
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r  zLSkipping parity test as prefix vocab mask is not implemented by Hugging FaceNre  r  r:   z4translate English to French: The product is releasedzsummarize: research continues to show that pets bring real health benefits to their owners. Having a dog around can lead to lower levels of stress for both adults and kids.r  Tr  r   r   r  r`   c                 S   r  r)   r)   r  r)   r)   r*   r   h  r  z!test_t5_model.<locals>.<listcomp>r  zeos_token_id:z, pad_token_id:z, vocab_size:r  r  r  r  rn  ro  rp  r  r  r  r   r   rf  rS   r  r  r  r  r  r  rZ   r   r  r  r  r  r  r  r  r  r  r  )Hr   rV   r   r   r   r  r   r   r  r   r   r  rS   r^  rw  rx  r  rd   r  r  rg  rh  ri  rR   ry  rj  rk  rl  rP   rQ   rn  ro  rp  r   r  r   r   r  r  r  r  r  r  r  rW   r  rg   r   r   r   r  r  r  rn   ro   rm   rl   r   rc   ra   r   r  r  r  r+  r  r  r  r  r  rf   r  ))r|   r  r  r   r  r   r   r  r  r^  rw  rx  r  r  r  r   r  r  rS   r  r  r  r  r  r   r  r  r  r  rw  r  r   rn  r  rg  r  r6  r  r  r  r  r)   r)   r*   test_t5_model;  s   













	
r  c                 C   sr  t | }t|j |jdv rB|jr tj|js td|j |j	r2tj|j	s2td|j	 |jr8|j	r>|j	rB|jsBtd|j
dkoK|jdk}|jdkr}|r}|jdkrv|jdk rvt|tj td	 |jd
kss|jss|jrudS nt|tj nt| td |jdv rt||d}nt|||d}|r|jrtd|j d|j d |S td|j  |S )a/  Main entry function

    Args:
        argv (Optional[List[str]], optional): _description_. Defaults to None.
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Raises:
        ValueError: Path does not exist: --encoder_decoder_init_onnx
        ValueError: Path does not exist: --decoder_onnx
        ValueError: --decoder_onnx and --encoder_decoder_init_onnx are not used together for T5

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r  z1Path does not exist: --encoder_decoder_init_onnx z$Path does not exist: --decoder_onnx zB--decoder_onnx shall use together with --encoder_decoder_init_onnxrZ   r9   r_   r]   zThe test for gpt2_sampling onnx model is limited to non-custom model with small top_p(e.g <=0.01) value. The result should be the same as gpt2 greedy search.g{Gz?Nzstart testing model...)r  )r  r  zOutput files: r5   z.datazOutput file: )r}   r
   rC   r   r   rn   ro   r  r   r   ri  rj  r{  r  r"   r1   r   r   r~  rY   r0   r  r  rK   r   )r2   r  r|   r  r  r)   r)   r*   r     sF   



r   __main__r&   )T)r  r  NN)r  )r   rZ   r`   )NFr  )`__doc__rh   loggingr   rn   r  enumr   pathlibr   typingr   r   r   r   r   r  r   r   r  r  r	   r
   fusion_utilsr   r   r   r   r*  r   transformersr   r   r   r   r   r   r   r   onnxruntimer   r   r   r   4onnxruntime.transformers.models.gpt2.convert_to_onnxr   r   0onnxruntime.transformers.models.gpt2.gpt2_helperr   2onnxruntime.transformers.models.t5.convert_to_onnxr   r   ,onnxruntime.transformers.models.t5.t5_helperr   r    	getLoggerr   r"   rl   	Namespacer}   r   r   boolr   r   r   r  r  r  rs   dictr8  rB  rE  rU  r[  rb  rq  r  r  r  r	  r  r1  r6  r[  rc  r/   r  Tensorr  r  r  r  r,   r)   r)   r)   r*   <module>   s  '(
	   -N!8LG
j
%
0
P=
 d' ;
  %   w



B" T $>
;
