o
    gd#                     @   s8  d dl Z d dlZd dlZd dlmZmZ d dlZd dlmZ	 d dl
Z
d dlmZmZmZmZ d dlmZ ddlmZ ddlmZ eeZG dd	 d	Zd
d Zedkre Zejr`eej ejZ ej!Z"ej#$e"r}e%de" d e&de" de
'e Z(ee(ej)ej*ej+dZ,e,-  e,j(.e"d dS dS )    N)ListTuple)
GraphProto
ModelProto	NodeProtoTensorProto)quantize_matmul_bnb4   )	ONNXModel)attribute_to_kwargc                   @   s   e Zd ZdZdZdZddededefdd	Ze	d
e
e deeef fddZdejdejfddZdede
e defddZde
e fddZdd ZdS )MatMulBnb4QuantizerzMPerform 4b quantization of constant MatMul weights using FP4 or NF4 data typer   r	   Nmodel
quant_type
block_sizec                 C   s@   |pg }|t jt jfv sJ t|| _|| _|| _t|| _d S N)	r   FP4NF4r
   r   r   r   setnodes_to_exclude)selfr   r   r   r    r   e/var/www/visachat/venv/lib/python3.10/site-packages/onnxruntime/quantization/matmul_bnb4_quantizer.py__init__&   s   
zMatMulBnb4Quantizer.__init__
graph_pathreturnc                 C   sL   t t|d ddD ]}|| }|jD ]}|j| kr"||f    S qq
dS )Nr	   )NN)rangeleninitializername)r   r   gidgraphtensorr   r   r   __get_initializer.   s   

z%MatMulBnb4Quantizer.__get_initializerfpweightc                 C   s   t |jdkrtd|  }|j\}}|| }| j}|| d | }|d d }tj|dd}	tj||jd}
t	|	||
|| j
|| |	|
fS )z4b quantize fp32/fp16 weight   z9Current bnb4 block quantization only supports 2D tensors!r	   uint8)dtype)r   shape
ValueError	transposecopyr   npzerosr'   r   r   )r   r$   
fpweight_trowscolsnumelr   
num_blocksquantized_numelpackedabsmaxr   r   r   bnb4_block_quant7   s   
z$MatMulBnb4Quantizer.bnb4_block_quantnodegraph_stackc                 C   s  |j dkr|S td|j d |j| jv r#td|j d |S |jd }t||\}}|du r;td |S tj	
|}t|jd	krOtd
 |S | |\}}tj	|}	|jd |	_|jD ]}
|
j|krt|j|
  nqetj	|}|jd |_|j|	|g i }|j\}}||d< ||d< | j|d< | j|d< tjj	d|jd |	j|jg|jd g|jr|jd nddd|}td|j d |S )zdIf the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new nodeMatMulzstart to quantize z ...zexclude to quantize z$ as specified by nodes_to_exclude...r	   Nz2MatMul doesn't have const weight. Skip to quantizer%   z)MatMul weight is not 2D. Skip to quantize_Bnb4_absmaxKNr   r   
MatMulBnb4r    com.microsoft)inputsoutputsr   domainzcomplete quantization of )r>   )op_typeloggerdebugr   r   inputr   %_MatMulBnb4Quantizer__get_initializeronnxnumpy_helperto_arrayr   r(   r6   
from_arrayremover   extendr   r   helper	make_nodeoutput)r   r7   r8   inputBBBs_graphB_arrayr4   r5   B_quantrG   absmax_tensorkwargsr/   r0   matmul_bnb4_noder   r   r   _bnb4_matmul_node_weightM   sX   









	z,MatMulBnb4Quantizer._bnb4_matmul_node_weightc                 C   s  g }|d }|j D ]s}dd |jD }t|rsi }|jD ]D}|jtjjkr4||j |j	| 
|i}n'|jtjjkrWg }	|jD ]}
||
 |	| 
|g q@|j	|	i}nt|}|| qtjj|j|j|jfd|j	i|}|| || q	|d |j | |  |S )Nr   c                 S   s,   g | ]}|j tjjks|j tjjkr|qS r   )typerI   AttributeProtoGRAPHGRAPHS).0attrr   r   r   
<listcomp>   s
    z9MatMulBnb4Quantizer._process_subgraph.<locals>.<listcomp>r   r7   )r7   	attributer   r[   rI   r\   r]   appendgr   _process_subgraphr^   graphsrN   r   updaterO   rP   rD   rG   rQ   rZ   
ClearFieldpop)r   r8   	new_nodesr!   r7   graph_attrsrX   r`   kvvaluesubgraphr   r   r   re      s@   




z%MatMulBnb4Quantizer._process_subgraphc                 C   sd   | j  g}| j  }d}|D ]	}|jdkrd}q|s&|tjddg | | | j 	  d S )NFr@   Tr	   )
r   r!   opset_importrC   rN   rI   rO   make_opsetidre   clean_initializers)r   r8   ro   has_ms_domainopsetr   r   r   process   s   


zMatMulBnb4Quantizer.processr   )__name__
__module____qualname____doc__r   r   r   intr   staticmethodr   r   r   r   rH   npt	ArrayLiker,   ndarrayr6   r   rZ   re   rt   r   r   r   r   r      s     7&r   c                  C   s   t jdd} | jdddd | jdddd | jd	d
dtjtjgdd | jdd
ddd | jddd
dd | jd
d | jddtd
g dd |  S )Na  Blockwise FP4/NF4 quantization for MatMul 2D weight matrices.

A weight matrix is partitioned into blocks, where each block is a contiguous
subset inside the flattened transposed weight matrix. Each block is quantized
into a set of 4b integers with an absolute value scaling factor.
)descriptionz--input_modelTzPath to the input model file)requiredhelpz--output_modelzPath to the output model filez--quant_typeFr	   z&Quantization data type. 0: FP4, 1: NF4)r   defaultchoicesr   z--block_size@   zVBlock size for blockwise quantization. Note: bnb.nn.Linear4bit only uses block_size=64)r   r   r   z-vz	--verbose
store_true)r   action)verbosez--nodes_to_exclude+zBSpecify the nodes to be excluded from quantization with node names)nargsr[   r   r   r   )	argparseArgumentParseradd_argumentr   r   r   set_defaultsstr
parse_args)parserr   r   r   r      s:   	
	r   __main__zfile z already exists)r   T)/r   loggingostypingr   r   numpyr,   numpy.typingr{   rI   onnx.onnx_pbr   r   r   r   onnxruntime.capi._pybind_stater   
onnx_modelr
   quant_utilsr   	getLoggerru   rE   r   r   argsr   setLevelDEBUGinput_modelinput_model_pathoutput_modeloutput_model_pathpathexistserror	Exceptionloadr   r   r   r   quantrt   save_model_to_filer   r   r   r   <module>   s:   
 "'
