diff --git a/QEfficient/base/onnx_transforms.py b/QEfficient/base/onnx_transforms.py index 61b5c00f6..ecab8ad57 100644 --- a/QEfficient/base/onnx_transforms.py +++ b/QEfficient/base/onnx_transforms.py @@ -5,11 +5,17 @@ # # ---------------------------------------------------------------------------- +import gc +import logging from typing import Optional, Tuple import numpy as np from onnx import ModelProto, external_data_helper, numpy_helper +from QEfficient.utils.constants import ONNX_TRANSFROM_MEMORY_CLEANUP_INTERVAL + +logger = logging.getLogger(__name__) + class OnnxTransform: """ @@ -31,6 +37,27 @@ def apply(cls, model: ModelProto, **kwargs) -> Tuple[ModelProto, bool]: """ raise NotImplementedError("Use subclasses for ONNX transform") + @classmethod + def _check_external_data_loaded(cls, model: ModelProto) -> bool: + """ + Check if external data is already loaded in the model. + + :param model: The ONNX model to check + :returns: True if external data is already loaded, False otherwise + """ + for tensor in external_data_helper._get_all_tensors(model): + # Check if tensor has external data but no raw data loaded + if len(tensor.external_data) > 0 and not tensor.HasField("raw_data"): + return False + return True + + @classmethod + def _cleanup_memory(cls): + """ + Force garbage collection to free up memory after tensor processing. + """ + gc.collect() + class FP16ClipTransform(OnnxTransform): """ @@ -47,6 +74,7 @@ def apply(cls, model: ModelProto, *, onnx_base_dir: Optional[str] = None, **kwar fp16_min = finfo.min transformed = False + processed_count = 0 for tensor in external_data_helper._get_all_tensors(model): nptensor = numpy_helper.to_array(tensor, onnx_base_dir) if nptensor.dtype == np.float32 and (np.any(nptensor > fp16_max) or np.any(nptensor < fp16_min)): @@ -61,6 +89,15 @@ def apply(cls, model: ModelProto, *, onnx_base_dir: Optional[str] = None, **kwar tensor.CopyFrom(new_tensor) transformed = True + del neg_inf_mask, clipped_tensor, new_tensor + + del nptensor + processed_count += 1 + + if processed_count % ONNX_TRANSFROM_MEMORY_CLEANUP_INTERVAL == 0: + cls._cleanup_memory() + + cls._cleanup_memory() return model, transformed @@ -89,7 +126,16 @@ def apply( file_num = 0 current_file_size = 0 transformed = False - external_data_helper.load_external_data_for_model(model, onnx_base_dir) + + # Check if external data is already loaded to avoid redundant loading + external_data_already_loaded = cls._check_external_data_loaded(model) + + if not external_data_already_loaded: + external_data_helper.load_external_data_for_model(model, onnx_base_dir) + else: + logger.info("External data already loaded, skipping redundant load operation") + + processed_count = 0 for tensor in external_data_helper._get_all_tensors(model): if tensor.HasField("raw_data") and ((tsize := len(tensor.raw_data)) > size_threshold): transformed = True @@ -98,4 +144,11 @@ def apply( file_num += 1 current_file_size = tsize external_data_helper.set_external_data(tensor, f"{model_name}_{file_num}.onnx.data") + + processed_count += 1 + if processed_count % ONNX_TRANSFROM_MEMORY_CLEANUP_INTERVAL == 0: + cls._cleanup_memory() + + cls._cleanup_memory() + return model, transformed diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 50f36ea32..44426920a 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -70,6 +70,8 @@ def get_models_dir(): ONNX_EXPORT_EXAMPLE_MIN_PS = 0.99 ONNX_EXPORT_OPSET = 13 +ONNX_TRANSFROM_MEMORY_CLEANUP_INTERVAL = 100 + COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"] # InternVL constants