Using ONNX Model Inference and Saving Input Data in NPY Format#

This topic explains how to perform inference with an ONNX model using floating-point inputs and save the input data in .npy format. Saving the inputs makes them easy to store and reuse, and the saved data can serve as a calibration dataset during model quantization, provided that it adequately reflects the typical distribution of the model's inputs.

The example below demonstrates how to define a simple dataset class (InputDataset), run inference with an ONNX model, and save the input data in .npy format for use in subsequent model quantization.

Detailed Code#

import onnxruntime as ort
import numpy as np
import os
from torch.utils.data import Dataset, DataLoader


# A simple dataset with two inputs (`input1`, `input2`) and random tensors.
# Users can customize data generation to match their model's needs.
class InputDataset(Dataset):
    def __init__(self, num_samples):
        super().__init__()
        self.num_samples = num_samples
        self.input1 = [np.random.rand(3, 224, 224).astype(np.float32) for _ in range(num_samples)]
        self.input2 = [np.random.rand(10).astype(np.float32) for _ in range(num_samples)]
        self.labels = [np.random.randint(0, 2) for _ in range(num_samples)]

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return {
            "input1": self.input1[idx],
            "input2": self.input2[idx],
            "label": self.labels[idx]
        }


dataset = InputDataset(num_samples=10)
# With `batch_size=1`, each saved `.npy` file holds a single sample with a leading batch dimension of 1.
data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=4, pin_memory=True)

# Create an ONNX Runtime inference session and query the model's input and output names.
onnx_model_path = "path/to/your/float_model.onnx"
session = ort.InferenceSession(onnx_model_path)
input_names = [inp.name for inp in session.get_inputs()]
output_names = [out.name for out in session.get_outputs()]

# Set `enable_data_caching` to True to save each batch's inputs as `.npy` files for later calibration.
enable_data_caching = True
calibration_cache_dir = "calibration_data/"

if enable_data_caching:
    # Create one subdirectory per model input to hold the cached `.npy` files.
    for name in input_names:
        input_folder_path = os.path.join(calibration_cache_dir, name)
        os.makedirs(input_folder_path, exist_ok=True)

for batch_idx, batch in enumerate(data_loader):
    input_feed = {}

    for name in input_names:
        input_data = batch[name].numpy()
        input_feed[name] = input_data

        # If `enable_data_caching` is True, save input data as `.npy` files by input name for each batch.
        if enable_data_caching:
            file_path = os.path.join(calibration_cache_dir, name, f"calib_{batch_idx+1:06d}.npy")
            np.save(file_path, input_data)
            print(f"Saved input data for {name} to {file_path}")

    outputs = session.run(output_names, input_feed)

    # Example post-processing: assumes the first model output is a 2-D logits tensor.
    predictions = np.argmax(outputs[0], axis=1)

    print(f"Predictions for batch {batch_idx}: {predictions}")
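
After the loop finishes, it is worth sanity-checking the cached files before using them for calibration. The snippet below is a minimal sketch that loads the first saved file for each input and compares its shape and dtype against the model's input metadata; it assumes the directory layout and variable names used in the example above.

# Sanity check (only meaningful when `enable_data_caching` is True): load the first cached
# file for each input and compare it with the model's input metadata.
if enable_data_caching:
    for inp in session.get_inputs():
        sample_path = os.path.join(calibration_cache_dir, inp.name, "calib_000001.npy")
        sample = np.load(sample_path)
        print(f"{inp.name}: saved shape {sample.shape}, dtype {sample.dtype}; "
              f"model expects shape {inp.shape}, type {inp.type}")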

The input data saved during ONNX inference can serve as a calibration dataset for model quantization. For instructions on how to use the saved NPY data as a calibration dataset, refer to Calibration Data Path for AMD Quark Quantizer. Note that the example above always creates a subdirectory per input name; for a model with a single input, the files can also be saved directly under the calibration directory, as in the single-input layout below. The saved data is organized on disk as follows:

For Single-Input Models#

calibration_data/
  calib_000001.npy
  calib_000002.npy
  calib_000003.npy
  calib_000004.npy
  calib_000005.npy
  ...

For Multi-Input Models#

calibration_data/
  input1_name/
    calib_000001.npy
    calib_000002.npy
    calib_000003.npy
    calib_000004.npy
    calib_000005.npy
    ...
  input2_name/
    calib_000001.npy
    calib_000002.npy
    calib_000003.npy
    calib_000004.npy
    calib_000005.npy
    ...
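
If you need to consume the cached data outside the quantizer, for example to replay inference from the saved files, you can rebuild the input feed from the per-input folders. The following is a minimal sketch that assumes the multi-input layout shown above, the variable names from the example, and that every input folder contains the same number of files saved in the same order; the AMD Quark quantizer itself only needs the calibration data path, as described in Calibration Data Path for AMD Quark Quantizer.

import glob

# Rebuild input feeds from the cached `.npy` files (multi-input layout shown above) and replay inference.
num_batches = len(glob.glob(os.path.join(calibration_cache_dir, input_names[0], "*.npy")))

for batch_idx in range(1, num_batches + 1):
    input_feed = {
        name: np.load(os.path.join(calibration_cache_dir, name, f"calib_{batch_idx:06d}.npy"))
        for name in input_names
    }
    outputs = session.run(output_names, input_feed)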