
Model Deployment

Learn how to deploy custom AI models in DeepStream for inference on video streams.

Overview

DeepStream supports models from various frameworks:

  • TensorRT: Native format (best performance)
  • ONNX: Open Neural Network Exchange
  • TensorFlow: TF models and TF-TRT
  • PyTorch: Via ONNX export
  • Caffe: Legacy support
  • Custom: Custom model parsers

Model Deployment Workflow

Train Model → Export → Convert to TensorRT → Configure DeepStream → Deploy

Step 1: Train Your Model

Example with PyTorch:

import torch
import torch.nn as nn

class SimpleDetector(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        # Backbone: pretrained ResNet-50 from torchvision
        self.backbone = torch.hub.load('pytorch/vision:v0.10.0',
                                       'resnet50',
                                       pretrained=True)
        # Detection head: [x, y, w, h, conf] per class
        self.detector_head = nn.Linear(1000, num_classes * 5)

    def forward(self, x):
        features = self.backbone(x)
        detections = self.detector_head(features)
        return detections

# Train model
model = SimpleDetector(num_classes=80)
# ... training code ...

# Save trained weights
torch.save(model.state_dict(), 'detector.pth')
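
A quick forward-pass check confirms the head's output size before training starts; a minimal sketch continuing from the class defined above:

# Shape check: ResNet-50 emits 1000 features, the head maps them to num_classes * 5 values
model = SimpleDetector(num_classes=80)
out = model(torch.randn(2, 3, 416, 416))
print(out.shape)  # torch.Size([2, 400])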

Step 2: Export to ONNX

import torch

# Load trained model
model = SimpleDetector(num_classes=80)
model.load_state_dict(torch.load('detector.pth'))
model.eval()

# Prepare dummy input matching the expected input shape
batch_size = 1
dummy_input = torch.randn(batch_size, 3, 416, 416)

# Export to ONNX
torch.onnx.export(
    model,
    dummy_input,
    'detector.onnx',
    export_params=True,
    opset_version=11,
    do_constant_folding=True,
    input_names=['input'],
    output_names=['output'],
    dynamic_axes={
        'input': {0: 'batch_size'},
        'output': {0: 'batch_size'}
    }
)

print("Model exported to detector.onnx")

Step 3: Convert to TensorRT

Method 1: Automatic Conversion by DeepStream

DeepStream can automatically convert the ONNX model to a TensorRT engine:

config_infer_custom.txt
[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
onnx-file=detector.onnx
model-engine-file=detector_b1_gpu0_fp16.engine
labelfile-path=labels.txt
batch-size=1
# network-mode: 0=FP32, 1=INT8, 2=FP16
network-mode=2
num-detected-classes=80
interval=0
gie-unique-id=1
# network-type: 0=Detector, 1=Classifier, 2=Segmentation, 100=Other
network-type=0
cluster-mode=2

On first run, DeepStream will generate the TensorRT engine file.

Method 2: Manual TensorRT Conversion

import tensorrt as trt

def build_engine(onnx_file_path, engine_file_path):
    """Convert an ONNX model to a serialized TensorRT engine."""
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, TRT_LOGGER)

    # Parse ONNX
    with open(onnx_file_path, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None

    # Build configuration
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1 GB

    # Enable FP16 if supported
    if builder.platform_has_fast_fp16:
        config.set_flag(trt.BuilderFlag.FP16)

    # Build and serialize
    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        print('ERROR: Engine build failed.')
        return None

    with open(engine_file_path, 'wb') as f:
        f.write(serialized_engine)

    print(f"Engine saved to {engine_file_path}")
    return engine_file_path

# Convert
build_engine('detector.onnx', 'detector_b1_gpu0_fp16.engine')
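
To confirm the serialized engine loads cleanly before wiring it into DeepStream, it can be deserialized with the TensorRT runtime; a minimal check:

import tensorrt as trt

runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
with open('detector_b1_gpu0_fp16.engine', 'rb') as f:
    engine = runtime.deserialize_cuda_engine(f.read())

assert engine is not None, "Engine failed to deserialize"
print("Engine deserialized successfully")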

Deploying Popular Models

The same export-then-configure workflow applies to common off-the-shelf models.

YOLOv5

# Clone YOLOv5
git clone https://github.com/ultralytics/yolov5
cd yolov5

# Install requirements
pip install -r requirements.txt

# Export to ONNX
python export.py --weights yolov5s.pt --include onnx --simplify

DeepStream config for YOLOv5 (YOLO-style outputs typically also require a custom bounding-box parser, supplied via custom-lib-path and parse-bbox-func-name as described under Custom Post-Processing below):

config_infer_yolov5.txt
[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
onnx-file=yolov5s.onnx
model-engine-file=yolov5s_b1_gpu0_fp16.engine
labelfile-path=coco_labels.txt
batch-size=1
network-mode=2
num-detected-classes=80
interval=0
gie-unique-id=1
network-type=0
cluster-mode=2
maintain-aspect-ratio=1
symmetric-padding=1

[class-attrs-all]
nms-iou-threshold=0.45
pre-cluster-threshold=0.25
topk=300

YOLOv8

# Install Ultralytics
pip install ultralytics

Then export to ONNX from Python:

from ultralytics import YOLO

model = YOLO('yolov8n.pt')
model.export(format='onnx', simplify=True, dynamic=False)

Config for YOLOv8:

config_infer_yolov8.txt
[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
onnx-file=yolov8n.onnx
model-engine-file=yolov8n_b1_gpu0_fp16.engine
labelfile-path=coco_labels.txt
batch-size=1
network-mode=2
num-detected-classes=80
interval=0
gie-unique-id=1
network-type=0
# cluster-mode: 0=GroupRectangles, 1=DBSCAN, 2=NMS, 3=DBSCAN+NMS, 4=no clustering
cluster-mode=4
maintain-aspect-ratio=1

[class-attrs-all]
nms-iou-threshold=0.45
pre-cluster-threshold=0.25
topk=300

EfficientDet

# Export EfficientDet to ONNX
import torch
from effdet import create_model

model = create_model('efficientdet_d0',
                     pretrained=True,
                     bench_task='predict')
model.eval()

dummy_input = torch.randn(1, 3, 512, 512)
torch.onnx.export(model, dummy_input, 'efficientdet_d0.onnx',
                  input_names=['input'],
                  output_names=['boxes', 'scores', 'classes'],
                  dynamic_axes={'input': {0: 'batch'}})

Config:

config_infer_efficientdet.txt
[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
onnx-file=efficientdet_d0.onnx
model-engine-file=efficientdet_d0_b1_gpu0_fp16.engine
labelfile-path=coco_labels.txt
batch-size=1
network-mode=2
num-detected-classes=90
interval=0
gie-unique-id=1
network-type=0
input-dims=3;512;512;0

Semantic Segmentation (DeepLabV3)

import torch
import torchvision

model = torchvision.models.segmentation.deeplabv3_resnet50(pretrained=True)
model.eval()

dummy_input = torch.randn(1, 3, 512, 512)
torch.onnx.export(model, dummy_input, 'deeplabv3.onnx',
                  input_names=['input'],
                  output_names=['output'])
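
torchvision's segmentation models return a dict of tensors with the main prediction under the 'out' key, which can complicate the exported graph. If the export or downstream parsing misbehaves, a thin wrapper that exposes a single tensor output is a common workaround; a sketch:

import torch
import torchvision

class DeepLabV3Wrapper(torch.nn.Module):
    """Return only the 'out' tensor so the ONNX graph has a single output."""
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        return self.model(x)['out']

wrapped = DeepLabV3Wrapper(
    torchvision.models.segmentation.deeplabv3_resnet50(pretrained=True)
).eval()

torch.onnx.export(wrapped, torch.randn(1, 3, 512, 512), 'deeplabv3.onnx',
                  input_names=['input'], output_names=['output'])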

Config:

config_infer_segmentation.txt
[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
onnx-file=deeplabv3.onnx
model-engine-file=deeplabv3_b1_gpu0_fp16.engine
batch-size=1
network-mode=2
# network-type 2 = Segmentation
network-type=2
interval=0
gie-unique-id=1
segmentation-threshold=0.5
input-dims=3;512;512;0
output-blob-names=output

Custom Post-Processing

For models with custom output formats, nvinfer needs a custom bounding-box parser. In practice the parser is a C/C++ function implementing the NvDsInferParseCustomFunc interface and compiled into a shared library; the Python below only illustrates the parsing logic such a function performs:

def custom_parse_bbox(network_info, detection_params, layer_info):
    """Illustrative parsing logic for a flat [num_boxes, 6] output:
    [x1, y1, x2, y2, confidence, class_id] per box."""
    # Get the (flattened) output layer buffer
    output_layer = layer_info[0]
    num_boxes = len(output_layer) // 6

    detections = []
    for i in range(num_boxes):
        x1 = output_layer[i * 6 + 0]
        y1 = output_layer[i * 6 + 1]
        x2 = output_layer[i * 6 + 2]
        y2 = output_layer[i * 6 + 3]
        confidence = output_layer[i * 6 + 4]
        class_id = int(output_layer[i * 6 + 5])

        if confidence > detection_params.threshold:
            detections.append({
                'left': x1,
                'top': y1,
                'width': x2 - x1,
                'height': y2 - y1,
                'confidence': confidence,
                'classId': class_id
            })

    return detections

The compiled parser library is then referenced in the nvinfer config file:

[property]
custom-lib-path=/path/to/libcustom_parser.so
parse-bbox-func-name=custom_bbox_parser

Secondary Models (Cascaded Inference)

Use secondary models for additional classification:

Primary Detector + Secondary Classifier

config_primary.txt
[property]
gpu-id=0
onnx-file=vehicle_detector.onnx
model-engine-file=vehicle_detector.engine
# 4 classes: car, truck, bus, motorcycle
num-detected-classes=4
gie-unique-id=1
network-type=0
# ... other properties ...

config_secondary_color.txt
[property]
gpu-id=0
onnx-file=vehicle_color_classifier.onnx
model-engine-file=vehicle_color_classifier.engine
# 10 color classes
num-detected-classes=10
gie-unique-id=2
# network-type 1 = Classifier
network-type=1
# process-mode 2 = operate on objects detected by another GIE
process-mode=2
# Operate on the primary detector (gie-unique-id=1)
operate-on-gie-id=1
# Only classify cars (0) and trucks (1)
operate-on-class-ids=0;1
classifier-threshold=0.5
# ... other properties ...

Python pipeline with secondary inference:

# Create primary and secondary inference elements
pgie = Gst.ElementFactory.make("nvinfer", "primary-inference")
pgie.set_property('config-file-path', 'config_primary.txt')

sgie1 = Gst.ElementFactory.make("nvinfer", "secondary-inference-1")
sgie1.set_property('config-file-path', 'config_secondary_color.txt')

sgie2 = Gst.ElementFactory.make("nvinfer", "secondary-inference-2")
sgie2.set_property('config-file-path', 'config_secondary_make.txt')

# Link: streammux → pgie → sgie1 → sgie2 → tracker → ...
streammux.link(pgie)
pgie.link(sgie1)
sgie1.link(sgie2)
sgie2.link(tracker)
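
To consume the secondary classifier's results downstream, attach a pad probe and walk the classifier metadata attached to each object. A minimal sketch using the standard pyds metadata API; attaching to the tracker's src pad is just one reasonable placement:

import pyds
from gi.repository import Gst

def sgie_probe(pad, info, user_data):
    """Print classifier labels attached to detected objects."""
    buf = info.get_buffer()
    if not buf:
        return Gst.PadProbeReturn.OK

    batch_meta = pyds.gst_buffer_get_nvds_batch_meta(hash(buf))
    l_frame = batch_meta.frame_meta_list
    while l_frame is not None:
        frame_meta = pyds.NvDsFrameMeta.cast(l_frame.data)
        l_obj = frame_meta.obj_meta_list
        while l_obj is not None:
            obj_meta = pyds.NvDsObjectMeta.cast(l_obj.data)
            l_cls = obj_meta.classifier_meta_list
            while l_cls is not None:
                cls_meta = pyds.NvDsClassifierMeta.cast(l_cls.data)
                l_label = cls_meta.label_info_list
                while l_label is not None:
                    label_info = pyds.NvDsLabelInfo.cast(l_label.data)
                    print(f"object {obj_meta.object_id}: {label_info.result_label}")
                    l_label = l_label.next
                l_cls = l_cls.next
            l_obj = l_obj.next
        l_frame = l_frame.next
    return Gst.PadProbeReturn.OK

# Attach the probe, e.g. to the tracker's src pad
tracker.get_static_pad("src").add_probe(Gst.PadProbeType.BUFFER, sgie_probe, None)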

INT8 Calibration for Inference

INT8 precision can significantly improve performance. nvinfer builds an INT8 engine when it is given a pre-generated calibration table:

config_infer_int8.txt
[property]
gpu-id=0
onnx-file=model.onnx
model-engine-file=model_b1_gpu0_int8.engine
# network-mode 1 = INT8
network-mode=1
int8-calib-file=calibration.table
batch-size=1

The calibration table referenced by int8-calib-file is not generated by nvinfer; it has to be produced beforehand, typically by running a representative set of images through TensorRT's INT8 calibrator API (for example IInt8EntropyCalibrator2) during an offline engine build, or with tooling such as the NVIDIA TAO Toolkit, which can export a calibration cache alongside the model. Once the table exists, nvinfer builds the INT8 engine on first run and reuses it on subsequent runs.
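
As one way to produce such a table, the sketch below subclasses TensorRT's IInt8EntropyCalibrator2 and feeds preprocessed image batches during an offline INT8 build. The batch preparation, file name, and pycuda dependency are assumptions for illustration:

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
import pycuda.driver as cuda
import tensorrt as trt

class EntropyCalibrator(trt.IInt8EntropyCalibrator2):
    """Feeds preprocessed (N, 3, H, W) float32 batches to the TensorRT builder."""

    def __init__(self, batches, cache_file='calibration.table'):
        super().__init__()
        self.batches = list(batches)
        self.cache_file = cache_file
        self.index = 0
        self.device_input = cuda.mem_alloc(self.batches[0].nbytes)

    def get_batch_size(self):
        return self.batches[0].shape[0]

    def get_batch(self, names):
        if self.index >= len(self.batches):
            return None  # no more calibration data
        cuda.memcpy_htod(self.device_input,
                         np.ascontiguousarray(self.batches[self.index]))
        self.index += 1
        return [int(self.device_input)]

    def read_calibration_cache(self):
        try:
            with open(self.cache_file, 'rb') as f:
                return f.read()
        except FileNotFoundError:
            return None

    def write_calibration_cache(self, cache):
        # The cache written here is the table that int8-calib-file points to
        with open(self.cache_file, 'wb') as f:
            f.write(cache)

# During the manual engine build (see Method 2 above), enable INT8 and attach it:
#   config.set_flag(trt.BuilderFlag.INT8)
#   config.int8_calibrator = EntropyCalibrator(preprocessed_batches)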

Model Performance Optimization

Dynamic Batching

Enable dynamic batching for better throughput:

streammux.set_property('batch-size', 8)  # Max batch size
streammux.set_property('batched-push-timeout', 40000) # 40ms timeout

Multiple Model Instances

To run several copies of the same model in parallel, either add multiple nvinfer elements that share the same model and config (each with its own gie-unique-id) on separate branches of the pipeline, or use gst-nvinferserver with a Triton instance group so Triton manages parallel model instances for you.

Profile Inference

To identify bottlenecks with deepstream-app, enable its built-in FPS measurement in the application config:

[application]
enable-perf-measurement=1
perf-measurement-interval-sec=5

For per-layer timing of the engine itself, profile it directly with trtexec (for example, trtexec --loadEngine=model.engine --dumpProfile).

Monitor with:

# Run with performance measurement
deepstream-app -c config.txt

# Use nvidia-smi to monitor GPU usage
watch -n 1 nvidia-smi

# Use tegrastats on Jetson
tegrastats

Testing Your Model

Create a test script:

test_model.py
#!/usr/bin/env python3

import sys
sys.path.append('/opt/nvidia/deepstream/deepstream/lib')

import gi
gi.require_version('Gst', '1.0')
from gi.repository import GLib, Gst
import pyds


def test_model(video_file, config_file):
    """Test a custom model against an H.264 MP4 file."""
    Gst.init(None)

    # Build the pipeline (qtdemux extracts the H.264 stream from the MP4 container)
    pipeline = Gst.parse_launch(f"""
        filesrc location={video_file} !
        qtdemux !
        h264parse !
        nvv4l2decoder !
        m.sink_0 nvstreammux name=m width=1920 height=1080 batch-size=1 !
        nvinfer config-file-path={config_file} !
        nvvideoconvert !
        nvdsosd !
        nveglglessink sync=0
    """)

    # Start playback
    pipeline.set_state(Gst.State.PLAYING)

    # Run the main loop until EOS or error
    loop = GLib.MainLoop()
    bus = pipeline.get_bus()
    bus.add_signal_watch()
    bus.connect("message", bus_call, loop)

    try:
        loop.run()
    except KeyboardInterrupt:
        pass

    pipeline.set_state(Gst.State.NULL)


def bus_call(bus, message, loop):
    t = message.type
    if t == Gst.MessageType.EOS or t == Gst.MessageType.ERROR:
        loop.quit()
    return True


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Usage: python3 test_model.py <video> <config>")
        sys.exit(1)

    test_model(sys.argv[1], sys.argv[2])

Run test:

python3 test_model.py sample.mp4 config_infer_custom.txt

Troubleshooting

Issue: Engine generation fails

# Check ONNX model validity
python3 -c "import onnx; onnx.checker.check_model(onnx.load('model.onnx'))"

# Rebuild with verbose TensorRT logging to see where it fails
trtexec --onnx=model.onnx --verbose

Issue: Poor inference performance

  • Use FP16 instead of FP32
  • Enable INT8 calibration
  • Increase batch size
  • Use dynamic batching
  • Profile with trtexec

Issue: Incorrect detections

  • Verify input preprocessing (scale factor, mean normalization); see the sketch after this list
  • Check output parsing logic
  • Validate NMS thresholds
  • Test model independently with trtexec
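
nvinfer preprocesses each pixel as net-scale-factor * (x - offset) per channel, with offsets taken from the config's offsets key. A quick check that the config values reproduce the training-time normalization; the example values below are assumptions for illustration:

import numpy as np

# Values from the nvinfer config (assumed here for illustration)
net_scale_factor = 0.0039215697906911373   # ~ 1/255
offsets = np.array([0.0, 0.0, 0.0])        # per-channel mean subtraction

def nvinfer_preprocess(frame):
    """Replicate nvinfer's y = net-scale-factor * (x - offsets) per channel."""
    return net_scale_factor * (frame.astype(np.float32) - offsets)

# Compare against the training pipeline's normalization on a dummy frame
frame = np.random.randint(0, 256, size=(416, 416, 3), dtype=np.uint8)
training_norm = frame.astype(np.float32) / 255.0
assert np.allclose(nvinfer_preprocess(frame), training_norm, atol=1e-5)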
