
Detailed Deployment Example

This comprehensive guide demonstrates deploying a production-ready machine learning model using BentoML, from training to serving.

Project Overview

We'll build a complete sentiment analysis service that:

  • Classifies text sentiment (positive or negative)
  • Uses a pre-trained transformer model
  • Includes preprocessing and postprocessing
  • Has proper error handling and validation
  • Is production-ready with monitoring

Prerequisites

pip install bentoml transformers torch datasets scikit-learn accelerate

Part 1: Model Training and Preparation

Step 1.1: Train the Model

# train_sentiment.py
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import bentoml

def train_model():
    # Load dataset
    print("Loading dataset...")
    dataset = load_dataset("imdb", split="train")
    # Shuffle before subsetting: the IMDB train split is ordered by label,
    # so an unshuffled slice would contain only negative reviews
    dataset = dataset.shuffle(seed=42).select(range(5000))  # Use subset for demo

    # Load pre-trained model and tokenizer
    print("Loading model and tokenizer...")
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2
    )

    # Tokenize dataset
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512
        )

    print("Tokenizing dataset...")
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,  # Increase for better accuracy
        per_device_train_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
    )

    # Train
    print("Training model...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )
    trainer.train()

    # Save model with BentoML
    print("Saving model to BentoML...")
    saved_model = bentoml.transformers.save_model(
        "sentiment_model",
        model,
        custom_objects={
            "tokenizer": tokenizer,
        },
        labels={
            "framework": "transformers",
            "task": "sentiment-analysis",
            "model_type": "distilbert"
        },
        metadata={
            "model_name": model_name,
            "num_labels": 2,
            "max_length": 512,
            "training_samples": 5000
        }
    )

    print(f"Model saved: {saved_model.tag}")
    return saved_model

if __name__ == "__main__":
    train_model()

Run training:

python train_sentiment.py

Step 1.2: Verify Model

bentoml models list
bentoml models get sentiment_model:latest
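
Beyond the CLI checks, you can sanity-check the saved artifacts from Python before writing the service. A minimal sketch (verify_model.py is a hypothetical helper; the tag and custom-object key match what train_sentiment.py saved):

# verify_model.py (hypothetical helper script)
import torch
import bentoml

# Load the model and its bundled tokenizer from the local model store
model_ref = bentoml.models.get("sentiment_model:latest")
model = bentoml.transformers.load_model(model_ref)
tokenizer = model_ref.custom_objects["tokenizer"]

# Run a single prediction to confirm both artifacts load together
inputs = tokenizer("A quick smoke test sentence.", return_tensors="pt", truncation=True)
with torch.no_grad():
    probs = torch.softmax(model(**inputs).logits, dim=-1)[0]
print(f"negative={probs[0]:.3f}, positive={probs[1]:.3f}")
print("Metadata:", model_ref.info.metadata)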

Part 2: Service Implementation

Step 2.1: Create Service with Advanced Features

# service.py
import bentoml
from pydantic import BaseModel, Field, field_validator
from typing import List
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Input validation models (pydantic v2 style, matching requirements.txt)
class TextInput(BaseModel):
    text: str = Field(..., min_length=1, max_length=5000)

    @field_validator('text')
    @classmethod
    def validate_text(cls, v):
        if not v.strip():
            raise ValueError('Text cannot be empty')
        return v.strip()

class BatchTextInput(BaseModel):
    texts: List[str] = Field(..., min_length=1, max_length=100)

    @field_validator('texts')
    @classmethod
    def validate_texts(cls, v):
        cleaned = [text.strip() for text in v if text.strip()]
        if not cleaned:
            raise ValueError('At least one non-empty text is required')
        return cleaned

# Output models
class SentimentResult(BaseModel):
    text: str
    sentiment: str
    confidence: float
    label_scores: dict

@bentoml.service(
    resources={
        "cpu": "4",
        "memory": "4Gi",
    },
    traffic={
        "timeout": 30,
        "max_concurrency": 100,
    }
)
class SentimentAnalyzer:
    """
    Production-ready sentiment analysis service
    """

    def __init__(self):
        # Load model reference from the local model store
        self.model_ref = bentoml.models.get("sentiment_model:latest")

        # Load the model into memory
        import torch
        self.model = bentoml.transformers.load_model(self.model_ref)
        self.tokenizer = self.model_ref.custom_objects["tokenizer"]

        # Device configuration
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model.eval()

        # Label mapping
        self.label_map = {0: "negative", 1: "positive"}

        logger.info(f"Model loaded on device: {self.device}")

    def _predict(self, texts: List[str]) -> List[SentimentResult]:
        """Internal prediction method"""
        import torch

        # Tokenize inputs
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(self.device)

        # Make predictions
        with torch.no_grad():
            outputs = self.model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Process results
        results = []
        for idx, text in enumerate(texts):
            probs = probabilities[idx].cpu().numpy()
            predicted_label = int(probs.argmax())
            confidence = float(probs[predicted_label])

            result = SentimentResult(
                text=text[:100] + "..." if len(text) > 100 else text,
                sentiment=self.label_map[predicted_label],
                confidence=confidence,
                label_scores={
                    "negative": float(probs[0]),
                    "positive": float(probs[1])
                }
            )
            results.append(result)

        return results

    @bentoml.api
    async def analyze(self, input_data: TextInput) -> SentimentResult:
        """
        Analyze sentiment of a single text

        Args:
            input_data: Text input with validation

        Returns:
            Sentiment analysis result
        """
        try:
            logger.info(f"Analyzing text: {input_data.text[:50]}...")
            results = self._predict([input_data.text])
            return results[0]
        except Exception as e:
            logger.error(f"Error analyzing text: {str(e)}")
            raise

    @bentoml.api
    async def analyze_batch(self, input_data: BatchTextInput) -> List[SentimentResult]:
        """
        Analyze sentiment of multiple texts in batch

        Args:
            input_data: Batch of texts with validation

        Returns:
            List of sentiment analysis results
        """
        try:
            logger.info(f"Analyzing batch of {len(input_data.texts)} texts")
            results = self._predict(input_data.texts)
            return results
        except Exception as e:
            logger.error(f"Error analyzing batch: {str(e)}")
            raise

    @bentoml.api
    async def health(self) -> dict:
        """
        Health check endpoint (served as POST, like all custom BentoML APIs)
        """
        import torch
        return {
            "status": "healthy",
            "model": str(self.model_ref.tag),
            "device": self.device,
            "cuda_available": torch.cuda.is_available()
        }

Part 3: Configuration and Building

Step 3.1: Create bentofile.yaml

# bentofile.yaml
service: "service:SentimentAnalyzer"
labels:
  owner: ml-team
  project: sentiment-analysis
  stage: production
description: "Production sentiment analysis service using DistilBERT"
include:
  - "service.py"
  - "train_sentiment.py"
exclude:
  - "*.pyc"
  - "__pycache__/"
  - "*.log"
  - "results/"
python:
  requirements_txt: "./requirements.txt"
  lock_packages: true
docker:
  distro: debian
  python_version: "3.10"
  env:
    TRANSFORMERS_CACHE: "/tmp/transformers_cache"
  system_packages:
    - git
    - build-essential

Step 3.2: Create requirements.txt

# requirements.txt
bentoml>=1.2.0
transformers>=4.30.0
torch>=2.0.0
pydantic>=2.0.0
datasets>=2.10.0
accelerate>=0.20.0

Step 3.3: Build the Bento

bentoml build

Output:

Successfully built Bento(tag="sentiment_analyzer:abc123xyz")
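
You can confirm the build and inspect what was packaged:

bentoml list
bentoml get sentiment_analyzer:latest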

Part 4: Local Testing

Step 4.1: Serve Locally

bentoml serve service:SentimentAnalyzer
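
Once a Bento has been built, you can also serve it by tag rather than from source:

bentoml serve sentiment_analyzer:latest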

Step 4.2: Test with Different Methods

Using cURL:

# Single prediction
curl -X POST http://localhost:3000/analyze \
  -H "Content-Type: application/json" \
  -d '{"text": "This movie is absolutely fantastic! I loved every minute of it."}'

# Batch prediction
curl -X POST http://localhost:3000/analyze_batch \
  -H "Content-Type: application/json" \
  -d '{
    "texts": [
      "Great product, highly recommend!",
      "Terrible experience, waste of money.",
      "It was okay, nothing special."
    ]
  }'

# Health check (custom BentoML APIs are served as POST endpoints)
curl -X POST http://localhost:3000/health

Using Python:

# test_service.py
import requests
import json

BASE_URL = "http://localhost:3000"

def test_single():
    """Test single text analysis"""
    response = requests.post(
        f"{BASE_URL}/analyze",
        json={"text": "This is an amazing product!"}
    )
    print("Single prediction:")
    print(json.dumps(response.json(), indent=2))

def test_batch():
    """Test batch analysis"""
    texts = [
        "I love this!",
        "This is terrible.",
        "Could be better.",
        "Absolutely wonderful experience!",
        "Not worth the money."
    ]

    response = requests.post(
        f"{BASE_URL}/analyze_batch",
        json={"texts": texts}
    )
    print("\nBatch prediction:")
    print(json.dumps(response.json(), indent=2))

def test_health():
    """Test health endpoint (POST, like the other custom endpoints)"""
    response = requests.post(f"{BASE_URL}/health")
    print("\nHealth check:")
    print(json.dumps(response.json(), indent=2))

if __name__ == "__main__":
    test_single()
    test_batch()
    test_health()
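
Using the BentoML client:

A minimal sketch assuming BentoML 1.2+, whose HTTP client exposes each service API as a method of the same name:

# test_client.py (sketch)
import bentoml

with bentoml.SyncHTTPClient("http://localhost:3000") as client:
    # Keyword arguments mirror the service method signature
    result = client.analyze(input_data={"text": "This is an amazing product!"})
    print(result)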

Part 5: Containerization

Step 5.1: Build Docker Image

bentoml containerize sentiment_analyzer:latest \
  --image-tag sentiment-analyzer:v1.0.0 \
  --platform linux/amd64

Step 5.2: Run Container

docker run -p 3000:3000 \
  --name sentiment-service \
  sentiment-analyzer:v1.0.0

Step 5.3: Run with GPU Support

docker run -p 3000:3000 \
  --gpus all \
  --name sentiment-service-gpu \
  sentiment-analyzer:v1.0.0

Part 6: Kubernetes Deployment

Step 6.1: Create Kubernetes Manifests

# k8s-deployment.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: ml-services
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sentiment-analyzer
  namespace: ml-services
spec:
  replicas: 3
  selector:
    matchLabels:
      app: sentiment-analyzer
  template:
    metadata:
      labels:
        app: sentiment-analyzer
        version: v1.0.0
    spec:
      containers:
      - name: sentiment-service
        image: sentiment-analyzer:v1.0.0
        ports:
        - containerPort: 3000
          name: http
        resources:
          requests:
            memory: "4Gi"
            cpu: "2"
          limits:
            memory: "8Gi"
            cpu: "4"
        env:
        - name: BENTOML_PORT
          value: "3000"
        - name: TRANSFORMERS_CACHE
          value: "/tmp/transformers_cache"
        livenessProbe:
          httpGet:
            path: /livez   # BentoML's built-in liveness endpoint (GET)
            port: 3000
          initialDelaySeconds: 30
          periodSeconds: 10
        readinessProbe:
          httpGet:
            path: /readyz  # BentoML's built-in readiness endpoint (GET)
            port: 3000
          initialDelaySeconds: 20
          periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: sentiment-analyzer-service
  namespace: ml-services
spec:
  type: LoadBalancer
  selector:
    app: sentiment-analyzer
  ports:
  - port: 80
    targetPort: 3000
    protocol: TCP
    name: http
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: sentiment-analyzer-hpa
  namespace: ml-services
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: sentiment-analyzer
  minReplicas: 3
  maxReplicas: 10
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
  - type: Resource
    resource:
      name: memory
      target:
        type: Utilization
        averageUtilization: 80

Step 6.2: Deploy to Kubernetes

# Apply the deployment
kubectl apply -f k8s-deployment.yaml

# Check deployment status
kubectl get deployments -n ml-services

# Check pods
kubectl get pods -n ml-services

# Check service
kubectl get service -n ml-services

# View logs
kubectl logs -f deployment/sentiment-analyzer -n ml-services
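
If the LoadBalancer has not been assigned an external IP yet (for example, on a local cluster), port-forward the service and send a test request:

kubectl port-forward svc/sentiment-analyzer-service 8080:80 -n ml-services

curl -X POST http://localhost:8080/analyze \
  -H "Content-Type: application/json" \
  -d '{"text": "Deployed and responding!"}'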

Part 7: Cloud Deployment Options

Option 1: AWS Elastic Container Service (ECS)

# Push image to ECR
aws ecr create-repository --repository-name sentiment-analyzer
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin <account-id>.dkr.ecr.us-east-1.amazonaws.com

docker tag sentiment-analyzer:v1.0.0 <account-id>.dkr.ecr.us-east-1.amazonaws.com/sentiment-analyzer:v1.0.0
docker push <account-id>.dkr.ecr.us-east-1.amazonaws.com/sentiment-analyzer:v1.0.0

# Create ECS task definition and service (use AWS Console or CLI)
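
As a starting point, a minimal Fargate task definition might look like the sketch below (the family name, role ARN, and sizes are placeholders to adapt); register it with aws ecs register-task-definition --cli-input-json file://task-definition.json:

{
  "family": "sentiment-analyzer",
  "networkMode": "awsvpc",
  "requiresCompatibilities": ["FARGATE"],
  "cpu": "2048",
  "memory": "4096",
  "executionRoleArn": "arn:aws:iam::<account-id>:role/ecsTaskExecutionRole",
  "containerDefinitions": [
    {
      "name": "sentiment-service",
      "image": "<account-id>.dkr.ecr.us-east-1.amazonaws.com/sentiment-analyzer:v1.0.0",
      "portMappings": [{"containerPort": 3000, "protocol": "tcp"}],
      "essential": true
    }
  ]
}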

Option 2: Google Cloud Run

# Authenticate
gcloud auth login

# Configure project
gcloud config set project YOUR_PROJECT_ID

# Push the locally built image to Google Container Registry
gcloud auth configure-docker
docker tag sentiment-analyzer:v1.0.0 gcr.io/YOUR_PROJECT_ID/sentiment-analyzer:v1.0.0
docker push gcr.io/YOUR_PROJECT_ID/sentiment-analyzer:v1.0.0

# Deploy to Cloud Run (--port tells Cloud Run which container port to route to)
gcloud run deploy sentiment-analyzer \
  --image gcr.io/YOUR_PROJECT_ID/sentiment-analyzer:v1.0.0 \
  --platform managed \
  --region us-central1 \
  --port 3000 \
  --memory 4Gi \
  --cpu 2 \
  --max-instances 10 \
  --allow-unauthenticated
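
# After the deploy completes, fetch the service URL and send a test request
SERVICE_URL=$(gcloud run services describe sentiment-analyzer \
  --region us-central1 --format 'value(status.url)')

curl -X POST "$SERVICE_URL/analyze" \
  -H "Content-Type: application/json" \
  -d '{"text": "Cloud Run deployment works!"}'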

Option 3: Azure Container Instances

# Login to Azure
az login

# Create resource group
az group create --name ml-services --location eastus

# Create container registry
az acr create --resource-group ml-services --name sentimentacr --sku Basic

# Push image
az acr login --name sentimentacr
docker tag sentiment-analyzer:v1.0.0 sentimentacr.azurecr.io/sentiment-analyzer:v1.0.0
docker push sentimentacr.azurecr.io/sentiment-analyzer:v1.0.0

# Deploy container
az container create \
  --resource-group ml-services \
  --name sentiment-analyzer \
  --image sentimentacr.azurecr.io/sentiment-analyzer:v1.0.0 \
  --cpu 2 \
  --memory 4 \
  --dns-name-label sentiment-analyzer-service \
  --ports 3000
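
# The DNS name label gives the container a public FQDN; fetch it and test
FQDN=$(az container show \
  --resource-group ml-services \
  --name sentiment-analyzer \
  --query ipAddress.fqdn -o tsv)

curl -X POST "http://$FQDN:3000/analyze" \
  -H "Content-Type: application/json" \
  -d '{"text": "Azure deployment works!"}'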

Part 8: Monitoring and Observability

Step 8.1: Add Prometheus Metrics

# Add to service.py
from prometheus_client import Counter, Histogram, Gauge
import time

# Metrics
prediction_counter = Counter(
    'sentiment_predictions_total',
    'Total number of predictions',
    ['sentiment']
)
prediction_duration = Histogram(
    'sentiment_prediction_duration_seconds',
    'Prediction duration in seconds'
)
active_requests = Gauge(
    'sentiment_active_requests',
    'Number of active requests'
)

# Update the _predict method to record metrics
def _predict(self, texts: List[str]) -> List[SentimentResult]:
    active_requests.inc()
    start_time = time.time()

    try:
        # ... existing prediction code ...

        # Record metrics
        for result in results:
            prediction_counter.labels(sentiment=result.sentiment).inc()

        return results
    finally:
        prediction_duration.observe(time.time() - start_time)
        active_requests.dec()
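
BentoML exposes Prometheus metrics on its /metrics endpoint, where these custom metrics should appear alongside the built-in request metrics:

curl http://localhost:3000/metrics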

Step 8.2: Add Logging

# Enhanced logging in service (requires `import time` and `import uuid` at the top of service.py)
import time
import uuid

@bentoml.api
async def analyze(self, input_data: TextInput) -> SentimentResult:
    request_id = str(uuid.uuid4())
    logger.info(f"[{request_id}] Received prediction request")

    try:
        start_time = time.time()
        result = self._predict([input_data.text])[0]
        duration = time.time() - start_time

        logger.info(
            f"[{request_id}] Prediction completed in {duration:.3f}s: "
            f"sentiment={result.sentiment}, confidence={result.confidence:.3f}"
        )
        return result
    except Exception as e:
        logger.error(f"[{request_id}] Prediction failed: {str(e)}", exc_info=True)
        raise

Part 9: Performance Optimization

Enable Adaptive Batching

The runner-based batching configuration from BentoML 1.1 does not apply to the 1.2 service API used in this guide. With 1.2, adaptive batching is enabled per endpoint by marking it batchable, and BentoML then merges concurrent requests into a single model call. A minimal sketch (the batchable endpoint takes the batch list directly; analyze_texts is an illustrative name):

# In service.py, mark an endpoint as batchable
@bentoml.service(
    resources={
        "cpu": "4",
        "memory": "4Gi",
    },
    traffic={
        "timeout": 30,
        "max_concurrency": 100,
    },
)
class SentimentAnalyzer:
    # ... rest of the code ...

    @bentoml.api(batchable=True, max_batch_size=32, max_latency_ms=100)
    async def analyze_texts(self, texts: List[str]) -> List[SentimentResult]:
        # BentoML gathers concurrent requests into `texts`, up to
        # max_batch_size items, waiting at most max_latency_ms
        return self._predict(texts)

Under load, the lists from separate requests are merged into one forward pass, improving hardware utilization.

Summary

This comprehensive example covered:

  • ✅ Model Training - Training and saving a transformer model
  • ✅ Service Implementation - Production-ready service with validation
  • ✅ Testing - Local testing and validation
  • ✅ Containerization - Building Docker images
  • ✅ Kubernetes Deployment - Complete K8s setup with autoscaling
  • ✅ Cloud Deployment - AWS, GCP, and Azure options
  • ✅ Monitoring - Metrics and logging
  • ✅ Optimization - Performance tuning

Next Steps