Detailed Deployment Example
This comprehensive guide demonstrates deploying a production-ready machine learning model using BentoML, from training to serving.
Project Overview
We'll build a complete sentiment analysis service that:
- Classifies text sentiment (positive or negative)
- Uses a pre-trained transformer model
- Includes preprocessing and postprocessing
- Has proper error handling and validation
- Is production-ready with monitoring
Prerequisites
pip install bentoml transformers torch datasets scikit-learn accelerate
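Optionally, confirm the environment is ready with a quick check (a sketch; exact versions will vary):

# check_env.py — quick sanity check of installed packages
import bentoml
import torch
import transformers

print("bentoml:", bentoml.__version__)
print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())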
Part 1: Model Training and Preparation
Step 1.1: Train the Model
# train_sentiment.py
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset
import bentoml

def train_model():
    # Load dataset (shuffle first: the imdb train split is sorted by label,
    # so taking the first 5000 rows unshuffled would yield only one class)
    print("Loading dataset...")
    dataset = load_dataset("imdb", split="train")
    dataset = dataset.shuffle(seed=42).select(range(5000))  # Use a subset for the demo

    # Load pre-trained model and tokenizer
    print("Loading model and tokenizer...")
    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
    )

    # Tokenize dataset
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )

    print("Tokenizing dataset...")
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,  # Increase for better accuracy
        per_device_train_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
    )

    # Train
    print("Training model...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )
    trainer.train()

    # Save model with BentoML
    print("Saving model to BentoML...")
    saved_model = bentoml.transformers.save_model(
        "sentiment_model",
        model,
        custom_objects={
            "tokenizer": tokenizer,
        },
        labels={
            "framework": "transformers",
            "task": "sentiment-analysis",
            "model_type": "distilbert",
        },
        metadata={
            "model_name": model_name,
            "num_labels": 2,
            "max_length": 512,
            "training_samples": 5000,
        },
    )
    print(f"Model saved: {saved_model.tag}")
    return saved_model

if __name__ == "__main__":
    train_model()
Run training:
python train_sentiment.py
Step 1.2: Verify Model
bentoml models list
bentoml models get sentiment_model:latest
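Beyond the CLI, the saved model can be inspected programmatically; a minimal sketch, assuming the training step above has completed:

# verify_model.py — sketch; assumes train_sentiment.py has been run
import bentoml

model_ref = bentoml.models.get("sentiment_model:latest")
print("Tag:", model_ref.tag)
print("Labels:", model_ref.info.labels)
print("Metadata:", model_ref.info.metadata)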
Part 2: Service Implementation
Step 2.1: Create Service with Advanced Features
# service.py
import logging
from typing import List

import bentoml
import torch
from pydantic import BaseModel, Field, field_validator

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Input validation models (pydantic v2 style, matching requirements.txt)
class TextInput(BaseModel):
    text: str = Field(..., min_length=1, max_length=5000)

    @field_validator("text")
    @classmethod
    def validate_text(cls, v: str) -> str:
        if not v.strip():
            raise ValueError("Text cannot be empty")
        return v.strip()

class BatchTextInput(BaseModel):
    texts: List[str] = Field(..., min_length=1, max_length=100)

    @field_validator("texts")
    @classmethod
    def validate_texts(cls, v: List[str]) -> List[str]:
        return [text.strip() for text in v if text.strip()]

# Output models
class SentimentResult(BaseModel):
    text: str
    sentiment: str
    confidence: float
    label_scores: dict
@bentoml.service(
    resources={
        "cpu": "4",
        "memory": "4Gi",
    },
    traffic={
        "timeout": 30,
        "max_concurrency": 100,
    },
)
class SentimentAnalyzer:
    """Production-ready sentiment analysis service."""

    def __init__(self):
        # Load model and tokenizer from the local BentoML model store
        self.model_ref = bentoml.models.get("sentiment_model:latest")
        self.model = bentoml.transformers.load_model(self.model_ref)
        self.tokenizer = self.model_ref.custom_objects["tokenizer"]

        # Device configuration
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model.eval()

        # Label mapping
        self.label_map = {0: "negative", 1: "positive"}
        logger.info(f"Model loaded on device: {self.device}")
    def _predict(self, texts: List[str]) -> List[SentimentResult]:
        """Internal prediction method."""
        # Tokenize inputs
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(self.device)

        # Make predictions
        with torch.no_grad():
            outputs = self.model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Process results
        results = []
        for idx, text in enumerate(texts):
            probs = probabilities[idx].cpu().numpy()
            predicted_label = int(probs.argmax())
            confidence = float(probs[predicted_label])
            result = SentimentResult(
                text=text[:100] + "..." if len(text) > 100 else text,
                sentiment=self.label_map[predicted_label],
                confidence=confidence,
                label_scores={
                    "negative": float(probs[0]),
                    "positive": float(probs[1]),
                },
            )
            results.append(result)
        return results
    @bentoml.api
    async def analyze(self, input_data: TextInput) -> SentimentResult:
        """
        Analyze the sentiment of a single text.

        Args:
            input_data: Text input with validation

        Returns:
            Sentiment analysis result
        """
        try:
            logger.info(f"Analyzing text: {input_data.text[:50]}...")
            results = self._predict([input_data.text])
            return results[0]
        except Exception as e:
            logger.error(f"Error analyzing text: {e}")
            raise

    @bentoml.api
    async def analyze_batch(self, input_data: BatchTextInput) -> List[SentimentResult]:
        """
        Analyze the sentiment of multiple texts in one request.

        Args:
            input_data: Batch of texts with validation

        Returns:
            List of sentiment analysis results
        """
        try:
            logger.info(f"Analyzing batch of {len(input_data.texts)} texts")
            return self._predict(input_data.texts)
        except Exception as e:
            logger.error(f"Error analyzing batch: {e}")
            raise

    @bentoml.api
    async def health(self) -> dict:
        """
        Health check endpoint.

        Note: custom @bentoml.api endpoints are served as POST routes;
        BentoML also exposes built-in /healthz, /livez and /readyz probes.
        """
        return {
            "status": "healthy",
            "model": str(self.model_ref.tag),
            "device": self.device,
            "cuda_available": torch.cuda.is_available(),
        }
Part 3: Configuration and Building
Step 3.1: Create bentofile.yaml
# bentofile.yaml
service: "service:SentimentAnalyzer"
labels:
  owner: ml-team
  project: sentiment-analysis
  stage: production
description: "Production sentiment analysis service using DistilBERT"
include:
  - "service.py"
  - "train_sentiment.py"
exclude:
  - "*.pyc"
  - "__pycache__/"
  - "*.log"
  - "results/"
python:
  requirements_txt: "./requirements.txt"
  lock_packages: true
docker:
  distro: debian
  python_version: "3.10"
  env:
    TRANSFORMERS_CACHE: "/tmp/transformers_cache"
  system_packages:
    - git
    - build-essential
Step 3.2: Create requirements.txt
# requirements.txt
bentoml>=1.2.0
transformers>=4.30.0
torch>=2.0.0
pydantic>=2.0.0
datasets>=2.10.0
prometheus-client  # used by the custom metrics added in Part 8
Step 3.3: Build the Bento
bentoml build
Output:
Successfully built Bento(tag="sentiment_analyzer:abc123xyz")
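The same check can be done from Python (a sketch, assuming the build above succeeded):

# verify_bento.py — sketch; assumes `bentoml build` has completed
import bentoml

for bento in bentoml.bentos.list():
    print(bento.tag)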
Part 4: Local Testing
Step 4.1: Serve Locally
bentoml serve service:SentimentAnalyzer
Step 4.2: Test with Different Methods
Using cURL:
# Single prediction
curl -X POST http://localhost:3000/analyze \
  -H "Content-Type: application/json" \
  -d '{"text": "This movie is absolutely fantastic! I loved every minute of it."}'

# Batch prediction
curl -X POST http://localhost:3000/analyze_batch \
  -H "Content-Type: application/json" \
  -d '{
    "texts": [
      "Great product, highly recommend!",
      "Terrible experience, waste of money.",
      "It was okay, nothing special."
    ]
  }'

# Health check (custom @bentoml.api routes are POST, not GET)
curl -X POST http://localhost:3000/health
Using Python:
# test_service.py
import json

import requests

BASE_URL = "http://localhost:3000"

def test_single():
    """Test single text analysis"""
    response = requests.post(
        f"{BASE_URL}/analyze",
        json={"text": "This is an amazing product!"},
    )
    print("Single prediction:")
    print(json.dumps(response.json(), indent=2))

def test_batch():
    """Test batch analysis"""
    texts = [
        "I love this!",
        "This is terrible.",
        "Could be better.",
        "Absolutely wonderful experience!",
        "Not worth the money.",
    ]
    response = requests.post(
        f"{BASE_URL}/analyze_batch",
        json={"texts": texts},
    )
    print("\nBatch prediction:")
    print(json.dumps(response.json(), indent=2))

def test_health():
    """Test health endpoint (POST, like all custom BentoML API routes)"""
    response = requests.post(f"{BASE_URL}/health")
    print("\nHealth check:")
    print(json.dumps(response.json(), indent=2))

if __name__ == "__main__":
    test_single()
    test_batch()
    test_health()
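BentoML 1.2+ also provides a Python client whose methods mirror the service's API names; a minimal sketch, assuming the service is running locally on port 3000:

# client_test.py — sketch using bentoml.SyncHTTPClient (BentoML 1.2+)
import bentoml

client = bentoml.SyncHTTPClient("http://localhost:3000")
result = client.analyze(input_data={"text": "The client API makes testing easy!"})
print(result)
client.close()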
Part 5: Containerization
Step 5.1: Build Docker Image
bentoml containerize sentiment_analyzer:latest \
  --image-tag sentiment-analyzer:v1.0.0 \
  --platform linux/amd64
Step 5.2: Run Container
docker run -p 3000:3000 \
  --name sentiment-service \
  sentiment-analyzer:v1.0.0
Step 5.3: Run with GPU Support
The image built above is CPU-only by default; to serve on a GPU, rebuild with a CUDA-enabled base image (for example by setting docker.cuda_version in bentofile.yaml), then run:
docker run -p 3000:3000 \
  --gpus all \
  --name sentiment-service-gpu \
  sentiment-analyzer:v1.0.0
Part 6: Kubernetes Deployment
Step 6.1: Create Kubernetes Manifests
# k8s-deployment.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: ml-services
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: sentiment-analyzer
  namespace: ml-services
spec:
  replicas: 3
  selector:
    matchLabels:
      app: sentiment-analyzer
  template:
    metadata:
      labels:
        app: sentiment-analyzer
        version: v1.0.0
    spec:
      containers:
        - name: sentiment-service
          image: sentiment-analyzer:v1.0.0  # push to a registry your cluster can pull from
          ports:
            - containerPort: 3000
              name: http
          resources:
            requests:
              memory: "4Gi"
              cpu: "2"
            limits:
              memory: "8Gi"
              cpu: "4"
          env:
            - name: BENTOML_PORT
              value: "3000"
            - name: TRANSFORMERS_CACHE
              value: "/tmp/transformers_cache"
          livenessProbe:
            httpGet:
              path: /livez   # BentoML's built-in liveness endpoint
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /readyz  # BentoML's built-in readiness endpoint
              port: 3000
            initialDelaySeconds: 20
            periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: sentiment-analyzer-service
  namespace: ml-services
spec:
  type: LoadBalancer
  selector:
    app: sentiment-analyzer
  ports:
    - port: 80
      targetPort: 3000
      protocol: TCP
      name: http
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: sentiment-analyzer-hpa
  namespace: ml-services
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: sentiment-analyzer
  minReplicas: 3
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
Step 6.2: Deploy to Kubernetes
# Apply the deployment
kubectl apply -f k8s-deployment.yaml
# Check deployment status
kubectl get deployments -n ml-services
# Check pods
kubectl get pods -n ml-services
# Check service
kubectl get service -n ml-services
# View logs
kubectl logs -f deployment/sentiment-analyzer -n ml-services
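Once the pods are ready, a quick smoke test is possible through a port-forward (for example, run kubectl port-forward svc/sentiment-analyzer-service 8080:80 -n ml-services in a separate terminal); a minimal sketch:

# k8s_smoke_test.py — sketch; assumes an active port-forward on localhost:8080
import requests

resp = requests.post(
    "http://localhost:8080/analyze",
    json={"text": "Deployment smoke test: this works great!"},
    timeout=10,
)
resp.raise_for_status()
print(resp.json())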
Part 7: Cloud Deployment Options
Option 1: AWS Elastic Container Service (ECS)
# Push image to ECR
aws ecr create-repository --repository-name sentiment-analyzer
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin <account-id>.dkr.ecr.us-east-1.amazonaws.com
docker tag sentiment-analyzer:v1.0.0 <account-id>.dkr.ecr.us-east-1.amazonaws.com/sentiment-analyzer:v1.0.0
docker push <account-id>.dkr.ecr.us-east-1.amazonaws.com/sentiment-analyzer:v1.0.0
# Create ECS task definition and service (use AWS Console or CLI)
Option 2: Google Cloud Run
# Authenticate
gcloud auth login
# Configure project
gcloud config set project YOUR_PROJECT_ID
# Allow Docker to push to Google Container Registry
gcloud auth configure-docker
# Tag and push the image built in Part 5
docker tag sentiment-analyzer:v1.0.0 gcr.io/YOUR_PROJECT_ID/sentiment-analyzer:v1.0.0
docker push gcr.io/YOUR_PROJECT_ID/sentiment-analyzer:v1.0.0
# Deploy to Cloud Run (--port tells Cloud Run the container listens on 3000)
gcloud run deploy sentiment-analyzer \
  --image gcr.io/YOUR_PROJECT_ID/sentiment-analyzer:v1.0.0 \
  --platform managed \
  --region us-central1 \
  --memory 4Gi \
  --cpu 2 \
  --max-instances 10 \
  --port 3000 \
  --allow-unauthenticated
Option 3: Azure Container Instances
# Login to Azure
az login
# Create resource group
az group create --name ml-services --location eastus
# Create container registry
az acr create --resource-group ml-services --name sentimentacr --sku Basic
# Push image
az acr login --name sentimentacr
docker tag sentiment-analyzer:v1.0.0 sentimentacr.azurecr.io/sentiment-analyzer:v1.0.0
docker push sentimentacr.azurecr.io/sentiment-analyzer:v1.0.0
# Deploy container
az container create \
  --resource-group ml-services \
  --name sentiment-analyzer \
  --image sentimentacr.azurecr.io/sentiment-analyzer:v1.0.0 \
  --cpu 2 \
  --memory 4 \
  --dns-name-label sentiment-analyzer-service \
  --ports 3000
Part 8: Monitoring and Observability
Step 8.1: Add Prometheus Metrics
# Add to service.py
import time

from prometheus_client import Counter, Histogram, Gauge

# Metrics
prediction_counter = Counter(
    'sentiment_predictions_total',
    'Total number of predictions',
    ['sentiment']
)
prediction_duration = Histogram(
    'sentiment_prediction_duration_seconds',
    'Prediction duration in seconds'
)
active_requests = Gauge(
    'sentiment_active_requests',
    'Number of active requests'
)

# Update the _predict method to record metrics
def _predict(self, texts: List[str]) -> List[SentimentResult]:
    active_requests.inc()
    start_time = time.time()
    try:
        # ... existing prediction code (builds `results`) ...

        # Record metrics
        for result in results:
            prediction_counter.labels(sentiment=result.sentiment).inc()
        return results
    finally:
        prediction_duration.observe(time.time() - start_time)
        active_requests.dec()
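BentoML serves Prometheus metrics at /metrics by default; a quick sketch to confirm the custom metrics are being exported (assuming the service is running locally):

# check_metrics.py — sketch; assumes the service is running on localhost:3000
import requests

metrics = requests.get("http://localhost:3000/metrics").text
for line in metrics.splitlines():
    if line.startswith("sentiment_"):
        print(line)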
Step 8.2: Add Logging
# Enhanced logging in service
import time
import uuid

@bentoml.api
async def analyze(self, input_data: TextInput) -> SentimentResult:
    request_id = str(uuid.uuid4())
    logger.info(f"[{request_id}] Received prediction request")
    try:
        start_time = time.time()
        result = self._predict([input_data.text])[0]
        duration = time.time() - start_time
        logger.info(
            f"[{request_id}] Prediction completed in {duration:.3f}s: "
            f"sentiment={result.sentiment}, confidence={result.confidence:.3f}"
        )
        return result
    except Exception as e:
        logger.error(f"[{request_id}] Prediction failed: {e}", exc_info=True)
        raise
Part 9: Performance Optimization
Enable Adaptive Batching
With the current BentoML service API, adaptive batching is enabled per endpoint by passing batchable=True to @bentoml.api; BentoML then merges concurrent requests into server-side batches, bounded by max_batch_size and max_latency_ms. (The runner-based batching configuration from older BentoML releases does not apply to @bentoml.service classes.) A sketch:

# In service.py, expose a batchable endpoint
@bentoml.service(
    resources={
        "cpu": "4",
        "memory": "4Gi",
    },
    traffic={
        "timeout": 30,
        "max_concurrency": 100,
    },
)
class SentimentAnalyzer:
    # ... __init__, _predict and the other endpoints as before ...

    @bentoml.api(batchable=True, max_batch_size=32, max_latency_ms=100)
    async def analyze_many(self, texts: List[str]) -> List[SentimentResult]:
        # Concurrent calls may be merged into a single `texts` list here
        return self._predict(texts)
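To observe batching, issue several requests concurrently and watch the server logs; a minimal sketch, assuming the analyze_many endpoint above:

# batch_demo.py — sketch; assumes the batchable analyze_many endpoint
import concurrent.futures

import requests

def call(text: str):
    resp = requests.post(
        "http://localhost:3000/analyze_many",
        json={"texts": [text]},
    )
    return resp.json()

texts = ["Great!", "Awful.", "Meh.", "Loved it!", "Never again."]
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
    for result in pool.map(call, texts):
        print(result)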
Summary
This comprehensive example covered:
- ✅ Model Training - Training and saving a transformer model
- ✅ Service Implementation - Production-ready service with validation
- ✅ Testing - Local testing and validation
- ✅ Containerization - Building Docker images
- ✅ Kubernetes Deployment - Complete K8s setup with autoscaling
- ✅ Cloud Deployment - AWS, GCP, and Azure options
- ✅ Monitoring - Metrics and logging
- ✅ Optimization - Performance tuning
Next Steps
- Comparison - Compare BentoML with other ML deployment tools
- Best Practices - Production deployment patterns
- Official Documentation - Explore advanced features