Basic Usage
Learn the fundamental concepts and operations in PostgresML through practical examples.
Core Concepts
PostgresML follows a simple workflow:
- Prepare your data in PostgreSQL tables
- Train models using pgml.train()
- Deploy models automatically
- Predict using pgml.predict()
Your First Model
Let's create a simple classification model step by step.
Step 1: Prepare Your Data
-- Create a sample dataset for classification
CREATE TABLE iris_data (
id SERIAL PRIMARY KEY,
sepal_length FLOAT,
sepal_width FLOAT,
petal_length FLOAT,
petal_width FLOAT,
species TEXT
);
-- Insert classic Iris dataset samples
INSERT INTO iris_data (sepal_length, sepal_width, petal_length, petal_width, species)
VALUES
(5.1, 3.5, 1.4, 0.2, 'setosa'),
(4.9, 3.0, 1.4, 0.2, 'setosa'),
(7.0, 3.2, 4.7, 1.4, 'versicolor'),
(6.4, 3.2, 4.5, 1.5, 'versicolor'),
(6.3, 3.3, 6.0, 2.5, 'virginica'),
(5.8, 2.7, 5.1, 1.9, 'virginica');
-- Add more samples...
-- View your data
SELECT * FROM iris_data LIMIT 5;
Step 2: Train a Model
-- Train a classification model
SELECT * FROM pgml.train(
project_name => 'iris_classifier',
task => 'classification',
relation_name => 'iris_data',
y_column_name => 'species',
algorithm => 'xgboost',
hyperparams => '{
"n_estimators": 100,
"max_depth": 5
}'
);
pgml.train() returns a training summary, and detailed information about the run is recorded in pgml.models (see Model Metrics below), including:
- Model ID
- Training time
- Accuracy metrics
- Feature importance
Step 3: Make Predictions
-- Predict on new data
SELECT
sepal_length,
sepal_width,
petal_length,
petal_width,
species AS actual,
pgml.predict(
'iris_classifier',
ARRAY[sepal_length, sepal_width, petal_length, petal_width]
) AS predicted
FROM iris_data
LIMIT 10;
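A quick way to eyeball model quality is to tabulate predictions against the stored labels, a rough confusion matrix built from the same table. This is only a sketch: it assumes pgml.predict returns the class label in the same form shown above, and the held-out test metrics reported at training time remain the authoritative measure.
-- Tabulate actual vs. predicted labels
SELECT
species AS actual,
pgml.predict(
'iris_classifier',
ARRAY[sepal_length, sepal_width, petal_length, petal_width]
) AS predicted,
COUNT(*) AS n
FROM iris_data
GROUP BY 1, 2
ORDER BY 1, 2;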
Common Machine Learning Tasks
Classification
Predict categorical labels:
-- Binary classification
SELECT * FROM pgml.train(
'spam_detector',
'classification',
'emails',
'is_spam'
);
-- Multi-class classification
SELECT * FROM pgml.train(
'product_categorizer',
'classification',
'products',
'category',
algorithm => 'random_forest'
);
Regression
Predict continuous values:
-- Train a regression model
SELECT * FROM pgml.train(
'house_price_predictor',
'regression',
'houses',
'price',
algorithm => 'lightgbm',
hyperparams => '{
"n_estimators": 200,
"learning_rate": 0.1
}'
);
-- Make predictions
SELECT
square_feet,
bedrooms,
bathrooms,
pgml.predict(
'house_price_predictor',
ARRAY[square_feet, bedrooms, bathrooms, location_score]
) AS predicted_price
FROM houses;
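Because the houses table also stores the observed price (it is the training target above), comparing it with the prediction gives a rough error estimate. A minimal sketch, assuming price is stored as a numeric type:
-- Mean absolute error of the deployed model over the table
SELECT
AVG(ABS(price - pgml.predict(
'house_price_predictor',
ARRAY[square_feet, bedrooms, bathrooms, location_score]
))) AS mean_absolute_error
FROM houses;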
Clustering
Group similar data points:
-- K-means clustering
SELECT * FROM pgml.train(
'customer_segments',
'clustering',
'customers',
algorithm => 'kmeans',
hyperparams => '{
"n_clusters": 5
}'
);
-- Get cluster assignments
SELECT
customer_id,
pgml.predict(
'customer_segments',
ARRAY[age, income, purchase_frequency]
) AS cluster
FROM customers;
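Once clusters are assigned, a simple aggregate shows how the segments are distributed. A sketch using the same customers columns as above:
-- Count customers per segment
SELECT
pgml.predict(
'customer_segments',
ARRAY[age, income, purchase_frequency]
) AS cluster,
COUNT(*) AS customers
FROM customers
GROUP BY 1
ORDER BY 1;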
Working with Different Algorithms
PostgresML supports many algorithms. Here's how to use them:
Tree-Based Models
-- Random Forest
SELECT * FROM pgml.train(
'my_project',
'classification',
'my_table',
'label',
algorithm => 'random_forest',
hyperparams => '{
"n_estimators": 100,
"max_depth": 10,
"min_samples_split": 2
}'
);
-- XGBoost
SELECT * FROM pgml.train(
'my_project',
'classification',
'my_table',
'label',
algorithm => 'xgboost',
hyperparams => '{
"n_estimators": 100,
"learning_rate": 0.1,
"max_depth": 6
}'
);
-- LightGBM
SELECT * FROM pgml.train(
'my_project',
'classification',
'my_table',
'label',
algorithm => 'lightgbm',
hyperparams => '{
"n_estimators": 100,
"num_leaves": 31
}'
);
Linear Models
-- Logistic Regression
SELECT * FROM pgml.train(
'simple_classifier',
'classification',
'data',
'label',
algorithm => 'logistic_regression'
);
-- Linear Regression
SELECT * FROM pgml.train(
'price_predictor',
'regression',
'data',
'price',
algorithm => 'linear_regression'
);
-- Ridge Regression (with regularization)
SELECT * FROM pgml.train(
'robust_predictor',
'regression',
'data',
'target',
algorithm => 'ridge',
hyperparams => '{
"alpha": 1.0
}'
);
Support Vector Machines
-- SVM Classification
SELECT * FROM pgml.train(
'svm_classifier',
'classification',
'data',
'label',
algorithm => 'svm',
hyperparams => '{
"kernel": "rbf",
"C": 1.0
}'
);
Model Management
View Available Models
-- List all trained models
SELECT * FROM pgml.models ORDER BY created_at DESC;
-- Get models for a specific project
SELECT * FROM pgml.models
WHERE project_name = 'iris_classifier'
ORDER BY created_at DESC;
-- View deployed models
SELECT * FROM pgml.deployed_models;
Model Metrics
-- View model performance metrics
SELECT
id,
project_name,
algorithm,
metrics->>'accuracy' AS accuracy,
metrics->>'f1' AS f1_score,
metrics->>'precision' AS precision,
metrics->>'recall' AS recall
FROM pgml.models
WHERE project_name = 'iris_classifier'
ORDER BY (metrics->>'accuracy')::float DESC;
Deploy Models
-- Deploy the most recently trained model
SELECT pgml.deploy(
project_name => 'iris_classifier',
strategy => 'most_recent'
);
-- Deploy best model by metric
SELECT pgml.deploy(
project_name => 'iris_classifier',
strategy => 'best_score',
algorithm => 'xgboost'
);
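If a newly deployed model underperforms, you can switch back to the previously deployed one. A sketch assuming the 'rollback' strategy is available in your PostgresML version:
-- Roll back to the previously deployed model
SELECT pgml.deploy(
project_name => 'iris_classifier',
strategy => 'rollback'
);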
Data Preprocessing
Feature Selection
-- Train with specific features
SELECT * FROM pgml.train(
'selective_model',
'classification',
'SELECT feature1, feature2, feature5, label FROM my_table',
'label'
);
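If your PostgresML version does not accept a query string as the relation name, a view achieves the same effect; the view name below is illustrative:
-- Equivalent approach: train on a view that exposes only the selected columns
CREATE VIEW my_table_selected AS
SELECT feature1, feature2, feature5, label FROM my_table;
SELECT * FROM pgml.train(
'selective_model',
'classification',
'my_table_selected',
'label'
);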
Train-Test Split
-- Use automatic train-test split
SELECT * FROM pgml.train(
'my_model',
'classification',
'my_table',
'label',
test_size => 0.2, -- 20% test set
test_sampling => 'random'
);
Handling Categorical Features
-- PostgresML automatically handles categorical features
-- Just ensure your data types are correct
CREATE TABLE products (
id SERIAL,
category TEXT, -- Categorical
brand TEXT, -- Categorical
price NUMERIC, -- Numeric
rating FLOAT, -- Numeric
is_popular BOOLEAN -- Binary (converted automatically)
);
-- Train directly
SELECT * FROM pgml.train(
'product_recommender',
'classification',
'products',
'is_popular'
);
Batch Predictions
Process multiple predictions efficiently:
-- Predict on entire table
CREATE TABLE predictions AS
SELECT
id,
pgml.predict(
'my_model',
ARRAY[feature1, feature2, feature3]
) AS prediction,
pgml.predict_proba(
'my_model',
ARRAY[feature1, feature2, feature3]
) AS probability
FROM my_data;
-- View results
SELECT * FROM predictions LIMIT 10;
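Another common pattern is writing predictions back into the source table instead of creating a new one. A minimal sketch; the prediction column is added here for illustration, and its type should match your task:
-- Store predictions alongside the original rows
ALTER TABLE my_data ADD COLUMN IF NOT EXISTS prediction REAL;
UPDATE my_data
SET prediction = pgml.predict(
'my_model',
ARRAY[feature1, feature2, feature3]
);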
Hyperparameter Tuning
Grid Search Example
-- Try multiple hyperparameter combinations
-- Random Forest with different depths
SELECT * FROM pgml.train(
'tuned_model_v1',
'classification',
'my_data',
'label',
algorithm => 'random_forest',
hyperparams => '{"n_estimators": 50, "max_depth": 5}'
);
SELECT * FROM pgml.train(
'tuned_model_v2',
'classification',
'my_data',
'label',
algorithm => 'random_forest',
hyperparams => '{"n_estimators": 100, "max_depth": 10}'
);
-- Compare results
SELECT
project_name,
algorithm,
hyperparams,
metrics->>'accuracy' AS accuracy
FROM pgml.models
WHERE project_name LIKE 'tuned_model%'
ORDER BY (metrics->>'accuracy')::float DESC;
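PostgresML can also search hyperparameter combinations in a single call. The sketch below assumes your version of pgml.train supports the search and search_params arguments; check the training reference for the exact options in your release:
-- Grid search over a small hyperparameter grid in one training call
SELECT * FROM pgml.train(
'tuned_model_grid',
'classification',
'my_data',
'label',
algorithm => 'random_forest',
search => 'grid',
search_params => '{
"n_estimators": [50, 100],
"max_depth": [5, 10]
}'
);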
Best Practices
1. Start Simple
Begin with simple algorithms and add complexity as needed:
-- Start with logistic regression
SELECT * FROM pgml.train('my_project', 'classification', 'data', 'label',
algorithm => 'logistic_regression');
-- Then try tree-based if needed
SELECT * FROM pgml.train('my_project', 'classification', 'data', 'label',
algorithm => 'random_forest');
2. Monitor Performance
Always check model metrics:
SELECT
algorithm,
metrics->>'accuracy' AS accuracy,
training_time
FROM pgml.models
WHERE project_name = 'my_project'
ORDER BY created_at DESC
LIMIT 5;
3. Use Appropriate Test Sets
-- Use a time-ordered split for time series: hold out the most recent rows
SELECT * FROM pgml.train(
'time_series_model',
'regression',
'SELECT * FROM sensor_data WHERE timestamp < NOW() - INTERVAL ''7 days''',
'value',
test_size => 0.2,
test_sampling => 'last'
);
4. Keep Data Fresh
Retrain models periodically:
-- Schedule regular retraining (use pg_cron or similar)
SELECT * FROM pgml.train(
'daily_predictor',
'regression',
'recent_data',
'target'
);
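With the pg_cron extension installed, retraining can be scheduled directly in the database. A sketch assuming pg_cron is available; the job name and schedule are illustrative:
-- Retrain every night at 03:00
SELECT cron.schedule(
'retrain-daily-predictor',
'0 3 * * *',
$$SELECT * FROM pgml.train('daily_predictor', 'regression', 'recent_data', 'target')$$
);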
Next Steps
Now that you understand the basics, explore:
- Training Models - Advanced training techniques
- Making Predictions - Prediction strategies
- Transformers & NLP - Use pre-trained models
- Vector Operations - Semantic search and embeddings