Vector Operations & Embeddings
PostgresML provides powerful vector operations for semantic search, similarity matching, and Retrieval Augmented Generation (RAG) applications.
Overview
Vector embeddings represent text, images, or other data as high-dimensional vectors, enabling semantic similarity search and machine learning operations.
Creating Embeddings
Generate Text Embeddings
-- Generate embeddings for text
-- (pgml.embed returns a float array; cast with ::vector when storing in pgvector)
SELECT pgml.embed(
transformer => 'sentence-transformers/all-MiniLM-L6-v2',
text => 'PostgreSQL is a powerful database'
) AS embedding;
-- Returns: [0.123, -0.456, 0.789, ...] (384-dimensional vector)
-- Create embeddings for multiple texts (one model invocation per row)
SELECT pgml.embed(
transformer => 'sentence-transformers/all-MiniLM-L6-v2',
text => text,
kwargs => '{"normalize": true}'::JSONB -- unit-length output, suited to cosine similarity
) AS embedding
FROM documents;
Popular Embedding Models
-- Sentence Transformers (recommended)
'sentence-transformers/all-MiniLM-L6-v2' -- 384 dims, fast, good quality
'sentence-transformers/all-mpnet-base-v2' -- 768 dims, high quality
'sentence-transformers/multi-qa-MiniLM-L6-cos-v1' -- Optimized for Q&A
-- E5 family (NOTE(review): these are intfloat E5 models, not OpenAI models;
-- the model cards recommend "query:"/"passage:" text prefixes — confirm before use)
'intfloat/e5-small-v2' -- 384 dims
'intfloat/e5-base-v2' -- 768 dims
'intfloat/e5-large-v2' -- 1024 dims
-- Multilingual
'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
Storing Embeddings
Create Table with Vectors
-- Install pgvector extension (provides the vector type, distance operators, and indexes)
CREATE EXTENSION IF NOT EXISTS vector;
-- Create table with vector column
CREATE TABLE documents (
id SERIAL PRIMARY KEY,
title TEXT,
content TEXT,
embedding vector(384) -- Dimension matches model output
);
-- Add embeddings to existing documents
-- (pgml.embed returns a float array; the ::vector cast stores it in the pgvector column)
UPDATE documents
SET embedding = pgml.embed(
'sentence-transformers/all-MiniLM-L6-v2',
content
)::vector;
Indexing for Performance
-- Create HNSW index for fast approximate nearest-neighbor search
CREATE INDEX ON documents
USING hnsw (embedding vector_cosine_ops);
-- Or IVFFlat index
-- (build AFTER the table is populated so the list centroids are representative)
CREATE INDEX ON documents
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);
-- Analyze the table so the planner has fresh statistics
ANALYZE documents;
Semantic Search
Basic Similarity Search
-- Find similar documents.
-- The query embedding is computed ONCE in a LATERAL subquery instead of
-- invoking pgml.embed twice (once in SELECT, once in ORDER BY) — the model
-- call is by far the most expensive part of this query.
-- NOTE(review): ordering by an expression involving a joined relation may
-- bypass the HNSW/IVFFlat index on some planners; verify with EXPLAIN.
SELECT
    d.id,
    d.title,
    1 - (d.embedding <=> q.query_embedding) AS similarity  -- cosine similarity
FROM documents d
CROSS JOIN LATERAL (
    SELECT pgml.embed(
        'sentence-transformers/all-MiniLM-L6-v2',
        'machine learning in databases'
    )::vector AS query_embedding
) q
ORDER BY d.embedding <=> q.query_embedding  -- ascending distance = best match first
LIMIT 10;
Distance Operators
PostgresML supports multiple distance metrics:
-- Cosine distance (most common for text embeddings)
embedding <=> query_embedding
-- L2 distance (Euclidean)
embedding <-> query_embedding
-- Inner product
-- NOTE(review): in pgvector, <#> returns the NEGATIVE inner product so that
-- smaller values rank better — confirm against your pgvector version.
embedding <#> query_embedding
-- Example usage
-- (the query embedding is computed once in the LATERAL subquery and
-- reused for all three metrics)
SELECT
title,
embedding <=> query_emb AS cosine_distance,
embedding <-> query_emb AS l2_distance,
embedding <#> query_emb AS inner_product
FROM documents,
LATERAL (
SELECT pgml.embed('sentence-transformers/all-MiniLM-L6-v2', 'search query')::vector
) AS q(query_emb)
ORDER BY cosine_distance
LIMIT 5;
Semantic Search Applications
Document Search System
-- Create search function.
-- semantic_search(query_text, limit_count): returns the limit_count documents
-- most similar to query_text, with a 200-char preview and cosine similarity.
-- The query embedding is computed ONCE in a CTE rather than twice (the
-- original called pgml.embed in both SELECT and ORDER BY).
CREATE OR REPLACE FUNCTION semantic_search(
    query_text TEXT,
    limit_count INT DEFAULT 10
)
RETURNS TABLE (
    document_id INT,
    title TEXT,
    content_preview TEXT,
    similarity FLOAT
) AS $$
    WITH query AS (
        -- Single (expensive) embedding-model call per function invocation.
        SELECT pgml.embed(
            'sentence-transformers/all-MiniLM-L6-v2',
            query_text
        )::vector AS query_embedding
    )
    SELECT
        d.id,
        d.title,
        LEFT(d.content, 200) AS content_preview,
        1 - (d.embedding <=> q.query_embedding) AS similarity
    FROM documents d
    CROSS JOIN query q
    WHERE d.embedding IS NOT NULL
    ORDER BY d.embedding <=> q.query_embedding
    LIMIT limit_count;
$$ LANGUAGE sql STABLE;
-- Use the function
SELECT * FROM semantic_search('how to train neural networks');
Hybrid Search (Keyword + Semantic)
-- Combine full-text search with semantic search.
-- hybrid_search(query_text, limit_count): weighted blend of ts_rank keyword
-- score (0.3) and cosine similarity (0.7). The query embedding is hoisted
-- into a CTE so pgml.embed runs once, not once per document row.
CREATE OR REPLACE FUNCTION hybrid_search(
    query_text TEXT,
    limit_count INT DEFAULT 10
)
RETURNS TABLE (
    document_id INT,
    title TEXT,
    combined_score FLOAT
) AS $$
    WITH query AS (
        -- One embedding-model call per function invocation.
        SELECT pgml.embed(
            'sentence-transformers/all-MiniLM-L6-v2',
            query_text
        )::vector AS query_embedding
    ),
    keyword_scores AS (
        -- Documents matching the query as keywords, ranked by ts_rank.
        SELECT
            id,
            title,
            ts_rank(
                to_tsvector('english', content),
                plainto_tsquery('english', query_text)
            ) AS keyword_score
        FROM documents
        WHERE to_tsvector('english', content) @@ plainto_tsquery('english', query_text)
    ),
    semantic_scores AS (
        -- Cosine similarity of every embedded document to the query.
        SELECT
            d.id,
            1 - (d.embedding <=> q.query_embedding) AS semantic_score
        FROM documents d
        CROSS JOIN query q
        WHERE d.embedding IS NOT NULL
    )
    SELECT
        COALESCE(k.id, s.id) AS document_id,
        COALESCE(k.title, d.title) AS title,
        -- NOTE(review): ts_rank is unbounded while cosine similarity is <= 1;
        -- consider normalizing keyword_score before weighting in production.
        (COALESCE(k.keyword_score, 0) * 0.3 +
         COALESCE(s.semantic_score, 0) * 0.7) AS combined_score
    FROM keyword_scores k
    FULL OUTER JOIN semantic_scores s ON k.id = s.id
    JOIN documents d ON COALESCE(k.id, s.id) = d.id
    ORDER BY combined_score DESC
    LIMIT limit_count;
$$ LANGUAGE sql STABLE;
Retrieval Augmented Generation (RAG)
Basic RAG System
-- Create knowledge base
-- (the 384-dim embedding column matches all-MiniLM-L6-v2 output)
CREATE TABLE knowledge_base (
id SERIAL PRIMARY KEY,
title TEXT,
content TEXT,
metadata JSONB, -- free-form attributes (e.g. category) used by filtered RAG below
embedding vector(384),
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Populate with embeddings (one embedding-model call per source row)
INSERT INTO knowledge_base (title, content, embedding)
SELECT
title,
content,
pgml.embed('sentence-transformers/all-MiniLM-L6-v2', content)::vector
FROM source_documents;
-- RAG query function.
-- rag_query(user_question, context_limit): retrieves the context_limit most
-- relevant knowledge_base passages, builds a prompt, and returns the
-- generated answer (NULL when the knowledge base yields no context).
CREATE OR REPLACE FUNCTION rag_query(
    user_question TEXT,
    context_limit INT DEFAULT 3
)
RETURNS TEXT AS $$
DECLARE
    context TEXT;
    prompt TEXT;
BEGIN
    -- Retrieve relevant context, nearest passages first.
    SELECT STRING_AGG(content, E'\n\n') INTO context
    FROM (
        SELECT content
        FROM knowledge_base
        ORDER BY embedding <=> pgml.embed(
            'sentence-transformers/all-MiniLM-L6-v2',
            user_question
        )::vector
        LIMIT context_limit
    ) relevant_docs;

    -- No context (empty knowledge base): format() would propagate NULL,
    -- producing a NULL prompt — return early instead.
    IF context IS NULL THEN
        RETURN NULL;
    END IF;

    -- Build prompt with context. The E'' literal is required: in a plain
    -- '...' string \n is two literal characters, not a newline.
    prompt := format(
        E'Context: %s\n\nQuestion: %s\n\nAnswer based on the context:',
        context,
        user_question
    );

    -- Generate answer (if you have a generation model)
    RETURN pgml.transform(
        task => 'text-generation',
        inputs => ARRAY[prompt],
        args => '{"max_length": 200}'::JSONB
    )->0->>'generated_text';
END;
$$ LANGUAGE plpgsql;
-- Use RAG
SELECT rag_query('What is PostgresML?');
Advanced RAG with Filtering
-- RAG with metadata filtering
-- filtered_rag_query(user_question, filter_category, filter_date, context_limit):
-- returns the concatenated context text plus a JSON audit trail of the
-- source documents (id, title, similarity) that produced it.
CREATE OR REPLACE FUNCTION filtered_rag_query(
user_question TEXT,
filter_category TEXT DEFAULT NULL, -- NULL = no category filter
filter_date TIMESTAMPTZ DEFAULT NULL, -- NULL = no recency filter
context_limit INT DEFAULT 5
)
RETURNS TABLE (
answer TEXT,
sources JSONB
) AS $$
-- Top-N most similar docs that pass the metadata/date filters.
WITH relevant_docs AS (
SELECT
id,
title,
content,
metadata,
1 - (embedding <=> pgml.embed(
'sentence-transformers/all-MiniLM-L6-v2',
user_question
)::vector) AS similarity
FROM knowledge_base
WHERE (filter_category IS NULL OR metadata->>'category' = filter_category)
AND (filter_date IS NULL OR created_at >= filter_date)
-- Ordering by the output alias is valid in PostgreSQL, but DESC on a
-- computed similarity generally cannot use the vector index.
-- NOTE(review): `ORDER BY embedding <=> query ASC` would be index-friendly.
ORDER BY similarity DESC
LIMIT context_limit
),
-- Concatenated context plus provenance for the caller.
context AS (
SELECT
STRING_AGG(content, E'\n\n') AS context_text,
jsonb_agg(jsonb_build_object(
'id', id,
'title', title,
'similarity', similarity
)) AS sources
FROM relevant_docs
)
SELECT
-- Your LLM call here
context_text AS answer,
sources
FROM context;
-- NOTE(review): consider declaring this function STABLE.
$$ LANGUAGE sql;
Clustering with Embeddings
K-Means on Embeddings
-- Cluster documents by semantic similarity.
-- Train the clustering model FIRST: the original ran pgml.predict before
-- pgml.train had ever created the 'document_clusters' project, which fails.
SELECT pgml.train(
    'document_clusters',
    'clustering',
    'SELECT embedding::FLOAT[] AS features FROM documents',
    algorithm => 'kmeans',
    hyperparams => '{"n_clusters": 10}'
);
-- Assign each document to a cluster, reusing the stored embedding column
-- instead of re-running the (expensive) embedding model per row.
CREATE TABLE document_clusters AS
SELECT
    id,
    embedding,
    pgml.predict('document_clusters', embedding::FLOAT[]) AS cluster
FROM documents
WHERE embedding IS NOT NULL;  -- rows without embeddings cannot be clustered
Pairwise Distance Analysis
-- Find closely-related document pairs by embedding distance.
-- NOTE: CROSS JOIN produces O(n^2) pairs — fine for small corpora,
-- prohibitive for large ones (consider sampling or an ANN index instead).
WITH pairwise_distances AS (
SELECT
d1.id AS doc1_id,
d2.id AS doc2_id,
d1.embedding <=> d2.embedding AS distance
FROM documents d1
CROSS JOIN documents d2
WHERE d1.id < d2.id -- each unordered pair once; no self-pairs
)
SELECT *
FROM pairwise_distances
WHERE distance < 0.3 -- Cosine DISTANCE threshold (smaller = more similar)
ORDER BY distance;
Recommendation Systems
Content-Based Recommendations
-- Content-based recommendations: rank every other embedded document by
-- cosine similarity to the embedding of the given item.
CREATE OR REPLACE FUNCTION recommend_similar(
    item_id INT,
    limit_count INT DEFAULT 5
)
RETURNS TABLE (
    recommended_id INT,
    title TEXT,
    similarity FLOAT
) AS $$
    SELECT
        doc.id,
        doc.title,
        1 - (doc.embedding <=> anchor.embedding) AS similarity
    FROM documents doc
    CROSS JOIN LATERAL (
        -- Embedding of the reference item the caller asked about.
        SELECT embedding FROM documents WHERE id = item_id
    ) anchor
    WHERE doc.id <> item_id        -- never recommend the item itself
      AND doc.embedding IS NOT NULL
    ORDER BY doc.embedding <=> anchor.embedding
    LIMIT limit_count;
$$ LANGUAGE sql STABLE;
-- Get recommendations
SELECT * FROM recommend_similar(42);
User Preference Recommendations
-- Recommend based on user history.
CREATE TABLE user_interactions (
    user_id INT,
    document_id INT,
    interaction_type TEXT, -- 'view', 'like', 'save'
    created_at TIMESTAMPTZ DEFAULT NOW()
);
-- personalized_recommendations(target_user_id, limit_count): ranks documents
-- by similarity to the centroid of the user's liked content, excluding
-- anything the user already interacted with.
CREATE OR REPLACE FUNCTION personalized_recommendations(
    target_user_id INT,
    limit_count INT DEFAULT 10
)
RETURNS TABLE (
    document_id INT,
    title TEXT,
    score FLOAT
) AS $$
    WITH user_profile AS (
        -- Centroid of the user's liked-content embeddings
        -- (pgvector provides AVG over vector columns).
        SELECT AVG(d.embedding) AS profile_embedding
        FROM user_interactions ui
        JOIN documents d ON ui.document_id = d.id
        WHERE ui.user_id = target_user_id
          AND ui.interaction_type = 'like'
    )
    SELECT
        d.id,
        d.title,
        1 - (d.embedding <=> up.profile_embedding) AS score
    FROM documents d
    CROSS JOIN user_profile up
    -- NOT EXISTS instead of NOT IN: NOT IN returns zero rows for everyone
    -- if the subquery ever yields a NULL document_id.
    WHERE NOT EXISTS (
        SELECT 1
        FROM user_interactions ui
        WHERE ui.user_id = target_user_id
          AND ui.document_id = d.id
    )
      AND d.embedding IS NOT NULL
    -- If the user has no 'like' rows, profile_embedding is NULL and the
    -- function returns no rows.
    ORDER BY d.embedding <=> up.profile_embedding
    LIMIT limit_count;
$$ LANGUAGE sql STABLE;
Duplicate Detection
Find Similar Documents
-- Detect near-duplicates.
-- The similarity expression is computed once in the CTE and filtered on the
-- alias outside it (the original evaluated the distance operator twice per
-- pair, once in SELECT and once in WHERE).
-- NOTE: self-join produces O(n^2) pairs; restrict or sample for large tables.
WITH pairwise_similarity AS (
    SELECT
        d1.id AS doc1_id,
        d1.title AS doc1_title,
        d2.id AS doc2_id,
        d2.title AS doc2_title,
        1 - (d1.embedding <=> d2.embedding) AS similarity
    FROM documents d1
    JOIN documents d2 ON d1.id < d2.id  -- each unordered pair once
)
SELECT *
FROM pairwise_similarity
WHERE similarity > 0.95 -- High similarity threshold
ORDER BY similarity DESC;
Multi-Modal Embeddings
Image + Text Search
-- Create table for multi-modal data
CREATE TABLE media_items (
    id SERIAL PRIMARY KEY,
    title TEXT,
    description TEXT,
    image_url TEXT,
    text_embedding vector(384),
    image_embedding vector(512) -- Different dimension for image model
);
-- Generate text embeddings
UPDATE media_items
SET text_embedding = pgml.embed(
    'sentence-transformers/all-MiniLM-L6-v2',
    description
)::vector;
-- Search across modalities.
-- The original referenced image_query_emb without ever defining it; both
-- query embeddings are now produced in the LATERAL subquery. The CLIP text
-- encoder (512-dim) shares an embedding space with CLIP image embeddings,
-- so its output can be compared against image_embedding.
-- NOTE(review): confirm image_embedding was populated with the same CLIP model.
SELECT
    m.id,
    m.title,
    LEAST(
        m.text_embedding <=> q.query_emb,
        m.image_embedding <=> q.image_query_emb
    ) AS best_match_distance -- LEAST ignores NULLs: rows missing one embedding rank by the other
FROM media_items m
CROSS JOIN LATERAL (
    SELECT
        pgml.embed('sentence-transformers/all-MiniLM-L6-v2', 'sunset beach')::vector AS query_emb,
        pgml.embed('sentence-transformers/clip-ViT-B-32', 'sunset beach')::vector AS image_query_emb
) q
ORDER BY best_match_distance
LIMIT 10;
Performance Optimization
Batch Embedding Generation
-- Generate embeddings in batches.
-- The original zipped unnest(ids) against unnest(pgml.embed(..., unnest(texts))):
-- set-returning functions cannot be nested in argument position in PostgreSQL,
-- and unnest() applied to a single embedding array would flatten it into
-- scalar floats. It also processed only one batch. This version updates
-- batch_size rows per iteration and loops until no document lacks an embedding.
CREATE OR REPLACE FUNCTION batch_generate_embeddings(batch_size INT DEFAULT 100)
RETURNS VOID AS $$
BEGIN
    LOOP
        WITH pending AS (
            -- Next slice of rows still missing an embedding.
            SELECT id
            FROM documents
            WHERE embedding IS NULL
            ORDER BY id
            LIMIT batch_size
        )
        UPDATE documents d
        SET embedding = pgml.embed(
            'sentence-transformers/all-MiniLM-L6-v2',
            d.content
        )::vector
        FROM pending p
        WHERE d.id = p.id;

        -- FOUND reflects the row count of the UPDATE just executed;
        -- zero rows means every document is embedded.
        EXIT WHEN NOT FOUND;
    END LOOP;
END;
$$ LANGUAGE plpgsql;
Index Tuning
-- Tune HNSW index parameters
-- (m = graph connectivity, ef_construction = build-time search width;
-- larger values mean better recall but slower builds and a bigger index)
CREATE INDEX ON documents
USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
-- Set search parameters (query-time recall/speed knob; session-scoped SET)
SET hnsw.ef_search = 40;
-- For IVFFlat, tune probe count (number of lists scanned per query; session-scoped)
SET ivfflat.probes = 10;
Best Practices
1. Choose Appropriate Models
-- Small, fast (good for most cases)
'sentence-transformers/all-MiniLM-L6-v2' -- 384 dims
-- Better quality, slightly slower
'sentence-transformers/all-mpnet-base-v2' -- 768 dims
-- Specialized for specific tasks
'sentence-transformers/multi-qa-MiniLM-L6-cos-v1' -- Q&A
2. Normalize Embeddings
-- Normalize for cosine similarity
-- (unit-length vectors make cosine-distance and inner-product rankings agree;
-- NOTE(review): confirm the chosen model/runtime honors the "normalize" kwarg)
SELECT pgml.embed(
'sentence-transformers/all-MiniLM-L6-v2',
text,
kwargs => '{"normalize": true}'::JSONB
)::vector;
3. Update Embeddings Incrementally
-- Trigger to auto-generate embeddings.
-- Skips the model call when content is NULL (nothing to embed) or, on
-- UPDATE, when the content did not actually change — `UPDATE ... OF content`
-- fires whenever the column appears in SET, even with an identical value.
CREATE OR REPLACE FUNCTION generate_embedding_on_insert()
RETURNS TRIGGER AS $$
BEGIN
    IF NEW.content IS NULL THEN
        -- Keep the embedding NULL rather than sending NULL text to the model.
        NEW.embedding := NULL;
    ELSIF TG_OP = 'INSERT' OR NEW.content IS DISTINCT FROM OLD.content THEN
        -- Only pay for the model call when the text is new or changed.
        NEW.embedding := pgml.embed(
            'sentence-transformers/all-MiniLM-L6-v2',
            NEW.content
        )::vector;
    END IF;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER embedding_trigger
BEFORE INSERT OR UPDATE OF content ON documents
FOR EACH ROW
EXECUTE FUNCTION generate_embedding_on_insert();
4. Monitor Index Performance
-- Check index usage.
-- pg_stat_user_indexes exposes the table as `relname` and the index as
-- `indexrelname`; the original selected nonexistent `tablename`/`indexname`
-- columns, which raises an error.
SELECT
    schemaname,
    relname AS tablename,
    indexrelname AS indexname,
    idx_scan,       -- number of index scans initiated
    idx_tup_read,   -- index entries returned by scans
    idx_tup_fetch   -- live table rows fetched via the index
FROM pg_stat_user_indexes
WHERE relname = 'documents';
Next Steps
Explore more advanced topics:
- Advanced Examples - Real-world RAG applications
- Transformers & NLP - Natural language processing