Skip to main content

Vector Operations & Embeddings

PostgresML provides powerful vector operations for semantic search, similarity matching, and Retrieval Augmented Generation (RAG) applications.

Overview

Vector embeddings represent text, images, or other data as high-dimensional vectors, enabling semantic similarity search and machine learning operations.

Creating Embeddings

Generate Text Embeddings

-- Generate embeddings for text
-- (named-argument call style; returns a FLOAT4[] of model outputs)
SELECT pgml.embed(
transformer => 'sentence-transformers/all-MiniLM-L6-v2',
text => 'PostgreSQL is a powerful database'
) AS embedding;

-- Returns: [0.123, -0.456, 0.789, ...] (384-dimensional vector)

-- Create embeddings for multiple texts (one embed call per row).
-- kwargs is forwarded to the model; "normalize" requests unit-length
-- vectors — NOTE(review): confirm the kwarg is honored by your pgml version.
SELECT pgml.embed(
transformer => 'sentence-transformers/all-MiniLM-L6-v2',
text => text,
kwargs => '{"normalize": true}'::JSONB
) AS embedding
FROM documents;
-- Model identifier strings to pass as `transformer` (reference list,
-- not runnable SQL). Dimension must match your vector(N) column.
-- Sentence Transformers (recommended)
'sentence-transformers/all-MiniLM-L6-v2' -- 384 dims, fast, good quality
'sentence-transformers/all-mpnet-base-v2' -- 768 dims, high quality
'sentence-transformers/multi-qa-MiniLM-L6-cos-v1' -- Optimized for Q&A

-- OpenAI-compatible
'intfloat/e5-small-v2' -- 384 dims
'intfloat/e5-base-v2' -- 768 dims
'intfloat/e5-large-v2' -- 1024 dims

-- Multilingual
'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

Storing Embeddings

Create Table with Vectors

-- Install the pgvector extension (provides the vector type,
-- distance operators, and index access methods)
CREATE EXTENSION IF NOT EXISTS vector;

-- Create a table with a vector column.
-- The declared dimension must match the embedding model's output:
-- all-MiniLM-L6-v2 produces 384-dimensional vectors.
CREATE TABLE documents (
    id SERIAL PRIMARY KEY,
    title TEXT,
    content TEXT,
    embedding vector(384)
);

-- Backfill embeddings for existing documents.
-- Skip NULL content: there is nothing to embed, and passing NULL
-- to pgml.embed() would be pointless at best.
UPDATE documents
SET embedding = pgml.embed(
    'sentence-transformers/all-MiniLM-L6-v2',
    content
)::vector
WHERE content IS NOT NULL;

Indexing for Performance

-- Create HNSW index for fast similarity search
-- (HNSW: better recall/latency trade-off; slower to build, more memory)
CREATE INDEX ON documents
USING hnsw (embedding vector_cosine_ops);

-- Or IVFFlat index
-- (faster to build; `lists` is the number of clusters — query-time
--  recall is then tuned with the ivfflat.probes setting)
CREATE INDEX ON documents
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);

-- Refresh planner statistics so the new index is used effectively
ANALYZE documents;
-- Find similar documents.
-- The query embedding is computed ONCE in a LATERAL subquery and reused,
-- instead of calling pgml.embed() in both SELECT and ORDER BY.
SELECT
    d.id,
    d.title,
    1 - (d.embedding <=> q.query_emb) AS similarity
FROM documents d
CROSS JOIN LATERAL (
    SELECT pgml.embed(
        'sentence-transformers/all-MiniLM-L6-v2',
        'machine learning in databases'
    )::vector AS query_emb
) q
ORDER BY d.embedding <=> q.query_emb
LIMIT 10;

Distance Operators

PostgresML (through the pgvector extension) supports multiple distance metrics:

-- Cosine distance (most common for text)
embedding <=> query_embedding

-- L2 distance (Euclidean)
embedding <-> query_embedding

-- Inner product
-- NOTE: pgvector's <#> returns the NEGATIVE inner product,
-- so ascending ORDER BY still puts the best matches first.
embedding <#> query_embedding

-- Example usage: the query embedding is computed once in a LATERAL
-- subquery and reused across all three metrics
SELECT
title,
embedding <=> query_emb AS cosine_distance,
embedding <-> query_emb AS l2_distance,
embedding <#> query_emb AS inner_product
FROM documents,
LATERAL (
SELECT pgml.embed('sentence-transformers/all-MiniLM-L6-v2', 'search query')::vector
) AS q(query_emb)
ORDER BY cosine_distance
LIMIT 5;

Semantic Search Applications

Document Search System

-- Create search function.
-- Embeds the query text once (CROSS JOIN LATERAL) rather than calling
-- pgml.embed() separately in SELECT and ORDER BY as before.
CREATE OR REPLACE FUNCTION semantic_search(
    query_text TEXT,
    limit_count INT DEFAULT 10
)
RETURNS TABLE (
    document_id INT,
    title TEXT,
    content_preview TEXT,
    similarity FLOAT
) AS $$
    SELECT
        d.id,
        d.title,
        LEFT(d.content, 200) AS content_preview,
        1 - (d.embedding <=> q.query_emb) AS similarity
    FROM documents d
    CROSS JOIN LATERAL (
        -- Single embedding computation for the whole query
        SELECT pgml.embed(
            'sentence-transformers/all-MiniLM-L6-v2',
            query_text
        )::vector AS query_emb
    ) q
    WHERE d.embedding IS NOT NULL
    ORDER BY d.embedding <=> q.query_emb
    LIMIT limit_count;
$$ LANGUAGE sql STABLE;

-- Use the function
SELECT * FROM semantic_search('how to train neural networks');

Hybrid Search (Keyword + Semantic)

-- Combine full-text search with semantic search.
-- The query embedding is computed once in its own CTE; keyword and
-- semantic scores are blended with a 30/70 weighting.
CREATE OR REPLACE FUNCTION hybrid_search(
    query_text TEXT,
    limit_count INT DEFAULT 10
)
RETURNS TABLE (
    document_id INT,
    title TEXT,
    combined_score FLOAT
) AS $$
    WITH query_embedding AS (
        -- Embed the search text a single time for the whole statement
        SELECT pgml.embed(
            'sentence-transformers/all-MiniLM-L6-v2',
            query_text
        )::vector AS query_emb
    ),
    keyword_scores AS (
        -- Classic full-text relevance; only rows that actually match
        SELECT
            id,
            ts_rank(
                to_tsvector('english', content),
                plainto_tsquery('english', query_text)
            ) AS keyword_score
        FROM documents
        WHERE to_tsvector('english', content) @@ plainto_tsquery('english', query_text)
    ),
    semantic_scores AS (
        -- Cosine similarity against every document
        SELECT
            d.id,
            1 - (d.embedding <=> q.query_emb) AS semantic_score
        FROM documents d
        CROSS JOIN query_embedding q
    )
    SELECT
        COALESCE(k.id, s.id) AS document_id,
        d.title, -- title always comes from documents; no COALESCE needed
        (COALESCE(k.keyword_score, 0) * 0.3 +
         COALESCE(s.semantic_score, 0) * 0.7) AS combined_score
    FROM keyword_scores k
    FULL OUTER JOIN semantic_scores s ON k.id = s.id
    JOIN documents d ON COALESCE(k.id, s.id) = d.id
    ORDER BY combined_score DESC
    LIMIT limit_count;
$$ LANGUAGE sql STABLE;

Retrieval Augmented Generation (RAG)

Basic RAG System

-- Create knowledge base
CREATE TABLE knowledge_base (
id SERIAL PRIMARY KEY,
title TEXT,
content TEXT,
metadata JSONB, -- free-form attributes (e.g. category) used for filtering
embedding vector(384), -- must match the embedding model's output size
created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Populate with embeddings (one pgml.embed call per source row;
-- assumes source_documents exists elsewhere in the database)
INSERT INTO knowledge_base (title, content, embedding)
SELECT
title,
content,
pgml.embed('sentence-transformers/all-MiniLM-L6-v2', content)::vector
FROM source_documents;

-- RAG query function: retrieve the most relevant context,
-- build a prompt, and generate an answer with a text-generation model.
CREATE OR REPLACE FUNCTION rag_query(
    user_question TEXT,
    context_limit INT DEFAULT 3
)
RETURNS TEXT AS $$
DECLARE
    context TEXT;
    prompt TEXT;
BEGIN
    -- Retrieve the context_limit most similar documents
    SELECT STRING_AGG(content, E'\n\n') INTO context
    FROM (
        SELECT content
        FROM knowledge_base
        ORDER BY embedding <=> pgml.embed(
            'sentence-transformers/all-MiniLM-L6-v2',
            user_question
        )::vector
        LIMIT context_limit
    ) relevant_docs;

    -- Build the prompt with the retrieved context.
    -- BUG FIX: the template must be an E'' escape string — in a plain ''
    -- literal, \n is a literal backslash-n, not a newline.
    prompt := format(
        E'Context: %s\n\nQuestion: %s\n\nAnswer based on the context:',
        context,
        user_question
    );

    -- Generate answer (requires a text-generation model to be available)
    RETURN pgml.transform(
        task => 'text-generation',
        inputs => ARRAY[prompt],
        args => '{"max_length": 200}'::JSONB
    )->0->>'generated_text';
END;
$$ LANGUAGE plpgsql;

-- Use RAG
SELECT rag_query('What is PostgresML?');

Advanced RAG with Filtering

-- RAG with metadata filtering.
-- Marked STABLE for consistency with the file's other read-only
-- search functions (the original omitted it).
CREATE OR REPLACE FUNCTION filtered_rag_query(
    user_question TEXT,
    filter_category TEXT DEFAULT NULL,
    filter_date TIMESTAMPTZ DEFAULT NULL,
    context_limit INT DEFAULT 5
)
RETURNS TABLE (
    answer TEXT,
    sources JSONB
) AS $$
    WITH relevant_docs AS (
        -- Top-N most similar documents that pass the optional filters
        SELECT
            id,
            title,
            content,
            metadata,
            1 - (embedding <=> pgml.embed(
                'sentence-transformers/all-MiniLM-L6-v2',
                user_question
            )::vector) AS similarity
        FROM knowledge_base
        WHERE (filter_category IS NULL OR metadata->>'category' = filter_category)
          AND (filter_date IS NULL OR created_at >= filter_date)
        ORDER BY similarity DESC
        LIMIT context_limit
    ),
    context AS (
        -- Concatenated context text plus a JSON list of source documents
        SELECT
            STRING_AGG(content, E'\n\n') AS context_text,
            jsonb_agg(jsonb_build_object(
                'id', id,
                'title', title,
                'similarity', similarity
            )) AS sources
        FROM relevant_docs
    )
    SELECT
        -- Your LLM call here (context_text is returned as a placeholder)
        context_text AS answer,
        sources
    FROM context;
$$ LANGUAGE sql STABLE;

Clustering with Embeddings

K-Means on Embeddings

-- Cluster documents by semantic similarity.
-- BUG FIX: pgml.predict() requires an already-trained project, so the
-- pgml.train() call must run BEFORE the CREATE TABLE ... AS that uses it
-- (the original had the two statements in the opposite order).
SELECT pgml.train(
    'document_clusters',
    'clustering',
    'SELECT embedding::FLOAT[] AS features FROM documents',
    algorithm => 'kmeans',
    hyperparams => '{"n_clusters": 10}'
);

-- Assign each document to a cluster using the stored embeddings —
-- the same column the model was trained on (the original re-embedded
-- content from scratch, which is slower and can drift from training data).
CREATE TABLE document_clusters AS
SELECT
    id,
    embedding,
    pgml.predict('document_clusters', embedding::FLOAT[]) AS cluster
FROM documents;

Hierarchical Clustering

-- Find document clusters at different granularities
WITH pairwise_distances AS (
    -- Every unordered pair of distinct documents with its cosine distance;
    -- the a.id < b.id join condition keeps each pair exactly once
    SELECT
        a.id AS doc1_id,
        b.id AS doc2_id,
        a.embedding <=> b.embedding AS distance
    FROM documents a
    INNER JOIN documents b
        ON a.id < b.id
)
SELECT *
FROM pairwise_distances
WHERE distance < 0.3 -- Similarity threshold
ORDER BY distance;

Recommendation Systems

Content-Based Recommendations

-- Recommend items similar to a reference item, ranked by cosine similarity.
CREATE OR REPLACE FUNCTION recommend_similar(
    item_id INT,
    limit_count INT DEFAULT 5
)
RETURNS TABLE (
    recommended_id INT,
    title TEXT,
    similarity FLOAT
) AS $$
    SELECT
        doc.id,
        doc.title,
        1 - (doc.embedding <=> anchor.embedding) AS similarity
    FROM documents doc
    CROSS JOIN LATERAL (
        -- Embedding of the reference item
        SELECT embedding FROM documents WHERE id = item_id
    ) anchor
    WHERE doc.id != item_id
      AND doc.embedding IS NOT NULL
    ORDER BY doc.embedding <=> anchor.embedding
    LIMIT limit_count;
$$ LANGUAGE sql STABLE;

-- Get recommendations
SELECT * FROM recommend_similar(42);

User Preference Recommendations

-- Recommend based on user history.
-- Every column is required for scoring, so declare them NOT NULL —
-- this also protects membership subqueries against the NULL traps
-- that nullable key columns invite.
CREATE TABLE user_interactions (
    user_id INT NOT NULL,
    document_id INT NOT NULL,
    interaction_type TEXT NOT NULL, -- 'view', 'like', 'save'
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Score every unseen document against the centroid of the user's likes.
CREATE OR REPLACE FUNCTION personalized_recommendations(
    target_user_id INT,
    limit_count INT DEFAULT 10
)
RETURNS TABLE (
    document_id INT,
    title TEXT,
    score FLOAT
) AS $$
    WITH user_profile AS (
        -- Average embedding of the user's liked content
        SELECT AVG(d.embedding) AS profile_embedding
        FROM user_interactions ui
        JOIN documents d ON ui.document_id = d.id
        WHERE ui.user_id = target_user_id
          AND ui.interaction_type = 'like'
    )
    SELECT
        d.id,
        d.title,
        1 - (d.embedding <=> up.profile_embedding) AS score
    FROM documents d
    CROSS JOIN user_profile up
    WHERE d.embedding IS NOT NULL
      -- BUG FIX: NOT EXISTS instead of NOT IN — NOT IN returns zero rows
      -- for everyone if the subquery ever yields a NULL document_id
      AND NOT EXISTS (
          SELECT 1
          FROM user_interactions ui
          WHERE ui.user_id = target_user_id
            AND ui.document_id = d.id
      )
    ORDER BY d.embedding <=> up.profile_embedding
    LIMIT limit_count;
$$ LANGUAGE sql STABLE;

Duplicate Detection

Find Similar Documents

-- Detect near-duplicates.
-- The similarity expression is computed once in the CTE and filtered on
-- its alias in the outer query (the original repeated the <=> expression
-- in both SELECT and WHERE).
WITH pairwise_similarity AS (
    SELECT
        d1.id AS doc1_id,
        d1.title AS doc1_title,
        d2.id AS doc2_id,
        d2.title AS doc2_title,
        1 - (d1.embedding <=> d2.embedding) AS similarity
    FROM documents d1
    JOIN documents d2 ON d1.id < d2.id -- each unordered pair exactly once
)
SELECT *
FROM pairwise_similarity
WHERE similarity > 0.95 -- High similarity threshold
ORDER BY similarity DESC;

Multi-Modal Embeddings

-- Create table for multi-modal data
CREATE TABLE media_items (
    id SERIAL PRIMARY KEY,
    title TEXT,
    description TEXT,
    image_url TEXT,
    text_embedding vector(384),
    image_embedding vector(512) -- CLIP-style image models emit 512 dims
);

-- Generate text embeddings
UPDATE media_items
SET text_embedding = pgml.embed(
    'sentence-transformers/all-MiniLM-L6-v2',
    description
)::vector;

-- Search across modalities.
-- BUG FIX: the original referenced image_query_emb without ever defining
-- it. Embed the query text into BOTH spaces: the sentence-transformer for
-- the text column, and CLIP's text encoder (512 dims, matching
-- image_embedding) for the image column.
SELECT
    m.id,
    m.title,
    LEAST(
        m.text_embedding <=> q.query_emb,
        m.image_embedding <=> q.image_query_emb
    ) AS best_match_distance
FROM media_items m
CROSS JOIN LATERAL (
    SELECT
        pgml.embed('sentence-transformers/all-MiniLM-L6-v2', 'sunset beach')::vector AS query_emb,
        pgml.embed('sentence-transformers/clip-ViT-B-32', 'sunset beach')::vector AS image_query_emb
) q
ORDER BY best_match_distance
LIMIT 10;

Performance Optimization

Batch Embedding Generation

-- Generate embeddings in batches until every document is embedded.
-- BUG FIX: the original paired two independent unnest() calls inside a
-- sub-SELECT (one of them nested inside the pgml.embed() argument list),
-- which does not reliably keep ids and texts aligned. Updating by id is
-- simpler and correct.
CREATE OR REPLACE FUNCTION batch_generate_embeddings(batch_size INT DEFAULT 100)
RETURNS VOID AS $$
BEGIN
    LOOP
        -- Embed one batch of not-yet-embedded documents
        UPDATE documents d
        SET embedding = pgml.embed(
            'sentence-transformers/all-MiniLM-L6-v2',
            d.content
        )::vector
        WHERE d.id IN (
            SELECT id
            FROM documents
            WHERE embedding IS NULL
            LIMIT batch_size
        );
        -- FOUND is set by the UPDATE; stop once no rows were left to embed
        EXIT WHEN NOT FOUND;
    END LOOP;
END;
$$ LANGUAGE plpgsql;

Index Tuning

-- Tune HNSW index parameters.
-- m: max connections per graph node; ef_construction: candidate-list
-- size while building (higher = better recall, slower build).
CREATE INDEX ON documents
USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);

-- Set search parameters (candidate-list size at query time;
-- higher = better recall, slower queries)
SET hnsw.ef_search = 40;

-- For IVFFlat, tune probe count (number of clusters scanned per query)
SET ivfflat.probes = 10;

Best Practices

1. Choose Appropriate Models

-- Model identifier strings (reference list, not runnable SQL).
-- Small, fast (good for most cases)
'sentence-transformers/all-MiniLM-L6-v2' -- 384 dims

-- Better quality, slightly slower
'sentence-transformers/all-mpnet-base-v2' -- 768 dims

-- Specialized for specific tasks
'sentence-transformers/multi-qa-MiniLM-L6-cos-v1' -- Q&A

2. Normalize Embeddings

-- Normalize for cosine similarity.
-- NOTE(review): assumes the "normalize" kwarg is forwarded to the model
-- and produces unit-length vectors — confirm against your pgml version.
SELECT pgml.embed(
'sentence-transformers/all-MiniLM-L6-v2',
text,
kwargs => '{"normalize": true}'::JSONB
)::vector;

3. Update Embeddings Incrementally

-- Trigger function to auto-generate embeddings whenever content
-- is inserted or changed.
CREATE OR REPLACE FUNCTION generate_embedding_on_insert()
RETURNS TRIGGER AS $$
BEGIN
    -- Robustness: guard against NULL content so the trigger can never
    -- break an INSERT/UPDATE; a row without content gets no embedding.
    IF NEW.content IS NULL THEN
        NEW.embedding := NULL;
    ELSE
        NEW.embedding := pgml.embed(
            'sentence-transformers/all-MiniLM-L6-v2',
            NEW.content
        )::vector;
    END IF;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Fire only when content is inserted or actually updated
CREATE TRIGGER embedding_trigger
BEFORE INSERT OR UPDATE OF content ON documents
FOR EACH ROW
EXECUTE FUNCTION generate_embedding_on_insert();

4. Monitor Index Performance

-- Check index usage.
-- BUG FIX: pg_stat_user_indexes exposes relname / indexrelname,
-- not tablename / indexname (those columns belong to pg_indexes).
SELECT
    schemaname,
    relname AS tablename,
    indexrelname AS indexname,
    idx_scan,       -- number of index scans initiated
    idx_tup_read,   -- index entries returned by scans
    idx_tup_fetch   -- live table rows fetched by scans
FROM pg_stat_user_indexes
WHERE relname = 'documents';

Next Steps

Explore more advanced topics: