Transformers & NLP
PostgresML provides seamless access to state-of-the-art transformer models from Hugging Face for natural language processing tasks.
Overview
PostgresML integrates with the Hugging Face model hub, letting you run pre-trained transformers directly from SQL. Models are downloaded automatically on first use and cached (see Model Caching below), so there are no model files or separate inference services to manage.
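Every task on this page goes through the same entry point, pgml.transform(). As a quick orientation, this is the general call shape used throughout (the values here are illustrative):

-- General call shape (values are illustrative)
SELECT pgml.transform(
    task   => 'text-classification',            -- pipeline to run
    inputs => ARRAY['text to process'],         -- one or more input strings
    args   => '{"model": "model-name"}'::JSONB  -- optional model and pipeline options
);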
Text Classification
Sentiment Analysis
-- Analyze sentiment of text
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY['I love this product!', 'This is terrible.']
) AS sentiment;
-- Returns: [{"label": "POSITIVE", "score": 0.99}, {"label": "NEGATIVE", "score": 0.98}]
-- Use in queries
SELECT
review_id,
review_text,
pgml.transform(
task => 'text-classification',
inputs => ARRAY[review_text]
) AS sentiment
FROM product_reviews
LIMIT 10;
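Because the result is JSON, the predicted label can be extracted inline for analytics. A small sketch, assuming the [{"label": ..., "score": ...}] output shape shown above:

-- Tally reviews by predicted sentiment
SELECT
    pgml.transform(
        task => 'text-classification',
        inputs => ARRAY[review_text]
    )->0->>'label' AS sentiment_label,
    COUNT(*) AS review_count
FROM product_reviews
GROUP BY 1;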
Custom Classification Models
-- Use specific model
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY['PostgreSQL is awesome!'],
args => '{
"model": "distilbert-base-uncased-finetuned-sst-2-english"
}'::JSONB
) AS result;
-- Multi-label classification (zero-shot, so candidate labels are required)
SELECT pgml.transform(
    task => 'zero-shot-classification',
    inputs => ARRAY['This movie has great action and romance'],
    args => '{
        "model": "facebook/bart-large-mnli",
        "candidate_labels": ["action", "romance", "comedy", "drama"],
        "multi_label": true
    }'::JSONB
);
Topic Classification
-- Classify text into topics
CREATE TABLE articles (
id SERIAL PRIMARY KEY,
title TEXT,
content TEXT,
published_at TIMESTAMPTZ
);
-- Add topic classification
SELECT
id,
title,
pgml.transform(
task => 'zero-shot-classification',
inputs => ARRAY[content],
args => '{
"candidate_labels": ["technology", "sports", "politics", "entertainment"]
}'::JSONB
) AS topics
FROM articles;
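Zero-shot output ranks every candidate label by score, but often only the best label is needed per row. A sketch, assuming the usual Hugging Face shape of parallel labels and scores arrays sorted by score:

-- Keep only the highest-scoring topic per article
-- (assumes per-input output like {"labels": [...], "scores": [...]}, sorted by score)
SELECT
    id,
    title,
    pgml.transform(
        task => 'zero-shot-classification',
        inputs => ARRAY[content],
        args => '{"candidate_labels": ["technology", "sports", "politics", "entertainment"]}'::JSONB
    )->0->'labels'->>0 AS top_topic
FROM articles;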
Text Generation
Content Generation
-- Generate text completions
SELECT pgml.transform(
task => 'text-generation',
inputs => ARRAY['Once upon a time'],
args => '{
"model": "gpt2",
"max_length": 100,
"num_return_sequences": 1
}'::JSONB
) AS generated_text;
Conversational AI
-- Create chatbot responses
SELECT pgml.transform(
task => 'conversational',
inputs => ARRAY['Hello, how are you?'],
args => '{
"model": "microsoft/DialoGPT-medium"
}'::JSONB
) AS response;
Question Answering
Extractive QA
-- Answer questions from context
SELECT pgml.transform(
task => 'question-answering',
inputs => ARRAY[
'{
"question": "What is PostgresML?",
"context": "PostgresML is a machine learning extension for PostgreSQL that allows you to train models and make predictions using SQL."
}'
]
) AS answer;
-- Use with documents
SELECT
doc_id,
pgml.transform(
task => 'question-answering',
inputs => ARRAY[
json_build_object(
'question', 'What is the main topic?',
'context', document_text
)::TEXT
]
) AS answer
FROM documents
WHERE doc_type = 'manual';
Knowledge Base Search
-- Build a QA system
CREATE TABLE knowledge_base (
id SERIAL PRIMARY KEY,
title TEXT,
content TEXT,
category TEXT
);
-- Query the knowledge base
CREATE OR REPLACE FUNCTION ask_question(question_text TEXT)
RETURNS TABLE(article_id INT, title TEXT, answer JSONB) AS $$
    SELECT id, title, answer
    FROM (
        SELECT
            id,
            title,
            pgml.transform(
                task => 'question-answering',
                inputs => ARRAY[
                    json_build_object(
                        'question', question_text,
                        'context', content
                    )::TEXT
                ]
            ) AS answer
        FROM knowledge_base
    ) scored
    -- Output-column aliases can't appear inside ORDER BY expressions,
    -- so the ranking happens in an outer query
    ORDER BY (answer->0->>'score')::FLOAT DESC
    LIMIT 5;
$$ LANGUAGE sql;
-- Use it
SELECT * FROM ask_question('How do I train a model?');
Named Entity Recognition
Extract Entities
-- Identify entities in text
SELECT pgml.transform(
task => 'token-classification',
inputs => ARRAY['Apple Inc. was founded by Steve Jobs in California.'],
args => '{
"model": "dslim/bert-base-NER"
}'::JSONB
) AS entities;
-- Process documents
SELECT
id,
title,
pgml.transform(
task => 'token-classification',
inputs => ARRAY[content]
) AS extracted_entities
FROM documents;
Entity Extraction Pipeline
-- Create entity extraction table
CREATE TABLE entity_extractions (
id SERIAL PRIMARY KEY,
document_id INT,
entity_text TEXT,
entity_type TEXT,
confidence FLOAT,
extracted_at TIMESTAMPTZ DEFAULT NOW()
);
-- Extract and store entities
INSERT INTO entity_extractions (document_id, entity_text, entity_type, confidence)
SELECT
d.id,
entity->>'word' AS entity_text,
entity->>'entity' AS entity_type,
(entity->>'score')::FLOAT AS confidence
FROM documents d,
LATERAL (
SELECT jsonb_array_elements(
pgml.transform(
task => 'token-classification',
inputs => ARRAY[d.content]
)->0
) AS entity
) entities
WHERE (entity->>'score')::FLOAT > 0.9;
Text Summarization
Summarize Documents
-- Generate summaries
SELECT pgml.transform(
task => 'summarization',
inputs => ARRAY['Long article text here...'],
args => '{
"model": "facebook/bart-large-cnn",
"max_length": 130,
"min_length": 30
}'::JSONB
) AS summary;
-- Bulk summarization
CREATE TABLE article_summaries AS
SELECT
id,
title,
pgml.transform(
task => 'summarization',
inputs => ARRAY[content],
args => '{"max_length": 100}'::JSONB
)->0->>'summary_text' AS summary
FROM articles
WHERE LENGTH(content) > 500;
Translation
Translate Text
-- Translate between languages
SELECT pgml.transform(
task => 'translation',
inputs => ARRAY['Hello, how are you?'],
args => '{
"model": "Helsinki-NLP/opus-mt-en-es"
}'::JSONB
) AS spanish_translation;
-- Multi-language support
CREATE TABLE translations (
id SERIAL PRIMARY KEY,
original_text TEXT,
source_lang TEXT,
target_lang TEXT,
translated_text TEXT
);
-- Translate content
INSERT INTO translations (original_text, source_lang, target_lang, translated_text)
SELECT
content,
'en',
'fr',
pgml.transform(
task => 'translation',
inputs => ARRAY[content],
args => '{"model": "Helsinki-NLP/opus-mt-en-fr"}'::JSONB
)->0->>'translation_text'
FROM articles
WHERE language = 'en';
Fill Mask
Complete Text with Masked Tokens
-- Fill in masked words
SELECT pgml.transform(
task => 'fill-mask',
inputs => ARRAY['PostgreSQL is a [MASK] database system.'],
args => '{
"model": "bert-base-uncased"
}'::JSONB
) AS predictions;
-- Returns top predictions for [MASK]
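To use just the best completion, index into the predictions. A sketch, assuming the standard fill-mask output fields (token_str, score) with one prediction list per input; the exact nesting can vary by version:

-- Extract only the top-scoring token for the mask
SELECT pgml.transform(
    task => 'fill-mask',
    inputs => ARRAY['PostgreSQL is a [MASK] database system.'],
    args => '{"model": "bert-base-uncased"}'::JSONB
)->0->0->>'token_str' AS best_token;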
Feature Extraction
Get Embeddings
-- Extract feature vectors
SELECT pgml.transform(
task => 'feature-extraction',
inputs => ARRAY['This is a sample text for embedding.'],
args => '{
"model": "sentence-transformers/all-MiniLM-L6-v2"
}'::JSONB
) AS embeddings;
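These vectors are typically persisted for similarity search. A minimal sketch using the pgvector extension (an assumption, not something PostgresML requires); it also assumes the model emits one pooled 384-dimension vector per input, while some feature-extraction pipelines return per-token vectors instead:

-- Assumes pgvector is installed and the model emits one 384-dim vector per input
CREATE EXTENSION IF NOT EXISTS vector;

CREATE TABLE text_embeddings (
    id SERIAL PRIMARY KEY,
    content TEXT,
    embedding VECTOR(384)
);

INSERT INTO text_embeddings (content, embedding)
SELECT
    'This is a sample text for embedding.',
    (pgml.transform(
        task => 'feature-extraction',
        inputs => ARRAY['This is a sample text for embedding.'],
        args => '{"model": "sentence-transformers/all-MiniLM-L6-v2"}'::JSONB
    )->0)::TEXT::VECTOR(384);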
Using Specific Models
Browse Available Models
PostgresML supports thousands of models from Hugging Face. Popular choices:
Classification:
- distilbert-base-uncased-finetuned-sst-2-english - Sentiment analysis
- facebook/bart-large-mnli - Zero-shot classification
Question Answering:
- deepset/roberta-base-squad2 - Extractive QA
- distilbert-base-cased-distilled-squad - Fast QA
Summarization:
- facebook/bart-large-cnn - News summarization
- google/pegasus-xsum - Extreme summarization
Translation:
- Helsinki-NLP/opus-mt-en-* - English to other languages
- Helsinki-NLP/opus-mt-*-en - Other languages to English
NER:
- dslim/bert-base-NER - General NER
- dbmdz/bert-large-cased-finetuned-conll03-english - CoNLL-2003 NER
Model Caching
PostgresML automatically caches downloaded models:
-- First call downloads the model
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY['test'],
args => '{"model": "distilbert-base-uncased"}'::JSONB
);
-- Subsequent calls use cached model (much faster)
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY['another test'],
args => '{"model": "distilbert-base-uncased"}'::JSONB
);
Advanced Patterns
Batch Processing
-- Process in batches for efficiency
CREATE OR REPLACE FUNCTION batch_classify(batch_size INT DEFAULT 100)
RETURNS VOID AS $$
DECLARE
batch_texts TEXT[];
BEGIN
FOR batch_texts IN
SELECT ARRAY_AGG(text)
FROM (
SELECT text
FROM unprocessed_texts
LIMIT batch_size
) b
LOOP
INSERT INTO classified_texts (text, classification)
SELECT
t.text,
c.classification
FROM UNNEST(batch_texts) t(text),
LATERAL (
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY[t.text]
) AS classification
) c;
END LOOP;
END;
$$ LANGUAGE plpgsql;
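One call then drains the queue in fixed-size chunks:

-- Classify all queued rows, 100 at a time
SELECT batch_classify(100);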
Pipeline Combinations
-- Combine multiple NLP tasks
WITH sentiment AS (
SELECT
id,
text,
pgml.transform(
task => 'text-classification',
inputs => ARRAY[text]
)->0->>'label' AS sentiment_label
FROM reviews
),
entities AS (
SELECT
id,
pgml.transform(
task => 'token-classification',
inputs => ARRAY[text]
) AS extracted_entities
FROM reviews
)
SELECT
s.id,
s.text,
s.sentiment_label,
e.extracted_entities
FROM sentiment s
JOIN entities e ON s.id = e.id;
Multilingual Processing
-- Detect and process multiple languages
CREATE OR REPLACE FUNCTION process_multilingual_text(input_text TEXT)
RETURNS JSONB AS $$
WITH detection AS (
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY[input_text],
args => '{"model": "papluca/xlm-roberta-base-language-detection"}'::JSONB
) AS lang_result
),
translation AS (
SELECT CASE
WHEN detection.lang_result->0->>'label' != 'en' THEN
pgml.transform(
task => 'translation',
inputs => ARRAY[input_text],
args => jsonb_build_object(
    'model',
    'Helsinki-NLP/opus-mt-' || (detection.lang_result->0->>'label') || '-en'
)
)->0->>'translation_text'
ELSE input_text
END AS english_text
FROM detection
),
sentiment AS (
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY[english_text]
) AS sentiment_result
FROM translation
)
SELECT jsonb_build_object(  -- jsonb_build_object so the result matches RETURNS JSONB
    'original_language', detection.lang_result->0->>'label',
    'english_text', translation.english_text,
    'sentiment', sentiment.sentiment_result->0
)
FROM detection, translation, sentiment;
$$ LANGUAGE sql;
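Example call (the French input here is illustrative):

-- Detects French, translates to English, then runs sentiment analysis
SELECT process_multilingual_text('Ce produit est fantastique !');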
Performance Optimization
Model Selection
Choose models based on your needs:
-- Fast but less accurate
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY[text],
args => '{"model": "distilbert-base-uncased"}'::JSONB -- Smaller, faster
);
-- Slower but more accurate
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY[text],
args => '{"model": "bert-large-uncased"}'::JSONB -- Larger, more accurate
);
Batch Inputs
-- Process multiple texts at once (more efficient)
SELECT pgml.transform(
    task => 'text-classification',
    inputs => ARRAY[
        'Text 1',
        'Text 2',
        'Text 3'  -- ... and so on, up to hundreds of texts
    ]
);
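The output array preserves input order, so batched results can be joined back to their inputs positionally. A sketch using WITH ORDINALITY (SQL array positions are 1-based, while the JSON result is 0-indexed):

-- Align each batched result with the text that produced it
WITH batch(texts) AS (
    SELECT ARRAY['Text 1', 'Text 2', 'Text 3']
),
output AS (
    SELECT
        texts,
        pgml.transform(task => 'text-classification', inputs => texts) AS results
    FROM batch
)
SELECT t.text, o.results->(t.ord::INT - 1) AS classification
FROM output o,
     UNNEST(o.texts) WITH ORDINALITY AS t(text, ord);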
Best Practices
1. Choose Appropriate Models
Start with smaller models and upgrade if needed:
-- Development: Fast models
'distilbert-base-uncased'
'distilgpt2'
-- Production: Balance speed and accuracy
'bert-base-uncased'
'roberta-base'
-- High accuracy needed: Larger models
'bert-large-uncased'
'roberta-large'
2. Cache Results
-- Store transformer results to avoid recomputation
CREATE TABLE cached_classifications (
text_hash TEXT PRIMARY KEY,
text TEXT,
classification JSONB,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Check cache before transforming
CREATE OR REPLACE FUNCTION classify_with_cache(input_text TEXT)
RETURNS JSONB AS $$
    -- Only run the model when the text is not cached yet; on a cache hit
    -- the WHERE NOT EXISTS guard skips the transform call entirely
    INSERT INTO cached_classifications (text_hash, text, classification)
    SELECT
        MD5(input_text),
        input_text,
        pgml.transform(task => 'text-classification', inputs => ARRAY[input_text])
    WHERE NOT EXISTS (
        SELECT 1 FROM cached_classifications WHERE text_hash = MD5(input_text)
    )
    ON CONFLICT (text_hash) DO NOTHING;

    SELECT classification FROM cached_classifications WHERE text_hash = MD5(input_text);
$$ LANGUAGE sql;
3. Monitor Performance
-- Track transformer usage
CREATE TABLE transformer_logs (
id SERIAL PRIMARY KEY,
task TEXT,
model TEXT,
input_length INT,
execution_time_ms INT,
created_at TIMESTAMPTZ DEFAULT NOW()
);
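A sketch of a wrapper that populates this table; classify_and_log and its internals are illustrative, not part of PostgresML:

-- Illustrative wrapper: run one classification and record its latency
CREATE OR REPLACE FUNCTION classify_and_log(input_text TEXT)
RETURNS JSONB AS $$
DECLARE
    started TIMESTAMPTZ := clock_timestamp();
    result JSONB;
BEGIN
    result := pgml.transform(
        task => 'text-classification',
        inputs => ARRAY[input_text]
    );

    INSERT INTO transformer_logs (task, model, input_length, execution_time_ms)
    VALUES (
        'text-classification',
        'default',  -- record the model name here if you pass one via args
        LENGTH(input_text),
        (EXTRACT(EPOCH FROM clock_timestamp() - started) * 1000)::INT
    );

    RETURN result;
END;
$$ LANGUAGE plpgsql;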
Next Steps
Continue with:
- Vector Operations - Embeddings and semantic search
- Advanced Examples - Real-world NLP applications