Transformers & NLP

PostgresML provides seamless access to state-of-the-art transformer models from Hugging Face for natural language processing tasks.

Overview

PostgresML integrates with the Hugging Face model hub, letting you call pre-trained transformers directly from SQL. Models are downloaded and managed for you automatically on first use.

Text Classification

Sentiment Analysis

-- Analyze sentiment of text
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY['I love this product!', 'This is terrible.']
) AS sentiment;

-- Returns: [{"label": "POSITIVE", "score": 0.99}, {"label": "NEGATIVE", "score": 0.98}]

-- Use in queries
SELECT
review_id,
review_text,
pgml.transform(
task => 'text-classification',
inputs => ARRAY[review_text]
) AS sentiment
FROM product_reviews
LIMIT 10;
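
Because the result is ordinary JSONB in the shape shown above, the label can be pulled straight into a column:

-- Extract just the label (uses the [{"label": ..., "score": ...}] shape shown above)
SELECT
review_id,
pgml.transform(
task => 'text-classification',
inputs => ARRAY[review_text]
)->0->>'label' AS sentiment_label
FROM product_reviews
LIMIT 10;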

Custom Classification Models

-- Use specific model
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY['PostgreSQL is awesome!'],
args => '{
"model": "distilbert-base-uncased-finetuned-sst-2-english"
}'::JSONB
) AS result;

-- Multi-label classification (zero-shot: each candidate label is scored independently)
SELECT pgml.transform(
task => 'zero-shot-classification',
inputs => ARRAY['This movie has great action and romance'],
args => '{
"model": "facebook/bart-large-mnli",
"candidate_labels": ["action", "romance", "drama", "comedy"],
"multi_label": true
}'::JSONB
);

Topic Classification

-- Classify text into topics
CREATE TABLE articles (
id SERIAL PRIMARY KEY,
title TEXT,
content TEXT,
published_at TIMESTAMPTZ
);

-- Add topic classification
SELECT
id,
title,
pgml.transform(
task => 'zero-shot-classification',
inputs => ARRAY[content],
args => '{
"candidate_labels": ["technology", "sports", "politics", "entertainment"]
}'::JSONB
) AS topics
FROM articles;

Text Generation

Content Generation

-- Generate text completions
SELECT pgml.transform(
task => 'text-generation',
inputs => ARRAY['Once upon a time'],
args => '{
"model": "gpt2",
"max_length": 100,
"num_return_sequences": 1
}'::JSONB
) AS generated_text;
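
If only the generated string is needed, it can be unwrapped from the result; the ->0->0 nesting below assumes one list of {"generated_text": ...} objects per input:

-- Extract just the generated string (nesting assumed as described above)
SELECT pgml.transform(
task => 'text-generation',
inputs => ARRAY['Once upon a time'],
args => '{"model": "gpt2", "max_length": 100}'::JSONB
)->0->0->>'generated_text' AS story;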

Conversational AI

-- Create chatbot responses
SELECT pgml.transform(
task => 'conversational',
inputs => ARRAY['Hello, how are you?'],
args => '{
"model": "microsoft/DialoGPT-medium"
}'::JSONB
) AS response;

Question Answering

Extractive QA

-- Answer questions from context
SELECT pgml.transform(
task => 'question-answering',
inputs => ARRAY[
'{
"question": "What is PostgresML?",
"context": "PostgresML is a machine learning extension for PostgreSQL that allows you to train models and make predictions using SQL."
}'
]
) AS answer;

-- Use with documents
SELECT
doc_id,
pgml.transform(
task => 'question-answering',
inputs => ARRAY[
json_build_object(
'question', 'What is the main topic?',
'context', document_text
)::TEXT
]
) AS answer
FROM documents
WHERE doc_type = 'manual';

-- Build a QA system
CREATE TABLE knowledge_base (
id SERIAL PRIMARY KEY,
title TEXT,
content TEXT,
category TEXT
);

-- Query the knowledge base
CREATE OR REPLACE FUNCTION ask_question(question_text TEXT)
RETURNS TABLE(article_id INT, title TEXT, answer JSONB) AS $$
SELECT id, title, answer
FROM (
    SELECT
        id,
        title,
        pgml.transform(
            task => 'question-answering',
            inputs => ARRAY[
                json_build_object(
                    'question', question_text,
                    'context', content
                )::TEXT
            ]
        ) AS answer
    FROM knowledge_base
) qa
-- The computed column must be wrapped in a subquery before ORDER BY can use it
ORDER BY (answer->>'score')::FLOAT DESC
LIMIT 5;
$$ LANGUAGE sql;

-- Use it
SELECT * FROM ask_question('How do I train a model?');

Named Entity Recognition

Extract Entities

-- Identify entities in text
SELECT pgml.transform(
task => 'token-classification',
inputs => ARRAY['Apple Inc. was founded by Steve Jobs in California.'],
args => '{
"model": "dslim/bert-base-NER"
}'::JSONB
) AS entities;
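
The result follows the Hugging Face token-classification output, one entity list per input; the keys used by the pipeline below (word, entity, score) come from entries shaped roughly like this:

-- e.g. [[{"word": "Apple", "entity": "B-ORG", "score": 0.99, ...}, ...]]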

-- Process documents
SELECT
id,
title,
pgml.transform(
task => 'token-classification',
inputs => ARRAY[content]
) AS extracted_entities
FROM documents;

Entity Extraction Pipeline

-- Create entity extraction table
CREATE TABLE entity_extractions (
id SERIAL PRIMARY KEY,
document_id INT,
entity_text TEXT,
entity_type TEXT,
confidence FLOAT,
extracted_at TIMESTAMPTZ DEFAULT NOW()
);

-- Extract and store entities
INSERT INTO entity_extractions (document_id, entity_text, entity_type, confidence)
SELECT
d.id,
entity->>'word' AS entity_text,
entity->>'entity' AS entity_type,
(entity->>'score')::FLOAT AS confidence
FROM documents d,
LATERAL (
SELECT jsonb_array_elements(
pgml.transform(
task => 'token-classification',
inputs => ARRAY[d.content]
)->0
) AS entity
) entities
WHERE (entity->>'score')::FLOAT > 0.9;

Text Summarization

Summarize Documents

-- Generate summaries
SELECT pgml.transform(
task => 'summarization',
inputs => ARRAY['Long article text here...'],
args => '{
"model": "facebook/bart-large-cnn",
"max_length": 130,
"min_length": 30
}'::JSONB
) AS summary;

-- Bulk summarization
CREATE TABLE article_summaries AS
SELECT
id,
title,
pgml.transform(
task => 'summarization',
inputs => ARRAY[content],
args => '{"max_length": 100}'::JSONB
)->0->>'summary_text' AS summary
FROM articles
WHERE LENGTH(content) > 500;

Translation

Translate Text

-- Translate between languages
SELECT pgml.transform(
task => 'translation',
inputs => ARRAY['Hello, how are you?'],
args => '{
"model": "Helsinki-NLP/opus-mt-en-es"
}'::JSONB
) AS spanish_translation;

-- Multi-language support
CREATE TABLE translations (
id SERIAL PRIMARY KEY,
original_text TEXT,
source_lang TEXT,
target_lang TEXT,
translated_text TEXT
);

-- Translate content
INSERT INTO translations (original_text, source_lang, target_lang, translated_text)
SELECT
content,
'en',
'fr',
pgml.transform(
task => 'translation',
inputs => ARRAY[content],
args => '{"model": "Helsinki-NLP/opus-mt-en-fr"}'::JSONB
)->0->>'translation_text'
FROM articles
WHERE language = 'en';

Fill Mask

Complete Text with Masked Tokens

-- Fill in masked words
SELECT pgml.transform(
task => 'fill-mask',
inputs => ARRAY['PostgreSQL is a [MASK] database system.'],
args => '{
"model": "bert-base-uncased"
}'::JSONB
) AS predictions;

-- Returns top predictions for [MASK]
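
Each prediction carries a candidate token and a score; the top candidate can be extracted as below. The ->0->0 nesting is an assumption (one prediction list per input), while token_str is the standard Hugging Face fill-mask key:

-- Top candidate token for the first input (nesting assumed)
SELECT pgml.transform(
task => 'fill-mask',
inputs => ARRAY['PostgreSQL is a [MASK] database system.'],
args => '{"model": "bert-base-uncased"}'::JSONB
)->0->0->>'token_str' AS top_token;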

Feature Extraction

Get Embeddings

-- Extract feature vectors
SELECT pgml.transform(
task => 'feature-extraction',
inputs => ARRAY['This is a sample text for embedding.'],
args => '{
"model": "sentence-transformers/all-MiniLM-L6-v2"
}'::JSONB
) AS embeddings;
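
Embeddings are typically computed once and stored for reuse; a minimal sketch, assuming the documents table (id, content) used elsewhere on this page:

-- Persist embeddings for later similarity search
CREATE TABLE document_embeddings AS
SELECT
id,
pgml.transform(
task => 'feature-extraction',
inputs => ARRAY[content],
args => '{"model": "sentence-transformers/all-MiniLM-L6-v2"}'::JSONB
) AS embedding
FROM documents;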

Using Specific Models

Browse Available Models

PostgresML supports thousands of models from Hugging Face. Popular choices:

Classification:

  • distilbert-base-uncased-finetuned-sst-2-english - Sentiment analysis
  • facebook/bart-large-mnli - Zero-shot classification

Question Answering:

  • deepset/roberta-base-squad2 - Extractive QA
  • distilbert-base-cased-distilled-squad - Fast QA

Summarization:

  • facebook/bart-large-cnn - News summarization
  • google/pegasus-xsum - Extreme summarization

Translation:

  • Helsinki-NLP/opus-mt-en-* - English to other languages
  • Helsinki-NLP/opus-mt-*-en - Other languages to English

NER:

  • dslim/bert-base-NER - General NER
  • dbmdz/bert-large-cased-finetuned-conll03-english - CoNLL NER

Model Caching

PostgresML automatically caches downloaded models:

-- First call downloads the model
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY['test'],
args => '{"model": "distilbert-base-uncased"}'::JSONB
);

-- Subsequent calls use cached model (much faster)
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY['another test'],
args => '{"model": "distilbert-base-uncased"}'::JSONB
);

Advanced Patterns

Batch Processing

-- Process in batches for efficiency
CREATE OR REPLACE FUNCTION batch_classify(batch_size INT DEFAULT 100)
RETURNS VOID AS $$
DECLARE
    batch_texts TEXT[];
BEGIN
    LOOP
        -- Claim the next batch and remove it from the queue,
        -- so each iteration makes progress
        WITH batch AS (
            DELETE FROM unprocessed_texts
            WHERE text IN (
                SELECT text FROM unprocessed_texts LIMIT batch_size
            )
            RETURNING text
        )
        SELECT ARRAY_AGG(text) INTO batch_texts FROM batch;

        EXIT WHEN batch_texts IS NULL;

        -- One model call classifies the whole batch; results come
        -- back in input order, so zip them by position
        INSERT INTO classified_texts (text, classification)
        SELECT batch_texts[n], results.classifications -> (n - 1)
        FROM (
            SELECT pgml.transform(
                task => 'text-classification',
                inputs => batch_texts
            ) AS classifications
        ) results,
        generate_series(1, array_length(batch_texts, 1)) AS n;
    END LOOP;
END;
$$ LANGUAGE plpgsql;
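
Invoke it with the desired batch size:

-- Use it
SELECT batch_classify(50);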

Pipeline Combinations

-- Combine multiple NLP tasks
WITH sentiment AS (
SELECT
id,
text,
pgml.transform(
task => 'text-classification',
inputs => ARRAY[text]
)->0->>'label' AS sentiment_label
FROM reviews
),
entities AS (
SELECT
id,
pgml.transform(
task => 'token-classification',
inputs => ARRAY[text]
) AS extracted_entities
FROM reviews
)
SELECT
s.id,
s.text,
s.sentiment_label,
e.extracted_entities
FROM sentiment s
JOIN entities e ON s.id = e.id;

Multilingual Processing

-- Detect and process multiple languages
CREATE OR REPLACE FUNCTION process_multilingual_text(input_text TEXT)
RETURNS JSONB AS $$
WITH detection AS (
    SELECT pgml.transform(
        task => 'text-classification',
        inputs => ARRAY[input_text],
        args => '{"model": "papluca/xlm-roberta-base-language-detection"}'::JSONB
    ) AS lang_result
),
translation AS (
    SELECT CASE
        WHEN detection.lang_result->0->>'label' != 'en' THEN
            pgml.transform(
                task => 'translation',
                inputs => ARRAY[input_text],
                args => jsonb_build_object(
                    'model',
                    'Helsinki-NLP/opus-mt-' || (detection.lang_result->0->>'label') || '-en'
                )
            )->0->>'translation_text'
        ELSE input_text
    END AS english_text
    FROM detection
),
sentiment AS (
    SELECT pgml.transform(
        task => 'text-classification',
        inputs => ARRAY[english_text]
    ) AS sentiment_result
    FROM translation
)
-- jsonb_build_object matches the declared JSONB return type
SELECT jsonb_build_object(
    'original_language', detection.lang_result->0->>'label',
    'english_text', translation.english_text,
    'sentiment', sentiment.sentiment_result->0
)
FROM detection, translation, sentiment;
$$ LANGUAGE sql;
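
As with the other helpers, call it directly:

-- Use it
SELECT process_multilingual_text('¡Me encanta este producto!');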

Performance Optimization

Model Selection

Choose models based on your needs:

-- Fast but less accurate
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY[text],
args => '{"model": "distilbert-base-uncased"}'::JSONB -- Smaller, faster
);

-- Slower but more accurate
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY[text],
args => '{"model": "bert-large-uncased"}'::JSONB -- Larger, more accurate
);

Batch Inputs

-- Process multiple texts at once (more efficient)
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY[
'Text 1',
'Text 2',
'Text 3'
-- ... up to 100s of texts
]
);

Best Practices

1. Choose Appropriate Models

Start with smaller models and upgrade if needed:

-- Development: Fast models
'distilbert-base-uncased'
'distilgpt2'

-- Production: Balance speed and accuracy
'bert-base-uncased'
'roberta-base'

-- High accuracy needed: Larger models
'bert-large-uncased'
'roberta-large'

2. Cache Results

-- Store transformer results to avoid recomputation
CREATE TABLE cached_classifications (
text_hash TEXT PRIMARY KEY,
text TEXT,
classification JSONB,
created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Check cache before transforming
CREATE OR REPLACE FUNCTION classify_with_cache(input_text TEXT)
RETURNS JSONB AS $$
DECLARE
    result JSONB;
BEGIN
    -- Look up the cache first so the model only runs on a miss
    SELECT classification INTO result
    FROM cached_classifications
    WHERE text_hash = MD5(input_text);

    IF result IS NULL THEN
        result := pgml.transform(
            task => 'text-classification',
            inputs => ARRAY[input_text]
        );
        INSERT INTO cached_classifications (text_hash, text, classification)
        VALUES (MD5(input_text), input_text, result)
        ON CONFLICT (text_hash) DO NOTHING;
    END IF;

    RETURN result;
END;
$$ LANGUAGE plpgsql;
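
Repeated calls with the same text now hit the cache instead of the model:

-- Use it
SELECT classify_with_cache('The support team was fantastic!');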

3. Monitor Performance

-- Track transformer usage
CREATE TABLE transformer_logs (
id SERIAL PRIMARY KEY,
task TEXT,
model TEXT,
input_length INT,
execution_time_ms INT,
created_at TIMESTAMPTZ DEFAULT NOW()
);
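
The table does not populate itself; one option is a small wrapper that times each call and records a row. The logged_transform function below is a sketch, not part of PostgresML:

-- Hypothetical wrapper: run a transform, log its timing, return the result
CREATE OR REPLACE FUNCTION logged_transform(task_name TEXT, input_text TEXT, model_name TEXT)
RETURNS JSONB AS $$
DECLARE
    started TIMESTAMPTZ := clock_timestamp();
    result JSONB;
BEGIN
    result := pgml.transform(
        task => task_name,
        inputs => ARRAY[input_text],
        args => jsonb_build_object('model', model_name)
    );
    INSERT INTO transformer_logs (task, model, input_length, execution_time_ms)
    VALUES (
        task_name,
        model_name,
        LENGTH(input_text),
        (EXTRACT(EPOCH FROM clock_timestamp() - started) * 1000)::INT
    );
    RETURN result;
END;
$$ LANGUAGE plpgsql;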

Next Steps

Continue with: