Transformers & NLP

PostgresML provides seamless access to state-of-the-art transformer models from Hugging Face for natural language processing tasks.

Overview

PostgresML integrates with the Hugging Face model hub, letting you call pre-trained transformers directly from SQL. Models are downloaded and managed for you automatically on first use.

Text Classification

Sentiment Analysis

-- Analyze sentiment of text
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY['I love this product!', 'This is terrible.']
) AS sentiment;

-- Returns: [{"label": "POSITIVE", "score": 0.99}, {"label": "NEGATIVE", "score": 0.98}]

-- Use in queries
SELECT
review_id,
review_text,
pgml.transform(
task => 'text-classification',
inputs => ARRAY[review_text]
) AS sentiment
FROM product_reviews
LIMIT 10;
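
Because the result is ordinary JSONB in the shape shown above, the label can be pulled straight into a column:

-- Extract just the label (uses the [{"label": ..., "score": ...}] shape shown above)
SELECT
review_id,
pgml.transform(
task => 'text-classification',
inputs => ARRAY[review_text]
)->0->>'label' AS sentiment_label
FROM product_reviews
LIMIT 10;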

Custom Classification Models

-- Use specific model
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY['PostgreSQL is awesome!'],
args => '{
"model": "distilbert-base-uncased-finetuned-sst-2-english"
}'::JSONB
) AS result;

-- Multi-label classification (zero-shot: each candidate label is scored independently)
SELECT pgml.transform(
task => 'zero-shot-classification',
inputs => ARRAY['This movie has great action and romance'],
args => '{
"model": "facebook/bart-large-mnli",
"candidate_labels": ["action", "romance", "drama", "comedy"],
"multi_label": true
}'::JSONB
);

Topic Classification

-- Classify text into topics
CREATE TABLE articles (
id SERIAL PRIMARY KEY,
title TEXT,
content TEXT,
published_at TIMESTAMPTZ
);

-- Add topic classification
SELECT
id,
title,
pgml.transform(
task => 'zero-shot-classification',
inputs => ARRAY[content],
args => '{
"candidate_labels": ["technology", "sports", "politics", "entertainment"]
}'::JSONB
) AS topics
FROM articles;

Text Generation

Content Generation

-- Generate text completions
SELECT pgml.transform(
task => 'text-generation',
inputs => ARRAY['Once upon a time'],
args => '{
"model": "gpt2",
"max_length": 100,
"num_return_sequences": 1
}'::JSONB
) AS generated_text;
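
If only the generated string is needed, it can be unwrapped from the result; the ->0->0 nesting below assumes one list of {"generated_text": ...} objects per input:

-- Extract just the generated string (nesting assumed as described above)
SELECT pgml.transform(
task => 'text-generation',
inputs => ARRAY['Once upon a time'],
args => '{"model": "gpt2", "max_length": 100}'::JSONB
)->0->0->>'generated_text' AS story;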

Conversational AI

-- Create chatbot responses
SELECT pgml.transform(
task => 'conversational',
inputs => ARRAY['Hello, how are you?'],
args => '{
"model": "microsoft/DialoGPT-medium"
}'::JSONB
) AS response;

Question Answering

Extractive QA

-- Answer questions from context
SELECT pgml.transform(
task => 'question-answering',
inputs => ARRAY[
'{
"question": "What is PostgresML?",
"context": "PostgresML is a machine learning extension for PostgreSQL that allows you to train models and make predictions using SQL."
}'
]
) AS answer;

-- Use with documents
SELECT
doc_id,
pgml.transform(
task => 'question-answering',
inputs => ARRAY[
json_build_object(
'question', 'What is the main topic?',
'context', document_text
)::TEXT
]
) AS answer
FROM documents
WHERE doc_type = 'manual';

-- Build a QA system
CREATE TABLE knowledge_base (
id SERIAL PRIMARY KEY,
title TEXT,
content TEXT,
category TEXT
);

-- Query the knowledge base
CREATE OR REPLACE FUNCTION ask_question(question_text TEXT)
RETURNS TABLE(article_id INT, title TEXT, answer JSONB) AS $$
SELECT id, title, answer
FROM (
    SELECT
        id,
        title,
        pgml.transform(
            task => 'question-answering',
            inputs => ARRAY[
                json_build_object(
                    'question', question_text,
                    'context', content
                )::TEXT
            ]
        ) AS answer
    FROM knowledge_base
) qa
-- The computed column must be wrapped in a subquery before ORDER BY can use it
ORDER BY (answer->>'score')::FLOAT DESC
LIMIT 5;
$$ LANGUAGE sql;

-- Use it
SELECT * FROM ask_question('How do I train a model?');

Named Entity Recognition

Extract Entities

-- Identify entities in text
SELECT pgml.transform(
task => 'token-classification',
inputs => ARRAY['Apple Inc. was founded by Steve Jobs in California.'],
args => '{
"model": "dslim/bert-base-NER"
}'::JSONB
) AS entities;
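
The result follows the Hugging Face token-classification output, one entity list per input; the keys used by the pipeline below (word, entity, score) come from entries shaped roughly like this:

-- e.g. [[{"word": "Apple", "entity": "B-ORG", "score": 0.99, ...}, ...]]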

-- Process documents
SELECT
id,
title,
pgml.transform(
task => 'token-classification',
inputs => ARRAY[content]
) AS extracted_entities
FROM documents;

Entity Extraction Pipeline

-- Create entity extraction table
CREATE TABLE entity_extractions (
id SERIAL PRIMARY KEY,
document_id INT,
entity_text TEXT,
entity_type TEXT,
confidence FLOAT,
extracted_at TIMESTAMPTZ DEFAULT NOW()
);

-- Extract and store entities
INSERT INTO entity_extractions (document_id, entity_text, entity_type, confidence)
SELECT
d.id,
entity->>'word' AS entity_text,
entity->>'entity' AS entity_type,
(entity->>'score')::FLOAT AS confidence
FROM documents d,
LATERAL (
SELECT jsonb_array_elements(
pgml.transform(
task => 'token-classification',
inputs => ARRAY[d.content]
)->0
) AS entity
) entities
WHERE (entity->>'score')::FLOAT > 0.9;

Text Summarization

Summarize Documents

-- Generate summaries
SELECT pgml.transform(
task => 'summarization',
inputs => ARRAY['Long article text here...'],
args => '{
"model": "facebook/bart-large-cnn",
"max_length": 130,
"min_length": 30
}'::JSONB
) AS summary;

-- Bulk summarization
CREATE TABLE article_summaries AS
SELECT
id,
title,
pgml.transform(
task => 'summarization',
inputs => ARRAY[content],
args => '{"max_length": 100}'::JSONB
)->0->>'summary_text' AS summary
FROM articles
WHERE LENGTH(content) > 500;

Translation

Translate Text

-- Translate between languages
SELECT pgml.transform(
task => 'translation',
inputs => ARRAY['Hello, how are you?'],
args => '{
"model": "Helsinki-NLP/opus-mt-en-es"
}'::JSONB
) AS spanish_translation;

-- Multi-language support
CREATE TABLE translations (
id SERIAL PRIMARY KEY,
original_text TEXT,
source_lang TEXT,
target_lang TEXT,
translated_text TEXT
);

-- Translate content
INSERT INTO translations (original_text, source_lang, target_lang, translated_text)
SELECT
content,
'en',
'fr',
pgml.transform(
task => 'translation',
inputs => ARRAY[content],
args => '{"model": "Helsinki-NLP/opus-mt-en-fr"}'::JSONB
)->0->>'translation_text'
FROM articles
WHERE language = 'en';

Fill Mask

Complete Text with Masked Tokens

-- Fill in masked words
SELECT pgml.transform(
task => 'fill-mask',
inputs => ARRAY['PostgreSQL is a [MASK] database system.'],
args => '{
"model": "bert-base-uncased"
}'::JSONB
) AS predictions;

-- Returns top predictions for [MASK]
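
Each prediction carries a candidate token and a score; the top candidate can be extracted as below. The ->0->0 nesting is an assumption (one prediction list per input), while token_str is the standard Hugging Face fill-mask key:

-- Top candidate token for the first input (nesting assumed)
SELECT pgml.transform(
task => 'fill-mask',
inputs => ARRAY['PostgreSQL is a [MASK] database system.'],
args => '{"model": "bert-base-uncased"}'::JSONB
)->0->0->>'token_str' AS top_token;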

Feature Extraction

Get Embeddings

-- Extract feature vectors
SELECT pgml.transform(
task => 'feature-extraction',
inputs => ARRAY['This is a sample text for embedding.'],
args => '{
"model": "sentence-transformers/all-MiniLM-L6-v2"
}'::JSONB
) AS embeddings;
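
Embeddings are typically computed once and stored for reuse; a minimal sketch, assuming the documents table (id, content) used elsewhere on this page:

-- Persist embeddings for later similarity search
CREATE TABLE document_embeddings AS
SELECT
id,
pgml.transform(
task => 'feature-extraction',
inputs => ARRAY[content],
args => '{"model": "sentence-transformers/all-MiniLM-L6-v2"}'::JSONB
) AS embedding
FROM documents;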

Using Specific Models

Browse Available Models

PostgresML supports thousands of models from Hugging Face. Popular choices:

Classification:

  • distilbert-base-uncased-finetuned-sst-2-english - Sentiment analysis
  • facebook/bart-large-mnli - Zero-shot classification

Question Answering:

  • deepset/roberta-base-squad2 - Extractive QA
  • distilbert-base-cased-distilled-squad - Fast QA

Summarization:

  • facebook/bart-large-cnn - News summarization
  • google/pegasus-xsum - Extreme summarization

Translation:

  • Helsinki-NLP/opus-mt-en-* - English to other languages
  • Helsinki-NLP/opus-mt-*-en - Other languages to English

NER:

  • dslim/bert-base-NER - General NER
  • dbmdz/bert-large-cased-finetuned-conll03-english - CoNLL NER

Model Caching

PostgresML automatically caches downloaded models:

-- First call downloads the model
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY['test'],
args => '{"model": "distilbert-base-uncased"}'::JSONB
);

-- Subsequent calls use cached model (much faster)
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY['another test'],
args => '{"model": "distilbert-base-uncased"}'::JSONB
);

Advanced Patterns

Batch Processing

-- Process in batches for efficiency
CREATE OR REPLACE FUNCTION batch_classify(batch_size INT DEFAULT 100)
RETURNS VOID AS $$
DECLARE
    batch_texts TEXT[];
BEGIN
    LOOP
        -- Claim the next batch and remove it from the queue,
        -- so each iteration makes progress
        WITH batch AS (
            DELETE FROM unprocessed_texts
            WHERE text IN (
                SELECT text FROM unprocessed_texts LIMIT batch_size
            )
            RETURNING text
        )
        SELECT ARRAY_AGG(text) INTO batch_texts FROM batch;

        EXIT WHEN batch_texts IS NULL;

        -- One model call classifies the whole batch; results come
        -- back in input order, so zip them by position
        INSERT INTO classified_texts (text, classification)
        SELECT batch_texts[n], results.classifications -> (n - 1)
        FROM (
            SELECT pgml.transform(
                task => 'text-classification',
                inputs => batch_texts
            ) AS classifications
        ) results,
        generate_series(1, array_length(batch_texts, 1)) AS n;
    END LOOP;
END;
$$ LANGUAGE plpgsql;
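
Invoke it with the desired batch size:

-- Use it
SELECT batch_classify(50);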

Pipeline Combinations

-- Combine multiple NLP tasks
WITH sentiment AS (
SELECT
id,
text,
pgml.transform(
task => 'text-classification',
inputs => ARRAY[text]
)->0->>'label' AS sentiment_label
FROM reviews
),
entities AS (
SELECT
id,
pgml.transform(
task => 'token-classification',
inputs => ARRAY[text]
) AS extracted_entities
FROM reviews
)
SELECT
s.id,
s.text,
s.sentiment_label,
e.extracted_entities
FROM sentiment s
JOIN entities e ON s.id = e.id;

Multilingual Processing

-- Detect and process multiple languages
CREATE OR REPLACE FUNCTION process_multilingual_text(input_text TEXT)
RETURNS JSONB AS $$
WITH detection AS (
    SELECT pgml.transform(
        task => 'text-classification',
        inputs => ARRAY[input_text],
        args => '{"model": "papluca/xlm-roberta-base-language-detection"}'::JSONB
    ) AS lang_result
),
translation AS (
    SELECT CASE
        WHEN detection.lang_result->0->>'label' != 'en' THEN
            pgml.transform(
                task => 'translation',
                inputs => ARRAY[input_text],
                args => jsonb_build_object(
                    'model',
                    'Helsinki-NLP/opus-mt-' || (detection.lang_result->0->>'label') || '-en'
                )
            )->0->>'translation_text'
        ELSE input_text
    END AS english_text
    FROM detection
),
sentiment AS (
    SELECT pgml.transform(
        task => 'text-classification',
        inputs => ARRAY[english_text]
    ) AS sentiment_result
    FROM translation
)
-- jsonb_build_object matches the declared JSONB return type
SELECT jsonb_build_object(
    'original_language', detection.lang_result->0->>'label',
    'english_text', translation.english_text,
    'sentiment', sentiment.sentiment_result->0
)
FROM detection, translation, sentiment;
$$ LANGUAGE sql;
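
As with the other helpers, call it directly:

-- Use it
SELECT process_multilingual_text('¡Me encanta este producto!');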

Performance Optimization

Model Selection

Choose models based on your needs:

-- Fast but less accurate
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY[text],
args => '{"model": "distilbert-base-uncased"}'::JSONB -- Smaller, faster
);

-- Slower but more accurate
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY[text],
args => '{"model": "bert-large-uncased"}'::JSONB -- Larger, more accurate
);

Batch Inputs

-- Process multiple texts at once (more efficient)
SELECT pgml.transform(
task => 'text-classification',
inputs => ARRAY[
'Text 1',
'Text 2',
'Text 3'
-- ... up to 100s of texts
]
);

Best Practices

1. Choose Appropriate Models

Start with smaller models and upgrade if needed:

-- Development: Fast models
'distilbert-base-uncased'
'distilgpt2'

-- Production: Balance speed and accuracy
'bert-base-uncased'
'roberta-base'

-- High accuracy needed: Larger models
'bert-large-uncased'
'roberta-large'

2. Cache Results

-- Store transformer results to avoid recomputation
CREATE TABLE cached_classifications (
text_hash TEXT PRIMARY KEY,
text TEXT,
classification JSONB,
created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Check cache before transforming
CREATE OR REPLACE FUNCTION classify_with_cache(input_text TEXT)
RETURNS JSONB AS $$
DECLARE
    result JSONB;
BEGIN
    -- Look up the cache first so the model only runs on a miss
    SELECT classification INTO result
    FROM cached_classifications
    WHERE text_hash = MD5(input_text);

    IF result IS NULL THEN
        result := pgml.transform(
            task => 'text-classification',
            inputs => ARRAY[input_text]
        );
        INSERT INTO cached_classifications (text_hash, text, classification)
        VALUES (MD5(input_text), input_text, result)
        ON CONFLICT (text_hash) DO NOTHING;
    END IF;

    RETURN result;
END;
$$ LANGUAGE plpgsql;
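
Repeated calls with the same text now hit the cache instead of the model:

-- Use it
SELECT classify_with_cache('The support team was fantastic!');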

3. Monitor Performance

-- Track transformer usage
CREATE TABLE transformer_logs (
id SERIAL PRIMARY KEY,
task TEXT,
model TEXT,
input_length INT,
execution_time_ms INT,
created_at TIMESTAMPTZ DEFAULT NOW()
);
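
The table does not populate itself; one option is a small wrapper that times each call and records a row. The logged_transform function below is a sketch, not part of PostgresML:

-- Hypothetical wrapper: run a transform, log its timing, return the result
CREATE OR REPLACE FUNCTION logged_transform(task_name TEXT, input_text TEXT, model_name TEXT)
RETURNS JSONB AS $$
DECLARE
    started TIMESTAMPTZ := clock_timestamp();
    result JSONB;
BEGIN
    result := pgml.transform(
        task => task_name,
        inputs => ARRAY[input_text],
        args => jsonb_build_object('model', model_name)
    );
    INSERT INTO transformer_logs (task, model, input_length, execution_time_ms)
    VALUES (
        task_name,
        model_name,
        LENGTH(input_text),
        (EXTRACT(EPOCH FROM clock_timestamp() - started) * 1000)::INT
    );
    RETURN result;
END;
$$ LANGUAGE plpgsql;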

Next Steps

Continue with: