agent-smith/packages/GLiNER2/tutorial/4-combined.md
2026-03-06 12:59:32 +01:00

11 KiB

GLiNER2 Combining Schemas Tutorial

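Table of Contents

  • Why Combine Schemas
  • Basic Combinations
  • Advanced Multi-Task Schemas
  • Real-World Applications
  • Using Confidence Scores and Character Positions with Combined Schemas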

Why Combine Schemas

Combining schemas allows you to:

  • Extract multiple types of information in one pass
  • Maintain context between different extraction tasks
  • Improve efficiency by avoiding multiple model calls
  • Build comprehensive information extraction pipelines

Basic Combinations

Entities + Classification

from gliner2 import GLiNER2

# "your-model-name" is a placeholder; substitute the checkpoint to load.
# The same extractor object is used below to build the schema and to run it.
extractor = GLiNER2.from_pretrained("your-model-name")

# Sentiment analysis with entity extraction
# One schema declares three entity labels plus two separate classification
# tasks ("sentiment" and "category"), each with its own label set.
schema = (extractor.create_schema()
    .entities(["person", "product", "company"])
    .classification("sentiment", ["positive", "negative", "neutral"])
    .classification("category", ["review", "news", "opinion"])
)

text = "Tim Cook announced that Apple's new iPhone is exceeding sales expectations."
# A single extract() call is made for all tasks declared in the schema.
results = extractor.extract(text, schema)
# Illustrative output: entity matches are grouped by label under 'entities',
# and each classification result is keyed by its task name.
# Output: {
#     'entities': {
#         'person': ['Tim Cook'],
#         'product': ['iPhone'],
#         'company': ['Apple']
#     },
#     'sentiment': 'positive',
#     'category': 'news'
# }

Entities + Structures

# Combine free-standing entity labels with one structured record.
# Entities are passed as a dict mapping each label to a description.
schema = (extractor.create_schema()
    .entities({
        "person": "Names of people mentioned",
        "date": "Dates and time references"
    })
    # "appointment" structure: every .field() call below adds one field to it.
    .structure("appointment")
        .field("patient", dtype="str")
        .field("doctor", dtype="str")
        # "date" and "time" pass no dtype, so the library default applies
        # (default not shown here; confirm in the structure docs).
        .field("date")
        .field("time")
        # choices limits this field to the listed values.
        .field("type", dtype="str", choices=["checkup", "followup", "consultation"])
)

# Sample input; note that "date" is both an entity label and a structure field.
text = """
Dr. Sarah Johnson confirmed the appointment with John Smith for 
March 15th at 2:30 PM. This will be a follow-up consultation 
regarding his previous visit on February 1st.
"""
results = extractor.extract(text, schema)

Classification + Structures

# Combine two classification tasks with one structured record.
# No entity labels are declared in this schema.
schema = (extractor.create_schema()
    .classification("email_type", 
        ["order_confirmation", "shipping_update", "promotional", "support"])
    .classification("priority", ["urgent", "normal", "low"])
    # "order_info" structure: the fields below belong to it.
    .structure("order_info")
        .field("order_number", dtype="str")
        # No dtype given for "items"; the library default applies
        # (default not shown here; confirm in the structure docs).
        .field("items")
        .field("total", dtype="str")
        # choices limits "status" to the four listed values.
        .field("status", dtype="str", 
               choices=["pending", "processing", "shipped", "delivered"])
)

Advanced Multi-Task Schemas

Complete Document Analysis

# Comprehensive invoice extraction
invoice_schema = (extractor.create_schema()
    # Document classification
    .classification("document_type", 
        ["invoice", "credit_note", "purchase_order", "receipt"])
    .classification("payment_status", 
        ["paid", "unpaid", "partial", "overdue"])
    
    # Key entities
    .entities({
        "company": "Company names (buyer or seller)",
        "person": "Contact person names",
        "date": "Important dates",
        "amount": "Monetary amounts"
    })
    
    # Structured information
    .structure("invoice_header")
        .field("invoice_number", dtype="str")
        .field("issue_date", dtype="str")
        .field("due_date", dtype="str")
        .field("vendor_name", dtype="str")
        .field("customer_name", dtype="str")
    
    .structure("line_item")
        .field("description", dtype="str")
        .field("quantity")
        .field("unit_price")
        .field("amount")
        .field("tax_rate", dtype="str", choices=["0%", "5%", "10%", "20%"])
    
    .structure("payment_info")
        .field("method", dtype="str", 
               choices=["bank_transfer", "credit_card", "check", "cash"])
        .field("terms", description="Payment terms like NET30")
        .field("bank_details", dtype="list")
)

Customer Feedback Analysis

# Customer feedback schema: two classifications, four entity labels and two
# structures ("issue", "suggestion") declared in one schema.
feedback_schema = (extractor.create_schema()
    # Overall classifications
    .classification("sentiment", ["positive", "negative", "neutral", "mixed"])
    # Labels given as a dict (label -> description). multi_label=True is
    # passed for this task only; presumably it allows several intents per
    # text. Confirm against the classification docs.
    .classification("intent", {
        "complaint": "Customer expressing dissatisfaction",
        "compliment": "Customer expressing satisfaction",
        "suggestion": "Customer providing improvement ideas",
        "question": "Customer asking for information"
    }, multi_label=True)
    
    # Extract mentioned entities
    .entities({
        "product": "Products or services mentioned",
        "feature": "Specific features discussed",
        "competitor": "Competing products mentioned",
        "price_mention": "Price points or cost references"
    })
    
    # Structured feedback components
    .structure("issue")
        .field("problem", dtype="str")
        # severity is limited to the three listed values.
        .field("severity", dtype="str", choices=["critical", "major", "minor"])
        .field("affected_area", dtype="list")
    
    # "suggestion" is used both as an intent label above and as a structure
    # name here.
    .structure("suggestion")
        .field("improvement", dtype="str")
        .field("benefit", description="Expected benefit of the suggestion")
)

News Article Analysis

# News article schema: three classifications, four entity labels and two
# structures ("quote", "claim") declared in one schema.
news_schema = (extractor.create_schema()
    # Article metadata
    .classification("category", 
        ["politics", "business", "technology", "sports", "entertainment"])
    .classification("bias", ["left", "center", "right", "neutral"])
    .classification("factuality", ["fact", "opinion", "analysis", "speculation"])
    
    # Key entities
    .entities({
        "person": "People mentioned in the article",
        "organization": "Companies, agencies, or groups",
        "location": "Places, cities, or countries",
        "event": "Named events or incidents"
    })
    
    # Structured content
    .structure("quote")
        .field("speaker", dtype="str")
        .field("statement", dtype="str")
        .field("context", description="Context of the quote")
    
    # Both structures declare a field named "statement"; each is added
    # under its own structure.
    .structure("claim")
        .field("statement", dtype="str")
        .field("source", dtype="str")
        .field("evidence", dtype="list")
)

Real-World Applications

E-commerce Product Listing

# E-commerce listing schema: two classifications, four entity labels and
# three structures ("product", "shipping", "seller") declared in one schema.
product_schema = (extractor.create_schema()
    # Listing classification
    .classification("condition", ["new", "used", "refurbished", "for_parts"])
    .classification("listing_type", ["buy_now", "auction", "best_offer"])
    
    # Extract key entities
    .entities({
        "brand": "Product brand or manufacturer",
        "model": "Specific model name or number",
        "color": "Product colors mentioned",
        "size": "Size specifications"
    })
    
    # Product details
    .structure("product")
        .field("title", dtype="str")
        .field("price", dtype="str")
        .field("features", dtype="list")
        .field("category", dtype="str")
    
    # Shipping information
    .structure("shipping")
        # dtype="list" is combined with choices here, unlike the other
        # choice fields in this tutorial, which use dtype="str".
        .field("method", dtype="list", 
               choices=["standard", "express", "overnight", "international"])
        .field("cost", dtype="str")
        .field("delivery_time", description="Estimated delivery timeframe")
    
    # Seller information
    .structure("seller")
        .field("name", dtype="str")
        .field("rating", dtype="str")
        .field("location", dtype="str")
)

Healthcare Clinical Note

# Clinical note schema: two classifications, five entity labels and three
# structures ("patient_info", "vital_signs", "prescription") in one schema.
clinical_schema = (extractor.create_schema()
    # Note classification
    .classification("visit_type", 
        ["initial_consultation", "follow_up", "emergency", "routine_checkup"])
    .classification("urgency", ["urgent", "routine", "elective"])
    
    # Medical entities
    .entities({
        "symptom": "Patient reported symptoms",
        "diagnosis": "Medical diagnoses or conditions",
        "medication": "Prescribed or mentioned medications",
        "procedure": "Medical procedures or tests",
        "body_part": "Anatomical references"
    })
    
    # Patient information
    .structure("patient_info")
        .field("name", dtype="str")
        # age is declared with dtype="str", not a numeric dtype.
        .field("age", dtype="str")
        .field("gender", dtype="str", choices=["male", "female", "other"])
        .field("chief_complaint", dtype="str")
    
    # Clinical findings
    # All vital signs are declared with dtype="str".
    .structure("vital_signs")
        .field("blood_pressure", dtype="str")
        .field("heart_rate", dtype="str")
        .field("temperature", dtype="str")
        .field("respiratory_rate", dtype="str")
    
    # Treatment plan
    # "medication" is both an entity label above and a field of this structure.
    .structure("prescription")
        .field("medication", dtype="str")
        .field("dosage", dtype="str")
        # frequency / duration pass no dtype (library default applies).
        .field("frequency")
        .field("duration")
        .field("route", dtype="str", choices=["oral", "IV", "topical", "injection"])
)
# Legal Document Analysis
# NOTE(review): in this copy the example directly follows the clinical schema
# with no section heading; presumably the heading was lost. Confirm against
# the upstream tutorial.
# Two classifications, five entity labels and two structures
# ("contract_term", "claim") declared in one schema.
legal_schema = (extractor.create_schema()
    # Document classification
    .classification("document_type", 
        ["contract", "memorandum", "brief", "motion", "order"])
    .classification("jurisdiction", 
        ["federal", "state", "local", "international"])
    
    # Legal entities
    .entities({
        "party": "Parties involved (plaintiff, defendant, etc.)",
        "attorney": "Legal representatives",
        "judge": "Judicial officers",
        "statute": "Laws or regulations cited",
        "case_citation": "Referenced legal cases"
    })
    
    # Contract terms
    .structure("contract_term")
        # clause_type is limited to the five listed values.
        .field("clause_type", dtype="str", 
               choices=["payment", "delivery", "warranty", "liability", "termination"])
        .field("obligation", dtype="str")
        .field("party_responsible", dtype="str")
        # deadline passes no dtype (library default applies).
        .field("deadline")
    
    # Legal claims
    .structure("claim")
        .field("type", dtype="str")
        .field("plaintiff", dtype="str")
        .field("defendant", dtype="str")
        .field("amount", dtype="str")
        .field("basis", description="Legal basis for the claim")
)

Using Confidence Scores and Character Positions with Combined Schemas

When using a combined schema, the include_confidence and include_spans parameters of extract() apply to every extraction type in the schema:

# One schema covering four task types: entities, a classification,
# a relation ("works_for") and a structure ("product").
schema = (extractor.create_schema()
    .entities(["person", "company"])
    .classification("sentiment", ["positive", "negative", "neutral"])
    .relations(["works_for"])
    .structure("product")
        .field("name", dtype="str")
        .field("price", dtype="str")
)

text = "Tim Cook works for Apple. The iPhone 15 costs $999. This is exciting!"
# Both flags are passed once, on the extract() call, not per task.
results = extractor.extract(
    text,
    schema,
    include_confidence=True,
    include_spans=True
)
# Illustrative output (confidence values are examples). 'start'/'end' are
# character offsets into `text` with an exclusive end, i.e.
# text[start:end] equals the 'text' value.
# Output: {
#     'entities': {
#         'person': [
#             {'text': 'Tim Cook', 'confidence': 0.95, 'start': 0, 'end': 8}
#         ],
#         'company': [
#             {'text': 'Apple', 'confidence': 0.92, 'start': 19, 'end': 24}
#         ]
#     },
#     'sentiment': {'label': 'positive', 'confidence': 0.88},
#     'relation_extraction': {
#         'works_for': [{
#             'head': {'text': 'Tim Cook', 'confidence': 0.95, 'start': 0, 'end': 8},
#             'tail': {'text': 'Apple', 'confidence': 0.92, 'start': 19, 'end': 24}
#         }]
#     },
#     'product': [{
#         'name': {'text': 'iPhone 15', 'confidence': 0.90, 'start': 30, 'end': 39},
#         'price': {'text': '$999', 'confidence': 0.88, 'start': 46, 'end': 50}
#     }]
# }

Note: The include_confidence and include_spans parameters work consistently across all extraction types (entities, classifications, relations, and structures) when using combined schemas.