11 KiB
11 KiB
GLiNER2 Combining Schemas Tutorial
Table of Contents
Why Combine Schemas
Combining schemas allows you to:
- Extract multiple types of information in one pass
- Maintain context between different extraction tasks
- Improve efficiency by avoiding multiple model calls
- Build comprehensive information extraction pipelines
Basic Combinations
Entities + Classification
from gliner2 import GLiNER2
extractor = GLiNER2.from_pretrained("your-model-name")
# Sentiment analysis with entity extraction
schema = (extractor.create_schema()
.entities(["person", "product", "company"])
.classification("sentiment", ["positive", "negative", "neutral"])
.classification("category", ["review", "news", "opinion"])
)
text = "Tim Cook announced that Apple's new iPhone is exceeding sales expectations."
results = extractor.extract(text, schema)
# Output: {
# 'entities': {
# 'person': ['Tim Cook'],
# 'product': ['iPhone'],
# 'company': ['Apple']
# },
# 'sentiment': 'positive',
# 'category': 'news'
# }
Entities + Structures
schema = (extractor.create_schema()
.entities({
"person": "Names of people mentioned",
"date": "Dates and time references"
})
.structure("appointment")
.field("patient", dtype="str")
.field("doctor", dtype="str")
.field("date")
.field("time")
.field("type", dtype="str", choices=["checkup", "followup", "consultation"])
)
text = """
Dr. Sarah Johnson confirmed the appointment with John Smith for
March 15th at 2:30 PM. This will be a follow-up consultation
regarding his previous visit on February 1st.
"""
results = extractor.extract(text, schema)
Classification + Structures
schema = (extractor.create_schema()
.classification("email_type",
["order_confirmation", "shipping_update", "promotional", "support"])
.classification("priority", ["urgent", "normal", "low"])
.structure("order_info")
.field("order_number", dtype="str")
.field("items")
.field("total", dtype="str")
.field("status", dtype="str",
choices=["pending", "processing", "shipped", "delivered"])
)
Advanced Multi-Task Schemas
Complete Document Analysis
# Comprehensive invoice extraction
invoice_schema = (extractor.create_schema()
# Document classification
.classification("document_type",
["invoice", "credit_note", "purchase_order", "receipt"])
.classification("payment_status",
["paid", "unpaid", "partial", "overdue"])
# Key entities
.entities({
"company": "Company names (buyer or seller)",
"person": "Contact person names",
"date": "Important dates",
"amount": "Monetary amounts"
})
# Structured information
.structure("invoice_header")
.field("invoice_number", dtype="str")
.field("issue_date", dtype="str")
.field("due_date", dtype="str")
.field("vendor_name", dtype="str")
.field("customer_name", dtype="str")
.structure("line_item")
.field("description", dtype="str")
.field("quantity")
.field("unit_price")
.field("amount")
.field("tax_rate", dtype="str", choices=["0%", "5%", "10%", "20%"])
.structure("payment_info")
.field("method", dtype="str",
choices=["bank_transfer", "credit_card", "check", "cash"])
.field("terms", description="Payment terms like NET30")
.field("bank_details", dtype="list")
)
Customer Feedback Analysis
feedback_schema = (extractor.create_schema()
# Overall classifications
.classification("sentiment", ["positive", "negative", "neutral", "mixed"])
.classification("intent", {
"complaint": "Customer expressing dissatisfaction",
"compliment": "Customer expressing satisfaction",
"suggestion": "Customer providing improvement ideas",
"question": "Customer asking for information"
}, multi_label=True)
# Extract mentioned entities
.entities({
"product": "Products or services mentioned",
"feature": "Specific features discussed",
"competitor": "Competing products mentioned",
"price_mention": "Price points or cost references"
})
# Structured feedback components
.structure("issue")
.field("problem", dtype="str")
.field("severity", dtype="str", choices=["critical", "major", "minor"])
.field("affected_area", dtype="list")
.structure("suggestion")
.field("improvement", dtype="str")
.field("benefit", description="Expected benefit of the suggestion")
)
News Article Analysis
news_schema = (extractor.create_schema()
# Article metadata
.classification("category",
["politics", "business", "technology", "sports", "entertainment"])
.classification("bias", ["left", "center", "right", "neutral"])
.classification("factuality", ["fact", "opinion", "analysis", "speculation"])
# Key entities
.entities({
"person": "People mentioned in the article",
"organization": "Companies, agencies, or groups",
"location": "Places, cities, or countries",
"event": "Named events or incidents"
})
# Structured content
.structure("quote")
.field("speaker", dtype="str")
.field("statement", dtype="str")
.field("context", description="Context of the quote")
.structure("claim")
.field("statement", dtype="str")
.field("source", dtype="str")
.field("evidence", dtype="list")
)
Real-World Applications
E-commerce Product Listing
product_schema = (extractor.create_schema()
# Listing classification
.classification("condition", ["new", "used", "refurbished", "for_parts"])
.classification("listing_type", ["buy_now", "auction", "best_offer"])
# Extract key entities
.entities({
"brand": "Product brand or manufacturer",
"model": "Specific model name or number",
"color": "Product colors mentioned",
"size": "Size specifications"
})
# Product details
.structure("product")
.field("title", dtype="str")
.field("price", dtype="str")
.field("features", dtype="list")
.field("category", dtype="str")
# Shipping information
.structure("shipping")
.field("method", dtype="list",
choices=["standard", "express", "overnight", "international"])
.field("cost", dtype="str")
.field("delivery_time", description="Estimated delivery timeframe")
# Seller information
.structure("seller")
.field("name", dtype="str")
.field("rating", dtype="str")
.field("location", dtype="str")
)
Healthcare Clinical Note
clinical_schema = (extractor.create_schema()
# Note classification
.classification("visit_type",
["initial_consultation", "follow_up", "emergency", "routine_checkup"])
.classification("urgency", ["urgent", "routine", "elective"])
# Medical entities
.entities({
"symptom": "Patient reported symptoms",
"diagnosis": "Medical diagnoses or conditions",
"medication": "Prescribed or mentioned medications",
"procedure": "Medical procedures or tests",
"body_part": "Anatomical references"
})
# Patient information
.structure("patient_info")
.field("name", dtype="str")
.field("age", dtype="str")
.field("gender", dtype="str", choices=["male", "female", "other"])
.field("chief_complaint", dtype="str")
# Clinical findings
.structure("vital_signs")
.field("blood_pressure", dtype="str")
.field("heart_rate", dtype="str")
.field("temperature", dtype="str")
.field("respiratory_rate", dtype="str")
# Treatment plan
.structure("prescription")
.field("medication", dtype="str")
.field("dosage", dtype="str")
.field("frequency")
.field("duration")
.field("route", dtype="str", choices=["oral", "IV", "topical", "injection"])
)
Legal Document Analysis
legal_schema = (extractor.create_schema()
# Document classification
.classification("document_type",
["contract", "memorandum", "brief", "motion", "order"])
.classification("jurisdiction",
["federal", "state", "local", "international"])
# Legal entities
.entities({
"party": "Parties involved (plaintiff, defendant, etc.)",
"attorney": "Legal representatives",
"judge": "Judicial officers",
"statute": "Laws or regulations cited",
"case_citation": "Referenced legal cases"
})
# Contract terms
.structure("contract_term")
.field("clause_type", dtype="str",
choices=["payment", "delivery", "warranty", "liability", "termination"])
.field("obligation", dtype="str")
.field("party_responsible", dtype="str")
.field("deadline")
# Legal claims
.structure("claim")
.field("type", dtype="str")
.field("plaintiff", dtype="str")
.field("defendant", dtype="str")
.field("amount", dtype="str")
.field("basis", description="Legal basis for the claim")
)
Using Confidence Scores and Character Positions with Combined Schemas
When using combined schemas, the include_confidence and include_spans parameters apply to all extraction types:
schema = (extractor.create_schema()
.entities(["person", "company"])
.classification("sentiment", ["positive", "negative", "neutral"])
.relations(["works_for"])
.structure("product")
.field("name", dtype="str")
.field("price", dtype="str")
)
text = "Tim Cook works for Apple. The iPhone 15 costs $999. This is exciting!"
results = extractor.extract(
text,
schema,
include_confidence=True,
include_spans=True
)
# Output: {
# 'entities': {
# 'person': [
# {'text': 'Tim Cook', 'confidence': 0.95, 'start': 0, 'end': 8}
# ],
# 'company': [
# {'text': 'Apple', 'confidence': 0.92, 'start': 19, 'end': 24}
# ]
# },
# 'sentiment': {'label': 'positive', 'confidence': 0.88},
# 'relation_extraction': {
# 'works_for': [{
# 'head': {'text': 'Tim Cook', 'confidence': 0.95, 'start': 0, 'end': 8},
# 'tail': {'text': 'Apple', 'confidence': 0.92, 'start': 19, 'end': 24}
# }]
# },
# 'product': [{
# 'name': {'text': 'iPhone 15', 'confidence': 0.90, 'start': 30, 'end': 39},
# 'price': {'text': '$999', 'confidence': 0.88, 'start': 46, 'end': 50}
# }]
# }
Note: The include_confidence and include_spans parameters work consistently across all extraction types (entities, classifications, relations, and structures) when using combined schemas.