# GLiNER2 Combining Schemas Tutorial ## Table of Contents - [Why Combine Schemas](#why-combine-schemas) - [Basic Combinations](#basic-combinations) - [Advanced Multi-Task Schemas](#advanced-multi-task-schemas) - [Real-World Applications](#real-world-applications) - [Using Confidence Scores and Character Positions with Combined Schemas](#using-confidence-scores-and-character-positions-with-combined-schemas) ## Why Combine Schemas Combining schemas allows you to: - Extract multiple types of information in one pass - Maintain context between different extraction tasks - Improve efficiency by avoiding multiple model calls - Build comprehensive information extraction pipelines ## Basic Combinations ### Entities + Classification ```python from gliner2 import GLiNER2 extractor = GLiNER2.from_pretrained("your-model-name") # Sentiment analysis with entity extraction schema = (extractor.create_schema() .entities(["person", "product", "company"]) .classification("sentiment", ["positive", "negative", "neutral"]) .classification("category", ["review", "news", "opinion"]) ) text = "Tim Cook announced that Apple's new iPhone is exceeding sales expectations." results = extractor.extract(text, schema) # Output: { # 'entities': { # 'person': ['Tim Cook'], # 'product': ['iPhone'], # 'company': ['Apple'] # }, # 'sentiment': 'positive', # 'category': 'news' # } ``` ### Entities + Structures ```python schema = (extractor.create_schema() .entities({ "person": "Names of people mentioned", "date": "Dates and time references" }) .structure("appointment") .field("patient", dtype="str") .field("doctor", dtype="str") .field("date") .field("time") .field("type", dtype="str", choices=["checkup", "followup", "consultation"]) ) text = """ Dr. Sarah Johnson confirmed the appointment with John Smith for March 15th at 2:30 PM. This will be a follow-up consultation regarding his previous visit on February 1st. 
""" results = extractor.extract(text, schema) ``` ### Classification + Structures ```python schema = (extractor.create_schema() .classification("email_type", ["order_confirmation", "shipping_update", "promotional", "support"]) .classification("priority", ["urgent", "normal", "low"]) .structure("order_info") .field("order_number", dtype="str") .field("items") .field("total", dtype="str") .field("status", dtype="str", choices=["pending", "processing", "shipped", "delivered"]) ) ``` ## Advanced Multi-Task Schemas ### Complete Document Analysis ```python # Comprehensive invoice extraction invoice_schema = (extractor.create_schema() # Document classification .classification("document_type", ["invoice", "credit_note", "purchase_order", "receipt"]) .classification("payment_status", ["paid", "unpaid", "partial", "overdue"]) # Key entities .entities({ "company": "Company names (buyer or seller)", "person": "Contact person names", "date": "Important dates", "amount": "Monetary amounts" }) # Structured information .structure("invoice_header") .field("invoice_number", dtype="str") .field("issue_date", dtype="str") .field("due_date", dtype="str") .field("vendor_name", dtype="str") .field("customer_name", dtype="str") .structure("line_item") .field("description", dtype="str") .field("quantity") .field("unit_price") .field("amount") .field("tax_rate", dtype="str", choices=["0%", "5%", "10%", "20%"]) .structure("payment_info") .field("method", dtype="str", choices=["bank_transfer", "credit_card", "check", "cash"]) .field("terms", description="Payment terms like NET30") .field("bank_details", dtype="list") ) ``` ### Customer Feedback Analysis ```python feedback_schema = (extractor.create_schema() # Overall classifications .classification("sentiment", ["positive", "negative", "neutral", "mixed"]) .classification("intent", { "complaint": "Customer expressing dissatisfaction", "compliment": "Customer expressing satisfaction", "suggestion": "Customer providing improvement ideas", 
"question": "Customer asking for information" }, multi_label=True) # Extract mentioned entities .entities({ "product": "Products or services mentioned", "feature": "Specific features discussed", "competitor": "Competing products mentioned", "price_mention": "Price points or cost references" }) # Structured feedback components .structure("issue") .field("problem", dtype="str") .field("severity", dtype="str", choices=["critical", "major", "minor"]) .field("affected_area", dtype="list") .structure("suggestion") .field("improvement", dtype="str") .field("benefit", description="Expected benefit of the suggestion") ) ``` ### News Article Analysis ```python news_schema = (extractor.create_schema() # Article metadata .classification("category", ["politics", "business", "technology", "sports", "entertainment"]) .classification("bias", ["left", "center", "right", "neutral"]) .classification("factuality", ["fact", "opinion", "analysis", "speculation"]) # Key entities .entities({ "person": "People mentioned in the article", "organization": "Companies, agencies, or groups", "location": "Places, cities, or countries", "event": "Named events or incidents" }) # Structured content .structure("quote") .field("speaker", dtype="str") .field("statement", dtype="str") .field("context", description="Context of the quote") .structure("claim") .field("statement", dtype="str") .field("source", dtype="str") .field("evidence", dtype="list") ) ``` ## Real-World Applications ### E-commerce Product Listing ```python product_schema = (extractor.create_schema() # Listing classification .classification("condition", ["new", "used", "refurbished", "for_parts"]) .classification("listing_type", ["buy_now", "auction", "best_offer"]) # Extract key entities .entities({ "brand": "Product brand or manufacturer", "model": "Specific model name or number", "color": "Product colors mentioned", "size": "Size specifications" }) # Product details .structure("product") .field("title", dtype="str") .field("price", 
dtype="str") .field("features", dtype="list") .field("category", dtype="str") # Shipping information .structure("shipping") .field("method", dtype="list", choices=["standard", "express", "overnight", "international"]) .field("cost", dtype="str") .field("delivery_time", description="Estimated delivery timeframe") # Seller information .structure("seller") .field("name", dtype="str") .field("rating", dtype="str") .field("location", dtype="str") ) ``` ### Healthcare Clinical Note ```python clinical_schema = (extractor.create_schema() # Note classification .classification("visit_type", ["initial_consultation", "follow_up", "emergency", "routine_checkup"]) .classification("urgency", ["urgent", "routine", "elective"]) # Medical entities .entities({ "symptom": "Patient reported symptoms", "diagnosis": "Medical diagnoses or conditions", "medication": "Prescribed or mentioned medications", "procedure": "Medical procedures or tests", "body_part": "Anatomical references" }) # Patient information .structure("patient_info") .field("name", dtype="str") .field("age", dtype="str") .field("gender", dtype="str", choices=["male", "female", "other"]) .field("chief_complaint", dtype="str") # Clinical findings .structure("vital_signs") .field("blood_pressure", dtype="str") .field("heart_rate", dtype="str") .field("temperature", dtype="str") .field("respiratory_rate", dtype="str") # Treatment plan .structure("prescription") .field("medication", dtype="str") .field("dosage", dtype="str") .field("frequency") .field("duration") .field("route", dtype="str", choices=["oral", "IV", "topical", "injection"]) ) ``` ### Legal Document Analysis ```python legal_schema = (extractor.create_schema() # Document classification .classification("document_type", ["contract", "memorandum", "brief", "motion", "order"]) .classification("jurisdiction", ["federal", "state", "local", "international"]) # Legal entities .entities({ "party": "Parties involved (plaintiff, defendant, etc.)", "attorney": "Legal 
representatives", "judge": "Judicial officers", "statute": "Laws or regulations cited", "case_citation": "Referenced legal cases" }) # Contract terms .structure("contract_term") .field("clause_type", dtype="str", choices=["payment", "delivery", "warranty", "liability", "termination"]) .field("obligation", dtype="str") .field("party_responsible", dtype="str") .field("deadline") # Legal claims .structure("claim") .field("type", dtype="str") .field("plaintiff", dtype="str") .field("defendant", dtype="str") .field("amount", dtype="str") .field("basis", description="Legal basis for the claim") ) ``` ## Using Confidence Scores and Character Positions with Combined Schemas When using combined schemas, the `include_confidence` and `include_spans` parameters apply to all extraction types: ```python schema = (extractor.create_schema() .entities(["person", "company"]) .classification("sentiment", ["positive", "negative", "neutral"]) .relations(["works_for"]) .structure("product") .field("name", dtype="str") .field("price", dtype="str") ) text = "Tim Cook works for Apple. The iPhone 15 costs $999. This is exciting!" 
results = extractor.extract( text, schema, include_confidence=True, include_spans=True ) # Output: { # 'entities': { # 'person': [ # {'text': 'Tim Cook', 'confidence': 0.95, 'start': 0, 'end': 8} # ], # 'company': [ # {'text': 'Apple', 'confidence': 0.92, 'start': 19, 'end': 24} # ] # }, # 'sentiment': {'label': 'positive', 'confidence': 0.88}, # 'relation_extraction': { # 'works_for': [{ # 'head': {'text': 'Tim Cook', 'confidence': 0.95, 'start': 0, 'end': 8}, # 'tail': {'text': 'Apple', 'confidence': 0.92, 'start': 19, 'end': 24} # }] # }, # 'product': [{ # 'name': {'text': 'iPhone 15', 'confidence': 0.90, 'start': 30, 'end': 39}, # 'price': {'text': '$999', 'confidence': 0.88, 'start': 46, 'end': 50} # }] # } ``` **Note**: The `include_confidence` and `include_spans` parameters work consistently across all extraction types (entities, classifications, relations, and structures) when using combined schemas.