Jared AI Hub
Published on

Structured Outputs: Getting Reliable JSON from LLMs

Authors
  • avatar
    Name
    Jared Chung
    Twitter

Introduction

Free-form text is great for chat, but production applications need structured data. You need JSON for APIs, objects for code, and consistent formats for downstream processing.

The challenge: LLMs are trained to generate natural language, not valid JSON. Getting reliable structured outputs requires the right techniques.

Structured Outputs Flow

The Structured Output Spectrum

From least to most reliable:

| Method | Reliability | Flexibility | Provider Support |
| --- | --- | --- | --- |
| Prompt engineering | Low | High | All |
| JSON mode | Medium | Medium | Most |
| Function calling | High | Medium | Most |
| Structured outputs | Very High | Medium | OpenAI, Anthropic |
| Constrained decoding | Highest | Low | Outlines, LMQL |

Method 1: Prompt Engineering

The simplest approach sometimes works:

def extract_with_prompt(text: str) -> dict:
    """Ask the model for name/age/occupation as JSON and parse the raw reply.

    The reply is passed to ``json.loads`` exactly as returned; nothing is
    stripped, repaired, or validated first.
    """
    instruction = f"""Extract the following information from the text as JSON:
    - name (string)
    - age (integer)
    - occupation (string)

    Text: {text}

    Return ONLY valid JSON, no other text:"""

    # No cleanup step: parsing succeeds only if the reply is pure JSON.
    return json.loads(llm.generate(instruction))

Problems:

  • Model might add markdown code blocks
  • Extra explanatory text
  • Invalid JSON syntax
  • Missing or extra fields

Method 2: JSON Mode

Most providers support a JSON mode:

OpenAI

import json

from openai import OpenAI

client = OpenAI()

# response_format={"type": "json_object"} asks for JSON output; the field
# names are only described in the system message, not enforced by a schema.
response = client.chat.completions.create(
    model="gpt-4o",
    response_format={"type": "json_object"},
    messages=[
        {"role": "system", "content": "Extract data as JSON with fields: name, age, occupation"},
        {"role": "user", "content": "John is a 30-year-old software engineer."}
    ]
)

# The message content is a string, so it still has to be decoded.
data = json.loads(response.choices[0].message.content)
# {"name": "John", "age": 30, "occupation": "software engineer"}

Anthropic

import json

from anthropic import Anthropic

client = Anthropic()

# No JSON-specific request parameter is passed here: the output format is
# requested purely through the prompt text, which ends with "JSON:".
response = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    messages=[
        {"role": "user", "content": """Extract as JSON: {"name": str, "age": int, "occupation": str}

        Text: John is a 30-year-old software engineer.

        JSON:"""}
    ]
)

# The first content block's text is parsed as-is.
data = json.loads(response.content[0].text)

Better, but:

  • Schema not enforced
  • Can still get wrong types
  • Might miss required fields

Method 3: Function Calling / Tool Use

Define your schema as a function:

OpenAI Function Calling

from openai import OpenAI
import json

client = OpenAI()

# A single tool definition; its "parameters" entry is a JSON Schema object
# describing the three fields to extract, all listed as required.
tools = [{
    "type": "function",
    "function": {
        "name": "extract_person",
        "description": "Extract person information from text",
        "parameters": {
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "Person's full name"},
                "age": {"type": "integer", "description": "Person's age in years"},
                "occupation": {"type": "string", "description": "Person's job title"}
            },
            "required": ["name", "age", "occupation"]
        }
    }
}]

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "John is a 30-year-old software engineer."}],
    tools=tools,
    # Names extract_person explicitly rather than leaving the choice open.
    tool_choice={"type": "function", "function": {"name": "extract_person"}}
)

# Parse the function call
# Only the first tool call is read; its arguments are decoded with json.loads.
tool_call = response.choices[0].message.tool_calls[0]
data = json.loads(tool_call.function.arguments)

Anthropic Tool Use

from anthropic import Anthropic

client = Anthropic()

# A single tool whose input_schema is a JSON Schema object describing the
# three fields to extract, all listed as required.
tools = [{
    "name": "extract_person",
    "description": "Extract person information from text",
    "input_schema": {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "age": {"type": "integer"},
            "occupation": {"type": "string"}
        },
        "required": ["name", "age", "occupation"]
    }
}]

response = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    tools=tools,
    # Force the extract_person tool (mirrors the OpenAI example's tool_choice);
    # without this the model may reply in plain text and no tool_use block exists.
    tool_choice={"type": "tool", "name": "extract_person"},
    messages=[{"role": "user", "content": "John is a 30-year-old software engineer."}]
)

# Find tool use block
# Take the first tool_use block; fail loudly instead of leaving `data` unbound.
data = next(
    (block.input for block in response.content if block.type == "tool_use"),
    None,
)
if data is None:
    raise ValueError("Model response contained no tool_use block")

Method 4: Native Structured Outputs

The most reliable option when available.

OpenAI Structured Outputs

from openai import OpenAI
from pydantic import BaseModel

client = OpenAI()

# Pydantic model handed to the API as response_format below; its three
# fields describe the object the response is parsed into.
class Person(BaseModel):
    name: str
    age: int
    occupation: str

# parse() receives the model class itself rather than a hand-written schema.
response = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",
    messages=[{"role": "user", "content": "John is a 30-year-old software engineer."}],
    response_format=Person
)

# The parsed result is read from the first choice's message.
person = response.choices[0].message.parsed
# Person(name='John', age=30, occupation='software engineer')

Complex Nested Schemas

from pydantic import BaseModel, Field
from typing import List, Optional
from enum import Enum

# Closed set of priority labels; subclassing str makes each member a string value.
class Priority(str, Enum):
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"

# One unit of work inside a project.
class Task(BaseModel):
    title: str
    description: str
    priority: Priority
    # ge=0 constrains the estimate to be non-negative.
    estimated_hours: float = Field(ge=0)

# Top-level schema: a named project containing a list of nested Task objects.
class Project(BaseModel):
    name: str
    tasks: List[Task]
    # Optional and defaults to None; kept as a plain string, not a date type.
    deadline: Optional[str] = None

# Request a Project-shaped response; the nested Task/Priority models come
# along because Project references them.
response = client.beta.chat.completions.parse(
    model="gpt-4o",
    messages=[{
        "role": "user",
        "content": "Create a project plan for building a REST API with 3 tasks"
    }],
    response_format=Project
)

project = response.choices[0].message.parsed
# Print one line per task: title, priority label, and estimated hours.
for task in project.tasks:
    print(f"- {task.title} ({task.priority.value}): {task.estimated_hours}h")

Method 5: Instructor Library

Instructor wraps a provider's client so the same Pydantic-based call works with multiple providers:

import instructor
from openai import OpenAI
from pydantic import BaseModel

# Patch the client
# Patch the client
# The wrapped client is what accepts the response_model argument used below.
client = instructor.from_openai(OpenAI())

# Target schema for the extraction.
class Person(BaseModel):
    name: str
    age: int
    occupation: str

# Automatically handles parsing and retries
# The call result is used directly as a Person (see person.name below).
person = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "John is a 30-year-old engineer."}],
    response_model=Person,
)

print(person.name)  # John

With Anthropic

import instructor
from anthropic import Anthropic

# Wrap the Anthropic client the same way the OpenAI client was wrapped.
client = instructor.from_anthropic(Anthropic())

# NOTE: Person is not defined in this snippet; it must already be in scope
# (presumably the model from the previous example).
person = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    messages=[{"role": "user", "content": "John is a 30-year-old engineer."}],
    response_model=Person,
)

Retries and Validation

from typing import List

from pydantic import BaseModel, field_validator

# Schema for a drafted email; recipients get an extra validation pass below.
class Email(BaseModel):
    subject: str
    body: str
    recipients: List[str]

    @field_validator("recipients")
    @classmethod
    def validate_emails(cls, v):
        """Reject the recipient list if any entry lacks an "@".

        Raises:
            ValueError: naming the first entry that contains no "@".
        """
        for email in v:
            # Minimal sanity check only; this is not full address validation.
            if "@" not in email:
                raise ValueError(f"Invalid email: {email}")
        return v

# Instructor automatically retries if validation fails
# NOTE: `client` is not created in this snippet; response_model and
# max_retries are used here, so it is presumably the instructor-wrapped
# client from the earlier example -- confirm when running standalone.
email = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Draft an email to john@example.com about the meeting"}],
    response_model=Email,
    max_retries=3,  # Retry if validation fails
)

Method 6: Constrained Decoding

For maximum reliability with local models:

Outlines

import json

import outlines

model = outlines.models.transformers("mistralai/Mistral-7B-v0.1")

# Define schema
schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
        "occupation": {"type": "string"}
    },
    "required": ["name", "age", "occupation"]
}

# generate.json accepts a JSON Schema *string* (or a Pydantic model / function),
# so the dict is serialized before being handed over.
generator = outlines.generate.json(model, json.dumps(schema))

result = generator("Extract from: John is a 30-year-old engineer")
# Guaranteed valid JSON matching schema

With Pydantic

from pydantic import BaseModel
import outlines

# Same three fields as the dict schema, expressed as a Pydantic model.
class Person(BaseModel):
    name: str
    age: int
    occupation: str

# NOTE: `model` is not created in this snippet; it must already be in scope
# (presumably the Outlines model loaded in the previous example).
generator = outlines.generate.json(model, Person)
person = generator("John is a 30-year-old engineer")

Handling Edge Cases

Missing Information

from pydantic import BaseModel
from typing import Optional

# Only name is mandatory; the other two fields default to None.
class Person(BaseModel):
    name: str
    age: Optional[int] = None  # Might not be mentioned
    occupation: Optional[str] = None

# Add instructions
# The prompt is only defined here; this snippet does not send it to a model.
prompt = """Extract person info. Use null for missing fields.
Text: John works in tech.
"""

# Result: Person(name='John', age=None, occupation='tech')

Ambiguous Data

from pydantic import BaseModel, Field
from typing import List

# A single extracted value together with how sure the model is about it.
class Extraction(BaseModel):
    # Always a string, even for numeric facts (the example result uses '30').
    value: str
    # Constrained to the closed interval [0, 1].
    confidence: float = Field(ge=0, le=1)
    # Other candidate values; empty list by default.
    alternatives: List[str] = []

# Each attribute is wrapped in an Extraction instead of being a bare value.
class Person(BaseModel):
    name: Extraction
    age: Extraction

# Prompt for uncertainty
# The prompt is only defined here; this snippet does not send it to a model.
prompt = """Extract person info with confidence scores.
If uncertain, provide alternatives.
Text: Jon or John is around 30.
"""

# Result shows uncertainty
# Person(
#   name=Extraction(value='John', confidence=0.6, alternatives=['Jon']),
#   age=Extraction(value='30', confidence=0.8, alternatives=['29', '31'])
# )

Lists and Multiple Entities

# One person mentioned in the text.
class Person(BaseModel):
    name: str
    # Defaults to None so a person with no stated age still validates when
    # the field is omitted; Optional[...] alone would keep the field required.
    age: Optional[int] = None

# Wrapper so a single response can carry any number of people.
class ExtractionResult(BaseModel):
    people: List[Person]

# The prompt is only defined here; this snippet does not send it to a model.
prompt = """Extract all people mentioned.
Text: John (30) and Mary work together. Bob joined recently.
"""

# Expected shape: one Person per name, with age=None where no age is given.
# Result: ExtractionResult(people=[
#   Person(name='John', age=30),
#   Person(name='Mary', age=None),
#   Person(name='Bob', age=None)
# ])

Real-World Patterns

Entity Extraction Pipeline

from pydantic import BaseModel
from typing import List, Optional
import instructor
from openai import OpenAI

# Instructor-wrapped client; the functions below pass response_model to it.
client = instructor.from_openai(OpenAI())

# One named entity found in the text.
class Entity(BaseModel):
    text: str
    type: str  # PERSON, ORG, LOCATION, etc.
    # start/end are printed as a [start:end] span by the usage example;
    # presumably character offsets into the input text, as the prompt requests.
    start: int
    end: int

# Full extraction output: the entity list plus a raw_text string.
class ExtractionResult(BaseModel):
    entities: List[Entity]
    raw_text: str

def extract_entities(text: str) -> ExtractionResult:
    """Run named-entity extraction on ``text`` via the shared ``client``.

    Sends a fixed system instruction plus ``text`` as the user message and
    requests the reply as an ``ExtractionResult``.
    """
    # The continuation lines' indentation is part of the prompt string.
    system_prompt = """Extract named entities.
            Include character positions for each entity.
            Types: PERSON, ORG, LOCATION, DATE, MONEY"""
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    extraction = client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        response_model=ExtractionResult,
    )
    return extraction

# Example run: print each entity as "TYPE: text [start:end]".
result = extract_entities("Apple CEO Tim Cook announced...")
for entity in result.entities:
    print(f"{entity.type}: {entity.text} [{entity.start}:{entity.end}]")

Classification with Reasoning

from enum import Enum

# The three allowed sentiment labels; members are also plain strings.
class Sentiment(str, Enum):
    POSITIVE = "positive"
    NEGATIVE = "negative"
    NEUTRAL = "neutral"

# Classification result: a label, a score, and the model's explanation.
class Classification(BaseModel):
    sentiment: Sentiment
    # Plain float with no range constraint declared here; the usage example
    # formats it as a percentage.
    confidence: float
    reasoning: str

def classify_sentiment(text: str) -> Classification:
    """Classify the sentiment of ``text`` via the shared ``client``.

    Sends a fixed system instruction plus ``text`` as the user message and
    requests the reply as a ``Classification``.
    """
    conversation = [
        {"role": "system", "content": "Classify sentiment and explain your reasoning."},
        {"role": "user", "content": text},
    ]
    verdict = client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        response_model=Classification,
    )
    return verdict

# Example run: prints "<label> (<confidence as percent>): <reasoning>".
result = classify_sentiment("The product is okay but shipping was slow")
print(f"{result.sentiment.value} ({result.confidence:.0%}): {result.reasoning}")

Data Transformation

# Wrapper for unprocessed input text. Not referenced by normalize_contact
# below, which takes a plain str.
class RawRecord(BaseModel):
    text: str

# Target shape for a cleaned-up contact. The formats noted on each field are
# requested through the prompt; no validator in this model enforces them.
class NormalizedRecord(BaseModel):
    name: str
    phone: str  # Normalized format: +1-XXX-XXX-XXXX
    email: str  # Lowercase

def normalize_contact(raw: str) -> NormalizedRecord:
    """Normalize a free-form contact string via the shared ``client``.

    Sends the formatting rules as the system message and ``raw`` as the user
    message, requesting the reply as a ``NormalizedRecord``.
    """
    # The continuation lines' indentation is part of the prompt string.
    formatting_rules = """Normalize contact information.
            Phone: +1-XXX-XXX-XXXX format
            Email: lowercase
            Name: Title Case"""
    conversation = [
        {"role": "system", "content": formatting_rules},
        {"role": "user", "content": raw},
    ]
    normalized = client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        response_model=NormalizedRecord,
    )
    return normalized

# Example run on an upper-case name, mixed-case email, and US-style phone.
result = normalize_contact("JOHN DOE, john.doe@GMAIL.COM, (555) 123-4567")
# Expected output shown by the article:
# NormalizedRecord(
#   name='John Doe',
#   phone='+1-555-123-4567',
#   email='john.doe@gmail.com'
# )

Best Practices

  1. Use Pydantic for schemas: Type hints and validation built-in
  2. Make fields Optional when appropriate: Don't force hallucination
  3. Add Field descriptions: Helps the model understand intent
  4. Implement retries: Structured outputs can still fail
  5. Validate outputs: Don't trust, verify
  6. Log failures: Track which schemas cause problems
  7. Start simple: Add complexity only when needed

Conclusion

Structured outputs have evolved from hacky prompt engineering to first-class features. Use native structured outputs when available, Instructor for cross-provider compatibility, and constrained decoding for maximum reliability.

The key is matching your reliability requirements to the right technique and always validating the output before trusting it.