How to extract structured data using GPT5
This is the basic idea.
First, declare a Pydantic model that describes the data you want to extract, like this:
from datetime import date
from enum import Enum
from typing import Optional, List, Union, TypeVar, Generic

from pydantic import BaseModel, Field, field_validator, model_validator
from typing_extensions import TypeAlias
# Enums
# Ordinal language level, from 0 (no speech) up to 3 (sentences / naming).
# RegressionRecord builds members of this enum from a 0-3 language score.
# NOTE: documented with `#` comments rather than a docstring because pydantic
# may copy a class docstring into the generated JSON schema.
class LanguageLevel(Enum):
    NO_SPEECH = 0
    BABBLING_SINGLE_WORDS = 1
    TWO_WORD_PHRASES = 2
    SENTENCES_NAMING = 3
# Ordinal social-communication level, from 0 (no engagement) up to
# 3 (complex interaction). RegressionRecord builds members of this enum from a
# 0-3 social-communication score.
class SocialCommunicationLevel(Enum):
    NO_ENGAGEMENT = 0
    BASIC_ENGAGEMENT = 1
    RECIPROCAL_INTERACTION = 2
    COMPLEX_INTERACTION = 3
# Ordinal adaptive-behavior level, from 0 (no independence) up to
# 3 (age appropriate). RegressionRecord builds members of this enum from a
# 0-3 adaptive-behavior score.
class AdaptiveBehaviorLevel(Enum):
    NO_INDEPENDENCE = 0
    BASIC_SELF_HELP = 1
    MODERATE_INDEPENDENCE = 2
    AGE_APPROPRIATE = 3
# Category a developmental milestone belongs to. These members are the values
# of MILESTONE_TYPE_MAPPING and are what DevelopmentalScore counts by.
class MilestoneType(Enum):
    LANGUAGE = "language"
    SOCIAL_COMMUNICATION = "social_communication"
    ADAPTIVE_BEHAVIOR = "adaptive_behavior"
# Known developmental milestone names. MILESTONE_TYPE_MAPPING assigns each
# value to a MilestoneType; the group comments below follow that mapping.
class MilestoneName(Enum):
    # Mapped to MilestoneType.LANGUAGE
    BABBLING = "babbling"
    SINGLE_WORDS = "single_words"
    TWO_WORD_PHRASES = "two_word_phrases"
    NAMING_OBJECTS = "naming_objects"

    # Mapped to MilestoneType.SOCIAL_COMMUNICATION
    EYE_CONTACT = "eye_contact"
    RESPONDING_TO_NAME = "responding_to_name"
    POINTING = "pointing"
    WAVING = "waving"
    CLAPPING = "clapping"
    JOINT_ATTENTION = "joint_attention"
    TURN_TAKING = "turn_taking"

    # Mapped to MilestoneType.ADAPTIVE_BEHAVIOR
    FEEDING_WITH_HELP = "feeding_with_help"
    USING_SPOON = "using_spoon"
    DANCING = "dancing"
    DRESSING_WITH_HELP = "dressing_with_help"
    SELF_FEEDING = "self_feeding"
# Recognised repetitive-behavior labels. BehaviorRecord converts matching
# input strings to these members and keeps any other string as free text.
class RepetitiveBehaviorType(Enum):
    NONE = "none"
    SPINNING = "spinning"
    FLAPPING = "flapping"
    STARING = "staring"
    ZONING_OUT = "zoning_out"
    HEAD_BANGING = "head_banging"
    BITING = "biting"
# Severity level recorded in DiagnosisRecord.asd_severity, as an integer 1-3.
# NOTE(review): presumably the three clinical ASD support levels - confirm.
class ASDSeverity(Enum):
    LEVEL_1 = 1
    LEVEL_2 = 2
    LEVEL_3 = 3
# Metals that HeavyMetalTest.metal_type recognises by name; that field also
# accepts an arbitrary string for anything not listed here.
class MetalType(Enum):
    MERCURY = "mercury"
    ALUMINUM = "aluminum"
    LEAD = "lead"
    ARSENIC = "arsenic"
    CADMIUM = "cadmium"
    NICKEL = "nickel"
    THALLIUM = "thallium"
    ANTIMONY = "antimony"
# Intervention categories listed in InterventionRecord.interventions.
class InterventionType(Enum):
    SPEECH_THERAPY = "speech_therapy"
    OCCUPATIONAL_THERAPY = "occupational_therapy"
    ABA = "aba"
    DETOX = "detox"
    SOCIAL_SKILLS = "social_skills"
    OTHER = "other"
# Co-occurring conditions listed in VAERSReport.comorbidities.
class ComorbidityType(Enum):
    ADHD = "adhd"
    COLITIS = "colitis"
    ALLERGIES = "allergies"
    ASTHMA = "asthma"
    ECZEMA = "eczema"
    SEIZURE_DISORDER = "seizure_disorder"
    OCD = "ocd"
    ANXIETY = "anxiety"
    OTHER = "other"
# Environmental exposure categories listed in
# VAERSReport.environmental_exposures.
class ExposureType(Enum):
    MATERNAL_AMALGAM = "maternal_amalgam"
    POWER_PLANT = "power_plant"
    OTHER_MERCURY = "other_mercury"
    UNKNOWN = "unknown"
# Citable Models
# A required non-negative integer together with the citation ids backing it.
# `citations` defaults to an empty list and may also be None.
# NOTE: documented with `#` comments rather than a docstring because pydantic
# copies a model's docstring into the generated JSON schema.
class CitableInt(BaseModel):
    value: int = Field(
        ...,
        ge=0,
        description="Non-negative integer value",
    )
    citations: Optional[List[int]] = Field(
        default_factory=list,
        description="Citations for the value",
    )
# A required float (constrained to be >= 0) together with the citation ids
# backing it. `citations` defaults to an empty list and may also be None.
class CitableFloat(BaseModel):
    value: float = Field(
        ...,
        ge=0,
        description="Floating point value which allows decimals",
    )
    citations: Optional[List[int]] = Field(
        default_factory=list,
        description="Citations for the value",
    )
# A required boolean together with the citation ids backing it.
# `citations` defaults to an empty list and may also be None.
class CitableBool(BaseModel):
    value: bool = Field(
        ...,
        description="Boolean value",
    )
    citations: Optional[List[int]] = Field(
        default_factory=list,
        description="Citations for the value",
    )
# Type parameter restricted to Enum subclasses.
EnumT = TypeVar("EnumT", bound=Enum)


# A required enum member together with the citation ids backing it.
# Parametrize with the concrete enum, e.g. CitableEnum[ASDSeverity].
class CitableEnum(BaseModel, Generic[EnumT]):
    value: EnumT = Field(
        ...,
        description="Enum value",
    )
    citations: Optional[List[int]] = Field(
        default_factory=list,
        description="Citations for the enum value",
    )
# Item type for CitableList. Deliberately unbound: the file parametrizes the
# model with enums (e.g. CitableList[InterventionType]) and with plain `str`
# (CitableList[str]), which an Enum-bound TypeVar does not permit.
ItemT = TypeVar("ItemT")


# A list of items plus the citation ids backing the list as a whole.
# Both fields default to an empty list; `citations` may also be None.
class CitableList(BaseModel, Generic[ItemT]):
    value: List[ItemT] = Field(
        default_factory=list,
        description="List of enum values",
    )
    citations: Optional[List[int]] = Field(
        default_factory=list,
        description="Citations for the list",
    )
# Other Models
# One heavy-metal test result. Every field is required.
class HeavyMetalTest(BaseModel):
    # A known MetalType, or any other string for metals not in the enum.
    metal_type: Union[MetalType, str]
    # Measured amount, expressed in `unit`.
    value: float
    unit: str
    is_elevated: bool
    # Citation ids backing this result.
    citations: List[int]
# One developmental milestone with its category, age, regression details and
# the citation ids backing each piece. Every field is required.
class DevelopmentalMilestone(BaseModel):
    # A known MilestoneName, or any other string for free-form milestones.
    milestone_name: Union[MilestoneName, str]
    milestone_type: MilestoneType
    milestone_citations: List[int]
    milestone_age_in_months: int
    milestone_age_in_months_citations: List[int]
    milestone_has_regressed: bool
    # NOTE(review): named "date" but typed as an int number of months;
    # presumably the age at regression onset - confirm.
    regression_start_date_in_months: int
    regression_start_date_in_months_citations: List[int]
    progressed_after_regression: bool

    @model_validator(mode="before")
    @classmethod
    def validate_milestone_type(cls, values):
        """Reject a known milestone whose type contradicts MILESTONE_TYPE_MAPPING.

        Runs before field validation, so both fields may still be raw strings
        or already be enum members; both forms are compared by enum value.
        Free-form names that are not in the mapping are accepted with any
        type, since ``milestone_name`` explicitly allows arbitrary strings.

        Args:
            values: Raw model input. Anything that is not a dict is passed
                through untouched for pydantic to handle.

        Returns:
            ``values`` unchanged.

        Raises:
            ValueError: If a mapped milestone name is paired with a different
                milestone type.
        """
        if not isinstance(values, dict):
            return values

        milestone_name = values.get("milestone_name")
        milestone_type = values.get("milestone_type")
        if not (milestone_name and milestone_type):
            return values

        name_key = milestone_name.value if isinstance(milestone_name, MilestoneName) else milestone_name
        if not isinstance(name_key, str):
            # Not a usable lookup key; field validation reports the bad type.
            return values

        expected_type = MILESTONE_TYPE_MAPPING.get(name_key)
        if expected_type is None:
            # Free-form milestone: nothing to cross-check against.
            return values

        actual = milestone_type.value if isinstance(milestone_type, MilestoneType) else milestone_type
        if actual != expected_type.value:
            raise ValueError(
                f"Milestone {milestone_name} should have milestone_type {expected_type}, not {milestone_type}"
            )
        return values
# Turns a list of milestones into three sub-scores (each capped at 3) and
# their sum. Scoring only looks at each milestone's `milestone_type`.
class DevelopmentalScore(BaseModel):
    milestones: List[DevelopmentalMilestone] = Field(
        default_factory=list,
        description="List of achieved developmental milestones",
    )

    def _achieved(self, kind: MilestoneType) -> int:
        """Return how many of ``self.milestones`` have type ``kind``."""
        return len([entry for entry in self.milestones if entry.milestone_type == kind])

    @property
    def language_score(self) -> int:
        """Number of language milestones, capped at 3."""
        return min(self._achieved(MilestoneType.LANGUAGE), 3)

    @property
    def social_communication_score(self) -> int:
        """Half the number of social-communication milestones (floored), capped at 3."""
        pairs = self._achieved(MilestoneType.SOCIAL_COMMUNICATION) // 2
        return min(pairs, 3)

    @property
    def adaptive_behavior_score(self) -> int:
        """Number of adaptive-behavior milestones, capped at 3."""
        return min(self._achieved(MilestoneType.ADAPTIVE_BEHAVIOR), 3)

    @property
    def total_score(self) -> int:
        """Sum of the three sub-scores."""
        parts = (
            self.language_score,
            self.social_communication_score,
            self.adaptive_behavior_score,
        )
        return sum(parts)
# Pre/post-regression levels plus cited loss indicators.
# When the input carries `pre_regression_milestones` and/or
# `developmental_milestones`, the six level fields are recomputed from those
# milestones and overwrite whatever was supplied.
class RegressionRecord(BaseModel):
    pre_regression_language_level: LanguageLevel = Field(
        description="Language level before regression, calculated from milestones")
    pre_regression_social_communication_level: SocialCommunicationLevel = Field(
        description="Social communication level before regression, calculated from milestones")
    pre_regression_adaptive_behavior_level: AdaptiveBehaviorLevel = Field(
        description="Adaptive behavior level before regression, calculated from milestones")
    post_regression_language_level: LanguageLevel = Field(
        description="Language level after regression, calculated from milestones")
    post_regression_social_communication_level: SocialCommunicationLevel = Field(
        description="Social communication level after regression, calculated from milestones")
    post_regression_adaptive_behavior_level: AdaptiveBehaviorLevel = Field(
        description="Adaptive behavior level after regression, calculated from milestones")
    has_language_loss: CitableBool = Field(description="Whether language loss occurred, with citations")
    has_social_loss: CitableBool = Field(description="Whether social communication loss occurred, with citations")
    has_adaptive_loss: CitableBool = Field(description="Whether adaptive behavior loss occurred, with citations")
    has_repetitive_behaviors: CitableBool = Field(
        description="Whether repetitive behaviors were observed, with citations")

    class Config:
        use_enum_values = True

    # Registered as a "before" model validator. The previous approach assigned
    # to `__pydantic_validator__.validate_python` from
    # `__pydantic_init_subclass__`, which only fires for subclasses and cannot
    # rebind that attribute, so this method never ran.
    @model_validator(mode="before")
    @classmethod
    def _validate_levels(cls, values):
        """Compute the six level fields from milestone data in the input.

        Args:
            values: Raw model input. If it is not a dict, or carries neither
                ``pre_regression_milestones`` nor ``developmental_milestones``,
                it is returned untouched and supplied levels are kept.

        Returns:
            A copy of ``values`` with the two helper keys removed and the six
            ``*_level`` entries set from the computed 0-3 scores.

        Raises:
            ValueError: If a pre-regression name is not a valid MilestoneName
                or a developmental milestone entry fails validation.
        """
        if not isinstance(values, dict):
            return values
        if "pre_regression_milestones" not in values and "developmental_milestones" not in values:
            return values

        # Work on a copy so the caller's input is not mutated.
        values = dict(values)
        pre_names = values.pop("pre_regression_milestones", None) or []
        post_raw = values.pop("developmental_milestones", None) or []

        pre_milestones = []
        for name in pre_names:
            # Raw input carries strings; already-validated data carries enums.
            milestone = name if isinstance(name, MilestoneName) else MilestoneName(name)
            pre_milestones.append(
                DevelopmentalMilestone(
                    milestone_name=milestone,
                    milestone_type=MILESTONE_TYPE_MAPPING[milestone.value],
                    # Placeholder details: scoring only reads milestone_type.
                    milestone_citations=[],
                    milestone_age_in_months=0,
                    milestone_age_in_months_citations=[],
                    milestone_has_regressed=False,
                    regression_start_date_in_months=0,
                    regression_start_date_in_months_citations=[],
                    progressed_after_regression=False,
                )
            )

        post_milestones = []
        for raw in post_raw:
            entry = raw if isinstance(raw, DevelopmentalMilestone) else DevelopmentalMilestone.model_validate(raw)
            # Only milestones that have not regressed count after regression.
            if not entry.milestone_has_regressed:
                post_milestones.append(entry)

        pre_score = DevelopmentalScore(milestones=pre_milestones)
        post_score = DevelopmentalScore(milestones=post_milestones)

        # Each sub-score is capped at 3, matching the 0-3 values of the enums.
        values["pre_regression_language_level"] = LanguageLevel(pre_score.language_score)
        values["pre_regression_social_communication_level"] = SocialCommunicationLevel(
            pre_score.social_communication_score)
        values["pre_regression_adaptive_behavior_level"] = AdaptiveBehaviorLevel(pre_score.adaptive_behavior_score)
        values["post_regression_language_level"] = LanguageLevel(post_score.language_score)
        values["post_regression_social_communication_level"] = SocialCommunicationLevel(
            post_score.social_communication_score)
        values["post_regression_adaptive_behavior_level"] = AdaptiveBehaviorLevel(post_score.adaptive_behavior_score)
        return values
# Key dates and ages for a case. Every field is optional and defaults to None.
class TimelineRecord(BaseModel):
    vaccination_date: Optional[date] = Field(
        default=None,
        description="Date of vaccination",
    )
    regression_onset_date: Optional[date] = Field(
        default=None,
        description="Date of regression onset",
    )
    diagnosis_date: Optional[date] = Field(
        default=None,
        description="Date of diagnosis",
    )
    # Accepts either a whole or a fractional number of months.
    age_at_vaccination_months: Optional[Union[CitableInt, CitableFloat]] = Field(
        default=None,
        description="Age at vaccination in months, with citations",
    )
    age_at_diagnosis_months: Optional[CitableInt] = Field(
        default=None,
        description="Age at diagnosis in months, with citations",
    )

    class Config:
        use_enum_values = True
# Symptoms for a case. The three cited boolean flags are required; the symptom
# list defaults to empty and the duration defaults to None.
class SymptomRecord(BaseModel):
    has_seizures: CitableBool = Field(
        description="Whether seizures were observed, with citations",
    )
    has_fever: CitableBool = Field(
        description="Whether fever was observed, with citations",
    )
    has_sensory_sensitivity: CitableBool = Field(
        description="Whether sensory sensitivities were observed, with citations",
    )
    # Free-text symptom names, hence the `str` item type.
    physical_symptoms: CitableList[str] = Field(
        default_factory=lambda: CitableList[str](value=[], citations=[]),
        description="List of physical symptoms with citations",
    )
    symptom_duration_days: Optional[CitableInt] = Field(
        default=None,
        description="Duration of physical symptoms in days, with citations",
    )

    class Config:
        use_enum_values = True
# Diagnosis details. `asd_severity` and `diagnosis_name` are required; the
# professional is optional and both citation lists default to empty.
class DiagnosisRecord(BaseModel):
    asd_severity: CitableEnum[ASDSeverity] = Field(
        description="ASD severity level, with citations",
    )
    diagnosis_name: str = Field(
        description="Specific diagnosis (e.g., autism, PDD-NOS)",
    )
    diagnosis_name_citations: List[int] = Field(
        default_factory=list,
        description="Citations for diagnosis name",
    )
    diagnosing_professional: Optional[str] = Field(
        default=None,
        description="Professional who made the diagnosis",
    )
    diagnosing_professional_citations: List[int] = Field(
        default_factory=list,
        description="Citations for diagnosing professional",
    )

    class Config:
        use_enum_values = True
# Interventions tried and their outcome. The intervention list defaults to
# empty; the two cited boolean flags are required.
class InterventionRecord(BaseModel):
    interventions: CitableList[InterventionType] = Field(
        default_factory=lambda: CitableList[InterventionType](value=[], citations=[]),
        description="List of interventions with citations",
    )
    is_intervention_ongoing: CitableBool = Field(
        description="Whether any interventions are ongoing, with citations",
    )
    has_recovery: CitableBool = Field(
        description="Whether recovery occurred, with citations",
    )

    class Config:
        use_enum_values = True
# Birth and pre-vaccination status. Every field is optional: the flag and the
# APGAR score default to None, the citation list to empty.
class BirthRecord(BaseModel):
    was_normal_pre_vaccination: Optional[CitableBool] = Field(
        default=None,
        description="Whether child was developmentally normal before vaccination, with citations",
    )
    # Kept as a string, exactly as reported.
    apgar_score: Optional[str] = Field(
        default=None,
        description="APGAR score at birth, if reported",
    )
    apgar_score_citations: List[int] = Field(
        default_factory=list,
        description="Citations for APGAR score",
    )

    class Config:
        use_enum_values = True
# Observed repetitive behaviors with an optional explanation and citations.
# Every field is optional.
class BehaviorRecord(BaseModel):
    # Known behaviors as RepetitiveBehaviorType, anything else as free text.
    repetitive_behaviors: List[Union[RepetitiveBehaviorType, str]] = Field(
        default_factory=list,
        description="List of observed repetitive behaviors",
    )
    repetitive_behaviors_explanation: Optional[str] = Field(
        default=None,
        description="Explanation for repetitive behaviors",
    )
    repetitive_behaviors_citations: List[int] = Field(
        default_factory=list,
        description="Citations for repetitive behaviors",
    )

    class Config:
        use_enum_values = True

    @field_validator("repetitive_behaviors", mode="before")
    @classmethod
    def validate_repetitive_behaviors(cls, v):
        """Normalise the raw ``repetitive_behaviors`` input before validation.

        ``None`` becomes an empty list. Each string is converted to the
        matching RepetitiveBehaviorType when one exists and is otherwise kept
        as-is; enum members pass through unchanged.

        Raises:
            ValueError: If an item is neither a string nor a
                RepetitiveBehaviorType.
        """

        def _normalise(entry):
            if isinstance(entry, RepetitiveBehaviorType):
                return entry
            if not isinstance(entry, str):
                raise ValueError(f"Invalid type for repetitive_behaviors: {type(entry)}")
            try:
                return RepetitiveBehaviorType(entry)
            except ValueError:
                # Not a known behavior: keep the free-text label.
                return entry

        return [] if v is None else [_normalise(entry) for entry in v]
# Root model for one extracted report; its JSON schema is what gets embedded
# in the prompt. Before validation, the report-level milestone lists are
# copied into the `regression_record` input.
# NOTE: documented with `#` comments rather than a docstring because pydantic
# copies a model's docstring into the generated JSON schema.
class VAERSReport(BaseModel):
    timeline: TimelineRecord = Field(description="Timeline-related data including dates and ages")
    pre_regression_milestones: List[MilestoneName] = Field(default_factory=list,
        description="List of milestone names achieved before regression")
    developmental_milestones: List[DevelopmentalMilestone] = Field(default_factory=list)
    regression_record: RegressionRecord = Field(
        description="Regression-related data including pre/post levels and loss indicators")
    symptom_record: SymptomRecord = Field(
        description="Symptom-related data including seizures, fever, and physical symptoms")
    diagnosis_record: DiagnosisRecord = Field(description="Diagnosis-related data including severity and professional")
    intervention_record: InterventionRecord = Field(description="Intervention-related data including recovery status")
    birth_record: BirthRecord = Field(description="Birth and pre-vaccination developmental status")
    behavior_record: BehaviorRecord = Field(description="Repetitive behavior data with explanations and citations")
    heavy_metal_tests: Optional[List[HeavyMetalTest]] = Field(default=None, description="Heavy metal test results")
    comorbidities: CitableList[ComorbidityType] = Field(
        default_factory=lambda: CitableList[ComorbidityType](value=[], citations=[]),
        description="List of co-occurring conditions with citations")
    environmental_exposures: CitableList[ExposureType] = Field(
        default_factory=lambda: CitableList[ExposureType](value=[], citations=[]),
        description="List of environmental exposures with citations")

    class Config:
        use_enum_values = True

    # Registered as a "before" model validator. The previous approach assigned
    # to `__pydantic_validator__.validate_python` from
    # `__pydantic_init_subclass__`, which only fires for subclasses and cannot
    # rebind that attribute, so this method never ran.
    @model_validator(mode="before")
    @classmethod
    def _validate_regression_record(cls, values):
        """Copy the report-level milestone lists into ``regression_record``.

        Args:
            values: Raw model input. Anything that is not a dict is passed
                through untouched.

        Returns:
            A copy of ``values`` whose ``regression_record`` dict additionally
            carries ``pre_regression_milestones`` and
            ``developmental_milestones``. A missing or non-dict
            ``regression_record`` is left alone so normal field validation
            reports it.
        """
        if not isinstance(values, dict):
            return values

        regression_record = values.get("regression_record")
        if not isinstance(regression_record, dict):
            return values

        # Copy both levels so the caller's input is not mutated.
        values = dict(values)
        regression_record = dict(regression_record)
        regression_record["pre_regression_milestones"] = values.get("pre_regression_milestones", [])
        regression_record["developmental_milestones"] = values.get("developmental_milestones", [])
        values["regression_record"] = regression_record
        return values

    def _create_pre_milestones(self) -> List[DevelopmentalMilestone]:
        """Build placeholder milestones for each pre-regression name.

        Only ``milestone_name`` and ``milestone_type`` are meaningful; the
        other fields are filled with empty or zero values.
        """
        milestones = []
        for name in self.pre_regression_milestones:
            # `use_enum_values` stores plain strings after validation, but an
            # enum member can still appear (e.g. via model_construct).
            key = name.value if isinstance(name, MilestoneName) else name
            milestones.append(
                DevelopmentalMilestone(
                    milestone_name=name,
                    milestone_type=MILESTONE_TYPE_MAPPING[key],
                    milestone_citations=[],
                    milestone_age_in_months=0,
                    milestone_age_in_months_citations=[],
                    milestone_has_regressed=False,
                    regression_start_date_in_months=0,
                    regression_start_date_in_months_citations=[],
                    progressed_after_regression=False,
                )
            )
        return milestones

    @property
    def dmsg(self) -> int:
        """Pre-regression total score minus post-regression total score.

        The post-regression score counts only ``developmental_milestones``
        whose ``milestone_has_regressed`` is false.
        """
        pre_score = DevelopmentalScore(milestones=self._create_pre_milestones())
        post_milestones = [m for m in self.developmental_milestones if not m.milestone_has_regressed]
        post_score = DevelopmentalScore(milestones=post_milestones)
        return pre_score.total_score - post_score.total_score
# MILESTONE_TYPE_MAPPING
# Maps each MilestoneName *value* (a plain string) to its MilestoneType.
# Defined after the models that use it; that works because they only look it
# up inside methods, which run after the module has finished loading.
MILESTONE_TYPE_MAPPING = {
    milestone.value: category
    for category, members in (
        (
            MilestoneType.LANGUAGE,
            (
                MilestoneName.BABBLING,
                MilestoneName.SINGLE_WORDS,
                MilestoneName.TWO_WORD_PHRASES,
                MilestoneName.NAMING_OBJECTS,
            ),
        ),
        (
            MilestoneType.SOCIAL_COMMUNICATION,
            (
                MilestoneName.EYE_CONTACT,
                MilestoneName.RESPONDING_TO_NAME,
                MilestoneName.POINTING,
                MilestoneName.WAVING,
                MilestoneName.CLAPPING,
                MilestoneName.JOINT_ATTENTION,
                MilestoneName.TURN_TAKING,
            ),
        ),
        (
            MilestoneType.ADAPTIVE_BEHAVIOR,
            (
                MilestoneName.FEEDING_WITH_HELP,
                MilestoneName.USING_SPOON,
                MilestoneName.DANCING,
                MilestoneName.DRESSING_WITH_HELP,
                MilestoneName.SELF_FEEDING,
            ),
        ),
    )
    for milestone in members
}
The prompt will look like this:
# Assemble the prompt: system instructions, the report text, then the JSON
# schema generated from the VAERSReport model (pretty-printed, 2-space indent).
schema_json = json.dumps(VAERSReport.model_json_schema(), indent=2)
prompt = f"""
{system_prompt}
Report: {formatted_input}
JSON Schema:
{schema_json}
"""
Then call the LLM:
# Send the request and time it.
# NOTE(review): `client` is presumably an OpenAI-compatible client and
# `request_data` the remaining request keyword arguments - confirm.
before = time.time()  # start timestamp; `elapsed` below depends on it
response = client.chat.completions.create(
    model=MODEL_NAME,
    **request_data
)
after = time.time()
elapsed = after - before  # seconds spent in the call
# Text of the first choice: the model's answer to parse and validate.
inner_response_text = response.choices[0].message.content
# The whole response object serialised to JSON.
full_response_json = response.model_dump_json()
After that, you do some post-processing to check whether the response is valid JSON and whether it conforms to the provided Pydantic schema (with GPT5, both are usually true).
I explain this in more detail in the course.