How to extract structured data using GPT-5

This is the basic idea

Declare a Pydantic model like this:

from datetime import date
from enum import Enum
from typing import Optional, List, Union, TypeVar, Generic

from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from typing_extensions import TypeAlias


# Enums
# Ordinal language level, 0-3. RegressionRecord stores the pre- and
# post-regression language level as this type, built from
# DevelopmentalScore.language_score (which is capped to the same 0-3 range).
# (Kept as a comment rather than a docstring so the JSON Schema is unchanged.)
class LanguageLevel(Enum):
    NO_SPEECH = 0
    BABBLING_SINGLE_WORDS = 1
    TWO_WORD_PHRASES = 2
    SENTENCES_NAMING = 3


# Ordinal social-communication level, 0-3. RegressionRecord builds it from
# DevelopmentalScore.social_communication_score, which is capped at 3.
class SocialCommunicationLevel(Enum):
    NO_ENGAGEMENT = 0
    BASIC_ENGAGEMENT = 1
    RECIPROCAL_INTERACTION = 2
    COMPLEX_INTERACTION = 3


# Ordinal adaptive-behavior level, 0-3. RegressionRecord builds it from
# DevelopmentalScore.adaptive_behavior_score, which is capped at 3.
class AdaptiveBehaviorLevel(Enum):
    NO_INDEPENDENCE = 0
    BASIC_SELF_HELP = 1
    MODERATE_INDEPENDENCE = 2
    AGE_APPROPRIATE = 3


# Developmental domain of a milestone. It is the value type of
# MILESTONE_TYPE_MAPPING and the key DevelopmentalScore uses to bucket
# milestones into its three per-domain scores.
class MilestoneType(Enum):
    LANGUAGE = "language"
    SOCIAL_COMMUNICATION = "social_communication"
    ADAPTIVE_BEHAVIOR = "adaptive_behavior"


# Known milestone names. Each member's string value is a key of
# MILESTONE_TYPE_MAPPING, which assigns it a MilestoneType.
# DevelopmentalMilestone.milestone_name additionally accepts free-text strings
# that are not listed here.
class MilestoneName(Enum):
    BABBLING = "babbling"
    SINGLE_WORDS = "single_words"
    TWO_WORD_PHRASES = "two_word_phrases"
    NAMING_OBJECTS = "naming_objects"
    EYE_CONTACT = "eye_contact"
    RESPONDING_TO_NAME = "responding_to_name"
    POINTING = "pointing"
    WAVING = "waving"
    CLAPPING = "clapping"
    JOINT_ATTENTION = "joint_attention"
    TURN_TAKING = "turn_taking"
    FEEDING_WITH_HELP = "feeding_with_help"
    USING_SPOON = "using_spoon"
    DANCING = "dancing"
    DRESSING_WITH_HELP = "dressing_with_help"
    SELF_FEEDING = "self_feeding"


# Known repetitive-behavior labels. BehaviorRecord.repetitive_behaviors accepts
# these members as well as free-text strings that match none of them.
class RepetitiveBehaviorType(Enum):
    NONE = "none"
    SPINNING = "spinning"
    FLAPPING = "flapping"
    STARING = "staring"
    ZONING_OUT = "zoning_out"
    HEAD_BANGING = "head_banging"
    BITING = "biting"


# Severity level 1-3, carried by DiagnosisRecord.asd_severity.
class ASDSeverity(Enum):
    LEVEL_1 = 1
    LEVEL_2 = 2
    LEVEL_3 = 3


# Metals known to HeavyMetalTest.metal_type; that field also accepts free-text
# strings for metals not listed here.
class MetalType(Enum):
    MERCURY = "mercury"
    ALUMINUM = "aluminum"
    LEAD = "lead"
    ARSENIC = "arsenic"
    CADMIUM = "cadmium"
    NICKEL = "nickel"
    THALLIUM = "thallium"
    ANTIMONY = "antimony"


# Closed set of intervention labels used by InterventionRecord.interventions.
class InterventionType(Enum):
    SPEECH_THERAPY = "speech_therapy"
    OCCUPATIONAL_THERAPY = "occupational_therapy"
    ABA = "aba"
    DETOX = "detox"
    SOCIAL_SKILLS = "social_skills"
    OTHER = "other"


# Closed set of co-occurring condition labels used by VAERSReport.comorbidities.
class ComorbidityType(Enum):
    ADHD = "adhd"
    COLITIS = "colitis"
    ALLERGIES = "allergies"
    ASTHMA = "asthma"
    ECZEMA = "eczema"
    SEIZURE_DISORDER = "seizure_disorder"
    OCD = "ocd"
    ANXIETY = "anxiety"
    OTHER = "other"


# Closed set of exposure labels used by VAERSReport.environmental_exposures.
class ExposureType(Enum):
    MATERNAL_AMALGAM = "maternal_amalgam"
    POWER_PLANT = "power_plant"
    OTHER_MERCURY = "other_mercury"
    UNKNOWN = "unknown"


# Citable Models
class CitableInt(BaseModel):
    # A required, non-negative integer together with a list of integer citations.
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    value: int = Field(ge=0, description="Non-negative integer value")
    citations: Optional[List[int]] = Field(
        default_factory=list,
        description="Citations for the value",
    )


class CitableFloat(BaseModel):
    # A required, non-negative float together with a list of integer citations.
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    value: float = Field(ge=0, description="Floating point value which allows decimals")
    citations: Optional[List[int]] = Field(
        default_factory=list,
        description="Citations for the value",
    )


class CitableBool(BaseModel):
    # A required boolean together with a list of integer citations.
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    value: bool = Field(description="Boolean value")
    citations: Optional[List[int]] = Field(
        default_factory=list,
        description="Citations for the value",
    )


# Type variable bounded to Enum subclasses, for generic models that are
# parametrized by an enum type.
EnumT = TypeVar("EnumT", bound=Enum)


class CitableEnum(BaseModel, Generic[EnumT]):
    # A required enum member (of the parametrized enum type) together with a
    # list of integer citations.
    value: EnumT = Field(description="Enum value")
    citations: Optional[List[int]] = Field(
        default_factory=list,
        description="Citations for the enum value",
    )


# Item type of CitableList. Deliberately unbounded: this file parametrizes the
# model with enum types *and* with plain ``str`` (CitableList[str]), which an
# Enum-bounded type variable does not permit for static type checkers.
CitableItemT = TypeVar("CitableItemT")


class CitableList(BaseModel, Generic[CitableItemT]):
    # A list of values of the parametrized item type, together with a list of
    # integer citations. Both default to empty lists.
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    value: List[CitableItemT] = Field(default_factory=list, description="List of enum values")
    citations: Optional[List[int]] = Field(default_factory=list, description="Citations for the list")


# Other Models
class HeavyMetalTest(BaseModel):
    # One heavy-metal test result. Every field is required (no defaults).
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    metal_type: Union[MetalType, str]  # a known MetalType or a free-text name
    value: float
    unit: str
    is_elevated: bool
    citations: List[int]


class DevelopmentalMilestone(BaseModel):
    # A single developmental milestone with its domain, the age it was reached,
    # regression information and integer citations. Every field is required.
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    milestone_name: Union[MilestoneName, str]  # a known MilestoneName or free text
    milestone_type: MilestoneType
    milestone_citations: List[int]
    milestone_age_in_months: int
    milestone_age_in_months_citations: List[int]
    milestone_has_regressed: bool
    regression_start_date_in_months: int
    regression_start_date_in_months_citations: List[int]
    progressed_after_regression: bool

    @model_validator(mode="before")
    @classmethod
    def validate_milestone_type(cls, values):
        """Reject a known milestone whose ``milestone_type`` contradicts the mapping.

        Registered as a "before" model validator so it actually runs; as a bare
        ``@classmethod`` it was never invoked by Pydantic.

        Args:
            values: The raw input for the model. Anything that is not a dict is
                passed through untouched.

        Returns:
            ``values`` unchanged.

        Raises:
            ValueError: If ``milestone_name`` is a key of
                ``MILESTONE_TYPE_MAPPING`` and ``milestone_type`` is a valid
                ``MilestoneType`` that differs from the mapped one.
        """
        if not isinstance(values, dict):
            return values

        milestone_name = values.get("milestone_name")
        milestone_type = values.get("milestone_type")
        if not (milestone_name and milestone_type):
            return values

        name_key = milestone_name.value if isinstance(milestone_name, MilestoneName) else milestone_name
        expected_type = MILESTONE_TYPE_MAPPING.get(name_key) if isinstance(name_key, str) else None
        if expected_type is None:
            # Free-text milestone names have no mapped type; nothing to compare.
            return values

        # Raw input may carry the type as a plain string; normalise it before
        # comparing. Invalid types are left for the field's own validation.
        try:
            actual_type = MilestoneType(milestone_type)
        except ValueError:
            return values

        if expected_type != actual_type:
            raise ValueError(
                f"Milestone {milestone_name} should have milestone_type {expected_type}, not {milestone_type}"
            )
        return values


class DevelopmentalScore(BaseModel):
    # Derives per-domain scores from a list of milestones. Each domain score is
    # capped at 3; the total is the sum of the three domain scores.
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    milestones: List[DevelopmentalMilestone] = Field(
        default_factory=list,
        description="List of achieved developmental milestones",
    )

    def _count(self, kind: MilestoneType) -> int:
        """Return how many entries of ``milestones`` have ``milestone_type == kind``."""
        return len([entry for entry in self.milestones if entry.milestone_type == kind])

    @property
    def language_score(self) -> int:
        """Number of language milestones, capped at 3."""
        return min(self._count(MilestoneType.LANGUAGE), 3)

    @property
    def social_communication_score(self) -> int:
        """Half the number of social-communication milestones (floored), capped at 3."""
        pairs = self._count(MilestoneType.SOCIAL_COMMUNICATION) // 2
        return min(pairs, 3)

    @property
    def adaptive_behavior_score(self) -> int:
        """Number of adaptive-behavior milestones, capped at 3."""
        return min(self._count(MilestoneType.ADAPTIVE_BEHAVIOR), 3)

    @property
    def total_score(self) -> int:
        """Sum of the language, social-communication and adaptive-behavior scores."""
        domain_scores = (
            self.language_score,
            self.social_communication_score,
            self.adaptive_behavior_score,
        )
        return sum(domain_scores)


class RegressionRecord(BaseModel):
    # Pre-/post-regression levels per domain plus cited loss indicators.
    # Enum fields are stored as their plain values (use_enum_values).
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    model_config = ConfigDict(use_enum_values=True)

    pre_regression_language_level: LanguageLevel = Field(
        description="Language level before regression, calculated from milestones")
    pre_regression_social_communication_level: SocialCommunicationLevel = Field(
        description="Social communication level before regression, calculated from milestones")
    pre_regression_adaptive_behavior_level: AdaptiveBehaviorLevel = Field(
        description="Adaptive behavior level before regression, calculated from milestones")
    post_regression_language_level: LanguageLevel = Field(
        description="Language level after regression, calculated from milestones")
    post_regression_social_communication_level: SocialCommunicationLevel = Field(
        description="Social communication level after regression, calculated from milestones")
    post_regression_adaptive_behavior_level: AdaptiveBehaviorLevel = Field(
        description="Adaptive behavior level after regression, calculated from milestones")
    has_language_loss: CitableBool = Field(description="Whether language loss occurred, with citations")
    has_social_loss: CitableBool = Field(description="Whether social communication loss occurred, with citations")
    has_adaptive_loss: CitableBool = Field(description="Whether adaptive behavior loss occurred, with citations")
    has_repetitive_behaviors: CitableBool = Field(
        description="Whether repetitive behaviors were observed, with citations")

    @model_validator(mode="before")
    @classmethod
    def _validate_levels(cls, values):
        """Compute the six level fields from milestone lists found in the input.

        Registered as a "before" model validator. The previous wiring through
        ``__pydantic_init_subclass__`` only fires for subclasses, so this
        method never ran for ``RegressionRecord`` itself.

        The input may carry two helper keys that are not fields of this model:
        ``pre_regression_milestones`` (milestone names) and
        ``developmental_milestones`` (milestone objects or dicts). When neither
        key is present the input is returned untouched, so a record can still
        be built from explicitly supplied levels.

        Args:
            values: Raw input for the model; non-dict input is passed through.

        Returns:
            A copy of ``values`` without the two helper keys and with the six
            ``pre_*``/``post_*`` level entries replaced by computed levels.

        Raises:
            ValueError: If a pre-regression name is not a ``MilestoneName`` or a
                developmental milestone fails validation.
        """
        if not isinstance(values, dict):
            return values
        if "pre_regression_milestones" not in values and "developmental_milestones" not in values:
            return values

        values = dict(values)  # never mutate the caller's input
        pre_names = values.pop("pre_regression_milestones", None) or []
        raw_post = values.pop("developmental_milestones", None) or []

        pre_milestones = []
        for raw_name in pre_names:
            name = MilestoneName(raw_name)  # accepts a member or its string value
            pre_milestones.append(
                DevelopmentalMilestone(
                    milestone_name=name,
                    milestone_type=MILESTONE_TYPE_MAPPING[name.value],
                    milestone_citations=[],
                    milestone_age_in_months=0,
                    milestone_age_in_months_citations=[],
                    milestone_has_regressed=False,
                    regression_start_date_in_months=0,
                    regression_start_date_in_months_citations=[],
                    progressed_after_regression=False
                )
            )

        # Raw input holds dicts; already-validated input holds model instances.
        developmental = [
            item if isinstance(item, DevelopmentalMilestone) else DevelopmentalMilestone.model_validate(item)
            for item in raw_post
        ]
        post_milestones = [m for m in developmental if not m.milestone_has_regressed]

        pre_score = DevelopmentalScore(milestones=pre_milestones)
        post_score = DevelopmentalScore(milestones=post_milestones)

        values["pre_regression_language_level"] = LanguageLevel(pre_score.language_score)
        values["pre_regression_social_communication_level"] = SocialCommunicationLevel(
            pre_score.social_communication_score)
        values["pre_regression_adaptive_behavior_level"] = AdaptiveBehaviorLevel(pre_score.adaptive_behavior_score)
        values["post_regression_language_level"] = LanguageLevel(post_score.language_score)
        values["post_regression_social_communication_level"] = SocialCommunicationLevel(
            post_score.social_communication_score)
        values["post_regression_adaptive_behavior_level"] = AdaptiveBehaviorLevel(post_score.adaptive_behavior_score)

        return values


class TimelineRecord(BaseModel):
    # Optional dates and ages (in months) for vaccination, regression onset and
    # diagnosis. Every field defaults to None.
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    # model_config replaces the class-based Config, which Pydantic v2 deprecates.
    model_config = ConfigDict(use_enum_values=True)

    vaccination_date: Optional[date] = Field(default=None, description="Date of vaccination")
    regression_onset_date: Optional[date] = Field(default=None, description="Date of regression onset")
    diagnosis_date: Optional[date] = Field(default=None, description="Date of diagnosis")
    age_at_vaccination_months: Optional[Union[CitableInt, CitableFloat]] = Field(default=None,
                                                                                 description="Age at vaccination in months, with citations")
    age_at_diagnosis_months: Optional[CitableInt] = Field(default=None,
                                                          description="Age at diagnosis in months, with citations")


class SymptomRecord(BaseModel):
    # Cited symptom flags, a cited list of free-text physical symptoms and an
    # optional cited duration in days.
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    # model_config replaces the class-based Config, which Pydantic v2 deprecates.
    model_config = ConfigDict(use_enum_values=True)

    has_seizures: CitableBool = Field(description="Whether seizures were observed, with citations")
    has_fever: CitableBool = Field(description="Whether fever was observed, with citations")
    has_sensory_sensitivity: CitableBool = Field(
        description="Whether sensory sensitivities were observed, with citations")
    physical_symptoms: CitableList[str] = Field(default_factory=lambda: CitableList[str](value=[], citations=[]),
                                                description="List of physical symptoms with citations")
    symptom_duration_days: Optional[CitableInt] = Field(default=None,
                                                        description="Duration of physical symptoms in days, with citations")


class DiagnosisRecord(BaseModel):
    # Cited ASD severity, the diagnosis name and (optionally) the diagnosing
    # professional, each with its own citation list.
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    # model_config replaces the class-based Config, which Pydantic v2 deprecates.
    model_config = ConfigDict(use_enum_values=True)

    asd_severity: CitableEnum[ASDSeverity] = Field(description="ASD severity level, with citations")
    diagnosis_name: str = Field(description="Specific diagnosis (e.g., autism, PDD-NOS)")
    diagnosis_name_citations: List[int] = Field(default_factory=list, description="Citations for diagnosis name")
    diagnosing_professional: Optional[str] = Field(default=None, description="Professional who made the diagnosis")
    diagnosing_professional_citations: List[int] = Field(default_factory=list,
                                                         description="Citations for diagnosing professional")


class InterventionRecord(BaseModel):
    # Cited list of interventions plus cited flags for "ongoing" and "recovery".
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    # model_config replaces the class-based Config, which Pydantic v2 deprecates.
    model_config = ConfigDict(use_enum_values=True)

    interventions: CitableList[InterventionType] = Field(
        default_factory=lambda: CitableList[InterventionType](value=[], citations=[]),
        description="List of interventions with citations")
    is_intervention_ongoing: CitableBool = Field(description="Whether any interventions are ongoing, with citations")
    has_recovery: CitableBool = Field(description="Whether recovery occurred, with citations")


class BirthRecord(BaseModel):
    # Optional pre-vaccination developmental status and APGAR score with
    # citations.
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    # model_config replaces the class-based Config, which Pydantic v2 deprecates.
    model_config = ConfigDict(use_enum_values=True)

    was_normal_pre_vaccination: Optional[CitableBool] = Field(default=None,
                                                              description="Whether child was developmentally normal before vaccination, with citations")
    apgar_score: Optional[str] = Field(default=None, description="APGAR score at birth, if reported")
    apgar_score_citations: List[int] = Field(default_factory=list, description="Citations for APGAR score")


class BehaviorRecord(BaseModel):
    # Observed repetitive behaviors (known labels or free text) with an
    # optional explanation and citations.
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    # model_config replaces the class-based Config, which Pydantic v2 deprecates.
    model_config = ConfigDict(use_enum_values=True)

    repetitive_behaviors: List[Union[RepetitiveBehaviorType, str]] = Field(default_factory=list,
                                                                           description="List of observed repetitive behaviors")
    repetitive_behaviors_explanation: Optional[str] = Field(default=None,
                                                            description="Explanation for repetitive behaviors")
    repetitive_behaviors_citations: List[int] = Field(default_factory=list,
                                                      description="Citations for repetitive behaviors")

    @field_validator("repetitive_behaviors", mode="before")
    @classmethod
    def validate_repetitive_behaviors(cls, v):
        """Normalise raw ``repetitive_behaviors`` input before field validation.

        ``None`` becomes an empty list. A single string or enum member is
        treated as a one-element list (iterating a bare string would otherwise
        split it into characters). Strings matching a ``RepetitiveBehaviorType``
        value are converted to that member; other strings are kept as free text.

        Args:
            v: The raw field value.

        Returns:
            A list of ``RepetitiveBehaviorType`` members and/or strings.

        Raises:
            ValueError: If ``v`` is not iterable, or an item is neither a string
                nor a ``RepetitiveBehaviorType``.
        """
        if v is None:
            return []
        if isinstance(v, (str, RepetitiveBehaviorType)):
            v = [v]
        try:
            items = list(v)
        except TypeError:
            # Pydantic only turns ValueError/AssertionError into validation errors.
            raise ValueError(f"Invalid type for repetitive_behaviors: {type(v)}") from None

        validated = []
        for item in items:
            if isinstance(item, RepetitiveBehaviorType):
                validated.append(item)
            elif isinstance(item, str):
                try:
                    validated.append(RepetitiveBehaviorType(item))
                except ValueError:
                    validated.append(item)
            else:
                raise ValueError(f"Invalid type for repetitive_behaviors: {type(item)}")
        return validated


class VAERSReport(BaseModel):
    # Top-level extraction schema: timeline, milestones, regression, symptoms,
    # diagnosis, interventions, birth data, behaviors, heavy-metal tests,
    # comorbidities and environmental exposures.
    # (Comment, not docstring: a docstring would be added to the JSON Schema.)
    # model_config replaces the class-based Config, which Pydantic v2 deprecates.
    model_config = ConfigDict(use_enum_values=True)

    timeline: TimelineRecord = Field(description="Timeline-related data including dates and ages")
    pre_regression_milestones: List[MilestoneName] = Field(default_factory=list,
                                                           description="List of milestone names achieved before regression")
    developmental_milestones: List[DevelopmentalMilestone] = Field(default_factory=list)
    regression_record: RegressionRecord = Field(
        description="Regression-related data including pre/post levels and loss indicators")
    symptom_record: SymptomRecord = Field(
        description="Symptom-related data including seizures, fever, and physical symptoms")
    diagnosis_record: DiagnosisRecord = Field(description="Diagnosis-related data including severity and professional")
    intervention_record: InterventionRecord = Field(description="Intervention-related data including recovery status")
    birth_record: BirthRecord = Field(description="Birth and pre-vaccination developmental status")
    behavior_record: BehaviorRecord = Field(description="Repetitive behavior data with explanations and citations")
    heavy_metal_tests: Optional[List[HeavyMetalTest]] = Field(default=None, description="Heavy metal test results")
    comorbidities: CitableList[ComorbidityType] = Field(
        default_factory=lambda: CitableList[ComorbidityType](value=[], citations=[]),
        description="List of co-occurring conditions with citations")
    environmental_exposures: CitableList[ExposureType] = Field(
        default_factory=lambda: CitableList[ExposureType](value=[], citations=[]),
        description="List of environmental exposures with citations")

    @model_validator(mode="before")
    @classmethod
    def _validate_regression_record(cls, values):
        """Copy the report's milestone lists into the raw ``regression_record``.

        Registered as a "before" model validator. The previous wiring through
        ``__pydantic_init_subclass__`` only fires for subclasses, so this
        method never ran for ``VAERSReport`` itself.

        The two lists are added under the keys ``pre_regression_milestones``
        and ``developmental_milestones``. The record is left as a dict so the
        ``regression_record`` field validates it as usual.

        Args:
            values: Raw input for the model; non-dict input is passed through.

        Returns:
            ``values`` unchanged when ``regression_record`` is missing or not a
            dict; otherwise a shallow copy whose ``regression_record`` is a copy
            of the original dict with the two milestone lists added.
        """
        if not isinstance(values, dict):
            return values
        regression_record = values.get("regression_record")
        if not isinstance(regression_record, dict):
            # Missing or already-built record: leave it to field validation.
            return values

        enriched = dict(regression_record)  # never mutate the caller's input
        enriched["pre_regression_milestones"] = values.get("pre_regression_milestones", [])
        enriched["developmental_milestones"] = values.get("developmental_milestones", [])
        return {**values, "regression_record": enriched}

    def _create_pre_milestones(self) -> List[DevelopmentalMilestone]:
        """Build placeholder milestones for each name in ``pre_regression_milestones``.

        Only name and mapped type carry information; ages are 0, citation lists
        are empty and the regression flags are False.
        """
        placeholders = []
        for name in self.pre_regression_milestones:
            # The mapping is keyed by the enum's string value; a stored name may
            # be that plain value (use_enum_values) or a MilestoneName member.
            key = name.value if isinstance(name, MilestoneName) else name
            placeholders.append(
                DevelopmentalMilestone(
                    milestone_name=name,
                    milestone_type=MILESTONE_TYPE_MAPPING[key],
                    milestone_citations=[],
                    milestone_age_in_months=0,
                    milestone_age_in_months_citations=[],
                    milestone_has_regressed=False,
                    regression_start_date_in_months=0,
                    regression_start_date_in_months_citations=[],
                    progressed_after_regression=False
                )
            )
        return placeholders

    @property
    def dmsg(self) -> int:
        """Pre-regression total score minus post-regression total score.

        The post-regression score counts only developmental milestones whose
        ``milestone_has_regressed`` flag is False.
        """
        pre_score = DevelopmentalScore(milestones=self._create_pre_milestones())
        post_milestones = [m for m in self.developmental_milestones if not m.milestone_has_regressed]
        post_score = DevelopmentalScore(milestones=post_milestones)
        return pre_score.total_score - post_score.total_score


# MILESTONE_TYPE_MAPPING
# Maps each MilestoneName string value to its MilestoneType. Built from one
# group of names per type; key order follows the groups below.
MILESTONE_TYPE_MAPPING = {
    milestone.value: domain
    for domain, members in (
        (
            MilestoneType.LANGUAGE,
            (
                MilestoneName.BABBLING,
                MilestoneName.SINGLE_WORDS,
                MilestoneName.TWO_WORD_PHRASES,
                MilestoneName.NAMING_OBJECTS,
            ),
        ),
        (
            MilestoneType.SOCIAL_COMMUNICATION,
            (
                MilestoneName.EYE_CONTACT,
                MilestoneName.RESPONDING_TO_NAME,
                MilestoneName.POINTING,
                MilestoneName.WAVING,
                MilestoneName.CLAPPING,
                MilestoneName.JOINT_ATTENTION,
                MilestoneName.TURN_TAKING,
            ),
        ),
        (
            MilestoneType.ADAPTIVE_BEHAVIOR,
            (
                MilestoneName.FEEDING_WITH_HELP,
                MilestoneName.USING_SPOON,
                MilestoneName.DANCING,
                MilestoneName.DRESSING_WITH_HELP,
                MilestoneName.SELF_FEEDING,
            ),
        ),
    )
    for milestone in members
}

The prompt will look like this:

# Assemble the prompt: the system prompt, the report text, and the JSON Schema
# generated from VAERSReport (pretty-printed with a 2-space indent).
# NOTE(review): `system_prompt`, `formatted_input` and the `json` import are not
# defined in this excerpt — they must be provided by the surrounding code.
prompt = f"""
{system_prompt} 

Report: {formatted_input}

JSON Schema:
{json.dumps(VAERSReport.model_json_schema(), indent=2)}
"""

Call the LLM

# Send the chat-completion request, measure the elapsed wall-clock time, then
# keep both the first choice's message text and the full response as JSON.
# NOTE(review): `before` is never assigned in this excerpt — presumably
# `before = time.time()` runs just ahead of the call; confirm. `client`,
# `MODEL_NAME`, `request_data` and the `time` import also come from elsewhere.
response = client.chat.completions.create(
    model=MODEL_NAME,
    **request_data
)
after = time.time()
elapsed = after - before
inner_response_text = response.choices[0].message.content
full_response_json = response.model_dump_json()

You will then do some post-processing to check whether the response is valid JSON and whether it conforms to the provided Pydantic schema (with GPT-5, both of these are usually true).

I explain this in more detail in the course.