Skip to content

Config

loadx.scd2.config

SCD2ColumnNames dataclass

Custom names for SCD2 output columns.

All fields are optional and default to their standard names. Pass an instance of this class to SCD2Loader.slowly_changing_dimension() via scd_columns to rename any subset of output columns.

Attributes:

Name Type Description
valid_from str

Date when the record became active.

valid_until str

Date when the record was superseded. 9999-12-31 for currently active records.

active_flag str

True for the currently active version of a record.

delete_flag str

True if the record was deleted in the source.

row_hash str

SHA-256 hash of non-key columns, used for change detection.

insert_date str

Timestamp when this record version was written to the target table.

latest_record_flag str

True for the most recent record per business key. Only present when enable_latest_record_flag=True.

Example
from loadx import SCD2ColumnNames

SCD2ColumnNames(valid_from="eff_start_date", valid_until="eff_end_date")
Source code in loadx/scd2/config.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
@dataclass
class SCD2ColumnNames:
    """Names of the bookkeeping columns written to SCD2 output.

    Each field defaults to its own attribute name, so an instance created
    without arguments keeps the standard column names. To rename any subset of
    output columns, pass an instance of this class to
    `SCD2Loader.slowly_changing_dimension()` via `scd_columns`.

    Attributes:
        valid_from: Date when the record became active.
        valid_until: Date when the record was superseded. `9999-12-31` for currently active records.
        active_flag: `True` for the currently active version of a record.
        delete_flag: `True` if the record was deleted in the source.
        row_hash: SHA-256 hash of non-key columns, used for change detection.
        insert_date: Timestamp when this record version was written to the target table.
        latest_record_flag: `True` for the most recent record per business key.
            Only present when `enable_latest_record_flag=True`.

    Example:
        ```python
        from loadx import SCD2ColumnNames

        SCD2ColumnNames(valid_from="eff_start_date", valid_until="eff_end_date")
        ```
    """

    valid_from: str = "valid_from"
    valid_until: str = "valid_until"
    active_flag: str = "active_flag"
    delete_flag: str = "delete_flag"
    row_hash: str = "row_hash"
    insert_date: str = "insert_date"
    latest_record_flag: str = "latest_record_flag"

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> SCD2ColumnNames:
        """Build an instance from a mapping, dropping keys that are not fields.

        Args:
            data: Mapping of field name to the column name to use for it.
                Entries whose key is not a field of this class are ignored.

        Returns:
            A new instance; fields missing from `data` keep their defaults.
        """
        known = frozenset(spec.name for spec in fields(cls))
        return cls(**{name: value for name, value in data.items() if name in known})

    def field_list(self) -> list[str]:
        """Return the attribute name of every field, in the order `fields()` reports them."""
        specs = fields(self)
        return [spec.name for spec in specs]

    def column_list(self) -> list[str]:
        """Return the configured output column name of every field (renames applied)."""
        specs = fields(self)
        return [getattr(self, spec.name) for spec in specs]

from_dict(data: dict[str, Any]) -> SCD2ColumnNames classmethod

Create SCD2ColumnNames instance from dictionary, filtering valid fields.

Source code in loadx/scd2/config.py
66
67
68
69
70
71
@classmethod
def from_dict(cls, data: dict[str, Any]) -> SCD2ColumnNames:
    """Build an instance from a mapping, dropping keys that are not fields.

    Args:
        data: Mapping of field name to the column name to use for it.
            Entries whose key is not a field of this class are ignored.

    Returns:
        A new instance; fields missing from `data` keep their defaults.
    """
    known = frozenset(spec.name for spec in fields(cls))
    return cls(**{name: value for name, value in data.items() if name in known})

field_list() -> list[str]

Get list of all field attribute names.

Source code in loadx/scd2/config.py
73
74
75
def field_list(self) -> list[str]:
    """Return the attribute name of every field, in the order `fields()` reports them."""
    specs = fields(self)
    return [spec.name for spec in specs]

column_list() -> list[str]

Get list of all actual output column names (respects user-defined renames).

Source code in loadx/scd2/config.py
77
78
79
def column_list(self) -> list[str]:
    """Return the configured output column name of every field (renames applied)."""
    specs = fields(self)
    return [getattr(self, spec.name) for spec in specs]

SCD2Config dataclass

Configuration for SCD2 processing.

Source code in loadx/scd2/config.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
@dataclass
class SCD2Config:
    """Configuration for SCD2 processing.

    Use `SCD2Config.create()` when inputs may need coercion (a single business
    key given as a string, or `scd_columns` given as a plain dict).

    Attributes:
        business_keys: Column(s) that uniquely identify a dimension row.
        date_column: Column containing the snapshot date.
        ignore_columns: Columns excluded from hash-based change detection.
            `None` is replaced with an empty list after initialisation.
        non_copy_fields: Source columns excluded from the output DataFrame.
            `None` is replaced with an empty list after initialisation.
        open_end_date: Value written to `valid_until` for currently active
            records.
        scd_columns: Names used for the SCD2 output columns.
        enable_latest_record_flag: When `True`, adds a `latest_record_flag`
            column that is `True` for the most recent record per business key.
        source_type: Whether the source is a full snapshot or an incremental
            feed.
    """

    business_keys: list[str]
    date_column: str = DEFAULT_DATE_COLUMN
    ignore_columns: list[str] | None = None
    non_copy_fields: list[str] | None = None
    open_end_date: datetime | None = OPEN_END_DATE
    scd_columns: SCD2ColumnNames = field(default_factory=SCD2ColumnNames)
    enable_latest_record_flag: bool = False
    source_type: SourceType = SourceType.FULL

    def __post_init__(self) -> None:
        """Replace `None` in the optional list fields with a fresh empty list."""
        if self.ignore_columns is None:
            self.ignore_columns = []
        if self.non_copy_fields is None:
            self.non_copy_fields = []

    @classmethod
    def create(
        cls,
        business_keys: list[str] | str,
        date_column: str = DEFAULT_DATE_COLUMN,
        ignore_columns: list[str] | None = None,
        non_copy_fields: list[str] | None = None,
        open_end_date: datetime | None = OPEN_END_DATE,
        scd_columns: SCD2ColumnNames | dict[str, str] | None = None,
        enable_latest_record_flag: bool = False,
        source_type: SourceType = SourceType.FULL,
    ) -> SCD2Config:
        """Create an SCD2Config instance with coercion and defaults applied.

        Args:
            business_keys: Column(s) that uniquely identify a dimension row.
                A single string is automatically wrapped in a list.
            date_column: Column containing the snapshot date.
            ignore_columns: Columns excluded from hash-based change detection.
            non_copy_fields: Source columns excluded from the output DataFrame.
            open_end_date: Value written to `valid_until` for currently active
                records. Defaults to `9999-12-31`.
            scd_columns: Override default SCD2 output column names. Accepts an
                `SCD2ColumnNames` instance or a plain dict with any subset of keys:
                `valid_from`, `valid_until`, `active_flag`, `delete_flag`, `row_hash`,
                `insert_date`, `latest_record_flag`. Dict keys that are not
                `SCD2ColumnNames` fields are ignored.
            enable_latest_record_flag: When `True`, adds a `latest_record_flag` column
                that is `True` for the most recent record per business key.
            source_type: Whether the source is a ``"full"`` snapshot or
                ``"incremental"`` feed. Delete-flag detection is only performed
                for ``"full"`` sources; the ``delete_flag`` column is omitted
                entirely for ``"incremental"`` sources.

        Returns:
            A configuration instance of the class `create` was called on.
        """
        if isinstance(business_keys, str):
            business_keys = [business_keys]

        resolved_scd_columns: SCD2ColumnNames
        if isinstance(scd_columns, dict):
            resolved_scd_columns = SCD2ColumnNames.from_dict(scd_columns)
        elif scd_columns is None:
            resolved_scd_columns = SCD2ColumnNames()
        else:
            resolved_scd_columns = scd_columns

        # Build through `cls` so that calling `create` on a subclass returns
        # an instance of that subclass rather than of the base class.
        return cls(
            business_keys=business_keys,
            date_column=date_column,
            ignore_columns=ignore_columns or [],
            non_copy_fields=non_copy_fields or [],
            open_end_date=open_end_date,
            scd_columns=resolved_scd_columns,
            enable_latest_record_flag=enable_latest_record_flag,
            source_type=source_type,
        )

__post_init__() -> None

Initialize default values for optional fields.

Source code in loadx/scd2/config.py
 95
 96
 97
 98
 99
100
def __post_init__(self) -> None:
    """Replace `None` in the optional list fields with a fresh empty list."""
    for attr_name in ("ignore_columns", "non_copy_fields"):
        if getattr(self, attr_name) is None:
            setattr(self, attr_name, [])

create(business_keys: list[str] | str, date_column: str = DEFAULT_DATE_COLUMN, ignore_columns: list[str] | None = None, non_copy_fields: list[str] | None = None, open_end_date: datetime | None = OPEN_END_DATE, scd_columns: SCD2ColumnNames | dict[str, str] | None = None, enable_latest_record_flag: bool = False, source_type: SourceType = SourceType.FULL) -> SCD2Config classmethod

Create an SCD2Config instance with coercion and defaults applied.

Parameters:

Name Type Description Default
business_keys list[str] | str

Column(s) that uniquely identify a dimension row. A single string is automatically wrapped in a list.

required
date_column str

Column containing the snapshot date.

DEFAULT_DATE_COLUMN
ignore_columns list[str] | None

Columns excluded from hash-based change detection.

None
non_copy_fields list[str] | None

Source columns excluded from the output DataFrame.

None
open_end_date datetime | None

Value written to valid_until for currently active records. Defaults to 9999-12-31.

OPEN_END_DATE
scd_columns SCD2ColumnNames | dict[str, str] | None

Override default SCD2 output column names. Accepts an SCD2ColumnNames instance or a plain dict with any subset of keys: valid_from, valid_until, active_flag, delete_flag, row_hash, insert_date, latest_record_flag. Dict keys that are not SCD2ColumnNames fields are ignored.

None
enable_latest_record_flag bool

When True, adds a latest_record_flag column that is True for the most recent record per business key.

False
source_type SourceType

Whether the source is a "full" snapshot or "incremental" feed. Delete-flag detection is only performed for "full" sources; the delete_flag column is omitted entirely for "incremental" sources.

FULL
Source code in loadx/scd2/config.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
@classmethod
def create(
    cls,
    business_keys: list[str] | str,
    date_column: str = DEFAULT_DATE_COLUMN,
    ignore_columns: list[str] | None = None,
    non_copy_fields: list[str] | None = None,
    open_end_date: datetime | None = OPEN_END_DATE,
    scd_columns: SCD2ColumnNames | dict[str, str] | None = None,
    enable_latest_record_flag: bool = False,
    source_type: SourceType = SourceType.FULL,
) -> SCD2Config:
    """Create an SCD2Config instance with coercion and defaults applied.

    Args:
        business_keys: Column(s) that uniquely identify a dimension row.
            A single string is automatically wrapped in a list.
        date_column: Column containing the snapshot date.
        ignore_columns: Columns excluded from hash-based change detection.
        non_copy_fields: Source columns excluded from the output DataFrame.
        open_end_date: Value written to `valid_until` for currently active
            records. Defaults to `9999-12-31`.
        scd_columns: Override default SCD2 output column names. Accepts an
            `SCD2ColumnNames` instance or a plain dict with any subset of keys:
            `valid_from`, `valid_until`, `active_flag`, `delete_flag`, `row_hash`,
            `insert_date`, `latest_record_flag`. Dict keys that are not
            `SCD2ColumnNames` fields are ignored.
        enable_latest_record_flag: When `True`, adds a `latest_record_flag` column
            that is `True` for the most recent record per business key.
        source_type: Whether the source is a ``"full"`` snapshot or
            ``"incremental"`` feed. Delete-flag detection is only performed
            for ``"full"`` sources; the ``delete_flag`` column is omitted
            entirely for ``"incremental"`` sources.

    Returns:
        A configuration instance of the class `create` was called on.
    """
    if isinstance(business_keys, str):
        business_keys = [business_keys]

    resolved_scd_columns: SCD2ColumnNames
    if isinstance(scd_columns, dict):
        resolved_scd_columns = SCD2ColumnNames.from_dict(scd_columns)
    elif scd_columns is None:
        resolved_scd_columns = SCD2ColumnNames()
    else:
        resolved_scd_columns = scd_columns

    # Build through `cls` so that calling `create` on a subclass returns
    # an instance of that subclass rather than of the base class.
    return cls(
        business_keys=business_keys,
        date_column=date_column,
        ignore_columns=ignore_columns or [],
        non_copy_fields=non_copy_fields or [],
        open_end_date=open_end_date,
        scd_columns=resolved_scd_columns,
        enable_latest_record_flag=enable_latest_record_flag,
        source_type=source_type,
    )