Skip to content

Great Expectations in Sidra

Great Expectations is a shared, open standard framework widely used in the industry to ensure data quality.

This tool in its 0.18.3 version, has been included in Sidra allowing to apply data quality check rules. These rules are called Expectations and can be configured from Sidra Web, in the Data Catalog section.

There are two key concepts you need to know regarding Great Expectations in Sidra:

  • Expectations (Validation type): those are the configurable data quality check rules per Entity. For example, one Expectation can be in charge of checking whether a specific column exists, for using it, the user will have to provide the column name in the Expectation configuration.

  • Suites are groups of Expectations to be applied against a specific Asset. The resulting report would match the expectations of the Entity for that Asset. The relationship is one Suite per Entity in Sidra.

Validations Configuration

For the below configurations, make sure that the JSON is strictly formatted.

Validation Name Configuration Type
expect_column_to_exist JSON BatchExpectation
expect_table_columns_to_match_ordered_list JSON BatchExpectation
expect_table_columns_to_match_set JSON BatchExpectation
expect_table_row_count_to_be_between JSON BatchExpectation
expect_column_values_to_not_be_null JSON ColumnMapExpectation
expect_column_values_to_be_null JSON ColumnMapExpectation
expect_column_values_to_not_be_in_set JSON ColumnMapExpectation
expect_column_values_to_be_decreasing JSON ColumnMapExpectation
expect_column_value_lengths_to_be_between JSON ColumnMapExpectation
expect_column_values_to_match_regex JSON ColumnMapExpectation
expect_column_values_to_not_match_regex JSON ColumnMapExpectation
expect_column_values_to_match_regex_list JSON ColumnMapExpectation
expect_column_values_to_not_match_regex_list JSON ColumnMapExpectation
expect_column_values_to_be_json_parseable JSON ColumnMapExpectation
expect_column_values_to_match_json_schema JSON ColumnMapExpectation
expect_column_distinct_values_to_be_in_set JSON ColumnAggregateExpectation
expect_column_distinct_values_to_contain_set JSON ColumnAggregateExpectation
expect_column_distinct_values_to_equal_set JSON ColumnAggregateExpectation
expect_column_mean_to_be_between JSON ColumnAggregateExpectation
expect_column_median_to_be_between JSON ColumnAggregateExpectation
expect_column_quantile_values_to_be_between JSON ColumnAggregateExpectation
expect_column_stdev_to_be_between JSON ColumnAggregateExpectation
expect_column_unique_value_count_to_be_between JSON ColumnAggregateExpectation
expect_column_proportion_of_unique_values_to_be_between JSON ColumnAggregateExpectation
expect_column_most_common_value_to_be_in_set JSON ColumnAggregateExpectation
expect_column_max_to_be_between JSON ColumnAggregateExpectation
expect_column_min_to_be_between JSON ColumnAggregateExpectation
expect_column_pair_values_to_be_equal JSON ColumnPairMapExpectation
expect_column_pair_values_to_be_in_set JSON ColumnPairMapExpectation
expect_select_column_values_to_be_unique_within_record JSON MultiColumnMapExpectation
expect_multicolumn_sum_to_equal JSON MultiColumnMapExpectation
expect_compound_columns_to_be_unique JSON MultiColumnMapExpectation

BatchExpectation type

expect_column_to_exist

{
    "column": "SystemInformationID"
}

expect_table_columns_to_match_ordered_list

{
    "column_list": [
        "SystemInformationID",
        "Database_Version",
        "VersionDate",
        "ModifiedDate",
        "SidraIsDeleted",
        "SidraPassedValidation",
        "SidraLoadDate",
        "SidraIdAsset",
        "SidraFileDate"
    ]
}

expect_table_columns_to_match_set

{
    "column_set": [
        "SystemInformationID",
        "Database_Version",
        "VersionDate",
        "ModifiedDate",
        "SidraIsDeleted",
        "SidraPassedValidation",
        "SidraLoadDate",
        "SidraIdAsset",
        "SidraFileDate"
    ],
    "exact_match": "True"
}

expect_table_row_count_to_be_between

{
    "min_value": 0,
    "max_value": 1
}

ColumnMapExpectation type

expect_column_values_to_not_be_null

{
    "column": "CustomerID"
}

expect_column_values_to_be_null

{
    "column": "DiscontinuedDate"
}

expect_column_values_to_not_be_in_set

{
    "column": "AddressType",
    "value_set": [
        "Home"
    ]
}

expect_column_values_to_be_decreasing

{
    "column": "IdTriggerTemplateType"
}

expect_column_value_lengths_to_be_between

{
    "column": "IdTriggerTemplateType",
    "min_value": 0,
    "max_value": 2
}

expect_column_values_to_match_regex

{
    "column": "SalesPerson",
    "regex": "^adventure-works"
}

expect_column_values_to_not_match_regex

{
    "column": "SalesPerson",
    "regex": "^aventuras-trabajos"
}

expect_column_values_to_match_regex_list

{
    "column": "EmailAddress",
    "regex_list": ["adventure", "dog", "dz0"],
    "match_on": "any"
}

expect_column_values_to_not_match_regex_list

{
    "column": "FirstName",
    "regex_list": ["\\d", "@"]
}

expect_column_values_to_be_json_parseable

{
    "column": "LastExecutionConfiguration"
}

expect_column_values_to_match_json_schema

{
    "column": "LastExecutionConfiguration",
    "json_schema": {}
}

ColumnAggregateExpectation type

expect_column_distinct_values_to_be_in_set

{
    "column": "ParentProductCategoryID",
    "value_set": [1,2,3,4]
}

expect_column_distinct_values_to_contain_set

{
    "column": "ProductCategoryID",
    "value_set": [29,30,39]
}

expect_column_distinct_values_to_equal_set

{
    "column": "CountryRegion",
    "value_set": [
        "Canada",
        "United Kingdom",
        "United States"
    ]
}

expect_column_mean_to_be_between

{
    "column": "ProductCategoryID",
    "min_value": 16,
    "max_value": 17
}

expect_column_median_to_be_between

{
    "column": "ProductCategoryID",
    "min_value": 15,
    "max_value": 17
}

expect_column_quantile_values_to_be_between

{
    "column": "ProductCategoryID",
    "quantile_ranges": {
        "quantiles": [
            0.0,
            0.333,
            0.6667,
            1.0
        ],
        "value_ranges": [
            [
                5.0,
                14.0
            ],
            [
                8.0,
                15.0
            ],
            [
                18.0,
                33.0
            ],
            [
                33.0,
                41.0
            ]
        ]
    }
}

expect_column_stdev_to_be_between

{
    "column": "ProductModelID",
    "min_value": 30,
    "max_value": 40
}

expect_column_unique_value_count_to_be_between

{
    "column": "ProductModelID",
    "min_value": 100,
    "max_value": 200
}

expect_column_proportion_of_unique_values_to_be_between

{
    "column": "ProductModelID",
    "min_value": 0.3,
    "max_value": 0.6
}

expect_column_most_common_value_to_be_in_set

{
    "column": "ModifiedDate",
    "value_set": [
        "2007-06-01 00:00:00"
    ]
}

expect_column_max_to_be_between

{
    "column": "ProductDescriptionID",
    "min_value": 2000,
    "max_value": 2020
}

expect_column_min_to_be_between

{
    "column": "ProductDescriptionID",
    "min_value": 2,
    "max_value": 2020
}

ColumnPairMapExpectation type

expect_column_pair_values_to_be_equal

{
    "column_A": "ShipDate",
    "column_B": "ModifiedDate"
}

expect_column_pair_values_to_be_in_set

{
    "column_A": "RevisionNumber",
    "column_B": "Status",
    "value_pairs_set": [
        [
            1,
            1
        ],
        [
            2,
            5
        ]
    ]
}

MultiColumnMapExpectation type

expect_select_column_values_to_be_unique_within_record

{
    "column_list": [
        "rowguid", 
        "SalesOrderNumber",
        "AccountNumber"
    ]
}

expect_multicolumn_sum_to_equal

{
    "column_list": [
        "RevisionNumber",
        "Status"
    ],
    "sum_total": 7
}

expect_compound_columns_to_be_unique

{
    "column_list": [
        "SalesOrderNumber", 
        "SalesOrderID"
    ]
}