PK {Tn ! plot_quickstart_selection_rate.py# Copyright (c) Microsoft Corporation and Fairlearn contributors.
# Licensed under the MIT License.
"""
=================================
Selection rates in census dataset
=================================
"""
# %%
from fairlearn.metrics import MetricFrame, selection_rate
from fairlearn.datasets import fetch_adult
data = fetch_adult(as_frame=True)
X = data.data
y_true = (data.target == '>50K') * 1
sex = X['sex']
selection_rates = MetricFrame(selection_rate,
y_true, y_true,
sensitive_features=sex)
fig = selection_rates.by_group.plot.bar(
legend=False, rot=0,
title='Fraction earning over $50,000')
PK {Tq.I$ $ plot_grid_search_census.py# Copyright (c) Microsoft Corporation and Fairlearn contributors.
# Licensed under the MIT License.
"""
===========================
GridSearch with Census Data
===========================
"""
# %%
# This notebook shows how to use Fairlearn and the Fairness dashboard to generate predictors
# for the Census dataset.
# This dataset is a classification problem - given a range of data about 32,000 individuals,
# predict whether their annual income is above or below fifty thousand dollars per year.
#
# For the purposes of this notebook, we shall treat this as a loan decision problem.
# We will pretend that the label indicates whether or not each individual repaid a loan in
# the past.
# We will use the data to train a predictor to predict whether previously unseen individuals
# will repay a loan or not.
# The assumption is that the model predictions are used to decide whether an individual
# should be offered a loan.
#
# We will first train a fairness-unaware predictor and show that it leads to unfair
# decisions under a specific notion of fairness called *demographic parity*.
# We then mitigate unfairness by applying the :code:`GridSearch` algorithm from the
# Fairlearn package.
# %%
# Load and preprocess the data set
# --------------------------------
# We download the data set using `fetch_adult` function in `fairlearn.datasets`.
# We start by importing the various modules we're going to use:
#
# .. note::
#
# The :code:`FairlearnDashboard` is no longer being developed as
# part of Fairlearn.
# The widget itself has been moved to
# `the raiwidgets package `_.
# Fairlearn will provide some of the existing functionality
# through :code:`matplotlib`-based visualizations.
from fairlearn.widget import FairlearnDashboard
from sklearn.model_selection import train_test_split
from fairlearn.reductions import GridSearch
from fairlearn.reductions import DemographicParity, ErrorRate
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd
# %%
# We can now load and inspect the data by using the `fairlearn.datasets` module:
from sklearn.datasets import fetch_openml
data = fetch_openml(data_id=1590, as_frame=True)
X_raw = data.data
Y = (data.target == '>50K') * 1
X_raw
# %%
# We are going to treat the sex of each individual as a sensitive
# feature (where 0 indicates female and 1 indicates male), and in
# this particular case we are going separate this feature out and drop it
# from the main data.
# We then perform some standard data preprocessing steps to convert the
# data into a format suitable for the ML algorithms
A = X_raw["sex"]
X = X_raw.drop(labels=['sex'], axis=1)
X = pd.get_dummies(X)
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
le = LabelEncoder()
Y = le.fit_transform(Y)
# %%
# Finally, we split the data into training and test sets:
X_train, X_test, Y_train, Y_test, A_train, A_test = train_test_split(X_scaled,
Y,
A,
test_size=0.2,
random_state=0,
stratify=Y)
# Work around indexing bug
X_train = X_train.reset_index(drop=True)
A_train = A_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
A_test = A_test.reset_index(drop=True)
# %%
# Training a fairness-unaware predictor
# -------------------------------------
#
# To show the effect of Fairlearn we will first train a standard ML predictor
# that does not incorporate fairness.
# For speed of demonstration, we use the simple
# :class:`sklearn.linear_model.LogisticRegression` class:
unmitigated_predictor = LogisticRegression(solver='liblinear', fit_intercept=True)
unmitigated_predictor.fit(X_train, Y_train)
# %%
# We can load this predictor into the Fairness dashboard, and assess its fairness:
FairlearnDashboard(sensitive_features=A_test, sensitive_feature_names=['sex'],
y_true=Y_test,
y_pred={"unmitigated": unmitigated_predictor.predict(X_test)})
# %%
# Looking at the disparity in accuracy, we see that males have an error
# about three times greater than the females.
# More interesting is the disparity in opportunity - males are offered loans at
# three times the rate of females.
#
# Despite the fact that we removed the feature from the training data, our
# predictor still discriminates based on sex.
# This demonstrates that simply ignoring a sensitive feature when fitting a
# predictor rarely eliminates unfairness.
# There will generally be enough other features correlated with the removed
# feature to lead to disparate impact.
# %%
# Mitigation with GridSearch
# --------------------------
#
# The :class:`fairlearn.reductions.GridSearch` class implements a simplified version of the
# exponentiated gradient reduction of `Agarwal et al. 2018 `_.
# The user supplies a standard ML estimator, which is treated as a blackbox.
# `GridSearch` works by generating a sequence of relabellings and reweightings, and
# trains a predictor for each.
#
# For this example, we specify demographic parity (on the sensitive feature of sex) as
# the fairness metric.
# Demographic parity requires that individuals are offered the opportunity (are approved
# for a loan in this example) independent of membership in the sensitive class (i.e., females
# and males should be offered loans at the same rate).
# We are using this metric for the sake of simplicity; in general, the appropriate fairness
# metric will not be obvious.
sweep = GridSearch(LogisticRegression(solver='liblinear', fit_intercept=True),
constraints=DemographicParity(),
grid_size=71)
# %%
# Our algorithms provide :code:`fit()` and :code:`predict()` methods, so they behave in a similar manner
# to other ML packages in Python.
# We do however have to specify two extra arguments to :code:`fit()` - the column of sensitive
# feature labels, and also the number of predictors to generate in our sweep.
#
# After :code:`fit()` completes, we extract the full set of predictors from the
# :class:`fairlearn.reductions.GridSearch` object.
sweep.fit(X_train, Y_train,
sensitive_features=A_train)
predictors = sweep.predictors_
# %%
# We could load these predictors into the Fairness dashboard now.
# However, the plot would be somewhat confusing due to their number.
# In this case, we are going to remove the predictors which are dominated in the
# error-disparity space by others from the sweep (note that the disparity will only be
# calculated for the sensitive feature; other potentially sensitive features will
# not be mitigated).
# In general, one might not want to do this, since there may be other considerations
# beyond the strict optimization of error and disparity (of the given sensitive feature).
errors, disparities = [], []
for m in predictors:
def classifier(X): return m.predict(X)
error = ErrorRate()
error.load_data(X_train, pd.Series(Y_train), sensitive_features=A_train)
disparity = DemographicParity()
disparity.load_data(X_train, pd.Series(Y_train), sensitive_features=A_train)
errors.append(error.gamma(classifier)[0])
disparities.append(disparity.gamma(classifier).max())
all_results = pd.DataFrame({"predictor": predictors, "error": errors, "disparity": disparities})
non_dominated = []
for row in all_results.itertuples():
errors_for_lower_or_eq_disparity = all_results["error"][all_results["disparity"] <= row.disparity]
if row.error <= errors_for_lower_or_eq_disparity.min():
non_dominated.append(row.predictor)
# %%
# Finally, we can put the dominant models into the Fairness dashboard, along with the
# unmitigated model.
dashboard_predicted = {"unmitigated": unmitigated_predictor.predict(X_test)}
for i in range(len(non_dominated)):
key = "dominant_model_{0}".format(i)
value = non_dominated[i].predict(X_test)
dashboard_predicted[key] = value
FairlearnDashboard(sensitive_features=A_test, sensitive_feature_names=['sex'],
y_true=Y_test,
y_pred=dashboard_predicted)
# %%
# We see a Pareto front forming - the set of predictors which represent optimal tradeoffs
# between accuracy and disparity in predictions.
# In the ideal case, we would have a predictor at (1,0) - perfectly accurate and without
# any unfairness under demographic parity (with respect to the sensitive feature "sex").
# The Pareto front represents the closest we can come to this ideal based on our data and
# choice of estimator.
# Note the range of the axes - the disparity axis covers more values than the accuracy,
# so we can reduce disparity substantially for a small loss in accuracy.
#
# By clicking on individual models on the plot, we can inspect their metrics for disparity
# and accuracy in greater detail.
# In a real example, we would then pick the model which represented the best trade-off
# between accuracy and disparity given the relevant business constraints.
PK V|TqN! ! plot_make_derived_metric.py# Copyright (c) Microsoft Corporation and Fairlearn contributors.
# Licensed under the MIT License.
"""
======================
Making Derived Metrics
======================
"""
# %%
# This notebook demonstrates the use of the :func:`fairlearn.metrics.make_derived_metric`
# function.
# Many higher-order machine learning algorithms (such as hyperparameter tuners) make use
# of scalar metrics when deciding how to proceed.
# While the :class:`fairlearn.metrics.MetricFrame` has the ability to produce such
# scalars through its aggregation functions, its API does not conform to that usually
# expected by these algorithms.
# The :func:`~fairlearn.metrics.make_derived_metric` function exists to bridge this gap.
#
# Getting the Data
# ================
#
# *This section may be skipped. It simply creates a dataset for
# illustrative purposes*
#
# We will use the well-known UCI 'Adult' dataset as the basis of this
# demonstration. This is not for a lending scenario, but we will regard
# it as one for the purposes of this example. We will use the existing
# 'race' and 'sex' columns (trimming the former to three unique values),
# and manufacture credit score bands and loan sizes from other columns.
# We start with some uncontroversial `import` statements:
import functools
import numpy as np
import sklearn.metrics as skm
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import Pipeline
from fairlearn.metrics import MetricFrame, make_derived_metric
from fairlearn.metrics import accuracy_score_group_min
# %%
# Next, we import the data, dropping any rows which are missing data:
data = fetch_openml(data_id=1590, as_frame=True)
X_raw = data.data
y = (data.target == ">50K") * 1
A = X_raw[["race", "sex"]]
# %%
# We are now going to preprocess the data. Before applying any transforms,
# we first split the data into train and test sets. All the transforms we
# apply will be trained on the training set, and then applied to the test
# set. This ensures that data doesn't leak between the two sets (this is
# a serious but subtle
# `problem in machine learning `_).
# So, first we split the data:
(X_train, X_test, y_train, y_test, A_train, A_test) = train_test_split(
X_raw, y, A, test_size=0.3, random_state=12345, stratify=y
)
# Ensure indices are aligned between X, y and A,
# after all the slicing and splitting of DataFrames
# and Series
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
A_train = A_train.reset_index(drop=True)
A_test = A_test.reset_index(drop=True)
# %%
# Next, we build two :class:`~sklearn.pipeline.Pipeline` objects
# to process the columns, one for numeric data, and the other
# for categorical data. Both impute missing values; the difference
# is whether the data are scaled (numeric columns) or
# one-hot encoded (categorical columns). Imputation of missing
# values should generally be done with care, since it could
# potentially introduce biases. Of course, removing rows with
# missing data could also cause trouble, if particular subgroups
# have poorer data quality.
numeric_transformer = Pipeline(
steps=[
("impute", SimpleImputer()),
("scaler", StandardScaler()),
]
)
categorical_transformer = Pipeline(
[
("impute", SimpleImputer(strategy="most_frequent")),
("ohe", OneHotEncoder(handle_unknown="ignore")),
]
)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, selector(dtype_exclude="category")),
("cat", categorical_transformer, selector(dtype_include="category")),
]
)
# %%
# With our preprocessor defined, we can now build a
# new pipeline which includes an Estimator:
unmitigated_predictor = Pipeline(
steps=[
("preprocessor", preprocessor),
(
"classifier",
LogisticRegression(solver="liblinear", fit_intercept=True),
),
]
)
# %%
# With the pipeline fully defined, we can first train it
# with the training data, and then generate predictions
# from the test data.
unmitigated_predictor.fit(X_train, y_train)
y_pred = unmitigated_predictor.predict(X_test)
# %%
# Creating a derived metric
# =========================
#
# Suppose our key metric is the accuracy score, and we are most interested in
# ensuring that it exceeds some threshold for all subgroups
# We might use the :class:`~fairlearn.metrics.MetricFrame` as
# follows:
acc_frame = MetricFrame(
skm.accuracy_score, y_test, y_pred, sensitive_features=A_test["sex"]
)
print("Minimum accuracy_score: ", acc_frame.group_min())
# %%
# We can create a function to perform this in a single call
# using :func:`~fairlearn.metrics.make_derived_metric`.
# This takes the following arguments (which must always be
# supplied as keyword arguments):
#
# - :code:`metric=`, the base metric function
# - :code:`transform=`, the name of the aggregation
# transformation to perform. For this demonstration, we
# want this to be :code:`'group_min'`
# - :code:`sample_param_names=`, a list of parameter names
# which should be treated as sample
# parameters. This is optional, and defaults to
# :code:`['sample_weight']` which is appropriate for many
# metrics in `scikit-learn`.
#
# The result is a new function with the same signature as the
# base metric, which accepts two extra arguments:
#
# - :code:`sensitive_features=` to specify the sensitive features
# which define the subgroups
# - :code:`method=` to adjust how the aggregation transformation
# operates. This corresponds to the same argument in
# :meth:`fairlearn.metrics.MetricFrame.difference` and
# :meth:`fairlearn.metrics.MetricFrame.ratio`
#
# For the current case, we do not need the :code:`method=`
# argument, since we are taking the minimum value.
my_acc = make_derived_metric(metric=skm.accuracy_score, transform="group_min")
my_acc_min = my_acc(y_test, y_pred, sensitive_features=A_test["sex"])
print("Minimum accuracy_score: ", my_acc_min)
# %%
# To show that the returned function also works with sample weights:
random_weights = np.random.rand(len(y_test))
acc_frame_sw = MetricFrame(
skm.accuracy_score,
y_test,
y_pred,
sensitive_features=A_test["sex"],
sample_params={"sample_weight": random_weights},
)
from_frame = acc_frame_sw.group_min()
from_func = my_acc(
y_test,
y_pred,
sensitive_features=A_test["sex"],
sample_weight=random_weights,
)
print("From MetricFrame:", from_frame)
print("From function :", from_func)
# %%
# The returned function can also handle parameters which are not sample
# parameters. Consider :func:`sklearn.metrics.fbeta_score`, which
# has a required :code:`beta=` argument (and suppose that this time
# we are most interested in the maximum difference to the overall value).
# First we evaluate this with a :class:`fairlearn.metrics.MetricFrame`:
fbeta_03 = functools.partial(skm.fbeta_score, beta=0.3)
fbeta_03.__name__ = "fbeta_score__beta_0.3"
beta_frame = MetricFrame(
fbeta_03,
y_test,
y_pred,
sensitive_features=A_test["sex"],
sample_params={"sample_weight": random_weights},
)
beta_from_frame = beta_frame.difference(method="to_overall")
print("From frame:", beta_from_frame)
# %%
# And next, we create a function to evaluate the same. Note that
# we do not need to use :func:`functools.partial` to bind the
# :code:`beta=` argument:
beta_func = make_derived_metric(metric=skm.fbeta_score, transform="difference")
beta_from_func = beta_func(
y_test,
y_pred,
sensitive_features=A_test["sex"],
beta=0.3,
sample_weight=random_weights,
method="to_overall",
)
print("From function:", beta_from_func)
# %%
# Pregenerated Metrics
# ====================
#
# We provide a number of pregenerated metrics, to cover
# common use cases. For example, we provide a
# :code:`accuracy_score_group_min()` function to
# find the minimum over the accuracy scores:
from_myacc = my_acc(y_test, y_pred, sensitive_features=A_test["race"])
from_pregen = accuracy_score_group_min(
y_test, y_pred, sensitive_features=A_test["race"]
)
print("From my function :", from_myacc)
print("From pregenerated:", from_pregen)
assert from_myacc == from_pregen
PK X|T.d$H $H plot_new_metrics.py# Copyright (c) Microsoft Corporation and Fairlearn contributors.
# Licensed under the MIT License.
"""
==============================
Metrics with Multiple Features
==============================
"""
# %%
# This notebook demonstrates the new API for metrics, which supports
# multiple sensitive and conditional features. This example does not
# contain a proper discussion of how fairness relates to the dataset
# used, although it does highlight issues which users may want to
# consider when analysing their datasets.
#
# We are going to consider a lending scenario, supposing that we have
# a model which predicts whether or not a particular customer will
# repay a loan. This could be used as the basis of deciding whether
# or not to offer that customer a loan. With traditional metrics,
# we would assess the model using:
#
# - The 'true' values from the test set
# - The model predictions from the test set
#
# Our fairness metrics compute group-based fairness statistics.
# To use these, we also need categorical columns from the test
# set. For this example, we will include:
#
# - The sex of each individual (two unique values)
# - The race of each individual (three unique values)
# - The credit score band of each individual (three unique values)
# - Whether the loan is considered 'large' or 'small'
#
# An individual's sex and race should not affect a lending decision,
# but it would be legitimate to consider an individual's credit score
# and the relative size of the loan which they desired.
#
# A real scenario will be more complicated, but this will serve to
# illustrate the use of the new metrics.
#
# Getting the Data
# ================
#
# *This section may be skipped. It simply creates a dataset for
# illustrative purposes*
#
# We will use the well-known UCI 'Adult' dataset as the basis of this
# demonstration. This is not for a lending scenario, but we will regard
# it as one for the purposes of this example. We will use the existing
# 'race' and 'sex' columns (trimming the former to three unique values),
# and manufacture credit score bands and loan sizes from other columns.
# We start with some uncontroversial `import` statements:
import functools
import numpy as np
import sklearn.metrics as skm
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import Pipeline
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import selection_rate
# %%
# Next, we import the data:
data = fetch_openml(data_id=1590, as_frame=True)
X_raw = data.data
y = (data.target == '>50K') * 1
# %%
# For purposes of clarity, we consolidate the 'race' column to have
# three unique values:
def race_transform(input_str):
"""Reduce values to White, Black and Other."""
result = 'Other'
if input_str == 'White' or input_str == 'Black':
result = input_str
return result
X_raw['race'] = X_raw['race'].map(race_transform).fillna('Other').astype('category')
print(np.unique(X_raw['race']))
# %%
# Now, we manufacture the columns for the credit score band and
# requested loan size. These are wholly constructed, and not
# part of the actual dataset in any way. They are simply for
# illustrative purposes.
def marriage_transform(m_s_string):
"""Perform some simple manipulations."""
result = 'Low'
if m_s_string.startswith("Married"):
result = 'Medium'
elif m_s_string.startswith("Widowed"):
result = 'High'
return result
def occupation_transform(occ_string):
"""Perform some simple manipulations."""
result = 'Small'
if occ_string.startswith("Machine"):
result = 'Large'
return result
col_credit = X_raw['marital-status'].map(marriage_transform).fillna('Low')
col_credit.name = "Credit Score"
col_loan_size = X_raw['occupation'].map(occupation_transform).fillna('Small')
col_loan_size.name = "Loan Size"
A = X_raw[['race', 'sex']]
A['Credit Score'] = col_credit
A['Loan Size'] = col_loan_size
A
# %%
# Now that we have imported our dataset and manufactured a few features, we
# can perform some more conventional processing. To avoid the problem of
# `data leakage `_,
# we need to split the data into training and test sets before applying
# any transforms or scaling:
(X_train, X_test, y_train, y_test, A_train, A_test) = train_test_split(
X_raw, y, A, test_size=0.3, random_state=54321, stratify=y
)
# Ensure indices are aligned between X, y and A,
# after all the slicing and splitting of DataFrames
# and Series
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
A_train = A_train.reset_index(drop=True)
A_test = A_test.reset_index(drop=True)
# %%
# Next, we build two :class:`~sklearn.pipeline.Pipeline` objects
# to process the columns, one for numeric data, and the other
# for categorical data. Both impute missing values; the difference
# is whether the data are scaled (numeric columns) or
# one-hot encoded (categorical columns). Imputation of missing
# values should generally be done with care, since it could
# potentially introduce biases. Of course, removing rows with
# missing data could also cause trouble, if particular subgroups
# have poorer data quality.
numeric_transformer = Pipeline(
steps=[
("impute", SimpleImputer()),
("scaler", StandardScaler()),
]
)
categorical_transformer = Pipeline(
[
("impute", SimpleImputer(strategy="most_frequent")),
("ohe", OneHotEncoder(handle_unknown="ignore")),
]
)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, selector(dtype_exclude="category")),
("cat", categorical_transformer, selector(dtype_include="category")),
]
)
# %%
# With our preprocessor defined, we can now build a
# new pipeline which includes an Estimator:
unmitigated_predictor = Pipeline(
steps=[
("preprocessor", preprocessor),
(
"classifier",
LogisticRegression(solver="liblinear", fit_intercept=True),
),
]
)
# %%
# With the pipeline fully defined, we can first train it
# with the training data, and then generate predictions
# from the test data.
unmitigated_predictor.fit(X_train, y_train)
y_pred = unmitigated_predictor.predict(X_test)
# %%
# Analysing the Model with Metrics
# ================================
#
# After our data manipulations and model training, we have the following
# from our test set:
#
# - A vector of true values called ``y_test``
# - A vector of model predictions called ``y_pred``
# - A DataFrame of categorical features relevant to fairness called ``A_test``
#
# In a traditional model analysis, we would now look at some metrics
# evaluated on the entire dataset. Suppose in this case, the relevant
# metrics are :func:`fairlearn.metrics.selection_rate` and
# :func:`sklearn.metrics.fbeta_score` (with
# ``beta=0.6``).
# We can evaluate these metrics directly:
print("Selection Rate:", selection_rate(y_test, y_pred))
print("fbeta:", skm.fbeta_score(y_test, y_pred, beta=0.6))
# %%
# We know that there are sensitive features in our data, and we want to
# ensure that we're not harming individuals due to membership in any of
# these groups. For this purpose, Fairlearn provides the
# :class:`fairlearn.metrics.MetricFrame`
# class. Let us construct an instance of this class, and then look at
# its capabilities:
fbeta_06 = functools.partial(skm.fbeta_score, beta=0.6)
metric_fns = {'selection_rate': selection_rate, 'fbeta_06': fbeta_06}
grouped_on_sex = MetricFrame(metric_fns,
y_test, y_pred,
sensitive_features=A_test['sex'])
# %%
# The :class:`fairlearn.metrics.MetricFrame` object requires a
# minimum of four arguments:
#
# 1. The underlying metric function(s) to be evaluated
# 2. The true values
# 3. The predicted values
# 4. The sensitive feature values
#
# These are all passed as arguments to the constructor. If more than
# one underlying metric is required (as in this case), then we must
# provide them in a dictionary.
#
# The underlying metrics must have a signature ``fn(y_true, y_pred)``,
# so we have to use :func:`functools.partial` on ``fbeta_score()`` to
# furnish ``beta=0.6`` (we will show how to pass in extra array
# arguments such as sample weights shortly).
#
# We will now take a closer look at the :class:`fairlearn.metrics.MetricFrame`
# object. First, there is the ``overall`` property, which contains
# the metrics evaluated on the entire dataset. We see that this contains the
# same values calculated above:
assert grouped_on_sex.overall['selection_rate'] == selection_rate(y_test, y_pred)
assert grouped_on_sex.overall['fbeta_06'] == skm.fbeta_score(y_test, y_pred, beta=0.6)
print(grouped_on_sex.overall)
# %%
# The other property in the :class:`fairlearn.metrics.MetricFrame` object
# is ``by_group``. This contains the metrics evaluated on each subgroup defined
# by the categories in the ``sensitive_features=`` argument. In this case, we
# have results for males and females:
grouped_on_sex.by_group
# %%
# We can immediately see a substantial disparity in the selection rate between
# males and females.
#
# We can also create another :class:`fairlearn.metrics.MetricFrame` object
# using race as the sensitive feature:
grouped_on_race = MetricFrame(metric_fns,
y_test, y_pred,
sensitive_features=A_test['race'])
# %%
# The ``overall`` property is unchanged:
assert (grouped_on_sex.overall == grouped_on_race.overall).all()
# %%
# The ``by_group`` property now contains the metrics evaluated based on the 'race'
# column:
grouped_on_race.by_group
# %%
# We see that there is also a significant disparity in selection rates when
# grouping by race.
# %%
# Sample weights and other arrays
# -------------------------------
#
# We noted above that the underlying metric functions passed to the
# :class:`fairlearn.metrics.MetricFrame` constructor need to be of
# the form ``fn(y_true, y_pred)`` - we do not support scalar arguments
# such as ``pos_label=`` or ``beta=`` in the constructor. Such
# arguments should be bound into a new function using
# :func:`functools.partial`, and the result passed in. However, we do
# support arguments which have one entry for each sample, with an array
# of sample weights being the most common example. These are divided
# into subgroups along with ``y_true`` and ``y_pred``, and passed along
# to the underlying metric.
#
# To use these arguments, we pass in a dictionary as the ``sample_params=``
# argument of the constructor. Let us generate some random weights, and
# pass these along:
random_weights = np.random.rand(len(y_test))
example_sample_params = {
'selection_rate': {'sample_weight': random_weights},
'fbeta_06': {'sample_weight': random_weights},
}
grouped_with_weights = MetricFrame(metric_fns,
y_test, y_pred,
sensitive_features=A_test['sex'],
sample_params=example_sample_params)
# %%
# We can inspect the overall values, and check they are as expected:
assert grouped_with_weights.overall['selection_rate'] == \
selection_rate(y_test, y_pred, sample_weight=random_weights)
assert grouped_with_weights.overall['fbeta_06'] == \
skm.fbeta_score(y_test, y_pred, beta=0.6, sample_weight=random_weights)
print(grouped_with_weights.overall)
# %%
# We can also see the effect on the metric being evaluated on the subgroups:
grouped_with_weights.by_group
# %%
# Quantifying Disparities
# =======================
#
# We now know that our model is selecting individuals who are female far less
# often than individuals who are male. There is a similar effect when
# examining the results by race, with blacks being selected far less often than
# whites (and those classified as 'other'). However, there are many cases where
# presenting all these numbers at once will not be useful (for example, a high
# level dashboard which is monitoring model performance). Fairlearn provides
# several means of aggregating metrics across the subgroups, so that disparities
# can be readily quantified.
#
# The simplest of these aggregations is ``group_min()``, which reports the
# minimum value seen for a subgroup for each underlying metric (we also provide
# ``group_max()``). This is
# useful if there is a mandate that "no subgroup should have an ``fbeta_score()``
# of less than 0.6." We can evaluate the minimum values easily:
grouped_on_race.group_min()
# %%
# As noted above, the selection rates varies greatly by race and by sex.
# This can be quantified in terms of a difference between the subgroup with
# the highest value of the metric, and the subgroup with the lowest value.
# For this, we provide the method ``difference(method='between_groups)``:
grouped_on_race.difference(method='between_groups')
# %%
# We can also evaluate the difference relative to the corresponding overall
# value of the metric. In this case we take the absolute value, so that the
# result is always positive:
grouped_on_race.difference(method='to_overall')
# %%
# There are situations where knowing the ratios of the metrics evaluated on
# the subgroups is more useful. For this we have the ``ratio()`` method.
# We can take the ratios between the minimum and maximum values of each metric:
grouped_on_race.ratio(method='between_groups')
# %%
# We can also compute the ratios relative to the overall value for each
# metric. Analogous to the differences, the ratios are always in the range
# :math:`[0,1]`:
grouped_on_race.ratio(method='to_overall')
# %%
# Intersections of Features
# =========================
#
# So far we have only considered a single sensitive feature at a time,
# and we have already found some serious issues in our example data.
# However, sometimes serious issues can be hiding in intersections of
# features. For example, the
# `Gender Shades project `_
# found that facial recognition algorithms performed worse for blacks
# than whites, and also worse for women than men (despite overall high
# accuracy score). Moreover, performance on black females was *terrible*.
# We can examine the intersections of sensitive features by passing
# multiple columns to the :class:`fairlearn.metrics.MetricFrame`
# constructor:
grouped_on_race_and_sex = MetricFrame(metric_fns,
y_test, y_pred,
sensitive_features=A_test[['race', 'sex']])
# %%
# The overall values are unchanged, but the ``by_group`` table now
# shows the intersections between subgroups:
assert (grouped_on_race_and_sex.overall == grouped_on_race.overall).all()
grouped_on_race_and_sex.by_group
# %%
# The aggregations are still performed across all subgroups for each metric,
# so each continues to reduce to a single value. If we look at the
# ``group_min()``, we see that we violate the mandate we specified for the
# ``fbeta_score()`` suggested above (for females with a race of 'Other' in
# fact):
grouped_on_race_and_sex.group_min()
# %%
# Looking at the ``ratio()`` method, we see that the disparity is worse
# (specifically between white males and black females, if we check in
# the ``by_group`` table):
grouped_on_race_and_sex.ratio(method='between_groups')
# %%
# Control Features
# ================
#
# There is a further way we can slice up our data. We have (*completely
# made up*) features for the individuals' credit scores (in three bands)
# and also the size of the loan requested (large or small). In our loan
# scenario, it is acceptable that individuals with high credit scores
# are selected more often than individuals with low credit scores.
# However, within each credit score band, we do not want a disparity
# between (say) black females and white males. To example these cases,
# we have the concept of *control features*.
#
# Control features are introduced by the ``control_features=``
# argument to the :class:`fairlearn.metrics.MetricFrame` object:
cond_credit_score = MetricFrame(metric_fns,
y_test, y_pred,
sensitive_features=A_test[['race', 'sex']],
control_features=A_test['Credit Score'])
# %%
# This has an immediate effect on the ``overall`` property. Instead
# of having one value for each metric, we now have a value for each
# unique value of the control feature:
cond_credit_score.overall
# %%
# The ``by_group`` property is similarly expanded:
cond_credit_score.by_group
# %%
# The aggregates are also evaluated once for each group identified
# by the control feature:
cond_credit_score.group_min()
# %%
# And:
cond_credit_score.ratio(method='between_groups')
# %%
# In our data, we see that we have a dearth of positive results
# for high income non-whites, which significantly affects the
# aggregates.
#
# We can continue adding more control features:
cond_both = MetricFrame(metric_fns,
y_test, y_pred,
sensitive_features=A_test[['race', 'sex']],
control_features=A_test[['Loan Size', 'Credit Score']])
# %%
# The ``overall`` property now splits into more values:
cond_both.overall
# %%
# As does the ``by_groups`` property, where ``NaN`` values
# indicate that there were no samples in the cell:
cond_both.by_group
# %%
# The aggregates behave similarly. By this point, we are having significant issues
# with under-populated intersections. Consider:
def member_counts(y_true, y_pred):
assert len(y_true) == len(y_pred)
return len(y_true)
counts = MetricFrame(member_counts,
y_test, y_pred,
sensitive_features=A_test[['race', 'sex']],
control_features=A_test[['Loan Size', 'Credit Score']])
counts.by_group
# %%
# Recall that ``NaN`` indicates that there were no individuals
# in a cell - ``member_counts()`` will not even have been called.
PK {Tn ! plot_quickstart_selection_rate.pyPK {Tq.I$ $ plot_grid_search_census.pyPK V|TqN! ! ' plot_make_derived_metric.pyPK X|T.d$H $H I plot_new_metrics.pyPK !