Source code for fairdiverse.search.preprocessing_model.LFR

"""Learning fair representations is a pre-processing technique that finds a
    latent representation which encodes the data well but obfuscates information
    about protected attributes [2]_.
    References:
        .. [2] R. Zemel, Y. Wu, K. Swersky, T. Pitassi, and C. Dwork,  "Learning
           Fair Representations." International Conference on Machine Learning,
           2013.
    Based on code from https://github.com/zjelveh/learning-fair-representations
    """

import numpy as np
import pandas as pd
import scipy.optimize as optim
import os
from .modules.LFR.loss import LFR_optimisation as LFR_func
from .modules.probabilistic_mapping_helpers import compute_X_hat
from .utils import process_data_input, process_data_output, save_model_data, load_model_data
from .fair_model import PreprocessingFairnessIntervention

class LFR(PreprocessingFairnessIntervention):
    """
    Learning Fair Representations (LFR) fairness intervention.

    This class applies the LFR approach to modify the dataset such that
    fairness constraints are met while preserving as much utility as possible.
    """

    def __init__(self, configs, dataset):
        """
        Initialize the LFR model with the given configurations and dataset.

        :param configs : dict
            Configuration dictionary containing model parameters.
        :param dataset : str
            The dataset to be processed.
        """
        super().__init__(configs, dataset)

    def fit(self, X_train, run):
        """
        Train the LFR fairness model using the given training dataset.

        The parameters are found by minimizing the LFR objective (``LFR_func``)
        with SciPy's L-BFGS-B routine using numerically approximated gradients.
        If a path for ``run`` already exists under ``self.model_path``, no
        optimization is performed and the stored parameters are loaded instead.

        Reads the config keys ``seed``, ``k``, ``A_x``, ``A_y``, ``A_z``,
        ``biggest_gap``, ``maxfun`` and ``maxiter``.

        :param X_train : pandas.DataFrame or numpy.ndarray
            The training dataset. It is assumed that the last non-sensitive
            column is the target variable.
        :param run : str
            The identifier for the training run.

        :return : self
            The trained LFR model.
        """
        run_model_path = os.path.join(self.model_path, run)

        if os.path.exists(run_model_path):
            # A model for this run was already stored: reuse it.
            # NOTE(review): only ``opt_params`` is restored on this path;
            # ``self.w`` and ``self.prototypes`` are not set here -- confirm
            # that no caller relies on them after a cached load.
            self.opt_params = load_model_data(run_model_path)
            return self

        (X_train, group_weights, sensitive_groups,
         sensitive_column_indices, nonsensitive_column_indices) = (
            process_data_input(X_train, self.configs, self.dataset))

        if self.configs["seed"] is not None:
            np.random.seed(self.configs["seed"])

        # The target is the last non-sensitive *column*: restrict to the
        # non-sensitive columns, then take the final column for every row.
        # (Indexing with a bare ``[-1]`` would pick the last row instead.)
        Y_train = X_train[:, nonsensitive_column_indices][:, -1]

        k = self.configs["k"]
        features_dim = X_train.shape[1]

        # Parameter vector layout: k values followed by a flattened
        # (k, features_dim) prototype matrix.
        parameters_initialization = np.random.uniform(
            size=int(k + features_dim * k))
        # The first k parameters are constrained to [0, 1]; the prototype
        # coordinates are left unbounded.
        bnd = [(0, 1)] * k + [(None, None)] * features_dim * k

        # Reset the step counter stored on the objective function itself.
        LFR_func.steps = 0

        objective_args = (
            X_train, Y_train, sensitive_groups, sensitive_column_indices,
            k, self.configs["A_x"], self.configs["A_y"], self.configs["A_z"],
            group_weights, self.configs["biggest_gap"], run_model_path,
        )
        # fmin_l_bfgs_b returns (x, f, d); only the optimized vector is kept.
        self.opt_params = optim.fmin_l_bfgs_b(
            LFR_func,
            x0=parameters_initialization,
            epsilon=1e-5,
            args=objective_args,
            bounds=bnd,
            approx_grad=True,
            maxfun=self.configs["maxfun"],
            maxiter=self.configs["maxiter"],
            disp=False,
        )[0]

        self.w = self.opt_params[:k]
        self.prototypes = self.opt_params[k:].reshape((k, features_dim))

        save_model_data(self, run_model_path)
        return self

    def transform(self, X, run, file_name=None):
        """
        Apply the fairness transformation to the dataset using the learned model.

        The result is cached on disk: if
        ``<self.fair_data_path>/<run>/fair_<file_name>_data.csv`` already
        exists it is read and returned without recomputing anything.

        :param X : pandas.DataFrame
            The dataset to which the fairness transformation is applied.
        :param run : str
            The identifier for the transformation run.
        :param file_name : str, optional
            Name of the file to save the transformed dataset.

        :return : pandas.DataFrame
            The dataset with transformed fair columns.
        """
        fair_data_path = os.path.join(self.fair_data_path, run)
        os.makedirs(fair_data_path, exist_ok=True)

        # NOTE(review): with the default ``file_name=None`` this formats to
        # "fair_None_data.csv" -- confirm callers always pass a name.
        fair_data_file = os.path.join(fair_data_path, f'fair_{file_name}_data.csv')
        if os.path.exists(fair_data_file):
            return pd.read_csv(fair_data_file)

        # Only the array form of X and the non-sensitive column indices are
        # needed here; the remaining outputs are discarded.
        X_np, _, _, _, nonsensitive_column_indices = (
            process_data_input(X, self.configs, self.dataset))

        X_hat, _ = compute_X_hat(X_np, self.opt_params, self.configs["k"], alpha=False)

        X_fair = process_data_output(X, X_hat, self.dataset, nonsensitive_column_indices,
                                     fair_data_path, file_name)
        return X_fair