# Source code for fairdiverse.search.preprocessing_model.utils
import numpy as np
import pandas as pd
import os
# [docs]
def compute_dynamic_values_to_code(data, sensitive_attributes):
    """Assign a sequential integer code (starting at 1) to every distinct
    group value found across the given sensitive attribute columns.

    Attributes are visited in sorted order, and within each attribute the
    group values are visited in sorted order, so the coding is deterministic.
    A value that appears under more than one attribute keeps its first code.

    :param data: DataFrame-like object whose columns include the sensitive attributes.
    :param sensitive_attributes: iterable of sensitive column names.
    :return: dict mapping each group value to its integer code.
    """
    codes = {}
    for attribute in sorted(sensitive_attributes):
        # sorted(set(...)) fixes the visit order for reproducible codes
        for group_value in sorted(set(data[attribute].unique())):
            if group_value not in codes:
                codes[group_value] = len(codes) + 1
    return codes
# [docs]
def process_data_input(data, configs, dataset):
    """Prepare a dataset matrix with integer-coded sensitive attributes.

    NOTE: mutates ``data`` in place by adding one ``<attr>_coded`` column per
    sensitive attribute.

    :param data: DataFrame holding feature, score, and sensitive columns.
    :param configs: dict; may contain "unprivileged_groups", "privileged_groups"
        (mapping attribute -> list of group values) and "group_weights"
        (mapping group value -> weight) — assumed schema, confirm against callers.
    :param dataset: object exposing ``feature_cols``, ``score_col`` and
        ``sensitive_cols`` attributes.
    :return: tuple of (numpy data matrix, group_weights dict or None,
        sensitive_groups dict or None, sensitive column indices,
        non-sensitive column indices).
    """
    # Columns that will form the output matrix (coded columns appended below).
    features_col = dataset.feature_cols + [dataset.score_col]
    if "unprivileged_groups" in configs:
        # Sensitive attributes are the union of both configured group mappings.
        sensitive_attributes = set(configs["unprivileged_groups"]) | set(configs["privileged_groups"])
        value_to_code = compute_dynamic_values_to_code(data, sensitive_attributes)
        unprivileged_groups, privileged_groups, group_weights = {}, {}, {}
        # Encode sensitive attributes
        for s_attribute in sensitive_attributes:
            coded_attribute = s_attribute + '_coded'
            features_col.append(coded_attribute)
            # In-place mutation: adds the coded column to the caller's DataFrame.
            data[coded_attribute] = data[s_attribute].map(value_to_code)
            # Translate configured group values to their integer codes;
            # .get(..., []) tolerates attributes missing from one of the mappings.
            unprivileged_groups[s_attribute] = [value_to_code[group] for group in
                                                configs["unprivileged_groups"].get(s_attribute, [])]
            privileged_groups[s_attribute] = [value_to_code[group] for group in
                                              configs["privileged_groups"].get(s_attribute, [])]
            # Update group weights
            for group in value_to_code:
                # Only weight groups that actually occur in this column; a group
                # absent from configs['group_weights'] defaults to weight 0.
                if group in data[s_attribute].unique() and value_to_code[group] not in group_weights:
                    group_weights[value_to_code[group]] = configs['group_weights'].get(group, 0)
        # Store sensitive groups
        sensitive_groups = {"unprivileged_groups": unprivileged_groups, "privileged_groups": privileged_groups}
    else:
        # No group configuration supplied: fall back to the dataset's declared
        # sensitive columns and skip weights/group bookkeeping.
        sensitive_attributes = dataset.sensitive_cols
        value_to_code = compute_dynamic_values_to_code(data, sensitive_attributes)
        for s_attribute in sensitive_attributes:
            coded_attribute = s_attribute + '_coded'
            features_col.append(coded_attribute)
            data[coded_attribute] = data[s_attribute].map(value_to_code)
        group_weights = None
        sensitive_groups = None
    # Identify indices of sensitive columns
    sensitive_column_indices = [
        list(data[features_col].columns).index(s_attribute + "_coded") for s_attribute in sensitive_attributes
    ]
    nonsensitive_column_indices = [index for index in range(0, len(features_col)) if
                                   index not in sensitive_column_indices]
    data_processed = data[features_col].to_numpy()
    return data_processed, group_weights, sensitive_groups, sensitive_column_indices, nonsensitive_column_indices
# [docs]
def process_data_output(data_orig, data_fair, dataset, nonsensitive_column_indices, fair_data_path, file_name=None):
    """Assemble the fair (non-sensitive) feature matrix into a DataFrame and
    re-attach the original columns; optionally persist it as a CSV.

    :param data_orig: original DataFrame whose columns are copied back in.
    :param data_fair: sequence of row arrays (stacked with np.vstack); column
        order must match ``dataset.feature_cols + [dataset.score_col]``.
    :param dataset: object exposing ``feature_cols`` and ``score_col``.
    :param nonsensitive_column_indices: indices of the non-sensitive columns to keep.
    :param fair_data_path: directory for the CSV output (created if needed).
    :param file_name: when not None, write ``fair_<file_name>_data.csv``.
    :return: DataFrame with ``<col>_fair`` columns plus all original columns.
    """
    # Combine relevant columns from configs
    features_col = dataset.feature_cols + [dataset.score_col]
    # Convert data_fair to a numpy array and select relevant columns
    data_fair = np.vstack(data_fair)
    fair_features_col = [f'{col}_fair' for col in features_col]
    selected_fair_features_col = [fair_features_col[i] for i in nonsensitive_column_indices]
    # Create DataFrame for fair data with selected columns
    data_fair = pd.DataFrame(data_fair[:, nonsensitive_column_indices], columns=selected_fair_features_col)
    # Add original data columns to the fair data
    # (bug fix: the assignment was previously duplicated, executing twice per column)
    for col in data_orig.columns:
        data_fair[col] = data_orig[col].values
    if file_name is not None:  # idiom fix: identity check against None
        os.makedirs(fair_data_path, exist_ok=True)
        data_fair.to_csv(os.path.join(fair_data_path, f'fair_{file_name}_data.csv'))
    return data_fair
# [docs]
def save_model_data(model, path):
    """Persist the model's optimized parameters to ``<path>/model_parmas.npy``.

    NOTE(review): 'model_parmas' looks like a typo for 'model_params', but the
    name is kept unchanged because load_model_data reads the same filename.

    :param model: object exposing an ``opt_params`` array-like attribute.
    :param path: existing directory to write into.
    """
    target = os.path.join(path, 'model_parmas.npy')
    with open(target, 'wb') as handle:
        np.save(handle, model.opt_params)
# [docs]
def load_model_data(path):
    """Load the parameter array written by save_model_data.

    :param path: directory containing ``model_parmas.npy``.
    :return: the stored numpy array.
    """
    source = os.path.join(path, 'model_parmas.npy')
    with open(source, 'rb') as handle:
        return np.load(handle)