# Source code for fairlens.sensitive.correlation

"""
Find correlations between protected columns and non-protected columns.
"""

import pathlib
from typing import Callable, Dict, List, Optional, Tuple, Union

import pandas as pd

from ..metrics import correlation as cm
from ..sensitive import detection as dt


def find_sensitive_correlations(
    df: pd.DataFrame,
    threshold: float = 0.1,
    str_distance: Callable[[Optional[str], Optional[str]], float] = None,
    corr_cutoff: float = 0.75,
    p_cutoff: float = 0.1,
    config_path: Union[str, pathlib.Path] = None,
) -> Dict[str, List[Tuple[str, Optional[str]]]]:
    """Map every non-sensitive column to the sensitive columns it is correlated with.

    Sensitive columns are first detected in the dataframe; every remaining column is
    then compared against each detected sensitive column, and the pairs judged to be
    correlated are collected.

    Args:
        df (pd.DataFrame):
            Pandas dataframe that will be analyzed.
        threshold (float, optional):
            The threshold for the string distance function that will be used for
            detecting sensitive columns. Defaults to 0.1.
        str_distance (Callable[[Optional[str], Optional[str]], float], optional):
            The string distance function that will be used for detecting sensitive
            columns. Defaults to Ratcliff-Obershelp algorithm.
        corr_cutoff (float, optional):
            The cutoff for considering a column to be correlated with a sensitive
            attribute. Defaults to 0.75.
        p_cutoff (float, optional):
            The p-value cutoff to be used when checking if a categorical column is
            correlated with a numeric column using the Kruskal-Wallis H Test.
            Defaults to 0.1.
        config_path (Union[str, pathlib.Path], optional):
            The path of the JSON configuration file in which the dictionaries used for
            detecting sensitive attributes are defined. By default, the configuration
            is the one describing protected attributes and groups according to the
            UK Government.

    Returns:
        Dict[str, List[Tuple[str, Optional[str]]]]:
            A dictionary keyed by non-sensitive column name. Each value is a list of
            ``(sensitive column name, sensitive attribute category)`` tuples, one per
            correlated sensitive column. Columns without any correlated sensitive
            column are left out of the dictionary.
    """
    if not str_distance:
        str_distance = dt._ro_distance

    detected = dt.detect_names_df(
        df, threshold=threshold, str_distance=str_distance, deep_search=True, config_path=config_path
    )

    result = {}
    for candidate in df.columns.difference(detected):
        candidate_series = df[candidate]
        matches = [
            (sensitive_name, category)
            for sensitive_name, category in detected.items()
            if _compute_series_correlation(candidate_series, df[sensitive_name], corr_cutoff, p_cutoff)
        ]
        # Only columns with at least one correlated sensitive column are reported.
        if matches:
            result[candidate] = matches

    return result
def find_column_correlation(
    col: Union[str, pd.Series],
    df: pd.DataFrame,
    threshold: float = 0.1,
    str_distance: Callable[[Optional[str], Optional[str]], float] = None,
    corr_cutoff: float = 0.75,
    p_cutoff: float = 0.1,
    config_path: Union[str, pathlib.Path] = None,
) -> List[Tuple[str, Optional[str]]]:
    """Find the sensitive columns of a dataframe that are correlated with a given column.

    The input is either a series or the name of a column of ``df``. Each sensitive
    column detected in ``df`` is compared against it, and the ones judged to be
    correlated are returned together with their sensitive category.

    Args:
        col (Union[str, pd.Series]):
            Pandas series or dataframe column name that will be analyzed.
        df (pd.DataFrame):
            Dataframe supporting the search, possibly already containing a column
            with the input name.
        threshold (float, optional):
            The threshold for the string distance function that will be used for
            detecting sensitive columns. Defaults to 0.1.
        str_distance (Callable[[Optional[str], Optional[str]], float], optional):
            The string distance function that will be used for detecting sensitive
            columns. Defaults to Ratcliff-Obershelp algorithm.
        corr_cutoff (float, optional):
            The cutoff for considering a column to be correlated with a sensitive
            attribute. Defaults to 0.75.
        p_cutoff (float, optional):
            The p-value cutoff to be used when checking if a categorical column is
            correlated with a numeric column using the Kruskal-Wallis H Test.
            Defaults to 0.1.
        config_path (Union[str, pathlib.Path], optional):
            The path of the JSON configuration file in which the dictionaries used for
            detecting sensitive attributes are defined. By default, the configuration
            is the one describing protected attributes and groups according to the
            UK Government.

    Returns:
        List[Tuple[str, Optional[str]]]:
            A list of ``(sensitive column name, sensitive attribute category)`` tuples
            for every correlated sensitive column that was found. The list is empty
            when there is no match.

    Raises:
        ValueError: If ``col`` is a string that is not a column name of ``df``.
    """
    # Resolve the input first so that an unknown column name is reported before the
    # sensitive-column detection is run.
    if isinstance(col, str):
        if col not in df.columns:
            raise ValueError("The given dataframe does not contain a column with this name.")
        col1 = df[col]
    else:
        col1 = col

    str_distance = str_distance or dt._ro_distance

    sensitive_dict = dt.detect_names_df(
        df, threshold=threshold, str_distance=str_distance, deep_search=True, config_path=config_path
    )

    return [
        (sensitive_col, category)
        for sensitive_col, category in sensitive_dict.items()
        if _compute_series_correlation(col1, df[sensitive_col], corr_cutoff, p_cutoff)
    ]
def _compute_series_correlation(
    sr_a: pd.Series, sr_b: pd.Series, corr_cutoff: float = 0.75, p_cutoff: float = 0.1
) -> bool:
    """Decide whether two series are correlated, picking the test from the column kinds.

    A series is treated as categorical when every one of its values is a ``str``;
    otherwise it is treated as numeric.

    Args:
        sr_a (pd.Series):
            First series.
        sr_b (pd.Series):
            Second series.
        corr_cutoff (float, optional):
            Value that Cramer's V (two categorical series) or Pearson's correlation
            (two numeric series) must exceed. Defaults to 0.75.
        p_cutoff (float, optional):
            The p-value cutoff passed to the Kruskal-Wallis H Test when exactly one
            series is categorical. Defaults to 0.1.

    Returns:
        bool: True if the series are considered correlated, False otherwise.
    """
    a_categorical = sr_a.map(type).eq(str).all()
    b_categorical = sr_b.map(type).eq(str).all()

    if a_categorical and b_categorical:
        # If both columns are categorical, we use Cramer's V. The verdict is returned
        # directly: falling through would hand string data to Pearson's correlation.
        return cm.cramers_v(sr_a, sr_b) > corr_cutoff

    # If just one column is categorical, we can group by it and use Kruskal-Wallis H Test.
    # The categorical series is always passed first.
    if b_categorical:
        return cm.kruskal_wallis_boolean(sr_b, sr_a, p_cutoff=p_cutoff)

    if a_categorical:
        return cm.kruskal_wallis_boolean(sr_a, sr_b, p_cutoff=p_cutoff)

    # If both columns are numeric, we use standard Pearson correlation and the correlation cutoff.
    return cm.pearson(sr_a, sr_b) > corr_cutoff