# Source code for fairlens.sensitive.correlation

"""
Find correlations between protected columns and non-protected columns.
"""

import pathlib
from typing import Callable, Dict, List, Optional, Tuple, Union

import pandas as pd

from ..metrics import correlation as cm
from ..sensitive import detection as dt


def find_sensitive_correlations(
    df: pd.DataFrame,
    threshold: float = 0.1,
    str_distance: Optional[Callable[[Optional[str], Optional[str]], float]] = None,
    corr_cutoff: float = 0.75,
    p_cutoff: float = 0.1,
    config_path: Optional[Union[str, pathlib.Path]] = None,
) -> Dict[str, List[Tuple[str, Optional[str]]]]:
    """Find non-sensitive columns that are strongly correlated with sensitive columns.

    The sensitive columns of the dataframe are detected first; every remaining column is then compared
    against each sensitive column, and the sensitive columns it correlates with are reported together
    with the sensitive category they belong to.

    Args:
        df (pd.DataFrame):
            Pandas dataframe that will be analyzed.
        threshold (float, optional):
            The threshold for the string distance function that will be used for detecting sensitive columns.
            Defaults to 0.1.
        str_distance (Callable[[Optional[str], Optional[str]], float], optional):
            The string distance function that will be used for detecting sensitive columns.
            Defaults to Ratcliff-Obershelp algorithm.
        corr_cutoff (float, optional):
            The cutoff above which a correlation score marks a column as correlated with a sensitive
            attribute. Defaults to 0.75.
        p_cutoff (float, optional):
            The p-value cutoff to be used when checking if a categorical column is correlated with a numeric column
            using the Kruskal-Wallis H Test. Defaults to 0.1.
        config_path (Union[str, pathlib.Path], optional):
            The path of the JSON configuration file in which the dictionaries used for
            detecting sensitive attributes are defined. By default, the configuration
            is the one describing protected attributes and groups according to the
            UK Government.

    Returns:
        Dict[str, List[Tuple[str, Optional[str]]]]:
            A dictionary keyed by non-sensitive column name. Each value is a list of tuples, where the first
            entry is the name of a correlated sensitive column in the dataframe and the second entry is the
            sensitive attribute category of that column. Columns without any correlated sensitive column
            are omitted.
    """
    str_distance = str_distance or dt._ro_distance

    sensitive_dict = dt.detect_names_df(
        df, threshold=threshold, str_distance=str_distance, deep_search=True, config_path=config_path
    )

    correlation_dict = {}

    # Only columns that were not themselves flagged as sensitive are candidates.
    for non_sensitive_col in df.columns.difference(sensitive_dict):
        candidate = df[non_sensitive_col]

        matches = [
            (sensitive_col, category)
            for sensitive_col, category in sensitive_dict.items()
            if _compute_series_correlation(candidate, df[sensitive_col], corr_cutoff, p_cutoff)
        ]

        # Keep the result sparse: columns with no correlated sensitive column get no entry.
        if matches:
            correlation_dict[non_sensitive_col] = matches

    return correlation_dict


def find_column_correlation(
    col: Union[str, pd.Series],
    df: pd.DataFrame,
    threshold: float = 0.1,
    str_distance: Optional[Callable[[Optional[str], Optional[str]], float]] = None,
    corr_cutoff: float = 0.75,
    p_cutoff: float = 0.1,
    config_path: Optional[Union[str, pathlib.Path]] = None,
) -> List[Tuple[str, Optional[str]]]:
    """Find the sensitive columns of a dataframe that are strongly correlated with a given column.

    The input is either a series or the name of a column of the dataframe. It is compared against every
    sensitive column detected in the dataframe, and each correlated sensitive column is returned together
    with its associated sensitive category.

    Args:
        col (Union[str, pd.Series]):
            Pandas series or dataframe column name that will be analyzed.
        df (pd.DataFrame):
            Dataframe supporting the search, possibly already containing a column with the input name.
        threshold (float, optional):
            The threshold for the string distance function that will be used for detecting sensitive columns.
            Defaults to 0.1.
        str_distance (Callable[[Optional[str], Optional[str]], float], optional):
            The string distance function that will be used for detecting sensitive columns.
            Defaults to Ratcliff-Obershelp algorithm.
        corr_cutoff (float, optional):
            The cutoff above which a correlation score marks a column as correlated with a sensitive
            attribute. Defaults to 0.75.
        p_cutoff (float, optional):
            The p-value cutoff to be used when checking if a categorical column is correlated with a numeric column
            using the Kruskal-Wallis H Test. Defaults to 0.1.
        config_path (Union[str, pathlib.Path], optional):
            The path of the JSON configuration file in which the dictionaries used for
            detecting sensitive attributes are defined. By default, the configuration
            is the one describing protected attributes and groups according to the
            UK Government.

    Returns:
        List[Tuple[str, Optional[str]]]:
            A list containing one tuple per correlated sensitive column that was found: the column name
            followed by its associated sensitive category label. The list is empty when nothing correlates.

    Raises:
        ValueError:
            If ``col`` is a string that is not the name of a column in ``df``.
    """
    # Resolve the input first so that an invalid column name fails fast, before the
    # deep-search detection of sensitive columns is run on the dataframe.
    if isinstance(col, str):
        if col not in df.columns:
            raise ValueError("The given dataframe does not contain a column with this name.")
        target = df[col]
    else:
        target = col

    str_distance = str_distance or dt._ro_distance

    sensitive_dict = dt.detect_names_df(
        df, threshold=threshold, str_distance=str_distance, deep_search=True, config_path=config_path
    )

    return [
        (sensitive_col, category)
        for sensitive_col, category in sensitive_dict.items()
        if _compute_series_correlation(target, df[sensitive_col], corr_cutoff, p_cutoff)
    ]


def _compute_series_correlation(
    sr_a: pd.Series, sr_b: pd.Series, corr_cutoff: float = 0.75, p_cutoff: float = 0.1
) -> bool:
    """Decide whether two series are correlated, picking the test from their types.

    A series is treated as categorical when every one of its elements is a ``str``; otherwise it is
    treated as numeric.

    Args:
        sr_a (pd.Series):
            First series.
        sr_b (pd.Series):
            Second series.
        corr_cutoff (float, optional):
            Score that Cramer's V (both categorical) or Pearson's correlation (both numeric) must exceed.
            Defaults to 0.75.
        p_cutoff (float, optional):
            The p-value cutoff passed to the Kruskal-Wallis H Test, used when exactly one series is
            categorical. Defaults to 0.1.

    Returns:
        bool:
            Whether the selected test considers the two series correlated.
    """
    a_categorical = sr_a.map(type).eq(str).all()
    b_categorical = sr_b.map(type).eq(str).all()

    if a_categorical and b_categorical:
        # If both columns are categorical, we use Cramer's V. The verdict is returned directly so that
        # a score below the cutoff never falls through to Pearson, which is meant for numeric data.
        return cm.cramers_v(sr_a, sr_b) > corr_cutoff

    # If just one column is categorical, we can group by it and use Kruskal-Wallis H Test.
    # The categorical series is always passed as the first argument.
    if a_categorical:
        return cm.kruskal_wallis_boolean(sr_a, sr_b, p_cutoff=p_cutoff)
    if b_categorical:
        return cm.kruskal_wallis_boolean(sr_b, sr_a, p_cutoff=p_cutoff)

    # If both columns are numeric, we use standard Pearson correlation and the correlation cutoff.
    return cm.pearson(sr_a, sr_b) > corr_cutoff