Source code for fairlens.sensitive.detection

"""
Identify legally protected attributes in a dataset.
"""

import json
import os
import pathlib
from difflib import SequenceMatcher
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import pandas as pd

DEFAULT_PATH = "./configs/config_engb.json"

attr_synonym_dict: Dict[str, List[str]] = {}
attr_value_dict: Dict[str, List[str]] = {}


[docs]def detect_names_df(
    df: Union[pd.DataFrame, List[str]],
    threshold: float = 0.1,
    str_distance: Callable[[Optional[str], Optional[str]], float] = None,
    deep_search: bool = False,
    n_samples: int = 20,
    config_path: Union[str, pathlib.Path] = None,
) -> Dict[str, Optional[str]]:
    """Detects the sensitive columns in a dataframe or string list and creates a
    dictionary which maps the attribute names to the corresponding sensitive
    category name (such as Gender, Religion etc). The option to deep search can
    be enabled in the case of dataframes, which looks at the values in the tables
    and infers sensitive categories, even when the column name is inconclusive.

    Args:
        df (Union[pd.DataFrame, List[str]]):
            Pandas dataframe or string list that will be analysed.
        threshold (float, optional):
            The threshold for the string distance function. Defaults to 0.1.
        str_distance (Callable[[Optional[str], Optional[str]], float], optional):
            The string distance function. Defaults to Ratcliff-Obershelp algorithm.
        deep_search (bool, optional):
            The boolean flag that enables deep search when set to true. Deep search
            also makes use of the content of the column to check if it is sensitive.
        n_samples (int, optional):
            The number of values to be sampled from series of large datasets when using
            the deep search algorithm. A low sample number will greatly improve speed and
            still produce accurate results, assuming that the underlying dictionaries are
            comprehensive.
        config_path (Union[str, pathlib.Path], optional)
            The path of the JSON configuration file in which the dictionaries used for
            detecting sensitive attributes are defined. By default, the configuration
            is the one describing protected attributes and groups according to the
            UK Government.
    Returns:
        Dict[str, Optional[str]]:
            A dictionary containing a mapping from attribute names to a string representing the corresponding
            sensitive attribute category or None.
    Examples:
        >>> detect_names_dict_dataframe(["age", "gender", "legality", "risk"])
        {"age": "Age", "gender": "Gender"}
        >>> col_names = ["native", "location", "house", "subscription", "salary", "religion", "score"]
        >>> df = pd.DataFrame(columns=col_names)
        >>> detect_names_dict_dataframe(df)
        {"native": "Nationality", "location": "Nationality", "house": "Family Status", "religion": "Religion"}
    """

    if config_path:
        attr_synonym_dict, attr_value_dict = load_config(config_path)
    else:
        attr_synonym_dict, attr_value_dict = load_config()

    str_distance = str_distance or _ro_distance

    if isinstance(df, list):
        cols = df
        return _detect_names_dict(
            cols, threshold=threshold, str_distance=str_distance, attr_synonym_dict=attr_synonym_dict
        )
    else:
        cols = df.columns

    sensitive_dict = _detect_names_dict(cols, threshold, str_distance, attr_synonym_dict)

    if deep_search:

        for col in cols:
            # Series containing only the unique values of the analyzed column.
            uniques = pd.Series(df[col].unique())
            # If the series are larger than the provided n_samples, we take a sample to increase speed.
            if uniques.size > n_samples:
                column = uniques.sample(n=n_samples)
            else:
                column = uniques

            group_name = _deep_search(column, threshold, str_distance, attr_value_dict)

            if group_name is not None:
                sensitive_dict[col] = group_name

    return sensitive_dict


[docs]def load_config(config_path: Union[str, pathlib.Path] = DEFAULT_PATH) -> Tuple[Any, Any]:
    """Changes the configuration that creates the underlying synonym and possible value dictionaries
    on which the shallow and deep search algorithms for sensitive attributes are based.

    Args:
        config_path (Union[str, pathlib.Path], optional):
            The path of the JSON file containing the configuration.
            Defaults to fairlens/sensitive/configs/config_engb.json.

    Returns:
        Tuple[Any, Any]:
            Returns a tuple containing the synonym and value dictionaries in a format readable by the
            main detection function.
    """

    if config_path == DEFAULT_PATH:
        config_path = os.path.join(os.path.dirname(__file__), DEFAULT_PATH)

    with open(config_path) as json_file:
        config_dict = json.load(json_file)

    return config_dict["synonyms"], config_dict["values"]


def _ro_distance(s1: Optional[str], s2: Optional[str]) -> float:
    """Computes a distance between the input strings using the Ratcliff-Obershelp algorithm."""
    if s1 is None or s2 is None:
        return 1

    if not isinstance(s1, str) or not isinstance(s2, str):
        return 1

    return 1 - SequenceMatcher(None, s1.lower(), s2.lower()).ratio()


def _detect_name(
    name: str,
    threshold: float = 0.1,
    str_distance: Callable[[Optional[str], Optional[str]], float] = None,
    attr_synonym_dict: Dict[str, List[str]] = None,
) -> Optional[str]:
    """Detects whether a given attribute is sensitive and returns the corresponding sensitive group.

    Args:
        name (str):
            The name of the attribute.
        threshold (float, optional):
            The threshold for the string distance function. Defaults to 0.1.
        str_distance (Callable[[str, str], float], optional):
            The string distance function. Defaults to Ratcliff-Obershelp algorithm.
        attr_synonym_dict (Dict[str, List[str]]):
            The dictionary of sensitive category synonyms that is used for the shallow search.
            If none is passed, it defaults to the configuration describing protected attributes
            and groups according to the UK Government.

    Returns:
        Optional[str]:
            The sensitive name corresponding to the input.
    """

    if attr_synonym_dict is None:
        attr_synonym_dict, _ = load_config()

    str_distance = str_distance or _ro_distance

    name = name.lower()

    # Check exact match
    for group_name, attrs in attr_synonym_dict.items():
        for attr in attrs:
            if name == attr:
                return group_name

    # Check startswith / endswith
    for group_name, attrs in attr_synonym_dict.items():
        separator = " ,.-:"
        for attr in attrs:
            if name.startswith(attr.lower() + "|".join(separator)) or name.endswith("|".join(separator) + attr.lower()):
                return group_name

    # Check distance < threshold
    for group_name, attrs in attr_synonym_dict.items():
        for attr in attrs:
            dist = str_distance(name, attr)
            if dist < threshold:
                return group_name

    return None


def _detect_names_dict(
    names: List[str],
    threshold: float = 0.1,
    str_distance: Callable[[Optional[str], Optional[str]], float] = None,
    attr_synonym_dict: Dict[str, List[str]] = None,
) -> Dict[str, Optional[str]]:
    """Creates a dictionary which maps the attribute names to the corresponding sensitive attribute.

    Args:
        names (List[str]):
            List of attribute names.
        threshold (float, optional):
            The threshold for the string distance function. Defaults to 0.1.
        str_distance (Callable[[str, str], float], optional):
            The string distance function. Defaults to Ratcliff-Obershelp algorithm.
        attr_synonym_dict (Dict[str, List[str]]):
            The dictionary of sensitive category synonyms that is used for the shallow search.
            If none is passed, it defaults to the configuration describing protected attributes
            and groups according to the UK Government.

    Returns:
        Dict[str, Optional[str]]:
            A dictionary containing a mapping from attribute names to a string representing the corresponding
            sensitive attribute category or None.

    Examples:
        >>> _detect_names_dict(["age", "gender", "legality", "risk"])
        {"age": "Age", "gender": "Gender", "legality": None, "risk": None}
    """

    if attr_synonym_dict is None:
        attr_synonym_dict, _ = load_config()

    names_dict = dict()

    for name in names:
        names_dict[name] = _detect_name(
            name, threshold=threshold, str_distance=str_distance, attr_synonym_dict=attr_synonym_dict
        )

    # Remove columns with 'None' values.
    for key, value in dict(names_dict).items():
        if value is None:
            del names_dict[key]

    return names_dict


def _deep_search(
    s: pd.Series,
    threshold: float = 0.1,
    str_distance: Callable[[Optional[str], Optional[str]], float] = None,
    attr_value_dict: Dict[str, List[str]] = None,
) -> Optional[str]:
    if attr_value_dict is None:
        _, attr_value_dict = load_config()

    # Avoid checking number values as they can be inconclusive.
    if s.dtype.kind in ["i", "f", "m", "M"]:
        return None

    str_distance = str_distance or _ro_distance

    # Coarse grain search to check if there is an exact match to avoid mismatches.
    for group_name, values in attr_value_dict.items():
        # Skip sensitive groups that do not have defined possible values.
        if not values:
            continue
        if s.isin(values).mean() > 0.2:
            return group_name

    for group_name, values in attr_value_dict.items():
        if not values:
            continue
        pattern = "|".join(values)
        if s.str.contains(pattern).mean() > 0.6:
            return group_name

    # Fine grain search that will catch edge cases.
    for group_name, values in attr_value_dict.items():
        for value in values:
            if s.map(lambda x: str_distance(x, value) < threshold).mean() > 0.1:
                return group_name
    return None