Source code for fairlens.scorer

"""
Automatically generate a fairness report for a dataset.
"""

import logging
from itertools import combinations
from typing import Mapping, Optional, Sequence, Tuple

import pandas as pd

from . import utils
from .metrics.unified import stat_distance
from .plot.distr import mult_distr_plot
from .sensitive.detection import detect_names_df

logger = logging.getLogger(__name__)


class FairnessScorer:
    """This class analyzes a given DataFrame, looks for biases and quantifies fairness."""

    def __init__(
        self,
        df: pd.DataFrame,
        target_attr: str,
        sensitive_attrs: Optional[Sequence[str]] = None,
        detect_sensitive: bool = False,
        distr_type: Optional[str] = None,
        sensitive_distr_types: Optional[Mapping[str, str]] = None,
    ):
        """Fairness Scorer constructor

        Args:
            df (pd.DataFrame):
                Input DataFrame to be scored.
            target_attr (str):
                The target attribute name.
            sensitive_attrs (Optional[Sequence[str]], optional):
                The sensitive attribute names. Defaults to None.
            detect_sensitive (bool, optional):
                Whether to try to detect sensitive attributes from the column names.
                Defaults to False.
            distr_type (Optional[str], optional):
                The type of distribution of the target attribute. Can take values from
                ["categorical", "continuous", "binary", "datetime"]. If None, the type of
                distribution is inferred based on the data in the column. Defaults to None.
            sensitive_distr_types (Optional[Mapping[str, str]], optional):
                The type of distribution of the sensitive attributes. Passed as a mapping
                from sensitive attribute name to corresponding distribution type. Can take
                values from ["categorical", "continuous", "binary", "datetime"]. If None,
                the type of distribution of all sensitive attributes are inferred based on
                the data in the respective columns. Defaults to None.
        """

        # No explicit sensitive attributes means we must detect them ourselves.
        if sensitive_attrs is None:
            detect_sensitive = True
            sensitive_attrs = []

        # Detect sensitive attributes from column names / values and merge them
        # with any explicitly provided ones.
        if detect_sensitive:
            attr_dict = detect_names_df(df, deep_search=True).items()
            sensitive_attrs = list(set([k for (k, v) in attr_dict if v is not None]).union(sensitive_attrs))

        # BUG FIX: this warning's string literal was broken in two in the original;
        # reconstructed as a single literal.
        if len(sensitive_attrs) == 0:
            logger.warning("No sensitive attributes detected. Fairness score will always be 0.")

        self.df = df
        self.target_attr = target_attr
        # Sorted for deterministic iteration order across runs.
        self.sensitive_attrs = sorted(list(sensitive_attrs))

        # Infer the type of the target distribution unless explicitly given.
        if distr_type is None:
            self.distr_type = utils.infer_distr_type(df[target_attr])
        else:
            self.distr_type = utils.DistrType(distr_type)

        # Same for each sensitive attribute, with per-attribute overrides.
        t = sensitive_distr_types or {}
        self.sensitive_distr_types = [
            utils.DistrType(t[attr]) if attr in t else utils.infer_distr_type(df[attr])
            for attr in self.sensitive_attrs
        ]
[docs] def distribution_score( self, metric: str = "auto", method: str = "dist_to_all", p_value: bool = False, max_comb: Optional[int] = None, ) -> pd.DataFrame: """Returns a dataframe consisting of all unique sub-groups and their statistical distance to the rest of the population w.r.t. the target variable. Args: metric (str, optional): Choose a metric to use. Defaults to automatically chosen metric depending on the distribution of the target variable. method (str, optional): The method used to apply the metric to the sub-group. Can take values ["dist_to_all", dist_to_rest"] which correspond to measuring the distance between the subgroup distribution and the overall distribution, or the overall distribution without the subgroup, respectively. Defaults to "dist_to_all". p_value (bool, optional): Whether or not to compute a p-value for the distances. max_comb (Optional[int], optional): Max number of combinations of sensitive attributes to be considered. If None all combinations are considered. Defaults to 4. 
""" df = self.df[self.sensitive_attrs + [self.target_attr]].copy() sensitive_attrs = self.sensitive_attrs # Bin continuous sensitive attributes for attr, distr_type in zip(self.sensitive_attrs, self.sensitive_distr_types): if distr_type.is_continuous() or distr_type.is_datetime(): col = utils.infer_dtype(df[attr]) df.loc[:, attr] = utils._bin_as_string(col, distr_type.value, prefix=True) # Convert binary attributes to 0s and 1s if self.distr_type.is_binary(): df.loc[:, self.target_attr] = pd.factorize(df[self.target_attr])[0] if len(sensitive_attrs) == 0 or len(df) == 0 or len(df.dropna()) == 0: return 0.0, pd.DataFrame([], columns=["Group", "Distance", "Proportion", "Counts"]) max_comb = min(max_comb, len(sensitive_attrs)) if max_comb is not None else len(sensitive_attrs) df_dists = [] # Try all combinations of sensitive attributes for k in range(1, max_comb + 1): for sensitive_attr in combinations(sensitive_attrs, k): df_not_nan = df[~(df[list(sensitive_attr)] == "nan").any(axis=1)] if len(df_not_nan) == 0: continue df_dist = _calculate_distance(df, self.target_attr, list(sensitive_attr), metric, method, p_value) df_dists.append(df_dist) df_dist = pd.concat(df_dists, ignore_index=True) return df_dist.reset_index(drop=True)
[docs] def plot_distributions( self, figsize: Optional[Tuple[int, int]] = None, max_width: int = 3, max_quantiles: int = 8, show_hist: Optional[bool] = None, show_curve: Optional[bool] = None, shade: bool = True, normalize: bool = False, cmap: Optional[Sequence[Tuple[float, float, float]]] = None, ): """Plot the distributions of the target variable with respect to all sensitive values. Args: figsize (Optional[Tuple[int, int]], optional): The size of each figure. Defaults to (6, 4). max_width (int, optional): The maximum amount of figures. Defaults to 3. max_quantiles (int, optional): The maximum amount of quantiles to use for continuous data. Defaults to 8. show_hist (Optional[bool], optional): Shows the histogram if True. Defaults to True if the data is categorical or binary. show_curve (Optional[bool], optional): Shows a KDE if True. Defaults to True if the data is continuous or a date. shade (bool, optional): Shades the curve if True. Defaults to True. normalize (bool, optional): Normalizes the counts so the sum of the bar heights is 1. Defaults to False. cmap (Optional[Sequence[Tuple[float, float, float]]], optional): A sequence of RGB tuples used to colour the histograms. If None seaborn's default pallete will be used. Defaults to None. """ mult_distr_plot( self.df, self.target_attr, self.sensitive_attrs, figsize=figsize, max_width=max_width, max_quantiles=max_quantiles, show_hist=show_hist, show_curve=show_curve, shade=shade, normalize=normalize, cmap=cmap, )
[docs] def demographic_report( self, metric: str = "auto", method: str = "dist_to_all", alpha: Optional[float] = 0.05, max_comb: Optional[int] = 4, min_count: Optional[int] = 100, max_rows: int = 10, hide_positive: bool = False, ): """Generate a report on the fairness of different groups of sensitive attributes. Args: metric (str, optional): Choose a custom metric to use. Defaults to automatically chosen metric depending on the distribution of the target variable. See method (str, optional): The method used to apply the metric to the sub-group. Can take values ["dist_to_all", "dist_to_rest"] which correspond to measuring the distance between the subgroup distribution and the overall distribution, or the overall distribution without the subgroup, respectively. Defaults to "dist_to_all". alpha (Optional[float], optional): The maximum p-value to accept a bias. Defaults to 0.05. max_comb (Optional[int], optional): Max number of combinations of sensitive attributes to be considered. If None all combinations are considered. Defaults to 4. min_count (Optional[int], optional): If set, sub-groups with less samples than min_count will be ignored. Defaults to 100. max_rows (int, optional): Maximum number of biased demographics to display. Defaults to 10. hide_positive (bool, optional): Hides positive distances if set to True. This may be useful when using metrics which can return negative distances (binomial distance), in order to inspect a skew in only one direction. Alternatively changing the method may yeild more significant results. Defaults to False. 
""" df_dist = self.distribution_score(metric=metric, method=method, p_value=(alpha is not None), max_comb=max_comb) if alpha is not None: df_dist = df_dist[df_dist["P-Value"] < alpha] if min_count is not None: df_dist = df_dist[df_dist["Counts"] > min_count] score = calculate_score(df_dist) if hide_positive: df_dist = df_dist[df_dist["Distance"] < 0] df_dist = df_dist.sort_values("P-Value", ascending=True, key=abs) df_dist["Distance"] = df_dist["Distance"].map("{:.3f}".format) df_dist["P-Value"] = df_dist["P-Value"].map("{:.2e}".format) print(f"Sensitive Attributes: {self.sensitive_attrs}\n") print(df_dist[:max_rows].to_string(index=False)) print(f"\nWeighted Mean Statistical Distance: {score}")
[docs]def calculate_score(df_dist: pd.DataFrame) -> float: """Calculate the weighted mean pairwise statistical distance. Args: df_dist (pd.DataFrame): A dataframe of statistical distances produced by or `fairlens.FairnessScorer.distribution_score`. Returns: float: The weighted mean statistical distance. """ return (df_dist["Distance"].abs() * df_dist["Counts"]).sum() / df_dist["Counts"].sum()
def _calculate_distance( df: pd.DataFrame, target_attr: str, sensitive_attrs: Sequence[str], metric: str = "auto", method: str = "dist_to_all", p_value: bool = False, ) -> pd.DataFrame: unique = df[sensitive_attrs].drop_duplicates() dist = [] for _, row in unique.iterrows(): sensitive_group = {attr: [value] for attr, value in row.to_dict().items()} pred = utils.get_predicates_mult(df, [sensitive_group])[0] if method == "dist_to_rest": pred_other = ~pred else: pred_other = pd.Series([True] * len(df)) dist_res = stat_distance(df, target_attr, pred, pred_other, mode=metric, p_value=p_value) distance = dist_res[0] p = dist_res[1] if p_value else 0 dist.append( { "Group": ", ".join(map(str, row.to_dict().values())), "Distance": distance, "Proportion": len(df[pred]) / len(df), "Counts": len(df[pred]), "P-Value": p, } ) df_dist = pd.DataFrame(dist) if not p_value: df_dist.drop(columns=["P-Value"], inplace=True) return df_dist