# Source code for mvtk.metrics

import public
import numpy
import pandas

from scipy.stats import entropy
from sklearn.feature_selection import mutual_info_classif


def binarize(data, t):
    # Split a two-column frame (ground truth, predicted scores) into a
    # thresholded boolean ground truth array and the raw scores.
    y_true, y_pred = data.values.T
    return y_true > t, y_pred
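
# A minimal sketch of how ``binarize`` can be used (the frame below is
# hypothetical): thresholding a continuous ground truth column so the
# resulting pair of arrays can be scored with a binary metric such as ROC AUC.
#
#     >>> frame = pandas.DataFrame(
#     ...     {"y_true": [0.1, 0.4, 0.8], "y_pred": [0.2, 0.5, 0.9]}
#     ... )
#     >>> y_bin, y_score = binarize(frame, 0.5)
#     >>> y_bin
#     array([False, False,  True])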


@public.add
def monotonicity(y_true, y_pred, weights=None):
    r"""Generalizes ROC AUC by computing
    :math:`P\left(\frac{\Delta\mathrm{y_pred}}{\Delta\mathrm{y_true}} > 0\right)`,
    the probability that incrementing ``y_true`` increases ``y_pred`` for a
    randomly chosen pair of instances. This reduces to ROC AUC when
    ``y_true`` has two unique values.

    Adapted from Algorithm 2 in `Fawcett, T. (2006). An introduction to ROC
    analysis. Pattern Recognition Letters, 27(8), 861-874.
    <https://www.sciencedirect.com/science/article/pii/S016786550500303X>`_

    Args:
        y_true (list-like): Ground truth ordinal values
        y_pred (list-like): Predicted ordinal values
        weights (list-like): Sample weights. Will be normalized to one
            across each unique value of ``y_true``. If ``None`` (default),
            all samples are weighed equally.

    Returns:
        Float between 0 and 1. 0 indicates a 100% chance of ``y_pred``
        decreasing upon incrementing ``y_true`` to its next highest value
        in the dataset, 1 a 100% chance of ``y_pred`` increasing in the
        same scenario, and 0.5 an equal chance of either.
    """
    # Accept any list-like input by coercing to numpy arrays up front.
    y_true = numpy.asarray(y_true)
    y_pred = numpy.asarray(y_pred)
    if weights is None:
        weights = numpy.ones(len(y_true))
    else:
        weights = numpy.asarray(weights)
    unique = numpy.unique(y_true)
    n = len(unique) - 1
    # Map each ordinal value to its 1-based rank among the unique values.
    true_lookup = {u: i + 1 for i, u in enumerate(unique)}
    # Process samples in order of decreasing predicted score.
    idx = numpy.argsort(-y_pred)
    y_true = y_true[idx]
    y_pred = y_pred[idx]
    weights = weights[idx]
    # One ROC accumulator per pair of adjacent ordinal levels;
    # rows are fp, fp_prev, tp, tp_prev, auc.
    data = numpy.zeros((5, n))
    prev_pred = numpy.full(n, numpy.nan)
    for true, pred, weight in zip(y_true, y_pred, weights):
        i = true_lookup[true]
        j = max(i - 2, 0)
        # Close off a trapezoid wherever the score changed since the last
        # sample that touched the column.
        mask = pred != prev_pred[j:i]
        data[4, j:i][mask] += trap(*data[:4, j:i][:, mask])
        data[1:4:2, j:i][:, mask] = data[:4:2, j:i][:, mask]
        prev_pred[j:i] = pred
        i -= 1
        if i:
            # Positive example for the level pair just below this level...
            data[2, j] += weight
        if i < n:
            # ...and negative example for the level pair just above it.
            data[0, i] += weight
    data[4] += trap(*data[:4])
    return numpy.sum(data[4]) / 2 / data[0].dot(data[2])
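
# A minimal sketch of ``monotonicity`` on hypothetical data: predictions
# perfectly ordered with respect to a three-level ordinal target score 1.0,
# and reversing them scores 0.0.
#
#     >>> y_true = numpy.array([0, 0, 1, 1, 2, 2])
#     >>> y_pred = numpy.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
#     >>> float(monotonicity(y_true, y_pred))
#     1.0
#     >>> float(monotonicity(y_true, -y_pred))
#     0.0
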
def trap(x2, x1, y2, y1):
    # Twice the area of a trapezoid with parallel sides y1, y2 and width
    # x2 - x1; the factor of 2 is divided out by the callers.
    return (x2 - x1) * (y2 + y1)
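
# For instance, a trapezoid spanning x = 1.0 to 3.0 with heights 2.0 and 4.0
# has area 6.0, so ``trap`` returns twice that:
#
#     >>> trap(3.0, 1.0, 2.0, 4.0)
#     12.0
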
@public.add
def rank_auc(y_true, y_pred, weights=None):
    r"""Generalizes ROC AUC by computing the probability that two randomly
    chosen data points would be ranked consistently with their ground truth
    labels. This reduces to ROC AUC when ``y_true`` has two unique values.

    Adapted from Algorithm 2 in `Fawcett, T. (2006). An introduction to ROC
    analysis. Pattern Recognition Letters, 27(8), 861-874.
    <https://www.sciencedirect.com/science/article/pii/S016786550500303X>`_

    Args:
        y_true (list-like): Ground truth ordinal values
        y_pred (list-like): Predicted ordinal values
        weights (list-like): Sample weights. Will be normalized to one
            across each unique value of ``y_true``. If ``None`` (default),
            all samples are weighed equally.

    Returns:
        Float between 0 and 1. 0 indicates a 100% chance of ``y_pred``
        having the opposite order of ``y_true``, 1 a 100% chance of
        ``y_pred`` matching the order of ``y_true``, and 0.5 an equal
        chance of either.
    """
    # Accept any list-like input by coercing to numpy arrays up front.
    y_true = numpy.asarray(y_true)
    y_pred = numpy.asarray(y_pred)
    if weights is None:
        weights = numpy.ones(len(y_true))
    else:
        weights = numpy.asarray(weights)
    unique = numpy.unique(y_true)
    n = len(unique) - 1
    # Map each ordinal value to its 1-based rank among the unique values.
    true_lookup = {u: i + 1 for i, u in enumerate(unique)}
    # Process samples in order of decreasing predicted score.
    idx = numpy.argsort(-y_pred)
    y_true = y_true[idx]
    y_pred = y_pred[idx]
    weights = weights[idx]
    # One ROC accumulator per threshold between adjacent ordinal levels;
    # rows are fp, fp_prev, tp, tp_prev, auc.
    data = numpy.zeros((5, n))
    prev_pred = numpy.full(n, numpy.nan)
    for true, pred, weight in zip(y_true, y_pred, weights):
        i = true_lookup[true]
        # Close off a trapezoid wherever the score changed since the last
        # sample that touched the column.
        mask = pred != prev_pred[:i]
        data[4, :i][mask] += trap(*data[:4, :i][:, mask])
        data[1:4:2, :i][:, mask] = data[:4:2, :i][:, mask]
        prev_pred[:i] = pred
        i -= 1
        # Positive example for every threshold below this level...
        data[2, :i] += weight
        if i < n:
            # ...and negative example for the threshold at this level.
            data[0, i] += weight
    data[4] += trap(*data[:4])
    return numpy.sum(data[4]) / 2 / data[0].dot(data[2])
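
# A minimal sketch of ``rank_auc`` on the same hypothetical data as above:
# perfectly ordered predictions score 1.0 here as well, the difference being
# that ``rank_auc`` weighs all label pairs, not just adjacent levels.
#
#     >>> y_true = numpy.array([0, 0, 1, 1, 2, 2])
#     >>> y_pred = numpy.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
#     >>> float(rank_auc(y_true, y_pred))
#     1.0
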
@public.add
def normalized_mutual_info(X, y, **kwargs):
    """Thin wrapper around `sklearn's mutual information
    <https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html>`_
    that normalizes the result to a 0-1 scale by dividing each feature's
    mutual information with ``y`` by the entropy of ``y``, which upper
    bounds it. ``y`` is assumed categorical.
    """
    _, counts = numpy.unique(y, return_counts=True)
    return pandas.Series(
        dict(
            zip(
                X.columns,
                mutual_info_classif(X, y, **kwargs)
                / entropy(counts / counts.sum()),
            )
        )
    )
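
# A minimal sketch of ``normalized_mutual_info`` on hypothetical data: a
# feature that nearly determines ``y`` should score close to 1 and pure
# noise close to 0. ``mutual_info_classif`` uses a randomized k-NN
# estimator, so the scores are approximate; ``random_state`` is forwarded
# through ``**kwargs``.
#
#     >>> rng = numpy.random.RandomState(0)
#     >>> y = rng.randint(2, size=1000)
#     >>> X = pandas.DataFrame(
#     ...     {"informative": y + 0.01 * rng.randn(1000),
#     ...      "noise": rng.randn(1000)}
#     ... )
#     >>> scores = normalized_mutual_info(X, y, random_state=0)
#     >>> bool(scores["informative"] > scores["noise"])
#     True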