Source code for flotilla.compute.infotheory

"""
Information-theoretic calculations
"""

import numpy as np
import pandas as pd
from sklearn import cross_validation

EPSILON = 100 * np.finfo(float).eps


[docs]def bin_range_strings(bins): """Given a list of bins, make a list of strings of those bin ranges Parameters ---------- bins : list_like List of anything, usually values of bin edges Returns ------- bin_ranges : list List of bin ranges >>> bin_range_strings((0, 0.5, 1)) ['0-0.5', '0.5-1'] """ return ['{}-{}'.format(i, j) for i, j in zip(bins, bins[1:])]
def _check_prob_dist(x): if np.any(x < 0): raise ValueError('Each column of the input dataframes must be ' '**non-negative** probability distributions') try: if np.any(np.abs(x.sum() - np.ones(x.shape[1])) > EPSILON): raise ValueError('Each column of the input dataframe must be ' 'probability distributions that **sum to 1**') except IndexError: if np.any(np.abs(x.sum() - 1) > EPSILON): raise ValueError('Each column of the input dataframe must be ' 'probability distributions that **sum to 1**')
[docs]def binify(df, bins): """Makes a histogram of each column the provided binsize Parameters ---------- data : pandas.DataFrame A samples x features dataframe. Each feature (column) will be binned into the provided bins bins : iterable Bins you would like to use for this data. Must include the final bin value, e.g. (0, 0.5, 1) for the two bins (0, 0.5) and (0.5, 1). nbins = len(bins) - 1 Returns ------- binned : pandas.DataFrame An nbins x features DataFrame of each column binned across rows """ if bins is None: raise ValueError('Must specify "bins"') binned = df.apply(lambda x: pd.Series(np.histogram(x, bins=bins)[0])) binned.index = bin_range_strings(bins) # Normalize so each column sums to 1 binned = binned / binned.sum().astype(float) return binned
[docs]def kld(p, q): """Kullback-Leiber divergence of two probability distributions pandas dataframes, p and q Parameters ---------- p : pandas.DataFrame An nbins x features DataFrame, or (nbins,) Series q : pandas.DataFrame An nbins x features DataFrame, or (nbins,) Series Returns ------- kld : pandas.Series Kullback-Lieber divergence of the common columns between the dataframe. E.g. between 1st column in p and 1st column in q, and 2nd column in p and 2nd column in q. Raises ------ ValueError If the data provided is not a probability distribution, i.e. it has negative values or its columns do not sum to 1, raise ValueError Notes ----- The input to this function must be probability distributions, not raw values. Otherwise, the output makes no sense. """ try: _check_prob_dist(p) _check_prob_dist(q) except ValueError: return np.nan # If one of them is zero, then the other should be considered to be 0. # In this problem formulation, log0 = 0 p = p.replace(0, np.nan) q = q.replace(0, np.nan) return (np.log2(p / q) * p).sum(axis=0)
[docs]def jsd(p, q): """Finds the per-column JSD betwen dataframes p and q Jensen-Shannon divergence of two probability distrubutions pandas dataframes, p and q. These distributions are usually created by running binify() on the dataframe. Parameters ---------- p : pandas.DataFrame An nbins x features DataFrame. q : pandas.DataFrame An nbins x features DataFrame. Returns ------- jsd : pandas.Series Jensen-Shannon divergence of each column with the same names between p and q Raises ------ ValueError If the data provided is not a probability distribution, i.e. it has negative values or its columns do not sum to 1, raise ValueError """ try: _check_prob_dist(p) _check_prob_dist(q) except ValueError: return np.nan weight = 0.5 m = weight * (p + q) result = weight * kld(p, m) + (1 - weight) * kld(q, m) return result
[docs]def entropy(binned, base=2): """Find the entropy of each column of a dataframe Parameters ---------- binned : pandas.DataFrame A nbins x features DataFrame of probability distributions, where each column sums to 1 base : numeric The log-base of the entropy. Default is 2, so the resulting entropy is in bits. Returns ------- entropy : pandas.Seires Entropy values for each column of the dataframe. Raises ------ ValueError If the data provided is not a probability distribution, i.e. it has negative values or its columns do not sum to 1, raise ValueError """ try: _check_prob_dist(binned) except ValueError: np.nan return -((np.log(binned) / np.log(base)) * binned).sum(axis=0)
[docs]def binify_and_jsd(df1, df2, pair, bins): binned1 = binify(df1, bins=bins).dropna(how='all', axis=1) binned2 = binify(df2, bins=bins).dropna(how='all', axis=1) binned1, binned2 = binned1.align(binned2, axis=1, join='inner') series = np.sqrt(jsd(binned1, binned2)) series.name = pair return series
[docs]def cross_phenotype_jsd(data, groupby, bins, n_iter=100): """Jensen-Shannon divergence of features across phenotypes Parameters ---------- data : pandas.DataFrame A (n_samples, n_features) Dataframe groupby : mappable A samples to phenotypes mapping n_iter : int Number of bootstrap resampling iterations to perform for the within-group comparisons n_bins : int Number of bins to binify the singles data on Returns ------- jsd_df : pandas.DataFrame A (n_features, n_phenotypes^2) dataframe of the JSD between each feature between and within phenotypes """ grouped = data.groupby(groupby) jsds = [] seen = set([]) for phenotype1, df1 in grouped: for phenotype2, df2 in grouped: pair = tuple(sorted([phenotype1, phenotype2])) if pair in seen: continue seen.add(pair) if phenotype1 == phenotype2: seriess = [] bs = cross_validation.Bootstrap(df1.shape[0], n_iter=n_iter, train_size=0.5) for i, (ind1, ind2) in enumerate(bs): df1_subset = df1.iloc[ind1, :] df2_subset = df2.iloc[ind2, :] seriess.append( binify_and_jsd(df1_subset, df2_subset, None, bins)) series = pd.concat(seriess, axis=1, names=None).mean(axis=1) series.name = pair jsds.append(series) else: series = binify_and_jsd(df1, df2, pair, bins) jsds.append(series) return pd.concat(jsds, axis=1)
[docs]def jsd_df_to_2d(jsd_df): """Transform a tall JSD dataframe to a square matrix of mean JSDs Parameters ---------- jsd_df : pandas.DataFrame A (n_features, n_phenotypes^2) dataframe of the JSD between each feature between and within phenotypes Returns ------- jsd_2d : pandas.DataFrame A (n_phenotypes, n_phenotypes) symmetric dataframe of the mean JSD between and within phenotypes """ jsd_2d = jsd_df.mean().reset_index() jsd_2d = jsd_2d.rename( columns={'level_0': 'phenotype1', 'level_1': 'phenotype2', 0: 'jsd'}) jsd_2d = jsd_2d.pivot(index='phenotype1', columns='phenotype2', values='jsd') return jsd_2d + np.tril(jsd_2d.T, -1)
Olga B. Botvinnik is funded by the NDSEG fellowship and is a NumFOCUS John Hunter Technology Fellow.
Michael T. Lovci was partially funded by a fellowship from Genentech.
Partially funded by NIH grants NS075449 and HG004659 and CIRM grants RB4-06045 and TR3-05676 to Gene Yeo.