Source code for flotilla.compute.generic

import sys

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from scipy import stats

from ..util import timeout, TimeoutError


[docs]def get_regressor(x, y, n_estimators=1500, n_tries=5,
                  verbose=False):
    """Calculate an ExtraTreesRegressor on predictor and target variables

    Parameters
    ----------
    x : numpy.array
        Predictor vector
    y : numpy.array
        Target vector
    n_estimators : int, optional
        Number of estimators to use
    n_tries : int, optional
        Number of attempts to calculate regression
    verbose : bool, optional
        If True, output progress statements

    Returns
    -------
    classifier : sklearn.ensemble.ExtraTreesRegressor
        The classifier with the highest out of bag scores of all the
        attempted "tries"
    oob_scores : numpy.array
        Out of bag scores of the classifier
    """
    if verbose:
        sys.stderr.write('Getting regressor\n')
    clfs = []
    oob_scores = []

    for i in range(n_tries):
        if verbose:
            sys.stderr.write('%d.' % i)

        clf = ExtraTreesRegressor(n_estimators=n_estimators, oob_score=True,
                                  bootstrap=True, max_features='sqrt',
                                  n_jobs=1, random_state=i).fit(x, y)
        clfs.append(clf)
        oob_scores.append(clf.oob_score_)
    clf = clfs[np.argmax(oob_scores)]
    clf.feature_importances = pd.Series(clf.feature_importances_,
                                        index=x.columns)

    return clf, oob_scores


[docs]def get_boosting_regressor(x, y, verbose=False):
    """Calculate a GradientBoostingRegressor on predictor and target variables

    Parameters
    ----------
    x : numpy.array
        Predictor variable
    y : numpy.array
        Target variable
    verbose : bool, optional
        If True, output status messages

    Returns
    -------
    classifier : sklearn.ensemble.GradientBoostingRegressor
        A fitted classifier of the predictor and target variable
    """
    if verbose:
        sys.stderr.write('Getting boosting regressor\n')

    clf = GradientBoostingRegressor(n_estimators=50, subsample=0.6,
                                    max_features=100,
                                    verbose=0, learning_rate=0.1,
                                    random_state=0).fit(x, y)

    clf.feature_importances = pd.Series(clf.feature_importances_,
                                        index=x.columns)
    if verbose:
        sys.stderr.write('Finished boosting regressor\n')

    return clf


[docs]def get_unstarted_events(mongodb):
    """
    get events that have not been started yet.
    generator sets started to True before returning an event

    Parameters
    ----------
    mongodb : pymongo.Database
        A MongoDB database object
    """
    go_on = True
    while go_on:
        event = mongodb['list'].find_one({"started": False})

        if event is None:
            go_on = False
        else:
            event['started'] = True
            mongodb['list'].save(event)
            yield event


@timeout(5)  # because these sometimes hang
[docs]def get_slope(x, y):
    """Get the linear regression slope of x and y

    Parameters
    ----------
    x : numpy.array
        X-values of data
    y : numpy.array
        Y-values of data

    Returns
    -------
    slope : float
        Scipy.stats.linregress slope

    """
    return stats.linregress(x, y)[0]


@timeout(5)  # because these sometimes hang
[docs]def do_r(s_1, s_2, method=stats.pearsonr, min_items=12):
    """Calculate correlation ("R-value") between two vectors

    Parameters
    ----------
    s_1 : pandas.Series
        Predictor vector
    s_2 : pandas.Series
        Target vector
    method : function, optional
        Which correlation method to use. (default scipy.stats.pearsonr)
    min_items : int, optional
        Minimum number of items occuring in both s_1 and s_2 (default 12)

    Returns
    -------
    r_value : float
        R-value of the correlation, i.e. how correlated the two inputs are
    p_value : float
        p-value of the correlation, i.e. how likely this correlation would
        happen given the null hypothesis that the two are not correlated

    Notes
    -----
    If too few items overlap, return (np.nan, np.nan)
    """
    s_1, s_2 = s_1.dropna().align(s_2.dropna(), join='inner')
    if len(s_1) <= min_items:
        return np.nan, np.nan
    return method(s_1, s_2)


@timeout(10)  # because these sometimes hang
[docs]def get_robust_values(x, y):
    """Calculate robust linear regression

    Parameters
    ----------
    x : numpy.array
        Predictor vector
    y : numpy.array
        Target vector

    Returns
    -------
    intercept : float
        Intercept of the fitted line
    slope : float
        Slope of the fitted line
    t_statistic : float
        T-statistic of the fit
    p_value : float
        p-value of the fit
    """
    import statsmodels.api as sm

    r = sm.RLM(y, sm.add_constant(x), missing='drop').fit()
    results = r.params[0], r.params[1], r.tvalues[0], r.pvalues[0]
    return results


@timeout(5)
[docs]def get_dcor(x, y):
    """Calculate distance correlation between two vectors

    Uses the distance correlation package from:
    https://github.com/andrewdyates/dcor

    Parameters
    ----------
    x : numpy.array
        1-dimensional array (aka a vector) of the independent, predictor
        variable
    y : numpy.array
        1-dimensional array (aka a vector) of the dependent, target variable

    Returns
    -------
    dc : float
        Distance covariance
    dr : float
        Distance correlation
    dvx : float
        Distance variance on x
    dvy : float
        Distance variance on y
    """
    # cython version of dcor
    try:
        import dcor_cpy as dcor
    except ImportError as e:
        sys.stderr.write("Please install dcor_cpy.")
        raise e

    dc, dr, dvx, dvy = dcor.dcov_all(x, y)
    return dc, dr, dvx, dvy


@timeout(100)
[docs]def apply_calc_rs(X, y, method=stats.pearsonr):
    """Apply R calculation method on each column of X versus the values of y

    Parameters
    ----------
    X : pandas.DataFrame
        A (n_samples, n_features) sized DataFrame, assumed to be of
        log-normal expression values
    y : pandas.Series
        A (n_samples,) sized Series, assumed to be of percent spliced-in
        alternative splicing scores
    method : function, optional
        Which correlation method to use on each feature in X versus the
        values in y

    Returns
    -------
    r_coefficients : pandas.Series
        Correlation coefficients
    p_values : pandas.Series
        Correlation significances (smaller is better)

    See Also
    --------
    do_r
        This is the underlying function which calculates correlation
    """
    out_R = pd.Series(index=X.columns, name=y.name)
    out_P = pd.Series(index=X.columns, name=y.name)
    for this_id, data in X.iteritems():
        x = pd.Series(data, name=this_id)
        try:
            r, p = do_r(x, y, method=method)

        except TimeoutError:
            sys.stderr.write(
                "%s r timeout event:%s, gene:%s\n" % (method, y.name, x.name))
            r, p = np.nan, np.nan
        out_R.ix[this_id] = r
        out_P.ix[this_id] = p
    return out_R, out_P


@timeout(220)
[docs]def apply_calc_robust(X, y, verbose=False):
    """Calculate robust regression between the columns of X and y

    Parameters
    ----------
    X : pandas.DataFrame
        A (n_samples, n_features) Dataframe of the predictor variable
    y : pandas.DataFrame
        A (n_samples, m_features) DataFrame of the response variable
    verbose : bool, optional
        If True, output status messages as the calculation is happening

    Returns
    -------
    out_I : pandas.Series
        Intercept of regressions
    out_S : pandas.Series
        Slope of regressions
    out_T : pandas.Series
        t-statistic of regressions
    out_P : pandas.Series
        p-values of regressions

    See Also
    --------
    get_robust_values
        This is the underlying function which calculates the slope,
        intercept, t-value, and p-value of the fit
    """
    if verbose:
        sys.stderr.write("getting robust regression\n")
    out_I = pd.Series(index=X.columns, name=y.name)  # intercept
    out_S = pd.Series(index=X.columns, name=y.name)  # slope
    out_T = pd.Series(index=X.columns, name=y.name)  # t-value
    out_P = pd.Series(index=X.columns, name=y.name)  # p-value

    for this_id, data in X.iteritems():
        x = pd.Series(data, name=this_id)
        try:
            i, s, t, p = get_robust_values(x, y)
        except TimeoutError:
            sys.stderr.write(
                "robust timeout event:%s, gene:%s\n" % (y.name, x.name))
            i, s, t, p = np.nan, np.nan, np.nan, np.nan
        out_I.ix[this_id] = i
        out_S.ix[this_id] = s
        out_T.ix[this_id] = t
        out_P.ix[this_id] = p
    return out_I, out_S, out_T, out_P


@timeout(50)
[docs]def apply_calc_slope(X, y, verbose=False):
    """X and y are dataframes, returns slope, t-value and p-value of robust
    regression

    Parameters
    ----------
    X : pandas.DataFrame
        A (n_samples, n_features) Dataframe of predictor variable values
    y : pandas.DataFrame
        A (n_samples, m_features) Dataframe of response variable values
    verbose : bool, optional
        If True, output status messages

    Returns
    -------
    slope : pandas.Series
        Slopes of the linear regression

    See Also
    --------
    get_slope
        This is the underlying function which calculates the slope
    """
    if verbose:
        sys.stderr.write("getting slope\n")

    out_S = pd.Series(index=X.columns, name=y.name)

    for this_id, data in X.iteritems():
        x = pd.Series(data, name=this_id)
        try:
            s = get_slope(x, y)
        except TimeoutError:
            sys.stderr.write(
                "linregress timeout event:%s, gene:%s\n" % (y.name, x.name))
            s = np.nan
        out_S.ix[this_id] = s

    return out_S


@timeout(50)
[docs]def apply_dcor(X, y, verbose=False):
    """Calcualte distance correlation between the columns of two dataframes

    Parameters
    ----------
    X : pandas.DataFrame
        A (n_samples, n_features) Dataframe of predictor variable values
    y : pandas.DataFrame
        A (n_samples, m_features) Dataframe of response variable values
    verbose : bool, optional
        If True, output status messages

    Returns
    -------
    dc : pandas.Series
        Distance covariance
    dr : pandas.Series
        Distance correlation
    dvx : pandas.Series
        Distance variance of x
    dvy : pandas.Series
        Distance variance of y

    See Also
    --------
    get_dcor
        This is the underlying function that gets called to calculate the
        distance correlation
    """
    if verbose:
        sys.stderr.write("getting dcor\n")

    out_DC = pd.Series(index=X.columns, name=y.name)
    out_DR = pd.Series(index=X.columns, name=y.name)
    out_DVX = pd.Series(index=X.columns, name=y.name)
    out_DVY = pd.Series(index=X.columns, name=y.name)

    for this_id, data in X.iteritems():
        x = pd.Series(data, name=this_id)
        try:
            dc, dr, dvx, dvy = get_dcor(*map(np.array, [x, y]))

        except TimeoutError:
            sys.stderr.write("dcor timeout event:%s, gene:%s\n" % (y.name,
                                                                   x.name))
            dc, dr, dvx, dvy = [np.nan] * 4
        out_DC.ix[this_id] = dc
        out_DR.ix[this_id] = dr
        out_DVX.ix[this_id] = dvx
        out_DVY.ix[this_id] = dvy
    return out_DC, out_DR, out_DVX, out_DVY


[docs]def dropna_mean(x):
    """Drop NA values and return the mean
    """
    return x.dropna().mean()


[docs]def spearmanr_series(x, y):
    """Calculate spearman r (with p-values) between two pandas series

    Parameters
    ----------
    x : pandas.Series
        One of the two series you'd like to correlate
    y : pandas.Series
        The other series you'd like to correlate

    Returns
    -------
    r_value : float
        The R-value of the correlation. 1 for perfect positive correlation,
        and -1 for perfect negative correlation
    p_value : float
        The p-value of the correlation.
    """
    x, y = x.dropna().align(y.dropna(), 'inner')
    return stats.spearmanr(x, y)


[docs]def spearmanr_dataframe(A, B, axis=0):
    """Calculate spearman correlations between dataframes A and B

    Parameters
    ----------
    A : pandas.DataFrame
        A n_samples x n_features1 dataframe. Must have the same number of rows
        as "B"
    B : pandas.DataFrame
        A n_samples x n_features2 Dataframe. Must have the same number of rows
        as "A"
    axis : int
        Which axis to compare. If 0, calculate correlations between all the
        columns of A vs te columns of B. If 1, calculate between rows.
        (default 0)

    Returns
    -------
    correlations : pandas.DataFrame
        A n_features2 x n_features1 DataFrame of (spearman_r, spearman_p)
        tuples

    Notes
    -----
    Use "applymap" to get just the R- and p-values of the resulting dataframe

    >>> import pandas as pd
    >>> import numpy as np
    >>> A = pd.DataFrame(np.random.randn(100).reshape(5, 20))
    >>> B = pd.DataFrame(np.random.randn(55).reshape(5, 11))
    >>> correls = spearmanr_dataframe(A, B)
    >>> correls.shape
    (11, 20)
    >>> spearman_r = correls.applymap(lambda x: x[0])
    >>> spearman_p = correls.applymap(lambda x: x[1])
    """
    return A.apply(lambda x: B.apply(lambda y: spearmanr_series(x, y),
                                     axis=axis),
                   axis=axis)