Source code for flotilla.compute.generic

import sys

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from scipy import stats

from ..util import timeout, TimeoutError


[docs]def get_regressor(x, y, n_estimators=1500, n_tries=5, verbose=False): """Calculate an ExtraTreesRegressor on predictor and target variables Parameters ---------- x : numpy.array Predictor vector y : numpy.array Target vector n_estimators : int, optional Number of estimators to use n_tries : int, optional Number of attempts to calculate regression verbose : bool, optional If True, output progress statements Returns ------- classifier : sklearn.ensemble.ExtraTreesRegressor The classifier with the highest out of bag scores of all the attempted "tries" oob_scores : numpy.array Out of bag scores of the classifier """ if verbose: sys.stderr.write('Getting regressor\n') clfs = [] oob_scores = [] for i in range(n_tries): if verbose: sys.stderr.write('%d.' % i) clf = ExtraTreesRegressor(n_estimators=n_estimators, oob_score=True, bootstrap=True, max_features='sqrt', n_jobs=1, random_state=i).fit(x, y) clfs.append(clf) oob_scores.append(clf.oob_score_) clf = clfs[np.argmax(oob_scores)] clf.feature_importances = pd.Series(clf.feature_importances_, index=x.columns) return clf, oob_scores
[docs]def get_boosting_regressor(x, y, verbose=False): """Calculate a GradientBoostingRegressor on predictor and target variables Parameters ---------- x : numpy.array Predictor variable y : numpy.array Target variable verbose : bool, optional If True, output status messages Returns ------- classifier : sklearn.ensemble.GradientBoostingRegressor A fitted classifier of the predictor and target variable """ if verbose: sys.stderr.write('Getting boosting regressor\n') clf = GradientBoostingRegressor(n_estimators=50, subsample=0.6, max_features=100, verbose=0, learning_rate=0.1, random_state=0).fit(x, y) clf.feature_importances = pd.Series(clf.feature_importances_, index=x.columns) if verbose: sys.stderr.write('Finished boosting regressor\n') return clf
[docs]def get_unstarted_events(mongodb): """ get events that have not been started yet. generator sets started to True before returning an event Parameters ---------- mongodb : pymongo.Database A MongoDB database object """ go_on = True while go_on: event = mongodb['list'].find_one({"started": False}) if event is None: go_on = False else: event['started'] = True mongodb['list'].save(event) yield event
@timeout(5) # because these sometimes hang
[docs]def get_slope(x, y): """Get the linear regression slope of x and y Parameters ---------- x : numpy.array X-values of data y : numpy.array Y-values of data Returns ------- slope : float Scipy.stats.linregress slope """ return stats.linregress(x, y)[0]
@timeout(5) # because these sometimes hang
[docs]def do_r(s_1, s_2, method=stats.pearsonr, min_items=12): """Calculate correlation ("R-value") between two vectors Parameters ---------- s_1 : pandas.Series Predictor vector s_2 : pandas.Series Target vector method : function, optional Which correlation method to use. (default scipy.stats.pearsonr) min_items : int, optional Minimum number of items occuring in both s_1 and s_2 (default 12) Returns ------- r_value : float R-value of the correlation, i.e. how correlated the two inputs are p_value : float p-value of the correlation, i.e. how likely this correlation would happen given the null hypothesis that the two are not correlated Notes ----- If too few items overlap, return (np.nan, np.nan) """ s_1, s_2 = s_1.dropna().align(s_2.dropna(), join='inner') if len(s_1) <= min_items: return np.nan, np.nan return method(s_1, s_2)
@timeout(10) # because these sometimes hang
[docs]def get_robust_values(x, y): """Calculate robust linear regression Parameters ---------- x : numpy.array Predictor vector y : numpy.array Target vector Returns ------- intercept : float Intercept of the fitted line slope : float Slope of the fitted line t_statistic : float T-statistic of the fit p_value : float p-value of the fit """ import statsmodels.api as sm r = sm.RLM(y, sm.add_constant(x), missing='drop').fit() results = r.params[0], r.params[1], r.tvalues[0], r.pvalues[0] return results
@timeout(5)
[docs]def get_dcor(x, y): """Calculate distance correlation between two vectors Uses the distance correlation package from: https://github.com/andrewdyates/dcor Parameters ---------- x : numpy.array 1-dimensional array (aka a vector) of the independent, predictor variable y : numpy.array 1-dimensional array (aka a vector) of the dependent, target variable Returns ------- dc : float Distance covariance dr : float Distance correlation dvx : float Distance variance on x dvy : float Distance variance on y """ # cython version of dcor try: import dcor_cpy as dcor except ImportError as e: sys.stderr.write("Please install dcor_cpy.") raise e dc, dr, dvx, dvy = dcor.dcov_all(x, y) return dc, dr, dvx, dvy
@timeout(100)
[docs]def apply_calc_rs(X, y, method=stats.pearsonr): """Apply R calculation method on each column of X versus the values of y Parameters ---------- X : pandas.DataFrame A (n_samples, n_features) sized DataFrame, assumed to be of log-normal expression values y : pandas.Series A (n_samples,) sized Series, assumed to be of percent spliced-in alternative splicing scores method : function, optional Which correlation method to use on each feature in X versus the values in y Returns ------- r_coefficients : pandas.Series Correlation coefficients p_values : pandas.Series Correlation significances (smaller is better) See Also -------- do_r This is the underlying function which calculates correlation """ out_R = pd.Series(index=X.columns, name=y.name) out_P = pd.Series(index=X.columns, name=y.name) for this_id, data in X.iteritems(): x = pd.Series(data, name=this_id) try: r, p = do_r(x, y, method=method) except TimeoutError: sys.stderr.write( "%s r timeout event:%s, gene:%s\n" % (method, y.name, x.name)) r, p = np.nan, np.nan out_R.ix[this_id] = r out_P.ix[this_id] = p return out_R, out_P
@timeout(220)
[docs]def apply_calc_robust(X, y, verbose=False): """Calculate robust regression between the columns of X and y Parameters ---------- X : pandas.DataFrame A (n_samples, n_features) Dataframe of the predictor variable y : pandas.DataFrame A (n_samples, m_features) DataFrame of the response variable verbose : bool, optional If True, output status messages as the calculation is happening Returns ------- out_I : pandas.Series Intercept of regressions out_S : pandas.Series Slope of regressions out_T : pandas.Series t-statistic of regressions out_P : pandas.Series p-values of regressions See Also -------- get_robust_values This is the underlying function which calculates the slope, intercept, t-value, and p-value of the fit """ if verbose: sys.stderr.write("getting robust regression\n") out_I = pd.Series(index=X.columns, name=y.name) # intercept out_S = pd.Series(index=X.columns, name=y.name) # slope out_T = pd.Series(index=X.columns, name=y.name) # t-value out_P = pd.Series(index=X.columns, name=y.name) # p-value for this_id, data in X.iteritems(): x = pd.Series(data, name=this_id) try: i, s, t, p = get_robust_values(x, y) except TimeoutError: sys.stderr.write( "robust timeout event:%s, gene:%s\n" % (y.name, x.name)) i, s, t, p = np.nan, np.nan, np.nan, np.nan out_I.ix[this_id] = i out_S.ix[this_id] = s out_T.ix[this_id] = t out_P.ix[this_id] = p return out_I, out_S, out_T, out_P
@timeout(50)
[docs]def apply_calc_slope(X, y, verbose=False): """X and y are dataframes, returns slope, t-value and p-value of robust regression Parameters ---------- X : pandas.DataFrame A (n_samples, n_features) Dataframe of predictor variable values y : pandas.DataFrame A (n_samples, m_features) Dataframe of response variable values verbose : bool, optional If True, output status messages Returns ------- slope : pandas.Series Slopes of the linear regression See Also -------- get_slope This is the underlying function which calculates the slope """ if verbose: sys.stderr.write("getting slope\n") out_S = pd.Series(index=X.columns, name=y.name) for this_id, data in X.iteritems(): x = pd.Series(data, name=this_id) try: s = get_slope(x, y) except TimeoutError: sys.stderr.write( "linregress timeout event:%s, gene:%s\n" % (y.name, x.name)) s = np.nan out_S.ix[this_id] = s return out_S
@timeout(50)
[docs]def apply_dcor(X, y, verbose=False): """Calcualte distance correlation between the columns of two dataframes Parameters ---------- X : pandas.DataFrame A (n_samples, n_features) Dataframe of predictor variable values y : pandas.DataFrame A (n_samples, m_features) Dataframe of response variable values verbose : bool, optional If True, output status messages Returns ------- dc : pandas.Series Distance covariance dr : pandas.Series Distance correlation dvx : pandas.Series Distance variance of x dvy : pandas.Series Distance variance of y See Also -------- get_dcor This is the underlying function that gets called to calculate the distance correlation """ if verbose: sys.stderr.write("getting dcor\n") out_DC = pd.Series(index=X.columns, name=y.name) out_DR = pd.Series(index=X.columns, name=y.name) out_DVX = pd.Series(index=X.columns, name=y.name) out_DVY = pd.Series(index=X.columns, name=y.name) for this_id, data in X.iteritems(): x = pd.Series(data, name=this_id) try: dc, dr, dvx, dvy = get_dcor(*map(np.array, [x, y])) except TimeoutError: sys.stderr.write("dcor timeout event:%s, gene:%s\n" % (y.name, x.name)) dc, dr, dvx, dvy = [np.nan] * 4 out_DC.ix[this_id] = dc out_DR.ix[this_id] = dr out_DVX.ix[this_id] = dvx out_DVY.ix[this_id] = dvy return out_DC, out_DR, out_DVX, out_DVY
[docs]def dropna_mean(x): """Drop NA values and return the mean """ return x.dropna().mean()
[docs]def spearmanr_series(x, y): """Calculate spearman r (with p-values) between two pandas series Parameters ---------- x : pandas.Series One of the two series you'd like to correlate y : pandas.Series The other series you'd like to correlate Returns ------- r_value : float The R-value of the correlation. 1 for perfect positive correlation, and -1 for perfect negative correlation p_value : float The p-value of the correlation. """ x, y = x.dropna().align(y.dropna(), 'inner') return stats.spearmanr(x, y)
[docs]def spearmanr_dataframe(A, B, axis=0): """Calculate spearman correlations between dataframes A and B Parameters ---------- A : pandas.DataFrame A n_samples x n_features1 dataframe. Must have the same number of rows as "B" B : pandas.DataFrame A n_samples x n_features2 Dataframe. Must have the same number of rows as "A" axis : int Which axis to compare. If 0, calculate correlations between all the columns of A vs te columns of B. If 1, calculate between rows. (default 0) Returns ------- correlations : pandas.DataFrame A n_features2 x n_features1 DataFrame of (spearman_r, spearman_p) tuples Notes ----- Use "applymap" to get just the R- and p-values of the resulting dataframe >>> import pandas as pd >>> import numpy as np >>> A = pd.DataFrame(np.random.randn(100).reshape(5, 20)) >>> B = pd.DataFrame(np.random.randn(55).reshape(5, 11)) >>> correls = spearmanr_dataframe(A, B) >>> correls.shape (11, 20) >>> spearman_r = correls.applymap(lambda x: x[0]) >>> spearman_p = correls.applymap(lambda x: x[1]) """ return A.apply(lambda x: B.apply(lambda y: spearmanr_series(x, y), axis=axis), axis=axis)
Olga B. Botvinnik is funded by the NDSEG fellowship and is a NumFOCUS John Hunter Technology Fellow.
Michael T. Lovci was partially funded by a fellowship from Genentech.
Partially funded by NIH grants NS075449 and HG004659 and CIRM grants RB4-06045 and TR3-05676 to Gene Yeo.