Source code for flotilla.compute.predict

"""
Compute predictors on data, e.g. classify or regress on features/samples
"""
import sys
import warnings
from collections import defaultdict
import math

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor, \
    GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
import pandas.util.testing as pdt

from ..util import memoize, timestamp
from .decomposition import DataFramePCA, DataFrameNMF


CLASSIFIER = 'ExtraTreesClassifier'
REGRESSOR = 'ExtraTreesRegressor'
SCORE_COEFFICIENT = 2


def default_predictor_scoring_fun(cls):
    """Return scores of how important a feature is to the prediction

    Most predictors store their feature scores in the attribute
    ``cls.feature_importances_``, but others may use another name, so
    this function bridges the gap.

    Parameters
    ----------
    cls : sklearn predictor
        A scikit-learn prediction class, such as ExtraTreesClassifier or
        ExtraTreesRegressor

    Returns
    -------
    scores : numpy.ndarray
        A (n_features,) sized array of how important each feature was to
        the classification (bigger is better)
    """
    return cls.feature_importances_

def default_score_cutoff_fun(arr, std_multiplier=SCORE_COEFFICIENT):
    """Calculate a minimum score cutoff for the best features

    By default, this function calculates:
    :math:`f(x) = mean(x) + 2 * std(x)`

    Parameters
    ----------
    arr : numpy.ndarray
        A numpy array of scores
    std_multiplier : float, optional (default=2)
        What to multiply the standard deviation by, e.g. if you want only
        features that are 6 standard deviations away, set this to 6

    Returns
    -------
    cutoff : float
        Minimum score of "best" features, given these parameters
    """
    return np.mean(arr) + std_multiplier * np.std(arr)

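# A minimal usage sketch of the two defaults above, on hypothetical
# scores (any fitted sklearn ensemble exposes ``feature_importances_``
# for default_predictor_scoring_fun):
#
# >>> import numpy as np
# >>> scores = np.array([0.1, 0.2, 0.1, 0.9])
# >>> round(default_score_cutoff_fun(scores, std_multiplier=1), 3)
# 0.659
# >>> scores > 0.659   # only the standout feature survives the cutoff
# array([False, False, False,  True], dtype=bool)
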
class PredictorConfig(object):
    """A configuration for a predictor: names it and tracks/sets its
    parameters

    Dynamically configures some arguments for the predictor based on
    ``n_features`` (if this attribute exists). Set general parameters
    with ``__init__``; get instances configured with those parameters
    via ``__call__``.
    """

    def __init__(self, predictor_name, obj,
                 predictor_scoring_fun=default_predictor_scoring_fun,
                 score_cutoff_fun=default_score_cutoff_fun,
                 n_features_dependent_kwargs=None, **kwargs):
        """Construct a predictor configuration

        Parameters
        ----------
        predictor_name : str
            A name for this predictor
        obj : sklearn predictor
            A scikit-learn predictor, e.g.
            sklearn.ensemble.ExtraTreesClassifier
        predictor_scoring_fun : function, optional
            A function which returns the scores of a predictor. May be
            different for different predictor objects
        score_cutoff_fun : function, optional
            A function which returns the minimum "good" score of a
            predictor
        n_features_dependent_kwargs : dict, optional (default=None)
            A dictionary of (key, function) arguments for the classifier,
            for keyword arguments that depend on the dataset input size
        kwargs : other keyword arguments, optional
            All other keyword arguments are passed along to the predictor
        """
        if n_features_dependent_kwargs is None:
            n_features_dependent_kwargs = {}
        self.n_features_dependent_kwargs = n_features_dependent_kwargs
        self.constant_kwargs = kwargs
        self.predictor_scoring_fun = predictor_scoring_fun
        self.score_cutoff_fun = score_cutoff_fun
        self.predictor_name = predictor_name
        sys.stdout.write("{}\tPredictor {} is of type {}\n".format(
            timestamp(), self.predictor_name, obj))
        self._parent = obj
        self.__doc__ = obj.__doc__
        sys.stdout.write("{}\tAdded {} to default predictors\n".format(
            timestamp(), self.predictor_name))

    @memoize
    def parameters(self, n_features):
        """Given a number of features, return the appropriately scaled
        keyword arguments

        Parameters
        ----------
        n_features : int
            Number of features in the data, used to scale the
            appropriate keyword arguments to the predictor object
        """
        kwargs = {}
        for parameter, setter in self.n_features_dependent_kwargs.items():
            kwargs[parameter] = setter(n_features)
        for parameter, value in self.constant_kwargs.items():
            kwargs[parameter] = value
        return kwargs

    def __call__(self, n_features):
        """Initialize a predictor with this number of features

        Parameters
        ----------
        n_features : int
            The number of features in the data

        Returns
        -------
        predictor : sklearn predictor
            A scikit-learn predictor object initialized with the keyword
            arguments specified in ``__init__``, and the
            :py:attr:`n_features_dependent_kwargs` scaled to this number
            of features
        """
        parameters = self.parameters(n_features)
        sys.stdout.write(
            "{} Configuring predictor type: {} with {} features\n".format(
                timestamp(), self.predictor_name, n_features))
        predictor = self._parent(**parameters)
        predictor.score_cutoff_fun = self.score_cutoff_fun
        predictor.predictor_scoring_fun = self.predictor_scoring_fun
        predictor.has_been_fit = False
        predictor.has_been_scored = False
        predictor._score_coefficient = SCORE_COEFFICIENT
        return predictor

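# A minimal sketch of the PredictorConfig lifecycle; the name and
# parameter values here are hypothetical, and the progress lines written
# to stdout are omitted:
#
# >>> config = PredictorConfig('MyTrees', ExtraTreesClassifier,
# ...                          n_estimators=10, random_state=0)
# >>> clf = config(n_features=500)   # an ExtraTreesClassifier, tagged
# >>> clf.has_been_fit
# False
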
class PredictorConfigScalers(object):
    """Scale parameters specified in the keyword arguments based on the
    dataset size
    """

    _default_coef = 2.5
    _default_nfeatures = 500

    @staticmethod
    def max_feature_scaler(n_features=_default_nfeatures,
                           coef=_default_coef):
        """Scale the maximum number of features per estimator

        The result grows as ``n_features ** ((coef + 0.3) / 4)``, so
        ``coef`` controls how quickly the per-estimator feature budget
        grows with the dataset size.

        Parameters
        ----------
        n_features : int, optional (default=500)
            Number of features in the data
        coef : float, optional (default=2.5)
            Exponent-like coefficient; larger values allow more features
            per estimator

        Returns
        -------
        n_features : int
            Maximum number of features per estimator

        Raises
        ------
        ValueError
            If n_features is None
        """
        if n_features is None:
            raise ValueError("n_features must not be None")
        return int(math.ceil(np.sqrt(np.sqrt(n_features) ** (coef + .3))))

    @staticmethod
    def n_estimators_scaler(n_features=_default_nfeatures,
                            coef=_default_coef):
        """Scale the number of estimators based on the input features

        The number of estimators grows linearly with ``n_features``;
        ``coef`` is a multiplier on that growth.

        Parameters
        ----------
        n_features : int, optional (default=500)
            Number of features in the data
        coef : float, optional (default=2.5)
            Multiplier on the number of estimators

        Returns
        -------
        n_estimators : int
            Number of estimators to use

        Raises
        ------
        ValueError
            If n_features is None
        """
        if n_features is None:
            raise ValueError("n_features must not be None")
        return int(math.ceil((n_features / 50.) * coef))

    @staticmethod
    def n_jobs_scaler(n_features=_default_nfeatures):
        """Scale the number of parallel jobs based on how many features
        are in the data

        Uses one job per 2000 features, capped at 4 jobs.

        Parameters
        ----------
        n_features : int, optional (default=500)
            Number of features in the data

        Returns
        -------
        n_jobs : int
            Number of jobs to use

        Raises
        ------
        ValueError
            If n_features is None
        """
        if n_features is None:
            raise ValueError("n_features must not be None")
        return int(min(4, math.ceil(n_features / 2000.)))

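# How the scalers behave at the default dataset size; these values
# follow directly from the formulas above:
#
# >>> PredictorConfigScalers.max_feature_scaler(n_features=500)
# 78
# >>> PredictorConfigScalers.n_estimators_scaler(n_features=500)
# 25
# >>> PredictorConfigScalers.n_jobs_scaler(n_features=500)
# 1
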
class ConfigOptimizer(object):
    """Choose the ``coef`` that makes some result most likely at all
    values of n_features (or some other function of the dataset)
    """

    @staticmethod
    def objective_average_times_seen(
            n_features,
            coef=PredictorConfigScalers._default_coef,
            max_feature_scaler=PredictorConfigScalers.max_feature_scaler,
            n_estimators_scaler=PredictorConfigScalers.n_estimators_scaler):
        """Objective function for tuning ``coef``

        Computes ``((n_features / max_features) * n_estimators) /
        n_features``, a proxy (as the name suggests) for how often each
        feature is seen by the ensemble at this parameterization.

        Parameters
        ----------
        n_features : int
            Number of features in the data
        coef : float, optional (default=2.5)
            Coefficient passed to the scaler functions
        max_feature_scaler : function, optional
            Function scaling the maximum number of features per estimator
        n_estimators_scaler : function, optional
            Function scaling the number of estimators

        Returns
        -------
        objective : float
            Value of the objective at these parameters
        """
        return ((n_features / max_feature_scaler(n_features, coef))
                * n_estimators_scaler(n_features, coef)) / float(n_features)

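# Usage sketch: evaluate the objective at a given dataset size. The
# result is bound rather than printed, since the exact float depends on
# integer-division semantics (Python 2 vs 3):
#
# >>> objective = ConfigOptimizer.objective_average_times_seen(500)
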
class PredictorConfigManager(object):
    """Manage several predictor configurations

    A container for predictor configurations. Includes the built-in
    configurations ExtraTreesClassifier, ExtraTreesRegressor,
    GradientBoostingClassifier, and GradientBoostingRegressor.

    :py:meth:`new_predictor_config` creates (or retrieves a memoized)
    configuration; :py:meth:`predictor_config` additionally registers it
    in :py:attr:`predictor_configs`, warning if a different configuration
    of the same name is overwritten.

    Attributes
    ----------
    predictor_configs : dict
        Dict of {name: PredictorConfig}
    builtin_predictor_configs : list of str
        Names of the available predictor configurations

    Methods
    -------
    new_predictor_config
        Create a new predictor configuration

    >>> pcm = PredictorConfigManager()
    >>> # add a new type of predictor
    >>> pcm.new_predictor_config(
    ...     'ExtraTreesClassifier', obj=ExtraTreesClassifier,
    ...     n_features_dependent_kwargs={
    ...         'max_features': PredictorConfigScalers.max_feature_scaler,
    ...         'n_estimators': PredictorConfigScalers.n_estimators_scaler,
    ...         'n_jobs': PredictorConfigScalers.n_jobs_scaler},
    ...     bootstrap=True, random_state=0, oob_score=True, verbose=True)
    """

    def __init__(self):
        """Construct a predictor configuration manager with
        ExtraTreesClassifier, ExtraTreesRegressor,
        GradientBoostingClassifier, and GradientBoostingRegressor as
        default predictors.
        """
        constant_extratrees_kwargs = {'bootstrap': True,
                                      'random_state': 0,
                                      'oob_score': True,
                                      'verbose': True}

        self.predictor_config(
            'ExtraTreesClassifier', obj=ExtraTreesClassifier,
            n_features_dependent_kwargs={
                'max_features': PredictorConfigScalers.max_feature_scaler,
                'n_estimators': PredictorConfigScalers.n_estimators_scaler,
                'n_jobs': PredictorConfigScalers.n_jobs_scaler},
            **constant_extratrees_kwargs)

        self.predictor_config(
            'ExtraTreesRegressor', obj=ExtraTreesRegressor,
            n_features_dependent_kwargs={
                'max_features': PredictorConfigScalers.max_feature_scaler,
                'n_estimators': PredictorConfigScalers.n_estimators_scaler,
                'n_jobs': PredictorConfigScalers.n_jobs_scaler},
            **constant_extratrees_kwargs)

        constant_boosting_kwargs = {'n_estimators': 80,
                                    'max_features': 1000,
                                    'learning_rate': 0.2,
                                    'subsample': 0.6}

        self.predictor_config('GradientBoostingClassifier',
                              obj=GradientBoostingClassifier,
                              **constant_boosting_kwargs)
        self.predictor_config('GradientBoostingRegressor',
                              obj=GradientBoostingRegressor,
                              **constant_boosting_kwargs)

    @property
    def builtin_predictor_configs(self):
        """Names of the available predictor configurations"""
        return self.predictor_configs.keys()

    @property
    def predictor_configs(self):
        """Dict of predictor configurations"""
        if not hasattr(self, '_predictors'):
            self._predictors = {}
        return self._predictors

    def predictor_config(self, name, **kwargs):
        """Create a new predictor configuration, added to
        :py:attr:`predictor_configs`

        Parameters
        ----------
        name : str
            Name of the predictor
        kwargs : other keyword arguments, optional
            All other keyword arguments are passed to
            :py:meth:`new_predictor_config`

        Returns
        -------
        predictorconfig : PredictorConfig
            A predictor configuration
        """
        predictor = self.new_predictor_config(name, **kwargs)
        if name in self.predictor_configs and \
                self.predictor_configs[name] != predictor:
            sys.stderr.write(
                "WARNING: over-writing predictor named: {}\n".format(name))
        self.predictor_configs[name] = predictor
        return predictor

    @memoize
    def new_predictor_config(self, name, obj=None,
                             predictor_scoring_fun=None,
                             score_cutoff_fun=None,
                             n_features_dependent_kwargs=None, **kwargs):
        """Create a new predictor configuration

        Parameters
        ----------
        name : str
            Name of the predictor configuration
        obj : sklearn predictor object, optional (default=None)
            If None, retrieve the already-registered configuration called
            ``name``; in that case, no other parameters may be set
        predictor_scoring_fun : function, optional (default=None)
            If None, get feature scores from obj.feature_importances_
        score_cutoff_fun : function, optional (default=None)
            If None, get the cutoff for important features by taking
            features with scores that are 2 standard deviations away
            from the mean score
        n_features_dependent_kwargs : dict, optional (default=None)
            A (key, function) dictionary of keyword argument names and
            functions which scale their values based on the dataset
            input size
        kwargs : other keyword arguments
            All other keyword arguments are passed to
            :py:class:`PredictorConfig`

        Returns
        -------
        predictorconfig : PredictorConfig
            A predictor configuration

        Raises
        ------
        ValueError
            If `obj` is None and any of the other keyword arguments are
            not None
        KeyError
            If `obj` is None and ``name`` is not already in
            :py:attr:`predictor_configs`
        """
        if obj is None:
            # If obj is None, this is just a lookup by "name", so none of
            # the other parameters may be changed
            n_features_dependent_kwargs = None \
                if n_features_dependent_kwargs == {} \
                else n_features_dependent_kwargs
            kwargs = None if kwargs == {} else kwargs
            args = [predictor_scoring_fun, score_cutoff_fun,
                    n_features_dependent_kwargs, kwargs]
            if any([i is not None for i in args]):
                # if obj is None, you'd better not be asking to set
                # parameters on it
                raise ValueError("Cannot set parameters when looking up "
                                 "a predictor configuration by name")
            try:
                return self.predictor_configs[name]
            except KeyError:
                raise KeyError("No such predictor: {}".format(name))

        if predictor_scoring_fun is None:
            predictor_scoring_fun = default_predictor_scoring_fun

        if score_cutoff_fun is None:
            score_cutoff_fun = default_score_cutoff_fun

        if n_features_dependent_kwargs is not None:
            if type(n_features_dependent_kwargs) is not dict:
                raise TypeError("n_features_dependent_kwargs must be a "
                                "dict")
        else:
            n_features_dependent_kwargs = {}

        return PredictorConfig(
            name, obj, predictor_scoring_fun=predictor_scoring_fun,
            score_cutoff_fun=score_cutoff_fun,
            n_features_dependent_kwargs=n_features_dependent_kwargs,
            **kwargs)

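# A minimal sketch of registering a custom configuration alongside the
# built-ins; the name 'MyTrees' and its parameters are hypothetical, and
# the log lines written to stdout are omitted:
#
# >>> pcm = PredictorConfigManager()
# >>> config = pcm.predictor_config('MyTrees', obj=ExtraTreesClassifier,
# ...                               n_estimators=20, random_state=0)
# >>> 'MyTrees' in pcm.builtin_predictor_configs
# True
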
class PredictorDataSet(object):

    def __init__(self, data, trait, data_name="MyDataset",
                 categorical_trait=False,
                 predictor_config_manager=None):
        """Store a (n_samples, n_features) matrix and (n_samples,) trait
        pair

        In scikit-learn parlance, store an X (data of independent
        variables) and y (target prediction) pair.

        Parameters
        ----------
        data : pandas.DataFrame
            A (n_samples, n_features) dataframe (X)
        trait : pandas.Series
            A (n_samples,) series of the response variable (y)
        data_name : str, optional (default="MyDataset")
            Name under which to store this dataset, used together with
            trait.name
        categorical_trait : bool, optional (default=False)
            Whether y is categorical
        predictor_config_manager : PredictorConfigManager, optional
            (default=None)
            A predictor configuration manager. If None, instantiate a
            new one.

        Raises
        ------
        TypeError
            If ``trait`` is not a pandas.Series
        """
        if not isinstance(trait, pd.Series):
            raise TypeError("Traits must be pandas.Series objects")

        self.dataset_name = (data_name, trait.name)
        self.data_name = data_name
        self._data = data
        self.trait = trait
        self.trait_name = self.trait.name
        self.categorical_trait = categorical_trait

        if categorical_trait:
            if len(self.traitset) > 2:
                warnings.warn("WARNING: trait {} has >2 categories".format(
                    self.trait_name))

            # categorical encoder
            le = LabelEncoder().fit(self.traitset)

            # categorical encoding
            self._y = pd.Series(data=le.transform(self.trait),
                                index=trait.index, name=self.trait.name)
        else:
            self._y = trait

        self.predictor_config_manager = predictor_config_manager \
            if predictor_config_manager is not None \
            else PredictorConfigManager()

        self.n_features = self.X.shape[1]
        self._predictors = defaultdict(dict)

    @property
    def X(self):
        """(n_samples, n_features) matrix"""
        return self._data.align(self._y, axis=0, join='inner')[0]

    @property
    def y(self):
        """(n_samples,) vector of traits"""
        return self._data.align(self._y, axis=0, join='inner')[1]

    @property
    def traitset(self):
        """All unique values in :py:attr:`trait`"""
        return self.trait.groupby(self.trait).groups.keys()

    @property
    def predictors(self):
        """Dict of initialized predictors, keyed by name

        The idea here is to keep the predictors tied to their datasets
        """
        if hasattr(self, '_predictors'):
            return self._predictors

    @memoize
    def predictor(self, name, **kwargs):
        """A single, initialized scikit-learn predictor

        Parameters
        ----------
        name : str
            Name of the predictor to retrieve or initialize
        kwargs : other keyword arguments
            All other keyword arguments are passed to
            :py:class:`PredictorConfig`

        Returns
        -------
        predictor : sklearn predictor
            An initialized scikit-learn classifier or regressor
        """
        predictor = self.predictor_config_manager.predictor_config(
            name, **kwargs)
        initialized = predictor(self.n_features)
        self.predictors[name] = initialized
        return initialized

    def check_if_equal(self, data, trait, categorical_trait):
        """Check if this is the same as another dataset

        Parameters
        ----------
        data : pandas.DataFrame
            Input data of another dataset
        trait : pandas.Series
            Response variable of another dataset
        categorical_trait : bool
            Whether or not ``trait`` is categorical

        Raises
        ------
        AssertionError
            If the datasets are not the same
        """
        pdt.assert_frame_equal(data, self._data)
        pdt.assert_series_equal(trait, self.trait)
        pdt.assert_equal(categorical_trait, self.categorical_trait)

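# A minimal sketch of building a dataset directly, with hypothetical
# random data and a binary trait (configuration-manager log lines
# omitted):
#
# >>> import numpy as np
# >>> import pandas as pd
# >>> data = pd.DataFrame(np.random.randn(10, 5))
# >>> trait = pd.Series(['a', 'b'] * 5, name='group')
# >>> ds = PredictorDataSet(data, trait, data_name='random',
# ...                       categorical_trait=True)
# >>> ds.X.shape          # data aligned to the trait
# (10, 5)
# >>> sorted(ds.traitset)
# ['a', 'b']
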
class PredictorDataSetManager(object):
    """A collection of PredictorDataSet instances

    Parameters
    ----------
    predictor_config_manager : PredictorConfigManager, optional
        (default=None)
        A predictor configuration manager. If None, instantiate a new
        one.

    Attributes
    ----------
    datasets : dict
        Dict of dicts of {data: {trait: {categorical: dataset}}}, for
        convenient retrieval of predictors
    """

    def __init__(self, predictor_config_manager=None):
        self.predictor_config_manager = predictor_config_manager \
            if predictor_config_manager is not None \
            else PredictorConfigManager()

    @property
    def datasets(self):
        """3-layer deep dict of {data: {trait: {categorical: dataset}}}"""
        if not hasattr(self, '_datasets'):
            # 3 layers deep: (data, trait, categorical?). Will almost
            # always be either categorical True or False, rarely both
            self._datasets = defaultdict(lambda: defaultdict(dict))
        return self._datasets

    def dataset(self, data_name, trait_name, categorical_trait=False,
                **kwargs):
        """Retrieve a dataset, creating and registering it if necessary

        Parameters
        ----------
        data_name : str
            Name of this data
        trait_name : str
            Name of this trait
        categorical_trait : bool, optional (default=False)
            If True, then this trait is treated as a categorical, rather
            than a sequential trait
        kwargs : other keyword arguments
            All other keyword arguments are passed to
            :py:meth:`new_dataset`

        Returns
        -------
        dataset : PredictorDataSet
            The dataset, stored under
            ``datasets[data_name][trait_name][categorical_trait]``
        """
        kwargs['categorical_trait'] = categorical_trait
        dataset = self.new_dataset(data_name, trait_name, **kwargs)

        if data_name in self.datasets \
                and trait_name in self.datasets[data_name] \
                and categorical_trait in self.datasets[data_name][
                    trait_name] \
                and self.datasets[data_name][trait_name][
                    categorical_trait] != dataset:
            sys.stderr.write(
                "WARNING: over-writing dataset named: {}\n".format(
                    (data_name, trait_name, categorical_trait)))
        self.datasets[data_name][trait_name][categorical_trait] = dataset
        return dataset

    @memoize
    def new_dataset(self, data_name, trait_name, categorical_trait=False,
                    data=None, trait=None, predictor_config_manager=None):
        """Create a new dataset, or retrieve an existing one by name

        Unlike :py:meth:`dataset`, this method does not register the
        result in :py:attr:`datasets`; it only constructs a (memoized)
        PredictorDataSet, or looks one up when ``data`` is None.

        Parameters
        ----------
        data_name : str
            Name of this data
        trait_name : str
            Name of this trait
        categorical_trait : bool, optional (default=False)
            If True, then this trait is treated as a categorical, rather
            than a sequential trait
        data : pandas.DataFrame, optional (default=None)
            A (n_samples, n_features) dataframe. If None, look up an
            existing dataset by name instead of creating one
        trait : pandas.Series, optional (default=None)
            A (n_samples,) series of the response variable. Required
            when ``data`` is provided
        predictor_config_manager : PredictorConfigManager, optional
            (default=None)
            If None, use this manager's own configuration manager

        Returns
        -------
        dataset : PredictorDataSet
            A dataset tying the data and trait together
        """
        if data is None:
            # Try to get this dataset by its key in the dictionary
            args = [data, trait, predictor_config_manager]
            if any([i is not None for i in args]):
                # if data is None, you'd better not be asking to set
                # other parameters
                raise ValueError("Cannot set other parameters when "
                                 "looking up a dataset by name")
            try:
                return self.datasets[data_name][trait_name][
                    categorical_trait]
            except KeyError:
                raise KeyError("No such dataset: {}".format(
                    (data_name, trait_name, categorical_trait)))

        if trait is None:
            raise ValueError("Must provide a trait along with data")

        if trait_name != trait.name:
            raise ValueError("trait_name must match trait.name")

        if data_name is None:
            data_name = "MyData"

        predictor_config_manager = predictor_config_manager \
            if predictor_config_manager is not None \
            else self.predictor_config_manager

        return PredictorDataSet(
            data, trait, data_name, categorical_trait=categorical_trait,
            predictor_config_manager=predictor_config_manager)

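# A minimal sketch of the manager, reusing the hypothetical ``data`` and
# ``trait`` from the PredictorDataSet example above. Datasets are keyed
# by (data_name, trait_name, categorical_trait):
#
# >>> pdm = PredictorDataSetManager()
# >>> ds = pdm.dataset('random', 'group', data=data, trait=trait,
# ...                  categorical_trait=True)
# >>> pdm.datasets['random']['group'][True] is ds
# True
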
class PredictorBase(object):

    def __init__(self, predictor_name, data_name, trait_name,
                 X_data=None, trait=None, predictor_obj=None,
                 predictor_scoring_fun=None, score_cutoff_fun=None,
                 n_features_dependent_kwargs=None, constant_kwargs=None,
                 is_categorical_trait=None, predictor_dataset_manager=None,
                 predictor_config_manager=None, feature_renamer=None,
                 groupby=None, color=None, pooled=None, order=None,
                 violinplot_kws=None, data_type=None, label_to_color=None,
                 label_to_marker=None, singles=None, outliers=None):
        """A dataset-predictor pair from PredictorDataSetManager

        One dataset, one predictor, from the dataset manager.

        Parameters
        ----------
        predictor_name : str
            Name for this predictor
        data_name : str
            Name for this (subset of the) data
        trait_name : str
            Name for this trait
        X_data : pandas.DataFrame, optional
            Samples-by-features (row x col) dataset to train the
            predictor on
        trait : pandas.Series, optional
            A variable you want to predict using X_data. Indexed like
            X_data.
        predictor_obj : sklearn predictor, optional
            A scikit-learn predictor that implements fit and score on
            (X_data, trait). Default: ExtraTreesClassifier
        predictor_scoring_fun : function, optional
            Function to get the feature scores for a scikit-learn
            classifier. This can be different for different classifiers,
            e.g. for a classifier named "x" it could be x.scores_, for
            others it's x.feature_importances_.
            Default: lambda x: x.feature_importances_
        score_cutoff_fun : function, optional
            Function to cut off insignificant scores.
            Default: lambda scores: np.mean(scores) + 2 * np.std(scores)
        n_features_dependent_kwargs : dict, optional
            kwargs to the predictor that depend on n_features.
            Default: {}
        constant_kwargs : dict, optional
            kwargs to the predictor that are constant, e.g.:
            {'n_estimators': 100, 'bootstrap': True,
             'max_features': 'auto', 'random_state': 0,
             'oob_score': True, 'n_jobs': 2, 'verbose': True}
        """
        self.predictor_name = predictor_name
        self.data_name = data_name
        self.trait_name = trait_name

        self.feature_renamer = feature_renamer
        self.groupby = groupby
        self.color = color
        self.pooled = pooled
        self.singles = singles
        self.outliers = outliers
        self.order = order
        self.violinplot_kws = violinplot_kws
        self.data_type = data_type
        self.label_to_color = label_to_color
        self.label_to_marker = label_to_marker

        if trait is not None:
            trait = trait.copy()
            trait.name = trait_name

        if predictor_dataset_manager is None:
            if predictor_config_manager is None:
                self.predictor_config_manager = PredictorConfigManager()
            else:
                self.predictor_config_manager = predictor_config_manager

            self.predictor_data_manager = PredictorDataSetManager(
                self.predictor_config_manager)
        else:
            self.predictor_data_manager = predictor_dataset_manager

        # load all args and kwargs into instance attributes
        self._data = X_data
        self.trait = trait
        self.predictor_obj = predictor_obj
        self.predictor_scoring_fun = predictor_scoring_fun
        self.score_cutoff_fun = score_cutoff_fun
        self.constant_kwargs = {} if constant_kwargs is None \
            else constant_kwargs
        self.n_features_dependent_kwargs = {} \
            if n_features_dependent_kwargs is None \
            else n_features_dependent_kwargs
        self.categorical_trait = is_categorical_trait \
            if is_categorical_trait is not None else False

        self.__doc__ = '{}\n\n{}\n\n{}\n\n'.format(self.__doc__,
                                                   self.dataset.__doc__,
                                                   self.predictor.__doc__)

    @property
    def dataset(self):
        """Thin reference to ``dataset``"""
        return self.predictor_data_manager.dataset(
            self.data_name, self.trait_name, data=self._data,
            trait=self.trait, categorical_trait=self.categorical_trait)

    @property
    def X(self):
        """Predictive variables, aligned with the target

        Thin reference to ``dataset.X``
        """
        return self.dataset.X

    @property
    def y(self):
        """Target variable, aligned with the predictive variables

        Thin reference to ``dataset.y``
        """
        return self.dataset.y

    @property
    def predictor(self):
        """Thin reference to ``dataset.predictor``"""
        return self.dataset.predictor(
            self.predictor_name, obj=self.predictor_obj,
            predictor_scoring_fun=self.predictor_scoring_fun,
            score_cutoff_fun=self.score_cutoff_fun,
            n_features_dependent_kwargs=self.n_features_dependent_kwargs,
            **self.constant_kwargs)

    def fit(self):
        """Fit the predictor to the dataset"""
        sys.stdout.write(
            "Fitting a predictor for X:{}, y:{}, method:{}... please "
            "wait.\n".format(self.dataset.data_name,
                             self.dataset.trait_name,
                             self.predictor_name))
        self.predictor.fit(self.dataset.X, self.dataset.y)
        self.has_been_fit = True
        sys.stdout.write("\tFinished.\n")
        # Collect scores from the predictor and rename its innate scores
        # variable to self.scores_
        scores = self.predictor.predictor_scoring_fun(self.predictor)
        self.scores_ = pd.Series(index=self.X.columns, data=scores)
        self.has_been_scored = True

    @memoize
    def predict(self, other):
        """Predict the response variable for other data

        Parameters
        ----------
        other : pandas.DataFrame
            Given a (m_samples, n_features) dataframe, predict the
            response

        Returns
        -------
        prediction : pandas.Series
            (m_samples,) sized series of the predicted response

        Raises
        ------
        TypeError
            If ``other`` is not a pandas DataFrame
        """
        if not isinstance(other, pd.DataFrame):
            raise TypeError("please predict on a DataFrame")

        # Align the new data to the training features, filling features
        # absent from `other` with zeros
        other_aligned, _ = other.align(self.X, axis=1, join='right')
        other_aligned = other_aligned.fillna(0)

        common = set(other.columns) & set(self.X.columns)
        not_common = set(other.columns) - set(self.X.columns)
        sys.stderr.write(
            "predicting value, there are {} common and {} not-common "
            "features.\n".format(len(common), len(not_common)))

        return pd.Series(self.predictor.predict(other_aligned.values),
                         index=other.index)

    @property
    def oob_score_(self):
        """Thin reference to ``predictor.oob_score_``"""
        return self.predictor.oob_score_

    @property
    def has_been_fit(self):
        """Thin reference to ``predictor.has_been_fit``"""
        return self.predictor.has_been_fit

    @has_been_fit.setter
    def has_been_fit(self, value):
        """Set whether the predictor has been fit"""
        self.predictor.has_been_fit = value

    @property
    def has_been_scored(self):
        """Thin reference to :py:attr:`predictor.has_been_scored`"""
        return self.predictor.has_been_scored

    @has_been_scored.setter
    def has_been_scored(self, value):
        """Set whether the predictor has been scored"""
        self.predictor.has_been_scored = value

    @property
    def score_coefficient(self):
        """Thin reference to ``predictor._score_coefficient``"""
        return self.predictor._score_coefficient

    @score_coefficient.setter
    def score_coefficient(self, value):
        """Set the predictor's score coefficient"""
        self.predictor._score_coefficient = value

    @property
    def scores_(self):
        """Scores of these features' importances in this predictor"""
        return self.predictor.scores_

    @scores_.setter
    def scores_(self, value):
        """Set the predictor scores

        If at most one important feature is found, issue a warning
        """
        self.predictor.scores_ = value
        if self.n_good_features_ <= 1:
            sys.stderr.write("cutoff: %.4f\n" % self.score_cutoff_)
            warnings.warn(
                "These classifier settings produced <= 1 important "
                "feature, consider reducing score_coefficient. "
                "DataFramePCA will fail with this error: "
                "\"ValueError: failed to create intent("
                "cache|hide)|optional array-- must have defined "
                "dimensions but got (0,)\"")

    @property
    def score_cutoff_(self):
        """Get the minimum score of the 'good' features"""
        return self.predictor.score_cutoff_fun(self.scores_,
                                               self.score_coefficient)

    @property
    def important_features_(self):
        """Boolean series: which features score above ``score_cutoff_``"""
        return self.scores_ > self.score_cutoff_

    @property
    def subset_(self):
        """Get the subset of the data with only the important features"""
        return self.X.loc[:, self.important_features_]

    @property
    def n_good_features_(self):
        """Get the number of good features"""
        return np.sum(self.important_features_)

    @memoize
    def pca(self):
        """Perform PCA on the subset of top-performing features"""
        return DataFramePCA(self.subset_)

    @memoize
    def nmf(self):
        """Perform NMF on the subset of top-performing features"""
        return DataFrameNMF(self.subset_)

class Regressor(PredictorBase):

    categorical = False

    __doc__ = "Regressor for continuous response variables.\n" + \
        PredictorBase.__init__.__doc__

    def __init__(self, data_name, trait_name, predictor_name=None,
                 *args, **kwargs):
        if predictor_name is None:
            predictor_name = REGRESSOR
        kwargs['is_categorical_trait'] = False
        super(Regressor, self).__init__(predictor_name, data_name,
                                        trait_name, *args, **kwargs)

class Classifier(PredictorBase):

    categorical = True

    __doc__ = "Classifier for categorical response variables.\n" + \
        PredictorBase.__init__.__doc__

    def __init__(self, data_name, trait_name, predictor_name=None,
                 *args, **kwargs):
        if predictor_name is None:
            predictor_name = CLASSIFIER
        kwargs['is_categorical_trait'] = True
        super(Classifier, self).__init__(predictor_name, data_name,
                                         trait_name, *args, **kwargs)

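# An end-to-end sketch: train a Classifier on hypothetical random data
# against a binary trait, then inspect and reuse it. ``fit`` prints
# progress messages (omitted here), and ``predict`` returns the
# label-encoded classes:
#
# >>> import numpy as np
# >>> import pandas as pd
# >>> X = pd.DataFrame(np.random.randn(20, 50))
# >>> y = pd.Series(np.resize(['a', 'b'], 20), name='group')
# >>> clf = Classifier('my_data', 'group', X_data=X, trait=y)
# >>> clf.fit()
# >>> clf.n_good_features_ >= 0          # features above score_cutoff_
# True
# >>> held_out = pd.DataFrame(np.random.randn(5, 50))
# >>> predictions = clf.predict(held_out)   # (5,) pandas.Series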