"""
Compute predictors on data, e.g. classify or regress on features/samples
"""
import sys
import warnings
from collections import defaultdict
import math
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor, \
GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
import pandas.util.testing as pdt
from ..util import memoize, timestamp
from .decomposition import DataFramePCA, DataFrameNMF
# Name of the predictor configuration that ``Classifier`` falls back to when
# no ``predictor_name`` is given
CLASSIFIER = 'ExtraTreesClassifier'
# Name of the predictor configuration that ``Regressor`` falls back to when
# no ``predictor_name`` is given
REGRESSOR = 'ExtraTreesRegressor'
# Default multiplier of the standard deviation used by
# ``default_score_cutoff_fun`` and stored on every predictor created by
# ``PredictorConfig.__call__`` as ``_score_coefficient``
SCORE_COEFFICIENT = 2
def default_predictor_scoring_fun(cls):
    """Return scores of how important a feature is to the prediction

    Most predictors store their per-feature scores in the attribute
    ``feature_importances_``; others may use another name. This function is
    the default accessor, so that code using a predictor can always obtain
    the scores through one callable.

    Parameters
    ----------
    cls : sklearn predictor
        A fitted scikit-learn predictor, such as ExtraTreesClassifier or
        ExtraTreesRegressor, which has a ``feature_importances_`` attribute

    Returns
    -------
    scores : array-like
        Whatever ``cls.feature_importances_`` holds, returned unchanged
        (bigger is better)

    Raises
    ------
    AttributeError
        If ``cls`` has no ``feature_importances_`` attribute, e.g. because
        it has not been fit yet
    """
    return cls.feature_importances_
def default_score_cutoff_fun(arr, std_multiplier=SCORE_COEFFICIENT):
    """Calculate a minimum score cutoff for the best features

    By default, this function calculates:
    :math:`f(x) = mean(x) + 2 * std(x)`

    Parameters
    ----------
    arr : numpy.ndarray
        A numpy array of scores
    std_multiplier : float, optional (default=2)
        What to multiply the standard deviation by. E.g. if you want only
        features that are 6 standard deviations away, set this to 6.

    Returns
    -------
    cutoff : float
        Minimum score of "best" features, given these parameters
    """
    center = np.mean(arr)
    spread = np.std(arr)
    return center + std_multiplier * spread
class PredictorConfig(object):
    """A configuration for a predictor, names and tracks/sets parameters

    Some keyword arguments of the predictor are computed from the number of
    features in the data (``n_features_dependent_kwargs``), the others are
    constant.

    Set the general parameters with ``__init__``; create predictor
    instances configured with those parameters with ``__call__``.
    """

    def __init__(self, predictor_name, obj,
                 predictor_scoring_fun=default_predictor_scoring_fun,
                 score_cutoff_fun=default_score_cutoff_fun,
                 n_features_dependent_kwargs=None,
                 **kwargs):
        """Construct a predictor configuration

        Parameters
        ----------
        predictor_name : str
            A name for this predictor
        obj : sklearn predictor
            A scikit-learn predictor class, e.g.
            sklearn.ensemble.ExtraTreesClassifier. It is called with the
            configured keyword arguments in ``__call__``.
        predictor_scoring_fun : function, optional
            A function which returns the scores of a predictor. May be
            different for different predictor objects
        score_cutoff_fun : function, optional
            A function which returns the minimum "good" score of a predictor
        n_features_dependent_kwargs : dict, optional (default None)
            A dictionary of (key, function) arguments for the classifier,
            for keyword arguments that are dependent on the dataset input
            size. Each function is called with the number of features.
        kwargs : other keyword arguments, optional
            All other keyword arguments are passed along to the predictor
        """
        # A fresh dict per instance rather than a shared mutable default
        if n_features_dependent_kwargs is None:
            n_features_dependent_kwargs = {}
        self.n_features_dependent_kwargs = n_features_dependent_kwargs
        self.constant_kwargs = kwargs
        self.predictor_scoring_fun = predictor_scoring_fun
        self.score_cutoff_fun = score_cutoff_fun
        self.predictor_name = predictor_name
        sys.stdout.write(
            "{}\tPredictor {} is of type {}\n".format(timestamp(),
                                                      self.predictor_name,
                                                      obj))
        self._parent = obj
        # Expose the wrapped predictor's documentation on this instance
        self.__doc__ = obj.__doc__
        sys.stdout.write(
            "{}\tAdded {} to default predictors\n".format(
                timestamp(), self.predictor_name))

    @memoize
    def parameters(self, n_features):
        """Given a number of features, return the appropriately scaled
        keyword arguments

        Parameters
        ----------
        n_features : int
            Number of features in the data to scale appropriate keyword
            arguments to the predictor object

        Returns
        -------
        kwargs : dict
            Keyword arguments for the predictor. Constant keyword arguments
            take precedence over feature-dependent ones of the same name.
        """
        kwargs = {parameter: setter(n_features)
                  for parameter, setter
                  in self.n_features_dependent_kwargs.items()}
        # Constants are applied last so they override scaled values
        kwargs.update(self.constant_kwargs)
        return kwargs

    def __call__(self, n_features):
        """Initialize a predictor with this number of features

        Parameters
        ----------
        n_features : int
            The number of features in the data

        Returns
        -------
        predictor : sklearn predictor
            A scikit-learn predictor object initialized with the keyword
            arguments specified in ``__init__``, and the
            :py:attr:`n_features_dependent_kwargs` scaled to this number of
            features. The scoring and cutoff functions and the bookkeeping
            flags ``has_been_fit``, ``has_been_scored`` and
            ``_score_coefficient`` are attached to it as attributes.
        """
        parameters = self.parameters(n_features)
        # Trailing newline so that consecutive log lines do not run together
        sys.stdout.write(
            "{} Configuring predictor type: {} with {} features\n".format(
                timestamp(), self.predictor_name, n_features))

        predictor = self._parent(**parameters)
        predictor.score_cutoff_fun = self.score_cutoff_fun
        predictor.predictor_scoring_fun = self.predictor_scoring_fun
        predictor.has_been_fit = False
        predictor.has_been_scored = False
        predictor._score_coefficient = SCORE_COEFFICIENT
        return predictor
class PredictorConfigScalers(object):
    """Scale parameters specified in the keyword arguments based on the
    dataset size
    """
    # Defaults shared by the scalers below
    _default_coef = 2.5
    _default_nfeatures = 500

    @staticmethod
    def max_feature_scaler(n_features=_default_nfeatures,
                           coef=_default_coef):
        """Scale the maximum number of features per estimator

        Computes ``ceil(sqrt(sqrt(n_features) ** (coef + 0.3)))``, i.e.
        ``n_features`` raised to the power ``(coef + 0.3) / 4``, rounded up.

        # TODO: @mlovci what are the principles behind this scaler? to see
        each feature "x" number of times?

        Parameters
        ----------
        n_features : int, optional (default 500)
            Number of features in the data
        coef : float, optional (default 2.5)
            Controls the exponent applied to ``n_features`` (see formula
            above)

        Returns
        -------
        n_features : int
            Maximum number of features per estimator

        Raises
        ------
        ValueError
            If n_features is None
        """
        if n_features is None:
            raise ValueError("n_features must not be None")
        scaled = np.sqrt(np.sqrt(n_features) ** (coef + .3))
        return int(math.ceil(scaled))

    @staticmethod
    def n_estimators_scaler(n_features=_default_nfeatures,
                            coef=_default_coef):
        """Scale the number of estimators based on the input features

        Computes ``ceil(n_features / 50 * coef)``.

        # TODO: @mlovci what are the principles behind this scaler? to see
        each feature "x" number of times?

        Parameters
        ----------
        n_features : int, optional (default 500)
            Number of features in the data
        coef : float, optional (default 2.5)
            Number of estimators to use per 50 features (see formula above)

        Returns
        -------
        n_estimators : int
            Number of estimators to use

        Raises
        ------
        ValueError
            If n_features is None
        """
        if n_features is None:
            raise ValueError("n_features must not be None")
        # ``50.`` forces float division on Python 2 as well
        return int(math.ceil((n_features / 50.) * coef))

    @staticmethod
    def n_jobs_scaler(n_features=_default_nfeatures):
        """Scale the number of jobs based on how many features are in the
        data

        Uses one job per started block of 2000 features, capped at 4 jobs.

        Parameters
        ----------
        n_features : int, optional (default 500)
            Number of features in the data

        Returns
        -------
        n_jobs : int
            Number of jobs to use

        Raises
        ------
        ValueError
            If n_features is None
        """
        if n_features is None:
            raise ValueError("n_features must not be None")
        return int(min(4, math.ceil(n_features / 2000.)))
class ConfigOptimizer(object):
    """Choose the coef that makes some result most likely at all n_features
    (or some other function of the dataset)
    """

    @staticmethod
    def objective_average_times_seen(
            n_features, coef=PredictorConfigScalers._default_coef,
            max_feature_scaler=PredictorConfigScalers.max_feature_scaler,
            n_estimators_scaler=PredictorConfigScalers.n_estimators_scaler):
        """Evaluate an objective built from the two scalers

        Computes
        ``(n_features / max_features) * n_estimators / n_features`` where
        ``max_features = max_feature_scaler(n_features, coef)`` and
        ``n_estimators = n_estimators_scaler(n_features, coef)``.

        NOTE(review): the name suggests this is meant to be the average
        number of times each feature is seen by the ensemble - confirm with
        @mlovci. On Python 2 without ``from __future__ import division`` the
        first division truncates when both operands are integers.

        Parameters
        ----------
        n_features : int
            Number of features in the data
        coef : float, optional (default 2.5)
            Coefficient passed on to both scalers
        max_feature_scaler : function, optional
            Called as ``max_feature_scaler(n_features, coef)``
        n_estimators_scaler : function, optional
            Called as ``n_estimators_scaler(n_features, coef)``

        Returns
        -------
        objective : float
            Value of the expression above
        """
        max_features = max_feature_scaler(n_features, coef)
        n_estimators = n_estimators_scaler(n_features, coef)
        return ((n_features / max_features) *
                n_estimators) / float(n_features)
class PredictorConfigManager(object):
    """Manage several predictor configurations

    A container for predictor configurations. On construction it registers
    configurations for ExtraTreesClassifier, ExtraTreesRegressor,
    GradientBoostingClassifier and GradientBoostingRegressor.

    Attributes
    ----------
    predictor_configs : dict
        Mapping of configuration name to :py:class:`PredictorConfig`
    builtin_predictor_configs : iterable of str
        Names of the registered predictor configurations

    Methods
    -------
    predictor_config
        Create (or look up) a predictor configuration and register it under
        its name
    new_predictor_config
        Create (or look up) a predictor configuration without registering it

    Examples
    --------
    >>> pcm = PredictorConfigManager()  # doctest: +SKIP
    >>> # add a new type of predictor
    >>> config = pcm.predictor_config(  # doctest: +SKIP
    ...     'ExtraTreesClassifier', obj=ExtraTreesClassifier,
    ...     n_features_dependent_kwargs={
    ...         'max_features': PredictorConfigScalers.max_feature_scaler,
    ...         'n_estimators': PredictorConfigScalers.n_estimators_scaler,
    ...         'n_jobs': PredictorConfigScalers.n_jobs_scaler},
    ...     bootstrap=True, random_state=0,
    ...     oob_score=True,
    ...     verbose=True)
    """

    def __init__(self):
        """Construct a predictor configuration manager with
        ExtraTreesClassifier, ExtraTreesRegressor,
        GradientBoostingClassifier, and GradientBoostingRegressor as default
        predictors.
        """
        constant_extratrees_kwargs = {'bootstrap': True,
                                      'random_state': 0,
                                      'oob_score': True,
                                      'verbose': True}
        for name, obj in (('ExtraTreesClassifier', ExtraTreesClassifier),
                          ('ExtraTreesRegressor', ExtraTreesRegressor)):
            # Each configuration gets its own dict of scalers
            self.predictor_config(
                name, obj=obj,
                n_features_dependent_kwargs={
                    'max_features':
                        PredictorConfigScalers.max_feature_scaler,
                    'n_estimators':
                        PredictorConfigScalers.n_estimators_scaler,
                    'n_jobs': PredictorConfigScalers.n_jobs_scaler},
                **constant_extratrees_kwargs)

        constant_boosting_kwargs = {'n_estimators': 80,
                                    'max_features': 1000,
                                    'learning_rate': 0.2,
                                    'subsample': 0.6}
        self.predictor_config('GradientBoostingClassifier',
                              obj=GradientBoostingClassifier,
                              **constant_boosting_kwargs)
        self.predictor_config('GradientBoostingRegressor',
                              obj=GradientBoostingRegressor,
                              **constant_boosting_kwargs)

    @property
    def builtin_predictor_configs(self):
        """Names of the predictor configurations
        """
        return self.predictor_configs.keys()

    @property
    def predictor_configs(self):
        """Dict of predictor configurations
        """
        # Created lazily because ``__init__`` registers configurations
        # through this property before any attribute has been set
        if not hasattr(self, '_predictors'):
            self._predictors = {}
        return self._predictors

    def predictor_config(self, name, **kwargs):
        """Create a new predictor configuration, added to
        :py:attr:`.predictor_configs`

        Parameters
        ----------
        name : str
            Name of the predictor
        kwargs : other keyword arguments, optional
            All other keyword arguments are passed to
            :py:meth:`new_predictor_config`

        Returns
        -------
        predictorconfig : PredictorConfig
            The predictor configuration now registered under ``name``
        """
        predictor = self.new_predictor_config(name, **kwargs)
        already_registered = name in self.predictor_configs
        if already_registered and self.predictor_configs[name] != predictor:
            sys.stderr.write(
                "WARNING: over-writing predictor named: {}\n".format(name))
        self.predictor_configs[name] = predictor
        return predictor

    @memoize
    def new_predictor_config(self, name, obj=None,
                             predictor_scoring_fun=None,
                             score_cutoff_fun=None,
                             n_features_dependent_kwargs=None,
                             **kwargs):
        """Create a new predictor configuration

        Parameters
        ----------
        name : str
            Name of the predictor configuration
        obj : sklearn predictor object, optional (default=None)
            If None, ``name`` is looked up among the already registered
            configurations and no other argument may be given
        predictor_scoring_fun : function, optional (default=None)
            If None, get feature scores from obj.feature_importances_
        score_cutoff_fun : function, optional (default=None)
            If None, get the cutoff for important features by taking
            features with scores that are 2 standard deviations away from
            the mean score
        n_features_dependent_kwargs : dict, optional (default=None)
            A (key, function) dictionary of keyword argument names and
            functions which scale their values based on the dataset input
            size
        kwargs : other keyword arguments
            All other keyword arguments are passed to
            :py:class:`PredictorConfig`

        Returns
        -------
        predictorconfig : PredictorConfig
            A predictor configuration

        Raises
        ------
        ValueError
            If `obj` is None and any of the other keyword arguments is set
        KeyError
            If `obj` is None and "name" is not already in
            :py:attr:`.predictor_configs`
        TypeError
            If `n_features_dependent_kwargs` is given but is not a dict
        """
        if obj is None:
            # Without a predictor object this is only a lookup by name, so
            # no parameters can be changed. Empty dicts count as "not set".
            if n_features_dependent_kwargs == {}:
                n_features_dependent_kwargs = None
            if kwargs == {}:
                kwargs = None
            settings = (predictor_scoring_fun, score_cutoff_fun,
                        n_features_dependent_kwargs, kwargs)
            if any(setting is not None for setting in settings):
                raise ValueError(
                    "Cannot set parameters of predictor {!r} without "
                    "providing `obj`".format(name))
            try:
                return self.predictor_configs[name]
            except KeyError:
                raise KeyError("No such predictor: {}".format(name))

        if predictor_scoring_fun is None:
            predictor_scoring_fun = default_predictor_scoring_fun
        if score_cutoff_fun is None:
            score_cutoff_fun = default_score_cutoff_fun
        if n_features_dependent_kwargs is None:
            n_features_dependent_kwargs = {}
        elif not isinstance(n_features_dependent_kwargs, dict):
            raise TypeError(
                "n_features_dependent_kwargs must be a dict, not {}".format(
                    type(n_features_dependent_kwargs).__name__))

        return PredictorConfig(
            name, obj, predictor_scoring_fun=predictor_scoring_fun,
            score_cutoff_fun=score_cutoff_fun,
            n_features_dependent_kwargs=n_features_dependent_kwargs,
            **kwargs)
class PredictorDataSet(object):
    """A (n_samples, n_features) matrix and (n_samples,) trait pair,
    together with the predictors that were initialized for it
    """

    def __init__(self, data, trait,
                 data_name="MyDataset",
                 categorical_trait=False,
                 predictor_config_manager=None):
        """Store a (n_samples, n_features) matrix and (n_samples,) trait pair

        In scikit-learn parlance, store an X (data of independent variables)
        and y (target prediction) pair

        Parameters
        ----------
        data : pandas.DataFrame
            A (n_samples, n_features) dataframe, the "X"
        trait : pandas.Series
            The response variable, the "y". Its ``name`` is used as the
            trait name.
        data_name : str, optional (default="MyDataset")
            Name to store this dataset under, used together with
            ``trait.name``
        categorical_trait : bool, optional (default=False)
            Whether the trait is categorical. If True, the trait values are
            encoded as integers with a LabelEncoder.
        predictor_config_manager : PredictorConfigManager, optional
            Source of predictor configurations. If None, a new
            PredictorConfigManager is created.

        Raises
        ------
        TypeError
            If ``trait`` is not a pandas.Series
        """
        if not isinstance(trait, pd.Series):
            raise TypeError("Traits must be pandas.Series objects")

        self.dataset_name = (data_name, trait.name)
        self.data_name = data_name
        self._data = data
        self.trait = trait
        self.trait_name = self.trait.name
        self.categorical_trait = categorical_trait

        if categorical_trait:
            categories = self.traitset
            if len(categories) > 2:
                warnings.warn("WARNING: trait {} has >2 categories".format(
                    self.trait_name))
            # Encode the categories as integers
            encoder = LabelEncoder().fit(categories)
            self._y = pd.Series(data=encoder.transform(self.trait),
                                index=trait.index,
                                name=self.trait.name)
        else:
            self._y = trait

        if predictor_config_manager is None:
            predictor_config_manager = PredictorConfigManager()
        self.predictor_config_manager = predictor_config_manager

        self.n_features = self.X.shape[1]
        self._predictors = defaultdict(dict)

    @property
    def X(self):
        """(n_samples, n_features) matrix, restricted to the samples that
        are also in the trait"""
        aligned_data, _ = self._data.align(self._y, axis=0, join='inner')
        return aligned_data

    @property
    def y(self):
        """(n_samples,) vector of traits, restricted to the samples that
        are also in the data"""
        _, aligned_trait = self._data.align(self._y, axis=0, join='inner')
        return aligned_trait

    @property
    def traitset(self):
        """All unique values in :py:attr:`self.trait`"""
        # A list rather than a dict-keys view, so the result can be handed
        # to code that expects an array-like (e.g. LabelEncoder.fit) on
        # Python 3 as well
        return list(self.trait.groupby(self.trait).groups.keys())

    @property
    def predictors(self):
        """dict of initialized predictors, keyed by name

        The idea here is to keep the predictors tied to their datasets.
        None if no predictor storage has been set up.
        """
        return getattr(self, '_predictors', None)

    @memoize
    def predictor(self, name, **kwargs):
        """A single predictor, initialized for this dataset's number of
        features

        Parameters
        ----------
        name : str
            Name of the predictor to retrieve or initialize
        kwargs : other keyword arguments
            All other keyword arguments are passed to the
            ``predictor_config`` method of the predictor configuration
            manager

        Returns
        -------
        predictor : sklearn predictor
            An initialized scikit-learn classifier or regressor
        """
        config = self.predictor_config_manager.predictor_config(name,
                                                                **kwargs)
        initialized = config(self.n_features)
        self.predictors[name] = initialized
        return initialized

    def check_if_equal(self, data, trait, categorical_trait):
        """Check if this is the same as another dataset.

        Parameters
        ----------
        data : pandas.DataFrame
            Input data of another dataset
        trait : pandas.Series
            Response variable of another dataset
        categorical_trait : bool
            Whether or not ``trait`` is categorical

        Raises
        ------
        AssertionError
            If datasets are not the same
        """
        pdt.assert_frame_equal(data, self._data)
        pdt.assert_series_equal(trait, self.trait)
        # Explicit check instead of ``pdt.assert_equal``, which is not
        # available in every pandas version
        if categorical_trait != self.categorical_trait:
            raise AssertionError(
                "categorical_trait differs: {!r} != {!r}".format(
                    categorical_trait, self.categorical_trait))
class PredictorDataSetManager(object):
    """A collection of PredictorDataSet instances.

    Parameters
    ----------
    predictor_config_manager : PredictorConfigManager, optional (default None)
        A predictor configuration manager. If None, instantiate a new one.

    Attributes
    ----------
    datasets : dict
        Dict of dicts of {data: {trait: {categorical: dataset}}}. For
        convenient retrieval of predictors
    """

    def __init__(self, predictor_config_manager=None):
        if predictor_config_manager is None:
            predictor_config_manager = PredictorConfigManager()
        self.predictor_config_manager = predictor_config_manager

    @property
    def datasets(self):
        """3-layer deep dict of {data: {trait: {categorical: dataset}}}
        """
        if not hasattr(self, '_datasets'):
            # 3 layer deep (data, trait, categorical?)
            # will almost always be either categorical true or false, rarely
            # both
            self._datasets = defaultdict(lambda: defaultdict(dict))
        return self._datasets

    def dataset(self, data_name, trait_name, categorical_trait=False,
                **kwargs):
        """Create or look up a dataset and store it in
        :py:attr:`datasets`

        Parameters
        ----------
        data_name : str
            Name of this data
        trait_name : str
            Name of this trait
        categorical_trait : bool, optional (default=False)
            If True, then this trait is treated as a categorical, rather
            than a sequential trait
        kwargs : other keyword arguments
            Passed on to :py:meth:`new_dataset`

        Returns
        -------
        dataset : PredictorDataSet
            The dataset now stored under
            ``datasets[data_name][trait_name][categorical_trait]``
        """
        kwargs['categorical_trait'] = categorical_trait
        dataset = self.new_dataset(data_name, trait_name, **kwargs)

        # ``.get`` so that merely checking does not create empty entries
        stored = self.datasets.get(data_name, {}).get(trait_name, {})
        if categorical_trait in stored and \
                stored[categorical_trait] != dataset:
            sys.stderr.write(
                "WARNING: over-writing dataset named: {}\n".format(
                    (data_name, trait_name, categorical_trait)))
        self.datasets[data_name][trait_name][categorical_trait] = dataset
        return dataset

    @memoize
    def new_dataset(self, data_name, trait_name,
                    categorical_trait=False,
                    data=None, trait=None,
                    predictor_config_manager=None):
        """Create a dataset, or look up a stored one when no data is given

        Unlike :py:meth:`dataset`, this does not store the result in
        :py:attr:`datasets`.

        Parameters
        ----------
        data_name : str
            Name of this data
        trait_name : str
            Name of this trait
        categorical_trait : bool, optional (default=False)
            If True, then this trait is treated as a categorical, rather
            than a sequential trait
        data : pandas.DataFrame, optional (default=None)
            If None, the dataset is looked up by
            ``(data_name, trait_name, categorical_trait)`` and neither
            ``trait`` nor ``predictor_config_manager`` may be given
        trait : pandas.Series, optional (default=None)
            Required whenever ``data`` is given; its ``name`` must equal
            ``trait_name``
        predictor_config_manager : PredictorConfigManager (default=None)
            If None, this manager's predictor configuration manager is used

        Returns
        -------
        dataset : PredictorDataSet
            The stored or newly created dataset

        Raises
        ------
        ValueError
            If ``data`` is None but other parameters are set, if ``data``
            is given without ``trait``, or if ``trait_name`` differs from
            ``trait.name``
        KeyError
            If ``data`` is None and no such dataset is stored
        """
        if data is None:
            # Lookup by key only: you'd better not be asking to set other
            # parameters
            if trait is not None or predictor_config_manager is not None:
                raise ValueError(
                    "Cannot set `trait` or `predictor_config_manager` "
                    "without providing `data`")
            try:
                # NOTE: the outer two levels are defaultdicts, so a failed
                # lookup still creates empty entries for them
                return self.datasets[data_name][trait_name][
                    categorical_trait]
            except KeyError:
                raise KeyError("No such dataset: {}".format(
                    (data_name, trait_name, categorical_trait)))

        if trait is None:
            raise ValueError("`trait` is required when `data` is given")
        if trait_name != trait.name:
            raise ValueError(
                "trait_name {!r} does not match trait.name {!r}".format(
                    trait_name, trait.name))
        if data_name is None:
            data_name = "MyData"
        if predictor_config_manager is None:
            predictor_config_manager = self.predictor_config_manager

        return PredictorDataSet(
            data, trait, data_name, categorical_trait=categorical_trait,
            predictor_config_manager=predictor_config_manager)
class PredictorBase(object):
    """One dataset paired with one predictor, both obtained from a
    PredictorDataSetManager
    """

    def __init__(self, predictor_name, data_name, trait_name,
                 X_data=None,
                 trait=None,
                 predictor_obj=None,
                 predictor_scoring_fun=None,
                 score_cutoff_fun=None,
                 n_features_dependent_kwargs=None,
                 constant_kwargs=None,
                 is_categorical_trait=None,
                 predictor_dataset_manager=None,
                 predictor_config_manager=None,
                 feature_renamer=None,
                 groupby=None, color=None, pooled=None, order=None,
                 violinplot_kws=None, data_type=None,
                 label_to_color=None, label_to_marker=None,
                 singles=None, outliers=None):
        """A dataset-predictor pair from PredictorDatasetManager

        One dataset, one predictor, from dataset manager.

        Parameters
        ----------
        predictor_name : str
            Name for predictor
        data_name : str
            Name for this (subset of the) data
        trait_name : str
            Name for this trait
        X_data : pandas.DataFrame, optional
            Samples-by-features (row x col) dataset to train the predictor on
        trait : pandas.Series, optional
            A variable you want to predict using X_data. Indexed like
            X_data. A copy is stored, renamed to ``trait_name``.
        predictor_obj : sklearn predictor, optional
            A scikit-learn predictor that implements fit and score on
            (X_data, trait)
        predictor_scoring_fun : function, optional
            Function to get the feature scores for a scikit-learn classifier.
            This can be different for different classifiers, e.g. for a
            classifier named "x" it could be x.scores_, for other it's
            x.feature_importances_. Default: lambda x: x.feature_importances_
        score_cutoff_fun : function, optional
            Function to cut off insignificant scores
            Default: lambda scores: np.mean(x) + 2 * np.std(x)
        n_features_dependent_kwargs : dict, optional
            kwargs to the predictor that depend on n_features
            Default: {}
        constant_kwargs : dict, optional
            kwargs to the predictor that are constant, i.e.:
            {'n_estimators': 100, 'bootstrap': True, 'max_features': 'auto',
            'random_state': 0, 'oob_score': True, 'n_jobs': 2,
            'verbose': True}
        is_categorical_trait : bool, optional
            Whether the trait is categorical. None is treated as False.
        predictor_dataset_manager : PredictorDataSetManager, optional
            Manager to obtain the dataset from. If None, a new one is
            created around ``predictor_config_manager``.
        predictor_config_manager : PredictorConfigManager, optional
            Only used when ``predictor_dataset_manager`` is None. If also
            None, a new PredictorConfigManager is created.
        feature_renamer, groupby, color, pooled, order, violinplot_kws, \
data_type, label_to_color, label_to_marker, singles, outliers : optional
            Stored unchanged as attributes of the same name
        """
        self.predictor_name = predictor_name
        self.data_name = data_name
        self.trait_name = trait_name
        self.feature_renamer = feature_renamer
        self.groupby = groupby
        self.color = color
        self.pooled = pooled
        self.singles = singles
        self.outliers = outliers
        self.order = order
        self.violinplot_kws = violinplot_kws
        self.data_type = data_type
        self.label_to_color = label_to_color
        self.label_to_marker = label_to_marker

        if trait is not None:
            # Rename a copy so the caller's Series is left untouched
            trait = trait.copy()
            trait.name = trait_name

        if predictor_dataset_manager is None:
            if predictor_config_manager is None:
                self.predictor_config_manager = PredictorConfigManager()
            else:
                self.predictor_config_manager = predictor_config_manager
            self.predictor_data_manager = PredictorDataSetManager(
                self.predictor_config_manager)
        else:
            self.predictor_data_manager = predictor_dataset_manager

        # load all args and kwargs into instance attributes
        self._data = X_data
        self.trait = trait
        self.predictor_obj = predictor_obj
        self.predictor_scoring_fun = predictor_scoring_fun
        self.score_cutoff_fun = score_cutoff_fun
        self.constant_kwargs = {} if constant_kwargs is None \
            else constant_kwargs
        self.n_features_dependent_kwargs = {} \
            if n_features_dependent_kwargs is None else \
            n_features_dependent_kwargs
        self.categorical_trait = is_categorical_trait if \
            is_categorical_trait is not None else False

        # Accessing ``dataset`` and ``predictor`` here also creates them
        self.__doc__ = '{}\n\n{}\n\n{}\n\n'.format(self.__doc__,
                                                   self.dataset.__doc__,
                                                   self.predictor.__doc__)

    @property
    def dataset(self):
        """Thin reference to `dataset`"""
        return self.predictor_data_manager.dataset(
            self.data_name, self.trait_name, data=self._data,
            trait=self.trait, categorical_trait=self.categorical_trait)

    @property
    def X(self):
        """Predictive variables, aligned with target.

        Thin reference to `dataset.X`
        """
        return self.dataset.X

    @property
    def y(self):
        """Target variable, aligned with predictive variables

        Thin reference to `dataset.y`
        """
        return self.dataset.y

    @property
    def predictor(self):
        """Thin reference to ``dataset.predictor``"""
        return self.dataset.predictor(
            self.predictor_name, obj=self.predictor_obj,
            predictor_scoring_fun=self.predictor_scoring_fun,
            score_cutoff_fun=self.score_cutoff_fun,
            n_features_dependent_kwargs=self.n_features_dependent_kwargs,
            **self.constant_kwargs)

    def fit(self):
        """Fit predictor to the dataset and collect the feature scores"""
        sys.stdout.write(
            "Fitting a predictor for X:{}, y:{}, method:{}... please wait.\n"
            .format(self.dataset.data_name,
                    self.dataset.trait_name,
                    self.predictor_name))
        self.predictor.fit(self.dataset.X, self.dataset.y)
        self.has_been_fit = True
        sys.stdout.write("\tFinished.\n")
        # Collect scores from predictor, rename innate scores variable to
        # self.scores_
        scores = self.predictor.predictor_scoring_fun(self.predictor)
        self.scores_ = pd.Series(index=self.X.columns, data=scores)
        self.has_been_scored = True

    @memoize
    def predict(self, other):
        """Predict the response for new samples

        The features of ``other`` are aligned to the features the predictor
        was trained on: features missing from ``other`` are filled with 0,
        features not in the training data are dropped.

        Parameters
        ----------
        other : pandas.DataFrame
            Given a (m_samples, n_features) dataframe, predict the response

        Returns
        -------
        prediction : pandas.Series
            (m_samples,) sized series of prediction of response

        Raises
        ------
        TypeError
            If ``other`` is not a pandas DataFrame
        """
        if not isinstance(other, pd.DataFrame):
            raise TypeError("please predict on a DataFrame")
        # ``align`` returns a (left, right) tuple, so the missing values can
        # only be filled after unpacking it
        other_aligned, _ = other.align(self.X, axis=1, join='right')
        other_aligned = other_aligned.fillna(0)

        other_features = set(other.columns)
        training_features = set(self.X.columns)
        sys.stderr.write(
            "predicting value, there are {} common and {} not-common "
            "features.\n".format(
                len(other_features & training_features),
                len(other_features - training_features)))
        return pd.Series(self.predictor.predict(other_aligned.values),
                         index=other.index)

    @property
    def oob_score_(self):
        """Thin reference to `predictor.oob_score_`"""
        return self.predictor.oob_score_

    @property
    def has_been_fit(self):
        """Thin reference to `predictor.has_been_fit`"""
        return self.predictor.has_been_fit

    @has_been_fit.setter
    def has_been_fit(self, value):
        """Set whether the predictor has been fit"""
        self.predictor.has_been_fit = value

    @property
    def has_been_scored(self):
        """Thin reference to :py:attr:`.predictor.has_been_scored`"""
        return self.predictor.has_been_scored

    @has_been_scored.setter
    def has_been_scored(self, value):
        """Set whether the predictor has been scored"""
        self.predictor.has_been_scored = value

    @property
    def score_coefficient(self):
        """Thin reference to ``predictor._score_coefficient``"""
        return self.predictor._score_coefficient

    @score_coefficient.setter
    def score_coefficient(self, value):
        """Set the predictor's score coefficient"""
        self.predictor._score_coefficient = value

    @property
    def scores_(self):
        """Scores of these features' importances in this predictor"""
        return self.predictor.scores_

    @scores_.setter
    def scores_(self, value):
        """Set the predictor scores

        If at most one important feature is found, issue a warning
        """
        self.predictor.scores_ = value
        if self.n_good_features_ <= 1:
            sys.stderr.write("cutoff: %.4f\n" % self.score_cutoff_)
            # Actually emit the warning; merely constructing a UserWarning
            # instance has no effect
            warnings.warn(
                "These classifier settings produced <= 1 important "
                "feature, consider reducing score_coefficient. "
                "DataFramePCA will fail with this error: "
                "\"ValueError: failed to create intent("
                "cache|hide)|optional array-- must have defined "
                "dimensions but got (0,)\"\n", UserWarning)

    @property
    def score_cutoff_(self):
        """Get the minimum score of the 'good' features"""
        return self.predictor.score_cutoff_fun(self.scores_,
                                               self.score_coefficient)

    @property
    def important_features_(self):
        """Boolean mask of the features with scores greater than
        ``score_cutoff_``"""
        return self.scores_ > self.score_cutoff_

    @property
    def subset_(self):
        """Get the subset of the data with only important features"""
        # ``.loc`` instead of ``.ix``, which newer pandas no longer provides
        return self.X.loc[:, self.important_features_]

    @property
    def n_good_features_(self):
        """Get the number of good features"""
        return np.sum(self.important_features_)

    @memoize
    def pca(self):
        """Perform PCA on the top-performing features"""
        return DataFramePCA(self.subset_)

    @memoize
    def nmf(self):
        """Perform NMF on the top-performing features"""
        return DataFrameNMF(self.subset_)
class Regressor(PredictorBase):
    categorical = False

    # ``__init__.__doc__`` is None when docstrings are stripped (``-OO``);
    # fall back to an empty string so that the class can still be defined
    __doc__ = "Regressor for continuous response variables.\n" + \
        (PredictorBase.__init__.__doc__ or "")

    def __init__(self, data_name, trait_name,
                 predictor_name=None,
                 *args, **kwargs):
        # Use the module-wide default regressor unless one is named
        if predictor_name is None:
            predictor_name = REGRESSOR
        # A regressor always treats its trait as non-categorical
        kwargs['is_categorical_trait'] = False
        super(Regressor, self).__init__(predictor_name, data_name,
                                        trait_name, *args, **kwargs)
class Classifier(PredictorBase):
    categorical = True

    # ``__init__.__doc__`` is None when docstrings are stripped (``-OO``);
    # fall back to an empty string so that the class can still be defined
    __doc__ = "Classifier for categorical response variables.\n" + \
        (PredictorBase.__init__.__doc__ or "")

    def __init__(self, data_name, trait_name,
                 predictor_name=None,
                 *args, **kwargs):
        # Use the module-wide default classifier unless one is named
        if predictor_name is None:
            predictor_name = CLASSIFIER
        # A classifier always treats its trait as categorical
        kwargs['is_categorical_trait'] = True
        super(Classifier, self).__init__(predictor_name, data_name,
                                         trait_name, *args, **kwargs)