Source code for flotilla.compute.outlier

"""
Detect outlier samples in data
"""

import sklearn
import pandas as pd


class OutlierDetection(object):
    """Fit an outlier detector on a samples-by-features table.

    The detector is constructed and fitted immediately on instantiation;
    call :meth:`predict` afterwards to get a boolean outlier flag per sample.

    Parameters
    ----------
    X : pandas.DataFrame
        A (n_samples, n_features) dataframe, where the outliers will be
        detected from the rows (the samples). Missing values are replaced
        by 0 before fitting, exactly as :meth:`predict` does before
        predicting.
    method : sklearn classifier, optional
        The estimator *class* (not an instance); it is called as
        ``method(**kwargs)``. If None, defaults to OneClassSVM. The
        resulting object must have both ``fit()`` and ``predict()``
        methods, and ``predict()`` must label outliers with -1.
    nu : float, optional (default 0.1)
        An upper bound on the fraction of training errors and a lower
        bound of the fraction of support vectors. Should be in the
        interval (0, 1].
    kernel : str, optional (default='rbf')
        The kernel to be used by the outlier detection algorithm.
    gamma : float, optional (default=0.1)
        Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. If gamma is
        0.0 then 1/n_features will be used instead.
    random_state : int, optional (default=0)
        Random state of the method, for reproducibility.
    kwargs : other keyword arguments, optional
        All other keyword arguments are passed to ``method()``. The named
        arguments above (``nu``, ``kernel``, ``gamma``, ``random_state``)
        always take precedence over same-named entries.

    Attributes
    ----------
    kwargs : dict
        The full set of keyword arguments handed to ``method``.
    outlier_detector : object
        The fitted ``method(**kwargs)`` instance.
    X : pandas.DataFrame
        The original (unfilled) input data.
    outliers : pandas.Series
        Result of the most recent :meth:`predict` call (only set once
        :meth:`predict` has been called).
    """

    def __init__(self, X, method=None, nu=0.1, kernel='rbf', gamma=0.1,
                 random_state=0, **kwargs):
        if method is None:
            # A bare ``import sklearn`` does not guarantee that the ``svm``
            # submodule is loaded, so import the default estimator
            # explicitly (and only when it is actually needed).
            from sklearn.svm import OneClassSVM
            method = OneClassSVM

        # NOTE(review): every method receives nu/kernel/gamma/random_state;
        # some scikit-learn releases may reject ``random_state`` for
        # OneClassSVM -- confirm against the version this project pins.
        kwargs.update(dict(nu=nu, kernel=kernel, gamma=gamma,
                           random_state=random_state))
        self.kwargs = kwargs
        self.outlier_detector = method(**kwargs)
        self.X = X

        # Train on the same zero-filled view of the data that predict()
        # feeds to the detector, so both steps see consistent input.
        self.outlier_detector.fit(self.X.fillna(0))

    def predict(self, X=None):
        """Predict which samples are outliers

        Parameters
        ----------
        X : pandas.DataFrame, optional (default None)
            A (n_samples, n_features) dataframe with the same number of
            features as the original data. If None, predict outliers of
            the original input data. Otherwise, use the detector fitted on
            the original input data to detect outliers in this new data.
            Missing values are replaced by 0 before predicting.

        Returns
        -------
        outliers : pandas.Series
            A boolean series indexed like ``X``; True marks a sample the
            detector labelled -1 (an outlier).
        """
        if X is None:
            X = self.X

        is_outlier = self.outlier_detector.predict(X.fillna(0)) == -1

        # TODO: Since you can run this on self.X OR new X, then
        # "self.outliers" can change and not be consistent....... this is
        # a problem
        self.outliers = pd.Series(is_outlier, index=X.index)
        return self.outliers
Olga B. Botvinnik is funded by the NDSEG fellowship and is a NumFOCUS John Hunter Technology Fellow.
Michael T. Lovci was partially funded by a fellowship from Genentech.
Partially funded by NIH grants NS075449 and HG004659 and CIRM grants RB4-06045 and TR3-05676 to Gene Yeo.