Added a ransac method along with a RansacGaussianProcess model
which removes outliers before fitting.
akleeman committed Sep 25, 2018
1 parent ada7e68 commit 795de2e
Showing 14 changed files with 572 additions and 59 deletions.
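The new header albatross/outlier.h is one of the 14 changed files but is collapsed in this view, so the ransac method itself is not shown. As a rough sketch of the technique the commit message describes, and not the actual albatross implementation (ransac_inliers, fit_fn, and error_fn are hypothetical names), RANSAC repeatedly fits a model to a small random subset of the data, counts how many of the remaining points agree with that fit, keeps the largest consensus set found, and refits on those inliers only:

#include <algorithm>
#include <cstddef>
#include <functional>
#include <numeric>
#include <random>
#include <vector>

// Hypothetical sketch of a RANSAC loop, not the albatross API.
// fit_fn   : fits a model to the data at the given indices.
// error_fn : prediction error of a fit model at a single index.
template <typename ModelFit>
std::vector<std::size_t> ransac_inliers(
    std::size_t n, std::size_t min_samples, double inlier_threshold,
    std::size_t max_iterations,
    const std::function<ModelFit(const std::vector<std::size_t> &)> &fit_fn,
    const std::function<double(const ModelFit &, std::size_t)> &error_fn) {
  std::mt19937 gen(2018);
  std::vector<std::size_t> shuffled(n);
  std::iota(shuffled.begin(), shuffled.end(), 0);

  std::vector<std::size_t> best_inliers;
  for (std::size_t iteration = 0; iteration < max_iterations; ++iteration) {
    // Fit to a small random candidate subset.
    std::shuffle(shuffled.begin(), shuffled.end(), gen);
    const std::vector<std::size_t> candidates(shuffled.begin(),
                                              shuffled.begin() + min_samples);
    const ModelFit fit = fit_fn(candidates);
    // Every point that this fit predicts well is declared an inlier.
    std::vector<std::size_t> inliers;
    for (std::size_t i = 0; i < n; ++i) {
      if (error_fn(fit, i) < inlier_threshold) {
        inliers.push_back(i);
      }
    }
    if (inliers.size() > best_inliers.size()) {
      best_inliers = inliers;
    }
  }
  // The caller refits using only best_inliers, which is what a wrapper
  // such as a RansacGaussianProcess model would do for its final fit.
  return best_inliers;
}

Note that the inlier indices come out of this loop already sorted, which matters if they are later combined with set-difference style index helpers like the ones added to indexing.h below.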
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -40,6 +40,7 @@ set(albatross_HEADERS
   albatross/tuning_metrics.h
   albatross/map_utils.h
   albatross/csv_utils.h
+  albatross/outlier.h
   albatross/core/keys.h
   albatross/core/dataset.h
   albatross/core/model.h
53 changes: 30 additions & 23 deletions albatross/core/indexing.h
@@ -15,15 +15,17 @@
 
 #include "core/dataset.h"
 #include <Eigen/Core>
+#include <algorithm>
 #include <functional>
 #include <iostream>
 #include <iterator>
 #include <map>
+#include <numeric>
 #include <vector>
 
 namespace albatross {
 
 using s32 = int32_t;
-using FoldIndices = std::vector<s32>;
+using FoldIndices = std::vector<std::size_t>;
 using FoldName = std::string;
 using FoldIndexer = std::map<FoldName, FoldIndices>;

@@ -83,7 +85,7 @@ inline Eigen::MatrixXd subset(const std::vector<SizeType> &row_indices,
       auto ii = static_cast<Eigen::Index>(i);
       auto jj = static_cast<Eigen::Index>(j);
       auto row_index = static_cast<Eigen::Index>(row_indices[i]);
-      auto col_index = static_cast<Eigen::Index>(col_indices[i]);
+      auto col_index = static_cast<Eigen::Index>(col_indices[j]);
       out(ii, jj) = v(row_index, col_index);
     }
   }
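The one-character change above (col_indices[i] to col_indices[j]) is a genuine bug fix: the old code indexed the columns with the row counter, so each output row repeated a single value instead of sweeping across the requested columns. A minimal standalone illustration of the fixed behavior, in plain Eigen and independent of albatross:

#include <Eigen/Core>
#include <iostream>
#include <vector>

int main() {
  // v(i, j) = 10 * i + j, so each entry encodes its own coordinates.
  Eigen::MatrixXd v(4, 4);
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 4; ++j)
      v(i, j) = 10 * i + j;

  const std::vector<std::size_t> rows = {0, 2};
  const std::vector<std::size_t> cols = {1, 3};
  Eigen::MatrixXd out(2, 2);
  for (std::size_t i = 0; i < rows.size(); ++i) {
    for (std::size_t j = 0; j < cols.size(); ++j) {
      // The fixed version indexes columns with j.  The old
      // col_indices[i] bug would have filled each output row with a
      // single repeated value ([1 1; 23 23] instead of [1 3; 21 23]).
      out(static_cast<Eigen::Index>(i), static_cast<Eigen::Index>(j)) =
          v(static_cast<Eigen::Index>(rows[i]),
            static_cast<Eigen::Index>(cols[j]));
    }
  }
  std::cout << out << std::endl;  // prints:  1  3
                                  //         21 23
}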
@@ -141,20 +143,26 @@ template <typename FeatureType> struct RegressionFold {
         test_indices(test_indices_){};
 };
 
-inline FoldIndices get_train_indices(const FoldIndices &test_indices,
-                                     const int n) {
-  const s32 k = static_cast<s32>(test_indices.size());
-  // The train indices are all the indices that are not test indices.
-  FoldIndices train_indices(n - k);
-  s32 train_cnt = 0;
-  for (s32 j = 0; j < n; j++) {
-    if (std::find(test_indices.begin(), test_indices.end(), j) ==
-        test_indices.end()) {
-      train_indices[train_cnt] = j;
-      train_cnt++;
-    }
-  }
-  return train_indices;
+template <typename X>
+inline std::vector<X> vector_set_difference(const std::vector<X> &x,
+                                            const std::vector<X> &y) {
+  std::vector<X> diff;
+  std::set_difference(x.begin(), x.end(), y.begin(), y.end(),
+                      std::inserter(diff, diff.begin()));
+  return diff;
 }
 
+/*
+ * Computes the indices between 0 and n - 1 which are NOT contained
+ * in `indices`. Here complement is the mathematical interpretation
+ * of the word, meaning "the part required to make something whole".
+ * In other words, indices and indices_complement(indices) together
+ * contain all the numbers between 0 and n - 1.
+ */
+inline FoldIndices indices_complement(const FoldIndices &indices, const int n) {
+  FoldIndices all_indices(n);
+  std::iota(all_indices.begin(), all_indices.end(), 0);
+  return vector_set_difference(all_indices, indices);
+}
 
 /*
@@ -168,7 +176,7 @@ static inline std::vector<RegressionFold<FeatureType>>
 folds_from_fold_indexer(const RegressionDataset<FeatureType> &dataset,
                         const FoldIndexer &groups) {
   // For a dataset with n features, we'll have n folds.
-  const s32 n = static_cast<s32>(dataset.features.size());
+  const std::size_t n = dataset.features.size();
   std::vector<RegressionFold<FeatureType>> folds;
   // For each fold, partition into train and test sets.
   for (const auto &pair : groups) {
@@ -177,7 +185,7 @@ folds_from_fold_indexer(const RegressionDataset<FeatureType> &dataset,
     // from changing the input FoldIndexer we perform a copy here.
     const FoldName group_name(pair.first);
     const FoldIndices test_indices(pair.second);
-    const auto train_indices = get_train_indices(test_indices, n);
+    const auto train_indices = indices_complement(test_indices, n);
 
     std::vector<FeatureType> train_features =
         subset(train_indices, dataset.features);
@@ -205,7 +213,7 @@ template <typename FeatureType>
 static inline FoldIndexer
 leave_one_out_indexer(const RegressionDataset<FeatureType> &dataset) {
   FoldIndexer groups;
-  for (s32 i = 0; i < static_cast<s32>(dataset.features.size()); i++) {
+  for (std::size_t i = 0; i < dataset.features.size(); i++) {
     FoldName group_name = std::to_string(i);
     groups[group_name] = {i};
   }
@@ -221,9 +229,8 @@ static inline FoldIndexer leave_one_group_out_indexer(
     const RegressionDataset<FeatureType> &dataset,
     const std::function<FoldName(const FeatureType &)> &get_group_name) {
   FoldIndexer groups;
-  for (s32 i = 0; i < static_cast<s32>(dataset.features.size()); i++) {
-    const std::string k =
-        get_group_name(dataset.features[static_cast<std::size_t>(i)]);
+  for (std::size_t i = 0; i < dataset.features.size(); i++) {
+    const std::string k = get_group_name(dataset.features[i]);
     // Get the existing indices if we've already encountered this group_name
     // otherwise initialize a new one.
     FoldIndices indices;
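A small standalone usage check of the two new helpers (reproduced outside albatross so the snippet compiles on its own). One subtlety: std::set_difference requires both of its input ranges to be sorted. std::iota guarantees that for all_indices, and the fold indices passed in are assumed to be sorted here as well:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>

using FoldIndices = std::vector<std::size_t>;

template <typename X>
std::vector<X> vector_set_difference(const std::vector<X> &x,
                                     const std::vector<X> &y) {
  // Note: std::set_difference assumes both ranges are sorted.
  std::vector<X> diff;
  std::set_difference(x.begin(), x.end(), y.begin(), y.end(),
                      std::inserter(diff, diff.begin()));
  return diff;
}

FoldIndices indices_complement(const FoldIndices &indices, const int n) {
  FoldIndices all_indices(n);
  std::iota(all_indices.begin(), all_indices.end(), 0);
  return vector_set_difference(all_indices, indices);
}

int main() {
  // With n = 6 and test indices {1, 4}, the train (complement)
  // indices are {0, 2, 3, 5}.
  for (const auto i : indices_complement({1, 4}, 6)) {
    std::cout << i << " ";
  }
  std::cout << "\n";
}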
11 changes: 4 additions & 7 deletions albatross/core/model.h
@@ -66,8 +66,7 @@ class RegressionModel : public ParameterHandlingMixin {
   void fit(const std::vector<FeatureType> &features,
            const MarginalDistribution &targets) {
     assert(features.size() > 0);
-    assert(static_cast<s32>(features.size()) ==
-           static_cast<s32>(targets.size()));
+    assert(features.size() == static_cast<std::size_t>(targets.size()));
     fit_(features, targets);
     has_been_fit_ = true;
   }
@@ -174,8 +173,7 @@ class RegressionModel : public ParameterHandlingMixin {
       detail::PredictTypeIdentity<JointDistribution> &&identity) const {
     assert(has_been_fit());
     JointDistribution preds = predict_(features);
-    assert(static_cast<s32>(preds.mean.size()) ==
-           static_cast<s32>(features.size()));
+    assert(static_cast<std::size_t>(preds.mean.size()) == features.size());
     return preds;
   }

@@ -184,8 +182,7 @@
       detail::PredictTypeIdentity<MarginalDistribution> &&identity) const {
     assert(has_been_fit());
     MarginalDistribution preds = predict_marginal_(features);
-    assert(static_cast<s32>(preds.mean.size()) ==
-           static_cast<s32>(features.size()));
+    assert(static_cast<std::size_t>(preds.mean.size()) == features.size());
     return preds;
   }

@@ -194,7 +191,7 @@
       detail::PredictTypeIdentity<Eigen::VectorXd> &&identity) const {
     assert(has_been_fit());
     Eigen::VectorXd preds = predict_mean_(features);
-    assert(static_cast<s32>(preds.size()) == static_cast<s32>(features.size()));
+    assert(static_cast<std::size_t>(preds.size()) == features.size());
     return preds;
   }

54 changes: 36 additions & 18 deletions albatross/models/gp.h
@@ -10,8 +10,8 @@
  * WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR PURPOSE.
  */
 
-#ifndef ALBATROSS_GP_GP_H
-#define ALBATROSS_GP_GP_H
+#ifndef ALBATROSS_MODELS_GP_H
+#define ALBATROSS_MODELS_GP_H
 
 #include "evaluate.h"
 #include "stdio.h"
@@ -46,6 +46,35 @@ template <typename FeatureType> struct GaussianProcessFit {
   }
 };
 
+template <typename FeatureType>
+inline JointDistribution predict_from_covariance_and_fit(
+    const Eigen::MatrixXd &cross_cov, const Eigen::MatrixXd &pred_cov,
+    const GaussianProcessFit<FeatureType> &model_fit) {
+  const Eigen::VectorXd pred = cross_cov.transpose() * model_fit.information;
+  auto ldlt = model_fit.train_ldlt;
+  Eigen::MatrixXd posterior_cov = ldlt.solve(cross_cov);
+  posterior_cov = cross_cov.transpose() * posterior_cov;
+  posterior_cov = pred_cov - posterior_cov;
+  return JointDistribution(pred, posterior_cov);
+}
+
+template <typename FeatureType>
+inline GaussianProcessFit<FeatureType>
+fit_from_covariance(const std::vector<FeatureType> &features,
+                    const Eigen::MatrixXd &train_cov,
+                    const MarginalDistribution &targets) {
+  GaussianProcessFit<FeatureType> model_fit;
+  model_fit.train_features = features;
+  Eigen::MatrixXd cov(train_cov);
+  if (targets.has_covariance()) {
+    cov += targets.covariance;
+  }
+  model_fit.train_ldlt = Eigen::SerializableLDLT(cov.ldlt());
+  // Precompute the information vector
+  model_fit.information = model_fit.train_ldlt.solve(targets.mean);
+  return model_fit;
+}
+
 template <typename FeatureType, typename SubFeatureType = FeatureType>
 using SerializableGaussianProcess =
     SerializableRegressionModel<FeatureType,
@@ -101,7 +130,7 @@ class GaussianProcessRegression
         symmetric_covariance(covariance_function_, features);
     auto ldlt = this->model_fit_.train_ldlt;
     pred_cov -= cross_cov * ldlt.solve(cross_cov.transpose());
-    assert(static_cast<s32>(pred.size()) == static_cast<s32>(features.size()));
+    assert(static_cast<std::size_t>(pred.size()) == features.size());
     return InspectionDistribution(pred, pred_cov);
   }

@@ -212,27 +241,17 @@ class GaussianProcessRegression
   serializable_fit_(const std::vector<FeatureType> &features,
                     const MarginalDistribution &targets) const override {
     Eigen::MatrixXd cov = symmetric_covariance(covariance_function_, features);
-    FitType model_fit;
-    model_fit.train_features = features;
-    if (targets.has_covariance()) {
-      cov += targets.covariance;
-    }
-    model_fit.train_ldlt = Eigen::SerializableLDLT(cov.ldlt());
-    // Precompute the information vector
-    model_fit.information = model_fit.train_ldlt.solve(targets.mean);
-    return model_fit;
+    return fit_from_covariance(features, cov, targets);
   }
 
   JointDistribution
   predict_(const std::vector<FeatureType> &features) const override {
     const auto cross_cov = asymmetric_covariance(
-        covariance_function_, features, this->model_fit_.train_features);
-    const Eigen::VectorXd pred = cross_cov * this->model_fit_.information;
+        covariance_function_, this->model_fit_.train_features, features);
     Eigen::MatrixXd pred_cov =
         symmetric_covariance(covariance_function_, features);
-    auto ldlt = this->model_fit_.train_ldlt;
-    pred_cov -= cross_cov * ldlt.solve(cross_cov.transpose());
-    return JointDistribution(pred, pred_cov);
+    return predict_from_covariance_and_fit(cross_cov, pred_cov,
+                                           this->model_fit_);
   }
 
   virtual MarginalDistribution
Expand Down Expand Up @@ -261,7 +280,6 @@ class GaussianProcessRegression
return pred;
}

private:
CovarianceFunction covariance_function_;
std::string model_name_;
};
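For reference, the algebra factored out into fit_from_covariance and predict_from_covariance_and_fit is the standard Gaussian process posterior. In the usual notation (assumed here, not taken from the diff), with K the training covariance, \Sigma the measurement-noise covariance from targets, y the target mean, K_* the train-by-test cross covariance, and K_{**} the test covariance:

\[
\text{information} = (K + \Sigma)^{-1} y, \qquad
\mu_* = K_*^\top \, \text{information}, \qquad
\Sigma_* = K_{**} - K_*^\top (K + \Sigma)^{-1} K_*
\]

This matches the code path above: the fit stores the LDLT factorization of K + \Sigma along with the precomputed information vector, ldlt.solve(cross_cov) computes (K + \Sigma)^{-1} K_*, and the final subtraction gives the posterior covariance. Note that predict_ now builds cross_cov with the training features first, so it holds K_* in train-by-test orientation, which is why the new helper applies cross_cov.transpose() where the old inline code multiplied cross_cov directly.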
8 changes: 4 additions & 4 deletions albatross/models/least_squares.h
@@ -70,11 +70,11 @@ class LeastSquaresRegression
 protected:
   Eigen::VectorXd
   predict_mean_(const std::vector<Eigen::VectorXd> &features) const override {
-    int n = static_cast<s32>(features.size());
+    std::size_t n = features.size();
     Eigen::VectorXd mean(n);
-    for (s32 i = 0; i < n; i++) {
-      mean(i) =
-          features[static_cast<std::size_t>(i)].dot(this->model_fit_.coefs);
+    for (std::size_t i = 0; i < n; i++) {
+      mean(static_cast<Eigen::Index>(i)) =
+          features[i].dot(this->model_fit_.coefs);
     }
     return mean;
   }
