From a88cf30c173f1c8b230b0720e1b327ac6c935f1d Mon Sep 17 00:00:00 2001 From: CharlesAuguste Date: Mon, 23 Mar 2020 16:15:46 +0000 Subject: [PATCH] Improving monotone constraints ("Fast" method; linked to #2305, #2717) (#2770) * Add util functions. * Added monotone_constraints_method as a parameter. * Add the intermediate constraining method. * Updated tests. * Minor fixes. * Typo. * Linting. * Ran the parameter generator for the doc. * Removed usage of the FeatureMonotone function. * more fixes * Fix. * Remove duplicated code. * Add debug checks. * Typo. * Bug fix. * Disable the use of intermediate monotone constraints and feature sampling at the same time. * Added an alias for monotone constraining method. * Use the right variable to get the number of threads. * Fix DEBUG checks. * Add back check to determine if histogram is splittable. * Added forgotten override keywords. * Perform monotone constraint update only when necessary. * Small refactor of FastLeafConstraints. * Post rebase commit. * Small refactor. * Typo. * Added comment and slightly improved logic of monotone constraints. * Forgot a const. * Vectors that are to be modified need to be pointers. * Rename FastLeafConstraints to IntermediateLeafConstraints to match documentation. * Remove overload of GoUpToFindLeavesToUpdate. * Stop memory leaking. * Fix cpplint issues. * Fix checks. * Fix more cpplint issues. * Refactor config monotone constraints method. * Typos. * Remove useless empty lines. * Add new line to separate includes. * Replace unsigned ind by size_t. * Reduce number of trials in tests to decrease CI time. * Specify monotone constraints better in tests. * Removed outer loop in test of monotone constraints. * Added categorical features to the monotone constraints tests. * Add blank line. * Regenerate parameters automatically. * Speed up ShouldKeepGoingLeftRight. 
Co-authored-by: Charles Auguste Co-authored-by: guolinke --- docs/Parameters.rst | 10 + include/LightGBM/config.h | 7 + include/LightGBM/tree.h | 24 +- src/io/config.cpp | 11 + src/io/config_auto.cpp | 6 + src/io/tree.cpp | 3 +- src/treelearner/leaf_splits.hpp | 13 +- src/treelearner/monotone_constraints.hpp | 413 ++++++++++++++++++++++- src/treelearner/serial_tree_learner.cpp | 60 +++- src/treelearner/serial_tree_learner.h | 4 +- tests/python_package_test/test_engine.py | 83 +++-- 11 files changed, 588 insertions(+), 46 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 573602c47f61..0d9ade659fef 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -460,6 +460,16 @@ Learning Control Parameters - you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for 1st feature, non-constraint for 2nd feature and increasing for the 3rd feature +- ``monotone_constraints_method`` :raw-html:`🔗︎`, default = ``basic``, type = string, aliases: ``monotone_constraining_method``, ``mc_method`` + + - used only if ``monotone_constraints`` is set + + - monotone constraints method + + - ``basic``, the most basic monotone constraints method. It does not slow the library at all, but over-constrains the predictions + + - ``intermediate``, a `more advanced method `__, which may slow the library very slightly. 
However, this method is much less constraining than the basic method and should significantly improve the results + - ``feature_contri`` :raw-html:`🔗︎`, default = ``None``, type = multi-double, aliases: ``feature_contrib``, ``fc``, ``fp``, ``feature_penalty`` - used to control feature's split gain, will use ``gain[i] = max(0, feature_contri[i]) * gain[i]`` to replace the split gain of i-th feature diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 87dc147923b6..057f56e99491 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -440,6 +440,13 @@ struct Config { // desc = you need to specify all features in order. For example, ``mc=-1,0,1`` means decreasing for 1st feature, non-constraint for 2nd feature and increasing for the 3rd feature std::vector monotone_constraints; + // alias = monotone_constraining_method, mc_method + // desc = used only if ``monotone_constraints`` is set + // desc = monotone constraints method + // descl2 = ``basic``, the most basic monotone constraints method. It does not slow the library at all, but over-constrains the predictions + // descl2 = ``intermediate``, a `more advanced method `__, which may slow the library very slightly. 
However, this method is much less constraining than the basic method and should significantly improve the results + std::string monotone_constraints_method = "basic"; + // type = multi-double // alias = feature_contrib, fc, fp, feature_penalty // default = None diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 88578feaa31e..55568e41f544 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -147,6 +147,28 @@ class Tree { inline double split_gain(int split_idx) const { return split_gain_[split_idx]; } + inline double internal_value(int node_idx) const { + return internal_value_[node_idx]; + } + + inline bool IsNumericalSplit(int node_idx) const { + return !GetDecisionType(decision_type_[node_idx], kCategoricalMask); + } + + inline int left_child(int node_idx) const { return left_child_[node_idx]; } + + inline int right_child(int node_idx) const { return right_child_[node_idx]; } + + inline int split_feature_inner(int node_idx) const { + return split_feature_inner_[node_idx]; + } + + inline int leaf_parent(int leaf_idx) const { return leaf_parent_[leaf_idx]; } + + inline uint32_t threshold_in_bin(int node_idx) const { + return threshold_in_bin_[node_idx]; + } + /*! \brief Get the number of data points that fall at or below this node*/ inline int data_count(int node) const { return node >= 0 ? 
internal_count_[node] : leaf_count_[~node]; } @@ -436,7 +458,6 @@ inline void Tree::Split(int leaf, int feature, int real_feature, // add new node split_feature_inner_[new_node_idx] = feature; split_feature_[new_node_idx] = real_feature; - split_gain_[new_node_idx] = gain; // add two new leaves left_child_[new_node_idx] = ~leaf; @@ -544,7 +565,6 @@ inline int Tree::GetLeafByMap(const std::unordered_map& feature_val return ~node; } - } // namespace LightGBM #endif // LightGBM_TREE_H_ diff --git a/src/io/config.cpp b/src/io/config.cpp index 3aeb82b0eaac..0cf1d3c8bf21 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -317,6 +317,17 @@ void Config::CheckParamConflict() { force_col_wise = true; force_row_wise = false; } + if (is_parallel && monotone_constraints_method == std::string("intermediate")) { + // In distributed mode, local node doesn't have histograms on all features, cannot perform "intermediate" monotone constraints. + Log::Warning("Cannot use \"intermediate\" monotone constraints in parallel learning, auto set to \"basic\" method."); + monotone_constraints_method = "basic"; + } + if (feature_fraction_bynode != 1.0 && monotone_constraints_method == std::string("intermediate")) { + // "intermediate" monotone constraints need to recompute splits. 
If the features are sampled when computing the + // split initially, then the sampling needs to be recorded or done once again, which is currently not supported + Log::Warning("Cannot use \"intermediate\" monotone constraints with feature fraction different from 1, auto set monotone constraints to \"basic\" method."); + monotone_constraints_method = "basic"; + } } std::string Config::ToString() const { diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 72824db8624e..59cc62a5d375 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -85,6 +85,8 @@ const std::unordered_map& Config::alias_table() { {"topk", "top_k"}, {"mc", "monotone_constraints"}, {"monotone_constraint", "monotone_constraints"}, + {"monotone_constraining_method", "monotone_constraints_method"}, + {"mc_method", "monotone_constraints_method"}, {"feature_contrib", "feature_contri"}, {"fc", "feature_contri"}, {"fp", "feature_contri"}, @@ -215,6 +217,7 @@ const std::unordered_set& Config::parameter_set() { "max_cat_to_onehot", "top_k", "monotone_constraints", + "monotone_constraints_method", "feature_contri", "forcedsplits_filename", "refit_decay_rate", @@ -414,6 +417,8 @@ void Config::GetMembersFromString(const std::unordered_map(tmp_str, ','); } + GetString(params, "monotone_constraints_method", &monotone_constraints_method); + if (GetString(params, "feature_contri", &tmp_str)) { feature_contri = Common::StringToArray(tmp_str, ','); } @@ -633,6 +638,7 @@ std::string Config::SaveMembersToString() const { str_buf << "[max_cat_to_onehot: " << max_cat_to_onehot << "]\n"; str_buf << "[top_k: " << top_k << "]\n"; str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast(monotone_constraints), ",") << "]\n"; + str_buf << "[monotone_constraints_method: " << monotone_constraints_method << "]\n"; str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n"; str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n"; str_buf << 
"[refit_decay_rate: " << refit_decay_rate << "]\n"; diff --git a/src/io/tree.cpp b/src/io/tree.cpp index affc7080472f..5b5e24a2321c 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -50,7 +50,8 @@ Tree::~Tree() { int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, double threshold_double, double left_value, double right_value, - int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type, bool default_left) { + int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, + MissingType missing_type, bool default_left) { Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain); int new_node_idx = num_leaves_ - 1; decision_type_[new_node_idx] = 0; diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 84323c3349b5..6d18c3b6f174 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -31,7 +31,6 @@ class LeafSplits { } /*! - * \brief Init split on current leaf on partial data. * \param leaf Index of current leaf * \param data_partition current data partition @@ -45,6 +44,18 @@ class LeafSplits { sum_hessians_ = sum_hessians; } + /*! + * \brief Init split on current leaf on partial data. + * \param leaf Index of current leaf + * \param sum_gradients + * \param sum_hessians + */ + void Init(int leaf, double sum_gradients, double sum_hessians) { + leaf_index_ = leaf; + sum_gradients_ = sum_gradients; + sum_hessians_ = sum_hessians; + } + /*! * \brief Init splits on current leaf, it will traverse all data to sum up the results * \param gradients diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp index 88658208e94e..4d804d7fbfa0 100644 --- a/src/treelearner/monotone_constraints.hpp +++ b/src/treelearner/monotone_constraints.hpp @@ -1,15 +1,19 @@ /*! * Copyright (c) 2020 Microsoft Corporation. All rights reserved. 
- * Licensed under the MIT License. See LICENSE file in the project root for license information. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. */ #ifndef LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_HPP_ #define LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_HPP_ -#include #include #include +#include +#include #include +#include "split_info.hpp" + namespace LightGBM { struct ConstraintEntry { @@ -26,24 +30,59 @@ struct ConstraintEntry { void UpdateMin(double new_min) { min = std::max(new_min, min); } void UpdateMax(double new_max) { max = std::min(new_max, max); } + + bool UpdateMinAndReturnBoolIfChanged(double new_min) { + if (new_min > min) { + min = new_min; + return true; + } + return false; + } + + bool UpdateMaxAndReturnBoolIfChanged(double new_max) { + if (new_max < max) { + max = new_max; + return true; + } + return false; + } +}; + +class LeafConstraintsBase { + public: + virtual ~LeafConstraintsBase() {} + virtual const ConstraintEntry& Get(int leaf_idx) const = 0; + virtual void Reset() = 0; + virtual void BeforeSplit(const Tree* tree, int leaf, int new_leaf, + int8_t monotone_type) = 0; + virtual std::vector Update( + const Tree* tree, bool is_numerical_split, + int leaf, int new_leaf, int8_t monotone_type, double right_output, + double left_output, int split_feature, const SplitInfo& split_info, + const std::vector& best_split_per_leaf) = 0; + + inline static LeafConstraintsBase* Create(const Config* config, int num_leaves); }; -template -class LeafConstraints { +class BasicLeafConstraints : public LeafConstraintsBase { public: - explicit LeafConstraints(int num_leaves) : num_leaves_(num_leaves) { + explicit BasicLeafConstraints(int num_leaves) : num_leaves_(num_leaves) { entries_.resize(num_leaves_); } - void Reset() { + void Reset() override { for (auto& entry : entries_) { entry.Reset(); } } - void UpdateConstraints(bool is_numerical_split, int leaf, int new_leaf, - int8_t monotone_type, double 
right_output, - double left_output) { + void BeforeSplit(const Tree*, int, int, int8_t) override {} + + std::vector Update(const Tree*, + bool is_numerical_split, int leaf, int new_leaf, + int8_t monotone_type, double right_output, + double left_output, int, const SplitInfo& , + const std::vector&) override { entries_[new_leaf] = entries_[leaf]; if (is_numerical_split) { double mid = (left_output + right_output) / 2.0f; @@ -55,14 +94,366 @@ class LeafConstraints { entries_[new_leaf].UpdateMin(mid); } } + return std::vector(); } - const ConstraintEntry& Get(int leaf_idx) const { return entries_[leaf_idx]; } + const ConstraintEntry& Get(int leaf_idx) const override { return entries_[leaf_idx]; } - private: + protected: int num_leaves_; std::vector entries_; }; +class IntermediateLeafConstraints : public BasicLeafConstraints { + public: + explicit IntermediateLeafConstraints(const Config* config, int num_leaves) + : BasicLeafConstraints(num_leaves), config_(config) { + leaf_is_in_monotone_subtree_.resize(num_leaves_, false); + node_parent_.resize(num_leaves_ - 1, -1); + leaves_to_update_.reserve(num_leaves_); + } + + void Reset() override { + BasicLeafConstraints::Reset(); + std::fill_n(leaf_is_in_monotone_subtree_.begin(), num_leaves_, false); + std::fill_n(node_parent_.begin(), num_leaves_ - 1, -1); + leaves_to_update_.clear(); + } + + void BeforeSplit(const Tree* tree, int leaf, int new_leaf, + int8_t monotone_type) override { + if (monotone_type != 0 || leaf_is_in_monotone_subtree_[leaf]) { + leaf_is_in_monotone_subtree_[leaf] = true; + leaf_is_in_monotone_subtree_[new_leaf] = true; + } +#ifdef DEBUG + CHECK_GE(new_leaf - 1, 0); + CHECK_LT(static_cast(new_leaf - 1), node_parent_.size()); +#endif + node_parent_[new_leaf - 1] = tree->leaf_parent(leaf); + } + + void UpdateConstraintsWithOutputs(bool is_numerical_split, int leaf, + int new_leaf, int8_t monotone_type, + double right_output, double left_output) { + entries_[new_leaf] = entries_[leaf]; + if 
(is_numerical_split) { + if (monotone_type < 0) { + entries_[leaf].UpdateMin(right_output); + entries_[new_leaf].UpdateMax(left_output); + } else if (monotone_type > 0) { + entries_[leaf].UpdateMax(right_output); + entries_[new_leaf].UpdateMin(left_output); + } + } + } + + std::vector Update(const Tree* tree, bool is_numerical_split, int leaf, + int new_leaf, int8_t monotone_type, + double right_output, double left_output, + int split_feature, const SplitInfo& split_info, + const std::vector& best_split_per_leaf) override { + leaves_to_update_.clear(); + if (leaf_is_in_monotone_subtree_[leaf]) { + UpdateConstraintsWithOutputs(is_numerical_split, leaf, new_leaf, + monotone_type, right_output, left_output); + + // Initialize variables to store information while going up the tree + int depth = tree->leaf_depth(new_leaf) - 1; + + std::vector features_of_splits_going_up_from_original_leaf; + std::vector thresholds_of_splits_going_up_from_original_leaf; + std::vector was_original_leaf_right_child_of_split; + + features_of_splits_going_up_from_original_leaf.reserve(depth); + thresholds_of_splits_going_up_from_original_leaf.reserve(depth); + was_original_leaf_right_child_of_split.reserve(depth); + + GoUpToFindLeavesToUpdate(tree, tree->leaf_parent(new_leaf), + &features_of_splits_going_up_from_original_leaf, + &thresholds_of_splits_going_up_from_original_leaf, + &was_original_leaf_right_child_of_split, + split_feature, split_info, split_info.threshold, + best_split_per_leaf); + } + return leaves_to_update_; + } + + bool OppositeChildShouldBeUpdated( + bool is_split_numerical, + const std::vector& features_of_splits_going_up_from_original_leaf, + int inner_feature, + const std::vector& was_original_leaf_right_child_of_split, + bool is_in_right_child) { + bool opposite_child_should_be_updated = true; + + // if the split is categorical, it is not handled by this optimisation, + // so the code will have to go down in the other child subtree to see if + // there are leaves to 
update + // even though it may sometimes be unnecessary + if (is_split_numerical) { + // only branches containing leaves that are contiguous to the original + // leaf need to be updated + // therefore, for the same feature, there is no use going down from the + // second time going up on the right (or on the left) + for (size_t split_idx = 0; + split_idx < features_of_splits_going_up_from_original_leaf.size(); + ++split_idx) { + if (features_of_splits_going_up_from_original_leaf[split_idx] == + inner_feature && + (was_original_leaf_right_child_of_split[split_idx] == + is_in_right_child)) { + opposite_child_should_be_updated = false; + break; + } + } + } + return opposite_child_should_be_updated; + } + + // Recursive function that goes up the tree, and then down to find leaves that + // have constraints to be updated + void GoUpToFindLeavesToUpdate( + const Tree* tree, int node_idx, + std::vector* features_of_splits_going_up_from_original_leaf, + std::vector* thresholds_of_splits_going_up_from_original_leaf, + std::vector* was_original_leaf_right_child_of_split, + int split_feature, const SplitInfo& split_info, uint32_t split_threshold, + const std::vector& best_split_per_leaf) { +#ifdef DEBUG + CHECK_GE(node_idx, 0); + CHECK_LT(static_cast(node_idx), node_parent_.size()); +#endif + int parent_idx = node_parent_[node_idx]; + // if not at the root + if (parent_idx != -1) { + int inner_feature = tree->split_feature_inner(parent_idx); + int feature = tree->split_feature(parent_idx); + int8_t monotone_type = config_->monotone_constraints[feature]; + bool is_in_right_child = tree->right_child(parent_idx) == node_idx; + bool is_split_numerical = tree->IsNumericalSplit(node_idx); + + // this is just an optimisation not to waste time going down in subtrees + // where there won't be any leaf to update + bool opposite_child_should_be_updated = OppositeChildShouldBeUpdated( + is_split_numerical, *features_of_splits_going_up_from_original_leaf, + inner_feature, 
*was_original_leaf_right_child_of_split, + is_in_right_child); + + if (opposite_child_should_be_updated) { + // if there is no monotone constraint on a split, + // then there is no relationship between its left and right leaves' values + if (monotone_type != 0) { + // these variables correspond to the current split we encounter going + // up the tree + int left_child_idx = tree->left_child(parent_idx); + int right_child_idx = tree->right_child(parent_idx); + bool left_child_is_curr_idx = (left_child_idx == node_idx); + int opposite_child_idx = + (left_child_is_curr_idx) ? right_child_idx : left_child_idx; + bool update_max_constraints_in_opposite_child_leaves = + (monotone_type < 0) ? left_child_is_curr_idx + : !left_child_is_curr_idx; + + // the opposite child needs to be updated + // so the code needs to go down in the opposite child + // to see which leaves' constraints need to be updated + GoDownToFindLeavesToUpdate( + tree, opposite_child_idx, + *features_of_splits_going_up_from_original_leaf, + *thresholds_of_splits_going_up_from_original_leaf, + *was_original_leaf_right_child_of_split, + update_max_constraints_in_opposite_child_leaves, split_feature, + split_info, true, true, split_threshold, best_split_per_leaf); + } + + // if opposite_child_should_be_updated, then it means the path to come up there was relevant, + // i.e.
that it will be helpful going down to determine which leaf + // is actually contiguous to the original 2 leaves and should be updated + // so the variables associated with the split need to be recorded + was_original_leaf_right_child_of_split->push_back( + tree->right_child(parent_idx) == node_idx); + thresholds_of_splits_going_up_from_original_leaf->push_back( + tree->threshold_in_bin(parent_idx)); + features_of_splits_going_up_from_original_leaf->push_back( + tree->split_feature_inner(parent_idx)); + } + + // since current node is not the root, keep going up + GoUpToFindLeavesToUpdate( + tree, parent_idx, features_of_splits_going_up_from_original_leaf, + thresholds_of_splits_going_up_from_original_leaf, + was_original_leaf_right_child_of_split, split_feature, split_info, + split_threshold, best_split_per_leaf); + } + } + + void GoDownToFindLeavesToUpdate( + const Tree* tree, int node_idx, + const std::vector& features_of_splits_going_up_from_original_leaf, + const std::vector& + thresholds_of_splits_going_up_from_original_leaf, + const std::vector& was_original_leaf_right_child_of_split, + bool update_max_constraints, int split_feature, + const SplitInfo& split_info, bool use_left_leaf, bool use_right_leaf, + uint32_t split_threshold, + const std::vector& best_split_per_leaf) { + // if leaf + if (node_idx < 0) { + int leaf_idx = ~node_idx; + + // splits that are not to be used shall not be updated, + // included leaf at max depth + if (best_split_per_leaf[leaf_idx].gain == kMinScore) { + return; + } + + std::pair min_max_constraints; + bool something_changed = false; + // if the current leaf is contiguous with both the new right leaf and the new left leaf + // then it may need to be greater than the max of the 2 or smaller than the min of the 2 + // otherwise, if the current leaf is contiguous with only one of the 2 new leaves, + // then it may need to be greater or smaller than it + if (use_right_leaf && use_left_leaf) { + min_max_constraints = + 
std::minmax(split_info.right_output, split_info.left_output); + } else if (use_right_leaf && !use_left_leaf) { + min_max_constraints = std::pair( + split_info.right_output, split_info.right_output); + } else { + min_max_constraints = std::pair(split_info.left_output, + split_info.left_output); + } + +#ifdef DEBUG + if (update_max_constraints) { + CHECK_GE(min_max_constraints.first, tree->LeafOutput(leaf_idx)); + } else { + CHECK_LE(min_max_constraints.second, tree->LeafOutput(leaf_idx)); + } +#endif + // depending on which split made the current leaf and the original leaves contiguous, + // either the min constraint or the max constraint of the current leaf need to be updated + if (!update_max_constraints) { + something_changed = entries_[leaf_idx].UpdateMinAndReturnBoolIfChanged( + min_max_constraints.second); + } else { + something_changed = entries_[leaf_idx].UpdateMaxAndReturnBoolIfChanged( + min_max_constraints.first); + } + // If constraints were not updated, then there is no need to update the leaf + if (!something_changed) { + return; + } + leaves_to_update_.push_back(leaf_idx); + + } else { // if node + // check if the children are contiguous with the original leaf + std::pair keep_going_left_right = ShouldKeepGoingLeftRight( + tree, node_idx, features_of_splits_going_up_from_original_leaf, + thresholds_of_splits_going_up_from_original_leaf, + was_original_leaf_right_child_of_split); + int inner_feature = tree->split_feature_inner(node_idx); + uint32_t threshold = tree->threshold_in_bin(node_idx); + bool is_split_numerical = tree->IsNumericalSplit(node_idx); + bool use_left_leaf_for_update_right = true; + bool use_right_leaf_for_update_left = true; + // if the split is on the same feature (categorical variables not supported) + // then depending on the threshold, + // the current left child may not be contiguous with the original right leaf, + // or the current right child may not be contiguous with the original left leaf + if (is_split_numerical && 
inner_feature == split_feature) { + if (threshold >= split_threshold) { + use_left_leaf_for_update_right = false; + } + if (threshold <= split_threshold) { + use_right_leaf_for_update_left = false; + } + } + + // go down left + if (keep_going_left_right.first) { + GoDownToFindLeavesToUpdate( + tree, tree->left_child(node_idx), + features_of_splits_going_up_from_original_leaf, + thresholds_of_splits_going_up_from_original_leaf, + was_original_leaf_right_child_of_split, update_max_constraints, + split_feature, split_info, use_left_leaf, + use_right_leaf_for_update_left && use_right_leaf, split_threshold, + best_split_per_leaf); + } + // go down right + if (keep_going_left_right.second) { + GoDownToFindLeavesToUpdate( + tree, tree->right_child(node_idx), + features_of_splits_going_up_from_original_leaf, + thresholds_of_splits_going_up_from_original_leaf, + was_original_leaf_right_child_of_split, update_max_constraints, + split_feature, split_info, + use_left_leaf_for_update_right && use_left_leaf, use_right_leaf, + split_threshold, best_split_per_leaf); + } + } + } + + std::pair ShouldKeepGoingLeftRight( + const Tree* tree, int node_idx, + const std::vector& features_of_splits_going_up_from_original_leaf, + const std::vector& + thresholds_of_splits_going_up_from_original_leaf, + const std::vector& was_original_leaf_right_child_of_split) { + int inner_feature = tree->split_feature_inner(node_idx); + uint32_t threshold = tree->threshold_in_bin(node_idx); + bool is_split_numerical = tree->IsNumericalSplit(node_idx); + + bool keep_going_right = true; + bool keep_going_left = true; + // left and right nodes are checked to find out if they are contiguous with + // the original leaves if so the algorithm should keep going down these nodes + // to update constraints + if (is_split_numerical) { + for (size_t i = 0; + i < features_of_splits_going_up_from_original_leaf.size(); ++i) { + if (features_of_splits_going_up_from_original_leaf[i] == + inner_feature) { + if (threshold >= 
+ thresholds_of_splits_going_up_from_original_leaf[i] && + !was_original_leaf_right_child_of_split[i]) { + keep_going_right = false; + if (!keep_going_left) { + break; + } + } + if (threshold <= + thresholds_of_splits_going_up_from_original_leaf[i] && + was_original_leaf_right_child_of_split[i]) { + keep_going_left = false; + if (!keep_going_right) { + break; + } + } + } + } + } + return std::pair(keep_going_left, keep_going_right); + } + + private: + const Config* config_; + std::vector leaves_to_update_; + // add parent node information + std::vector node_parent_; + // Keeps track of the monotone splits above the leaf + std::vector leaf_is_in_monotone_subtree_; +}; + +LeafConstraintsBase* LeafConstraintsBase::Create(const Config* config, + int num_leaves) { + if (config->monotone_constraints_method == "intermediate") { + return new IntermediateLeafConstraints(config, num_leaves); + } + return new BasicLeafConstraints(num_leaves); +} + } // namespace LightGBM #endif // LIGHTGBM_TREELEARNER_MONOTONE_CONSTRAINTS_HPP_ diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 52368d98bab3..b7569d22c8e2 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -46,7 +46,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian // push split information for all leaves best_split_per_leaf_.resize(config_->num_leaves); - constraints_.reset(new LeafConstraints(config_->num_leaves)); + constraints_.reset(LeafConstraintsBase::Create(config_, config_->num_leaves)); // initialize splits for leaf smaller_leaf_splits_.reset(new LeafSplits(train_data_->num_data())); @@ -144,6 +144,7 @@ void SerialTreeLearner::ResetConfig(const Config* config) { cegb_.reset(new CostEfficientGradientBoosting(this)); cegb_->Init(); } + constraints_.reset(LeafConstraintsBase::Create(config_, config_->num_leaves)); } Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t 
*hessians) { @@ -533,6 +534,10 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, *left_leaf = best_leaf; auto next_leaf_id = tree->NextLeafId(); + // update before tree split + constraints_->BeforeSplit(tree, best_leaf, next_leaf_id, + best_split_info.monotone_type); + bool is_numerical_split = train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin; @@ -619,10 +624,15 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); } - constraints_->UpdateConstraints(is_numerical_split, *left_leaf, *right_leaf, - best_split_info.monotone_type, - best_split_info.right_output, - best_split_info.left_output); + auto leaves_need_update = constraints_->Update( + tree, is_numerical_split, *left_leaf, *right_leaf, + best_split_info.monotone_type, best_split_info.right_output, + best_split_info.left_output, inner_feature_index, best_split_info, + best_split_per_leaf_); + // update leave outputs if needed + for (auto leaf : leaves_need_update) { + RecomputeBestSplitForLeaf(leaf, &best_split_per_leaf_[leaf]); + } } void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function residual_getter, @@ -687,4 +697,44 @@ void SerialTreeLearner::ComputeBestSplitForFeature( } } +void SerialTreeLearner::RecomputeBestSplitForLeaf(int leaf, SplitInfo* split) { + FeatureHistogram* histogram_array_; + if (!histogram_pool_.Get(leaf, &histogram_array_)) { + Log::Warning( + "Get historical Histogram for leaf %d failed, will skip the " + "``RecomputeBestSplitForLeaf``", + leaf); + return; + } + double sum_gradients = split->left_sum_gradient + split->right_sum_gradient; + double sum_hessians = split->left_sum_hessian + split->right_sum_hessian; + int num_data = split->left_count + split->right_count; + + std::vector bests(share_state_->num_threads); + LeafSplits leaf_splits(num_data); + leaf_splits.Init(leaf, 
sum_gradients, sum_hessians); + + OMP_INIT_EX(); +// find splits +#pragma omp parallel for schedule(static) num_threads(share_state_->num_threads) + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + OMP_LOOP_EX_BEGIN(); + if (!col_sampler_.is_feature_used_bytree()[feature_index] || + !histogram_array_[feature_index].is_splittable()) { + continue; + } + const int tid = omp_get_thread_num(); + int real_fidx = train_data_->RealFeatureIndex(feature_index); + ComputeBestSplitForFeature( + histogram_array_, feature_index, real_fidx, + true, + num_data, &leaf_splits, &bests[tid]); + + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + auto best_idx = ArrayArgs::ArgMax(bests); + *split = bests[best_idx]; +} + } // namespace LightGBM diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 2223b3e247bf..6a0d7f0e9a6d 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -122,6 +122,8 @@ class SerialTreeLearner: public TreeLearner { void GetShareStates(const Dataset* dataset, bool is_constant_hessian, bool is_first_time); + void RecomputeBestSplitForLeaf(int leaf, SplitInfo* split); + /*! * \brief Some initial works before training */ @@ -188,7 +190,7 @@ class SerialTreeLearner: public TreeLearner { /*! \brief store best split per feature for all leaves */ std::vector splits_per_leaf_; /*! \brief stores minimum and maximum constraints for each leaf */ - std::unique_ptr> constraints_; + std::unique_ptr constraints_; /*! 
\brief stores best thresholds for all feature for smaller leaf */ std::unique_ptr smaller_leaf_splits_; diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index ba9796c188de..51be083a9f01 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -47,6 +47,10 @@ def decreasing_metric(preds, train_data): return ('decreasing_metric', next(decreasing_generator), False) +def categorize(continuous_x): + return np.digitize(continuous_x, bins=np.arange(0, 1, 0.01)) + + class TestEngine(unittest.TestCase): def test_binary(self): X, y = load_breast_cancer(True) @@ -1010,45 +1014,74 @@ def test_init_with_subset(self): self.assertEqual(subset_data_3.get_data(), "lgb_train_data.bin") self.assertEqual(subset_data_4.get_data(), "lgb_train_data.bin") - def test_monotone_constraint(self): + def generate_trainset_for_monotone_constraints_tests(self, x3_to_category=True): + number_of_dpoints = 3000 + x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints) + x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints) + x3_negatively_correlated_with_y = np.random.random(size=number_of_dpoints) + x = np.column_stack( + (x1_positively_correlated_with_y, + x2_negatively_correlated_with_y, + categorize(x3_negatively_correlated_with_y) if x3_to_category else x3_negatively_correlated_with_y)) + + zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints) + scales = 10. 
* (np.random.random(6) + 0.5) + y = (scales[0] * x1_positively_correlated_with_y + + np.sin(scales[1] * np.pi * x1_positively_correlated_with_y) + - scales[2] * x2_negatively_correlated_with_y + - np.cos(scales[3] * np.pi * x2_negatively_correlated_with_y) + - scales[4] * x3_negatively_correlated_with_y + - np.cos(scales[5] * np.pi * x3_negatively_correlated_with_y) + + zs) + categorical_features = [] + if x3_to_category: + categorical_features = [2] + trainset = lgb.Dataset(x, label=y, categorical_feature=categorical_features) + return trainset + + def test_monotone_constraints(self): def is_increasing(y): return (np.diff(y) >= 0.0).all() def is_decreasing(y): return (np.diff(y) <= 0.0).all() - def is_correctly_constrained(learner): - n = 200 + def is_non_monotone(y): + return (np.diff(y) < 0.0).any() and (np.diff(y) > 0.0).any() + + def is_correctly_constrained(learner, x3_to_category=True): + iterations = 10 + n = 1000 variable_x = np.linspace(0, 1, n).reshape((n, 1)) fixed_xs_values = np.linspace(0, 1, n) - for i in range(n): + for i in range(iterations): fixed_x = fixed_xs_values[i] * np.ones((n, 1)) - monotonically_increasing_x = np.column_stack((variable_x, fixed_x)) + monotonically_increasing_x = np.column_stack((variable_x, fixed_x, fixed_x)) monotonically_increasing_y = learner.predict(monotonically_increasing_x) - monotonically_decreasing_x = np.column_stack((fixed_x, variable_x)) + monotonically_decreasing_x = np.column_stack((fixed_x, variable_x, fixed_x)) monotonically_decreasing_y = learner.predict(monotonically_decreasing_x) - if not (is_increasing(monotonically_increasing_y) and is_decreasing(monotonically_decreasing_y)): + non_monotone_x = np.column_stack((fixed_x, + fixed_x, + categorize(variable_x) if x3_to_category else variable_x)) + non_monotone_y = learner.predict(non_monotone_x) + if not (is_increasing(monotonically_increasing_y) + and is_decreasing(monotonically_decreasing_y) + and is_non_monotone(non_monotone_y)): return False return True 
- number_of_dpoints = 2000 - x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints) - x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints) - x = np.column_stack((x1_positively_correlated_with_y, x2_negatively_correlated_with_y)) - zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints) - y = (5 * x1_positively_correlated_with_y - + np.sin(10 * np.pi * x1_positively_correlated_with_y) - - 5 * x2_negatively_correlated_with_y - - np.cos(10 * np.pi * x2_negatively_correlated_with_y) - + zs) - trainset = lgb.Dataset(x, label=y) - params = { - 'min_data': 20, - 'num_leaves': 20, - 'monotone_constraints': '1,-1' - } - constrained_model = lgb.train(params, trainset) - self.assertTrue(is_correctly_constrained(constrained_model)) + for test_with_categorical_variable in [True, False]: + for monotone_constraints_method in ["basic", "intermediate"]: + trainset = self.generate_trainset_for_monotone_constraints_tests(test_with_categorical_variable) + params = { + 'min_data': 20, + 'num_leaves': 20, + 'monotone_constraints': [1, -1, 0], + "monotone_constraints_method": monotone_constraints_method, + "use_missing": False, + } + constrained_model = lgb.train(params, trainset) + self.assertTrue(is_correctly_constrained(constrained_model, test_with_categorical_variable)) def test_max_bin_by_feature(self): col1 = np.arange(0, 100)[:, np.newaxis]