From 066693754b982a428eb1bc61942b8b12f0cd3f10 Mon Sep 17 00:00:00 2001 From: Chris Holden Date: Fri, 21 Aug 2015 15:47:13 -0400 Subject: [PATCH] Switch to YAML config files; add model pickles See issues #26, #29, #30 --- examples/classifiers/RandomForest.yaml | 32 ++++++ examples/p022r049_example.ini | 103 ----------------- examples/p022r049_example.yaml | 102 +++++++++++++++++ examples/p035r032_example.ini | 103 ----------------- examples/p035r032_example.yaml | 107 ++++++++++++++++++ .../regression/LassoCV_n100_alpha_0-50.pkl | Bin 0 -> 1263 bytes examples/regression/Lasso_alpha20.pkl | Bin 0 -> 291 bytes examples/regression/OLS.pkl | Bin 0 -> 114 bytes examples/regression/README.md | 26 +++++ yatsm/classifiers/RandomForest.ini | 34 ------ 10 files changed, 267 insertions(+), 240 deletions(-) create mode 100644 examples/classifiers/RandomForest.yaml delete mode 100644 examples/p022r049_example.ini create mode 100644 examples/p022r049_example.yaml delete mode 100644 examples/p035r032_example.ini create mode 100644 examples/p035r032_example.yaml create mode 100644 examples/regression/LassoCV_n100_alpha_0-50.pkl create mode 100644 examples/regression/Lasso_alpha20.pkl create mode 100644 examples/regression/OLS.pkl create mode 100644 examples/regression/README.md delete mode 100644 yatsm/classifiers/RandomForest.ini diff --git a/examples/classifiers/RandomForest.yaml b/examples/classifiers/RandomForest.yaml new file mode 100644 index 00000000..a534db6e --- /dev/null +++ b/examples/classifiers/RandomForest.yaml @@ -0,0 +1,32 @@ +# Default configuration file for Random Forest algorithm +# +# See sklearn implementation API here: +# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html + +# Specify algorithm and hyperparameters +algorithm: RandomForest + # Number of trees in forest + n_estimators: 500 + # Criterion for quality of split ['gini', 'entropy'] + criterion: "gini" + # number of features tried at each node + max_features: 
"auto" + # maximum depth of tree + max_depth: + # minimum number of samples required to split an internal node + min_samples_split: 2 + # minimum number of samples in newly created leaves + min_samples_leaf: 1 + # maximum leaf nodes -- if not None max_depth is ignored + max_leaf_nodes: + # Use bootstrap sample + bootstrap: True + # use out-of-bag sample for generalization error + oob_score: True + # number of jobs in parallel for fit and predict + n_jobs: 1 + +# Algorithm fit parameters +fit: + # Sample weights for training data + sample_weight: diff --git a/examples/p022r049_example.ini b/examples/p022r049_example.ini deleted file mode 100644 index 1b34e4bd..00000000 --- a/examples/p022r049_example.ini +++ /dev/null @@ -1,103 +0,0 @@ -# Example configuration file for YATSM line runner -# -# This configuration includes details about the dataset and how YATSM should -# run - -# Version of config -[metadata] -version = 0.4 - -# Section for Dataset -[dataset] -# Text file containing dates and images -input_file = /home/ceholden/Documents/yatsm/examples/p022r049_input.csv -# Input date format -date_format = %Y%j -# Output location -output = /home/ceholden/Documents/landsat_stack/p022r049/images/YATSM -# Output file prefix (e.g., [prefix]_[line].npz) -output_prefix = yatsm_r -# Total number of bands -n_bands = 8 -# Mask band (e.g., Fmask) -mask_band = 8 -# List of integer values to mask within the mask band -mask_values = 2, 3, 4, 255 -# Valid range of non-mask band data -# specify 1 range for all bands, or specify ranges for each band -valid_range = 0, 10000 -# Indices for multi-temporal cloud masking (indexed on 1) -green_band = 2 -swir1_band = 5 -# Use BIP image reader? 
If not, use GDAL to read in -use_bip_reader = true -# Directory location for caching dataset lines -cache_line_dir = /home/ceholden/Documents/landsat_stack/p022r049/images/.yatsm_cache - -# Section for YATSM parameters -[YATSM] -consecutive = 5 -threshold = 3 -min_obs = 16 -min_rmse = 150 -# Patsy style model specification for timeseries model -design_matrix = 1 + x + harm(x, 1) -test_indices = 2, 4, 5 -# Number of days between model fit updates during monitoring period -retrain_time = 365.25 -# Multitemporal cloud/shadow screening algorithm and threshold -screening = RLM -screening_crit = 400.0 -# Training period slope test -slope_test = False -# Remove observation if no change has been detected, but first observation is -# above threshold -remove_noise = True -dynamic_rmse = False -lassocv = False -reverse = False -robust = False -# Commission test alpha value for test; leave blank to ignore test -commission_alpha = - -# Section for phenology fitting -[phenology] -# Boolean for calculating phenology, or not -calc_pheno = False -# Specification for dataset indices required for EVI based phenology monitoring -red_index = 2 -nir_index = 3 -blue_index = 0 -# Scale factor for reflectance bands -scale = 0.0001 -# You can also specify index of EVI if contained in dataset to override calculation -evi_index = -evi_scale = -# Number of years to group together when normalizing EVI to upper and lower percentiles -year_interval = 3 -# Upper and lower percentiles of EVI used for max/min scaling -q_min = 10 -q_max = 90 - -# Section for segmentation -[segment] -# Segmentation image -segmentation = -# Resegmentation threshold (0 turns off resegmentation) -resegment_crit = 0 -# Resegmentation size thresholds -resegment_minpix = 5 -resegment_maxpix = 50 - -# Section for training and classification -[classification] -# Training data file -training_image = /home/ceholden/Documents/yatsm/examples/training_data.gtif -# Training data masked values -roi_mask_values = 0,255 -# Date range 
-training_start = 1999-01-01 -training_end = 2001-01-01 -training_date_format = %Y-%m-%d -# Cache X feature input and y labels for training data image into file? -cache_training = diff --git a/examples/p022r049_example.yaml b/examples/p022r049_example.yaml new file mode 100644 index 00000000..7d48b1d1 --- /dev/null +++ b/examples/p022r049_example.yaml @@ -0,0 +1,102 @@ +# Example configuration file for YATSM +# As of v0.5.0, config files are to be written in YAML +# +# Quotes around strings are optional, but encouraged, except where the leading +# character would produce a parsing error (e.g., when writing the +# date_format, "%Y%j") + +version: "0.5.0" + +dataset: + # Text file containing dates and images + input_file: "/home/ceholden/Documents/yatsm/examples/p022r049_input.csv" + # Input date format + date_format: "%Y%j" + # Output location + output: "/home/ceholden/Documents/landsat_stack/p022r049/images/YATSM" + # Output file prefix (e.g., [prefix]_[line].npz) + output_prefix: "yatsm_r" + # Total number of bands + n_bands: 8 + # Mask band (e.g., Fmask) + mask_band: 8 + # List of integer values to mask within the mask band + mask_values: [2, 3, 4, 255] + # Valid range of non-mask band data + # specify 1 range for all bands, or specify ranges for each band + valid_range: [0, 10000] + # Indices for multi-temporal cloud masking (indexed on 1) + green_band: 2 + swir1_band: 5 + # Use BIP image reader? 
If not, use GDAL to read in + use_bip_reader: true + # Directory location for caching dataset lines + cache_line_dir: "/home/ceholden/Documents/landsat_stack/p022r049/images/.yatsm_cache" + +# Parameters common to all timeseries analysis models within YATSM package +YATSM: + algorithm: "CCDCesque" + prediction: "LassoCV" + design_matrix: "1 + x + harm(x, 1)" + reverse: False + robust: False + commission_alpha: + +# Parameters for CCDCesque algorithm -- referenced by "algorithm" key in YATSM +CCDCesque: + consecutive: 5 + threshold: 3.0 + min_obs: 16 + min_rmse: 150 + test_indices: 2, 4, 5 + retrain_time: 365.25 + screening: RLM + screening_crit: 400.0 + slope_test: False + remove_noise: True + dynamic_rmse: False + +# Regression estimator +LassoCV: + pickle: "/home/ceholden/Documents/yatsm/examples/regression/LassoCV_n100_alpha_0-50.pkl" + +# Section for phenology fitting +phenology: + calc_pheno: False + # Specification for dataset indices required for EVI based phenology monitoring + red_index: 2 + nir_index: 3 + blue_index: 0 + # Scale factor for reflectance bands + scale: 0.0001 + # You can also specify index of EVI if contained in dataset to override calculation + evi_index: + evi_scale: + # Number of years to group together when normalizing EVI to upper and lower percentiles + year_interval: 3 + # Upper and lower percentiles of EVI used for max/min scaling + q_min: 10 + q_max: 90 + +# Section for segmentation +segment: + # Segmentation image + segmentation: + # Resegmentation threshold (0 turns off resegmentation) + resegment_crit: 0 + # Resegmentation size thresholds + resegment_minpix: 5 + resegment_maxpix: 50 + +# Section for training and classification +classification: + # Training data file + training_image: "/home/ceholden/Documents/yatsm/examples/training_data.gtif" + # Training data masked values + roi_mask_values: [0, 255] + # Date range + training_start: "1999-01-01" + training_end: "2001-01-01" + training_date_format: "%Y-%m-%d" + # Cache X feature 
input and y labels for training data image into file? + cache_training: diff --git a/examples/p035r032_example.ini b/examples/p035r032_example.ini deleted file mode 100644 index 799fd29a..00000000 --- a/examples/p035r032_example.ini +++ /dev/null @@ -1,103 +0,0 @@ -# Example configuration file for YATSM line runner -# -# This configuration includes details about the dataset and how YATSM should -# run - -# Version of config -[metadata] -version = 0.4 - -# Section for Dataset -[dataset] -# Text file containing dates and images -input_file = /home/ceholden/Documents/yatsm/examples/p035r032_input.csv -# Input date format -date_format = %Y%j -# Output location -output = /home/ceholden/Documents/landsat_stack/p035r032/images/YATSM -# Output file prefix (e.g., [prefix]_[line].npz) -output_prefix = yatsm_r -# Total number of bands -n_bands = 8 -# Mask band (e.g., Fmask) -mask_band = 8 -# List of integer values to mask within the mask band -mask_values = 2, 3, 4, 255 -# Valid range of non-mask band data -# specify 1 range for all bands, or specify ranges for each band -valid_range = 0, 10000 -# Indices for multi-temporal cloud masking (indexed on 1) -green_band = 2 -swir1_band = 5 -# Use BIP image reader? 
If not, use GDAL to read in -use_bip_reader = false -# Directory location for caching dataset lines -cache_line_dir = /home/ceholden/Documents/landsat_stack/p035r032/images/.yatsm_cache - -# Section for YATSM parameters -[YATSM] -consecutive = 5 -threshold = 3.5 -min_obs = 24 -min_rmse = 150 -# Patsy style model specification for timeseries model -design_matrix = 1 + x + harm(x, 1) + harm(x, 2) -test_indices = 2, 3, 4, 5 -# Number of days between model fit updates during monitoring period -retrain_time = 365.25 -# Multitemporal cloud/shadow screening algorithm and threshold -screening = RLM -screening_crit = 400.0 -# Training period slope test -slope_test = False -# Remove observation if no change has been detected, but first observation is -# above threshold -remove_noise = True -dynamic_rmse = False -lassocv = False -reverse = True -robust = False -# Commission test alpha value for test; leave blank to ignore test -commission_alpha = - -# Section for phenology fitting -[phenology] -# Boolean for calculating phenology, or not -calc_pheno = True -# Specification for dataset indices required for EVI based phenology monitoring -red_index = 2 -nir_index = 3 -blue_index = 0 -# Scale factor for reflectance bands -scale = 0.0001 -# You can also specify index of EVI if contained in dataset to override calculation -evi_index = -evi_scale = -# Number of years to group together when normalizing EVI to upper and lower percentiles -year_interval = 3 -# Upper and lower percentiles of EVI used for max/min scaling -q_min = 10 -q_max = 90 - -# Section for segmentation -[segment] -# Segmentation image -segmentation = /home/ceholden/Documents/yatsm/sandbox/segment/p035r032/bgw_seg.armap.15 -# Resegmentation threshold (0 turns off resegmentation) -resegment_crit = 0 -# Resegmentation size thresholds -resegment_minpix = 5 -resegment_maxpix = 50 - -# Section for training and classification -[classification] -# Training data file -training_image = -# Training data masked values 
-roi_mask_values = 0,255 -# Date range -training_start = 1999-01-01 -training_end = 2001-01-01 -training_date_format = %Y-%m-%d -# Cache X feature input and y labels for training data image into file? -cache_training = diff --git a/examples/p035r032_example.yaml b/examples/p035r032_example.yaml new file mode 100644 index 00000000..64607ece --- /dev/null +++ b/examples/p035r032_example.yaml @@ -0,0 +1,107 @@ +# Example configuration file for YATSM line runner +# +# This configuration includes details about the dataset and how YATSM should +# run + +# Version of config +version: "0.5.0" + +dataset: + # Text file containing dates and images + input_file: "/home/ceholden/Documents/yatsm/examples/p035r032_input.csv" + # Input date format + date_format: "%Y%j" + # Output location + output: "/home/ceholden/Documents/landsat_stack/p035r032/images/YATSM" + # Output file prefix (e.g., [prefix]_[line].npz) + output_prefix: "yatsm_r" + # Total number of bands + n_bands: 8 + # Mask band (e.g., Fmask) + mask_band: 8 + # List of integer values to mask within the mask band + mask_values: [2, 3, 4, 255] + # Valid range of non-mask band data + # specify 1 range for all bands, or specify ranges for each band + valid_range: [0, 10000] + # Indices for multi-temporal cloud masking (indexed on 1) + green_band: 2 + swir1_band: 5 + # Use BIP image reader? 
If not, use GDAL to read in + use_bip_reader: False + # Directory location for caching dataset lines + cache_line_dir: "/home/ceholden/Documents/landsat_stack/p035r032/images/.yatsm_cache" + +# Parameters common to all timeseries analysis models within YATSM package +YATSM: + algorithm: "CCDCesque" + prediction: "LassoCV" + design_matrix: "1 + x + harm(x, 1) + harm(x, 2)" + reverse: False + robust: False + commission_alpha: + +# Parameters for CCDCesque algorithm -- referenced by "algorithm" key in YATSM +CCDCesque: + consecutive: 5 + threshold: 3.5 + min_obs: 24 + min_rmse: 150 + test_indices: 2, 3, 4, 5 + retrain_time: 365.25 + screening: RLM + screening_crit: 400.0 + slope_test: False + remove_noise: True + dynamic_rmse: False + +# Regression estimator +LassoCV: + pickle: "/home/ceholden/Documents/yatsm/examples/regression/LassoCV_n100_alpha_0-50.pkl" + +Lasso20: + pickle: "/home/ceholden/Documents/yatsm/examples/regression/Lasso_alpha20.pkl" + +OLS: + pickle: "/home/ceholden/Documents/yatsm/examples/regression/OLS.pkl" + +# Section for phenology fitting +phenology: + calc_pheno: False + # Specification for dataset indices required for EVI based phenology monitoring + red_index: 2 + nir_index: 3 + blue_index: 0 + # Scale factor for reflectance bands + scale: 0.0001 + # You can also specify index of EVI if contained in dataset to override calculation + evi_index: + evi_scale: + # Number of years to group together when normalizing EVI to upper and lower percentiles + year_interval: 3 + # Upper and lower percentiles of EVI used for max/min scaling + q_min: 10 + q_max: 90 + +# Section for segmentation +segment: + # Segmentation image + segmentation: + # Resegmentation threshold (0 turns off resegmentation) + resegment_crit: 0 + # Resegmentation size thresholds + resegment_minpix: 5 + resegment_maxpix: 50 + +# Section for training and classification +classification: + # Training data file + training_image: "/home/ceholden/Documents/yatsm/examples/training_data.gtif" 
+ # Training data masked values + roi_mask_values: [0, 255] + # Date range + training_start: "1999-01-01" + training_end: "2001-01-01" + training_date_format: "%Y-%m-%d" + # Cache X feature input and y labels for training data image into file? + cache_training: diff --git a/examples/regression/LassoCV_n100_alpha_0-50.pkl b/examples/regression/LassoCV_n100_alpha_0-50.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c25120d61892167c84b7ee0b45b7a0a23fe51d67 GIT binary patch literal 1263 zcmVQ2}-<>HJGc)m#qNz)q_@U$!^@Iu)}z%K;?JrG7}*;hC_fu@!ozPt_h_(!MEtmcdtvTOzNxy3fx?}Y^b8|94w(e2Q1GHwP47%R3_508i zSqw|;b{1sIjxRZ@yEk1dJCP&E|I<}3J04?=OXRl88jfeLy1uPP_Ii<5<1{@gd*VM@ z;#1)z&pcM-PUwq}_p{ku|la(E1MYvA|6((O}? zCU~iWr5-LRf@2TvMY7CCVES-Y&<|nX!e1}m6A(CX1D@pYJ+pbkaMjdQwf?B5qPEqu zrd+c?QD3Me{hQuXh$a^O+&4d6F;ULeZ#WGUpAMF;D%4geMtZM=q9;x&HeoxZd**d2 zDi`uf&WztzjCl8}`WL$un`ez}kvM&oei!}_KR6bxoJ{AC2ip*3k6V)!KWsyKd-S3zuUB?y z`=Prk7hRFMRrW$fRjvH!pVwk!$nTTA)|JbUO>+w`ox7k!yrV_W!~}+9o?3k+q<0tc zV%bEUNOl;3-6@+2uC*ah+VuMU`PY!zgq_J*u0f=((x*8sj-x&>ZY$_nFkf9Ar9X8s zWsSPSpNXn#(5tV+_lNVlt?HZMV?xcXQgushelg+mxq7sDTRmKKUVZJN#aBSwRxf|% z`Pt=?arJJcYiOV$sYls0|demPA4XA7lDkyhoioHi#@~t(R zl7~4J&h*ooK2djDM#NRkA@4!$=-^`w(-n=iw6l_9#J?^-+Yy{>KQ%UIH6ELM#8l{- z9kMVf4l6bMB{HjFe;R)d|H!4L@3pY6g+b#-mQrrt@s19@IVmT2o~4GbPw#7b?*fi_xA1E-KY@+?Q(z9O0e-K zctmlUVTNI5-Ud28SX9AP*0`Q!OAXpe6NSb&l~6VqHM%5bDn=5VoxbzdD-TYfbQD@; z%?ongfp}7(l`*N1c)No#qD7qyIUd4Due3FqM=)8KB#lcLH2TClYbflw45|-uQBS|K z4WMY%vow8*%bU*}#zDD;Hth*n%t82onp(f?=0.5.0`) can be run using a variety of prediction methods as long as they are serializable as a class object with a `fit` and `predict` interface similar to estimators from the `scikit-learn` package. + +## Examples +Current examples include: + +1. `Lasso_alpha20.pkl` + - Lasso regression method where `alpha` is fixed to a value of `20`. This specific parameterization of Lasso regression is used by Zhu Zhe in the CCDC algorithm. +2. 
`LassoCV_n100_alpha_0-50.pkl` + - Lasso regression method where the `alpha` (usually called `lambda`, the tradeoff between least squares and L1 shrinkage) hyperparameter is cross-validated over `n=100` values ranging between `0` and `50`. +3. `OLS.pkl` + - Ordinary Least Squares (OLS) regression method. + +## Creation + +Custom regression estimators may be created as "pickles" as follows: + +``` python +In [1]: import sklearn.linear_model, sklearn.externals + +In [2]: lasso = sklearn.linear_model.Lasso(alpha=20.0) + +In [3]: sklearn.externals.joblib.dump(lasso, 'Lasso_alpha20.pkl') +Out[3]: ['Lasso_alpha20.pkl'] +``` diff --git a/yatsm/classifiers/RandomForest.ini b/yatsm/classifiers/RandomForest.ini deleted file mode 100644 index 8c29786d..00000000 --- a/yatsm/classifiers/RandomForest.ini +++ /dev/null @@ -1,34 +0,0 @@ -# Default configuration file for Random Forest algorithm -# -# See sklearn implementation API here: -# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html - -[metadata] -# Classification algorithm -algorithm = RandomForest - -[init] -# Number of trees in forest -n_estimators = 500 -# Criterion for qualify of split ['gini', 'entropy'] -criterion = gini -# number of features tried at each node -max_features = auto -# maximum depth of tree -max_depth = None -# minimum number of samples required to split an internal node -min_samples_split = 2 -# minimum number of samples in newly created leaves -min_samples_leaf = 1 -# maximum leaf nodes -- if not None max_depth is ignored -max_leaf_nodes = None -# Use bootstrap sample -bootstrap = True -# use out-of-bag sample for generalization error -oob_score = True -# number of jobs in parallel for fit and predict -n_jobs = 1 - -[fit] -# Sample weights for training data -sample_weight = None