From 1b81774f7899aae3d8e2daf178b4f4ddf126b816 Mon Sep 17 00:00:00 2001 From: Hirotaka Aoki <113173839+aoki-h-jp@users.noreply.github.com> Date: Tue, 5 Sep 2023 00:38:02 +0900 Subject: [PATCH 1/2] Add feature engineering --- crypto_features/feature/engineering.py | 237 ++++++++++++++++++ .../feature/information_correlation.py | 36 ++- crypto_features/feature/preprocessing.py | 2 + 3 files changed, 264 insertions(+), 11 deletions(-) create mode 100644 crypto_features/feature/engineering.py diff --git a/crypto_features/feature/engineering.py b/crypto_features/feature/engineering.py new file mode 100644 index 0000000..3a81a0f --- /dev/null +++ b/crypto_features/feature/engineering.py @@ -0,0 +1,237 @@ +""" +Feature engineering module +""" +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from crypto_features.feature.information_correlation import InformationCorrelation + + +class FeatureEngineering: + def __init__(self, feature: pd.Series, klines=None): + """ + + :param feature: feature data + :param klines: klines data + """ + self._feature = feature + self._klines = klines + + def set_feature(self, feature: pd.Series): + """ + Set feature data + """ + self._feature = feature + + def set_klines(self, klines: pd.DataFrame): + """ + Set return data + """ + self._klines = klines + + def _make_return(self, minutes: int) -> pd.Series: + """ + Make return data + + :param minutes: minutes to calculate return + """ + return self._klines['close'].pct_change(minutes) + + def visualize_histogram(self, return_minutes: int): + """ + Visualize histogram of funding rate + + :param return_minutes: minutes to calculate return + """ + # plot settings + fig = plt.figure(figsize=(8, 8)) + grid = plt.GridSpec(5, 4, hspace=0.5, wspace=0.5) + + x, y = InformationCorrelation.format_array(self._klines, self._feature, return_minutes) + main_ax = fig.add_subplot(grid[1:, 1:]) + main_ax.scatter(x, y, alpha=0.5) + main_ax.set_xlabel("feature") + main_ax.tick_params(left=True, bottom=True, labelleft=True, labelbottom=True) + + x_hist = fig.add_subplot(grid[0, 1:], sharex=main_ax) + x_hist.hist(x, bins=50, align='mid', rwidth=0.8) + x_hist.set_title('feature vs return') + x_hist.tick_params(bottom=True, labelbottom=True) + + y_hist = fig.add_subplot(grid[1:, 0], sharey=main_ax) + y_hist.hist(y, bins=50, orientation='horizontal', align='mid', rwidth=0.8) + y_hist.invert_xaxis() + y_hist.tick_params(left=True, labelleft=True) + y_hist.set_ylabel(f"return (after {return_minutes} minutes)") + + plt.tight_layout() + plt.savefig(f'feature_vs_return_{return_minutes}.png') + plt.close() + + def diff_feature(self) -> pd.Series: + """ + Calculate difference of funding rate + """ + return self._feature.diff() + + def square_feature(self) -> pd.Series: + """ + Calculate square of funding rate + """ + return self._feature ** 2 + + def cube_feature(self) -> pd.Series: + """ + Calculate cube of funding rate + """ + return self._feature ** 3 + + def exp_feature(self) -> pd.Series: + """ + Calculate exp of funding rate + """ + return np.exp(self._feature) + + def sin_feature(self) -> pd.Series: + """ + Calculate sin of funding rate + """ + return np.sin(self._feature) + + def cos_feature(self) -> pd.Series: + """ + Calculate cos of funding rate + """ + return np.cos(self._feature) + + def tan_feature(self) -> pd.Series: + """ + Calculate tan of funding rate + """ + return np.tan(self._feature) + + def tanh_feature(self) -> pd.Series: + """ + Calculate tanh of funding rate + """ + return np.tanh(self._feature) + + def sigmoid_feature(self) -> pd.Series: + """ + Calculate sigmoid of funding rate + """ + return 1 / (1 + np.exp(-self._feature)) + + def softmax_feature(self) -> pd.Series: + """ + Calculate softmax of funding rate + """ + return np.exp(self._feature) / np.sum(np.exp(self._feature)) + + def log_feature(self) -> pd.Series: + """ + Calculate log of funding rate + """ + return np.log(self._feature) + + def log10_feature(self) -> pd.Series: + """ + Calculate log10 of funding rate + """ + return np.log10(self._feature) + + def log2_feature(self) -> pd.Series: + """ + Calculate log2 of funding rate + """ + return np.log2(self._feature) + + def square_root_feature(self) -> pd.Series: + """ + Calculate square root of funding rate + """ + return np.sqrt(self._feature) + + def arctan_feature(self) -> pd.Series: + """ + Calculate arctan of funding rate + """ + return np.arctan(self._feature) + + def arcsin_feature(self) -> pd.Series: + """ + Calculate arcsin of funding rate + """ + return np.arcsin(self._feature) + + def arccos_feature(self) -> pd.Series: + """ + Calculate arccos of funding rate + """ + return np.arccos(self._feature) + + def arctanh_feature(self) -> pd.Series: + """ + Calculate arctanh of funding rate + """ + return np.arctanh(self._feature) + + def arcsinh_feature(self) -> pd.Series: + """ + Calculate arcsinh of funding rate + """ + return np.arcsinh(self._feature) + + def arccosh_feature(self) -> pd.Series: + """ + Calculate arccosh of funding rate + """ + return np.arccosh(self._feature) + + def absolute_feature(self) -> pd.Series: + """ + Calculate absolute of funding rate + """ + return np.absolute(self._feature) + + def reciprocal_feature(self) -> pd.Series: + """ + Calculate reciprocal of funding rate + """ + return np.reciprocal(self._feature) + + def negative_feature(self) -> pd.Series: + """ + Calculate negative of funding rate + """ + return np.negative(self._feature) + + def sign_feature(self) -> pd.Series: + """ + Calculate sign of funding rate + """ + return np.sign(self._feature) + + def ceil_feature(self) -> pd.Series: + """ + Calculate ceil of funding rate + """ + return np.ceil(self._feature) + + def floor_feature(self) -> pd.Series: + """ + Calculate floor of funding rate + """ + return np.floor(self._feature) + + def rint_feature(self) -> pd.Series: + """ + Calculate rint of funding rate + """ + return np.rint(self._feature) + + def trunc_feature(self) -> pd.Series: + """ + Calculate trunc of funding rate + """ + return np.trunc(self._feature) diff --git a/crypto_features/feature/information_correlation.py b/crypto_features/feature/information_correlation.py index 20bf437..b7fcb49 100644 --- a/crypto_features/feature/information_correlation.py +++ b/crypto_features/feature/information_correlation.py @@ -20,19 +20,14 @@ def __init__(self): pass @staticmethod - def run_calculate( - klines: pd.DataFrame, feature: pd.Series, return_minutes=1, **kwargs - ): + def format_array(klines: pd.DataFrame, feature: pd.Series, return_minutes=1): """ - Calculate and visualize the information correlation. - + Format the array. :param klines: The klines data. :param feature: The feature data. :param return_minutes: The return minutes. + :return: formatted feature and return array. """ - if not os.path.exists("information_correlation"): - os.mkdir("information_correlation") - close_chg_pct_header = f"close_chg_pct_after_{return_minutes}min" klines["close"] = klines["close"].astype(float) klines[close_chg_pct_header] = klines["close"].pct_change( @@ -49,14 +44,33 @@ def run_calculate( klines[close_chg_pct_header] = klines[close_chg_pct_header].round(4) feature_arr = feature[feature.index.isin(klines.index)].values - klines_arr = klines[klines.index.isin(feature.index)][ + return_arr = klines[klines.index.isin(feature.index)][ close_chg_pct_header ].values assert len(feature_arr) == len( - klines_arr - ), f"len(feature_arr)={len(feature_arr)}, len(klines_arr)={len(klines_arr)}" + return_arr + ), f"len(feature_arr)={len(feature_arr)}, len(return_arr)={len(return_arr)}" + + return feature_arr, return_arr + + @staticmethod + def run_calculate( + klines: pd.DataFrame, feature: pd.Series, return_minutes=1, **kwargs + ): + """ + Calculate and visualize the information correlation. + + :param klines: The klines data. + :param feature: The feature data. + :param return_minutes: The return minutes. + """ + if not os.path.exists("information_correlation"): + os.mkdir("information_correlation") + feature_arr, klines_arr = InformationCorrelation.format_array( + klines, feature, return_minutes + ) print("[green] Start calculating the information correlation... [/green]") # Pearson's correlation coefficient diff --git a/crypto_features/feature/preprocessing.py b/crypto_features/feature/preprocessing.py index 86b5726..1825aaa 100644 --- a/crypto_features/feature/preprocessing.py +++ b/crypto_features/feature/preprocessing.py @@ -111,6 +111,7 @@ def _load_klines_data(self, symbol) -> pd.DataFrame: df.set_index("timestamp_open", inplace=True) df.index = pd.to_datetime(df.index, utc=True, unit="ms") + df["close"] = df["close"].astype(float) return df @@ -186,6 +187,7 @@ def _load_klines_data(self, symbol) -> pd.DataFrame: df = df.drop(["index", "index2"], axis=1) df["timestamp_open"] = pd.to_datetime(df["timestamp_open"], utc=True) df.set_index("timestamp_open", inplace=True) + df["close"] = df["close"].astype(float) return df From 52a59b0432fe7764e652b5a35066d85fb9fc8fc6 Mon Sep 17 00:00:00 2001 From: aoki-h-jp Date: Mon, 4 Sep 2023 15:38:32 +0000 Subject: [PATCH 2/2] Apply Code Formatter Change --- crypto_features/feature/engineering.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/crypto_features/feature/engineering.py b/crypto_features/feature/engineering.py index 3a81a0f..8bcffb2 100644 --- a/crypto_features/feature/engineering.py +++ b/crypto_features/feature/engineering.py @@ -1,10 +1,12 @@ """ Feature engineering module """ +import matplotlib.pyplot as plt import numpy as np import pandas as pd -import matplotlib.pyplot as plt -from crypto_features.feature.information_correlation import InformationCorrelation + +from crypto_features.feature.information_correlation import \ + InformationCorrelation class FeatureEngineering: @@ -35,7 +37,7 @@ def _make_return(self, minutes: int) -> pd.Series: :param minutes: minutes to calculate return """ - return self._klines['close'].pct_change(minutes) + return self._klines["close"].pct_change(minutes) def visualize_histogram(self, return_minutes: int): """ @@ -47,25 +49,27 @@ def visualize_histogram(self, return_minutes: int): fig = plt.figure(figsize=(8, 8)) grid = plt.GridSpec(5, 4, hspace=0.5, wspace=0.5) - x, y = InformationCorrelation.format_array(self._klines, self._feature, return_minutes) + x, y = InformationCorrelation.format_array( + self._klines, self._feature, return_minutes + ) main_ax = fig.add_subplot(grid[1:, 1:]) main_ax.scatter(x, y, alpha=0.5) main_ax.set_xlabel("feature") main_ax.tick_params(left=True, bottom=True, labelleft=True, labelbottom=True) x_hist = fig.add_subplot(grid[0, 1:], sharex=main_ax) - x_hist.hist(x, bins=50, align='mid', rwidth=0.8) - x_hist.set_title('feature vs return') + x_hist.hist(x, bins=50, align="mid", rwidth=0.8) + x_hist.set_title("feature vs return") x_hist.tick_params(bottom=True, labelbottom=True) y_hist = fig.add_subplot(grid[1:, 0], sharey=main_ax) - y_hist.hist(y, bins=50, orientation='horizontal', align='mid', rwidth=0.8) + y_hist.hist(y, bins=50, orientation="horizontal", align="mid", rwidth=0.8) y_hist.invert_xaxis() y_hist.tick_params(left=True, labelleft=True) y_hist.set_ylabel(f"return (after {return_minutes} minutes)") plt.tight_layout() - plt.savefig(f'feature_vs_return_{return_minutes}.png') + plt.savefig(f"feature_vs_return_{return_minutes}.png") plt.close() def diff_feature(self) -> pd.Series: @@ -78,13 +82,13 @@ def square_feature(self) -> pd.Series: """ Calculate square of funding rate """ - return self._feature ** 2 + return self._feature**2 def cube_feature(self) -> pd.Series: """ Calculate cube of funding rate """ - return self._feature ** 3 + return self._feature**3 def exp_feature(self) -> pd.Series: """