Commit 9505f9a: fixes #164

AlexTheLion123 committed Jun 18, 2024 · 1 parent 90ec87c
Showing 1 changed file with 69 additions and 14 deletions.
tradingstrategy/binance/downloader.py
@@ -1,7 +1,7 @@
 """Get candlestick price and volume data from Binance.
 """
 
+import re
 import requests
 import datetime
 import pandas as pd
@@ -129,9 +129,7 @@ def fetch_candlestick_data(
             )
             dataframes.append(df)
 
-            # Count the cached file size
-            path = self.get_parquet_path(symbol, time_bucket, start_at, end_at)
-            total_size += os.path.getsize(path)
+            total_size += self._get_parquet_size(df, symbol, time_bucket, start_at, end_at)
 
             if progress_bar:
                 progress_bar.set_postfix(
@@ -361,11 +359,7 @@ def fetch_lending_rates(
             )
             dataframes.append(df)
 
-            # Count the cached file size
-            path = self.get_parquet_path(
-                asset_symbol, time_bucket, start_at, end_at, is_lending=True
-            )
-            total_size += os.path.getsize(path)
+            total_size += self._get_parquet_size(df, asset_symbol, time_bucket, start_at, end_at, is_lending=True)
 
             if progress_bar:
                 progress_bar.set_postfix(
@@ -558,6 +552,13 @@ def get_data_parquet(
         :param end_at: End date of the data
         :return: Path to the parquet file
         """
+        non_date_string = self._get_non_date_string(symbol, time_bucket, is_lending)
+        file_name_with_bigger_date_range = self._get_file_name_with_bigger_date_range(non_date_string, start_at, end_at)
+        if file_name_with_bigger_date_range:
+            bigger_parquet_path = self.cache_directory.joinpath(file_name_with_bigger_date_range)
+            bigger_df = pd.read_parquet(bigger_parquet_path)
+            return bigger_df[start_at:end_at]
+
         path = self.get_parquet_path(symbol, time_bucket, start_at, end_at, is_lending)
         try:
             return pd.read_parquet(path)
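
Note: the early return added above serves a request from any cached file whose date range is a superset of the one requested. A minimal sketch of why the bigger_df[start_at:end_at] slice works, assuming the cached frame carries a DatetimeIndex (the sample data below is hypothetical, not from the repository):

    import datetime

    import pandas as pd

    # Hypothetical cached frame covering a wider range than the request.
    index = pd.date_range("2021-01-01", "2021-12-31", freq="D")
    cached = pd.DataFrame({"close": range(len(index))}, index=index)

    start_at = datetime.datetime(2021, 6, 1)
    end_at = datetime.datetime(2021, 6, 30)

    # Slicing a DatetimeIndex by datetime labels is inclusive of both
    # endpoints, so the cached superset yields exactly the requested window.
    subset = cached[start_at:end_at]
    assert subset.index.min() >= pd.Timestamp(start_at)
    assert subset.index.max() <= pd.Timestamp(end_at)
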
@@ -580,15 +581,69 @@ def get_parquet_path(
         :param end_at: End date of the data
         :return: Path to the parquet file
         """
+        non_date_string = self._get_non_date_string(symbol, time_bucket, is_lending)
+        file = Path(
+            f"{non_date_string}-{start_at}-{end_at}.parquet"
+        )
+        return self.cache_directory.joinpath(file)
+
+    def _get_file_name_with_bigger_date_range(self, non_date_string: str, start_at: datetime.datetime, end_at: datetime.datetime):
+        """Get the name of a cached parquet file whose date range covers the requested range."""
+        for file in self.cache_directory.glob(f"{non_date_string}*.parquet"):
+            match = re.match(rf"{non_date_string}-(\d{{4}}-\d{{2}}-\d{{2}} \d{{2}}:\d{{2}}:\d{{2}})-(\d{{4}}-\d{{2}}-\d{{2}} \d{{2}}:\d{{2}}:\d{{2}})\.parquet", file.name)
+            if match:
+                file_start_at = datetime.datetime.fromisoformat(match.group(1))
+                file_end_at = datetime.datetime.fromisoformat(match.group(2))
+                if file_start_at <= start_at and file_end_at >= end_at:
+                    return file.name
+        return None
+
+    @staticmethod
+    def _get_non_date_string(
+        symbol: str,
+        time_bucket: TimeBucket,
+        is_lending: bool,
+    ) -> str:
+        """Get the part of the parquet file name that is not a date.
+
+        :param symbol: Trading pair symbol, e.g. ETHUSDC
+        :param time_bucket: TimeBucket instance
+        :return: string
+        """
         if is_lending:
             file_str = "lending"
         else:
             file_str = "candles"
 
-        file = Path(
-            file_str + f"-{symbol}-{time_bucket.value}-{start_at}-{end_at}.parquet"
-        )
-        return self.cache_directory.joinpath(file)
+        return f"{file_str}-{symbol}-{time_bucket.value}"
+
+    def _get_parquet_size(
+        self,
+        df: pd.DataFrame,
+        symbol: str,
+        time_bucket: TimeBucket,
+        start_at: datetime.datetime,
+        end_at: datetime.datetime,
+        is_lending: bool = False,
+    ) -> int:
+        """Get the size of the cached parquet file, or estimate it from the dataframe in memory if the file does not exist.
+
+        :param df: Dataframe
+        :param symbol: Trading pair symbol, e.g. ETHUSDC
+        :param time_bucket: TimeBucket instance
+        :param start_at: Start date of the data
+        :param end_at: End date of the data
+        :param is_lending: Whether the data is lending data or not
+        :return: Size in bytes
+        """
+        try:
+            # Count the cached file size
+            path = self.get_parquet_path(symbol, time_bucket, start_at, end_at, is_lending)
+            size = os.path.getsize(path)
+        except OSError:
+            # The file does not exist yet: estimate from the dataframe in memory.
+            # Can be inaccurate. Divide by 2 based on rough testing of memory
+            # usage vs. parquet file size.
+            size = int(df.memory_usage(deep=True).sum() / 2)
+        return size
 
     def overwrite_cached_data(
         self,
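
Note: a self-contained sketch of the two new helpers, using made-up file names and values for illustration (assumptions, not repository fixtures). It mirrors the file-name regex of _get_file_name_with_bigger_date_range and the size fallback of _get_parquet_size:

    import datetime
    import re

    import pandas as pd

    # Cache files are assumed to be named "{prefix}-{start}-{end}.parquet"
    # with second-resolution timestamps.
    name = "candles-ETHUSDC-1d-2021-01-01 00:00:00-2021-12-31 00:00:00.parquet"
    prefix = "candles-ETHUSDC-1d"

    pattern = (
        rf"{prefix}-(\d{{4}}-\d{{2}}-\d{{2}} \d{{2}}:\d{{2}}:\d{{2}})"
        rf"-(\d{{4}}-\d{{2}}-\d{{2}} \d{{2}}:\d{{2}}:\d{{2}})\.parquet"
    )
    match = re.match(pattern, name)
    assert match is not None
    file_start_at = datetime.datetime.fromisoformat(match.group(1))
    file_end_at = datetime.datetime.fromisoformat(match.group(2))

    # The cached file can serve any request that falls inside its own range.
    assert file_start_at <= datetime.datetime(2021, 6, 1)
    assert file_end_at >= datetime.datetime(2021, 6, 30)

    # Size fallback when no parquet file exists yet: a rough in-memory
    # estimate, halved per the commit's memory-to-parquet observation.
    df = pd.DataFrame({"close": [1.0, 2.0, 3.0]})
    estimated_size = int(df.memory_usage(deep=True).sum() / 2)
    print(estimated_size)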
