From d51282c74666124cba93eb4be617daba339580c7 Mon Sep 17 00:00:00 2001 From: Collin Simon Date: Wed, 20 Dec 2023 18:14:24 +0000 Subject: [PATCH 01/65] Adding None support to HashableDict --- tap_mambu/helpers/hashable_dict.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tap_mambu/helpers/hashable_dict.py b/tap_mambu/helpers/hashable_dict.py index 724ad86..9100152 100644 --- a/tap_mambu/helpers/hashable_dict.py +++ b/tap_mambu/helpers/hashable_dict.py @@ -1,4 +1,5 @@ import json +import math class HashableDict(dict): @@ -8,7 +9,7 @@ def _recur_hash(value): if type(value) in [dict, HashableDict]: return HashableDict(value).__key() if type(value) == list: - return tuple(sorted(map(HashableDict._recur_hash, value))) + return tuple(sorted(map(HashableDict._recur_hash, value), key=lambda x: x if x is not None else -math.inf)) return value def __key(self): From 78010075b9e8dae6ec826d503917bb394f405c03 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Tue, 20 Feb 2024 09:59:32 +0000 Subject: [PATCH 02/65] Initial date windowing logic for activities stream. --- tap_mambu/tap_generators/generator.py | 2 ++ .../multithreaded_bookmark_generator.py | 26 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index dbd5fa9..52df51a 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -16,6 +16,8 @@ def __init__(self, stream_name, client, config, state, sub_type): self.config = config self.state = state self.sub_type = sub_type + self.date_windowing = False + self.date_window_size = 3 # Define parameters inside init self.params = dict() diff --git a/tap_mambu/tap_generators/multithreaded_bookmark_generator.py b/tap_mambu/tap_generators/multithreaded_bookmark_generator.py index c9bbe02..d6463df 100644 --- a/tap_mambu/tap_generators/multithreaded_bookmark_generator.py +++ b/tap_mambu/tap_generators/multithreaded_bookmark_generator.py @@ -1,8 +1,10 @@ import datetime import time +import backoff from copy import deepcopy from singer import get_logger +from datetime import datetime, timedelta from .multithreaded_offset_generator import MultithreadedOffsetGenerator from ..helpers import transform_json, convert @@ -51,6 +53,30 @@ def queue_batches(self): deepcopy(self.params))) return futures + @backoff.on_exception(backoff.expo, RuntimeError, max_tries=5) + def _all_fetch_batch_steps(self): + futures = [] + if self.date_windowing: + start = datetime.strptime(self.params["from"], '%Y-%m-%d').date() + end = datetime.strptime(self.params["to"], '%Y-%m-%d').date() + temp = start + timedelta(days=self.date_window_size) + while temp < end: + self.endpoint_intermediary_bookmark_offset = 0 + self.params["from"] = datetime.strftime(start, '%Y-%m-%d') + self.params["to"] = datetime.strftime(temp, '%Y-%m-%d') + futures += self.queue_batches() + start = temp + temp = start + timedelta(days=self.date_window_size) + self.endpoint_intermediary_bookmark_offset = 0 + self.params["from"] = datetime.strftime(start, '%Y-%m-%d') + self.params["to"] = datetime.strftime(end, '%Y-%m-%d') + futures += self.queue_batches() + final_buffer, stop_iteration = self.collect_batches(futures) + self.preprocess_batches(final_buffer) + if not final_buffer or stop_iteration: + return False + return True + def collect_batches(self, futures): # wait for responses, and check them for errors final_buffer = set() From e3851a8d54eb5bb7723ff418e01247c62f272ec8 Mon Sep 17 00:00:00 2001 From: shantanu73 
Date: Mon, 26 Feb 2024 21:08:55 +0000 Subject: [PATCH 03/65] Changes: 1) Fixed Date windowing logic for multithreaded_bookmark_generator. 2) Added date_windowing instance variable as true in the constructor of activities stream. --- .../tap_generators/activities_generator.py | 4 +++ .../multithreaded_bookmark_generator.py | 26 ++++++++++--------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/tap_mambu/tap_generators/activities_generator.py b/tap_mambu/tap_generators/activities_generator.py index 4133bae..25ceefb 100644 --- a/tap_mambu/tap_generators/activities_generator.py +++ b/tap_mambu/tap_generators/activities_generator.py @@ -4,6 +4,10 @@ class ActivitiesGenerator(MultithreadedBookmarkDayByDayGenerator): + def __init__(self, stream_name, client, config, state, sub_type): + super(ActivitiesGenerator, self).__init__(stream_name, client, config, state, sub_type) + self.date_windowing = True + def _init_endpoint_config(self): super(ActivitiesGenerator, self)._init_endpoint_config() self.endpoint_path = "activities" diff --git a/tap_mambu/tap_generators/multithreaded_bookmark_generator.py b/tap_mambu/tap_generators/multithreaded_bookmark_generator.py index d6463df..6a10961 100644 --- a/tap_mambu/tap_generators/multithreaded_bookmark_generator.py +++ b/tap_mambu/tap_generators/multithreaded_bookmark_generator.py @@ -55,23 +55,25 @@ def queue_batches(self): @backoff.on_exception(backoff.expo, RuntimeError, max_tries=5) def _all_fetch_batch_steps(self): - futures = [] if self.date_windowing: start = datetime.strptime(self.params["from"], '%Y-%m-%d').date() end = datetime.strptime(self.params["to"], '%Y-%m-%d').date() temp = start + timedelta(days=self.date_window_size) + stop_iteration = True while temp < end: - self.endpoint_intermediary_bookmark_offset = 0 - self.params["from"] = datetime.strftime(start, '%Y-%m-%d') - self.params["to"] = datetime.strftime(temp, '%Y-%m-%d') - futures += self.queue_batches() - start = temp - temp = start + timedelta(days=self.date_window_size) - self.endpoint_intermediary_bookmark_offset = 0 - self.params["from"] = datetime.strftime(start, '%Y-%m-%d') - self.params["to"] = datetime.strftime(end, '%Y-%m-%d') - futures += self.queue_batches() - final_buffer, stop_iteration = self.collect_batches(futures) + if stop_iteration: + self.offset = 0 + self.static_params["from"] = datetime.strftime(start, '%Y-%m-%d') + self.static_params["to"] = datetime.strftime(temp, '%Y-%m-%d') + final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) + self.preprocess_batches(final_buffer) + if not final_buffer or stop_iteration: + start = temp + temp = start + timedelta(days=self.date_window_size) + self.offset = 0 + self.static_params["from"] = datetime.strftime(start, '%Y-%m-%d') + self.static_params["to"] = datetime.strftime(end, '%Y-%m-%d') + final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) self.preprocess_batches(final_buffer) if not final_buffer or stop_iteration: return False From 16d3fcb86e74dd9b1543161f6f525215b18d9e8b Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Wed, 6 Mar 2024 05:00:56 +0000 Subject: [PATCH 04/65] Changes: 1) Removed endpoint params from activities generator. 2) Overrode modify_reques_params method for Loan Transactions generator. 3) Removed the method _all_fetch_batch_steps from multithreaded bookmark generator. 4) Modified the implementation for _all_fetch_batch_steps method in multithreaded offset generator to include date windowing. 
5) Added new method modify_reques_params in multithreaded offset generator. --- .../tap_generators/activities_generator.py | 4 --- .../loan_transactions_generator.py | 20 +++++++++---- .../multithreaded_bookmark_generator.py | 28 ----------------- .../multithreaded_offset_generator.py | 30 +++++++++++++++++-- 4 files changed, 42 insertions(+), 40 deletions(-) diff --git a/tap_mambu/tap_generators/activities_generator.py b/tap_mambu/tap_generators/activities_generator.py index 25ceefb..1634141 100644 --- a/tap_mambu/tap_generators/activities_generator.py +++ b/tap_mambu/tap_generators/activities_generator.py @@ -13,10 +13,6 @@ def _init_endpoint_config(self): self.endpoint_path = "activities" self.endpoint_api_method = "GET" self.endpoint_api_version = "v1" - - self.endpoint_params["from"] = datetime_to_utc_str(str_to_localized_datetime( - get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date)))[:10] - self.endpoint_params["to"] = datetime_to_utc_str(utc_now())[:10] self.endpoint_bookmark_field = "timestamp" @staticmethod diff --git a/tap_mambu/tap_generators/loan_transactions_generator.py b/tap_mambu/tap_generators/loan_transactions_generator.py index 7c30f5b..02c554e 100644 --- a/tap_mambu/tap_generators/loan_transactions_generator.py +++ b/tap_mambu/tap_generators/loan_transactions_generator.py @@ -1,20 +1,30 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator -from ..helpers import get_bookmark -from ..helpers.datetime_utils import datetime_to_utc_str, str_to_localized_datetime +from ..helpers.datetime_utils import datetime_to_utc_str +from datetime import datetime class LoanTransactionsGenerator(MultithreadedBookmarkGenerator): + def __init__(self, stream_name, client, config, state, sub_type): + super(LoanTransactionsGenerator, self).__init__(stream_name, client, config, state, sub_type) + self.date_windowing = True + def _init_endpoint_config(self): super(LoanTransactionsGenerator, self)._init_endpoint_config() self.endpoint_path = "loans/transactions:search" self.endpoint_bookmark_field = "creationDate" self.endpoint_sorting_criteria["field"] = "id" - self.endpoint_filter_criteria = [ + + def modify_request_params(self, start, end): + self.endpoint_body['filterCriteria'] = [ { "field": "creationDate", "operator": "AFTER", - "value": datetime_to_utc_str(str_to_localized_datetime( - get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date))) + "value": datetime.strftime(start, '%Y-%m-%dT00:00:00.000000Z') + }, + { + "field": "creationDate", + "operator": "BEFORE", + "value": datetime.strftime(end, '%Y-%m-%dT23:59:59.000000Z') } ] diff --git a/tap_mambu/tap_generators/multithreaded_bookmark_generator.py b/tap_mambu/tap_generators/multithreaded_bookmark_generator.py index 6a10961..c9bbe02 100644 --- a/tap_mambu/tap_generators/multithreaded_bookmark_generator.py +++ b/tap_mambu/tap_generators/multithreaded_bookmark_generator.py @@ -1,10 +1,8 @@ import datetime import time -import backoff from copy import deepcopy from singer import get_logger -from datetime import datetime, timedelta from .multithreaded_offset_generator import MultithreadedOffsetGenerator from ..helpers import transform_json, convert @@ -53,32 +51,6 @@ def queue_batches(self): deepcopy(self.params))) return futures - @backoff.on_exception(backoff.expo, RuntimeError, max_tries=5) - def _all_fetch_batch_steps(self): - if self.date_windowing: - start = datetime.strptime(self.params["from"], '%Y-%m-%d').date() - end = datetime.strptime(self.params["to"], 
'%Y-%m-%d').date() - temp = start + timedelta(days=self.date_window_size) - stop_iteration = True - while temp < end: - if stop_iteration: - self.offset = 0 - self.static_params["from"] = datetime.strftime(start, '%Y-%m-%d') - self.static_params["to"] = datetime.strftime(temp, '%Y-%m-%d') - final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) - self.preprocess_batches(final_buffer) - if not final_buffer or stop_iteration: - start = temp - temp = start + timedelta(days=self.date_window_size) - self.offset = 0 - self.static_params["from"] = datetime.strftime(start, '%Y-%m-%d') - self.static_params["to"] = datetime.strftime(end, '%Y-%m-%d') - final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) - self.preprocess_batches(final_buffer) - if not final_buffer or stop_iteration: - return False - return True - def collect_batches(self, futures): # wait for responses, and check them for errors final_buffer = set() diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index 297cccd..4d25336 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -4,9 +4,11 @@ import backoff from singer import get_logger +from datetime import datetime, timedelta from .generator import TapGenerator -from ..helpers import transform_json +from ..helpers import transform_json, get_bookmark +from ..helpers.datetime_utils import str_to_localized_datetime, datetime_to_utc_str, utc_now from ..helpers.multithreaded_requests import MultithreadedRequestsPool from ..helpers.perf_metrics import PerformanceMetrics @@ -120,13 +122,35 @@ def preprocess_batches(self, final_buffer): @backoff.on_exception(backoff.expo, RuntimeError, max_tries=5) def _all_fetch_batch_steps(self): - futures = self.queue_batches() - final_buffer, stop_iteration = self.collect_batches(futures) + if self.date_windowing: + start_datetime = datetime_to_utc_str(str_to_localized_datetime( + get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date)))[:10] + end_datetime = datetime_to_utc_str(utc_now())[:10] + start = datetime.strptime(start_datetime, '%Y-%m-%d').date() + end = datetime.strptime(end_datetime, '%Y-%m-%d').date() + temp = start + timedelta(days=self.date_window_size) + stop_iteration = True + while temp < end: + if stop_iteration: + self.offset = 0 + self.modify_request_params(start, temp) + final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) + self.preprocess_batches(final_buffer) + if not final_buffer or stop_iteration: + start = temp + temp = start + timedelta(days=self.date_window_size) + self.offset = 0 + self.modify_request_params(start, end) + final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) self.preprocess_batches(final_buffer) if not final_buffer or stop_iteration: return False return True + def modify_request_params(self, start, end): + self.static_params["from"] = datetime.strftime(start, '%Y-%m-%d') + self.static_params["to"] = datetime.strftime(end, '%Y-%m-%d') + def error_check_and_fix(self, final_buffer: set, temp_buffer: set, futures: list): try: final_buffer = self.check_and_get_set_reunion(final_buffer, temp_buffer, self.artificial_limit) From 859d923e43d4e1faf69c133d2c1aa7430cb1262e Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Wed, 6 Mar 2024 06:07:56 +0000 Subject: [PATCH 05/65] Changing number of threads from 20 to 1 to fix memory issue. 
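Lowering the worker count reduces how many batches can be in flight at once, which is a coarse way to cap memory; a later patch in this series bounds the shared buffer directly instead. An illustrative sketch of that producer-side backpressure (not code from this patch; the buffer list and cap value are assumptions based on the later buffer-limit change):

    import time

    MAX_BUFFER_SIZE = 100_000  # cap assumed from the later buffer-limit patch

    def wait_for_buffer_headroom(buffer, max_size=MAX_BUFFER_SIZE, poll_seconds=1.0):
        # Block the fetching side while the processor drains the shared
        # buffer, so queued batches cannot grow without bound.
        while len(buffer) > max_size:
            time.sleep(poll_seconds)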
--- tap_mambu/tap_generators/multithreaded_offset_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index 4d25336..71c0a4c 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -31,7 +31,7 @@ def _init_config(self): self.end_of_file = False self.fetch_batch_thread = None self.last_batch_set = set() - self.max_threads = 20 + self.max_threads = 1 @staticmethod def check_and_get_set_reunion(a: set, b: set, lower_limit: int): From db922e526f37044a7ac1cb6e2807252e507060ab Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Wed, 6 Mar 2024 14:25:41 +0000 Subject: [PATCH 06/65] Changes: 1) Set max threads back to 20. 2) Fixed bug in Date windowing implementation. --- .../tap_generators/multithreaded_offset_generator.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index 71c0a4c..05e1f5c 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -31,7 +31,7 @@ def _init_config(self): self.end_of_file = False self.fetch_batch_thread = None self.last_batch_set = set() - self.max_threads = 1 + self.max_threads = 20 @staticmethod def check_and_get_set_reunion(a: set, b: set, lower_limit: int): @@ -130,7 +130,7 @@ def _all_fetch_batch_steps(self): end = datetime.strptime(end_datetime, '%Y-%m-%d').date() temp = start + timedelta(days=self.date_window_size) stop_iteration = True - while temp < end: + while start < end: if stop_iteration: self.offset = 0 self.modify_request_params(start, temp) @@ -139,10 +139,9 @@ def _all_fetch_batch_steps(self): if not final_buffer or stop_iteration: start = temp temp = start + timedelta(days=self.date_window_size) - self.offset = 0 - self.modify_request_params(start, end) - final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) - self.preprocess_batches(final_buffer) + else: + final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) + self.preprocess_batches(final_buffer) if not final_buffer or stop_iteration: return False return True From 01d315db324d05dcd04b9340a732902df9c85e98 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Sat, 16 Mar 2024 17:57:13 +0000 Subject: [PATCH 07/65] Changes: 1) Changed number of threads from 5 to default 20 for gl_journal_entries stream. 2) Added date window implementation for gl_journal_entries stream. 
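The date window itself is carried in the request body of the :search call; each window becomes an AFTER/BEFORE pair on creationDate, as in the diff below. A standalone sketch of that mapping (illustrative only; it mirrors the field name and timestamp format used in this patch):

    from datetime import date

    def window_filter_criteria(start: date, end: date, field: str = "creationDate"):
        # One date window rendered as the filterCriteria pair sent to the
        # Mambu :search endpoints.
        return [
            {"field": field, "operator": "AFTER",
             "value": start.strftime('%Y-%m-%dT00:00:00.000000Z')},
            {"field": field, "operator": "BEFORE",
             "value": end.strftime('%Y-%m-%dT23:59:59.000000Z')},
        ]

    print(window_filter_criteria(date(2024, 3, 1), date(2024, 3, 6)))

(Later patches in the series tighten the BEFORE bound.)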
--- .../gl_journal_entries_generator.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tap_mambu/tap_generators/gl_journal_entries_generator.py b/tap_mambu/tap_generators/gl_journal_entries_generator.py index 6b54305..151492c 100644 --- a/tap_mambu/tap_generators/gl_journal_entries_generator.py +++ b/tap_mambu/tap_generators/gl_journal_entries_generator.py @@ -1,12 +1,12 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator -from ..helpers import get_bookmark -from ..helpers.datetime_utils import datetime_to_utc_str, str_to_localized_datetime, utc_now +from ..helpers.datetime_utils import datetime_to_utc_str +from datetime import datetime class GlJournalEntriesGenerator(MultithreadedBookmarkGenerator): - def _init_config(self): - super()._init_config() - self.max_threads = 5 + def __init__(self, stream_name, client, config, state, sub_type): + super(GlJournalEntriesGenerator, self).__init__(stream_name, client, config, state, sub_type) + self.date_windowing = True def _init_endpoint_config(self): super(GlJournalEntriesGenerator, self)._init_endpoint_config() @@ -16,13 +16,18 @@ def _init_endpoint_config(self): "field": "entryId", "order": "ASC" } - self.endpoint_filter_criteria = [ + + def modify_request_params(self, start, end): + self.endpoint_body['filterCriteria'] = [ + { + "field": "creationDate", + "operator": "AFTER", + "value": datetime.strftime(start, '%Y-%m-%dT00:00:00.000000Z') + }, { "field": "creationDate", - "operator": "BETWEEN", - "value": datetime_to_utc_str(str_to_localized_datetime( - get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date))), - "secondValue": datetime_to_utc_str(utc_now()) + "operator": "BEFORE", + "value": datetime.strftime(end, '%Y-%m-%dT23:59:59.000000Z') } ] From 28fbc3a174c949197f2a250becca990938bd6661 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Sun, 17 Mar 2024 06:02:00 +0000 Subject: [PATCH 08/65] Fixed issue with initializing offset and infinite loop in date windowing streams. 
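The intended control flow after this fix: keep paging inside the current window, and only when that window is exhausted reset the offset and slide both window edges forward. A schematic sketch of that shape (illustrative only; fetch_window and its return convention are assumptions, not the tap's API):

    from datetime import date, timedelta

    def iterate_windows(start: date, end: date, window_days: int, fetch_window):
        # fetch_window(window_start, window_end, offset) -> (records, exhausted)
        # is assumed to fetch one page; exhausted=True means the window has
        # no more pages.
        offset = 0
        window_end = min(start + timedelta(days=window_days), end)
        while start < end:
            records, exhausted = fetch_window(start, window_end, offset)
            yield from records
            if exhausted or not records:
                # window done: advance it and restart paging from offset 0
                offset = 0
                start = window_end
                window_end = min(start + timedelta(days=window_days), end)
            else:
                offset += len(records)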
--- tap_mambu/tap_generators/multithreaded_offset_generator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index 05e1f5c..c88f9dc 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -131,12 +131,11 @@ def _all_fetch_batch_steps(self): temp = start + timedelta(days=self.date_window_size) stop_iteration = True while start < end: - if stop_iteration: - self.offset = 0 self.modify_request_params(start, temp) final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) self.preprocess_batches(final_buffer) if not final_buffer or stop_iteration: + self.offset = 0 start = temp temp = start + timedelta(days=self.date_window_size) else: From e33022b73a702d34b398837bf942d5c56e791e99 Mon Sep 17 00:00:00 2001 From: Collin Simon Date: Tue, 19 Mar 2024 14:58:21 +0000 Subject: [PATCH 09/65] Remove performance metrics --- tap_mambu/helpers/metrics_plotter.py | 34 -------- tap_mambu/helpers/multithreaded_requests.py | 20 +++-- tap_mambu/helpers/perf_metrics.py | 77 ------------------- tap_mambu/sync.py | 21 ----- tap_mambu/tap_generators/generator.py | 21 +++-- .../multithreaded_offset_generator.py | 6 +- .../tap_processors/deduplication_processor.py | 6 +- tap_mambu/tap_processors/processor.py | 6 +- 8 files changed, 24 insertions(+), 167 deletions(-) delete mode 100644 tap_mambu/helpers/metrics_plotter.py delete mode 100644 tap_mambu/helpers/perf_metrics.py diff --git a/tap_mambu/helpers/metrics_plotter.py b/tap_mambu/helpers/metrics_plotter.py deleted file mode 100644 index 3324f73..0000000 --- a/tap_mambu/helpers/metrics_plotter.py +++ /dev/null @@ -1,34 +0,0 @@ -# pragma pylint: disable=protected-access -import matplotlib.pyplot as plt -from matplotlib.lines import Line2D - -from .perf_metrics import PerformanceMetrics as PerfMetrics - - -# noinspection PyProtectedMember -def show_thread_graph(): - all_timestamps = [(*generator_time, "r", "Generator") for generator_time in PerfMetrics._metrics["generator"]] + \ - [(*processor_time, "b", "Processor") for processor_time in PerfMetrics._metrics["processor"]] - counter = 0 - total_time = 0 - for timestamp in sorted(all_timestamps, key=lambda ts: ts[0]): - start_time = round(timestamp[0] - PerfMetrics._metrics_start_time, 1) - end_time = round(timestamp[1] - PerfMetrics._metrics_start_time, 1) - plt.plot([start_time, end_time], - [counter, counter], color=timestamp[2], label=timestamp[3], linestyle="-") - counter += 1 - if end_time > total_time: - total_time = end_time - plt.title(f"Total execution time: {total_time}s") - plt.ylabel("Timestamp") - plt.legend([Line2D([0], [0], color="r", lw=4), Line2D([0], [0], color="b", lw=4)], ["Generator", "Processor"]) - - plt.show() - - -# noinspection PyProtectedMember -def show_request_duration_graph(): - data_points = [record[1] - record[0] for record in PerfMetrics._metrics["generator"]] - x = list(range(len(data_points))) - plt.bar(x, data_points) - plt.show() \ No newline at end of file diff --git a/tap_mambu/helpers/multithreaded_requests.py b/tap_mambu/helpers/multithreaded_requests.py index 724f58a..979b277 100644 --- a/tap_mambu/helpers/multithreaded_requests.py +++ b/tap_mambu/helpers/multithreaded_requests.py @@ -1,7 +1,6 @@ import singer from concurrent.futures import Future, ThreadPoolExecutor from typing import List -from .perf_metrics import PerformanceMetrics 
LOGGER = singer.get_logger() @@ -24,16 +23,15 @@ def run(client, stream_name, f'{endpoint_api_version}): {client.base_url}/{endpoint_path}?{endpoint_querystring}' f' - body = {endpoint_body}') - with PerformanceMetrics(metric_name="generator"): - response = client.request( - method=endpoint_api_method, - path=endpoint_path, - version=endpoint_api_version, - apikey_type=endpoint_api_key_type, - params=endpoint_querystring, - endpoint=stream_name, - json=endpoint_body - ) + response = client.request( + method=endpoint_api_method, + path=endpoint_path, + version=endpoint_api_version, + apikey_type=endpoint_api_key_type, + params=endpoint_querystring, + endpoint=stream_name, + json=endpoint_body + ) LOGGER.info(f'(generator) Stream {stream_name} - extracted records: {len(response)}') return response diff --git a/tap_mambu/helpers/perf_metrics.py b/tap_mambu/helpers/perf_metrics.py deleted file mode 100644 index 3330699..0000000 --- a/tap_mambu/helpers/perf_metrics.py +++ /dev/null @@ -1,77 +0,0 @@ -import math -import time - - -class PerformanceMetrics: - _metrics_start_time = time.monotonic() - _metrics = dict(generator=list(), - processor=list(), - generator_wait=list(), - processor_wait=list()) - _generator_batch_size = 500 - - def __init__(self, metric_name): - self.start_time = None - if metric_name not in self._metrics: - raise ValueError("One argument must be True, but only one!") - self._metric_name = metric_name - - def __enter__(self): - self.start_time = time.monotonic() - - def __exit__(self, exc_type, exc_val, exc_tb): - metric = (self.start_time, time.monotonic()) - self._metrics[self._metric_name].append(metric) - - @classmethod - def reset_metrics(cls): - cls._metrics_start_time = time.monotonic() - cls._metrics = dict(generator=list(), - processor=list(), - generator_wait=list(), - processor_wait=list()) - - @classmethod - def set_generator_batch_size(cls, batch_size): - if any(cls._metrics.values()): - raise RuntimeError("You cannot change batch size after measuring metrics!") - cls._generator_batch_size = batch_size - - @property - def generator_batch_size(self): - return self._generator_batch_size - - @staticmethod - def get_sum(metric): - if not metric: - return 0 - return sum([record[1] - record[0] for record in metric]) - - @staticmethod - def get_avg_with_98th(metric): - if not metric: - return 0, 0 - values_total = sorted([record[1] - record[0] for record in metric], reverse=True) - values_98th = values_total[:math.ceil(len(values_total) * 2 / 100)] - - average = sum(values_total) / len(values_total) - average_98th = sum(values_98th) / len(values_98th) - - return average, average_98th - - @classmethod - def get_statistics(cls): - extraction_duration = time.monotonic() - cls._metrics_start_time - - generator_avg, generator_avg_98th = cls.get_avg_with_98th(cls._metrics["generator"]) - processor_avg, processor_avg_98th = cls.get_avg_with_98th(cls._metrics["processor"]) - generator_wait = cls.get_sum(cls._metrics["generator_wait"]) - processor_wait = cls.get_sum(cls._metrics["processor_wait"]) - - return dict(generator=generator_avg / cls._generator_batch_size, - generator_98th=generator_avg_98th / cls._generator_batch_size, - processor=processor_avg, processor_98th=processor_avg_98th, - generator_wait=generator_wait, - processor_wait=processor_wait, - records=len(cls._metrics["processor"]) // extraction_duration, - extraction=extraction_duration) diff --git a/tap_mambu/sync.py b/tap_mambu/sync.py index 14653b1..4d4b2e4 100644 --- a/tap_mambu/sync.py +++ 
b/tap_mambu/sync.py @@ -5,7 +5,6 @@ from .helpers.datetime_utils import get_timezone_info from .helpers.generator_processor_pairs import get_generator_processor_for_stream, get_stream_subtypes from .helpers.multithreaded_requests import MultithreadedRequestsPool -from .helpers.perf_metrics import PerformanceMetrics LOGGER = singer.get_logger() @@ -38,8 +37,6 @@ def sync_all_streams(client, config, catalog, state): get_timezone_info(client) - PerformanceMetrics.set_generator_batch_size(int(config.get("page_size", DEFAULT_PAGE_SIZE))) - selected_streams = get_selected_streams(catalog) LOGGER.info('selected_streams: {}'.format(selected_streams)) @@ -75,7 +72,6 @@ def sync_all_streams(client, config, catalog, state): LOGGER.info('START Syncing: {}, Type: {}'.format(stream_name, sub_type)) update_currently_syncing(state, stream_name) - PerformanceMetrics.reset_metrics() total_records = sync_endpoint( client=client, catalog=catalog, @@ -91,21 +87,4 @@ def sync_all_streams(client, config, catalog, state): total_records)) LOGGER.info('FINISHED Syncing: {}'.format(stream_name)) - statistics = PerformanceMetrics.get_statistics() - - if statistics['generator'] and statistics['generator_98th']: - LOGGER.info(f"Average Generator Records/s: {round(1/statistics['generator'])} " - f"[98th percentile: {round(1/statistics['generator_98th'])}]") - - if statistics['processor'] and statistics['processor_98th']: - LOGGER.info(f"Average Processor Records/s: {round(1/statistics['processor'])} " - f"[98th percentile: {round(1/statistics['processor_98th'])}]") - - LOGGER.info(f"Total Generator Wait (s): {round(statistics['generator_wait'], 1)} ") - - LOGGER.info(f"Total Processor Wait (s): {round(statistics['processor_wait'], 1)} ") - - LOGGER.info(f"Average Records/s: {statistics['records']}") - LOGGER.info(f"Total Duration: {statistics['extraction']}") - MultithreadedRequestsPool.shutdown() diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index dbd5fa9..c6d2918 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -4,7 +4,6 @@ from ..helpers import transform_json from ..helpers.hashable_dict import HashableDict -from ..helpers.perf_metrics import PerformanceMetrics LOGGER = get_logger() @@ -85,7 +84,6 @@ def next(self): return self.buffer.pop(0) def __next__(self): - # with PerformanceMetrics(metric_name="processor_wait"): return self.next() def prepare_batch(self): @@ -108,16 +106,15 @@ def fetch_batch(self): f'{self.endpoint_api_version}): {self.client.base_url}/{self.endpoint_path}?{endpoint_querystring}') LOGGER.info(f'(generator) Stream {self.stream_name} - body = {self.endpoint_body}') - with PerformanceMetrics(metric_name="generator"): - response = self.client.request( - method=self.endpoint_api_method, - path=self.endpoint_path, - version=self.endpoint_api_version, - apikey_type=self.endpoint_api_key_type, - params=endpoint_querystring, - endpoint=self.stream_name, - json=self.endpoint_body - ) + response = self.client.request( + method=self.endpoint_api_method, + path=self.endpoint_path, + version=self.endpoint_api_version, + apikey_type=self.endpoint_api_key_type, + params=endpoint_querystring, + endpoint=self.stream_name, + json=self.endpoint_body + ) self.time_extracted = utils.now() LOGGER.info(f'(generator) Stream {self.stream_name} - extracted records: {len(response)}') diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index 
297cccd..d2c592f 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -8,7 +8,6 @@ from .generator import TapGenerator from ..helpers import transform_json from ..helpers.multithreaded_requests import MultithreadedRequestsPool -from ..helpers.perf_metrics import PerformanceMetrics LOGGER = get_logger() @@ -150,9 +149,8 @@ def __iter__(self): def next(self): if not self.buffer and not self.end_of_file: - with PerformanceMetrics(metric_name="processor_wait"): - while not self.buffer and not self.end_of_file: - time.sleep(0.01) + while not self.buffer and not self.end_of_file: + time.sleep(0.01) if not self.buffer and self.end_of_file: raise StopIteration() return self.buffer.pop(0) diff --git a/tap_mambu/tap_processors/deduplication_processor.py b/tap_mambu/tap_processors/deduplication_processor.py index 5d3c59c..b98e1d4 100644 --- a/tap_mambu/tap_processors/deduplication_processor.py +++ b/tap_mambu/tap_processors/deduplication_processor.py @@ -6,7 +6,6 @@ from .processor import TapProcessor from ..helpers import convert from ..helpers.exceptions import NoDeduplicationCapabilityException, NoDeduplicationKeyException -from ..helpers.perf_metrics import PerformanceMetrics LOGGER = get_logger() @@ -86,9 +85,8 @@ def process_records(self): # Process the record record = self.generator_values[record_key] - with PerformanceMetrics(metric_name="processor"): - is_processed = self.process_record(record, record_key.time_extracted, - record_key.endpoint_bookmark_field) + is_processed = self.process_record(record, record_key.time_extracted, + record_key.endpoint_bookmark_field) if is_processed: record_count += 1 diff --git a/tap_mambu/tap_processors/processor.py b/tap_mambu/tap_processors/processor.py index c12839b..a581c0c 100644 --- a/tap_mambu/tap_processors/processor.py +++ b/tap_mambu/tap_processors/processor.py @@ -5,7 +5,6 @@ from ..helpers import convert, get_bookmark, write_bookmark from ..helpers.transformer import Transformer from ..helpers.exceptions import NoDeduplicationCapabilityException -from ..helpers.perf_metrics import PerformanceMetrics from ..helpers.datetime_utils import utc_now, str_to_datetime, datetime_to_utc_str, str_to_localized_datetime LOGGER = get_logger() @@ -61,9 +60,8 @@ def process_records(self): with metrics.record_counter(self.stream_name) as counter: for record in self.generators[0]: # Process the record - with PerformanceMetrics(metric_name="processor"): - is_processed = self.process_record(record, utils.now(), - self.generators[0].endpoint_bookmark_field) + is_processed = self.process_record(record, utils.now(), + self.generators[0].endpoint_bookmark_field) if is_processed: record_count += 1 self._process_child_records(record) From 1b399dc3a3e4971616c15b704c076d601bbcf522 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Wed, 20 Mar 2024 04:13:23 +0000 Subject: [PATCH 10/65] Initializing final_buffer variable. 
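The extra initialization guards the case where the windowing loop body never runs (the bookmark already falls on or after the end date), which would otherwise leave the name unbound when the trailing emptiness check executes. A minimal, self-contained illustration of the failure mode (hypothetical names; integers stand in for dates):

    def sync_windows(start, end, fetch=lambda s, e: []):
        final_buffer = []  # without this line, the return below raises
                           # UnboundLocalError whenever start >= end
        while start < end:
            final_buffer = fetch(start, end)
            start = end
        return bool(final_buffer)

    assert sync_windows(5, 5) is False  # loop body never executes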
--- tap_mambu/tap_generators/multithreaded_offset_generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index f20cdf8..fbd7cfe 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -129,6 +129,7 @@ def _all_fetch_batch_steps(self): end = datetime.strptime(end_datetime, '%Y-%m-%d').date() temp = start + timedelta(days=self.date_window_size) stop_iteration = True + final_buffer = [] while start < end: self.modify_request_params(start, temp) final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) From da24d92744eeca8a30e0a788c78b27e516accaad Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Wed, 20 Mar 2024 09:54:47 +0000 Subject: [PATCH 11/65] Fixed BEFORE date for loan_transactions & gl_journal_entries streams. --- tap_mambu/tap_generators/gl_journal_entries_generator.py | 2 +- tap_mambu/tap_generators/loan_transactions_generator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tap_mambu/tap_generators/gl_journal_entries_generator.py b/tap_mambu/tap_generators/gl_journal_entries_generator.py index 151492c..d7b6d70 100644 --- a/tap_mambu/tap_generators/gl_journal_entries_generator.py +++ b/tap_mambu/tap_generators/gl_journal_entries_generator.py @@ -27,7 +27,7 @@ def modify_request_params(self, start, end): { "field": "creationDate", "operator": "BEFORE", - "value": datetime.strftime(end, '%Y-%m-%dT23:59:59.000000Z') + "value": datetime.strftime(end, '%Y-%m-%dT00:00:00.000000Z') } ] diff --git a/tap_mambu/tap_generators/loan_transactions_generator.py b/tap_mambu/tap_generators/loan_transactions_generator.py index 02c554e..1077093 100644 --- a/tap_mambu/tap_generators/loan_transactions_generator.py +++ b/tap_mambu/tap_generators/loan_transactions_generator.py @@ -24,7 +24,7 @@ def modify_request_params(self, start, end): { "field": "creationDate", "operator": "BEFORE", - "value": datetime.strftime(end, '%Y-%m-%dT23:59:59.000000Z') + "value": datetime.strftime(end, '%Y-%m-%dT00:00:00.000000Z') } ] From a8d46dde06d63c7035283dd7401d69384daa45ba Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Wed, 20 Mar 2024 10:22:46 +0000 Subject: [PATCH 12/65] Fixed BEFORE date for loan_transactions & gl_journal_entries streams. Added 1 second in the BEFORE value. 
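Only the rendering of the window's upper bound changes here: midnight of the end date plus a one-second pad, presumably so records stamped exactly at midnight are not skipped at the seam between adjacent windows (the exact inclusivity of Mambu's AFTER/BEFORE operators is assumed, not stated in this series). A small helper showing the rendered value (illustrative only):

    from datetime import date, datetime, time, timedelta

    def before_value(end: date, pad_seconds: int = 1) -> str:
        # Upper bound as used after this patch: midnight of the end date
        # plus a small pad, rendered in the Mambu timestamp format.
        upper = datetime.combine(end, time.min) + timedelta(seconds=pad_seconds)
        return upper.strftime('%Y-%m-%dT%H:%M:%S.%fZ')

    print(before_value(date(2024, 3, 20)))  # 2024-03-20T00:00:01.000000Z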
--- tap_mambu/tap_generators/gl_journal_entries_generator.py | 2 +- tap_mambu/tap_generators/loan_transactions_generator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tap_mambu/tap_generators/gl_journal_entries_generator.py b/tap_mambu/tap_generators/gl_journal_entries_generator.py index d7b6d70..9b98b4c 100644 --- a/tap_mambu/tap_generators/gl_journal_entries_generator.py +++ b/tap_mambu/tap_generators/gl_journal_entries_generator.py @@ -27,7 +27,7 @@ def modify_request_params(self, start, end): { "field": "creationDate", "operator": "BEFORE", - "value": datetime.strftime(end, '%Y-%m-%dT00:00:00.000000Z') + "value": datetime.strftime(end, '%Y-%m-%dT00:00:01.000000Z') } ] diff --git a/tap_mambu/tap_generators/loan_transactions_generator.py b/tap_mambu/tap_generators/loan_transactions_generator.py index 1077093..42e75af 100644 --- a/tap_mambu/tap_generators/loan_transactions_generator.py +++ b/tap_mambu/tap_generators/loan_transactions_generator.py @@ -24,7 +24,7 @@ def modify_request_params(self, start, end): { "field": "creationDate", "operator": "BEFORE", - "value": datetime.strftime(end, '%Y-%m-%dT00:00:00.000000Z') + "value": datetime.strftime(end, '%Y-%m-%dT00:00:01.000000Z') } ] From 6d6ae6675dadaf281ca8ac83fe69489a537323a7 Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Thu, 21 Mar 2024 06:32:15 +0000 Subject: [PATCH 13/65] Limit max buffer size --- tap_mambu/tap_generators/generator.py | 1 + tap_mambu/tap_generators/multithreaded_offset_generator.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index b46a1dd..20d2481 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -55,6 +55,7 @@ def _init_endpoint_body(self): def _init_buffers(self): self.buffer: List = list() + self.max_buffer_size = 100000 def _init_params(self): self.time_extracted = None diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index fbd7cfe..1a03766 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -131,6 +131,9 @@ def _all_fetch_batch_steps(self): stop_iteration = True final_buffer = [] while start < end: + # Limit the buffer size by holding generators from creating new batches + while len(self.buffer) > 100000: + time.sleep(1) self.modify_request_params(start, temp) final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) self.preprocess_batches(final_buffer) From 162e542306e72bffefd5a1db8663416071abbf6e Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Thu, 21 Mar 2024 06:35:14 +0000 Subject: [PATCH 14/65] Add generator change --- tap_mambu/tap_generators/multithreaded_offset_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index 1a03766..7d57229 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -132,7 +132,7 @@ def _all_fetch_batch_steps(self): final_buffer = [] while start < end: # Limit the buffer size by holding generators from creating new batches - while len(self.buffer) > 100000: + while len(self.buffer) > self.max_buffer_size: time.sleep(1) self.modify_request_params(start, temp) final_buffer, stop_iteration = 
self.collect_batches(self.queue_batches()) From b5c39419e5f3467e729e48bbef995860abd060b7 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 09:23:55 +0000 Subject: [PATCH 15/65] Changing default date window, from 3 to 5. --- tap_mambu/tap_generators/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index 20d2481..0063c2c 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -16,7 +16,7 @@ def __init__(self, stream_name, client, config, state, sub_type): self.state = state self.sub_type = sub_type self.date_windowing = False - self.date_window_size = 3 + self.date_window_size = 5 # Define parameters inside init self.params = dict() From 58a98b831372fb1d3505681678e677e693edb45f Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 09:25:20 +0000 Subject: [PATCH 16/65] Added generic date windowing for all MultiThreadedOffsetGenerator streams. --- .../multithreaded_offset_generator.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index 7d57229..ec5dbbb 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -15,6 +15,10 @@ class MultithreadedOffsetGenerator(TapGenerator): + def __init__(self, stream_name, client, config, state, sub_type): + super(MultithreadedOffsetGenerator, self).__init__(stream_name, client, config, state, sub_type) + self.date_windowing = True + def _init_params(self): self.time_extracted = None self.static_params = dict(self.endpoint_params) @@ -149,8 +153,18 @@ def _all_fetch_batch_steps(self): return True def modify_request_params(self, start, end): - self.static_params["from"] = datetime.strftime(start, '%Y-%m-%d') - self.static_params["to"] = datetime.strftime(end, '%Y-%m-%d') + self.endpoint_body['filterCriteria'] = [ + { + "field": self.endpoint_bookmark_field, + "operator": "AFTER", + "value": datetime.strftime(start, '%Y-%m-%dT00:00:00.000000Z') + }, + { + "field": self.endpoint_bookmark_field, + "operator": "BEFORE", + "value": datetime.strftime(end, '%Y-%m-%dT00:00:01.000000Z') + } + ] def error_check_and_fix(self, final_buffer: set, temp_buffer: set, futures: list): try: From 3403ea7bc6d3657f40d5922ab08bbfb743250582 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 09:28:11 +0000 Subject: [PATCH 17/65] Implemented modify_request_params method for Activities stream. 
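Unlike the :search streams, the v1 activities endpoint carries the window in plain from/to query parameters, so the override below writes into the static request params rather than a filterCriteria body. The same idea in isolation (illustrative only; the params dict is a stand-in for the generator's static_params):

    from datetime import date

    def apply_window_as_query_params(params: dict, start: date, end: date) -> dict:
        # v1-style endpoints take the date window as from/to query
        # parameters instead of a filterCriteria request body.
        params["from"] = start.strftime('%Y-%m-%d')
        params["to"] = end.strftime('%Y-%m-%d')
        return params

    print(apply_window_as_query_params({}, date(2024, 3, 1), date(2024, 3, 6)))
    # {'from': '2024-03-01', 'to': '2024-03-06'}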
--- tap_mambu/tap_generators/activities_generator.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tap_mambu/tap_generators/activities_generator.py b/tap_mambu/tap_generators/activities_generator.py index 1634141..8c1f2bc 100644 --- a/tap_mambu/tap_generators/activities_generator.py +++ b/tap_mambu/tap_generators/activities_generator.py @@ -1,6 +1,6 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkDayByDayGenerator -from ..helpers import get_bookmark -from ..helpers.datetime_utils import datetime_to_utc_str, str_to_localized_datetime, utc_now +from ..helpers.datetime_utils import datetime_to_utc_str +from datetime import datetime class ActivitiesGenerator(MultithreadedBookmarkDayByDayGenerator): @@ -15,6 +15,10 @@ def _init_endpoint_config(self): self.endpoint_api_version = "v1" self.endpoint_bookmark_field = "timestamp" + def modify_request_params(self, start, end): + self.static_params["from"] = datetime.strftime(start, '%Y-%m-%d') + self.static_params["to"] = datetime.strftime(end, '%Y-%m-%d') + @staticmethod def unpack_activity(record): record.update(record["activity"]) From aedd6aaaf0f392cf733511ad167142ebfb098460 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 09:29:09 +0000 Subject: [PATCH 18/65] Implemented modify_request_params method for Installments stream. --- tap_mambu/tap_generators/installments_generator.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tap_mambu/tap_generators/installments_generator.py b/tap_mambu/tap_generators/installments_generator.py index 10e4c99..e1ab1ca 100644 --- a/tap_mambu/tap_generators/installments_generator.py +++ b/tap_mambu/tap_generators/installments_generator.py @@ -1,5 +1,5 @@ from .multithreaded_offset_generator import MultithreadedOffsetGenerator -from ..helpers.datetime_utils import datetime_to_utc_str, str_to_localized_datetime, utc_now +from datetime import datetime class InstallmentsGenerator(MultithreadedOffsetGenerator): @@ -7,14 +7,12 @@ def _init_endpoint_config(self): super(InstallmentsGenerator, self)._init_endpoint_config() self.endpoint_path = "installments" self.endpoint_api_method = "GET" - self.endpoint_params = { - "dueFrom": datetime_to_utc_str(str_to_localized_datetime(self.start_date))[:10], - "dueTo": datetime_to_utc_str(utc_now())[:10], - "detailsLevel": "FULL", - "paginationDetails": "OFF" - } self.endpoint_bookmark_field = "lastPaidDate" + def modify_request_params(self, start, end): + self.static_params["dueFrom"] = datetime.strftime(start, '%Y-%m-%d') + self.static_params["dueTo"] = datetime.strftime(end, '%Y-%m-%d') + def transform_batch(self, batch): temp_batch = super(InstallmentsGenerator, self).transform_batch(batch) for record in temp_batch: From a9ff3adcc90785732541222b955b6dc1a461daed Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 09:30:07 +0000 Subject: [PATCH 19/65] Disabled Date windowing for Users stream. 
--- tap_mambu/tap_generators/users_generator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tap_mambu/tap_generators/users_generator.py b/tap_mambu/tap_generators/users_generator.py index c418e4e..32f1cc4 100644 --- a/tap_mambu/tap_generators/users_generator.py +++ b/tap_mambu/tap_generators/users_generator.py @@ -2,6 +2,10 @@ class UsersGenerator(MultithreadedOffsetGenerator): + def __init__(self, stream_name, client, config, state, sub_type): + super(UsersGenerator, self).__init__(stream_name, client, config, state, sub_type) + self.date_windowing = False + def _init_endpoint_config(self): super(UsersGenerator, self)._init_endpoint_config() self.endpoint_path = "users" From db07f7a93a2ac9da047d3d8b8c14cfbea76d6dd4 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 09:31:22 +0000 Subject: [PATCH 20/65] Removed endpoint filter criteria for Clients stream. --- tap_mambu/tap_generators/clients_generator.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tap_mambu/tap_generators/clients_generator.py b/tap_mambu/tap_generators/clients_generator.py index 38c3bde..d014dfa 100644 --- a/tap_mambu/tap_generators/clients_generator.py +++ b/tap_mambu/tap_generators/clients_generator.py @@ -1,6 +1,5 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator -from ..helpers import get_bookmark -from ..helpers.datetime_utils import str_to_localized_datetime, datetime_to_local_str +from ..helpers.datetime_utils import datetime_to_local_str class ClientsGenerator(MultithreadedBookmarkGenerator): @@ -16,14 +15,6 @@ def _init_endpoint_config(self): "field": "lastModifiedDate", "order": "ASC" } - self.endpoint_filter_criteria = [ - { - "field": "lastModifiedDate", - "operator": "AFTER", - "value": datetime_to_local_str(str_to_localized_datetime( - get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date))) - } - ] def prepare_batch_params(self): super(ClientsGenerator, self).prepare_batch_params() From 554720b520db1c0a65d9d870b83e616b12f708cc Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 09:32:43 +0000 Subject: [PATCH 21/65] Implemented modify_request_params method for Communications stream. 
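The communications search keeps one fixed criterion (state EQUALS SENT) alongside the sliding creationDate window. The combined criteria list, sketched standalone (illustrative only; field names and timestamp formats follow the surrounding patches):

    from datetime import date

    def communications_criteria(start: date, end: date):
        # Sliding window on creationDate plus the fixed SENT-state filter.
        return [
            {"field": "creationDate", "operator": "AFTER",
             "value": start.strftime('%Y-%m-%dT00:00:00.000000Z')},
            {"field": "creationDate", "operator": "BEFORE",
             "value": end.strftime('%Y-%m-%dT00:00:01.000000Z')},
            {"field": "state", "operator": "EQUALS", "value": "SENT"},
        ]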
--- .../communications_generator.py | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/tap_mambu/tap_generators/communications_generator.py b/tap_mambu/tap_generators/communications_generator.py index 561559d..620dc47 100644 --- a/tap_mambu/tap_generators/communications_generator.py +++ b/tap_mambu/tap_generators/communications_generator.py @@ -1,6 +1,5 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator -from ..helpers import get_bookmark -from ..helpers.datetime_utils import datetime_to_local_str, str_to_localized_datetime +from ..helpers.datetime_utils import datetime_to_local_str class CommunicationsGenerator(MultithreadedBookmarkGenerator): @@ -12,23 +11,18 @@ def _init_endpoint_config(self): "paginationDetails": "OFF" } self.endpoint_bookmark_field = "creationDate" - self.endpoint_filter_criteria = [ - { - "field": "state", - "operator": "EQUALS", - "value": "SENT" - }, - { - "field": "creationDate", - "operator": "AFTER", - "value": datetime_to_local_str(str_to_localized_datetime( - get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date))) - } - ] + + def modify_request_params(self, start, end): + super().modify_request_params(start, end) + self.endpoint_body['filterCriteria'].append({ + "field": "state", + "operator": "EQUALS", + "value": "SENT" + }) def _init_endpoint_body(self): self.endpoint_body = self.endpoint_filter_criteria def prepare_batch_params(self): super(CommunicationsGenerator, self).prepare_batch_params() - self.endpoint_filter_criteria[1]["value"] = datetime_to_local_str(self.endpoint_intermediary_bookmark_value) + self.endpoint_filter_criteria[0]["value"] = datetime_to_local_str(self.endpoint_intermediary_bookmark_value) From 7711832d2ba3000322209a58f9b8db0e3cc4c083 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 09:33:37 +0000 Subject: [PATCH 22/65] Removed endpoint filter criteria for DepositAccounts stream. 
--- .../tap_generators/deposit_accounts_generator.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/tap_mambu/tap_generators/deposit_accounts_generator.py b/tap_mambu/tap_generators/deposit_accounts_generator.py index be99802..d592a2d 100644 --- a/tap_mambu/tap_generators/deposit_accounts_generator.py +++ b/tap_mambu/tap_generators/deposit_accounts_generator.py @@ -1,6 +1,5 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator -from ..helpers import get_bookmark -from ..helpers.datetime_utils import datetime_to_utc_str, str_to_localized_datetime +from ..helpers.datetime_utils import datetime_to_utc_str class DepositAccountsGenerator(MultithreadedBookmarkGenerator): @@ -11,19 +10,11 @@ def _init_config(self): def _init_endpoint_config(self): super(DepositAccountsGenerator, self)._init_endpoint_config() self.endpoint_path = "deposits:search" + self.endpoint_bookmark_field = "lastModifiedDate" self.endpoint_sorting_criteria = { "field": "lastModifiedDate", "order": "ASC" } - self.endpoint_filter_criteria = [ - { - "field": "lastModifiedDate", - "operator": "AFTER", - "value": datetime_to_utc_str(str_to_localized_datetime( - get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date))) - } - ] - self.endpoint_bookmark_field = "lastModifiedDate" def prepare_batch_params(self): super(DepositAccountsGenerator, self).prepare_batch_params() From 11e44de095810de4a940d60a204ba62cbdf57def Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 09:35:14 +0000 Subject: [PATCH 23/65] Removed endpoint filter criteria for DepositTransactions stream. --- .../tap_generators/deposit_transactions_generator.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tap_mambu/tap_generators/deposit_transactions_generator.py b/tap_mambu/tap_generators/deposit_transactions_generator.py index aa83ff4..2889531 100644 --- a/tap_mambu/tap_generators/deposit_transactions_generator.py +++ b/tap_mambu/tap_generators/deposit_transactions_generator.py @@ -1,6 +1,5 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator -from ..helpers import get_bookmark -from ..helpers.datetime_utils import datetime_to_utc_str, str_to_localized_datetime +from ..helpers.datetime_utils import datetime_to_utc_str class DepositTransactionsGenerator(MultithreadedBookmarkGenerator): @@ -13,14 +12,6 @@ def _init_endpoint_config(self): self.endpoint_path = "deposits/transactions:search" self.endpoint_bookmark_field = "creationDate" self.endpoint_sorting_criteria["field"] = "id" - self.endpoint_filter_criteria = [ - { - "field": "creationDate", - "operator": "AFTER", - "value": datetime_to_utc_str(str_to_localized_datetime( - get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date))) - } - ] def prepare_batch_params(self): super(DepositTransactionsGenerator, self).prepare_batch_params() From 09473ff56fe6a8a03e1a4fa4384bb099d7e6b5fa Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 09:37:09 +0000 Subject: [PATCH 24/65] Removed modify_request_params implementation from GL Journal Accounts stream class. 
--- .../gl_journal_entries_generator.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tap_mambu/tap_generators/gl_journal_entries_generator.py b/tap_mambu/tap_generators/gl_journal_entries_generator.py index 9b98b4c..5ace492 100644 --- a/tap_mambu/tap_generators/gl_journal_entries_generator.py +++ b/tap_mambu/tap_generators/gl_journal_entries_generator.py @@ -1,6 +1,5 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator from ..helpers.datetime_utils import datetime_to_utc_str -from datetime import datetime class GlJournalEntriesGenerator(MultithreadedBookmarkGenerator): @@ -17,20 +16,6 @@ def _init_endpoint_config(self): "order": "ASC" } - def modify_request_params(self, start, end): - self.endpoint_body['filterCriteria'] = [ - { - "field": "creationDate", - "operator": "AFTER", - "value": datetime.strftime(start, '%Y-%m-%dT00:00:00.000000Z') - }, - { - "field": "creationDate", - "operator": "BEFORE", - "value": datetime.strftime(end, '%Y-%m-%dT00:00:01.000000Z') - } - ] - def prepare_batch_params(self): super(GlJournalEntriesGenerator, self).prepare_batch_params() self.endpoint_filter_criteria[0]["value"] = datetime_to_utc_str(self.endpoint_intermediary_bookmark_value) From 232a7ea86da2ef43f88546082d97e970cf3723ba Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 09:37:44 +0000 Subject: [PATCH 25/65] Removed endpoint filter criteria for Groups stream. --- tap_mambu/tap_generators/groups_generator.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tap_mambu/tap_generators/groups_generator.py b/tap_mambu/tap_generators/groups_generator.py index bcabdcd..8e38037 100644 --- a/tap_mambu/tap_generators/groups_generator.py +++ b/tap_mambu/tap_generators/groups_generator.py @@ -1,6 +1,5 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator -from ..helpers import get_bookmark -from ..helpers.datetime_utils import datetime_to_local_str, str_to_localized_datetime +from ..helpers.datetime_utils import datetime_to_local_str class GroupsGenerator(MultithreadedBookmarkGenerator): @@ -12,14 +11,6 @@ def _init_endpoint_config(self): "field": "lastModifiedDate", "order": "ASC" } - self.endpoint_filter_criteria = [ - { - "field": "lastModifiedDate", - "operator": "AFTER", - "value": datetime_to_local_str(str_to_localized_datetime( - get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date))) - } - ] def prepare_batch_params(self): super(GroupsGenerator, self).prepare_batch_params() From 70fc6165382f7e32d4683b96d59a8f064cca18d6 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 09:38:33 +0000 Subject: [PATCH 26/65] Removed endpoint filter criteria for Interest Accrual Breakdown stream. 
--- .../interest_accrual_breakdown_generator.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tap_mambu/tap_generators/interest_accrual_breakdown_generator.py b/tap_mambu/tap_generators/interest_accrual_breakdown_generator.py index 7b094f6..a9bd7ab 100644 --- a/tap_mambu/tap_generators/interest_accrual_breakdown_generator.py +++ b/tap_mambu/tap_generators/interest_accrual_breakdown_generator.py @@ -1,6 +1,5 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkDayByDayGenerator -from ..helpers import get_bookmark -from ..helpers.datetime_utils import datetime_to_utc_str, str_to_localized_datetime +from ..helpers.datetime_utils import datetime_to_utc_str class InterestAccrualBreakdownGenerator(MultithreadedBookmarkDayByDayGenerator): @@ -12,14 +11,6 @@ def _init_endpoint_config(self): "field": "entryId", "order": "ASC" } - self.endpoint_filter_criteria = [ - { - "field": "creationDate", - "operator": "AFTER", - "value": datetime_to_utc_str(str_to_localized_datetime( - get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date)))[:10] - } - ] def prepare_batch_params(self): super(InterestAccrualBreakdownGenerator, self).prepare_batch_params() From 785696195bcd28d17cb96f843698d8e4839a0663 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 09:39:31 +0000 Subject: [PATCH 27/65] Removed modify_request_params implementation from Loan Transactions stream class. --- .../tap_generators/loan_transactions_generator.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tap_mambu/tap_generators/loan_transactions_generator.py b/tap_mambu/tap_generators/loan_transactions_generator.py index 42e75af..8120352 100644 --- a/tap_mambu/tap_generators/loan_transactions_generator.py +++ b/tap_mambu/tap_generators/loan_transactions_generator.py @@ -1,6 +1,5 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator from ..helpers.datetime_utils import datetime_to_utc_str -from datetime import datetime class LoanTransactionsGenerator(MultithreadedBookmarkGenerator): @@ -14,20 +13,6 @@ def _init_endpoint_config(self): self.endpoint_bookmark_field = "creationDate" self.endpoint_sorting_criteria["field"] = "id" - def modify_request_params(self, start, end): - self.endpoint_body['filterCriteria'] = [ - { - "field": "creationDate", - "operator": "AFTER", - "value": datetime.strftime(start, '%Y-%m-%dT00:00:00.000000Z') - }, - { - "field": "creationDate", - "operator": "BEFORE", - "value": datetime.strftime(end, '%Y-%m-%dT00:00:01.000000Z') - } - ] - def prepare_batch_params(self): super(LoanTransactionsGenerator, self).prepare_batch_params() self.endpoint_filter_criteria[0]["value"] = datetime_to_utc_str(self.endpoint_intermediary_bookmark_value) From b4cf6d5aa4c5b0eb74c3d4caeb199c01012609e6 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 22 Mar 2024 10:03:57 +0000 Subject: [PATCH 28/65] Changes: 1) Removed redundant initialization of date_windowing for Activities, GL Journal entries & Loan transactions streams. 2) Changed max threads for Deposit transactions stream to default value. 
--- tap_mambu/tap_generators/activities_generator.py | 4 ---- tap_mambu/tap_generators/deposit_transactions_generator.py | 4 ---- tap_mambu/tap_generators/gl_journal_entries_generator.py | 4 ---- tap_mambu/tap_generators/loan_transactions_generator.py | 4 ---- 4 files changed, 16 deletions(-) diff --git a/tap_mambu/tap_generators/activities_generator.py b/tap_mambu/tap_generators/activities_generator.py index 8c1f2bc..afa7140 100644 --- a/tap_mambu/tap_generators/activities_generator.py +++ b/tap_mambu/tap_generators/activities_generator.py @@ -4,10 +4,6 @@ class ActivitiesGenerator(MultithreadedBookmarkDayByDayGenerator): - def __init__(self, stream_name, client, config, state, sub_type): - super(ActivitiesGenerator, self).__init__(stream_name, client, config, state, sub_type) - self.date_windowing = True - def _init_endpoint_config(self): super(ActivitiesGenerator, self)._init_endpoint_config() self.endpoint_path = "activities" diff --git a/tap_mambu/tap_generators/deposit_transactions_generator.py b/tap_mambu/tap_generators/deposit_transactions_generator.py index 2889531..d528023 100644 --- a/tap_mambu/tap_generators/deposit_transactions_generator.py +++ b/tap_mambu/tap_generators/deposit_transactions_generator.py @@ -3,10 +3,6 @@ class DepositTransactionsGenerator(MultithreadedBookmarkGenerator): - def _init_config(self): - super()._init_config() - self.max_threads = 5 - def _init_endpoint_config(self): super(DepositTransactionsGenerator, self)._init_endpoint_config() self.endpoint_path = "deposits/transactions:search" diff --git a/tap_mambu/tap_generators/gl_journal_entries_generator.py b/tap_mambu/tap_generators/gl_journal_entries_generator.py index 5ace492..9070d6d 100644 --- a/tap_mambu/tap_generators/gl_journal_entries_generator.py +++ b/tap_mambu/tap_generators/gl_journal_entries_generator.py @@ -3,10 +3,6 @@ class GlJournalEntriesGenerator(MultithreadedBookmarkGenerator): - def __init__(self, stream_name, client, config, state, sub_type): - super(GlJournalEntriesGenerator, self).__init__(stream_name, client, config, state, sub_type) - self.date_windowing = True - def _init_endpoint_config(self): super(GlJournalEntriesGenerator, self)._init_endpoint_config() self.endpoint_path = "gljournalentries:search" diff --git a/tap_mambu/tap_generators/loan_transactions_generator.py b/tap_mambu/tap_generators/loan_transactions_generator.py index 8120352..dfaf2e4 100644 --- a/tap_mambu/tap_generators/loan_transactions_generator.py +++ b/tap_mambu/tap_generators/loan_transactions_generator.py @@ -3,10 +3,6 @@ class LoanTransactionsGenerator(MultithreadedBookmarkGenerator): - def __init__(self, stream_name, client, config, state, sub_type): - super(LoanTransactionsGenerator, self).__init__(stream_name, client, config, state, sub_type) - self.date_windowing = True - def _init_endpoint_config(self): super(LoanTransactionsGenerator, self)._init_endpoint_config() self.endpoint_path = "loans/transactions:search" From 69f243579b0621dc523429a78d725ba6dd9b3c0b Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Mon, 25 Mar 2024 10:58:06 +0000 Subject: [PATCH 29/65] Fixed bug in communications stream date windowing implementation. 
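As an illustration, a minimal sketch of the kind of request body this fix builds for a single date window, assuming the Mambu search payload accepts a list of filterCriteria entries; build_window_filter below is hypothetical and not part of the tap:

    from datetime import datetime, timedelta

    def build_window_filter(start, end, bookmark_field="creationDate"):
        # One date window becomes an AFTER/BEFORE pair on the bookmark field,
        # plus the stream-specific filter that keeps only SENT communications.
        return [
            {"field": bookmark_field, "operator": "AFTER",
             "value": start.strftime('%Y-%m-%dT%H:%M:%S.%fZ')},
            {"field": bookmark_field, "operator": "BEFORE",
             "value": end.strftime('%Y-%m-%dT%H:%M:%S.%fZ')},
            {"field": "state", "operator": "EQUALS", "value": "SENT"},
        ]

    window_start = datetime(2024, 3, 1)
    print(build_window_filter(window_start, window_start + timedelta(days=3)))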
--- .../communications_generator.py | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tap_mambu/tap_generators/communications_generator.py b/tap_mambu/tap_generators/communications_generator.py index 620dc47..866bffd 100644 --- a/tap_mambu/tap_generators/communications_generator.py +++ b/tap_mambu/tap_generators/communications_generator.py @@ -1,5 +1,6 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator from ..helpers.datetime_utils import datetime_to_local_str +from datetime import datetime class CommunicationsGenerator(MultithreadedBookmarkGenerator): @@ -13,12 +14,23 @@ def _init_endpoint_config(self): self.endpoint_bookmark_field = "creationDate" def modify_request_params(self, start, end): - super().modify_request_params(start, end) - self.endpoint_body['filterCriteria'].append({ - "field": "state", - "operator": "EQUALS", - "value": "SENT" - }) + self.endpoint_body = [ + { + "field": self.endpoint_bookmark_field, + "operator": "AFTER", + "value": datetime.strftime(start, '%Y-%m-%dT00:00:00.000000Z') + }, + { + "field": self.endpoint_bookmark_field, + "operator": "BEFORE", + "value": datetime.strftime(end, '%Y-%m-%dT00:00:01.000000Z') + }, + { + "field": "state", + "operator": "EQUALS", + "value": "SENT" + } + ] def _init_endpoint_body(self): self.endpoint_body = self.endpoint_filter_criteria From 2be5cf7bf592e1b49941cb54737af4fbc9c56e72 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Mon, 25 Mar 2024 11:30:33 +0000 Subject: [PATCH 30/65] Set date_windowing as False and restored original implementation for Installments stream. --- .../tap_generators/installments_generator.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tap_mambu/tap_generators/installments_generator.py b/tap_mambu/tap_generators/installments_generator.py index e1ab1ca..a9810f3 100644 --- a/tap_mambu/tap_generators/installments_generator.py +++ b/tap_mambu/tap_generators/installments_generator.py @@ -1,18 +1,24 @@ from .multithreaded_offset_generator import MultithreadedOffsetGenerator -from datetime import datetime +from ..helpers.datetime_utils import datetime_to_utc_str, str_to_localized_datetime, utc_now class InstallmentsGenerator(MultithreadedOffsetGenerator): + def __init__(self, stream_name, client, config, state, sub_type): + super(InstallmentsGenerator, self).__init__(stream_name, client, config, state, sub_type) + self.date_windowing = False + def _init_endpoint_config(self): super(InstallmentsGenerator, self)._init_endpoint_config() self.endpoint_path = "installments" self.endpoint_api_method = "GET" + self.endpoint_params = { + "dueFrom": datetime_to_utc_str(str_to_localized_datetime(self.start_date))[:10], + "dueTo": datetime_to_utc_str(utc_now())[:10], + "detailsLevel": "FULL", + "paginationDetails": "OFF" + } self.endpoint_bookmark_field = "lastPaidDate" - def modify_request_params(self, start, end): - self.static_params["dueFrom"] = datetime.strftime(start, '%Y-%m-%d') - self.static_params["dueTo"] = datetime.strftime(end, '%Y-%m-%d') - def transform_batch(self, batch): temp_batch = super(InstallmentsGenerator, self).transform_batch(batch) for record in temp_batch: From 33bfbd18862daac189fe8d406c5f9834c3d3dd71 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Tue, 26 Mar 2024 13:15:12 +0000 Subject: [PATCH 31/65] Changed loan_accounts stream implementation to MultithreadedBookmarkGenerator stream. 
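Sketch of the bookmark-driven batching idea behind this change, assuming records carry the bookmark field as an ISO-8601 string; advance_filter and the sample data are illustrative only, not the tap's code:

    def advance_filter(filter_criteria, batch, bookmark_field="lastModifiedDate"):
        # After each batch, move the AFTER criterion up to the newest bookmark
        # value seen so far, so the next round of requests resumes from there.
        newest = max(record[bookmark_field] for record in batch)
        filter_criteria[0]["value"] = newest
        return filter_criteria

    criteria = [{"field": "lastModifiedDate", "operator": "AFTER",
                 "value": "2024-01-01T00:00:00.000000Z"}]
    batch = [{"id": "1", "lastModifiedDate": "2024-02-01T10:00:00.000000Z"},
             {"id": "2", "lastModifiedDate": "2024-03-05T09:30:00.000000Z"}]
    print(advance_filter(criteria, batch))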
--- .../tap_generators/loan_accounts_generator.py | 41 +++++-------------- 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/tap_mambu/tap_generators/loan_accounts_generator.py b/tap_mambu/tap_generators/loan_accounts_generator.py index 1d8ee5b..54e96fe 100644 --- a/tap_mambu/tap_generators/loan_accounts_generator.py +++ b/tap_mambu/tap_generators/loan_accounts_generator.py @@ -1,44 +1,23 @@ -import abc +from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator +from ..helpers.datetime_utils import datetime_to_utc_str -from .generator import TapGenerator -from ..helpers import get_bookmark -from ..helpers.datetime_utils import str_to_localized_datetime, datetime_to_local_str - -class LoanAccountsGenerator(TapGenerator): - @abc.abstractmethod +class LoanAccountsLMGenerator(MultithreadedBookmarkGenerator): def _init_endpoint_config(self): - super(LoanAccountsGenerator, self)._init_endpoint_config() + super(LoanAccountsLMGenerator, self)._init_endpoint_config() self.endpoint_path = "loans:search" + self.endpoint_bookmark_field = "lastModifiedDate" self.endpoint_sorting_criteria = { "field": "id", "order": "ASC" } - -class LoanAccountsLMGenerator(LoanAccountsGenerator): - def _init_endpoint_config(self): - super()._init_endpoint_config() - self.endpoint_bookmark_field = "lastModifiedDate" - self.endpoint_filter_criteria = [ - { - "field": "lastModifiedDate", - "operator": "AFTER", - "value": datetime_to_local_str(str_to_localized_datetime( - get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date))) - } - ] + def prepare_batch_params(self): + super(LoanAccountsLMGenerator, self).prepare_batch_params() + self.endpoint_filter_criteria[0]["value"] = datetime_to_utc_str(self.endpoint_intermediary_bookmark_value) -class LoanAccountsADGenerator(LoanAccountsGenerator): +class LoanAccountsADGenerator(LoanAccountsLMGenerator): def _init_endpoint_config(self): - super()._init_endpoint_config() + super(LoanAccountsADGenerator, self)._init_endpoint_config() self.endpoint_bookmark_field = "lastAccountAppraisalDate" - self.endpoint_filter_criteria = [ - { - "field": "lastAccountAppraisalDate", - "operator": "AFTER", - "value": datetime_to_local_str(str_to_localized_datetime( - get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date))) - } - ] From 0d57fb7b626905523a51690de2740cc39e8b47fd Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Thu, 28 Mar 2024 09:49:45 +0000 Subject: [PATCH 32/65] Added backoff for intermittent ProtocolError in Mambu APIs. 
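A standalone sketch of the retry pattern applied here, using the same backoff, requests and urllib3 exception types; flaky_request is illustrative only:

    import backoff
    import requests
    from requests.exceptions import ConnectionError
    from urllib3.exceptions import ProtocolError

    @backoff.on_exception(backoff.expo,
                          (ConnectionError, ProtocolError),
                          max_tries=7, factor=3)
    def flaky_request(session: requests.Session, url: str):
        # Retried with exponential backoff when the connection drops mid-response,
        # which surfaces as ProtocolError/ConnectionError on intermittent APIs.
        response = session.get(url, timeout=30)
        response.raise_for_status()
        return response.json()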
--- tap_mambu/helpers/client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tap_mambu/helpers/client.py b/tap_mambu/helpers/client.py index 12c2284..9a901fb 100644 --- a/tap_mambu/helpers/client.py +++ b/tap_mambu/helpers/client.py @@ -4,6 +4,7 @@ from requests.exceptions import ConnectionError from singer import metrics, get_logger +from urllib3.exceptions import ProtocolError LOGGER = get_logger() class ClientError(Exception): @@ -178,7 +179,7 @@ def check_access(self): return True @backoff.on_exception(backoff.expo, - (MambuInternalServiceError, ConnectionError, MambuApiLimitError), + (MambuInternalServiceError, ConnectionError, MambuApiLimitError, ProtocolError), max_tries=7, factor=3) def request(self, method, path=None, url=None, json=None, version=None, apikey_type=None, **kwargs): From 08bc5f79eb86ef69841e4623df9b88502aad90b3 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Thu, 28 Mar 2024 13:14:39 +0000 Subject: [PATCH 33/65] - Raise exception when thread is dead - Write bookmark periodically --- tap_mambu/helpers/exceptions.py | 3 +++ tap_mambu/tap_generators/multithreaded_offset_generator.py | 3 +++ tap_mambu/tap_processors/processor.py | 5 +++++ 3 files changed, 11 insertions(+) diff --git a/tap_mambu/helpers/exceptions.py b/tap_mambu/helpers/exceptions.py index ed1ad8a..46ef2f8 100644 --- a/tap_mambu/helpers/exceptions.py +++ b/tap_mambu/helpers/exceptions.py @@ -3,3 +3,6 @@ class NoDeduplicationCapabilityException(Exception): class NoDeduplicationKeyException(Exception): pass + +class MambuGeneratorThreadNotAlive(Exception): + pass diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index ec5dbbb..71261bf 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -10,6 +10,7 @@ from ..helpers import transform_json, get_bookmark from ..helpers.datetime_utils import str_to_localized_datetime, datetime_to_utc_str, utc_now from ..helpers.multithreaded_requests import MultithreadedRequestsPool +from ..helpers.exceptions import MambuGeneratorThreadNotAlive LOGGER = get_logger() @@ -190,6 +191,8 @@ def __iter__(self): def next(self): if not self.buffer and not self.end_of_file: while not self.buffer and not self.end_of_file: + if not self.fetch_batch_thread.is_alive(): + raise MambuGeneratorThreadNotAlive("Generator stopped running premaurely") time.sleep(0.01) if not self.buffer and self.end_of_file: raise StopIteration() diff --git a/tap_mambu/tap_processors/processor.py b/tap_mambu/tap_processors/processor.py index a581c0c..39c7ee6 100644 --- a/tap_mambu/tap_processors/processor.py +++ b/tap_mambu/tap_processors/processor.py @@ -66,6 +66,11 @@ def process_records(self): record_count += 1 self._process_child_records(record) counter.increment() + + # Write bookmark after thousand records + if record_count%1000 == 0: + self.write_bookmark() + return record_count def process_streams_from_generators(self): From 1c9aed4db6158c3ff2fb13d958a124c95a983ba5 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Wed, 3 Apr 2024 20:28:43 +0000 Subject: [PATCH 34/65] Adding frequent bookmarks in case of multi-threaded parent streams. 
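A minimal sketch of the frequent-bookmarking idea, assuming Singer-style state messages; process_with_periodic_bookmarks and its arguments are illustrative only, not the tap's processor:

    import singer

    def process_with_periodic_bookmarks(records, state, stream_name, flush_every=1000):
        # Emit state every `flush_every` records so an interrupted multi-threaded
        # sync can resume close to where it stopped instead of from the start.
        count = 0
        for record in records:
            singer.write_record(stream_name, record)
            count += 1
            if count % flush_every == 0:
                singer.write_state(state)
        singer.write_state(state)
        return count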
--- tap_mambu/tap_processors/multithreaded_parent_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tap_mambu/tap_processors/multithreaded_parent_processor.py b/tap_mambu/tap_processors/multithreaded_parent_processor.py index dbeaabd..6be02a9 100644 --- a/tap_mambu/tap_processors/multithreaded_parent_processor.py +++ b/tap_mambu/tap_processors/multithreaded_parent_processor.py @@ -19,6 +19,7 @@ def process_records(self): def _process_child_records(self, record): from ..sync import sync_endpoint + self.write_bookmark() super(MultithreadedParentProcessor, self)._process_child_records(record) for child_stream_name in self.endpoint_child_streams: From fdd10199a11b1d9ac155d5547526633e4ede4176 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Wed, 3 Apr 2024 21:31:20 +0000 Subject: [PATCH 35/65] Changes: 1) Increased frequeny to write bookmarks. 2) Increased overlap window of dates in date windowing. --- tap_mambu/tap_generators/multithreaded_offset_generator.py | 2 +- tap_mambu/tap_processors/processor.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index 71261bf..e23c379 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -163,7 +163,7 @@ def modify_request_params(self, start, end): { "field": self.endpoint_bookmark_field, "operator": "BEFORE", - "value": datetime.strftime(end, '%Y-%m-%dT00:00:01.000000Z') + "value": datetime.strftime(end, '%Y-%m-%dT00:10:00.000000Z') } ] diff --git a/tap_mambu/tap_processors/processor.py b/tap_mambu/tap_processors/processor.py index 39c7ee6..1b657af 100644 --- a/tap_mambu/tap_processors/processor.py +++ b/tap_mambu/tap_processors/processor.py @@ -67,8 +67,8 @@ def process_records(self): self._process_child_records(record) counter.increment() - # Write bookmark after thousand records - if record_count%1000 == 0: + # Write bookmark after hundred records + if record_count%100 == 0: self.write_bookmark() return record_count From e8789a45a6049893670bd7fe45bddab49ce07e8d Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Thu, 4 Apr 2024 14:14:13 +0000 Subject: [PATCH 36/65] Fix memory limit for loan_repayments --- tap_mambu/tap_generators/generator.py | 7 ++++++- tap_mambu/tap_generators/loan_repayments_generator.py | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index 20d2481..d09d673 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -1,3 +1,5 @@ +import time + from abc import ABC from typing import List from singer import utils, get_logger @@ -16,7 +18,7 @@ def __init__(self, stream_name, client, config, state, sub_type): self.state = state self.sub_type = sub_type self.date_windowing = False - self.date_window_size = 3 + self.date_window_size = 5 # Define parameters inside init self.params = dict() @@ -65,6 +67,9 @@ def _init_params(self): self.params = self.static_params def _all_fetch_batch_steps(self): + if len(self.buffer) > self.max_buffer_size: + return + self.prepare_batch() raw_batch = self.fetch_batch() self.buffer = transform_json(raw_batch, self.stream_name) diff --git a/tap_mambu/tap_generators/loan_repayments_generator.py b/tap_mambu/tap_generators/loan_repayments_generator.py index b13fae1..9dea519 100644 --- 
a/tap_mambu/tap_generators/loan_repayments_generator.py +++ b/tap_mambu/tap_generators/loan_repayments_generator.py @@ -1,9 +1,15 @@ from .child_generator import ChildGenerator +from typing import List class LoanRepaymentsGenerator(ChildGenerator): + def _init_buffers(self): + self.buffer: List = list() + self.max_buffer_size = 1000 + def _init_endpoint_config(self): super(LoanRepaymentsGenerator, self)._init_endpoint_config() self.endpoint_api_version = "v1" self.endpoint_api_method = "GET" - self.endpoint_path = f"loans/{self.endpoint_parent_id}/repayments" # include parent id in endpoint path + # include parent id in endpoint path + self.endpoint_path = f"loans/{self.endpoint_parent_id}/repayments" From f559a633a9faeebd8d78bcac760f3d4a39e51f2b Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Thu, 4 Apr 2024 14:34:17 +0000 Subject: [PATCH 37/65] Fix memory issue for child generators --- tap_mambu/tap_generators/child_generator.py | 5 +++++ tap_mambu/tap_generators/loan_repayments_generator.py | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tap_mambu/tap_generators/child_generator.py b/tap_mambu/tap_generators/child_generator.py index 1c3c5e0..52f9b9a 100644 --- a/tap_mambu/tap_generators/child_generator.py +++ b/tap_mambu/tap_generators/child_generator.py @@ -1,4 +1,5 @@ from .generator import TapGenerator +from typing import List class ChildGenerator(TapGenerator): @@ -10,3 +11,7 @@ def _init_endpoint_config(self): super(ChildGenerator, self)._init_endpoint_config() self.endpoint_path = f"{self.endpoint_parent_id}" # include parent id in endpoint path + def _init_buffers(self): + self.buffer: List = list() + self.max_buffer_size = 1000 + diff --git a/tap_mambu/tap_generators/loan_repayments_generator.py b/tap_mambu/tap_generators/loan_repayments_generator.py index 9dea519..4a9281a 100644 --- a/tap_mambu/tap_generators/loan_repayments_generator.py +++ b/tap_mambu/tap_generators/loan_repayments_generator.py @@ -1,11 +1,7 @@ from .child_generator import ChildGenerator -from typing import List class LoanRepaymentsGenerator(ChildGenerator): - def _init_buffers(self): - self.buffer: List = list() - self.max_buffer_size = 1000 def _init_endpoint_config(self): super(LoanRepaymentsGenerator, self)._init_endpoint_config() From ec82db9e400015bd122a01fda630dcba9dccf83a Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Thu, 4 Apr 2024 17:18:22 +0000 Subject: [PATCH 38/65] Reduce max buffer limit of parent stream --- tap_mambu/tap_generators/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index d09d673..3869a8e 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -57,7 +57,7 @@ def _init_endpoint_body(self): def _init_buffers(self): self.buffer: List = list() - self.max_buffer_size = 100000 + self.max_buffer_size = 50000 def _init_params(self): self.time_extracted = None From 9857841cef232d89c5b4a92b62bf1bcf4e5bf765 Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Fri, 5 Apr 2024 04:37:55 +0000 Subject: [PATCH 39/65] Reducing max threads for Loan accounts generator. 
--- tap_mambu/tap_generators/loan_accounts_generator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tap_mambu/tap_generators/loan_accounts_generator.py b/tap_mambu/tap_generators/loan_accounts_generator.py index 54e96fe..ce001d4 100644 --- a/tap_mambu/tap_generators/loan_accounts_generator.py +++ b/tap_mambu/tap_generators/loan_accounts_generator.py @@ -3,6 +3,10 @@ class LoanAccountsLMGenerator(MultithreadedBookmarkGenerator): + def __init__(self, stream_name, client, config, state, sub_type): + super(LoanAccountsLMGenerator, self).__init__(stream_name, client, config, state, sub_type) + self.max_threads = 5 + def _init_endpoint_config(self): super(LoanAccountsLMGenerator, self)._init_endpoint_config() self.endpoint_path = "loans:search" From 1b8973dd160957d73bb82fde296f456dd891c504 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 5 Apr 2024 08:03:01 +0000 Subject: [PATCH 40/65] Changes: 1) Fixed date windowing logic by adding 1 day extra to the end date. 2) Changed max buffer size fto 20000. --- tap_mambu/tap_generators/generator.py | 2 +- tap_mambu/tap_generators/multithreaded_offset_generator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index 3869a8e..d15459c 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -57,7 +57,7 @@ def _init_endpoint_body(self): def _init_buffers(self): self.buffer: List = list() - self.max_buffer_size = 50000 + self.max_buffer_size = 20000 def _init_params(self): self.time_extracted = None diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index e23c379..ffc30f6 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -129,7 +129,7 @@ def _all_fetch_batch_steps(self): if self.date_windowing: start_datetime = datetime_to_utc_str(str_to_localized_datetime( get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date)))[:10] - end_datetime = datetime_to_utc_str(utc_now())[:10] + end_datetime = datetime_to_utc_str(utc_now() + timedelta(days=1))[:10] start = datetime.strptime(start_datetime, '%Y-%m-%d').date() end = datetime.strptime(end_datetime, '%Y-%m-%d').date() temp = start + timedelta(days=self.date_window_size) From 6cd984a4b67bad970ed910cb9ceb3d42d87e67b9 Mon Sep 17 00:00:00 2001 From: shantanu73 Date: Fri, 5 Apr 2024 08:13:36 +0000 Subject: [PATCH 41/65] Changed BEFORE value of modify_request_params to have 1 min overlap in date windows. 
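For reference, a minimal sketch of how overlapping date windows can be laid out, assuming day-granularity windows and a one-minute overlap on the BEFORE boundary; date_windows is illustrative only, and any duplicates produced by the overlap are expected to be removed downstream:

    from datetime import date, datetime, timedelta

    def date_windows(start, end, size_days=5, overlap=timedelta(minutes=1)):
        # Each window ends slightly past the next window's start so records created
        # exactly on a boundary are not dropped between windows.
        cursor = start
        while cursor < end:
            window_end = min(cursor + timedelta(days=size_days), end)
            yield (datetime.combine(cursor, datetime.min.time()),
                   datetime.combine(window_end, datetime.min.time()) + overlap)
            cursor = window_end

    for after, before in date_windows(date(2024, 4, 1), date(2024, 4, 12)):
        print(after.isoformat(), '->', before.isoformat())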
--- tap_mambu/tap_generators/multithreaded_offset_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index ffc30f6..7ee02ff 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -163,7 +163,7 @@ def modify_request_params(self, start, end): { "field": self.endpoint_bookmark_field, "operator": "BEFORE", - "value": datetime.strftime(end, '%Y-%m-%dT00:10:00.000000Z') + "value": datetime.strftime(end, '%Y-%m-%dT00:01:00.000000Z') } ] From ec3bc764432007d077684fa5d9133363cb9095a0 Mon Sep 17 00:00:00 2001 From: Rushikesh Todkar <98420315+RushiT0122@users.noreply.github.com> Date: Sun, 7 Apr 2024 17:09:44 +0530 Subject: [PATCH 42/65] Add backoff for ChunkedEncodingError --- tap_mambu/helpers/client.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tap_mambu/helpers/client.py b/tap_mambu/helpers/client.py index 9a901fb..9df4737 100644 --- a/tap_mambu/helpers/client.py +++ b/tap_mambu/helpers/client.py @@ -1,7 +1,7 @@ import backoff import requests import requests.adapters -from requests.exceptions import ConnectionError +from requests.exceptions import ConnectionError, ChunkedEncodingError from singer import metrics, get_logger from urllib3.exceptions import ProtocolError @@ -179,7 +179,8 @@ def check_access(self): return True @backoff.on_exception(backoff.expo, - (MambuInternalServiceError, ConnectionError, MambuApiLimitError, ProtocolError), + (MambuInternalServiceError, ConnectionError, + ChunkedEncodingError, MambuApiLimitError, ProtocolError), max_tries=7, factor=3) def request(self, method, path=None, url=None, json=None, version=None, apikey_type=None, **kwargs): From 1dbdd6bd041e58f80b5b404fdcba3b7d7e37a358 Mon Sep 17 00:00:00 2001 From: Rushikesh Todkar <98420315+RushiT0122@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:18:57 +0530 Subject: [PATCH 43/65] Update loan_accounts bookmarking strategy --- tap_mambu/tap_generators/generator.py | 1 + .../tap_generators/loan_accounts_generator.py | 29 +++++++++++++-- .../multithreaded_offset_generator.py | 35 +++++++++++++++---- tap_mambu/tap_processors/processor.py | 9 +++-- 4 files changed, 62 insertions(+), 12 deletions(-) diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index d15459c..73c0bb2 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -16,6 +16,7 @@ def __init__(self, stream_name, client, config, state, sub_type): self.client = client self.config = config self.state = state + self.state_changed = True self.sub_type = sub_type self.date_windowing = False self.date_window_size = 5 diff --git a/tap_mambu/tap_generators/loan_accounts_generator.py b/tap_mambu/tap_generators/loan_accounts_generator.py index ce001d4..4b50fce 100644 --- a/tap_mambu/tap_generators/loan_accounts_generator.py +++ b/tap_mambu/tap_generators/loan_accounts_generator.py @@ -5,23 +5,46 @@ class LoanAccountsLMGenerator(MultithreadedBookmarkGenerator): def __init__(self, stream_name, client, config, state, sub_type): super(LoanAccountsLMGenerator, self).__init__(stream_name, client, config, state, sub_type) - self.max_threads = 5 + self.max_threads = 3 def _init_endpoint_config(self): super(LoanAccountsLMGenerator, self)._init_endpoint_config() self.endpoint_path = "loans:search" self.endpoint_bookmark_field = 
"lastModifiedDate" self.endpoint_sorting_criteria = { - "field": "id", + "field": "lastModifiedDate", "order": "ASC" } def prepare_batch_params(self): super(LoanAccountsLMGenerator, self).prepare_batch_params() - self.endpoint_filter_criteria[0]["value"] = datetime_to_utc_str(self.endpoint_intermediary_bookmark_value) + self.endpoint_filter_criteria[0]["value"] = datetime_to_utc_str( + self.endpoint_intermediary_bookmark_value) + + def set_last_sync_window_start(self, start): + self.state["last_sync_windows_start_lmg"] = start + self.state_changed = True + + def get_last_sync_window_start(self): + return self.state.get("last_sync_windows_start_lmg") + + def remove_last_sync_window_start(self): + if "last_sync_windows_start_ad" in self.state: + del self.state["last_sync_windows_start_lmg"] class LoanAccountsADGenerator(LoanAccountsLMGenerator): def _init_endpoint_config(self): super(LoanAccountsADGenerator, self)._init_endpoint_config() self.endpoint_bookmark_field = "lastAccountAppraisalDate" + + def set_last_sync_window_start(self, start): + self.state["last_sync_windows_start_ad"] = start + self.state_changed = True + + def get_last_sync_window_start(self): + return self.state.get("last_sync_windows_start_ad") + + def remove_last_sync_window_start(self): + if "last_sync_windows_start_ad" in self.state: + del self.state["last_sync_windows_start_ad"] diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index 7ee02ff..0a69e33 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -124,28 +124,49 @@ def preprocess_batches(self, final_buffer): self.preprocess_record(raw_record) self.last_batch_size = len(self.last_batch_set) + def set_last_sync_window_start(self, start): + self.state["last_sync_windows_start"] = start + self.state_changed = True + + def get_last_sync_window_start(self): + return self.state.get("last_sync_windows_start") + + def remove_last_sync_window_start(self): + if "last_sync_windows_start_ad" in self.state: + del self.state["last_sync_windows_start_ad"] + @backoff.on_exception(backoff.expo, RuntimeError, max_tries=5) def _all_fetch_batch_steps(self): if self.date_windowing: + last_sync_window_start = self.get_last_sync_window_start() start_datetime = datetime_to_utc_str(str_to_localized_datetime( - get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date)))[:10] - end_datetime = datetime_to_utc_str(utc_now() + timedelta(days=1))[:10] - start = datetime.strptime(start_datetime, '%Y-%m-%d').date() - end = datetime.strptime(end_datetime, '%Y-%m-%d').date() + get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date)) - timedelta(days=self.date_window_size)) + + if last_sync_window_start: + start = str_to_localized_datetime(last_sync_window_start) + else: + start = str_to_localized_datetime(start_datetime) + + end_datetime = datetime_to_utc_str(utc_now() + timedelta(days=1)) + end = str_to_localized_datetime(end_datetime) temp = start + timedelta(days=self.date_window_size) stop_iteration = True final_buffer = [] while start < end: + self.set_last_sync_window_start(datetime_to_utc_str(start)) # Limit the buffer size by holding generators from creating new batches - while len(self.buffer) > self.max_buffer_size: - time.sleep(1) + if len(self.buffer) > self.max_buffer_size: + while len(self.buffer): + time.sleep(1) self.modify_request_params(start, temp) - final_buffer, 
stop_iteration = self.collect_batches(self.queue_batches()) + final_buffer, stop_iteration = self.collect_batches( + self.queue_batches()) self.preprocess_batches(final_buffer) if not final_buffer or stop_iteration: self.offset = 0 start = temp temp = start + timedelta(days=self.date_window_size) + self.remove_last_sync_window_start() else: final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) self.preprocess_batches(final_buffer) diff --git a/tap_mambu/tap_processors/processor.py b/tap_mambu/tap_processors/processor.py index 1b657af..20f4fdd 100644 --- a/tap_mambu/tap_processors/processor.py +++ b/tap_mambu/tap_processors/processor.py @@ -1,6 +1,7 @@ from abc import ABC from singer import write_record, metadata, write_schema, get_logger, metrics, utils +from copy import deepcopy from ..helpers import convert, get_bookmark, write_bookmark from ..helpers.transformer import Transformer @@ -66,10 +67,14 @@ def process_records(self): record_count += 1 self._process_child_records(record) counter.increment() - + # Write bookmark after hundred records - if record_count%100 == 0: + if record_count % 1000 == 0: + self.write_bookmark() + + if self.generators[0].state_changed: self.write_bookmark() + self.generators[0].state_changed = False return record_count From 56eca6c3327ce02ff6e0c234f33c822e764f0cda Mon Sep 17 00:00:00 2001 From: Rushikesh Todkar <98420315+RushiT0122@users.noreply.github.com> Date: Wed, 10 Apr 2024 12:14:48 +0530 Subject: [PATCH 44/65] Interrupted sync enhancements --- .../communications_generator.py | 4 +- tap_mambu/tap_generators/generator.py | 10 ++++ .../tap_generators/loan_accounts_generator.py | 46 +++++++++++++++---- .../multithreaded_offset_generator.py | 38 +++++++++------ .../multithreaded_parent_processor.py | 4 ++ tap_mambu/tap_processors/processor.py | 1 - 6 files changed, 76 insertions(+), 27 deletions(-) diff --git a/tap_mambu/tap_generators/communications_generator.py b/tap_mambu/tap_generators/communications_generator.py index 866bffd..ab2ad0d 100644 --- a/tap_mambu/tap_generators/communications_generator.py +++ b/tap_mambu/tap_generators/communications_generator.py @@ -18,12 +18,12 @@ def modify_request_params(self, start, end): { "field": self.endpoint_bookmark_field, "operator": "AFTER", - "value": datetime.strftime(start, '%Y-%m-%dT00:00:00.000000Z') + "value": datetime.strftime(start) }, { "field": self.endpoint_bookmark_field, "operator": "BEFORE", - "value": datetime.strftime(end, '%Y-%m-%dT00:00:01.000000Z') + "value": datetime.strftime(end) }, { "field": "state", diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index 73c0bb2..312b22d 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -20,6 +20,7 @@ def __init__(self, stream_name, client, config, state, sub_type): self.sub_type = sub_type self.date_windowing = False self.date_window_size = 5 + self.start_windows_datetime_str = None # Define parameters inside init self.params = dict() @@ -128,3 +129,12 @@ def fetch_batch(self): self.time_extracted = utils.now() LOGGER.info(f'(generator) Stream {self.stream_name} - extracted records: {len(response)}') return self.transform_batch(response) + + def get_default_start_value(self): + return None + + def set_default_start_value(self, end_time): + pass + + def set_last_sync_completed(self, end_time): + pass diff --git a/tap_mambu/tap_generators/loan_accounts_generator.py b/tap_mambu/tap_generators/loan_accounts_generator.py index 4b50fce..c68c0f9 100644 
--- a/tap_mambu/tap_generators/loan_accounts_generator.py +++ b/tap_mambu/tap_generators/loan_accounts_generator.py @@ -1,5 +1,6 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator from ..helpers.datetime_utils import datetime_to_utc_str +from ..helpers import get_bookmark, write_bookmark class LoanAccountsLMGenerator(MultithreadedBookmarkGenerator): @@ -22,15 +23,33 @@ def prepare_batch_params(self): self.endpoint_intermediary_bookmark_value) def set_last_sync_window_start(self, start): - self.state["last_sync_windows_start_lmg"] = start - self.state_changed = True + if "bookmarks" not in self.state: + self.state["bookmarks"] = {} + # self.state["bookmarks"]["lmg_last_sync_windows_start"] = start + write_bookmark(self.state, "lmg_last_sync_windows_start", self.sub_type, start) def get_last_sync_window_start(self): - return self.state.get("last_sync_windows_start_lmg") + last_bookmark = get_bookmark( + self.state, self.stream_name, self.sub_type, self.start_date) + return self.state.get("bookmarks", {}).get("lmg_last_sync_windows_start", last_bookmark) def remove_last_sync_window_start(self): - if "last_sync_windows_start_ad" in self.state: - del self.state["last_sync_windows_start_lmg"] + if "lmg_last_sync_windows_start" in self.state["bookmarks"]: + del self.state["bookmarks"]["lmg_last_sync_windows_start"] + + def get_default_start_value(self): + return self.state.get("bookmarks", {}).get("lmg_last_multithread_sync_completed", self.start_date) + + def set_intermediary_bookmark(self, record_bookmark_value): + if self.endpoint_intermediary_bookmark_value is None or \ + self.compare_bookmark_values(record_bookmark_value, + self.endpoint_intermediary_bookmark_value): + self.endpoint_intermediary_bookmark_offset = 1 + return + + if record_bookmark_value == self.endpoint_intermediary_bookmark_value: + self.endpoint_intermediary_bookmark_offset += 1 + return class LoanAccountsADGenerator(LoanAccountsLMGenerator): @@ -39,12 +58,19 @@ def _init_endpoint_config(self): self.endpoint_bookmark_field = "lastAccountAppraisalDate" def set_last_sync_window_start(self, start): - self.state["last_sync_windows_start_ad"] = start - self.state_changed = True + if "bookmarks" not in self.state: + self.state["bookmarks"] = {} + write_bookmark(self.state, "ad_last_sync_windows_start", self.sub_type, start) def get_last_sync_window_start(self): - return self.state.get("last_sync_windows_start_ad") + last_bookmark = get_bookmark( + self.state, self.stream_name, self.sub_type, self.start_date) + return self.state.get("bookmarks", {}).get("ad_last_sync_windows_start", last_bookmark) def remove_last_sync_window_start(self): - if "last_sync_windows_start_ad" in self.state: - del self.state["last_sync_windows_start_ad"] + if "ad_last_sync_windows_start" in self.state["bookmarks"]: + del self.state["bookmarks"]["ad_last_sync_windows_start"] + self.state_changed = True + + def get_default_start_value(self): + return self.state.get("bookmarks", {}).get("ad_last_multithread_sync_completed", self.start_date) diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index 0a69e33..7a4fa50 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -7,8 +7,8 @@ from datetime import datetime, timedelta from .generator import TapGenerator -from ..helpers import transform_json, get_bookmark -from ..helpers.datetime_utils import 
str_to_localized_datetime, datetime_to_utc_str, utc_now +from ..helpers import transform_json, get_bookmark, write_bookmark +from ..helpers.datetime_utils import str_to_localized_datetime, datetime_to_utc_str, utc_now, str_to_datetime from ..helpers.multithreaded_requests import MultithreadedRequestsPool from ..helpers.exceptions import MambuGeneratorThreadNotAlive @@ -19,6 +19,7 @@ class MultithreadedOffsetGenerator(TapGenerator): def __init__(self, stream_name, client, config, state, sub_type): super(MultithreadedOffsetGenerator, self).__init__(stream_name, client, config, state, sub_type) self.date_windowing = True + self.start_windows_datetime_str = None def _init_params(self): self.time_extracted = None @@ -125,27 +126,35 @@ def preprocess_batches(self, final_buffer): self.last_batch_size = len(self.last_batch_set) def set_last_sync_window_start(self, start): - self.state["last_sync_windows_start"] = start + self.state["bookmarks"]["last_sync_windows_start"] = start self.state_changed = True def get_last_sync_window_start(self): - return self.state.get("last_sync_windows_start") + return self.state.get("bookmarks", {}).get("last_sync_windows_start") def remove_last_sync_window_start(self): - if "last_sync_windows_start_ad" in self.state: - del self.state["last_sync_windows_start_ad"] + if "last_sync_windows_start" in self.state: + del self.state["ad_last_sync_windows_start"] + + def set_last_sync_completed(self, end_time): + # self.state["bookmarks"]["ad_last_multithrad_sync_completed"] = datetime_to_utc_str(end_time) + last_bookmark = get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date) + if end_time < str_to_datetime(last_bookmark): + write_bookmark(self.state, self.stream_name, + self.sub_type, datetime_to_utc_str(end_time)) @backoff.on_exception(backoff.expo, RuntimeError, max_tries=5) def _all_fetch_batch_steps(self): if self.date_windowing: last_sync_window_start = self.get_last_sync_window_start() - start_datetime = datetime_to_utc_str(str_to_localized_datetime( - get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date)) - timedelta(days=self.date_window_size)) if last_sync_window_start: - start = str_to_localized_datetime(last_sync_window_start) + truncated_start_date = str_to_datetime( + last_sync_window_start).replace(hour=0, minute=0, second=0) + start = str_to_localized_datetime( + datetime_to_utc_str(truncated_start_date)) else: - start = str_to_localized_datetime(start_datetime) + start = str_to_localized_datetime(self.get_default_start_value()) end_datetime = datetime_to_utc_str(utc_now() + timedelta(days=1)) end = str_to_localized_datetime(end_datetime) @@ -153,20 +162,21 @@ def _all_fetch_batch_steps(self): stop_iteration = True final_buffer = [] while start < end: + self.remove_last_sync_window_start() self.set_last_sync_window_start(datetime_to_utc_str(start)) # Limit the buffer size by holding generators from creating new batches if len(self.buffer) > self.max_buffer_size: while len(self.buffer): time.sleep(1) - self.modify_request_params(start, temp) + self.modify_request_params(start - timedelta(minutes=5), temp) final_buffer, stop_iteration = self.collect_batches( self.queue_batches()) self.preprocess_batches(final_buffer) if not final_buffer or stop_iteration: self.offset = 0 + self.start_windows_datetime_str = start start = temp temp = start + timedelta(days=self.date_window_size) - self.remove_last_sync_window_start() else: final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) 
self.preprocess_batches(final_buffer) @@ -179,12 +189,12 @@ def modify_request_params(self, start, end): { "field": self.endpoint_bookmark_field, "operator": "AFTER", - "value": datetime.strftime(start, '%Y-%m-%dT00:00:00.000000Z') + "value": datetime_to_utc_str(start) }, { "field": self.endpoint_bookmark_field, "operator": "BEFORE", - "value": datetime.strftime(end, '%Y-%m-%dT00:01:00.000000Z') + "value": datetime_to_utc_str(end) } ] diff --git a/tap_mambu/tap_processors/multithreaded_parent_processor.py b/tap_mambu/tap_processors/multithreaded_parent_processor.py index 6be02a9..e9c6367 100644 --- a/tap_mambu/tap_processors/multithreaded_parent_processor.py +++ b/tap_mambu/tap_processors/multithreaded_parent_processor.py @@ -15,6 +15,10 @@ def process_records(self): for future in futures.as_completed(self.futures): record_count += future.result() + + for generator in self.generators: + generator.set_last_sync_completed(self.generators[0].start_windows_datetime_str) + generator.remove_last_sync_window_start() return record_count def _process_child_records(self, record): diff --git a/tap_mambu/tap_processors/processor.py b/tap_mambu/tap_processors/processor.py index 20f4fdd..f4a695c 100644 --- a/tap_mambu/tap_processors/processor.py +++ b/tap_mambu/tap_processors/processor.py @@ -1,7 +1,6 @@ from abc import ABC from singer import write_record, metadata, write_schema, get_logger, metrics, utils -from copy import deepcopy from ..helpers import convert, get_bookmark, write_bookmark from ..helpers.transformer import Transformer From 8684a688cb8d819535c9a1a75a8b4e0059a989da Mon Sep 17 00:00:00 2001 From: Rushikesh Todkar <98420315+RushiT0122@users.noreply.github.com> Date: Wed, 10 Apr 2024 14:46:55 +0530 Subject: [PATCH 45/65] Minor fix --- tap_mambu/tap_generators/loan_accounts_generator.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tap_mambu/tap_generators/loan_accounts_generator.py b/tap_mambu/tap_generators/loan_accounts_generator.py index c68c0f9..4a01f72 100644 --- a/tap_mambu/tap_generators/loan_accounts_generator.py +++ b/tap_mambu/tap_generators/loan_accounts_generator.py @@ -25,7 +25,6 @@ def prepare_batch_params(self): def set_last_sync_window_start(self, start): if "bookmarks" not in self.state: self.state["bookmarks"] = {} - # self.state["bookmarks"]["lmg_last_sync_windows_start"] = start write_bookmark(self.state, "lmg_last_sync_windows_start", self.sub_type, start) def get_last_sync_window_start(self): @@ -34,7 +33,7 @@ def get_last_sync_window_start(self): return self.state.get("bookmarks", {}).get("lmg_last_sync_windows_start", last_bookmark) def remove_last_sync_window_start(self): - if "lmg_last_sync_windows_start" in self.state["bookmarks"]: + if "lmg_last_sync_windows_start" in self.state.get("bookmarks", {}): del self.state["bookmarks"]["lmg_last_sync_windows_start"] def get_default_start_value(self): @@ -68,7 +67,7 @@ def get_last_sync_window_start(self): return self.state.get("bookmarks", {}).get("ad_last_sync_windows_start", last_bookmark) def remove_last_sync_window_start(self): - if "ad_last_sync_windows_start" in self.state["bookmarks"]: + if "ad_last_sync_windows_start" in self.state.get("bookmarks", {}): del self.state["bookmarks"]["ad_last_sync_windows_start"] self.state_changed = True From f7175ef850f9d8f4651a936024ff37e841beac0f Mon Sep 17 00:00:00 2001 From: Rushikesh Todkar <98420315+RushiT0122@users.noreply.github.com> Date: Fri, 12 Apr 2024 18:05:32 +0530 Subject: [PATCH 46/65] - Update bookmarking to fix missing child records - 
Reduce date window size to avoid race condition - Make sub-stream extractions run in sync to avoid race condition - Code refactoring --- tap_mambu/tap_generators/generator.py | 11 ++-- .../tap_generators/loan_accounts_generator.py | 53 +++++++------------ .../multithreaded_offset_generator.py | 37 ++++++++----- .../multithreaded_parent_processor.py | 2 +- tap_mambu/tap_processors/processor.py | 14 ++++- 5 files changed, 64 insertions(+), 53 deletions(-) diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index 312b22d..967ae5f 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -16,10 +16,9 @@ def __init__(self, stream_name, client, config, state, sub_type): self.client = client self.config = config self.state = state - self.state_changed = True self.sub_type = sub_type self.date_windowing = False - self.date_window_size = 5 + self.date_window_size = 1 self.start_windows_datetime_str = None # Define parameters inside init @@ -59,7 +58,7 @@ def _init_endpoint_body(self): def _init_buffers(self): self.buffer: List = list() - self.max_buffer_size = 20000 + self.max_buffer_size = 10000 def _init_params(self): self.time_extracted = None @@ -69,6 +68,8 @@ def _init_params(self): self.params = self.static_params def _all_fetch_batch_steps(self): + # Large buffer size can impact memory utilization of connector + # so empty the buffer once it reaches default max limit if len(self.buffer) > self.max_buffer_size: return @@ -133,8 +134,8 @@ def fetch_batch(self): def get_default_start_value(self): return None - def set_default_start_value(self, end_time): + def set_last_sync_completed(self, end_time): pass - def set_last_sync_completed(self, end_time): + def wait_for_slibling_to_catchup(self): pass diff --git a/tap_mambu/tap_generators/loan_accounts_generator.py b/tap_mambu/tap_generators/loan_accounts_generator.py index 4a01f72..d70f4cb 100644 --- a/tap_mambu/tap_generators/loan_accounts_generator.py +++ b/tap_mambu/tap_generators/loan_accounts_generator.py @@ -7,13 +7,15 @@ class LoanAccountsLMGenerator(MultithreadedBookmarkGenerator): def __init__(self, stream_name, client, config, state, sub_type): super(LoanAccountsLMGenerator, self).__init__(stream_name, client, config, state, sub_type) self.max_threads = 3 + self.sub_stream_name = "loan_accounts_lmg" + self.sibling_sub_stream = ["loan_accounts_adg"] def _init_endpoint_config(self): super(LoanAccountsLMGenerator, self)._init_endpoint_config() self.endpoint_path = "loans:search" self.endpoint_bookmark_field = "lastModifiedDate" self.endpoint_sorting_criteria = { - "field": "lastModifiedDate", + "field": "id", "order": "ASC" } @@ -22,22 +24,20 @@ def prepare_batch_params(self): self.endpoint_filter_criteria[0]["value"] = datetime_to_utc_str( self.endpoint_intermediary_bookmark_value) - def set_last_sync_window_start(self, start): - if "bookmarks" not in self.state: - self.state["bookmarks"] = {} - write_bookmark(self.state, "lmg_last_sync_windows_start", self.sub_type, start) - - def get_last_sync_window_start(self): - last_bookmark = get_bookmark( - self.state, self.stream_name, self.sub_type, self.start_date) - return self.state.get("bookmarks", {}).get("lmg_last_sync_windows_start", last_bookmark) - - def remove_last_sync_window_start(self): - if "lmg_last_sync_windows_start" in self.state.get("bookmarks", {}): - del self.state["bookmarks"]["lmg_last_sync_windows_start"] + def write_sub_stream_bookmark(self, start): + write_bookmark(self.state, 
self.sub_stream_name, self.sub_type, start) def get_default_start_value(self): - return self.state.get("bookmarks", {}).get("lmg_last_multithread_sync_completed", self.start_date) + # Historical sync will use start date as as date window + # Increamental syncs will use last stream bookmark value + # Interrupted syncs will use last winodow of sub-stream as first date window + stream_bookmark = get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date) + return get_bookmark(self.state, self.sub_stream_name, self.sub_type, stream_bookmark) + + def remove_sub_stream_bookmark(self): + # Remove sub-stream bookmark once we finish extraction till current date + if self.sub_stream_name in self.state.get("bookmarks", {}): + del self.state["bookmarks"][self.sub_stream_name] def set_intermediary_bookmark(self, record_bookmark_value): if self.endpoint_intermediary_bookmark_value is None or \ @@ -52,24 +52,11 @@ def set_intermediary_bookmark(self, record_bookmark_value): class LoanAccountsADGenerator(LoanAccountsLMGenerator): + def __init__(self, stream_name, client, config, state, sub_type): + super(LoanAccountsADGenerator, self).__init__(stream_name, client, config, state, sub_type) + self.sub_stream_name = "loan_accounts_adg" + self.sibling_sub_stream = ["loan_accounts_lmg"] + def _init_endpoint_config(self): super(LoanAccountsADGenerator, self)._init_endpoint_config() self.endpoint_bookmark_field = "lastAccountAppraisalDate" - - def set_last_sync_window_start(self, start): - if "bookmarks" not in self.state: - self.state["bookmarks"] = {} - write_bookmark(self.state, "ad_last_sync_windows_start", self.sub_type, start) - - def get_last_sync_window_start(self): - last_bookmark = get_bookmark( - self.state, self.stream_name, self.sub_type, self.start_date) - return self.state.get("bookmarks", {}).get("ad_last_sync_windows_start", last_bookmark) - - def remove_last_sync_window_start(self): - if "ad_last_sync_windows_start" in self.state.get("bookmarks", {}): - del self.state["bookmarks"]["ad_last_sync_windows_start"] - self.state_changed = True - - def get_default_start_value(self): - return self.state.get("bookmarks", {}).get("ad_last_multithread_sync_completed", self.start_date) diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index 7a4fa50..2f23526 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -19,7 +19,7 @@ class MultithreadedOffsetGenerator(TapGenerator): def __init__(self, stream_name, client, config, state, sub_type): super(MultithreadedOffsetGenerator, self).__init__(stream_name, client, config, state, sub_type) self.date_windowing = True - self.start_windows_datetime_str = None + self.start_windows_datetime_str = self.start_date def _init_params(self): self.time_extracted = None @@ -125,16 +125,14 @@ def preprocess_batches(self, final_buffer): self.preprocess_record(raw_record) self.last_batch_size = len(self.last_batch_set) - def set_last_sync_window_start(self, start): - self.state["bookmarks"]["last_sync_windows_start"] = start - self.state_changed = True + def write_sub_stream_bookmark(self, start): + write_bookmark(self.state, self.sub_stream_name, self.sub_type, start) - def get_last_sync_window_start(self): - return self.state.get("bookmarks", {}).get("last_sync_windows_start") + def get_default_start_value(self): + return get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date) 
- def remove_last_sync_window_start(self): - if "last_sync_windows_start" in self.state: - del self.state["ad_last_sync_windows_start"] + def remove_sub_stream_bookmark(self): + pass def set_last_sync_completed(self, end_time): # self.state["bookmarks"]["ad_last_multithrad_sync_completed"] = datetime_to_utc_str(end_time) @@ -143,10 +141,24 @@ def set_last_sync_completed(self, end_time): write_bookmark(self.state, self.stream_name, self.sub_type, datetime_to_utc_str(end_time)) + def wait_for_slibling_to_catchup(self): + keep_waiting = True + while keep_waiting: + current_bookmark = self.get_default_start_value() + for sibling in self.sibling_sub_stream: + sibling_bookmark = get_bookmark(self.state, sibling, self.sub_type, self.get_default_start_value()) + if str_to_datetime(current_bookmark) > str_to_datetime(sibling_bookmark): + keep_waiting = True + LOGGER.info(f"Waiting for sibling {sibling} thread to catch-up!") + time.sleep(5) + else: + keep_waiting = False + break + @backoff.on_exception(backoff.expo, RuntimeError, max_tries=5) def _all_fetch_batch_steps(self): if self.date_windowing: - last_sync_window_start = self.get_last_sync_window_start() + last_sync_window_start = self.get_default_start_value() if last_sync_window_start: truncated_start_date = str_to_datetime( @@ -162,8 +174,8 @@ def _all_fetch_batch_steps(self): stop_iteration = True final_buffer = [] while start < end: - self.remove_last_sync_window_start() - self.set_last_sync_window_start(datetime_to_utc_str(start)) + self.write_sub_stream_bookmark(datetime_to_utc_str(start)) + self.wait_for_slibling_to_catchup() # Limit the buffer size by holding generators from creating new batches if len(self.buffer) > self.max_buffer_size: while len(self.buffer): @@ -177,6 +189,7 @@ def _all_fetch_batch_steps(self): self.start_windows_datetime_str = start start = temp temp = start + timedelta(days=self.date_window_size) + else: final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) self.preprocess_batches(final_buffer) diff --git a/tap_mambu/tap_processors/multithreaded_parent_processor.py b/tap_mambu/tap_processors/multithreaded_parent_processor.py index e9c6367..ed93c74 100644 --- a/tap_mambu/tap_processors/multithreaded_parent_processor.py +++ b/tap_mambu/tap_processors/multithreaded_parent_processor.py @@ -18,7 +18,7 @@ def process_records(self): for generator in self.generators: generator.set_last_sync_completed(self.generators[0].start_windows_datetime_str) - generator.remove_last_sync_window_start() + generator.remove_sub_stream_bookmark() return record_count def _process_child_records(self, record): diff --git a/tap_mambu/tap_processors/processor.py b/tap_mambu/tap_processors/processor.py index f4a695c..1bee4ff 100644 --- a/tap_mambu/tap_processors/processor.py +++ b/tap_mambu/tap_processors/processor.py @@ -43,7 +43,14 @@ def _init_endpoint_config(self): "you need to use the deduplication processor") def _init_bookmarks(self): - self.last_bookmark_value = get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date) + # Since we have date window implementation in multithreaded genrators, + # we can't rely on bookmark value since if case of interruption we may miss some of the records + # lesser bookmark value record by lagging threads than bookmark written by faster thread + # Because of which in next sync we will miss parent as well as corresponding child records. 
+ # In such scenario we should resume extraction from the last date window where extration interrupted + last_bookmark = self.generators[0].get_default_start_value() + + self.last_bookmark_value = last_bookmark or get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date) self.max_bookmark_value = self.last_bookmark_value def write_schema(self): @@ -67,7 +74,7 @@ def process_records(self): self._process_child_records(record) counter.increment() - # Write bookmark after hundred records + # Write bookmark after thousand records if record_count % 1000 == 0: self.write_bookmark() @@ -107,6 +114,9 @@ def _is_record_past_bookmark(self, transformed_record, bookmark_field): if str_to_localized_datetime(transformed_record[bookmark_field]) >= \ str_to_localized_datetime(self.last_bookmark_value): return True + else: + LOGGER.info( + f"Skipped record older than bookmark: {self.stream_name} {transformed_record.get('id')}") return False def process_record(self, record, time_extracted, bookmark_field): From b1e91381fad6606f7f6314c78c70b5a43541a000 Mon Sep 17 00:00:00 2001 From: Rushikesh Todkar <98420315+RushiT0122@users.noreply.github.com> Date: Fri, 12 Apr 2024 20:49:19 +0530 Subject: [PATCH 47/65] Minor fixes --- tap_mambu/tap_generators/communications_generator.py | 6 +++--- tap_mambu/tap_generators/generator.py | 2 ++ tap_mambu/tap_generators/multithreaded_offset_generator.py | 2 ++ tap_mambu/tap_processors/processor.py | 4 ---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tap_mambu/tap_generators/communications_generator.py b/tap_mambu/tap_generators/communications_generator.py index ab2ad0d..9ece42b 100644 --- a/tap_mambu/tap_generators/communications_generator.py +++ b/tap_mambu/tap_generators/communications_generator.py @@ -1,5 +1,5 @@ from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator -from ..helpers.datetime_utils import datetime_to_local_str +from ..helpers.datetime_utils import datetime_to_local_str, datetime_to_utc_str from datetime import datetime @@ -18,12 +18,12 @@ def modify_request_params(self, start, end): { "field": self.endpoint_bookmark_field, "operator": "AFTER", - "value": datetime.strftime(start) + "value": datetime_to_utc_str(start) }, { "field": self.endpoint_bookmark_field, "operator": "BEFORE", - "value": datetime.strftime(end) + "value": datetime_to_utc_str(start) }, { "field": "state", diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index 967ae5f..09052b0 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -20,6 +20,8 @@ def __init__(self, stream_name, client, config, state, sub_type): self.date_windowing = False self.date_window_size = 1 self.start_windows_datetime_str = None + self.sub_stream_name = stream_name + self.sibling_sub_stream = None # Define parameters inside init self.params = dict() diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index 2f23526..ed73635 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -142,6 +142,8 @@ def set_last_sync_completed(self, end_time): self.sub_type, datetime_to_utc_str(end_time)) def wait_for_slibling_to_catchup(self): + if not self.sibling_sub_stream: + return keep_waiting = True while keep_waiting: current_bookmark = self.get_default_start_value() diff --git a/tap_mambu/tap_processors/processor.py 
b/tap_mambu/tap_processors/processor.py index 1bee4ff..c117a3c 100644 --- a/tap_mambu/tap_processors/processor.py +++ b/tap_mambu/tap_processors/processor.py @@ -78,10 +78,6 @@ def process_records(self): if record_count % 1000 == 0: self.write_bookmark() - if self.generators[0].state_changed: - self.write_bookmark() - self.generators[0].state_changed = False - return record_count def process_streams_from_generators(self): From 0efeb7720de83cc6ecbc59235ae721f2beec7f0f Mon Sep 17 00:00:00 2001 From: Rushikesh Todkar <98420315+RushiT0122@users.noreply.github.com> Date: Sun, 14 Apr 2024 13:38:22 +0530 Subject: [PATCH 48/65] Remove wait for thread implementation --- .../multithreaded_offset_generator.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index ed73635..196acf2 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -141,22 +141,6 @@ def set_last_sync_completed(self, end_time): write_bookmark(self.state, self.stream_name, self.sub_type, datetime_to_utc_str(end_time)) - def wait_for_slibling_to_catchup(self): - if not self.sibling_sub_stream: - return - keep_waiting = True - while keep_waiting: - current_bookmark = self.get_default_start_value() - for sibling in self.sibling_sub_stream: - sibling_bookmark = get_bookmark(self.state, sibling, self.sub_type, self.get_default_start_value()) - if str_to_datetime(current_bookmark) > str_to_datetime(sibling_bookmark): - keep_waiting = True - LOGGER.info(f"Waiting for sibling {sibling} thread to catch-up!") - time.sleep(5) - else: - keep_waiting = False - break - @backoff.on_exception(backoff.expo, RuntimeError, max_tries=5) def _all_fetch_batch_steps(self): if self.date_windowing: @@ -177,7 +161,6 @@ def _all_fetch_batch_steps(self): final_buffer = [] while start < end: self.write_sub_stream_bookmark(datetime_to_utc_str(start)) - self.wait_for_slibling_to_catchup() # Limit the buffer size by holding generators from creating new batches if len(self.buffer) > self.max_buffer_size: while len(self.buffer): @@ -191,7 +174,6 @@ def _all_fetch_batch_steps(self): self.start_windows_datetime_str = start start = temp temp = start + timedelta(days=self.date_window_size) - else: final_buffer, stop_iteration = self.collect_batches(self.queue_batches()) self.preprocess_batches(final_buffer) From ba83d275d15f8b33f27116c936d74d7218f76310 Mon Sep 17 00:00:00 2001 From: Rushikesh Todkar <98420315+RushiT0122@users.noreply.github.com> Date: Mon, 15 Apr 2024 12:50:16 +0530 Subject: [PATCH 49/65] - Empty buffer before next date window - Remove redundant code --- tap_mambu/tap_generators/generator.py | 1 - tap_mambu/tap_generators/loan_accounts_generator.py | 12 ++++++++---- .../tap_generators/multithreaded_offset_generator.py | 8 ++++---- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index 09052b0..eae6734 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -21,7 +21,6 @@ def __init__(self, stream_name, client, config, state, sub_type): self.date_window_size = 1 self.start_windows_datetime_str = None self.sub_stream_name = stream_name - self.sibling_sub_stream = None # Define parameters inside init self.params = dict() diff --git a/tap_mambu/tap_generators/loan_accounts_generator.py 
b/tap_mambu/tap_generators/loan_accounts_generator.py
index d70f4cb..5ea57e0 100644
--- a/tap_mambu/tap_generators/loan_accounts_generator.py
+++ b/tap_mambu/tap_generators/loan_accounts_generator.py
@@ -1,5 +1,5 @@
 from .multithreaded_bookmark_generator import MultithreadedBookmarkGenerator
-from ..helpers.datetime_utils import datetime_to_utc_str
+from ..helpers.datetime_utils import datetime_to_utc_str, str_to_datetime
 from ..helpers import get_bookmark, write_bookmark
 
 
@@ -8,7 +8,6 @@ def __init__(self, stream_name, client, config, state, sub_type):
         super(LoanAccountsLMGenerator, self).__init__(stream_name, client, config, state, sub_type)
         self.max_threads = 3
         self.sub_stream_name = "loan_accounts_lmg"
-        self.sibling_sub_stream = ["loan_accounts_adg"]
 
     def _init_endpoint_config(self):
         super(LoanAccountsLMGenerator, self)._init_endpoint_config()
@@ -32,7 +31,13 @@ def get_default_start_value(self):
         # Incremental syncs will use the last stream bookmark value
         # Interrupted syncs will use the last window of the sub-stream as the first date window
         stream_bookmark = get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date)
-        return get_bookmark(self.state, self.sub_stream_name, self.sub_type, stream_bookmark)
+        sub_stream_bookmark = get_bookmark(self.state, self.sub_stream_name, self.sub_type, stream_bookmark)
+        if self.compare_bookmark_values(sub_stream_bookmark, stream_bookmark):
+            start_value = stream_bookmark
+        else:
+            start_value = sub_stream_bookmark
+        truncated_start_date = datetime_to_utc_str(str_to_datetime(start_value).replace(hour=0, minute=0, second=0))
+        return truncated_start_date
 
     def remove_sub_stream_bookmark(self):
         # Remove sub-stream bookmark once we finish extraction till current date
@@ -55,7 +60,6 @@ class LoanAccountsADGenerator(LoanAccountsLMGenerator):
     def __init__(self, stream_name, client, config, state, sub_type):
         super(LoanAccountsADGenerator, self).__init__(stream_name, client, config, state, sub_type)
         self.sub_stream_name = "loan_accounts_adg"
-        self.sibling_sub_stream = ["loan_accounts_lmg"]
 
     def _init_endpoint_config(self):
         super(LoanAccountsADGenerator, self)._init_endpoint_config()
diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py
index 196acf2..0cb4026 100644
--- a/tap_mambu/tap_generators/multithreaded_offset_generator.py
+++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py
@@ -160,11 +160,11 @@ def _all_fetch_batch_steps(self):
             stop_iteration = True
             final_buffer = []
             while start < end:
+                # Empty the current buffer before moving to the next window so that all records
+                # of the current date window are processed; this reduces memory pressure and improves bookmarking
+                while len(self.buffer):
+                    time.sleep(1)
                 self.write_sub_stream_bookmark(datetime_to_utc_str(start))
-                # Limit the buffer size by holding generators from creating new batches
-                if len(self.buffer) > self.max_buffer_size:
-                    while len(self.buffer):
-                        time.sleep(1)
                 self.modify_request_params(start - timedelta(minutes=5), temp)
                 final_buffer, stop_iteration = self.collect_batches(
                     self.queue_batches())

From 179ad28b6b46b86991fe0ba0eb1bdbcd4f66c79f Mon Sep 17 00:00:00 2001
From: Rushikesh Todkar <98420315+RushiT0122@users.noreply.github.com>
Date: Wed, 17 Apr 2024 23:16:01 +0530
Subject: [PATCH 50/65] Remove frequent bookmarking

---
 tap_mambu/tap_processors/multithreaded_parent_processor.py | 1 -
 tap_mambu/tap_processors/processor.py                      | 4 ----
 2 files changed, 5 deletions(-)

diff --git 
a/tap_mambu/tap_processors/multithreaded_parent_processor.py b/tap_mambu/tap_processors/multithreaded_parent_processor.py index ed93c74..892d13d 100644 --- a/tap_mambu/tap_processors/multithreaded_parent_processor.py +++ b/tap_mambu/tap_processors/multithreaded_parent_processor.py @@ -23,7 +23,6 @@ def process_records(self): def _process_child_records(self, record): from ..sync import sync_endpoint - self.write_bookmark() super(MultithreadedParentProcessor, self)._process_child_records(record) for child_stream_name in self.endpoint_child_streams: diff --git a/tap_mambu/tap_processors/processor.py b/tap_mambu/tap_processors/processor.py index c117a3c..d8d67e7 100644 --- a/tap_mambu/tap_processors/processor.py +++ b/tap_mambu/tap_processors/processor.py @@ -74,10 +74,6 @@ def process_records(self): self._process_child_records(record) counter.increment() - # Write bookmark after thousand records - if record_count % 1000 == 0: - self.write_bookmark() - return record_count def process_streams_from_generators(self): From 0aabdafce1163d0564e9edd25430c2720e110497 Mon Sep 17 00:00:00 2001 From: Rushikesh Todkar <98420315+RushiT0122@users.noreply.github.com> Date: Thu, 2 May 2024 20:27:24 +0530 Subject: [PATCH 51/65] Reduce integration test execution time --- tests/base.py | 2 +- tests/test_sync_canary.py | 35 ----------------------------------- 2 files changed, 1 insertion(+), 36 deletions(-) delete mode 100644 tests/test_sync_canary.py diff --git a/tests/base.py b/tests/base.py index fd68374..e0f0db5 100644 --- a/tests/base.py +++ b/tests/base.py @@ -246,7 +246,7 @@ def run_and_verify_check_mode(self, conn_id): def get_properties(self, original_properties=True): properties = { - 'start_date': '2017-01-01T00:00:00Z', + 'start_date': '2021-01-01T00:00:00Z', 'username': os.environ['TAP_MAMBU_USERNAME'], 'subdomain': os.environ['TAP_MAMBU_SUBDOMAIN'], 'page_size': '100' diff --git a/tests/test_sync_canary.py b/tests/test_sync_canary.py deleted file mode 100644 index 380ed7b..0000000 --- a/tests/test_sync_canary.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Test that with no fields selected for a stream automatic fields are still replicated -""" -from tap_tester import connections -from base import MambuBaseTest - -class SyncCanaryTest(MambuBaseTest): - """ - Smoke test - """ - - @staticmethod - def name(): - return "tap_tester_mambu_sync_canary_test" - - def test_run(self): - """ - Run tap in check mode, then select all streams and all fields within streams. Run a sync and - verify exit codes do not throw errors. This is meant to be a smoke test for the tap. If this - is failing do not expect any other tests to pass. 
- """ - conn_id = connections.ensure_connection(self) - self.run_and_verify_check_mode(conn_id) - - self.select_and_verify_fields(conn_id) - - record_count_by_stream = self.run_and_verify_sync(conn_id) - - - # Assert all expected streams synced at least one record - for stream in self.expected_streams(): - with self.subTest(stream=stream): - self.assertGreater(record_count_by_stream.get(stream, 0), - 0, - msg="{} did not sync any records".format(stream)) From 77aa759472c1835e31b66e51df5954bc5672a59c Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Fri, 3 May 2024 12:37:50 +0000 Subject: [PATCH 52/65] Fix integration tests --- tests/base.py | 4 ++-- tests/test_bookmarks.py | 48 ++++++++++++++++++++++++++++++++++++++-- tests/test_pagination.py | 1 + tests/test_start_date.py | 6 +++-- 4 files changed, 53 insertions(+), 6 deletions(-) diff --git a/tests/base.py b/tests/base.py index fd68374..ae206c2 100644 --- a/tests/base.py +++ b/tests/base.py @@ -246,14 +246,14 @@ def run_and_verify_check_mode(self, conn_id): def get_properties(self, original_properties=True): properties = { - 'start_date': '2017-01-01T00:00:00Z', + 'start_date': '2021-01-07T00:00:00Z', 'username': os.environ['TAP_MAMBU_USERNAME'], 'subdomain': os.environ['TAP_MAMBU_SUBDOMAIN'], 'page_size': '100' } if not original_properties: - properties['start_date'] = '2021-01-01T00:00:00Z' + properties['start_date'] = '2019-01-01T00:00:00Z' return properties diff --git a/tests/test_bookmarks.py b/tests/test_bookmarks.py index 7c847d8..2ceba6f 100644 --- a/tests/test_bookmarks.py +++ b/tests/test_bookmarks.py @@ -12,7 +12,8 @@ def poll_state_version(conn_id): """Make the request for state version until it returns a version greater than 0""" return menagerie.get_state_version(conn_id) -class BookmarksTest(MambuBaseTest): + +class BookmarksBase(MambuBaseTest): """ Test that the tap can replicate multiple pages of data """ @@ -33,7 +34,7 @@ def subtract_day(self, bookmark): adjusted_bookmark = bookmark_dt - timedelta(days=1) return strftime(adjusted_bookmark) - def test_run(self): + def run_execution(self): """ Verify that we can get multiple pages of data for each stream """ @@ -143,3 +144,46 @@ def test_run(self): raise NotImplementedError( "invalid replication method: {}".format(replication_method) ) + + +class Bookmarkstest1(BookmarksBase): + def expected_streams(self): + return super().expected_streams() - self.untestable_streams() + + def untestable_streams(self): + return {'branches', + 'centres', + 'clients', + 'credit_arrangements', + 'communications', + 'deposit_products', + 'installments', + 'groups', + 'gl_accounts', + 'loan_transactions', + 'tasks'} + + def get_properties(self, original_properties=True): + return super().get_properties(original_properties=False) + + def test_run(self): + return self.run_execution() + + +class Bookmarkstest2(BookmarksBase): + def expected_streams(self): + return {'branches', + 'centres', + 'clients', + 'credit_arrangements', + 'deposit_products', + 'groups', + 'gl_accounts', + 'loan_transactions', + 'tasks'} - self.untestable_streams() + + def get_properties(self, original_properties=True): + return super().get_properties(original_properties) + + def test_run(self): + return self.run_execution() diff --git a/tests/test_pagination.py b/tests/test_pagination.py index bc91f28..205327d 100644 --- a/tests/test_pagination.py +++ b/tests/test_pagination.py @@ -16,6 +16,7 @@ def name(): def untestable_streams(self): return set([ + "clients", # Stream does not have enough records to test pagination 
"communications", # Need to set up Twilio or email server to send stuff ]) diff --git a/tests/test_start_date.py b/tests/test_start_date.py index 569fa3c..5cbf9e4 100644 --- a/tests/test_start_date.py +++ b/tests/test_start_date.py @@ -20,10 +20,12 @@ def name(): return "tap_tester_mambu_start_date_test" def setUp(self): - self.first_sync_start_date = self.get_properties()['start_date'] + self.first_sync_start_date = self.get_properties( + original_properties=False + )['start_date'] self.second_sync_start_date = self.get_properties( - original_properties=False + original_properties=True )['start_date'] def untestable_streams(self): From 5b691a9760af9bb3bdd2d577dd813de10d20ad95 Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Fri, 3 May 2024 13:06:18 +0000 Subject: [PATCH 53/65] Parallelise integration tests --- .circleci/config.yml | 111 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 91 insertions(+), 20 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index f344ae6..8483a6e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,8 +1,20 @@ -version: 2 -jobs: - build: +version: 2.1 +orbs: + slack: circleci/slack@3.4.2 + +executors: + docker-executor: docker: - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/circle-ci:stitch-tap-tester + +jobs: + build: + executor: docker-executor + steps: + - run: echo "CI Done" + + ensure_env: + executor: docker-executor steps: - checkout - run: @@ -12,39 +24,98 @@ jobs: source /usr/local/share/virtualenvs/tap-mambu/bin/activate pip install -U 'pip<19.2' 'setuptools<51.0.0' pip install .[dev] - - add_ssh_keys - - run: - name: 'JSON Validator' - command: | - source /usr/local/share/virtualenvs/tap-tester/bin/activate - stitch-validate-json tap_mambu/helpers/schemas/*.json + aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox /usr/local/share/virtualenvs/dev_env.sh + - slack/notify-on-failure: + only_for_branches: master + - persist_to_workspace: + root: /usr/local/share/virtualenvs + paths: + - tap-mambu + - dev_env.sh + + run_unit_tests: + executor: docker-executor + steps: + - checkout + - attach_workspace: + at: /usr/local/share/virtualenvs - run: - when: always name: 'Unit Tests' command: | source /usr/local/share/virtualenvs/tap-mambu/bin/activate pytest tests/unittests + - store_test_results: + path: test_output/report.xml + - store_artifacts: + path: htmlcov - run: - name: 'Integration Tests' + name: 'JSON Validator' command: | - aws s3 cp s3://com-stitchdata-dev-deployment-assets/environments/tap-tester/tap_tester_sandbox dev_env.sh - source dev_env.sh source /usr/local/share/virtualenvs/tap-tester/bin/activate - run-test --tap=tap-mambu tests + stitch-validate-json tap_mambu/schemas/*.json + + run_integration_tests: + executor: docker-executor + parallelism: 5 + steps: + - checkout + - attach_workspace: + at: /usr/local/share/virtualenvs + - run: + name: 'Run Integration Tests' + no_output_timeout: 30m + command: | + source /usr/local/share/virtualenvs/dev_env.sh + mkdir /tmp/${CIRCLE_PROJECT_REPONAME} + export STITCH_CONFIG_DIR=/tmp/${CIRCLE_PROJECT_REPONAME} + source /usr/local/share/virtualenvs/tap-tester/bin/activate + circleci tests glob "tests/test_*.py" | circleci tests split > ./tests-to-run + if [ -s ./tests-to-run ]; then + for test_file in $(cat ./tests-to-run) + do + echo $test_file > $STITCH_CONFIG_DIR/tap_test.txt + run-test --tap=${CIRCLE_PROJECT_REPONAME} $test_file + done + fi + - slack/notify-on-failure: + only_for_branches: master + - 
store_artifacts: + path: /tmp/tap-mambu + workflows: version: 2 - commit: + commit: &commit_jobs jobs: + - ensure_env: + context: + - circleci-user + - tier-1-tap-user + - run_unit_tests: + context: + - circleci-user + - tier-1-tap-user + requires: + - ensure_env + - run_integration_tests: + context: + - circleci-user + - tier-1-tap-user + requires: + - ensure_env - build: - context: circleci-user + context: + - circleci-user + - tier-1-tap-user + requires: + - run_pylint + - run_unit_tests + - run_integration_tests build_daily: + <<: *commit_jobs triggers: - schedule: - cron: "0 0 * * *" + cron: "0 1 * * *" filters: branches: only: - master - jobs: - - build: - context: circleci-user From cd3ae4395b212ef5ec8d5853ccfe296931629b96 Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Fri, 3 May 2024 13:21:59 +0000 Subject: [PATCH 54/65] fix config.yml --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8483a6e..dc6118f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -107,7 +107,6 @@ workflows: - circleci-user - tier-1-tap-user requires: - - run_pylint - run_unit_tests - run_integration_tests build_daily: From a68007165330a6dbae6855fdd7bbcbbb2a25f33d Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Fri, 3 May 2024 13:27:28 +0000 Subject: [PATCH 55/65] fix config.yml --- .circleci/config.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index dc6118f..fcbd571 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -48,11 +48,6 @@ jobs: path: test_output/report.xml - store_artifacts: path: htmlcov - - run: - name: 'JSON Validator' - command: | - source /usr/local/share/virtualenvs/tap-tester/bin/activate - stitch-validate-json tap_mambu/schemas/*.json run_integration_tests: executor: docker-executor From 8eae933a5a7f1a74e91863a7b3a7661cbf3bed49 Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Mon, 6 May 2024 08:37:09 +0000 Subject: [PATCH 56/65] Read window_ize from config --- tap_mambu/__init__.py | 3 ++- tap_mambu/helpers/client.py | 4 +++- tap_mambu/tap_generators/generator.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tap_mambu/__init__.py b/tap_mambu/__init__.py index 287a4ae..5fa2cae 100644 --- a/tap_mambu/__init__.py +++ b/tap_mambu/__init__.py @@ -35,7 +35,8 @@ def main(): parsed_args.config['subdomain'], parsed_args.config.get('apikey_audit'), int(parsed_args.config.get('page_size', DEFAULT_PAGE_SIZE)), - user_agent=parsed_args.config['user_agent']) as client: + user_agent=parsed_args.config['user_agent'], + window_size=parsed_args.config['window_size']) as client: state = {} if parsed_args.state: diff --git a/tap_mambu/helpers/client.py b/tap_mambu/helpers/client.py index 9df4737..80e6d34 100644 --- a/tap_mambu/helpers/client.py +++ b/tap_mambu/helpers/client.py @@ -119,13 +119,15 @@ def __init__(self, subdomain, apikey_audit, page_size, - user_agent=''): + user_agent='', + window_size=1): self.__username = username self.__password = password self.__subdomain = subdomain base_url = "https://{}.mambu.com/api".format(subdomain) self.base_url = base_url self.page_size = page_size + self.window_size=window_size self.__user_agent = f'MambuTap-{user_agent}' if user_agent else 'MambuTap' self.__apikey = apikey self.__session = requests.Session() diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index eae6734..c898edf 100644 --- a/tap_mambu/tap_generators/generator.py +++ 
b/tap_mambu/tap_generators/generator.py @@ -18,7 +18,7 @@ def __init__(self, stream_name, client, config, state, sub_type): self.state = state self.sub_type = sub_type self.date_windowing = False - self.date_window_size = 1 + self.date_window_size = client.window_size self.start_windows_datetime_str = None self.sub_stream_name = stream_name From fa9d9e3fd561365394f4ea143b00d28646b3b01d Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Mon, 6 May 2024 16:13:35 +0000 Subject: [PATCH 57/65] Minor window size fixes --- tap_mambu/__init__.py | 2 +- tap_mambu/tap_generators/generator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tap_mambu/__init__.py b/tap_mambu/__init__.py index 5fa2cae..71236bd 100644 --- a/tap_mambu/__init__.py +++ b/tap_mambu/__init__.py @@ -36,7 +36,7 @@ def main(): parsed_args.config.get('apikey_audit'), int(parsed_args.config.get('page_size', DEFAULT_PAGE_SIZE)), user_agent=parsed_args.config['user_agent'], - window_size=parsed_args.config['window_size']) as client: + window_size=parsed_args.config.get('window_size')) as client: state = {} if parsed_args.state: diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index c898edf..39e1ced 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -18,7 +18,7 @@ def __init__(self, stream_name, client, config, state, sub_type): self.state = state self.sub_type = sub_type self.date_windowing = False - self.date_window_size = client.window_size + self.date_window_size = int(client.window_size) or 1 self.start_windows_datetime_str = None self.sub_stream_name = stream_name From c37fedf10a511e828498fb16150bf163e4a8d24b Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Mon, 6 May 2024 16:18:36 +0000 Subject: [PATCH 58/65] - Fix bookmarking for FULL_TABLE multi-threaded streams - Minor refactoring --- tap_mambu/tap_generators/clients_generator.py | 3 +++ tap_mambu/tap_generators/multithreaded_offset_generator.py | 1 - tap_mambu/tap_processors/processor.py | 5 ++++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tap_mambu/tap_generators/clients_generator.py b/tap_mambu/tap_generators/clients_generator.py index d014dfa..fe42767 100644 --- a/tap_mambu/tap_generators/clients_generator.py +++ b/tap_mambu/tap_generators/clients_generator.py @@ -19,3 +19,6 @@ def _init_endpoint_config(self): def prepare_batch_params(self): super(ClientsGenerator, self).prepare_batch_params() self.endpoint_filter_criteria[0]["value"] = datetime_to_local_str(self.endpoint_intermediary_bookmark_value) + + def write_sub_stream_bookmark(self, start): + pass diff --git a/tap_mambu/tap_generators/multithreaded_offset_generator.py b/tap_mambu/tap_generators/multithreaded_offset_generator.py index 0cb4026..f5a3fd7 100644 --- a/tap_mambu/tap_generators/multithreaded_offset_generator.py +++ b/tap_mambu/tap_generators/multithreaded_offset_generator.py @@ -135,7 +135,6 @@ def remove_sub_stream_bookmark(self): pass def set_last_sync_completed(self, end_time): - # self.state["bookmarks"]["ad_last_multithrad_sync_completed"] = datetime_to_utc_str(end_time) last_bookmark = get_bookmark(self.state, self.stream_name, self.sub_type, self.start_date) if end_time < str_to_datetime(last_bookmark): write_bookmark(self.state, self.stream_name, diff --git a/tap_mambu/tap_processors/processor.py b/tap_mambu/tap_processors/processor.py index d8d67e7..f6585e0 100644 --- a/tap_mambu/tap_processors/processor.py +++ b/tap_mambu/tap_processors/processor.py @@ -6,6 +6,7 @@ from 
..helpers.transformer import Transformer from ..helpers.exceptions import NoDeduplicationCapabilityException from ..helpers.datetime_utils import utc_now, str_to_datetime, datetime_to_utc_str, str_to_localized_datetime +from ..helpers.schema import STREAMS LOGGER = get_logger() @@ -80,7 +81,9 @@ def process_streams_from_generators(self): self.write_schema() record_count = self.process_records() - self.write_bookmark() + if STREAMS.get(self.stream_name).get("replication_method") == "INCREMENTAL": + self.write_bookmark() + return record_count # This function is provided for processors with child streams, must be overridden if child streams are to be used From f101dff58f32ea59a2bc74caa4c05bc59ff96b9e Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Mon, 6 May 2024 16:25:07 +0000 Subject: [PATCH 59/65] - Add 30 days window size to reduce integration test execution time - Revert previous integration test changes --- tests/base.py | 7 +++--- tests/test_bookmarks.py | 50 +++------------------------------------- tests/test_start_date.py | 6 ++--- 3 files changed, 9 insertions(+), 54 deletions(-) diff --git a/tests/base.py b/tests/base.py index ae206c2..6b51a83 100644 --- a/tests/base.py +++ b/tests/base.py @@ -246,14 +246,15 @@ def run_and_verify_check_mode(self, conn_id): def get_properties(self, original_properties=True): properties = { - 'start_date': '2021-01-07T00:00:00Z', + 'start_date': '2017-01-01T00:00:00Z', 'username': os.environ['TAP_MAMBU_USERNAME'], 'subdomain': os.environ['TAP_MAMBU_SUBDOMAIN'], - 'page_size': '100' + 'page_size': '90', + 'window_size': 30 } if not original_properties: - properties['start_date'] = '2019-01-01T00:00:00Z' + properties['start_date'] = '2021-01-01T00:00:00Z' return properties diff --git a/tests/test_bookmarks.py b/tests/test_bookmarks.py index 2ceba6f..19b70a3 100644 --- a/tests/test_bookmarks.py +++ b/tests/test_bookmarks.py @@ -12,8 +12,7 @@ def poll_state_version(conn_id): """Make the request for state version until it returns a version greater than 0""" return menagerie.get_state_version(conn_id) - -class BookmarksBase(MambuBaseTest): +class BookmarksTest(MambuBaseTest): """ Test that the tap can replicate multiple pages of data """ @@ -34,7 +33,7 @@ def subtract_day(self, bookmark): adjusted_bookmark = bookmark_dt - timedelta(days=1) return strftime(adjusted_bookmark) - def run_execution(self): + def test_run(self): """ Verify that we can get multiple pages of data for each stream """ @@ -112,7 +111,7 @@ def run_execution(self): # Verify the second sync records fall between simulated bookmark value and the # final bookmark value for message in second_sync_messages: - lower_bound = strptime_to_utc(simulated_bookmark_value) + lower_bound = strptime_to_utc(simulated_bookmark_value) - timedelta(minutes=5) upper_bound = strptime_to_utc(second_sync_bookmark_value) record = message.get('data') actual_values = [strptime_to_utc(record.get(replication_key)) @@ -144,46 +143,3 @@ def run_execution(self): raise NotImplementedError( "invalid replication method: {}".format(replication_method) ) - - -class Bookmarkstest1(BookmarksBase): - def expected_streams(self): - return super().expected_streams() - self.untestable_streams() - - def untestable_streams(self): - return {'branches', - 'centres', - 'clients', - 'credit_arrangements', - 'communications', - 'deposit_products', - 'installments', - 'groups', - 'gl_accounts', - 'loan_transactions', - 'tasks'} - - def get_properties(self, original_properties=True): - return 
super().get_properties(original_properties=False) - - def test_run(self): - return self.run_execution() - - -class Bookmarkstest2(BookmarksBase): - def expected_streams(self): - return {'branches', - 'centres', - 'clients', - 'credit_arrangements', - 'deposit_products', - 'groups', - 'gl_accounts', - 'loan_transactions', - 'tasks'} - self.untestable_streams() - - def get_properties(self, original_properties=True): - return super().get_properties(original_properties) - - def test_run(self): - return self.run_execution() diff --git a/tests/test_start_date.py b/tests/test_start_date.py index 5cbf9e4..569fa3c 100644 --- a/tests/test_start_date.py +++ b/tests/test_start_date.py @@ -20,12 +20,10 @@ def name(): return "tap_tester_mambu_start_date_test" def setUp(self): - self.first_sync_start_date = self.get_properties( - original_properties=False - )['start_date'] + self.first_sync_start_date = self.get_properties()['start_date'] self.second_sync_start_date = self.get_properties( - original_properties=True + original_properties=False )['start_date'] def untestable_streams(self): From 993883a939c9ad2cf5a7b30b15aacb45c85eefe3 Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Wed, 8 May 2024 08:02:23 +0000 Subject: [PATCH 60/65] Update window_size implementation --- tap_mambu/helpers/client.py | 11 +++++++++-- tap_mambu/helpers/constants.py | 1 + tap_mambu/tap_generators/generator.py | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tap_mambu/helpers/client.py b/tap_mambu/helpers/client.py index 80e6d34..1905c73 100644 --- a/tap_mambu/helpers/client.py +++ b/tap_mambu/helpers/client.py @@ -5,6 +5,7 @@ from singer import metrics, get_logger from urllib3.exceptions import ProtocolError +from tap_mambu.helpers.constants import DEFAULT_DATE_WINDOW_SIZE LOGGER = get_logger() class ClientError(Exception): @@ -120,14 +121,20 @@ def __init__(self, apikey_audit, page_size, user_agent='', - window_size=1): + window_size=DEFAULT_DATE_WINDOW_SIZE): self.__username = username self.__password = password self.__subdomain = subdomain base_url = "https://{}.mambu.com/api".format(subdomain) self.base_url = base_url self.page_size = page_size - self.window_size=window_size + try: + self.window_size = int(float(window_size)) if window_size else DEFAULT_DATE_WINDOW_SIZE + if self.window_size <= 0: + raise ValueError() + except ValueError: + raise Exception("The entered window size '{}' is invalid; it should be a valid non-zero integer.".format(window_size)) + self.__user_agent = f'MambuTap-{user_agent}' if user_agent else 'MambuTap' self.__apikey = apikey self.__session = requests.Session() diff --git a/tap_mambu/helpers/constants.py b/tap_mambu/helpers/constants.py index fef71d5..bb05042 100644 --- a/tap_mambu/helpers/constants.py +++ b/tap_mambu/helpers/constants.py @@ -1 +1,2 @@ DEFAULT_PAGE_SIZE = 200 +DEFAULT_DATE_WINDOW_SIZE = 1 diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py index 39e1ced..c898edf 100644 --- a/tap_mambu/tap_generators/generator.py +++ b/tap_mambu/tap_generators/generator.py @@ -18,7 +18,7 @@ def __init__(self, stream_name, client, config, state, sub_type): self.state = state self.sub_type = sub_type self.date_windowing = False - self.date_window_size = int(client.window_size) or 1 + self.date_window_size = client.window_size self.start_windows_datetime_str = None self.sub_stream_name = stream_name From 8a256258cfed56912e598caa3b6c3d7c24b946ee Mon Sep 17 00:00:00 2001 From: RushiT0122 Date: Wed, 8 May 2024 08:08:36 +0000 Subject: [PATCH 
61/65] Add window_size unittests --- tests/unittests/test_window_size.py | 86 +++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 tests/unittests/test_window_size.py diff --git a/tests/unittests/test_window_size.py b/tests/unittests/test_window_size.py new file mode 100644 index 0000000..ad95729 --- /dev/null +++ b/tests/unittests/test_window_size.py @@ -0,0 +1,86 @@ +from parameterized import parameterized +import mock + +from tap_mambu.helpers.client import MambuClient +from tap_mambu.helpers.constants import DEFAULT_DATE_WINDOW_SIZE, DEFAULT_PAGE_SIZE + +config = { + "username": "YOUR_USERNAME", + "password": "YOUR_PASSWORD", + "apikey": "YOUR_APIKEY", + "subdomain": "YOUR_SUBDOMAIN", + "start_date": "2019-01-01T00:00:00Z", + "lookback_window": 30, + "user_agent": "tap-mambu ", + "page_size": "500", + "apikey_audit": "AUDIT_TRAIL_APIKEY"} + + +@mock.patch("tap_mambu.helpers.client.MambuClient.check_access") +class TestGetWindowSize(): + def test_non_value(self, mock_check_access): + """ + Test if no window size is not passed in the config, then set it to the default value. + """ + # Verify that the default window size value is set. + with MambuClient(config.get('username'), + config.get('password'), + config.get('apikey'), + config['subdomain'], + config.get('apikey_audit'), + int(config.get('page_size', DEFAULT_PAGE_SIZE)), + user_agent=config['user_agent'], + window_size=config.get('window_size')) as client: + # Verify window size value is expected + assert client.window_size == DEFAULT_DATE_WINDOW_SIZE + + @parameterized.expand([ + ["None_value", None, DEFAULT_DATE_WINDOW_SIZE], + ["integer_value", 10, 10], + ["float_value", 100.5, 100], + ["string_integer", "10", 10], + ["string_float", "100.5", 100], + ]) + def test_window_size_values(self, mock_check_access, name, date_window_size, expected_value): + """ + Test that for the valid value of window size, + No exception is raised and the expected value is set. + """ + with MambuClient(config.get('username'), + config.get('password'), + config.get('apikey'), + config['subdomain'], + config.get('apikey_audit'), + int(config.get('page_size', DEFAULT_PAGE_SIZE)), + user_agent=config['user_agent'], + window_size=date_window_size) as client: + # Verify window size value is expected + assert client.window_size == expected_value + + @parameterized.expand([ + ["zero_string", "0"], + ["less_than_1_string", "0.5"], + ["negative_value", -10], + ["string_negative_value", "-100"], + ["string_alphabate", "abc"], + ]) + def test_invalid_value(self, mock_check_access, name, date_window_size): + """ + Test that for invalid value exception is raised. + """ + actual_exc_string = "" + expected_exc_string = "The entered window size '{}' is invalid; it should be a valid non-zero integer.".format(date_window_size) + try: + MambuClient(config.get('username'), + config.get('password'), + config.get('apikey'), + config['subdomain'], + config.get('apikey_audit'), + int(config.get('page_size', DEFAULT_PAGE_SIZE)), + user_agent=config['user_agent'], + window_size=date_window_size) + except Exception as e: + # Verify that the exception message is expected. 
+            actual_exc_string = str(e)
+
+        assert actual_exc_string == expected_exc_string

From 4fe5582b0059f4fc783164e3ffff806d72ae5c81 Mon Sep 17 00:00:00 2001
From: RushiT0122 
Date: Wed, 8 May 2024 08:09:08 +0000
Subject: [PATCH 62/65] Update README file

---
 README.md | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 22af6b1..72c8042 100644
--- a/README.md
+++ b/README.md
@@ -254,10 +254,13 @@ This tap:
         "lookback_window": 30,
         "user_agent": "tap-mambu ",
         "page_size": "500",
-        "apikey_audit": "AUDIT_TRAIL_APIKEY"
+        "apikey_audit": "AUDIT_TRAIL_APIKEY",
+        "window_size": 7
     }
     ```
-    
+
+    Note: The `window_size` parameter defaults to 1 day, which may cause slowdowns in historical syncs for streams that use the multi-threaded implementation. Conversely, using a larger `window_size` could lead to potential `out-of-memory` issues. It is advisable to select an optimal `window_size` based on the `start_date` and the volume of data to mitigate these concerns.
+
     Optionally, also create a `state.json` file. `currently_syncing` is an optional attribute used for identifying the last object to be synced in case the job is interrupted mid-stream. The next run would begin where the last job left off.
 
     ```json

From 94838137e4c4cc35b0e34799bfa6fdbc0f1f969e Mon Sep 17 00:00:00 2001
From: RushiT0122 
Date: Wed, 8 May 2024 08:15:50 +0000
Subject: [PATCH 63/65] Install parameterized module for unittests

---
 .circleci/config.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index fcbd571..b586bf2 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -43,6 +43,7 @@ jobs:
           name: 'Unit Tests'
           command: |
             source /usr/local/share/virtualenvs/tap-mambu/bin/activate
+            pip install parameterized
             pytest tests/unittests
       - store_test_results:
           path: test_output/report.xml

From f82eac5857d7b9818461c25edfeaa8ba4cc074a2 Mon Sep 17 00:00:00 2001
From: RushiT0122 
Date: Wed, 8 May 2024 11:45:11 +0000
Subject: [PATCH 64/65] Fix review comments

---
 tap_mambu/tap_generators/generator.py | 2 --
 tests/test_pagination.py              | 1 -
 2 files changed, 3 deletions(-)

diff --git a/tap_mambu/tap_generators/generator.py b/tap_mambu/tap_generators/generator.py
index c898edf..4af7e69 100644
--- a/tap_mambu/tap_generators/generator.py
+++ b/tap_mambu/tap_generators/generator.py
@@ -1,5 +1,3 @@
-import time
-
 from abc import ABC
 from typing import List
 from singer import utils, get_logger
diff --git a/tests/test_pagination.py b/tests/test_pagination.py
index 205327d..bc91f28 100644
--- a/tests/test_pagination.py
+++ b/tests/test_pagination.py
@@ -16,7 +16,6 @@ def name():
 
     def untestable_streams(self):
         return set([
-            "clients", # Stream does not have enough records to test pagination
             "communications", # Need to set up Twilio or email server to send stuff
         ])
 

From da2f222f38b023695380940e08fa6e3de2a99819 Mon Sep 17 00:00:00 2001
From: shantanu73 
Date: Mon, 13 May 2024 08:07:44 +0000
Subject: [PATCH 65/65] Bump version 4.2.0

---
 CHANGELOG.md | 8 ++++++++
 setup.py     | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7058feb..e8cfd7c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## 4.2.0
+  * Performance improvements [#113](https://github.com/singer-io/tap-mambu/pull/113)
+    * Implement date windowing (default size = 1 day) and pagination for multi-threaded streams
+    * Limit generator buffer growth to finite boundaries
+    * Revise bookmark strategy for multi-threaded generators to address data 
discrepancies + * Segregate LoanAccounts sub-stream bookmarking to rectify data inconsistencies + * Eliminate performance metrics to reduce performance overheads + ## 4.1.0 * Change clients stream to full table sync [#111](https://github.com/singer-io/tap-mambu/pull/111) diff --git a/setup.py b/setup.py index 17baaa1..77fc623 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-mambu', - version='4.1.0', + version='4.2.0', description='Singer.io tap for extracting data from the Mambu 2.0 API', author='jeff.huth@bytecode.io', classifiers=['Programming Language :: Python :: 3 :: Only'],