# fix(bigquery): add rate limiting for api calls made #4967
```diff
@@ -15,6 +15,7 @@
 from dateutil.relativedelta import relativedelta
 from google.cloud.bigquery import Client as BigQueryClient
 from google.cloud.logging_v2.client import Client as GCPLoggingClient
+from ratelimiter import RateLimiter
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.reflection import Inspector
```
```diff
@@ -456,9 +457,15 @@ def _get_bigquery_log_entries(
             f"Start loading log entries from BigQuery start_time={start_time} and end_time={end_time}"
         )
         for client in clients:
-            entries = client.list_entries(
-                filter_=filter, page_size=self.config.log_page_size
-            )
+            if self.config.rate_limit:
+                with RateLimiter(max_calls=self.config.requests_per_min, period=60):
+                    entries = client.list_entries(
+                        filter_=filter, page_size=self.config.log_page_size
+                    )
+            else:
+                entries = client.list_entries(
+                    filter_=filter, page_size=self.config.log_page_size
+                )
             for entry in entries:
                 self.report.num_total_log_entries += 1
                 yield entry
```
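For context on the mechanism this PR relies on: the `ratelimiter` package's `RateLimiter` is a sliding-window limiter used as a context manager. Each entry into the `with` block counts as one call, and once `max_calls` entries have occurred within `period` seconds, the next entry blocks until the window frees up. A minimal, self-contained sketch of those semantics (the numbers are illustrative, not from this PR):

```python
import time

from ratelimiter import RateLimiter

# Allow at most 2 "calls" (with-block entries) per 1-second window.
limiter = RateLimiter(max_calls=2, period=1)

start = time.monotonic()
for i in range(4):
    with limiter:
        # Only entering the block is throttled; whatever runs inside
        # (e.g. client.list_entries(...)) counts as a single call.
        print(f"call {i} at t={time.monotonic() - start:.2f}s")
# Expected: calls 0 and 1 run immediately; call 2 blocks until the
# 1-second window opened by call 0 has elapsed, and so on.
```

Note that the call history lives on the `RateLimiter` instance, so a single instance must be reused across calls for the window to accumulate.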
```diff
@@ -519,7 +526,11 @@ def _get_exported_bigquery_audit_metadata(
             f"Finished loading log entries from BigQueryAuditMetadata in {dataset}"
         )

-        yield from query_job
+        if self.config.rate_limit:
+            with RateLimiter(max_calls=self.config.requests_per_min, period=60):
+                yield from query_job
+        else:
+            yield from query_job

     # Currently we only parse JobCompleted events but in future we would want to parse other
     # events to also create field level lineage.
```

Reviewer comment on this hunk:

> Same here, like above.
Changes in the second file:
```diff
@@ -15,6 +15,7 @@
 from google.cloud.bigquery import Client as BigQueryClient
 from google.cloud.logging_v2.client import Client as GCPLoggingClient
 from more_itertools import partition
+from ratelimiter import RateLimiter

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.time_window_config import get_time_bucket
```
```diff
@@ -817,7 +818,11 @@ def _get_exported_bigquery_audit_metadata(
         logger.info(
             f"Finished loading log entries from BigQueryAuditMetadata in {dataset}"
         )
-        yield from query_job
+        if self.config.rate_limit:
+            with RateLimiter(max_calls=self.config.requests_per_min, period=60):
+                yield from query_job
+        else:
+            yield from query_job

     def _get_entry_timestamp(
         self, entry: Union[AuditLogEntry, BigQueryAuditMetadata]
```

Reviewer comments on this hunk:

> I think rate-limiting should not happen here but at the point where the returned generator is used.

> Same here, the rate limiting should happen at `_create_lineage_map`.
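The reviewers' concern can be made concrete: `query_job` and `client.list_entries(...)` are lazy iterables that issue further API requests while they are being consumed, so a `with RateLimiter(...)` wrapped around `yield from` acquires the limiter exactly once, before any page is fetched, and the per-page calls then run unthrottled. A toy illustration under that assumption (`fake_paged_results` is a hypothetical stand-in for the real paged API):

```python
from ratelimiter import RateLimiter


def fake_paged_results():
    # Stand-in for query_job / client.list_entries: each yielded page
    # corresponds to another API request made lazily at iteration time.
    for page in range(3):
        print(f"API request for page {page}")
        yield page


def rate_limited_yield():
    # Mirrors the PR's pattern: the limiter is entered a single time,
    # before iteration starts, so all pages are fetched unthrottled.
    with RateLimiter(max_calls=1, period=60):
        yield from fake_paged_results()


for _ in rate_limited_yield():
    pass  # all three "API requests" happen with no rate limiting
```

Hence the suggestion to move the limiter to the point where the generator is actually consumed.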
```diff
@@ -884,11 +889,16 @@ def _get_bigquery_log_entries_via_gcp_logging(
         ] = list()
         for client in clients:
             try:
-                list_entries: Iterable[
-                    Union[AuditLogEntry, BigQueryAuditMetadata]
-                ] = client.list_entries(
-                    filter_=filter, page_size=self.config.log_page_size
-                )
+                list_entries: Iterable[Union[AuditLogEntry, BigQueryAuditMetadata]]
+                if self.config.rate_limit:
+                    with RateLimiter(max_calls=self.config.requests_per_min, period=60):
+                        list_entries = client.list_entries(
+                            filter_=filter, page_size=self.config.log_page_size
+                        )
+                else:
+                    list_entries = client.list_entries(
+                        filter_=filter, page_size=self.config.log_page_size
+                    )
                 list_entry_generators_across_clients.append(list_entries)
             except Exception as e:
                 logger.warning(
```
A new config module is added:
```diff
@@ -0,0 +1,13 @@
+import pydantic
+
+from datahub.configuration.common import ConfigModel
+
+
+class BigQueryBaseConfig(ConfigModel):
+    rate_limit: bool = pydantic.Field(
+        default=False, description="Should we rate limit requests made to API."
+    )
+    requests_per_min: int = pydantic.Field(
+        default=60,
+        description="Used to control number of API calls made per min. Only used when `rate_limit` is set to `True`.",
+    )
```
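To show how these fields behave, here is a usage sketch. It re-declares the model from the diff so it runs standalone, since the new module's import path is not shown in the PR; it also assumes `ConfigModel` is a pydantic `BaseModel` subclass, as it is elsewhere in DataHub:

```python
import pydantic

from datahub.configuration.common import ConfigModel


# Re-declared from the diff above so this sketch is self-contained.
class BigQueryBaseConfig(ConfigModel):
    rate_limit: bool = pydantic.Field(
        default=False, description="Should we rate limit requests made to API."
    )
    requests_per_min: int = pydantic.Field(
        default=60,
        description="Used to control number of API calls made per min. Only used when `rate_limit` is set to `True`.",
    )


# Rate limiting is off by default.
assert BigQueryBaseConfig().rate_limit is False

# In practice the values arrive parsed from a recipe's source config dict.
cfg = BigQueryBaseConfig.parse_obj({"rate_limit": True, "requests_per_min": 120})
assert cfg.rate_limit and cfg.requests_per_min == 120
```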
Reviewer comment:

> I think rate-limiting should happen on `_create_lineage_map` and not here, which only returns a generator.
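A sketch of what the reviewers are asking for, under stated assumptions: `_create_lineage_map` exists in this codebase, but the helper below and the commented-out consumption site are hypothetical; the point is only where the limiter sits. Each `next()` on the generator is taken under the limiter, so the lazy page fetches are throttled at consumption time:

```python
from typing import Iterable, Iterator, TypeVar

from ratelimiter import RateLimiter

T = TypeVar("T")


def rate_limited(entries: Iterable[T], max_calls: int, period: float = 60) -> Iterator[T]:
    """Yield from `entries`, acquiring the limiter before each step.

    One shared limiter instance is used so the sliding window accumulates
    across iterations (a fresh instance per step would never throttle).
    """
    limiter = RateLimiter(max_calls=max_calls, period=period)
    it = iter(entries)
    while True:
        with limiter:
            try:
                entry = next(it)
            except StopIteration:
                return
        yield entry


# Hypothetical consumption site, in the spirit of _create_lineage_map:
# def _create_lineage_map(self, entries):
#     if self.config.rate_limit:
#         entries = rate_limited(entries, self.config.requests_per_min)
#     for entry in entries:
#         ...  # build lineage from each entry
```

One caveat: `next()` only triggers a real API request on page boundaries, so throttling per entry over-throttles relative to true request counting; the sketch shows placement, not tuning.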