-
Notifications
You must be signed in to change notification settings - Fork 7.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(ingest): Created a faster ingestion mode - pipeline (#1750)
* Unify pgvector and postgres connection settings
* Remove local changes
* Update file pgvector->postgres
* postgresql should be postgres
* Adding pipeline ingestion mode
* Disable Hugging Face parallelism. Continue on file-to-doc transform failure
* Semaphore to limit docq async workers. ETA reporting
- Loading branch information
Showing
5 changed files
with
301 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
import datetime | ||
import logging | ||
import math | ||
import time | ||
from collections import deque | ||
from typing import Any | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def human_time(*args: Any, **kwargs: Any) -> str:
    """Format a duration as a short human-readable string.

    All arguments are forwarded unchanged to ``datetime.timedelta``, so the
    duration can be given as e.g. ``human_time(seconds=90)`` or
    ``human_time(days=1, hours=2)``.

    Returns:
        For durations below two seconds, the value in milliseconds
        (e.g. ``"1500.0ms"``). Otherwise a space-separated breakdown into
        years (365 days), days, hours, minutes and whole seconds, with
        zero-valued components omitted (e.g. ``"1h 1m 1s"``).
    """
    secs = datetime.timedelta(*args, **kwargs).total_seconds()
    # We want (ms) precision below 2 seconds.
    if secs < 2:
        return f"{secs * 1000}ms"

    units = (("y", 86400 * 365), ("d", 86400), ("h", 3600), ("m", 60), ("s", 1))
    parts = []
    for unit, mul in units:
        # Peel off as many whole units as fit; for the final "s" unit this
        # also drops the fractional (ms) component.
        n = int(secs // mul)
        secs -= n * mul
        if n:
            parts.append(f"{n}{unit}")
    return " ".join(parts)
|
||
|
||
def eta(iterator: list[Any]) -> Any:
    """Yield every item of ``iterator`` while periodically logging an ETA.

    The first report is logged once 30s have passed, and then again each
    time a further 60s have passed since the previous report.
    """
    total = len(iterator)
    _eta = ETA(total)
    # Arm the report timer: this call only schedules the first report 30s out,
    # its return value is deliberately ignored.
    _eta.needReport(30)
    for processed, item in enumerate(iterator, start=1):
        yield item
        # Execution resumes here when the consumer asks for the next item.
        _eta.update(processed)
        if not _eta.needReport(60):
            continue
        logger.info(f"{processed}/{total} - ETA {_eta.human_time()}")
|
||
|
||
class ETA:
    """Predict how long something will take to complete.

    Progress is recorded as ``(timestamp, count)`` samples via ``update``.
    A least-squares line fitted through the most recent samples gives the
    processing rate and the projected completion time.
    """

    def __init__(self, total: int):
        self.total: int = total  # Total expected records.
        self.rate: float = 0.0  # per second
        # Sliding window of (time.time(), count) samples; only the 100 most
        # recent samples are kept.
        self._timing_data: deque[tuple[float, int]] = deque(maxlen=100)
        self.secondsLeft: float = 0.0  # Refreshed by _calc().
        self.nexttime: float = 0.0  # Epoch time after which needReport() fires.

    def human_time(self) -> str:
        """Return the remaining time and rate, or "(computing)" if unknown."""
        if self._calc():
            return f"{human_time(seconds=self.secondsLeft)} @ {int(self.rate * 60)}/min"
        return "(computing)"

    def update(self, count: int) -> None:
        """Record that ``count`` records have been processed as of now."""
        # count should be in the range 1 to self.total
        assert count > 0
        assert count <= self.total
        self._timing_data.append((time.time(), count))  # (X,Y) for regression

    def needReport(self, whenSecs: int) -> bool:
        """Return True when a report is due and schedule the next one.

        When the current time is past the scheduled report time, the next
        report is scheduled ``whenSecs`` seconds from now and True is
        returned; otherwise nothing changes and False is returned.
        """
        now = time.time()
        if now > self.nexttime:
            self.nexttime = now + whenSecs
            return True
        return False

    def _calc(self) -> bool:
        """Refresh ``rate`` and ``secondsLeft`` from the recorded samples.

        Returns:
            True when a prediction was produced; False when there are fewer
            than three samples or the samples do not define a usable slope
            (identical timestamps, or no change in count).
        """
        samples = len(self._timing_data)
        # Require at least three samples before attempting a prediction.
        if samples < 3:
            return False

        mean_x = sum(stamp for stamp, _ in self._timing_data) / samples
        mean_y = sum(done for _, done in self._timing_data) / samples

        # Least-squares slope m = Sxy / Sxx. This is algebraically the same as
        # pearson_r * (std_y / std_x) but has a single, explicit failure mode.
        sum_xy, sum_sq_x = 0.0, 0.0
        for stamp, done in self._timing_data:
            dx = stamp - mean_x
            sum_xy += dx * (done - mean_y)
            sum_sq_x += dx * dx
        if sum_sq_x == 0:
            # Every sample carries the same timestamp (coarse clock): the
            # slope is undefined, so no prediction can be made yet.
            return False

        m = self.rate = sum_xy / sum_sq_x
        if m == 0:
            # No progress across the window: completion time is unbounded.
            return False

        # Regression line y = mx + b solved for y == total. Written relative
        # to the means so the large intercept term never has to be formed.
        y = self.total
        x = mean_x + (y - mean_y) / m

        # Fitted line: the regression line shifted horizontally so that it
        # passes through the most recent sample.
        last_stamp, count = self._timing_data[-1]
        fitted_x = last_stamp + (y - count) / m

        # Blend the two estimates, weighting the fitted line by the fraction
        # of work already done.
        eta_epoch = ((fitted_x - x) * (count / self.total)) + x

        self.secondsLeft = max(eta_epoch - time.time(), 0)
        return True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters