From a256efd58d27070fb9122ad765b07edea35b4aec Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Sun, 17 Oct 2021 22:49:56 +0200 Subject: [PATCH] Removed benches for pyarrow. (#535) --- benchmarks/.gitignore | 1 - benchmarks/bench_read.py | 49 -------------------- benchmarks/run.py | 20 -------- benchmarks/summarize.py | 51 -------------------- parquet_integration/bench_write.py | 74 ------------------------------ 5 files changed, 195 deletions(-) delete mode 100644 benchmarks/.gitignore delete mode 100644 benchmarks/bench_read.py delete mode 100644 benchmarks/run.py delete mode 100644 benchmarks/summarize.py delete mode 100644 parquet_integration/bench_write.py diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore deleted file mode 100644 index 344f079e498..00000000000 --- a/benchmarks/.gitignore +++ /dev/null @@ -1 +0,0 @@ -runs diff --git a/benchmarks/bench_read.py b/benchmarks/bench_read.py deleted file mode 100644 index cab190fe2cd..00000000000 --- a/benchmarks/bench_read.py +++ /dev/null @@ -1,49 +0,0 @@ -import timeit -import io -import os -import json - -import pyarrow.parquet - - -def _bench_single(log2_size: int, column: str, use_dict: bool) -> float: - if use_dict: - path = f"fixtures/pyarrow3/v1/dict/benches_{2**log2_size}.parquet" - else: - path = f"fixtures/pyarrow3/v1/benches_{2**log2_size}.parquet" - with open(path, "rb") as f: - data = f.read() - data = io.BytesIO(data) - - def f(): - pyarrow.parquet.read_table(data, columns=[column]) - - seconds = timeit.Timer(f).timeit(number=512) / 512 - ns = seconds * 1000 * 1000 * 1000 - return ns - - -def _report(name: str, result: float): - path = f"benchmarks/runs/{name}/new" - os.makedirs(path, exist_ok=True) - with open(f"{path}/estimates.json", "w") as f: - json.dump({"mean": {"point_estimate": result}}, f) - - -def _bench(size, ty): - column, use_dict = { - "i64": ("int64", False), - "bool": ("bool", False), - "utf8": ("string", False), - "utf8 dict": ("string", True), - }[ty] - - result = _bench_single(size, column, use_dict) - print(result) - _report(f"read {ty} 2_{size}", result) - - -for size in range(10, 22, 2): - for ty in ["i64", "bool", "utf8", "utf8 dict"]: - print(size, ty) - _bench(size, ty) diff --git a/benchmarks/run.py b/benchmarks/run.py deleted file mode 100644 index a707f23f1bd..00000000000 --- a/benchmarks/run.py +++ /dev/null @@ -1,20 +0,0 @@ -import subprocess - - -# run pyarrow -subprocess.call(["python", "benchmarks/bench_read.py"]) - - -for ty in ["i64", "bool", "utf8", "utf8 dict"]: - args = [ - "cargo", - "bench", - "--features", - "io_parquet,io_parquet_compression", - "--bench", - "read_parquet", - "--", - f"{ty} 2", - ] - - subprocess.call(args) diff --git a/benchmarks/summarize.py b/benchmarks/summarize.py deleted file mode 100644 index a44c0ac182f..00000000000 --- a/benchmarks/summarize.py +++ /dev/null @@ -1,51 +0,0 @@ -import json -import os - - -def _read_reports(engine: str): - root = { - "arrow2": "target/criterion", - "pyarrow": "benchmarks/runs", - }[engine] - - result = [] - for item in os.listdir(root): - if item == "report": - continue - - with open(os.path.join(root, item, "new", "estimates.json")) as f: - data = json.load(f) - - ms = data["mean"]["point_estimate"] / 1000 - task = item.split()[0] - type = " ".join(item.split()[1:-1]) - size = int(item.split()[-1].split("_")[1]) - result.append( - { - "engine": engine, - "task": task, - "type": type, - "size": size, - "time": ms, - } - ) - return result - - -def _print_report(result): - for ty in ["i64", "bool", "utf8", "utf8 dict"]: - print(ty) - r = filter(lambda x: x["type"] == ty, result) - r = sorted(r, key=lambda x: x["size"]) - for row in r: - print(row["time"]) - - -def print_report(): - for engine in ["arrow2", "pyarrow"]: - print(engine) - result = _read_reports(engine) - _print_report(result) - - -print_report() diff --git a/parquet_integration/bench_write.py b/parquet_integration/bench_write.py deleted file mode 100644 index 2c47912205c..00000000000 --- a/parquet_integration/bench_write.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Benchmark of writing a pyarrow table of size N to parquet. -""" -import io -import os -import timeit - -import numpy -import pyarrow.parquet - - -def case_basic_nullable(size = 1): - int64 = [0, 1, None, 3, None, 5, 6, 7, None, 9] - float64 = [0.0, 1.0, None, 3.0, None, 5.0, 6.0, 7.0, None, 9.0] - string = ["Hello", None, "aa", "", None, "abc", None, None, "def", "aaa"] - boolean = [True, None, False, False, None, True, None, None, True, True] - - fields = [ - pa.field('int64', pa.int64()), - pa.field('float64', pa.float64()), - pa.field('string', pa.utf8()), - pa.field('bool', pa.bool_()), - pa.field('date', pa.timestamp('ms')), - pa.field('uint32', pa.uint32()), - ] - schema = pa.schema(fields) - - return { - "int64": int64 * size, - "float64": float64 * size, - "string": string * size, - "bool": boolean * size, - "date": int64 * size, - "uint32": int64 * size, - }, schema, f"basic_nullable_{size*10}.parquet" - -def bench(log2_size: int, datatype: str): - - if datatype == 'int64': - data = [0, 1, None, 3, 4, 5, 6, 7] * 128 # 1024 entries - field = pyarrow.field('int64', pyarrow.int64()) - elif datatype == 'utf8': - # 4 each because our own benches also use 4 - data = ["aaaa", "aaab", None, "aaac", "aaad", "aaae", "aaaf", "aaag"] * 128 # 1024 entries - field = pyarrow.field('utf8', pyarrow.utf8()) - elif datatype == 'bool': - data = [True, False, None, True, False, True, True, True] * 128 # 1024 entries - field = pyarrow.field('bool', pyarrow.bool_()) - - data = data * 2**log2_size - - t = pyarrow.table([data], schema=pyarrow.schema([field])) - - def f(): - pyarrow.parquet.write_table(t, - io.BytesIO(), - use_dictionary=False, - compression=None, - write_statistics=False, - data_page_size=2**40, # i.e. a large number to ensure a single page - data_page_version="1.0") - - seconds = timeit.Timer(f).timeit(number=512) / 512 - microseconds = seconds * 1000 * 1000 - print(f"write {datatype} 2^{10 + log2_size} time: {microseconds:.2f} us") - -for i in range(0, 12, 2): - bench(i, "int64") - -for i in range(0, 12, 2): - bench(i, "utf8") - -for i in range(0, 12, 2): - bench(i, "bool")