Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

prepare_iter_for_array PR #43

Open
wants to merge 23 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
23242ef
Adds C impl for is_gen_copy_values. 1.3x faster.
chaburkland Apr 26, 2021
16375c6
Adds initial framework for prepare_iter_for_array
chaburkland Apr 26, 2021
783b47c
Initial iteration on prepare_iter_for_array.
chaburkland Apr 27, 2021
4ab7a7c
Initial effort to clean up isinstance checks. Cleans up some ref coun…
chaburkland Apr 28, 2021
3613f35
Adds FrozenAutoMap to module to avoid imports. Cleans up perf test an…
chaburkland Apr 28, 2021
5d56ad7
Adds automap to requirements.
chaburkland Apr 28, 2021
fb76295
Simplifies and cleans up unit test.
chaburkland Apr 28, 2021
e1914d3
Comments out a failing test temporarily.
chaburkland Apr 28, 2021
590ca5f
Comments out another failing test temporarily.
chaburkland Apr 28, 2021
8d4665d
Uncomments the failing tests.
chaburkland Apr 28, 2021
e5d3b49
Fixes incorrect enum type check logic - address 32 bit issue with int…
chaburkland Apr 28, 2021
cd845e8
Merge branch 'master' into 9/prepare_iter_for_array
chaburkland May 26, 2021
5eb35b5
Addresses pyi failures.
chaburkland May 26, 2021
d344602
Updates reference python to reflect new changes.
chaburkland May 26, 2021
0e89909
Updates c source to return object/None instead of a boolean.
chaburkland May 26, 2021
bf342e8
Aligns c closer to python source.
chaburkland May 26, 2021
de76312
Optimizes prepare_iter_to_array and cleans up big_int logic.
chaburkland May 28, 2021
449f910
Merge branch 'master' into 9/prepare_iter_for_array
chaburkland May 28, 2021
0201b00
Adds in a missing cast.
chaburkland May 28, 2021
9c24888
Merge branch 'master' into 9/prepare_iter_for_array
chaburkland Jun 1, 2021
8368df6
Fixes __init__.py
chaburkland Jun 1, 2021
5239d53
Removes unnecessary enum isinstance check.
chaburkland Jun 1, 2021
26116ef
Merge branch 'master' into 9/prepare_iter_for_array
chaburkland Jun 2, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 123 additions & 2 deletions performance/__main__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import argparse
import collections
import datetime
import timeit
import argparse
import typing as tp
from enum import Enum

from automap import FrozenAutoMap
import numpy as np

from performance.reference.util import mloc as mloc_ref
Expand All @@ -17,6 +20,8 @@
from performance.reference.util import dtype_from_element as dtype_from_element_ref
from performance.reference.util import array_deepcopy as array_deepcopy_ref
from performance.reference.util import isna_element as isna_element_ref
from performance.reference.util import is_gen_copy_values as is_gen_copy_values_ref
from performance.reference.util import prepare_iter_for_array as prepare_iter_for_array_ref

from performance.reference.array_go import ArrayGO as ArrayGOREF

Expand All @@ -32,6 +37,8 @@
from arraykit import dtype_from_element as dtype_from_element_ak
from arraykit import array_deepcopy as array_deepcopy_ak
from arraykit import isna_element as isna_element_ak
from arraykit import is_gen_copy_values as is_gen_copy_values_ak
from arraykit import prepare_iter_for_array as prepare_iter_for_array_ak

from arraykit import ArrayGO as ArrayGOAK

Expand Down Expand Up @@ -360,6 +367,120 @@ class IsNaElementPerfREF(IsNaElementPerf):


#-------------------------------------------------------------------------------
class IsGenCopyValues(Perf):
    '''
    Benchmark fixture for ``is_gen_copy_values``.

    Subclasses supply ``entry``, the implementation under test.
    '''
    # Repetition count read by the benchmark runner.
    NUMBER = 2500

    def pre(self):
        '''Build the sample inputs: a list, a tuple, a FrozenAutoMap, a set and a dict.'''
        # NOTE(review): no generator or iterator input is included here — confirm
        # whether that path of is_gen_copy_values should also be timed.
        self.objects = [
            [1, 2, 3],
            (1, 2, 3),
            FrozenAutoMap((1, 2, 3)),
            {1, 2, 3},
            {1:1, 2:2, 3:3},
        ]

    def main(self):
        '''Call ``self.entry`` on every sample object, 200 passes per invocation.'''
        for _ in range(200):
            for obj in self.objects:
                self.entry(obj)

class IsGenCopyValuesAK(IsGenCopyValues):
    '''Runs the IsGenCopyValues benchmark against the arraykit implementation.'''
    # staticmethod keeps the function from being bound as a method via self.entry
    entry = staticmethod(is_gen_copy_values_ak)

class IsGenCopyValuesREF(IsGenCopyValues):
    '''Runs the IsGenCopyValues benchmark against the Python reference implementation.'''
    # staticmethod keeps the function from being bound as a method via self.entry
    entry = staticmethod(is_gen_copy_values_ref)


#-------------------------------------------------------------------------------
class PrepareIterForArray(Perf):
    '''
    Benchmark fixture for ``prepare_iter_for_array``.

    Subclasses supply ``entry``, the implementation under test.
    '''
    # Repetition count read by the benchmark runner.
    NUMBER = 5
    # Names of the methods on this class to be timed — presumably looked up by
    # the runner via getattr; confirm against the runner.
    FUNCTIONS = ('iter_small', 'iter_large')

    def pre(self):
        '''Build the small and large input collections used by the timed methods.'''
        # Generators mixing ints with None (and, for c, a trailing tuple).
        def a() -> tp.Iterator[tp.Any]:
            for i in range(3):
                yield i
            yield None

        def b() -> tp.Iterator[tp.Any]:
            yield None
            for i in range(3):
                yield i

        def c() -> tp.Iterator[tp.Any]:
            yield 10
            yield None
            for i in range(3):
                yield i
            yield (3,4)

        class E(Enum):
            A = 1
            B = 2
            C = 3

        # Short inputs: strings, mixed str/int, nested tuples, bools, floats,
        # large ints, ranges, empty containers, a FrozenAutoMap and Enum members.
        self.small_iterables = [
            ('a', 'b', 'c'),
            ('a', 'b', 3),
            ('a', 'b', (1, 2)),
            [True, False, True],
            (1, 2, 4.3, 2),
            (1, 2, 4.3, 2, None),
            (1, 2, 4.3, 2, 'g'),
            range(4),
            [3, 2, (3,4)],
            [300000000000000002, 5000000000000000001],
            range(3, 7),
            [0.0, 36_028_797_018_963_969],
            (x for x in ()),
            list(),
            tuple(),
            dict(),
            set(),
            FrozenAutoMap((1, 2, 3, 4, 5, 6)),
            [E.A, E.B, E.C],
        ]

        # Add an iterator over each of the inputs above; the list comprehension
        # is materialized first, so the list is not extended while being iterated.
        self.small_iterables.extend([iter(iterable) for iterable in self.small_iterables])
        self.small_iterables.extend((a(), b(), c()))

        # Long variants of the same kinds of input, where the value that forces
        # a decision (a tuple, None, a str, a big int) is placed at the end.
        self.large_iterables = [
            ('a', 'b', 'c') * 10000,
            ('a', 'b', 'c') * 10000 + (1, ),
            ('a', 'b', 'c') * 10000 + ((1, 2), ),
            [True, False, True] * 10000,
            (1, 2, 4.3, 2) * 10000,
            (1, 2, 4.3, 2) * 10000 + (None, ),
            (1, 2, 4.3, 2) * 10000 + ('g', ),
            range(10000),
            [3, 2, 1] * 10000 + [(3,4)],
            [300000000000000002] * 20000 + [5000000000000000001],
            range(30000, 40000),
            [0.0] * 20000 + [36_028_797_018_963_969],
            FrozenAutoMap(range(10000)),
            [E.A, E.B, E.C] * 10000,
        ]
        self.large_iterables.extend([iter(iterable) for iterable in self.large_iterables])

    def iter_small(self):
        '''Call ``self.entry`` on every small input with both ``restrict_copy`` settings, 2000 passes.'''
        # NOTE(review): the iterators and generators created in pre() are
        # single-use, so after the first pass over small_iterables they are
        # exhausted and later passes time the empty-iterator path for those
        # entries — confirm this is intended.
        for _ in range(2000):
            for restrict_copy in (True, False):
                for iterable in self.small_iterables:
                    self.entry(iterable, restrict_copy=restrict_copy)

    def iter_large(self):
        '''Call ``self.entry`` once on every large input for each ``restrict_copy`` setting.'''
        # NOTE(review): the iterator entries are consumed during the
        # restrict_copy=True pass, so the restrict_copy=False pass sees them
        # already exhausted — confirm this is intended.
        for restrict_copy in (True, False):
            for iterable in self.large_iterables:
                self.entry(iterable, restrict_copy=restrict_copy)

class PrepareIterForArrayAK(PrepareIterForArray):
    '''Runs the PrepareIterForArray benchmark against the arraykit implementation.'''
    # staticmethod keeps the function from being bound as a method via self.entry
    entry = staticmethod(prepare_iter_for_array_ak)

class PrepareIterForArrayREF(PrepareIterForArray):
    '''Runs the PrepareIterForArray benchmark against the Python reference implementation.'''
    # staticmethod keeps the function from being bound as a method via self.entry
    entry = staticmethod(prepare_iter_for_array_ref)

#-------------------------------------------------------------------------------


def get_arg_parser():

Expand Down Expand Up @@ -398,7 +519,7 @@ def main():
number=cls_runner.NUMBER)
records.append((cls_perf.__name__, func_attr, results['ak'], results['ref'], results['ref'] / results['ak']))

width = 24
width = 32
for record in records:
print(''.join(
(r.ljust(width) if isinstance(r, str) else str(round(r, 8)).ljust(width)) for r in record
Expand Down
106 changes: 106 additions & 0 deletions performance/reference/util.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import typing as tp
from copy import deepcopy
from collections import abc
from automap import FrozenAutoMap # pylint: disable = E0611

import numpy as np

DtypeSpecifier = tp.Optional[tp.Union[str, np.dtype, type]]

DTYPE_DATETIME_KIND = 'M'
DTYPE_TIMEDELTA_KIND = 'm'
DTYPE_COMPLEX_KIND = 'c'
Expand All @@ -25,6 +29,17 @@
DTYPES_BOOL = (DTYPE_BOOL,)
DTYPES_INEXACT = (DTYPE_FLOAT_DEFAULT, DTYPE_COMPLEX_DEFAULT)

INEXACT_TYPES = (float, complex, np.inexact) # inexact matches floating, complexfloating

DICTLIKE_TYPES = (abc.Set, dict, FrozenAutoMap)

# iterables that cannot be used in NP array constructors; assumes that dictlike
# types have already been identified
INVALID_ITERABLE_FOR_ARRAY = (abc.ValuesView, abc.KeysView)

# integers above this value will lose precision when coerced to a float
INT_MAX_COERCIBLE_TO_FLOAT = 9_007_199_256_349_108


def mloc(array: np.ndarray) -> int:
'''Return the memory location of an array.
Expand Down Expand Up @@ -216,3 +231,94 @@ def dtype_from_element(value: tp.Optional[tp.Hashable]) -> np.dtype:
# NOTE: calling array and getting dtype on np.nan is faster than combining isinstance, isnan calls
return np.array(value).dtype


def is_gen_copy_values(values: tp.Iterable[tp.Any]) -> tp.Tuple[bool, bool]:
    '''
    Classify an iterable by whether it is unsized and whether it must be copied.

    Args:
        values: the iterable to classify.

    Returns:
        is_gen: True if values has no ``__len__`` and is therefore treated as a generator.
        copy_values: True if values cannot be used in an np.array constructor.
    '''
    if not hasattr(values, '__len__'):
        # We are a generator and all generators need copies
        return True, True

    # Dict-like iterables (and key / value views) need copies; any other sized
    # iterable can be handed to the array constructor as-is
    copy_values = isinstance(values, DICTLIKE_TYPES + INVALID_ITERABLE_FOR_ARRAY)
    return False, copy_values


def prepare_iter_for_array(
        values: tp.Iterable[tp.Any],
        restrict_copy: bool = False
        ) -> tp.Tuple[DtypeSpecifier, bool, tp.Sequence[tp.Any]]:
    '''
    Determine an appropriate DtypeSpecifier for values in an iterable.
    This does not try to determine the actual dtype, but instead, if the DtypeSpecifier needs to be
    object rather than None (which lets NumPy auto detect).
    This is expected to only operate on 1D data.

    Args:
        values: can be a generator that will be exhausted in processing;
            if a generator, a copy will be made and returned as values.
        restrict_copy: if True, reject making a copy, even if a generator is given

    Returns:
        resolved_dtype: ``object`` if the values are ambiguous for NumPy, otherwise None.
        has_tuple: True if an element with ``__len__`` (other than str / bytes) was found.
        values: the copied list if a copy was made, otherwise the values as given.
    '''
    is_gen, copy_values = is_gen_copy_values(values)

    if not is_gen and len(values) == 0: #type: ignore
        return None, False, values #type: ignore

    if restrict_copy:
        copy_values = False

    # Always take one explicit iterator. An unsized object that is re-iterable
    # (defines __iter__ but is not itself an iterator) would otherwise be
    # restarted by the extend() below, duplicating elements in the copy.
    # For true generators and iterators iter() returns the same object.
    v_iter = iter(values)

    if copy_values:
        values_post = []

    resolved = None # None is valid specifier if the type is not ambiguous

    has_tuple = False
    has_str = False
    has_non_str = False
    has_inexact = False
    has_big_int = False

    for v in v_iter:
        if copy_values:
            # if a generator, have to make a copy while iterating
            values_post.append(v)

        value_type = type(v)

        if (value_type is str
                or value_type is np.str_
                or value_type is bytes
                or value_type is np.bytes_):
            # must compare to both string types
            has_str = True
        elif hasattr(v, '__len__'):
            # any other sized element is treated like a tuple: it forces
            # object dtype and nothing later can change that, so stop here
            has_tuple = True
            resolved = object
            break
        else:
            has_non_str = True
            if value_type in INEXACT_TYPES:
                has_inexact = True
            elif value_type is int and abs(v) > INT_MAX_COERCIBLE_TO_FLOAT:
                has_big_int = True

        # mixing strings with non-strings, or big ints with inexact values,
        # requires object dtype
        if (has_str and has_non_str) or (has_big_int and has_inexact):
            resolved = object
            break

    if copy_values:
        # the loop may have stopped early; consume the rest of the same iterator
        values_post.extend(v_iter)
        return resolved, has_tuple, values_post

    return resolved, has_tuple, values #type: ignore
2 changes: 1 addition & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ numpy==1.17.4
pytest==3.8.0
pylint==2.7.4
invoke==1.4.0

automap==0.4.8
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
numpy==1.17.4
automap==0.4.8

4 changes: 4 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# pylint: disable=C0414

from ._arraykit import __version__
from ._arraykit import FrozenAutoMap as FrozenAutoMap
from ._arraykit import ArrayGO as ArrayGO
from ._arraykit import immutable_filter as immutable_filter
from ._arraykit import mloc as mloc
Expand All @@ -16,3 +17,6 @@
from ._arraykit import resolve_dtype_iter as resolve_dtype_iter
from ._arraykit import isna_element as isna_element
from ._arraykit import dtype_from_element as dtype_from_element
from ._arraykit import is_gen_copy_values as is_gen_copy_values
from ._arraykit import prepare_iter_for_array as prepare_iter_for_array

7 changes: 7 additions & 0 deletions src/__init__.pyi
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import typing as tp
from automap import FrozenAutoMap

import numpy as np # type: ignore

_T = tp.TypeVar('_T')
_DtypeSpecifier = tp.Optional[tp.Union[str, np.dtype, type]]

__version__: str

Expand Down Expand Up @@ -32,4 +34,9 @@ def resolve_dtype(__d1: np.dtype, __d2: np.dtype) -> np.dtype: ...
def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ...
def isna_element(__value: tp.Any) -> bool: ...
def dtype_from_element(__value: tp.Optional[tp.Hashable]) -> np.dtype: ...
# Returns the pair (is_gen, copy_values) for the given iterable.
def is_gen_copy_values(__values: tp.Iterable[tp.Any]) -> tp.Tuple[bool, bool]: ...
# Returns the triple (resolved_dtype, has_tuple, values); restrict_copy is optional.
def prepare_iter_for_array(
    __values: tp.Iterable[tp.Any],
    restrict_copy: bool = ...,
) -> tp.Tuple[_DtypeSpecifier, bool, tp.Sequence[tp.Any]]: ...

Loading