diff --git a/.gitignore b/.gitignore
index 300033e8..e55b9691 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@
 .vscode
 __pycache__
 build
+*.diff
diff --git a/arraykit.c b/arraykit.c
index ce30a0d5..c570d1b8 100644
--- a/arraykit.c
+++ b/arraykit.c
@@ -5,6 +5,8 @@
 # define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
 
 # include "numpy/arrayobject.h"
+# include "numpy/arrayscalars.h" // Needed for Datetime scalar expansions
+# include "numpy/ufuncobject.h"
 
 //------------------------------------------------------------------------------
 // Macros
@@ -41,6 +43,12 @@
         return NULL;\
     } while (0)
 
+// To simplify lines merely checking for `!value`; do/while keeps it a single statement
+# define AK_RETURN_NULL_IF_NOT(obj) \
+    do { \
+        if (!(obj)) { return NULL; } \
+    } while (0)
+
 # define _AK_DEBUG_BEGIN() \
     do { \
         fprintf(stderr, "XXX %s:%i:%s: ", __FILE__, __LINE__, __FUNCTION__);
@@ -117,9 +125,8 @@ PyArray_Descr*
 AK_ResolveDTypeIter(PyObject *dtypes)
 {
     PyObject *iterator = PyObject_GetIter(dtypes);
-    if (iterator == NULL) {
-        return NULL;
-    }
+    AK_RETURN_NULL_IF_NOT(iterator);
+
     PyArray_Descr *resolved = NULL;
     PyArray_Descr *dtype;
     while ((dtype = (PyArray_Descr*) PyIter_Next(iterator))) {
@@ -249,9 +256,9 @@ shape_filter(PyObject *Py_UNUSED(m), PyObject *a)
     AK_CHECK_NUMPY_ARRAY_1D_2D(a);
     PyArrayObject *array = (PyArrayObject *)a;
 
-    int size0 = PyArray_DIM(array, 0);
+    npy_intp size0 = PyArray_DIM(array, 0);
     // If 1D array, set size for axis 1 at 1, else use 2D array to get the size of axis 1
-    int size1 = PyArray_NDIM(array) == 1 ? 1 : PyArray_DIM(array, 1);
+    npy_intp size1 = PyArray_NDIM(array) == 1 ? 1 : PyArray_DIM(array, 1);
-    return Py_BuildValue("ii", size0, size1);
+    return Py_BuildValue("nn", (Py_ssize_t)size0, (Py_ssize_t)size1); // "n" reads Py_ssize_t; "i" would read int
 }
 
@@ -335,11 +342,8 @@ static PyObject *
 resolve_dtype(PyObject *Py_UNUSED(m), PyObject *args)
 {
     PyArray_Descr *d1, *d2;
-    if (!PyArg_ParseTuple(args, "O!O!:resolve_dtype",
-            &PyArrayDescr_Type, &d1, &PyArrayDescr_Type, &d2))
-    {
-        return NULL;
-    }
+    AK_RETURN_NULL_IF_NOT(PyArg_ParseTuple(args, "O!O!:resolve_dtype",
+            &PyArrayDescr_Type, &d1, &PyArrayDescr_Type, &d2));
     return (PyObject *)AK_ResolveDTypes(d1, d2);
 }
 
@@ -349,6 +353,209 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg)
     return (PyObject *)AK_ResolveDTypeIter(arg);
 }
 
+//------------------------------------------------------------------------------
+// isin
+
+// Delegate to numpy: call `numpy.in1d` for a 1D `array`, `numpy.isin` otherwise,
+// passing `assume_unique` as a keyword. Returns a new reference, or NULL with an
+// exception set.
+static PyObject *
+AK_isin_array_dtype_use_np(PyArrayObject *array, PyArrayObject *other, int assume_unique)
+{
+    PyObject* result = NULL;
+
+    PyObject* args = PyTuple_Pack(2, (PyObject*)array, (PyObject*)other);
+    AK_RETURN_NULL_IF_NOT(args);
+
+    PyObject* kwarg = PyDict_New();
+    if (!kwarg) {
+        Py_DECREF(args);
+        return NULL;
+    }
+
+    PyObject* assume_unique_obj = PyLong_FromLong((long)assume_unique);
+    if (!assume_unique_obj) {
+        goto failure;
+    }
+
+    int success = PyDict_SetItemString(kwarg, "assume_unique", assume_unique_obj);
+    Py_DECREF(assume_unique_obj);
+    if (success == -1) {
+        goto failure;
+    }
+
+    PyObject* numpy = PyImport_ImportModule("numpy");
+    if (!numpy) {
+        goto failure;
+    }
+
+    PyObject* func = PyObject_GetAttrString(numpy, PyArray_NDIM(array) == 1 ? "in1d": "isin");
+    Py_DECREF(numpy);
+    if (!func) {
+        goto failure;
+    }
+
+    result = PyObject_Call(func, args, kwarg);
+    Py_DECREF(func);
+failure:
+    // These will always exist.
+    Py_DECREF(args);
+    Py_DECREF(kwarg);
+
+    return result;
+}
+
+// Membership test for object arrays: build a frozenset from `other`, then fill
+// a newly allocated bool array with `element in frozenset` for each element of
+// `array`. Returns a new reference, or NULL with an exception set.
+static PyObject *
+AK_isin_array_object(PyArrayObject *array, PyArrayObject *other)
+{
+    /* Algorithm:
+
+    for loc, element in loc_iter(array):
+        result[loc] = element in set(other)
+    */
+
+    PyObject *compare_elements = PyFrozenSet_New((PyObject*)other);
+    AK_RETURN_NULL_IF_NOT(compare_elements);
+
+    PyArrayObject *arrays[2];
+    npy_uint32 arrays_flags[2];
+    PyArray_Descr *op_dtypes[2];
+    arrays[0] = array;
+    arrays[1] = NULL;
+    arrays_flags[0] = NPY_ITER_READONLY;
+    arrays_flags[1] = NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE;
+    op_dtypes[0] = PyArray_DescrFromType(NPY_OBJECT);
+    op_dtypes[1] = PyArray_DescrFromType(NPY_BOOL);
+
+    // No inner iteration - inner loop is handled by CopyArray code
+    // Reference objects are OK.
+    int iter_flags = NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK;
+
+    // Construct the iterator
+    NpyIter *iter = NpyIter_MultiNew(
+            2, // number of arrays
+            arrays,
+            iter_flags,
+            NPY_KEEPORDER, // Maintain existing order for `array`
+            NPY_NO_CASTING, // No casting will be required
+            arrays_flags,
+            op_dtypes);
+
+    // PyArray_DescrFromType returned new references and the iterator does not
+    // steal them, so release ours whether or not construction succeeded.
+    Py_DECREF(op_dtypes[0]);
+    Py_DECREF(op_dtypes[1]);
+
+    if (!iter) {
+        Py_DECREF(compare_elements);
+        return NULL;
+    }
+
+    NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
+    if (!iternext) {
+        Py_DECREF(compare_elements);
+        NpyIter_Deallocate(iter);
+        return NULL;
+    }
+
+    char** dataptr = NpyIter_GetDataPtrArray(iter);
+    npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter);
+    npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+    // The inner loop calls PySequence_Contains for every element, so the GIL
+    // must stay held for the whole iteration: no NPY_BEGIN_THREADS here.
+    do {
+        char* src_data = dataptr[0];
+        char* dst_data = dataptr[1];
+        npy_intp size = sizeptr[0];
+        npy_intp src_stride = strideptr[0];
+        npy_intp dst_stride = strideptr[1];
+
+        PyObject* obj_ref = NULL;
+
+        while (size--) {
+            // Object arrays contain pointers to PyObjects, so we will only temporarily
+            // look at the reference here.
+            memcpy(&obj_ref, src_data, sizeof(obj_ref));
+
+            // PySequence_Contains returns 1, 0, or -1 on error. Keep it in an int:
+            // npy_bool is unsigned, so a narrowed -1 would never compare equal to -1
+            // and a failed lookup would be stored as "found".
+            int found = PySequence_Contains(compare_elements, obj_ref);
+
+            if (found == -1) {
+                NpyIter_Deallocate(iter);
+                Py_DECREF(compare_elements);
+                return NULL;
+            }
+
+            *dst_data = (npy_bool)found;
+
+            src_data += src_stride;
+            dst_data += dst_stride;
+        }
+
+        // Increment the iterator to the next inner loop
+    } while(iternext(iter));
+
+    Py_DECREF(compare_elements);
+
+    // If the API was needed, it may have thrown an error
+    if (NpyIter_IterationNeedsAPI(iter) && PyErr_Occurred()) {
+        NpyIter_Deallocate(iter);
+        return NULL;
+    }
+
+    // Get the result from the iterator object array
+    PyObject *ret = (PyObject*)NpyIter_GetOperandArray(iter)[1];
+    if (!ret) {
+        NpyIter_Deallocate(iter);
+        return NULL;
+    }
+    Py_INCREF(ret);
+
+    if (NpyIter_Deallocate(iter) != NPY_SUCCEED) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+
+    return ret;
+}
+
+// Python entry point: isin_array(array, array_is_unique, other, other_is_unique).
+// `other` must be 1D. Object dtypes on either side use the frozenset path; all
+// other dtypes are delegated to numpy.
+static PyObject *
+isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs)
+{
+    int array_is_unique, other_is_unique;
+    PyArrayObject *array, *other;
+
+    static char *kwlist[] = {"array", "array_is_unique", "other", "other_is_unique", NULL};
+
+    AK_RETURN_NULL_IF_NOT(PyArg_ParseTupleAndKeywords(args, kwargs, "O!iO!i:isin_array",
+                                     kwlist,
+                                     &PyArray_Type, &array, &array_is_unique,
+                                     &PyArray_Type, &other, &other_is_unique));
+
+    if (PyArray_NDIM(other) != 1) {
+        return PyErr_Format(PyExc_TypeError, "Expected other to be 1-dimensional");
+    }
+
+    PyArray_Descr* array_dtype = PyArray_DTYPE(array);
+    PyArray_Descr* other_dtype = PyArray_DTYPE(other);
+
+    // Use Python sets to handle object arrays
+    if (PyDataType_ISOBJECT(array_dtype) || PyDataType_ISOBJECT(other_dtype)) {
+        return AK_isin_array_object(array, other);
+    }
+    // Use numpy in1d logic for dtype arrays
+    return AK_isin_array_dtype_use_np(array, other, array_is_unique && other_is_unique);
+}
+
 //------------------------------------------------------------------------------
 // ArrayGO
 //------------------------------------------------------------------------------
@@ -414,13 +621,10 @@ ArrayGO_new(PyTypeObject *cls, PyObject *args, PyObject *kwargs)
     int parsed = PyArg_ParseTupleAndKeywords(
         args, kwargs, "O|$p:ArrayGO", argnames, &iterable, &own_iterable
     );
-    if (!parsed) {
-        return NULL;
-    }
+    AK_RETURN_NULL_IF_NOT(parsed);
+
     ArrayGOObject *self = (ArrayGOObject *)cls->tp_alloc(cls, 0);
-    if (!self) {
-        return NULL;
-    }
+    AK_RETURN_NULL_IF_NOT(self);
 
     if (PyArray_Check(iterable)) {
         if (!PyDataType_ISOBJECT(PyArray_DESCR((PyArrayObject *)iterable))) {
@@ -463,9 +667,8 @@ ArrayGO_append(ArrayGOObject *self, PyObject *value)
 {
     if (!self->list) {
         self->list = PyList_New(1);
-        if (!self->list) {
-            return NULL;
-        }
+        AK_RETURN_NULL_IF_NOT(self->list);
+
         Py_INCREF(value);
         PyList_SET_ITEM(self->list, 0, value);
     }
@@ -481,9 +684,8 @@ ArrayGO_extend(ArrayGOObject *self, PyObject *values)
 {
     if (!self->list) {
         self->list = PySequence_List(values);
-        if (!self->list) {
-            return NULL;
-        }
+        AK_RETURN_NULL_IF_NOT(self->list);
+
         Py_RETURN_NONE;
     }
     Py_ssize_t len = PyList_Size(self->list);
@@ -626,6 +828,7 @@ static PyMethodDef arraykit_methods[] = {
     {"array_deepcopy", array_deepcopy, METH_VARARGS, NULL},
     {"resolve_dtype", resolve_dtype, METH_VARARGS, NULL},
     {"resolve_dtype_iter", resolve_dtype_iter, METH_O, NULL},
+    {"isin_array", (PyCFunction)isin_array, METH_VARARGS | METH_KEYWORDS, NULL},
     {NULL},
 };
 
diff --git a/arraykit.pyi b/arraykit.pyi
index b56e49c2..b9a7a576
100644
--- a/arraykit.pyi
+++ b/arraykit.pyi
@@ -28,3 +28,9 @@ def row_1d_filter(__array: np.array) -> np.ndarray: ...
 def array_deepcopy(__array: np.array, memo: tp.Dict[int, tp.Any]) -> np.ndarray: ...
 def resolve_dtype(__d1: np.dtype, __d2: np.dtype) -> np.dtype: ...
 def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ...
+def isin_array(*,
+        array: np.ndarray,
+        array_is_unique: bool,
+        other: np.ndarray,
+        other_is_unique: bool,
+        ) -> np.ndarray: ...
diff --git a/debug.py b/debug.py
new file mode 100755
index 00000000..4c30504d
--- /dev/null
+++ b/debug.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+from functools import partial
+
+import numpy as np # type: ignore
+from arraykit import isin_array
+
+funcTT = partial(isin_array, array_is_unique=True, other_is_unique=True)
+funcTF = partial(isin_array, array_is_unique=True, other_is_unique=False)
+funcFT = partial(isin_array, array_is_unique=False, other_is_unique=True)
+funcFF = partial(isin_array, array_is_unique=False, other_is_unique=False)
+
+class Obj:
+    def __init__(self, value):
+        self.v = value
+    def __hash__(self):
+        return hash(self.v)
+    def __eq__(self, other):
+        return self.v == other.v
+
+arr1 = np.array([[Obj(1), Obj(2), Obj(3)], [Obj(4), Obj(5), Obj(9)]], dtype=object)
+arr2 = np.array([Obj(1), Obj(4), Obj(7), Obj(9)], dtype=object)
+post = funcTT(array=arr1, other=arr2)
+print(post)
+
+arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.int_)
+arr2 = np.array([1, 4, 7, 9], dtype=np.int_)
+post = funcFF(array=arr1, other=arr2)
+
+arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.float_)
+arr2 = np.array([1, 4, 7, 9], dtype=np.float_)
+post = funcFF(array=arr1, other=arr2)
+
+arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=str)
+arr2 = np.array([1, 4, 7, 9], dtype=str)
+post = funcFF(array=arr1, other=arr2)
+
+arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.complex_)
+arr2 = np.array([1, 4, 7, 9], dtype=np.complex_)
+post = funcFF(array=arr1, other=arr2)
+
+
+def test_arrays(arr1, arr2, expected, func):
+    '''Assert func(array, other) equals expected for int, float, complex, datetime64 and timedelta64 casts.'''
+    post = func(array=arr1.astype(np.int_), other=arr2.astype(np.int_))
+    assert np.array_equal(expected, post)
+
+    post = func(array=arr1.astype(np.float_), other=arr2.astype(np.float_))
+    assert np.array_equal(expected, post)
+
+    post = func(array=arr1.astype(np.complex_), other=arr2.astype(np.complex_))
+    assert np.array_equal(expected, post)
+
+    for freq in 'DMY':
+        post = func(array=arr1.astype(f'datetime64[{freq}]'), other=arr2.astype(f'datetime64[{freq}]'))
+        assert np.array_equal(expected, post)
+
+        post = func(array=arr1.astype(f'timedelta64[{freq}]'), other=arr2.astype(f'timedelta64[{freq}]'))
+        assert np.array_equal(expected, post)
+
+
+# ------------------------------------------------------------------------------
+# ------------------------------------- 1D -------------------------------------
+
+def dtype_unique_1d(func):
+    arr1 = np.array([1, 5, 2, 3, 4])
+    arr2 = np.array([1, 4, 7, 9])
+    expected = np.array([1, 0, 0, 0, 1], dtype=np.bool_)
+    test_arrays(arr1, arr2, expected, func)
+
+
+def dtype_arr1_non_unique_1d(func):
+    arr1 = np.array([1, 5, 2, 3, 4, 5, 1])
+    arr2 = np.array([1, 4, 7, 9])
+    expected = np.array([1, 0, 0, 0, 1, 0, 1], dtype=np.bool_)
+    test_arrays(arr1, arr2, expected, func)
+
+
+def dtype_arr2_non_unique_1d(func):
+    arr1 = np.array([1, 5, 2, 3, 4])
+    arr2 = np.array([1, 9, 4, 7, 9, 1])
+    expected = np.array([1, 0, 0, 0, 1], dtype=np.bool_)
+    test_arrays(arr1, arr2, expected, func)
+
+
+# ------------------------------------------------------------------------------
+# ------------------------------------- 2D -------------------------------------
+
+def dtype_unique_2d(func):
+    arr1 = np.array([[1, 2, 3], [4, 5, 9]])
+    arr2 = np.array([1, 4, 7, 9])
+    expected = np.array([[1, 0, 0], [1, 0, 1]], dtype=np.bool_)
+    test_arrays(arr1, arr2, expected, func)
+
+
+def dtype_arr1_non_unique_2d(func):
+    '''2D case where arr1 repeats values (3 and 9) and arr2 is unique.'''
+    arr1 = np.array([[9, 1, 2, 3], [4, 3, 5, 9]])
+    arr2 = np.array([1, 4, 7, 9])
+    expected = np.array([[1, 1, 0, 0], [1, 0, 0, 1]], dtype=np.bool_)
+    test_arrays(arr1, arr2, expected, func)
+
+
+def dtype_arr2_non_unique_2d(func):
+    '''2D case where arr1 is unique and arr2 repeats values (1 and 9).'''
+    arr1 = np.array([[1, 2, 3], [4, 5, 9]])
+    arr2 = np.array([1, 9, 4, 7, 9, 1])
+    expected = np.array([[1, 0, 0], [1, 0, 1]], dtype=np.bool_)
+    test_arrays(arr1, arr2, expected, func)
+
+
+dtype_unique_1d(funcTT)
+dtype_unique_1d(funcTF)
+dtype_unique_1d(funcFT)
+dtype_unique_1d(funcFF)
+dtype_unique_2d(funcTT)
+dtype_unique_2d(funcTF)
+dtype_unique_2d(funcFT)
+dtype_unique_2d(funcFF)
+
+dtype_arr1_non_unique_1d(funcFT)
+dtype_arr1_non_unique_1d(funcFF)
+dtype_arr2_non_unique_1d(funcTF)
+dtype_arr2_non_unique_1d(funcFF)
+
+dtype_arr1_non_unique_2d(funcFT)
+dtype_arr1_non_unique_2d(funcFF)
+dtype_arr2_non_unique_2d(funcTF)
+dtype_arr2_non_unique_2d(funcFF)
diff --git a/performance/main.py b/performance/main.py
index c7ca120a..9a801621 100644
--- a/performance/main.py
+++ b/performance/main.py
@@ -1,10 +1,8 @@
-
-
-
 import timeit
 import argparse
 
 import numpy as np
+import pandas as pd
 
 from performance.reference.util import mloc as mloc_ref
 from performance.reference.util import immutable_filter as immutable_filter_ref
@@ -15,6 +13,7 @@
 from performance.reference.util import row_1d_filter as row_1d_filter_ref
 from performance.reference.util import resolve_dtype as resolve_dtype_ref
 from performance.reference.util import resolve_dtype_iter as resolve_dtype_iter_ref
+from performance.reference.util import isin_array as isin_array_ref
 from performance.reference.util import array_deepcopy as array_deepcopy_ref
 
 from performance.reference.array_go import ArrayGO as ArrayGOREF
@@ -28,6 +27,7 @@
 from arraykit import row_1d_filter as row_1d_filter_ak
 from arraykit import resolve_dtype as resolve_dtype_ak
 from arraykit import resolve_dtype_iter as resolve_dtype_iter_ak
+from arraykit import isin_array as isin_array_ak
 from arraykit import array_deepcopy as array_deepcopy_ak
 
 from arraykit import ArrayGO as ArrayGOAK
@@ -250,6 +250,240 @@ class ArrayGOPerfREF(ArrayGOPerf):
entry = staticmethod(ArrayGOREF) +#------------------------------------------------------------------------------- + +def build_arr(dtype, size, num_nans, num_duplicates): + if dtype.kind == 'M': + if dtype == 'datetime64[Y]': + delta = np.timedelta64(size, 'Y') + elif dtype == 'datetime64[M]': + delta = np.timedelta64(size, 'M') + else: + delta = np.timedelta64(size, 'D') + + start = np.datetime64('2000-01-01').astype(dtype) + end = start + delta + arr = np.arange(start, start + delta).astype(dtype) + + nan_val = np.datetime64('NaT') + else: + if dtype.kind == 'm': + nan_val = np.timedelta64('NaT') + elif dtype.kind == 'c': + nan_val = np.complex_(np.nan) + else: + nan_val = np.nan + + arr = np.arange(size).astype(dtype) + + if num_nans == 1: + arr = np.concatenate((arr[:-1], [nan_val]*num_nans)) + elif num_nans > 1: + arr = np.concatenate((arr, [nan_val]*num_nans)) + + if num_duplicates: + indices = np.arange(size) + np.random.seed(0) + np.random.shuffle(indices) + + dups = np.array([arr[i] for i in indices[:num_duplicates]]) + dups[~pd.isnull(dups)].astype(dtype) + arr = np.concatenate((arr, dups)) + + np.random.seed(0) + np.random.shuffle(arr) + return arr, (num_nans <= 1 and num_duplicates == 0) + +storage = [] +def build_subclassses(klass, meth): + #storage.append(type(f'{klass.__name__}AK', (klass,), dict(entry=staticmethod(globals()[f'{meth}_ak'])))) + #storage.append(type(f'{klass.__name__}REF', (klass,), dict(entry=staticmethod(globals()[f'{meth}_ref'])))) + storage.append(type(f'{klass.__name__}AK', (klass,), dict(entry=staticmethod(isin_array_ak)))) + storage.append(type(f'{klass.__name__}REF', (klass,), dict(entry=staticmethod(isin_array_ref)))) + +class Obj: + def __init__(self, val): + self.val = val + def __eq__(self, other): + return self.val == other.val + def __hash__(self): + return hash(self.val) + +def get_dtypes(): + dtypes = [np.dtype(int), np.dtype(float), np.dtype(np.complex_), np.dtype('O')] + dtypes.extend((np.dtype(f'datetime64[{f}]') 
for f in 'DMY')) + dtypes.extend((np.dtype(f'timedelta64[{f}]') for f in 'DMY')) + return dtypes + +class IsinArrayDtypeUnique1DPerf(Perf): + NUMBER = 3 + + def pre(self): + self.kwargs = [] + for dtype in get_dtypes(): + for size in (100, 5000, 20000, 100000): + for num_nans in (0, 1): + arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates=0) + arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, num_duplicates=0) + assert arr1_unique and arr2_unique, 'Expect both arrays to be unique' + self.kwargs.append(dict(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True)) + + def main(self): + assert set(x['array'].ndim for x in self.kwargs) == {1}, "Expected all arr1's to be 1D" + for kwargs in self.kwargs: + self.entry(**kwargs) + +class IsinArrayDtypeUnique2DPerf(Perf): + NUMBER = 3 + + def pre(self): + self.kwargs = [] + for dtype in get_dtypes(): + for size, reshape in [ + (100, (10, 10)), + (5000, (200, 25)), + (20000, (200, 100)), + (100000, (500, 200)), + ]: + for num_nans in (0, 1): + arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates=0) + arr2, arr2_unique = build_arr(dtype, size // 10, num_nans // 10, num_duplicates=0) + assert arr1_unique and arr2_unique, 'Expect both arrays to be unique' + self.kwargs.append(dict(array=arr1.reshape(reshape), array_is_unique=True, other=arr2, other_is_unique=True)) + + def main(self): + assert set(x['array'].ndim for x in self.kwargs) == {2}, "Expected all arr1's to be 2D" + for kwargs in self.kwargs: + self.entry(**kwargs) + +class IsinArrayDtypeNonUnique1DPerf(Perf): + NUMBER = 3 + + def pre(self): + self.kwargs = [] + for dtype in get_dtypes(): + for size in (100, 5000, 20000): + for num_nans, num_duplicates in ((2 + (size // 2), 0), (size // 2, size // 15), (2 + (size // 8), 0), (size // 8, size // 15)): + arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates) + arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, num_duplicates) + assert 
not arr1_unique or not arr2_unique, 'Expect at least one of the arrays to contains duplicates' + self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) + + def main(self): + assert set(x['array'].ndim for x in self.kwargs) == {1}, "Expected all arr1's to be 1D" + for kwargs in self.kwargs: + self.entry(**kwargs) + +class IsinArrayDtypeNonUnique2DPerf(Perf): + NUMBER = 1 + + def pre(self): + self.kwargs = [] + for dtype in get_dtypes(): + for size, num_nans, num_duplicates, reshape in [ + (90, 10, 35, (27, 5)), + (80, 20, 35, (27, 5)), + (4500, 500, 950, (119, 50)), + (4000, 1000, 950, (119, 50)), + (18000, 2000, 2500, (250, 90)), + (16000, 4000, 2500, (250, 90)), + (90000, 10000, 15000, (500, 230)), + (80000, 20000, 15000, (500, 230)), + ]: + arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates) + arr2, arr2_unique = build_arr(dtype, size // 10, int(num_nans / 10), int(num_duplicates / 10)) + assert not arr1_unique or not arr2_unique, 'Expect at least one of the arrays to contains duplicates' + self.kwargs.append(dict(array=arr1.reshape(reshape), array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) + + def main(self): + assert set(x['array'].ndim for x in self.kwargs) == {2}, "Expected all arr1's to be 2D" + for kwargs in self.kwargs: + self.entry(**kwargs) + +class IsinArrayObject1DPerf(Perf): + NUMBER = 3 + + def pre(self): + self.kwargs = [] + for dtype in get_dtypes(): + for size in (100, 5000, 20000): + for num_nans, num_duplicates in ((2 + (size // 2), 0), (size // 2, size // 15), (2 + (size // 8), 0), (size // 8, size // 15)): + arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates) + arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, num_duplicates) + assert not arr1_unique or not arr2_unique, 'Expect at least one of the arrays to contains duplicates' + self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, 
other_is_unique=arr2_unique)) + + for size in (100, 5000, 20000): + for num_duplicates in (size // 15, 0): + tmp_arr1, arr1_unique = build_arr(np.dtype(int), size, 0, num_duplicates) + tmp_arr2, arr2_unique = build_arr(np.dtype(int), size // 25, 0, num_duplicates) + + arr1 = np.array([Obj(v) for v in tmp_arr1]) + arr2 = np.array([Obj(v) for v in tmp_arr2]) + + self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) + + def main(self): + assert set(x['array'].ndim for x in self.kwargs) == {1}, "Expected all arr1's to be 1D" + for kwargs in self.kwargs: + self.entry(**kwargs) + +class IsinArrayObject2DPerf(Perf): + NUMBER = 1 + + def pre(self): + self.kwargs = [] + for dtype in get_dtypes(): + for size, num_nans, num_duplicates, reshape in [ + (100, 0, 0, (10, 10)), + (90, 10, 35, (27, 5)), + (80, 20, 35, (27, 5)), + (5000, 0, 0, (200, 25)), + (4500, 500, 950, (119, 50)), + (4000, 1000, 950, (119, 50)), + (20000, 0, 0, (200, 100)), + (18000, 2000, 2500, (250, 90)), + (16000, 4000, 2500, (250, 90)), + (100000, 1, 0, (500, 200)), + (90000, 10000, 15000, (500, 230)), + (80000, 20000, 15000, (500, 230)), + ]: + arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates) + arr2, arr2_unique = build_arr(dtype, size // 10, int(num_nans / 10), int(num_duplicates / 10)) + self.kwargs.append(dict(array=arr1.reshape(reshape).astype(object), array_is_unique=arr1_unique, other=arr2.astype(object), other_is_unique=arr2_unique)) + + for size, num_duplicates, reshape in [ + (100, 0, (10, 10)), + (90, 10, (10, 10)), + (5000, 0, (200, 25)), + (4500, 500, (200, 25)), + (20000, 0, (200, 100)), + (18000, 2000, (200, 100)), + ]: + tmp_arr1, arr1_unique = build_arr(np.dtype(int), size, 0, num_duplicates) + tmp_arr2, arr2_unique = build_arr(np.dtype(int), size // 10, 0, num_duplicates // 10) + + arr1 = np.array([Obj(v) for v in tmp_arr1]).reshape(reshape) + arr2 = np.array([Obj(v) for v in tmp_arr2]) + + 
self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) + + def main(self): + assert set(x['array'].ndim for x in self.kwargs) == {2}, "Expected all arr1's to be 2D" + for kwargs in self.kwargs: + self.entry(**kwargs) + + +build_subclassses(IsinArrayDtypeUnique1DPerf, 'isin_array') +build_subclassses(IsinArrayDtypeUnique2DPerf, 'isin_array') + +build_subclassses(IsinArrayDtypeNonUnique1DPerf, 'isin_array') +build_subclassses(IsinArrayDtypeNonUnique2DPerf, 'isin_array') + +build_subclassses(IsinArrayObject1DPerf, 'isin_array') +build_subclassses(IsinArrayObject2DPerf, 'isin_array') + + #------------------------------------------------------------------------------- def get_arg_parser(): @@ -279,6 +513,7 @@ def main(): cls_map['ak'] = cls_runner elif cls_runner.__name__.endswith('REF'): cls_map['ref'] = cls_runner + assert cls_map for func_attr in cls_perf.FUNCTIONS: results = {} for key, cls_runner in cls_map.items(): @@ -290,12 +525,12 @@ def main(): number=cls_runner.NUMBER) records.append((cls_perf.__name__, func_attr, results['ak'], results['ref'], results['ref'] / results['ak'])) - width = 24 + width = 36 for record in records: print(''.join( (r.ljust(width) if isinstance(r, str) else str(round(r, 8)).ljust(width)) for r in record )) + if __name__ == '__main__': main() - diff --git a/performance/reference/util.py b/performance/reference/util.py index 6d437b28..3accdda6 100644 --- a/performance/reference/util.py +++ b/performance/reference/util.py @@ -181,3 +181,103 @@ def array_deepcopy( if memo is not None: memo[ident] = post return post + + +def _isin_1d( + array: np.ndarray, + other: tp.FrozenSet[tp.Any] + ) -> np.ndarray: + ''' + Iterate over an 1D array to build a 1D Boolean ndarray representing whether or not the original element is in the set + + Args: + array: The source array + other: The set of elements being looked for + ''' + result: np.ndarray = np.empty(array.shape, dtype=DTYPE_BOOL) + + for i, 
element in enumerate(array): + result[i] = element in other + + result.flags.writeable = False + return result + + +def _isin_2d( + array: np.ndarray, + other: tp.FrozenSet[tp.Any] + ) -> np.ndarray: + ''' + Iterate over an 2D array to build a 2D, immutable, Boolean ndarray representing whether or not the original element is in the set + + Args: + array: The source array + other: The set of elements being looked for + ''' + result: np.ndarray = np.empty(array.shape, dtype=DTYPE_BOOL) + + for (i, j), v in np.ndenumerate(array): + result[i, j] = v in other + + result.flags.writeable = False + return result + + +def isin_array(*, + array: np.ndarray, + array_is_unique: bool, + other: np.ndarray, + other_is_unique: bool, + ) -> np.ndarray: + '''Core isin processing after other has been converted to an array. + ''' + if array.dtype == DTYPE_OBJECT or other.dtype == DTYPE_OBJECT: + # both funcs return immutable arrays + func = _isin_1d if array.ndim == 1 else _isin_2d + try: + return func(array, frozenset(other)) # Isolate the frozenset creation to it's own try-except + except TypeError: # only occur when something is unhashable. 
+ pass + + assume_unique = array_is_unique and other_is_unique + func = np.in1d if array.ndim == 1 else np.isin + + result = func(array, other, assume_unique=assume_unique) #type: ignore + result.flags.writeable = False + + return result + + +def unique(ar, return_inverse=False): + + ar = np.asanyarray(ar).flatten() + + if return_inverse: + perm = ar.argsort(kind='quicksort') + aux = ar[perm] + else: + ar.sort() + aux = ar + + mask = np.empty(aux.shape, dtype=np.bool_) + mask[:1] = True + if aux.dtype.kind in "cfmM" and np.isnan(aux[-1]): + if aux.dtype.kind == "c": # for complex all NaNs are considered equivalent + aux_firstnan = np.searchsorted(np.isnan(aux), True, side='left') + else: + aux_firstnan = np.searchsorted(aux, aux[-1], side='left') + + mask[1:aux_firstnan] = (aux[1:aux_firstnan] != aux[:aux_firstnan - 1]) + mask[aux_firstnan] = True + mask[aux_firstnan + 1:] = False + else: + mask[1:] = aux[1:] != aux[:-1] + + ret = aux[mask] + if return_inverse: + imask = np.cumsum(mask) - 1 + inv_idx = np.empty(mask.shape, dtype=np.intp) + inv_idx[perm] = imask + return ret, inv_idx + + return ret diff --git a/test/test_util.py b/test/test_util.py index c44d7ab9..743faf3c 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -1,3 +1,6 @@ +from datetime import date, timedelta +from functools import partial +import itertools import unittest import numpy as np # type: ignore @@ -70,7 +73,6 @@ def test_resolve_dtype_c(self) -> None: self.assertEqual(resolve_dtype(a1.dtype, a4.dtype), np.dtype('O')) - def test_resolve_dtype_d(self) -> None: dt1 = np.array(1).dtype dt2 = np.array(2.3).dtype @@ -82,7 +84,6 @@ def test_resolve_dtype_e(self) -> None: assert resolve_dtype(dt1, dt2) == np.dtype(object) assert resolve_dtype(dt1, dt1) == dt1 - #--------------------------------------------------------------------------- def test_resolve_dtype_iter_a(self) -> None: @@ -141,7 +142,6 @@ def test_column_2d_filter_a(self) -> None: with self.assertRaises(NotImplementedError): 
column_2d_filter(a1.reshape(1,2,5)) - #--------------------------------------------------------------------------- def test_column_1d_filter_a(self) -> None: @@ -224,9 +224,179 @@ def test_array_deepcopy_c2(self) -> None: self.assertFalse(a2.flags.writeable) self.assertIn(id(a1), memo) + def test_isin_1d(self) -> None: + from performance.reference.util import isin_array -if __name__ == '__main__': - unittest.main() + T, F = True, False + arr1 = np.array([1, 2, 3, 4, 5]) + + expected = [ + (np.array([T, F, T, T, F]), [1, 3, 4]), + (np.array([F, F, F, F, F]), [7, 8]), + (np.array([T, T, T, T, T]), [1, 2, 3, 4, 5]), + ] + + for expected_result, values in expected: + for dtype in (int, object): + arr2 = np.array(values, dtype=dtype) + + for aiu, oiu in itertools.product((T, F), (T, F)): + self.assertTrue(np.array_equal(expected_result, isin_array( + array=arr1, + array_is_unique=aiu, + other=arr2, + other_is_unique=oiu, + ))) + + def test_isin_2d(self) -> None: + from performance.reference.util import isin_array + + T, F = True, False + arr1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + + expected = [ + (np.array([[T, F, T], [T, F, F], [F, F, T]]), [1, 3, 4, 9]), + (np.array([[F, F, F], [F, F, F], [F, F, F]]), [10, 11]), + (np.array([[T, T, T], [T, T, T], [T, T, T]]), [1, 2, 3, 4, 5, 6, 7, 8, 9]), + ] + + for expected_result, values in expected: + for dtype in (int, object): + arr2 = np.array(values, dtype=dtype) + + for aiu, oiu in itertools.product((T, F), (T, F)): + self.assertTrue(np.array_equal(expected_result, isin_array( + array=arr1, + array_is_unique=aiu, + other=arr2, + other_is_unique=oiu, + ))) + + def test_1d_2d_dtype_unique(self) -> None: + from arraykit import isin_array + + isin_array_func = partial(isin_array, array_is_unique=True, other_is_unique=True) + + e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) + e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) + + v_1d = [1, 2, 3, 4, 5] + v_2d = [[1, 2, 3], [4, 5, 9]] + + w_1d = [1, 4, 7, 9] + + 
dtype_funcs = [ + (int, int), + (float, float), + (str, str), + ('datetime64[D]', lambda x: date(2020, 1, x)), + ('timedelta64[D]', timedelta), + ] + + for dtype, dtype_func in dtype_funcs: + arr1 = np.array([dtype_func(v) for v in v_1d], dtype=dtype) + arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) + + post = isin_array_func(array=arr1, other=arr2) + self.assertTrue(np.array_equal(e_1d, post), msg=f'\n{dtype}\nExpected:\n{e_1d}\nActual:\n{post}') + + for dtype, dtype_func in dtype_funcs: + arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) + arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) + + post = isin_array_func(array=arr1, other=arr2) + self.assertTrue(np.array_equal(e_2d, post), msg=f'\n{dtype}\nExpected:\n{e_2d}\nActual:\n{post}') + + def test_1d_2d_dtype_object_unique(self) -> None: + from arraykit import isin_array + + e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) + e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) + + arr1_1d = np.array([1, 2, 3, 4, 5], dtype=object) + arr1_2d = np.array([[1, 2, 3], [4, 5, 9]], dtype=object) + + arr2 = np.array([1, 4, 7, 9], dtype=object) + post = isin_array(array=arr1_1d, array_is_unique=True, other=arr2, other_is_unique=True) + self.assertTrue(np.array_equal(e_1d, post)) + post = isin_array(array=arr1_2d, array_is_unique=True, other=arr2, other_is_unique=True) + self.assertTrue(np.array_equal(e_2d, post)) + class C: + def __init__(self, val): + self.val = val + + def __eq__(self, other): + return self.val == other.val + + def __hash__(self): + return hash(self.val) + + arr1 = np.array([C(1), C(2), C(3), C(4), C(5)]) + arr2 = np.array([C(1), C(4), C(9)]) + + post = isin_array(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) + self.assertTrue(np.array_equal(e_1d, post)) + + arr1 = np.array([[C(1), C(2), C(3)], [C(4), C(5), C(9)]]) + + post = isin_array(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) + 
self.assertTrue(np.array_equal(e_2d, post)) + + def test_1d_2d_dtype_object_non_unique(self) -> None: + from arraykit import isin_array + + e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) + e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) + + arr1_1d = np.array([1, 2, 2, 4, 5], dtype=object) + arr1_2d = np.array([[1, 2, 3], [4, 2, 9]], dtype=object) + + arr2 = np.array([1, 4, 4, 9], dtype=object) + + post = isin_array(array=arr1_1d, array_is_unique=False, other=arr2, other_is_unique=False) + self.assertTrue(np.array_equal(e_1d, post)) + + post = isin_array(array=arr1_2d, array_is_unique=False, other=arr2, other_is_unique=False) + self.assertTrue(np.array_equal(e_2d, post)) + + def test_1d_2d_dtype_non_unique(self) -> None: + from arraykit import isin_array + + isin_array_func = partial(isin_array, array_is_unique=False, other_is_unique=False) + + e_1d = np.array([1, 0, 0, 0, 1, 0, 1], dtype=bool) + e_2d = np.array([[1, 1, 0, 0], [1, 0, 0, 1]], dtype=bool) + + v_1d = [1, 5, 2, 3, 4, 5, 1] + v_2d = [[9, 1, 2, 3], [4, 3, 5, 9]] + + w_1d = [1, 4, 7, 9] + + dtype_funcs = [ + (int, int), + (float, float), + (str, str), + ('datetime64[D]', lambda x: date(2020, 1, x)), + ('timedelta64[D]', timedelta), + ] + + for dtype, dtype_func in dtype_funcs: + arr1 = np.array([dtype_func(v) for v in v_1d], dtype=dtype) + arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) + + post = isin_array_func(array=arr1, other=arr2) + self.assertTrue(np.array_equal(e_1d, post), msg=f'\n{dtype}\nExpected:\n{e_1d}\nActual:\n{post}') + + for dtype, dtype_func in dtype_funcs: + arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) + arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) + + post = isin_array_func(array=arr1, other=arr2) + self.assertTrue(np.array_equal(e_2d, post), msg=f'\n{dtype}\nExpected:\n{e_2d}\nActual:\n{post}') + + +if __name__ == '__main__': + unittest.main()