From 1264dcec5a6a637382695d6870db6606c71612ee Mon Sep 17 00:00:00 2001
From: Christopher Ariza
Date: Sat, 8 Jun 2024 10:06:36 -0700
Subject: [PATCH] 0.6.3 RC 1

---
 README.rst      |   8 ++-
 setup.py        |   2 +-
 src/_arraykit.c | 173 ------------------------------------------------
 3 files changed, 8 insertions(+), 175 deletions(-)

diff --git a/README.rst b/README.rst
index 24bc1cab..3be5d899 100644
--- a/README.rst
+++ b/README.rst
@@ -37,10 +37,16 @@ ArrayKit requires the following:
 What is New in ArrayKit
 -------------------------
 
+0.6.3
+............
+
+Optimized memory allocation strategy for ``nonzero_1d()``.
+
+
 0.6.2
 ............
 
-Extended ``nonzero_1d()`` to support non-contiguous arrays,
+Extended ``nonzero_1d()`` to support non-contiguous arrays.
 
 Optimizations to ``TriMap`` when mapping to object and flexible dtypes.
 
diff --git a/setup.py b/setup.py
index dfa0a468..253ae931 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 from setuptools import setup
 from pathlib import Path
 
-AK_VERSION = '0.6.2'
+AK_VERSION = '0.6.3'
 
 def get_long_description() -> str:
     return '''The ArrayKit library provides utilities for creating and transforming NumPy arrays, implementing performance-critical StaticFrame operations as Python C extensions.
diff --git a/src/_arraykit.c b/src/_arraykit.c
index f1a96669..43a4197e 100644
--- a/src/_arraykit.c
+++ b/src/_arraykit.c
@@ -3535,179 +3535,6 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) {
 
 //------------------------------------------------------------------------------
 // general utility
-
-// #define NONZERO_APPEND_INDEX_RELATIVE { \
-//     if (AK_UNLIKELY(count == capacity)) { \
-//         capacity <<= 1; \
-//         indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
-//         if (indices == NULL) { \
-//             return NULL; \
-//         } \
-//     } \
-//     indices[count++] = p - p_start; \
-// } \
-
-// #define NONZERO_APPEND_INDEX_ABSOLUTE { \
-//     if (AK_UNLIKELY(count == capacity)) { \
-//         capacity <<= 1; \
-//         indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
-//         if (indices == NULL) { \
-//             return NULL; \
-//         } \
-//     } \
-//     indices[count++] = i; \
-// } \
-
-
-
-// // Given a Boolean, contiguous 1D array, return the index positions in an int64 array. Through experimentation it has been verified that doing full-size allocation of memory does not permit outperforming NumPy at 10_000_000 scale; but doing less optimizations does help. Using bit masks does not improve performance over pointer arithmetic. Prescanning for all empty is very effective. Note that NumPy benefits from first counting the nonzeros, then allocating only enough data for the expected number.
-// static inline PyObject*
-// AK_nonzero_1d(PyArrayObject* array) {
-//     // the maximum number of indices we could return is the size of the array; if this is under a certain number, probably better to just allocate that rather than reallocate
-//     PyObject* final;
-//     npy_intp count_max = PyArray_SIZE(array);
-
-//     if (count_max == 0) { // return empty array
-//         npy_intp dims = {count_max};
-//         final = PyArray_SimpleNew(1, &dims, NPY_INT64);
-//         PyArray_CLEARFLAGS((PyArrayObject*)final, NPY_ARRAY_WRITEABLE);
-//         return final;
-//     }
-//     lldiv_t size_div = lldiv((long long)count_max, 8); // quot, rem
-
-//     Py_ssize_t count = 0;
-//     // the maximum number of collected integers is equal to or less than count_max; for small count_max, we can just set that value; for large size, we set it to half the size
-//     // Py_ssize_t capacity = count_max < 1024 ? count_max : (Py_ssize_t)AK_next_power((npy_uint32)(count_max / 8));
-//     Py_ssize_t capacity = count_max < 1024 ? count_max : count_max / 8;
-//     npy_int64* indices = (npy_int64*)malloc(sizeof(npy_int64) * capacity);
-
-//     NPY_BEGIN_THREADS_DEF;
-//     NPY_BEGIN_THREADS;
-
-//     if (PyArray_IS_C_CONTIGUOUS(array)) {
-//         npy_bool* p_start = (npy_bool*)PyArray_DATA(array);
-//         npy_bool* p = p_start;
-//         npy_bool* p_end = p + count_max;
-//         npy_bool* p_end_roll = p_end - size_div.rem;
-
-//         while (p < p_end_roll) {
-//             if (*(npy_uint64*)p == 0) {
-//                 p += 8; // no true within this 8 byte roll region
-//                 continue;
-//             }
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//         }
-//         while (p < p_end) {
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//         }
-//     }
-//     // else {
-//     //     NpyIter *iter = NpyIter_New(
-//     //         array, // array
-//     //         NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP, // iter flags
-//     //         NPY_KEEPORDER, // order
-//     //         NPY_NO_CASTING, // casting
-//     //         NULL // dtype
-//     //         );
-//     //     if (iter == NULL) {
-//     //         free(indices);
-//     //         return NULL;
-//     //     }
-//     //     NpyIter_IterNextFunc *iter_next = NpyIter_GetIterNext(iter, NULL);
-//     //     if (iter_next == NULL) {
-//     //         free(indices);
-//     //         NpyIter_Deallocate(iter);
-//     //         return NULL;
-//     //     }
-//     //     char **data_ptr = NpyIter_GetDataPtrArray(iter);
-//     //     char* data;
-//     //     npy_intp *stride_ptr = NpyIter_GetInnerStrideArray(iter);
-//     //     npy_intp stride;
-//     //     npy_intp *inner_size_ptr = NpyIter_GetInnerLoopSizePtr(iter);
-//     //     npy_intp inner_size;
-//     //     npy_int64 i = 0;
-//     //     do {
-//     //         data = *data_ptr;
-//     //         stride = *stride_ptr;
-//     //         inner_size = *inner_size_ptr;
-//     //         while (inner_size--) {
-//     //             if (*(npy_bool*)data) {
-//     //                 if (AK_UNLIKELY(count == capacity)) {
-//     //                     capacity <<= 1;
-//     //                     indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);
-//     //                     if (indices == NULL) {
-//     //                         NpyIter_Deallocate(iter);
-//     //                         return NULL;
-//     //                     }
-//     //                 }
-//     //                 indices[count++] = i;
-//     //             }
-//     //             i++;
-//     //             data += stride;
-//     //         }
-//     //     } while(iter_next(iter));
-//     //     NpyIter_Deallocate(iter);
-//     // }
-//     else {
-//         npy_intp i = 0; // position within Boolean array
-//         npy_intp i_end = count_max;
-//         npy_intp i_end_roll = count_max - size_div.rem;
-//         while (i < i_end_roll) {
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//         }
-//         while (i < i_end) {
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//         }
-//     }
-//     NPY_END_THREADS;
-
-//     npy_intp dims = {count};
-//     final = PyArray_SimpleNewFromData(1, &dims, NPY_INT64, (void*)indices);
-//     if (!final) {
-//         free(indices);
-//         return NULL;
-//     }
-//     // This ensures that the array frees the indices array; this has been tested by calling free(indices) and observing segfault
-//     PyArray_ENABLEFLAGS((PyArrayObject*)final, NPY_ARRAY_OWNDATA);
-//     PyArray_CLEARFLAGS((PyArrayObject*)final, NPY_ARRAY_WRITEABLE);
-//     return final;
-// }
-// #undef NONZERO_APPEND_INDEX_RELATIVE
-// #undef NONZERO_APPEND_INDEX_ABSOLUTE
-
-
 // Given a Boolean, contiguous 1D array, return the index positions in an int64 array. Through experimentation it has been verified that doing full-size allocation of memory provides the best performance at all scales. Using NpyIter, or using bit masks, does not improve performance over pointer arithmetic. Prescanning for all empty is very effective. Note that NumPy benefits from first counting the nonzeros, then allocating only enough data for the expected number of indices.
 static inline PyObject*
 AK_nonzero_1d(PyArrayObject* array) {
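
The retained comment above summarizes the new strategy: reserve index storage for the worst case (every element true) up front, so the collection loop never has to grow the buffer. The hunk ends at the function signature and does not show the new body, so the following is only a minimal sketch of that strategy, assuming the NumPy C API (numpy/arrayobject.h) and stdlib.h; the name nonzero_1d_sketch is hypothetical and error handling is abbreviated.

/* Illustrative sketch only, not the code added by this commit.
   Assumes #include <stdlib.h> and the NumPy C API via numpy/arrayobject.h. */
static PyObject*
nonzero_1d_sketch(PyArrayObject* array) /* hypothetical name */
{
    npy_intp size = PyArray_SIZE(array);
    // Full-size allocation: one slot per element, so the loop below never reallocs.
    npy_int64* indices = (npy_int64*)malloc(sizeof(npy_int64) * (size ? size : 1));
    if (indices == NULL) {
        return PyErr_NoMemory();
    }
    npy_bool* p = (npy_bool*)PyArray_DATA(array); // contiguous 1D Boolean data
    npy_intp count = 0;
    for (npy_intp i = 0; i < size; i++) {
        if (p[i]) {
            indices[count++] = i; // record the index of each true element
        }
    }
    npy_intp dims = count;
    PyObject* post = PyArray_SimpleNewFromData(1, &dims, NPY_INT64, (void*)indices);
    if (post == NULL) {
        free(indices);
        return NULL;
    }
    // Hand ownership of the malloc'd buffer to the returned array.
    PyArray_ENABLEFLAGS((PyArrayObject*)post, NPY_ARRAY_OWNDATA);
    PyArray_CLEARFLAGS((PyArrayObject*)post, NPY_ARRAY_WRITEABLE);
    return post;
}

Compared with the deleted implementation, which started with a small buffer and doubled its capacity with realloc inside the append macros, worst-case allocation trades transient memory for a simpler inner loop with no capacity checks.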