0.6.3 RC 1

static-frame · Jun 8, 2024 · 1264dce · 1264dce
1 parent 3402b00
commit 1264dce
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 175 deletions.
diff --git a/README.rst b/README.rst
@@ -37,10 +37,16 @@ ArrayKit requires the following:
 What is New in ArrayKit
 -------------------------
 
+0.6.3
+............
+
+Optimized memory allocation strategy for ``nonzero_1d()``.
+
+
 0.6.2
 ............
 
-Extended ``nonzero_1d()`` to support non-contiguous arrays,
+Extended ``nonzero_1d()`` to support non-contiguous arrays.
 
 Optimizations to ``TriMap`` when mapping to object and flexible dtypes.
 

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 from setuptools import setup
 from pathlib import Path
 
-AK_VERSION = '0.6.2'
+AK_VERSION = '0.6.3'
 
 def get_long_description() -> str:
     return '''The ArrayKit library provides utilities for creating and transforming NumPy arrays, implementing performance-critical StaticFrame operations as Python C extensions.

diff --git a/src/_arraykit.c b/src/_arraykit.c
@@ -3535,179 +3535,6 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) {
 //------------------------------------------------------------------------------
 // general utility
 
-
-// #define NONZERO_APPEND_INDEX_RELATIVE {                                      \
-//     if (AK_UNLIKELY(count == capacity)) {                                    \
-//         capacity <<= 1;                                                      \
-//         indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
-//         if (indices == NULL) {                                               \
-//             return NULL;                                                     \
-//         }                                                                    \
-//     }                                                                        \
-//     indices[count++] = p - p_start;                                          \
-// }                                                                            \
-
-// #define NONZERO_APPEND_INDEX_ABSOLUTE {                                      \
-//     if (AK_UNLIKELY(count == capacity)) {                                    \
-//         capacity <<= 1;                                                      \
-//         indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
-//         if (indices == NULL) {                                               \
-//             return NULL;                                                     \
-//         }                                                                    \
-//     }                                                                        \
-//     indices[count++] = i;                                                    \
-// }                                                                            \
-
-
-
-// // Given a Boolean, contiguous 1D array, return the index positions in an int64 array. Through experimentation it has been verified that doing full-size allocation of memory does not permit outperforming NumPy at 10_000_000 scale; but doing less optimizations does help. Using bit masks does not improve perforamnce over pointer arithmetic. Prescanning for all empty is very effective. Note that NumPy befits from first counting the nonzeros, then allocating only enough data for the expexted number.
-// static inline PyObject*
-// AK_nonzero_1d(PyArrayObject* array) {
-//     // the maxiumum number of indices we could return is the size of the array; if this is under a certain number, probably better to just allocate that rather than reallocate
-//     PyObject* final;
-//     npy_intp count_max = PyArray_SIZE(array);
-
-//     if (count_max == 0) { // return empty array
-//         npy_intp dims = {count_max};
-//         final = PyArray_SimpleNew(1, &dims, NPY_INT64);
-//         PyArray_CLEARFLAGS((PyArrayObject*)final, NPY_ARRAY_WRITEABLE);
-//         return final;
-//     }
-//     lldiv_t size_div = lldiv((long long)count_max, 8); // quot, rem
-
-//     Py_ssize_t count = 0;
-//     // the maximum number of collected integers is equal to or less than count_max; for small count_max, we can just set that value; for large size, we set it to half the size
-//     // Py_ssize_t capacity = count_max < 1024 ? count_max : (Py_ssize_t)AK_next_power((npy_uint32)(count_max / 8));
-//     Py_ssize_t capacity = count_max < 1024 ? count_max : count_max / 8;
-//     npy_int64* indices = (npy_int64*)malloc(sizeof(npy_int64) * capacity);
-
-//     NPY_BEGIN_THREADS_DEF;
-//     NPY_BEGIN_THREADS;
-
-//     if (PyArray_IS_C_CONTIGUOUS(array)) {
-//         npy_bool* p_start = (npy_bool*)PyArray_DATA(array);
-//         npy_bool* p = p_start;
-//         npy_bool* p_end = p + count_max;
-//         npy_bool* p_end_roll = p_end - size_div.rem;
-
-//         while (p < p_end_roll) {
-//             if (*(npy_uint64*)p == 0) {
-//                 p += 8; // no true within this 8 byte roll region
-//                 continue;
-//             }
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//         }
-//         while (p < p_end) {
-//             if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
-//             p++;
-//         }
-//     }
-//     // else {
-//     //     NpyIter *iter = NpyIter_New(
-//     //             array,                                      // array
-//     //             NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP, // iter flags
-//     //             NPY_KEEPORDER,                              // order
-//     //             NPY_NO_CASTING,                             // casting
-//     //             NULL                                        // dtype
-//     //             );
-//     //     if (iter == NULL) {
-//     //         free(indices);
-//     //         return NULL;
-//     //     }
-//     //     NpyIter_IterNextFunc *iter_next = NpyIter_GetIterNext(iter, NULL);
-//     //     if (iter_next == NULL) {
-//     //         free(indices);
-//     //         NpyIter_Deallocate(iter);
-//     //         return NULL;
-//     //     }
-//     //     char **data_ptr = NpyIter_GetDataPtrArray(iter);
-//     //     char* data;
-//     //     npy_intp *stride_ptr = NpyIter_GetInnerStrideArray(iter);
-//     //     npy_intp stride;
-//     //     npy_intp *inner_size_ptr = NpyIter_GetInnerLoopSizePtr(iter);
-//     //     npy_intp inner_size;
-//     //     npy_int64 i = 0;
-//     //     do {
-//     //         data = *data_ptr;
-//     //         stride = *stride_ptr;
-//     //         inner_size = *inner_size_ptr;
-//     //         while (inner_size--) {
-//     //             if (*(npy_bool*)data) {
-//     //                 if (AK_UNLIKELY(count == capacity)) {
-//     //                     capacity <<= 1;
-//     //                     indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);
-//     //                     if (indices == NULL) {
-//     //                         NpyIter_Deallocate(iter);
-//     //                         return NULL;
-//     //                     }
-//     //                 }
-//     //                 indices[count++] = i;
-//     //             }
-//     //             i++;
-//     //             data += stride;
-//     //         }
-//     //     } while(iter_next(iter));
-//     //     NpyIter_Deallocate(iter);
-//     // }
-//     else {
-//         npy_intp i = 0; // position within Boolean array
-//         npy_intp i_end = count_max;
-//         npy_intp i_end_roll = count_max - size_div.rem;
-//         while (i < i_end_roll) {
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//         }
-//         while (i < i_end) {
-//             if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
-//             i++;
-//         }
-//     }
-//     NPY_END_THREADS;
-
-//     npy_intp dims = {count};
-//     final = PyArray_SimpleNewFromData(1, &dims, NPY_INT64, (void*)indices);
-//     if (!final) {
-//         free(indices);
-//         return NULL;
-//     }
-//     // This ensures that the array frees the indices array; this has been tested by calling free(indices) and observing segfault
-//     PyArray_ENABLEFLAGS((PyArrayObject*)final, NPY_ARRAY_OWNDATA);
-//     PyArray_CLEARFLAGS((PyArrayObject*)final, NPY_ARRAY_WRITEABLE);
-//     return final;
-// }
-// #undef NONZERO_APPEND_INDEX_RELATIVE
-// #undef NONZERO_APPEND_INDEX_ABSOLUTE
-
-
 // Given a Boolean, contiguous 1D array, return the index positions in an int64 array. Through experimentation it has been verified that doing full-size allocation of memory provides the best performance at all scales. Using NpyIter, or using bit masks, does not improve performance over pointer arithmetic. Prescanning for all empty is very effective. Note that NumPy benefits from first counting the nonzeros, then allocating only enough data for the expected number of indices.
 static inline PyObject*
 AK_nonzero_1d(PyArrayObject* array) {