Skip to content

Commit

Permalink
0.6.3 RC 1
Browse files Browse the repository at this point in the history
  • Loading branch information
flexatone committed Jun 8, 2024
1 parent 3402b00 commit 1264dce
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 175 deletions.
8 changes: 7 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,16 @@ ArrayKit requires the following:
What is New in ArrayKit
-------------------------

0.6.3
............

Optimized memory allocation strategy for ``nonzero_1d()``.


0.6.2
............

Extended ``nonzero_1d()`` to support non-contiguous arrays,
Extended ``nonzero_1d()`` to support non-contiguous arrays.

Optimizations to ``TriMap`` when mapping to object and flexible dtypes.

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from setuptools import setup
from pathlib import Path

AK_VERSION = '0.6.2'
AK_VERSION = '0.6.3'

def get_long_description() -> str:
return '''The ArrayKit library provides utilities for creating and transforming NumPy arrays, implementing performance-critical StaticFrame operations as Python C extensions.
Expand Down
173 changes: 0 additions & 173 deletions src/_arraykit.c
Original file line number Diff line number Diff line change
Expand Up @@ -3535,179 +3535,6 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) {
//------------------------------------------------------------------------------
// general utility


// #define NONZERO_APPEND_INDEX_RELATIVE { \
// if (AK_UNLIKELY(count == capacity)) { \
// capacity <<= 1; \
// indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
// if (indices == NULL) { \
// return NULL; \
// } \
// } \
// indices[count++] = p - p_start; \
// } \
// #define NONZERO_APPEND_INDEX_ABSOLUTE { \
// if (AK_UNLIKELY(count == capacity)) { \
// capacity <<= 1; \
// indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);\
// if (indices == NULL) { \
// return NULL; \
// } \
// } \
// indices[count++] = i; \
// } \


// // Given a Boolean, contiguous 1D array, return the index positions in an int64 array. Through experimentation it has been verified that doing full-size allocation of memory does not permit outperforming NumPy at 10_000_000 scale; but doing fewer optimizations does help. Using bit masks does not improve performance over pointer arithmetic. Prescanning for all empty is very effective. Note that NumPy benefits from first counting the nonzeros, then allocating only enough data for the expected number.
// static inline PyObject*
// AK_nonzero_1d(PyArrayObject* array) {
// // the maximum number of indices we could return is the size of the array; if this is under a certain number, probably better to just allocate that rather than reallocate
// PyObject* final;
// npy_intp count_max = PyArray_SIZE(array);

// if (count_max == 0) { // return empty array
// npy_intp dims = {count_max};
// final = PyArray_SimpleNew(1, &dims, NPY_INT64);
// PyArray_CLEARFLAGS((PyArrayObject*)final, NPY_ARRAY_WRITEABLE);
// return final;
// }
// lldiv_t size_div = lldiv((long long)count_max, 8); // quot, rem

// Py_ssize_t count = 0;
// // the maximum number of collected integers is equal to or less than count_max; for small count_max, we can just set that value; for large size, we set it to one-eighth of the size
// // Py_ssize_t capacity = count_max < 1024 ? count_max : (Py_ssize_t)AK_next_power((npy_uint32)(count_max / 8));
// Py_ssize_t capacity = count_max < 1024 ? count_max : count_max / 8;
// npy_int64* indices = (npy_int64*)malloc(sizeof(npy_int64) * capacity);

// NPY_BEGIN_THREADS_DEF;
// NPY_BEGIN_THREADS;

// if (PyArray_IS_C_CONTIGUOUS(array)) {
// npy_bool* p_start = (npy_bool*)PyArray_DATA(array);
// npy_bool* p = p_start;
// npy_bool* p_end = p + count_max;
// npy_bool* p_end_roll = p_end - size_div.rem;

// while (p < p_end_roll) {
// if (*(npy_uint64*)p == 0) {
// p += 8; // no true within this 8 byte roll region
// continue;
// }
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
// p++;
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
// p++;
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
// p++;
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
// p++;
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
// p++;
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
// p++;
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
// p++;
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
// p++;
// }
// while (p < p_end) {
// if (*p) {NONZERO_APPEND_INDEX_RELATIVE;}
// p++;
// }
// }
// // else {
// // NpyIter *iter = NpyIter_New(
// // array, // array
// // NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP, // iter flags
// // NPY_KEEPORDER, // order
// // NPY_NO_CASTING, // casting
// // NULL // dtype
// // );
// // if (iter == NULL) {
// // free(indices);
// // return NULL;
// // }
// // NpyIter_IterNextFunc *iter_next = NpyIter_GetIterNext(iter, NULL);
// // if (iter_next == NULL) {
// // free(indices);
// // NpyIter_Deallocate(iter);
// // return NULL;
// // }
// // char **data_ptr = NpyIter_GetDataPtrArray(iter);
// // char* data;
// // npy_intp *stride_ptr = NpyIter_GetInnerStrideArray(iter);
// // npy_intp stride;
// // npy_intp *inner_size_ptr = NpyIter_GetInnerLoopSizePtr(iter);
// // npy_intp inner_size;
// // npy_int64 i = 0;
// // do {
// // data = *data_ptr;
// // stride = *stride_ptr;
// // inner_size = *inner_size_ptr;
// // while (inner_size--) {
// // if (*(npy_bool*)data) {
// // if (AK_UNLIKELY(count == capacity)) {
// // capacity <<= 1;
// // indices = (npy_int64*)realloc(indices, sizeof(npy_int64) * capacity);
// // if (indices == NULL) {
// // NpyIter_Deallocate(iter);
// // return NULL;
// // }
// // }
// // indices[count++] = i;
// // }
// // i++;
// // data += stride;
// // }
// // } while(iter_next(iter));
// // NpyIter_Deallocate(iter);
// // }
// else {
// npy_intp i = 0; // position within Boolean array
// npy_intp i_end = count_max;
// npy_intp i_end_roll = count_max - size_div.rem;
// while (i < i_end_roll) {
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
// i++;
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
// i++;
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
// i++;
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
// i++;
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
// i++;
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
// i++;
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
// i++;
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
// i++;
// }
// while (i < i_end) {
// if (*(npy_bool*)PyArray_GETPTR1(array, i)) {NONZERO_APPEND_INDEX_ABSOLUTE;}
// i++;
// }
// }
// NPY_END_THREADS;

// npy_intp dims = {count};
// final = PyArray_SimpleNewFromData(1, &dims, NPY_INT64, (void*)indices);
// if (!final) {
// free(indices);
// return NULL;
// }
// // This ensures that the array frees the indices array; this has been tested by calling free(indices) and observing segfault
// PyArray_ENABLEFLAGS((PyArrayObject*)final, NPY_ARRAY_OWNDATA);
// PyArray_CLEARFLAGS((PyArrayObject*)final, NPY_ARRAY_WRITEABLE);
// return final;
// }
// #undef NONZERO_APPEND_INDEX_RELATIVE
// #undef NONZERO_APPEND_INDEX_ABSOLUTE


// Given a Boolean, contiguous 1D array, return the index positions in an int64 array. Through experimentation it has been verified that doing full-size allocation of memory provides the best performance at all scales. Using NpyIter, or using bit masks, does not improve performance over pointer arithmetic. Prescanning for all empty is very effective. Note that NumPy benefits from first counting the nonzeros, then allocating only enough data for the expected number of indices.
static inline PyObject*
AK_nonzero_1d(PyArrayObject* array) {
Expand Down

0 comments on commit 1264dce

Please sign in to comment.