Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implements fast is_sorted check #96

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ Extended arguments to and functionality in ``split_after_count()`` to support th

Now building wheels for 3.11.

0.1.12
............

Implemented ``is_sorted``.

0.2.2
............
Expand Down
1 change: 1 addition & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from ._arraykit import delimited_to_arrays as delimited_to_arrays
from ._arraykit import iterable_str_to_array_1d as iterable_str_to_array_1d
from ._arraykit import get_new_indexers_and_screen as get_new_indexers_and_screen
from ._arraykit import is_sorted as is_sorted
from ._arraykit import split_after_count as split_after_count
from ._arraykit import count_iteration as count_iteration
from ._arraykit import first_true_1d as first_true_1d
Expand Down
1 change: 1 addition & 0 deletions src/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ...
def isna_element(__value: tp.Any, include_none: bool = True) -> bool: ...
def dtype_from_element(__value: tp.Optional[tp.Hashable]) -> np.dtype: ...
def get_new_indexers_and_screen(indexers: np.ndarray, positions: np.ndarray) -> tp.Tuple[np.ndarray, np.ndarray]: ...
# Exposed by the C extension with METH_O, so the single argument cannot be
# passed by keyword; the double-underscore name marks it positional-only.
def is_sorted(__array: np.ndarray) -> bool: ...

def first_true_1d(__array: np.ndarray, *, forward: bool) -> int: ...
def first_true_2d(__array: np.ndarray, *, forward: bool, axis: int) -> np.ndarray: ...
178 changes: 177 additions & 1 deletion src/_arraykit.c
Original file line number Diff line number Diff line change
Expand Up @@ -4031,7 +4031,7 @@ get_new_indexers_and_screen(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kw
Py_DECREF(element_locations);

// new_positions = order_found[:num_unique]
PyObject *new_positions = PySequence_GetSlice((PyObject*)order_found, 0, num_found);
PyObject *new_positions = PySequence_GetSlice((PyObject*)order_found, 0, (Py_ssize_t)num_found);
Py_DECREF(order_found);
if (new_positions == NULL) {
return NULL;
Expand All @@ -4058,6 +4058,181 @@ get_new_indexers_and_screen(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kw
return NULL;
}

//------------------------------------------------------------------------------

// Strict "greater than" predicates used to detect an out-of-order adjacent pair.
// Every argument and the whole expansion are parenthesized so the macros can be
// negated or combined with other operators without changing their meaning.
// Arguments may be evaluated more than once: pass side-effect-free expressions.
# define AK_COMPARE_SIMPLE(a, b) ((a) > (b))

// Lexicographic order on (real, imag); both arguments must expose `.real` and
// `.imag` members.
# define AK_COMPARE_COMPLEX(a, b) \
    (((a).real > (b).real) || (((a).real == (b).real) && ((a).imag > (b).imag)))

/*
 * AK_IS_SORTED(ctype, compare_macro)
 *
 * Expands to a braced block that scans a 1-D array as elements of `ctype` and
 * then returns from the enclosing function: Py_False when some adjacent pair
 * satisfies compare_macro(previous, current), Py_True otherwise. Arrays with
 * fewer than two elements are always reported as sorted.
 *
 * The expansion site must have these names in scope:
 *   arr        - PyArrayObject* being examined
 *   contiguous - bool; when true the data pointer is indexed directly,
 *                otherwise each element is located with PyArray_GETPTR1
 *   arr_size   - size_t element count
 *
 * Notes:
 *   - The loop index starts at 1 and looks backwards. `arr_size` is unsigned,
 *     so the more obvious bound `arr_size - 1` would wrap around for an empty
 *     array and walk far past the end of the buffer.
 *   - The index is cast to npy_intp before it reaches PyArray_GETPTR1 so that
 *     the multiplication with a (possibly negative) stride is done in signed
 *     arithmetic.
 *   - The threads macros bracket only the scan; the Python bool is created
 *     after NPY_END_THREADS.
 *   - Because the expansion is a self-contained block, the locals do not need
 *     a distinct name per use, and no trailing semicolon is required.
 */
# define AK_IS_SORTED(ctype, compare_macro)                                    \
{                                                                              \
    bool ak_sorted_ = true;                                                    \
    NPY_BEGIN_THREADS_DEF;                                                     \
    NPY_BEGIN_THREADS;                                                         \
    if (contiguous) {                                                          \
        const ctype *ak_data_ = (const ctype*)PyArray_DATA(arr);               \
        for (size_t ak_i_ = 1; ak_i_ < arr_size; ++ak_i_) {                    \
            ctype ak_prev_ = ak_data_[ak_i_ - 1];                              \
            ctype ak_next_ = ak_data_[ak_i_];                                  \
            if (compare_macro(ak_prev_, ak_next_)) {                           \
                ak_sorted_ = false;                                            \
                break;                                                         \
            }                                                                  \
        }                                                                      \
    }                                                                          \
    else {                                                                     \
        for (size_t ak_i_ = 1; ak_i_ < arr_size; ++ak_i_) {                    \
            ctype ak_prev_ = *(ctype*)PyArray_GETPTR1(arr, (npy_intp)(ak_i_ - 1)); \
            ctype ak_next_ = *(ctype*)PyArray_GETPTR1(arr, (npy_intp)ak_i_);   \
            if (compare_macro(ak_prev_, ak_next_)) {                           \
                ak_sorted_ = false;                                            \
                break;                                                         \
            }                                                                  \
        }                                                                      \
    }                                                                          \
    NPY_END_THREADS;                                                           \
    if (ak_sorted_) {                                                          \
        Py_RETURN_TRUE;                                                        \
    }                                                                          \
    Py_RETURN_FALSE;                                                           \
}


static bool
AK_is_sorted_string(PyArrayObject* arr, bool contiguous, size_t arr_size)
{
size_t item_size = (size_t)PyArray_ITEMSIZE(arr);

if (contiguous) {
NPY_BEGIN_THREADS_DEF;
NPY_BEGIN_THREADS;
char* data = (char*)PyArray_DATA(arr);
size_t i = 0;
while (i < (arr_size - 1) * item_size) {
if (strncmp(&data[i], &data[i + item_size], item_size) > 0) {
NPY_END_THREADS;
Py_RETURN_FALSE;
}
i += item_size;
}
NPY_END_THREADS;
}
else {
NPY_BEGIN_THREADS_DEF;
NPY_BEGIN_THREADS;
size_t i = 0;
while (i < (arr_size - 1) * item_size) {
char *element = PyArray_GETPTR1(arr, i);
char *next = PyArray_GETPTR1(arr, i + 1);
if (strncmp(element, next, item_size) > 0) {
NPY_END_THREADS;
Py_RETURN_FALSE;
}
i += item_size;
}
NPY_END_THREADS;
}
Py_RETURN_TRUE;
}


// npy_half stores the raw IEEE 754 binary16 bit pattern in an unsigned 16-bit
// integer, so comparing two values with `>` orders the bits, not the numbers
// (every negative value would look larger than every positive one). These
// helpers compare by numeric value instead:
//   - AK_HALF_IS_NAN: exponent all ones and a non-zero mantissa.
//   - AK_HALF_ORDER_KEY: maps sign-magnitude bits to a signed integer that
//     orders like the value; +0 and -0 both map to 0.
//   - AK_COMPARE_HALF: strict "greater than"; false whenever either side is
//     NaN, matching what `>` does for the float and double cases.
# define AK_HALF_IS_NAN(h) ((((h)) & 0x7FFFu) > 0x7C00u)
# define AK_HALF_ORDER_KEY(h) \
    ((((h)) & 0x8000u) ? -(int)(((h)) & 0x7FFFu) : (int)(((h)) & 0x7FFFu))
# define AK_COMPARE_HALF(a, b) \
    (!AK_HALF_IS_NAN(a) && !AK_HALF_IS_NAN(b) \
     && (AK_HALF_ORDER_KEY(a) > AK_HALF_ORDER_KEY(b)))

/*
 * is_sorted(array) -> bool
 *
 * Report whether a 1-D NumPy array is in non-descending order, i.e. no element
 * is strictly greater than its successor.
 *
 * Args:
 *   arg - the object passed from Python (the function is registered with
 *         METH_O). AK_CHECK_NUMPY_ARRAY validates it; presumably that macro
 *         sets an exception and returns NULL for non-arrays - see its
 *         definition elsewhere in this file.
 *
 * Returns:
 *   A new reference to Py_True or Py_False, or NULL with ValueError set when
 *   the array is not 1-dimensional or its dtype is not handled below.
 *
 * Each numeric case expands AK_IS_SORTED, which returns from this function, so
 * control never falls from one case into the next. Complex values are ordered
 * by real part, then imaginary part.
 *
 * NOTE(review): elements are read as native C values; nothing here checks for
 * byte-swapped or unaligned arrays - confirm whether callers can pass them.
 */
static PyObject *
is_sorted(PyObject *Py_UNUSED(m), PyObject *arg)
{
    AK_CHECK_NUMPY_ARRAY(arg);
    PyArrayObject *arr = (PyArrayObject*)arg;

    if (PyArray_NDIM(arr) != 1) {
        PyErr_SetString(PyExc_ValueError, "Array must be 1-dimensional");
        return NULL;
    }

    // Both names are read by the AK_IS_SORTED expansions below.
    bool contiguous = (bool)PyArray_IS_C_CONTIGUOUS(arr);
    size_t arr_size = (size_t)PyArray_SIZE(arr);

    switch (PyArray_TYPE(arr)) {
        case NPY_BOOL:
            AK_IS_SORTED(npy_bool, AK_COMPARE_SIMPLE)
        case NPY_BYTE:
            AK_IS_SORTED(npy_byte, AK_COMPARE_SIMPLE)
        case NPY_UBYTE:
            AK_IS_SORTED(npy_ubyte, AK_COMPARE_SIMPLE)
        case NPY_SHORT:
            AK_IS_SORTED(npy_short, AK_COMPARE_SIMPLE)
        case NPY_USHORT:
            AK_IS_SORTED(npy_ushort, AK_COMPARE_SIMPLE)
        case NPY_INT:
            AK_IS_SORTED(npy_int, AK_COMPARE_SIMPLE)
        case NPY_UINT:
            AK_IS_SORTED(npy_uint, AK_COMPARE_SIMPLE)
        case NPY_LONG:
            AK_IS_SORTED(npy_long, AK_COMPARE_SIMPLE)
        case NPY_ULONG:
            AK_IS_SORTED(npy_ulong, AK_COMPARE_SIMPLE)
        case NPY_LONGLONG:
            AK_IS_SORTED(npy_longlong, AK_COMPARE_SIMPLE)
        case NPY_ULONGLONG:
            AK_IS_SORTED(npy_ulonglong, AK_COMPARE_SIMPLE)
        case NPY_FLOAT:
            AK_IS_SORTED(npy_float, AK_COMPARE_SIMPLE)
        case NPY_DOUBLE:
            AK_IS_SORTED(npy_double, AK_COMPARE_SIMPLE)

        # ifdef PyFloat128ArrType_Type
        case NPY_LONGDOUBLE:
            AK_IS_SORTED(npy_longdouble, AK_COMPARE_SIMPLE)
        # endif

        case NPY_DATETIME:
            AK_IS_SORTED(npy_datetime, AK_COMPARE_SIMPLE)
        case NPY_TIMEDELTA:
            AK_IS_SORTED(npy_timedelta, AK_COMPARE_SIMPLE)
        case NPY_HALF:
            // Raw half bits do not order like the values they encode.
            AK_IS_SORTED(npy_half, AK_COMPARE_HALF)
        case NPY_CFLOAT:
            AK_IS_SORTED(npy_complex64, AK_COMPARE_COMPLEX)
        case NPY_CDOUBLE:
            AK_IS_SORTED(npy_complex128, AK_COMPARE_COMPLEX)

        # ifdef PyComplex256ArrType_Type
        case NPY_CLONGDOUBLE:
            AK_IS_SORTED(npy_complex256, AK_COMPARE_COMPLEX)
        # endif

        case NPY_STRING:
        case NPY_UNICODE:
            if (!AK_is_sorted_string(arr, contiguous, arr_size)) {
                Py_RETURN_FALSE;
            }
            Py_RETURN_TRUE;

        default:
            // Includes NPY_OBJECT: an element-wise PyObject_RichCompareBool
            // scan was tried and left out because its performance was poor.
            PyErr_Format(PyExc_ValueError,
                    "Unsupported dtype: %s",
                    PyArray_DESCR(arr)->typeobj->tp_name
                    );
            return NULL;
    }
    // Every branch of the switch returns.
    Py_UNREACHABLE();
}

//------------------------------------------------------------------------------
// ArrayGO
//------------------------------------------------------------------------------
Expand Down Expand Up @@ -4364,6 +4539,7 @@ static PyMethodDef arraykit_methods[] = {
METH_VARARGS | METH_KEYWORDS,
NULL},
{"dtype_from_element", dtype_from_element, METH_O, NULL},
{"is_sorted", is_sorted, METH_O, NULL},
{"get_new_indexers_and_screen",
(PyCFunction)get_new_indexers_and_screen,
METH_VARARGS | METH_KEYWORDS,
Expand Down
Loading