From 5f91f93a5c24d511e8c6a8305a90f66f5ec3abff Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sun, 21 Nov 2021 07:36:13 -0800 Subject: [PATCH 1/7] preliminary scaffolding for array_bytes_to_file --- src/__init__.py | 1 + src/__init__.pyi | 17 +++++++++-------- src/_arraykit.c | 23 +++++++++++++++++++++++ test/test_util.py | 11 ++++++++++- 4 files changed, 43 insertions(+), 9 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index 988ca110..4d68f5e1 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -16,3 +16,4 @@ from ._arraykit import resolve_dtype_iter as resolve_dtype_iter from ._arraykit import isna_element as isna_element from ._arraykit import dtype_from_element as dtype_from_element +from ._arraykit import array_bytes_to_file as array_bytes_to_file diff --git a/src/__init__.pyi b/src/__init__.pyi index 4ff12eb9..1f7006a4 100644 --- a/src/__init__.pyi +++ b/src/__init__.pyi @@ -8,7 +8,7 @@ __version__: str class ArrayGO: - values: np.array + values: np.ndarray def __init__( self, iterable: tp.Iterable[object], *, own_iterable: bool = ... ) -> None: ... @@ -20,16 +20,17 @@ class ArrayGO: def copy(self: _T) -> _T: ... def extend(self, __values: tp.Iterable[object]) -> None: ... -def immutable_filter(__array: np.array) -> np.array: ... -def mloc(__array: np.array) -> int: ... +def immutable_filter(__array: np.ndarray) -> np.ndarray: ... +def mloc(__array: np.ndarray) -> int: ... def name_filter(__name: tp.Hashable) -> tp.Hashable: ... -def shape_filter(__array: np.array) -> np.ndarray: ... -def column_2d_filter(__array: np.array) -> np.ndarray: ... -def column_1d_filter(__array: np.array) -> np.ndarray: ... -def row_1d_filter(__array: np.array) -> np.ndarray: ... -def array_deepcopy(__array: np.array, memo: tp.Dict[int, tp.Any]) -> np.ndarray: ... +def shape_filter(__array: np.ndarray) -> np.ndarray: ... +def column_2d_filter(__array: np.ndarray) -> np.ndarray: ... +def column_1d_filter(__array: np.ndarray) -> np.ndarray: ... +def row_1d_filter(__array: np.ndarray) -> np.ndarray: ... +def array_deepcopy(__array: np.ndarray, memo: tp.Dict[int, tp.Any]) -> np.ndarray: ... def resolve_dtype(__d1: np.dtype, __d2: np.dtype) -> np.dtype: ... def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ... def isna_element(__value: tp.Any) -> bool: ... def dtype_from_element(__value: tp.Optional[tp.Hashable]) -> np.dtype: ... +def array_bytes_to_file(__array: np.ndarray, __file: tp.IO) -> int: ... diff --git a/src/_arraykit.c b/src/_arraykit.c index 9912cea2..ffffcec7 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -342,6 +342,28 @@ array_deepcopy(PyObject *m, PyObject *args, PyObject *kwargs) return AK_ArrayDeepCopy(m, (PyArrayObject*)array, memo); } + + +// Wites array bytes to an open, writeable file. Possibly return number of bytes written. +static PyObject * +array_bytes_to_file(PyObject *Py_UNUSED(m), PyObject *args) +{ + PyObject *array; + PyObject *file; + if (!PyArg_ParseTuple(args, "OO:array_bytes_to_file", + &array, &file)) // how to validate file type? + { + return NULL; + } + AK_CHECK_NUMPY_ARRAY(array); + + PyObject *post = PyLong_FromLong(3); // temp + if (!post) { + return NULL; + } + return post; +} + //------------------------------------------------------------------------------ // type resolution @@ -769,6 +791,7 @@ static PyMethodDef arraykit_methods[] = { (PyCFunction)array_deepcopy, METH_VARARGS | METH_KEYWORDS, NULL}, + {"array_bytes_to_file", array_bytes_to_file, METH_VARARGS, NULL}, {"resolve_dtype", resolve_dtype, METH_VARARGS, NULL}, {"resolve_dtype_iter", resolve_dtype_iter, METH_O, NULL}, {"isna_element", isna_element, METH_O, NULL}, diff --git a/test/test_util.py b/test/test_util.py index 045cf34a..09e76bb5 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -17,6 +17,7 @@ from arraykit import array_deepcopy from arraykit import isna_element from arraykit import dtype_from_element +from arraykit import array_bytes_to_file from performance.reference.util import mloc as mloc_ref @@ -147,7 +148,7 @@ def test_shape_filter_a(self) -> None: # zero dimension shape_filter(np.array(1)) - + #--------------------------------------------------------------------------- def test_column_2d_filter_a(self) -> None: @@ -382,6 +383,14 @@ def test_dtype_from_element_str_and_bytes_dtypes(self) -> None: self.assertEqual(np.dtype(f'|S{size}'), dtype_from_element(bytes(size))) self.assertEqual(np.dtype(f' None: + + a1 = np.array([3, 4, 5]) + with open('/tmp/tmp.txt', 'w') as f: + post = array_bytes_to_file(a1, f) + self.assertTrue(post > 0) + # import ipdb; ipdb.set_trace() if __name__ == '__main__': unittest.main() From c0388994110453d593ddc0ba1c4fef7bdd43878f Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Sun, 21 Nov 2021 08:00:32 -0800 Subject: [PATCH 2/7] added proper temp file creation --- test/test_util.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/test/test_util.py b/test/test_util.py index 09e76bb5..4bd24a85 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -3,6 +3,12 @@ import datetime import unittest import itertools +import typing as tp +from contextlib import contextmanager +import os +from os import PathLike +from pathlib import Path +import tempfile import numpy as np # type: ignore @@ -21,6 +27,27 @@ from performance.reference.util import mloc as mloc_ref +PathSpecifier = tp.Union[str, PathLike] + +@contextmanager +def temp_file(suffix: tp.Optional[str] = None, + path: bool = False + ) -> tp.Iterator[PathSpecifier]: + try: + with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f: + tmp_name = f.name + if path: + yield Path(tmp_name) + else: + yield tmp_name + finally: + if os.path.exists(tmp_name): + try: + os.unlink(tmp_name) + except PermissionError: # happens on Windows sometimes + pass + + class TestUnit(unittest.TestCase): @@ -387,10 +414,11 @@ def test_dtype_from_element_str_and_bytes_dtypes(self) -> None: def test_array_bytes_to_file_a(self) -> None: a1 = np.array([3, 4, 5]) - with open('/tmp/tmp.txt', 'w') as f: - post = array_bytes_to_file(a1, f) - self.assertTrue(post > 0) - # import ipdb; ipdb.set_trace() + with temp_file('.npy') as fp: + with open(fp, 'wb') as f: + post = array_bytes_to_file(a1, f) + self.assertTrue(post > 0) + # import ipdb; ipdb.set_trace() if __name__ == '__main__': unittest.main() From a354d7696a3da5ffd89477552b725e184ccf2980 Mon Sep 17 00:00:00 2001 From: flexatone Date: Mon, 22 Nov 2021 16:39:10 -0800 Subject: [PATCH 3/7] notes on PyMemoryView_FromMemory and PyArray_ToFile implementations --- src/_arraykit.c | 48 +++++++++++++++++++++++++++++++++++++++++++++-- test/test_util.py | 2 ++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index ffffcec7..721769a3 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -344,12 +344,14 @@ array_deepcopy(PyObject *m, PyObject *args, PyObject *kwargs) -// Wites array bytes to an open, writeable file. Possibly return number of bytes written. +// Wites array bytes to an open, writeable file. Possibly return number of bytes written. This is similar to what tofile() does but tofile() cannot be used on a _ZipWriteFile when writing into a zip (raises io.UnsupportedOperation: fileno) static PyObject * array_bytes_to_file(PyObject *Py_UNUSED(m), PyObject *args) { + PyObject *array; PyObject *file; + if (!PyArg_ParseTuple(args, "OO:array_bytes_to_file", &array, &file)) // how to validate file type? { @@ -357,11 +359,53 @@ array_bytes_to_file(PyObject *Py_UNUSED(m), PyObject *args) } AK_CHECK_NUMPY_ARRAY(array); - PyObject *post = PyLong_FromLong(3); // temp + PyObject *write_func = PyObject_GetAttrString(file, "write"); + if (!write_func) { + goto error; + } + // PyObject *_ = PyObject_CallFunctionObjArgs(fileno, NULL); + + // this is what PyArray_ToFile to does + // if (PyArray_ISCONTIGUOUS(array)) { + // size = PyArray_SIZE(array); + // NPY_BEGIN_ALLOW_THREADS; + // n = fwrite((const void *)PyArray_DATA(array), + // (size_t) PyArray_DESCR(array)->elsize, + // (size_t) size, + // file); + // NPY_END_ALLOW_THREADS; + // if (n < size) { + // PyErr_Format(PyExc_OSError, + // "%ld requested and %ld written", + // (long) size, (long) n); + // return NULL; + // } + // } + + // can create memory view object and pass this to the write method + // PyObject *PyMemoryView_FromMemory(char *mem, Py_ssize_t size, int flags) + // PyObject *PyMemoryView_GetContiguous(PyObject *obj, int buffertype, char order) + + PyArrayIterObject *it = (PyArrayIterObject *) PyArray_IterNew(array); + while (it->index < it->size) { + // fwrite((const void *)it->dataptr, + // (size_t) PyArray_DESCR(self)->elsize, + // 1, fp) + PyArray_ITER_NEXT(it); + } + Py_DECREF(it); + Py_DECREF(write_func); + + // dummy return + PyObject *post = PyLong_FromLong(3); if (!post) { return NULL; } return post; + +error: + return NULL; + } //------------------------------------------------------------------------------ diff --git a/test/test_util.py b/test/test_util.py index 4bd24a85..42bd79f8 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -420,5 +420,7 @@ def test_array_bytes_to_file_a(self) -> None: self.assertTrue(post > 0) # import ipdb; ipdb.set_trace() + # import ipdb; ipdb.set_trace() + if __name__ == '__main__': unittest.main() From 08286e11acf275d07631e3774ff4eb1656337400 Mon Sep 17 00:00:00 2001 From: flexatone Date: Mon, 22 Nov 2021 16:45:59 -0800 Subject: [PATCH 4/7] notes on PyArray_ToString implementation --- src/_arraykit.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/_arraykit.c b/src/_arraykit.c index 721769a3..afe971f7 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -387,6 +387,9 @@ array_bytes_to_file(PyObject *Py_UNUSED(m), PyObject *args) // PyObject *PyMemoryView_GetContiguous(PyObject *obj, int buffertype, char order) PyArrayIterObject *it = (PyArrayIterObject *) PyArray_IterNew(array); + if (it == NULL) { + return NULL; + } while (it->index < it->size) { // fwrite((const void *)it->dataptr, // (size_t) PyArray_DESCR(self)->elsize, @@ -408,6 +411,25 @@ array_bytes_to_file(PyObject *Py_UNUSED(m), PyObject *args) } +// from PyArray_ToString: create an empty bytes object and write to it + + // ret = PyBytes_FromStringAndSize(NULL, (Py_ssize_t) numbytes); + // if (ret == NULL) { + // Py_DECREF(it); + // return NULL; + // } + // dptr = PyBytes_AS_STRING(ret); + // i = it->size; + // elsize = PyArray_DESCR(self)->elsize; + // while (i--) { + // memcpy(dptr, it->dataptr, elsize); + // dptr += elsize; + // PyArray_ITER_NEXT(it); + // } + // Py_DECREF(it); + + + //------------------------------------------------------------------------------ // type resolution From 79d8472e7599d75a499efe52ea7eb09c86e133d6 Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Mon, 22 Nov 2021 19:46:26 -0800 Subject: [PATCH 5/7] progress on array_bytes_to_file --- src/_arraykit.c | 19 ++++++++++++++++--- test/test_util.py | 11 ++++++++--- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/_arraykit.c b/src/_arraykit.c index afe971f7..a46779bd 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -390,11 +390,19 @@ array_bytes_to_file(PyObject *Py_UNUSED(m), PyObject *args) if (it == NULL) { return NULL; } + + PyObject *mv; + PyObject *ret; + + size_t elsize = PyArray_DESCR(array)->elsize; + while (it->index < it->size) { - // fwrite((const void *)it->dataptr, - // (size_t) PyArray_DESCR(self)->elsize, - // 1, fp) + mv = PyMemoryView_FromMemory(it->dataptr, elsize, 0); + ret = PyObject_CallFunctionObjArgs(write_func, mv, NULL); + PyArray_ITER_NEXT(it); + Py_DECREF(mv); + Py_DECREF(ret); } Py_DECREF(it); Py_DECREF(write_func); @@ -413,6 +421,11 @@ array_bytes_to_file(PyObject *Py_UNUSED(m), PyObject *args) // from PyArray_ToString: create an empty bytes object and write to it + // fwrite((const void *)it->dataptr, + // (size_t) PyArray_DESCR(self)->elsize, + // 1, fp) + + // ret = PyBytes_FromStringAndSize(NULL, (Py_ssize_t) numbytes); // if (ret == NULL) { // Py_DECREF(it); diff --git a/test/test_util.py b/test/test_util.py index 42bd79f8..6a16bedf 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -416,11 +416,16 @@ def test_array_bytes_to_file_a(self) -> None: a1 = np.array([3, 4, 5]) with temp_file('.npy') as fp: with open(fp, 'wb') as f: - post = array_bytes_to_file(a1, f) - self.assertTrue(post > 0) + count = array_bytes_to_file(a1, f) + self.assertTrue(count > 0) # import ipdb; ipdb.set_trace() - # import ipdb; ipdb.set_trace() + with open(fp, 'r') as f: + a2 = np.fromfile(f, dtype=a1.dtype) + self.assertTrue((a1 == a2).all()) + # print(a2) + # import ipdb; ipdb.set_trace() + pass if __name__ == '__main__': unittest.main() From b08a4e308fa5ed74731e31b15fa9ab3f265f4de8 Mon Sep 17 00:00:00 2001 From: flexatone Date: Tue, 23 Nov 2021 20:05:23 -0800 Subject: [PATCH 6/7] refinements to array_bytes_to_file --- performance/reference/util.py | 27 ++++++++++++++++ src/_arraykit.c | 61 ++++++++++++++--------------------- 2 files changed, 52 insertions(+), 36 deletions(-) diff --git a/performance/reference/util.py b/performance/reference/util.py index 0f2d0efc..f2a0761d 100644 --- a/performance/reference/util.py +++ b/performance/reference/util.py @@ -216,3 +216,30 @@ def dtype_from_element(value: tp.Optional[tp.Hashable]) -> np.dtype: # NOTE: calling array and getting dtype on np.nan is faster than combining isinstance, isnan calls return np.array(value).dtype + +NDITER_FLAGS = ('external_loop', 'buffered', 'zerosize_ok') +BUFFERSIZE_NUMERATOR = 16 * 1024 ** 2 +# for 8 bytes this would give 2,097,152 bytes + +def array_bytes_to_file( + array: np.ndarray, + file: tp.BinaryIO, + ): + buffersize = max(BUFFERSIZE_NUMERATOR // array.itemsize, 1) + flags = array.flags + if flags.f_contiguous and not flags.c_contiguous: + for chunk in np.nditer( + array, + flags=NDITER_FLAGS, + buffersize=buffersize, + order='F', + ): + file.write(chunk.tobytes('C')) + else: + for chunk in np.nditer( + array, + flags=NDITER_FLAGS, + buffersize=buffersize, + order='C', + ): + file.write(chunk.tobytes('C')) \ No newline at end of file diff --git a/src/_arraykit.c b/src/_arraykit.c index a46779bd..b3d5ba11 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -363,48 +363,33 @@ array_bytes_to_file(PyObject *Py_UNUSED(m), PyObject *args) if (!write_func) { goto error; } - // PyObject *_ = PyObject_CallFunctionObjArgs(fileno, NULL); - - // this is what PyArray_ToFile to does - // if (PyArray_ISCONTIGUOUS(array)) { - // size = PyArray_SIZE(array); - // NPY_BEGIN_ALLOW_THREADS; - // n = fwrite((const void *)PyArray_DATA(array), - // (size_t) PyArray_DESCR(array)->elsize, - // (size_t) size, - // file); - // NPY_END_ALLOW_THREADS; - // if (n < size) { - // PyErr_Format(PyExc_OSError, - // "%ld requested and %ld written", - // (long) size, (long) n); - // return NULL; - // } - // } - - // can create memory view object and pass this to the write method - // PyObject *PyMemoryView_FromMemory(char *mem, Py_ssize_t size, int flags) - // PyObject *PyMemoryView_GetContiguous(PyObject *obj, int buffertype, char order) - - PyArrayIterObject *it = (PyArrayIterObject *) PyArray_IterNew(array); - if (it == NULL) { - return NULL; - } - PyObject *mv; PyObject *ret; + size_t elsize = PyArray_DESCR((PyArrayObject*)array)->elsize; - size_t elsize = PyArray_DESCR(array)->elsize; - - while (it->index < it->size) { - mv = PyMemoryView_FromMemory(it->dataptr, elsize, 0); + // this is what PyArray_ToFile to does + if (PyArray_ISCONTIGUOUS((PyArrayObject*)array)) { + npy_intp size = PyArray_SIZE((PyArrayObject*)array); + // might use PyMemoryView_GetContiguous + mv = PyMemoryView_FromMemory(PyArray_DATA((PyArrayObject*)array), size * elsize, 0); ret = PyObject_CallFunctionObjArgs(write_func, mv, NULL); - - PyArray_ITER_NEXT(it); Py_DECREF(mv); - Py_DECREF(ret); + Py_DECREF(ret); } + else { + PyArrayIterObject *it = (PyArrayIterObject *) PyArray_IterNew(array); + if (it == NULL) { + return NULL; + } + while (it->index < it->size) { + mv = PyMemoryView_FromMemory(it->dataptr, elsize, 0); + ret = PyObject_CallFunctionObjArgs(write_func, mv, NULL); + + PyArray_ITER_NEXT(it); + Py_DECREF(mv); + Py_DECREF(ret); + } + Py_DECREF(it); } - Py_DECREF(it); Py_DECREF(write_func); // dummy return @@ -419,6 +404,10 @@ array_bytes_to_file(PyObject *Py_UNUSED(m), PyObject *args) } + // can create memory view object and pass this to the write method + // PyObject *PyMemoryView_FromMemory(char *mem, Py_ssize_t size, int flags) + // PyObject *PyMemoryView_GetContiguous(PyObject *obj, int buffertype, char order) + // from PyArray_ToString: create an empty bytes object and write to it // fwrite((const void *)it->dataptr, From 177fffe333e5ac6c5813818763319be2dee3e92a Mon Sep 17 00:00:00 2001 From: Christopher Ariza Date: Fri, 21 Oct 2022 13:08:53 -0700 Subject: [PATCH 7/7] merged from master --- performance/reference/util.py | 7 +------ src/__init__.py | 3 --- test/test_util.py | 12 ------------ 3 files changed, 1 insertion(+), 21 deletions(-) diff --git a/performance/reference/util.py b/performance/reference/util.py index 8f501933..10903762 100644 --- a/performance/reference/util.py +++ b/performance/reference/util.py @@ -217,7 +217,6 @@ def dtype_from_element(value: tp.Optional[tp.Hashable]) -> np.dtype: return np.array(value).dtype -<<<<<<< HEAD NDITER_FLAGS = ('external_loop', 'buffered', 'zerosize_ok') BUFFERSIZE_NUMERATOR = 16 * 1024 ** 2 # for 8 bytes this would give 2,097,152 bytes @@ -244,7 +243,7 @@ def array_bytes_to_file( order='C', ): file.write(chunk.tobytes('C')) -======= + def get_new_indexers_and_screen_ref( indexers: np.ndarray, positions: np.ndarray, @@ -288,7 +287,3 @@ def count_iteration(iterable: tp.Iterable): count += 1 return count - - - ->>>>>>> master diff --git a/src/__init__.py b/src/__init__.py index 13db8d74..cdb600be 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -16,12 +16,9 @@ from ._arraykit import resolve_dtype_iter as resolve_dtype_iter from ._arraykit import isna_element as isna_element from ._arraykit import dtype_from_element as dtype_from_element -<<<<<<< HEAD from ._arraykit import array_bytes_to_file as array_bytes_to_file -======= from ._arraykit import delimited_to_arrays as delimited_to_arrays from ._arraykit import iterable_str_to_array_1d as iterable_str_to_array_1d from ._arraykit import get_new_indexers_and_screen as get_new_indexers_and_screen from ._arraykit import split_after_count as split_after_count from ._arraykit import count_iteration as count_iteration ->>>>>>> master diff --git a/test/test_util.py b/test/test_util.py index 7e7f5018..98a1252f 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -2,7 +2,6 @@ import collections import datetime import unittest -<<<<<<< HEAD import itertools import typing as tp from contextlib import contextmanager @@ -11,10 +10,8 @@ from pathlib import Path import tempfile -======= import warnings from io import StringIO ->>>>>>> master import numpy as np # type: ignore from arraykit import resolve_dtype @@ -28,15 +25,12 @@ from arraykit import array_deepcopy from arraykit import isna_element from arraykit import dtype_from_element -<<<<<<< HEAD from arraykit import array_bytes_to_file -======= from arraykit import split_after_count from arraykit import count_iteration from performance.reference.util import get_new_indexers_and_screen_ak as get_new_indexers_and_screen_full from arraykit import get_new_indexers_and_screen ->>>>>>> master from performance.reference.util import mloc as mloc_ref @@ -188,10 +182,6 @@ def test_shape_filter_a(self) -> None: # zero dimension shape_filter(np.array(1)) -<<<<<<< HEAD - -======= ->>>>>>> master #--------------------------------------------------------------------------- def test_column_2d_filter_a(self) -> None: @@ -565,7 +555,5 @@ def test_count_iteration_b(self) -> None: self.assertEqual(post, 5) ->>>>>>> master - if __name__ == '__main__': unittest.main()