diff --git a/cpp/src/arrow/c/bridge.h b/cpp/src/arrow/c/bridge.h index f74995501166b..294f53e49fb55 100644 --- a/cpp/src/arrow/c/bridge.h +++ b/cpp/src/arrow/c/bridge.h @@ -29,6 +29,10 @@ namespace arrow { +/// \defgroup c-data-interface Functions for working with the C data interface. +/// +/// @{ + /// \brief Export C++ DataType using the C data interface format. /// /// The root type is considered to have empty name and metadata. @@ -160,8 +164,17 @@ ARROW_EXPORT Result> ImportRecordBatch(struct ArrowArray* array, struct ArrowSchema* schema); +/// @} + +/// \defgroup c-stream-interface Functions for working with the C stream interface. +/// +/// @{ + /// \brief EXPERIMENTAL: Export C++ RecordBatchReader using the C stream interface. /// +/// The resulting ArrowArrayStream struct keeps the record batch reader alive +/// until its release callback is called by the consumer. +/// /// \param[in] reader RecordBatchReader object to export /// \param[out] out C struct where to export the stream ARROW_EXPORT @@ -170,10 +183,15 @@ Status ExportRecordBatchReader(std::shared_ptr reader, /// \brief EXPERIMENTAL: Import C++ RecordBatchReader from the C stream interface. /// +/// The ArrowArrayStream struct has its contents moved to a private object +/// held alive by the resulting record batch reader.
+/// /// \param[in,out] stream C stream interface struct /// \return Imported RecordBatchReader object ARROW_EXPORT Result> ImportRecordBatchReader( struct ArrowArrayStream* stream); +/// @} + } // namespace arrow diff --git a/docs/source/cpp/api.rst b/docs/source/cpp/api.rst index 59d221012d36e..626b388b66458 100644 --- a/docs/source/cpp/api.rst +++ b/docs/source/cpp/api.rst @@ -29,6 +29,7 @@ API Reference api/scalar api/builder api/table + api/c_abi api/compute api/tensor api/utilities diff --git a/docs/source/cpp/api/c_abi.rst b/docs/source/cpp/api/c_abi.rst new file mode 100644 index 0000000000000..1e05bd8fcdb0c --- /dev/null +++ b/docs/source/cpp/api/c_abi.rst @@ -0,0 +1,44 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +============ +C Interfaces +============ + +ABI Structures +============== + +.. doxygenstruct:: ArrowSchema + :project: arrow_cpp + +.. doxygenstruct:: ArrowArray + :project: arrow_cpp + +.. doxygenstruct:: ArrowArrayStream + :project: arrow_cpp + +C Data Interface +================ + +.. doxygengroup:: c-data-interface + :content-only: + +C Stream Interface +================== + +.. 
doxygengroup:: c-stream-interface + :content-only: diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index 768dc47114498..dbecf307db990 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -535,6 +535,8 @@ Therefore, the consumer MUST not try to interfere with the producer's handling of these members' lifetime. The only way the consumer influences data lifetime is by calling the base structure's ``release`` callback. +.. _c-data-interface-released: + Released structure '''''''''''''''''' diff --git a/docs/source/format/CStreamInterface.rst b/docs/source/format/CStreamInterface.rst new file mode 100644 index 0000000000000..b8ccce3559238 --- /dev/null +++ b/docs/source/format/CStreamInterface.rst @@ -0,0 +1,218 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. highlight:: c + +.. _c-stream-interface: + +============================ +The Arrow C stream interface +============================ + +.. warning:: + This interface is experimental and may evolve based on feedback from + early users. ABI stability is not guaranteed yet. Feel free to + `contact us <https://arrow.apache.org/community/>`__.
+ +The C stream interface builds on the structures defined in the +:ref:`C data interface <c-data-interface>` and combines them into a higher-level +specification so as to ease the communication of streaming data within a single +process. + +Semantics +========= + +An Arrow C stream exposes a streaming source of data chunks, each with the +same schema. Chunks are obtained by calling a blocking pull-style iteration +function. + +Structure definition +==================== + +The C stream interface is defined by a single ``struct`` definition:: + + struct ArrowArrayStream { + // Callbacks providing stream functionality + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback + void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; + }; + +The ArrowArrayStream structure +------------------------------ + +The ``ArrowArrayStream`` provides the required callbacks to interact with a +streaming source of Arrow arrays. It has the following fields: + +.. c:member:: int (*ArrowArrayStream.get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out) + + *Mandatory.* This callback allows the consumer to query the schema of + the chunks of data in the stream. The schema is the same for all + data chunks. + + This callback must NOT be called on a released ``ArrowArrayStream``. + + *Return value:* 0 on success, a non-zero + :ref:`error code <c-stream-interface-error-codes>` otherwise. + +.. c:member:: int (*ArrowArrayStream.get_next)(struct ArrowArrayStream*, struct ArrowArray* out) + + *Mandatory.* This callback allows the consumer to get the next chunk + of data in the stream. + + This callback must NOT be called on a released ``ArrowArrayStream``. + + *Return value:* 0 on success, a non-zero + :ref:`error code <c-stream-interface-error-codes>` otherwise.
+ + On success, the consumer must check whether the ``ArrowArray`` is + marked :ref:`released <c-data-interface-released>`. If the + ``ArrowArray`` is released, then the end of stream has been reached. + Otherwise, the ``ArrowArray`` contains a valid data chunk. + +.. c:member:: const char* (*ArrowArrayStream.get_last_error)(struct ArrowArrayStream*) + + *Mandatory.* This callback allows the consumer to get a textual description + of the last error. + + This callback must ONLY be called if the last operation on the + ``ArrowArrayStream`` returned an error. It must NOT be called on a + released ``ArrowArrayStream``. + + *Return value:* a pointer to a NULL-terminated character string (UTF-8 encoded). + NULL can also be returned if no detailed description is available. + + The returned pointer is only guaranteed to be valid until the next call of + one of the stream's callbacks. The character string it points to should + be copied to consumer-managed storage if it is intended to survive longer. + +.. c:member:: void (*ArrowArrayStream.release)(struct ArrowArrayStream*) + + *Mandatory.* A pointer to a producer-provided release callback. + +.. c:member:: void* ArrowArrayStream.private_data + + *Optional.* An opaque pointer to producer-provided private data. + + Consumers MUST NOT process this member. Lifetime of this member + is handled by the producer, and especially by the release callback. + + +.. _c-stream-interface-error-codes: + +Error codes +----------- + +The ``get_schema`` and ``get_next`` callbacks may return an error in the form +of a non-zero integer code. Such error codes should be interpreted like +``errno`` numbers (as defined by the local platform). Note that the symbolic +forms of these constants are stable from platform to platform, but their numeric +values are platform-specific.
+ +In particular, it is recommended to recognize the following values: + +* ``EINVAL``: for a parameter or input validation error +* ``ENOMEM``: for a memory allocation failure (out of memory) +* ``EIO``: for a generic input/output error + +.. seealso:: + `Standard POSIX error codes <https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/errno.h.html>`__. + + `Error codes recognized by the Windows C runtime library + <https://docs.microsoft.com/en-us/cpp/c-runtime-library/errno-doserrno-sys-errlist-and-sys-nerr>`__. + +Result lifetimes +---------------- + +The data returned by the ``get_schema`` and ``get_next`` callbacks must be +released independently. Their lifetimes are not tied to that of the +``ArrowArrayStream``. + +Stream lifetime +--------------- + +Lifetime of the C stream is managed using a release callback with similar +usage as in the :ref:`C data interface <c-data-interface-released>`. + + +C consumer example +================== + +Let's say a particular database provides the following C API to execute +a SQL query and return the result set as an Arrow C stream:: + + void MyDB_Query(const char* query, struct ArrowArrayStream* result_set); + +Then a consumer could use the following code to iterate over the results:: + + static void handle_error(int errcode, struct ArrowArrayStream* stream) { + // Print stream error + const char* errdesc = stream->get_last_error(stream); + if (errdesc != NULL) { + fputs(errdesc, stderr); + } else { + fputs(strerror(errcode), stderr); + } + // Release stream and abort + stream->release(stream); + exit(1); + } + + void run_query() { + struct ArrowArrayStream stream; + struct ArrowSchema schema; + struct ArrowArray chunk; + int errcode; + + MyDB_Query("SELECT * FROM my_table", &stream); + + // Query result set schema + errcode = stream.get_schema(&stream, &schema); + if (errcode != 0) { + handle_error(errcode, &stream); + } + + int64_t num_rows = 0; + + // Iterate over results: loop until error or end of stream + while ((errcode = stream.get_next(&stream, &chunk)) == 0 && + chunk.release != NULL) { + // Do something with chunk...
+ fprintf(stderr, "Result chunk: got %lld rows\n", (long long) chunk.length); + num_rows += chunk.length; + + // Release chunk + chunk.release(&chunk); + } + + // Was it an error? + if (errcode != 0) { + handle_error(errcode, &stream); + } + + fprintf(stderr, "Result stream ended: total %lld rows\n", (long long) num_rows); + + // Release schema and stream + schema.release(&schema); + stream.release(&stream); + } diff --git a/docs/source/index.rst b/docs/source/index.rst index 2d95e22f16ac5..cfcf865398227 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -43,6 +43,7 @@ such topics as: format/Flight format/Integration format/CDataInterface + format/CStreamInterface format/Other .. _toc.usage: