extend series

pola-rs · Feb 4, 2022 · eea2710 · eea2710
1 parent 554d571
commit eea2710
Show file tree

Hide file tree

Showing 15 changed files with 231 additions and 67 deletions.
diff --git a/polars/polars-core/src/chunked_array/ops/extend.rs b/polars/polars-core/src/chunked_array/ops/extend.rs
@@ -15,8 +15,8 @@ fn extend_immutable(immutable: &dyn Array, chunks: &mut Vec<ArrayRef>, other_chu
 }
 
 impl<T> ChunkedArray<T>
-    where
-        T: PolarsNumericType,
+where
+    T: PolarsNumericType,
 {
     /// Extend the memory backed by this array with the values from `other`.
     ///
@@ -30,13 +30,14 @@ impl<T> ChunkedArray<T>
     /// online operations where you add `n` rows and rerun a query.
     ///
     /// Prefer `append` over `extend` when you want to append many times before doing a query. For instance
-    /// when you read in multiple files and when to store them in a single `DataFrame`. Finish the sequence
-    /// of `append` operations with a [`rechunk`](Self::rechunk).
+    /// when you read in multiple files and want to store them in a single `DataFrame`.
+    /// In the latter case finish the sequence of `append` operations with a [`rechunk`](Self::rechunk).
     pub fn extend(&mut self, other: &Self) {
-        // make sure that we are a single chunk already
+        // multiple chunks: append `other`, then rechunk everything into a single chunk
         if self.chunks.len() > 1 {
-            self.rechunk();
-            self.extend(other)
+            self.append(other);
+            *self = self.rechunk();
+            return;
         }
         // Depending on the state of the underlying arrow array we
         // might be able to get a `MutablePrimitiveArray`
@@ -48,7 +49,7 @@ impl<T> ChunkedArray<T>
         let arr = self.downcast_iter().next().unwrap();
 
         // increments 1
-        let mut arr = arr.clone();
+        let arr = arr.clone();
 
         // now we drop our owned ArrayRefs so that
         // decrements 1
@@ -79,15 +80,15 @@ impl<T> ChunkedArray<T>
 #[doc(hidden)]
 impl Utf8Chunked {
     pub fn extend(&mut self, other: &Self) {
-        // make sure that we are a single chunk already
         if self.chunks.len() > 1 {
-            self.rechunk();
-            self.extend(other)
+            self.append(other);
+            *self = self.rechunk();
+            return;
         }
         let arr = self.downcast_iter().next().unwrap();
 
         // increments 1
-        let mut arr = arr.clone();
+        let arr = arr.clone();
 
         // now we drop our owned ArrayRefs so that
         // decrements 1
@@ -117,13 +118,14 @@ impl BooleanChunked {
     pub fn extend(&mut self, other: &Self) {
         // make sure that we are a single chunk already
         if self.chunks.len() > 1 {
-            self.rechunk();
-            self.extend(other)
+            self.append(other);
+            *self = self.rechunk();
+            return;
         }
         let arr = self.downcast_iter().next().unwrap();
 
         // increments 1
-        let mut arr = arr.clone();
+        let arr = arr.clone();
 
         // now we drop our owned ArrayRefs so that
         // decrements 1
@@ -155,7 +157,6 @@ impl ListChunked {
         // this is harder because we don't know the inner type of the list
         self.append(other);
     }
-
 }
 
 #[cfg(test)]
@@ -189,8 +190,8 @@ mod test {
 
     #[test]
     fn test_extend_utf8() {
-        let mut ca= Utf8Chunked::new("a", &["a", "b", "c"]);
-        let to_append= Utf8Chunked::new("a", &["a", "b", "e"]);
+        let mut ca = Utf8Chunked::new("a", &["a", "b", "c"]);
+        let to_append = Utf8Chunked::new("a", &["a", "b", "e"]);
 
         ca.extend(&to_append);
         let vals = ca.into_no_null_iter().collect::<Vec<_>>();
@@ -199,8 +200,8 @@ mod test {
 
     #[test]
     fn test_extend_bool() {
-        let mut ca= BooleanChunked::new("a", [true, false]);
-        let to_append= BooleanChunked::new("a", &[false, false]);
+        let mut ca = BooleanChunked::new("a", [true, false]);
+        let to_append = BooleanChunked::new("a", &[false, false]);
 
         ca.extend(&to_append);
         let vals = ca.into_no_null_iter().collect::<Vec<_>>();

diff --git a/polars/polars-core/src/frame/mod.rs b/polars/polars-core/src/frame/mod.rs
@@ -731,6 +731,8 @@ impl DataFrame {
 
     /// Concatenate a `DataFrame` to this `DataFrame` and return as newly allocated `DataFrame`.
     ///
+    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::rechunk`].
+    ///
     /// # Example
     ///
     /// ```rust
@@ -767,14 +769,46 @@ impl DataFrame {
     /// | Palladium | 1828.05           |
     /// +-----------+-------------------+
     /// ```
-    pub fn vstack(&self, columns: &DataFrame) -> Result<Self> {
+    pub fn vstack(&self, other: &DataFrame) -> Result<Self> {
         let mut df = self.clone();
-        df.vstack_mut(columns)?;
+        df.vstack_mut(other)?;
         Ok(df)
     }
 
+    // utility to test if we can vstack/extend the columns
+    fn can_extend(&self, left: &Series, right: &Series) -> Result<()> {
+        if left.dtype() != right.dtype() || left.name() != right.name() {
+            if left.dtype() != right.dtype() {
+                return Err(PolarsError::SchemaMisMatch(
+                    format!(
+                        "cannot vstack: because column datatypes (dtypes) in the two DataFrames do not match for \
+                                left.name='{}' with left.dtype={} != right.dtype={} with right.name='{}'",
+                        left.name(),
+                        left.dtype(),
+                        right.dtype(),
+                        right.name()
+                    )
+                        .into(),
+                ));
+            } else {
+                return Err(PolarsError::SchemaMisMatch(
+                    format!(
+                        "cannot vstack: because column names in the two DataFrames do not match for \
+                                left.name='{}' != right.name='{}'",
+                        left.name(),
+                        right.name()
+                    )
+                        .into(),
+                ));
+            }
+        };
+        Ok(())
+    }
+
     /// Concatenate a DataFrame to this DataFrame
     ///
+    /// If many `vstack` operations are done, it is recommended to call [`DataFrame::rechunk`].
+    ///
     /// # Example
     ///
     /// ```rust
@@ -811,51 +845,56 @@ impl DataFrame {
     /// | Palladium | 1828.05           |
     /// +-----------+-------------------+
     /// ```
-    pub fn vstack_mut(&mut self, df: &DataFrame) -> Result<&mut Self> {
-        if self.width() != df.width() {
+    pub fn vstack_mut(&mut self, other: &DataFrame) -> Result<&mut Self> {
+        if self.width() != other.width() {
             return Err(PolarsError::ShapeMisMatch(
-                format!("Could not vertically stack DataFrame. The DataFrames appended width {} differs from the parent DataFrames width {}", self.width(), df.width()).into()
+                format!("Could not vertically stack DataFrame. The DataFrames appended width {} differs from the parent DataFrames width {}", self.width(), other.width()).into()
             ));
         }
 
         self.columns
             .iter_mut()
-            .zip(df.columns.iter())
+            .zip(other.columns.iter())
             .try_for_each(|(left, right)| {
-                if left.dtype() != right.dtype() || left.name() != right.name() {
-                    if left.dtype() != right.dtype() {
-                        return Err(PolarsError::SchemaMisMatch(
-                            format!(
-                                "cannot vstack: because column datatypes (dtypes) in the two DataFrames do not match for \
-                                left.name='{}' with left.dtype={} != right.dtype={} with right.name='{}'",
-                                left.name(),
-                                left.dtype(),
-                                right.dtype(),
-                                right.name()
-                            )
-                            .into(),
-                        ));
-                    }
-                    else {
-                        return Err(PolarsError::SchemaMisMatch(
-                            format!(
-                                "cannot vstack: because column names in the two DataFrames do not match for \
-                                left.name='{}' != right.name='{}'",
-                                left.name(),
-                                right.name()
-                            )
-                            .into(),
-                        ));
-                    }
-                }
-
+                self.can_extend(left, right)?;
                 left.append(right).expect("should not fail");
                 Ok(())
             })?;
-        // don't rechunk here. Chunks in columns always match.
         Ok(self)
     }
 
+    /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
+    ///
+    /// Different from [`vstack`](Self::vstack) which adds the chunks from `other` to the chunks of this [`DataFrame`],
+    /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
+    ///
+    /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
+    /// and thus will yield faster queries.
+    ///
+    /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
+    /// online operations where you add `n` rows and rerun a query.
+    ///
+    /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
+    /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
+    /// of `vstack` operations with a [`rechunk`](Self::rechunk).
+    pub fn extend(&mut self, other: &DataFrame) -> Result<()> {
+        if self.width() != other.width() {
+            return Err(PolarsError::ShapeMisMatch(
+                format!("Could not extend DataFrame. The DataFrames extended width {} differs from the parent DataFrames width {}", self.width(), other.width()).into()
+            ));
+        }
+
+        self.columns
+            .iter_mut()
+            .zip(other.columns.iter())
+            .try_for_each(|(left, right)| {
+                self.can_extend(left, right)?;
+                left.extend(right).unwrap();
+                Ok(())
+            })?;
+        Ok(())
+    }
+
     /// Remove a column by name and return the column removed.
     ///
     /// # Example

diff --git a/polars/polars-core/src/prelude.rs b/polars/polars-core/src/prelude.rs
@@ -10,7 +10,7 @@ pub use crate::{
             ListPrimitiveChunkedBuilder, ListUtf8ChunkedBuilder, NewChunkedArray,
             PrimitiveChunkedBuilder, Utf8ChunkedBuilder,
         },
-        iterator::{PolarsIterator},
+        iterator::PolarsIterator,
         ops::{aggregate::*, *},
         ChunkedArray,
     },

diff --git a/polars/polars-core/src/series/implementations/boolean.rs b/polars/polars-core/src/series/implementations/boolean.rs
@@ -159,7 +159,6 @@ impl SeriesTrait for SeriesWrap<BooleanChunked> {
 
     fn append(&mut self, other: &Series) -> Result<()> {
         if self.0.dtype() == other.dtype() {
-            // todo! add object
             self.0.append(other.as_ref().as_ref());
             Ok(())
         } else {
@@ -169,6 +168,17 @@ impl SeriesTrait for SeriesWrap<BooleanChunked> {
         }
     }
 
+    fn extend(&mut self, other: &Series) -> Result<()> {
+        if self.0.dtype() == other.dtype() {
+            self.0.extend(other.as_ref().as_ref());
+            Ok(())
+        } else {
+            Err(PolarsError::SchemaMisMatch(
+                "cannot extend Series; data types don't match".into(),
+            ))
+        }
+    }
+
     fn filter(&self, filter: &BooleanChunked) -> Result<Series> {
         ChunkFilter::filter(&self.0, filter).map(|ca| ca.into_series())
     }

diff --git a/polars/polars-core/src/series/implementations/categorical.rs b/polars/polars-core/src/series/implementations/categorical.rs
@@ -151,15 +151,28 @@ impl SeriesTrait for SeriesWrap<CategoricalChunked> {
 
     fn append(&mut self, other: &Series) -> Result<()> {
         if self.0.dtype() == other.dtype() {
-            // todo! add object
-            self.0.append(other.as_ref().as_ref());
+            let other = other.categorical()?;
+            self.0.append(other);
+            self.0.merge_categorical_map(other);
             Ok(())
         } else {
             Err(PolarsError::SchemaMisMatch(
                 "cannot append Series; data types don't match".into(),
             ))
         }
     }
+    fn extend(&mut self, other: &Series) -> Result<()> {
+        if self.0.dtype() == other.dtype() {
+            let other = other.categorical()?;
+            self.0.extend(other);
+            self.0.merge_categorical_map(other);
+            Ok(())
+        } else {
+            Err(PolarsError::SchemaMisMatch(
+                "cannot extend Series; data types don't match".into(),
+            ))
+        }
+    }
 
     fn filter(&self, filter: &BooleanChunked) -> Result<Series> {
         ChunkFilter::filter(&self.0, filter).map(|ca| ca.into_series())

diff --git a/polars/polars-core/src/series/implementations/dates_time.rs b/polars/polars-core/src/series/implementations/dates_time.rs
@@ -298,15 +298,34 @@ macro_rules! impl_dyn_series {
 
             fn append(&mut self, other: &Series) -> Result<()> {
                 if self.0.dtype() == other.dtype() {
-                    let other = other.to_physical_repr().into_owned();
-                    self.0.append(other.as_ref().as_ref());
+                    let other = other.to_physical_repr();
+                    // 3 refs
+                    // ref Cow
+                    // ref SeriesTrait
+                    // ref ChunkedArray
+                    self.0.append(other.as_ref().as_ref().as_ref());
                     Ok(())
                 } else {
                     Err(PolarsError::SchemaMisMatch(
                         "cannot append Series; data types don't match".into(),
                     ))
                 }
             }
+            fn extend(&mut self, other: &Series) -> Result<()> {
+                if self.0.dtype() == other.dtype() {
+                    // 3 refs
+                    // ref Cow
+                    // ref SeriesTrait
+                    // ref ChunkedArray
+                    let other = other.to_physical_repr();
+                    self.0.extend(other.as_ref().as_ref().as_ref());
+                    Ok(())
+                } else {
+                    Err(PolarsError::SchemaMisMatch(
+                        "cannot extend Series; data types don't match".into(),
+                    ))
+                }
+            }
 
             fn filter(&self, filter: &BooleanChunked) -> Result<Series> {
                 self.0

diff --git a/polars/polars-core/src/series/implementations/datetime.rs b/polars/polars-core/src/series/implementations/datetime.rs
@@ -296,8 +296,8 @@ impl SeriesTrait for SeriesWrap<DatetimeChunked> {
 
     fn append(&mut self, other: &Series) -> Result<()> {
         if self.0.dtype() == other.dtype() {
-            let other = other.to_physical_repr().into_owned();
-            self.0.append(other.as_ref().as_ref());
+            let other = other.to_physical_repr();
+            self.0.append(other.as_ref().as_ref().as_ref());
             Ok(())
         } else {
             Err(PolarsError::SchemaMisMatch(
@@ -306,6 +306,18 @@ impl SeriesTrait for SeriesWrap<DatetimeChunked> {
         }
     }
 
+    fn extend(&mut self, other: &Series) -> Result<()> {
+        if self.0.dtype() == other.dtype() {
+            let other = other.to_physical_repr();
+            self.0.extend(other.as_ref().as_ref().as_ref());
+            Ok(())
+        } else {
+            Err(PolarsError::SchemaMisMatch(
+                "cannot extend Series; data types don't match".into(),
+            ))
+        }
+    }
+
     fn filter(&self, filter: &BooleanChunked) -> Result<Series> {
         self.0.filter(filter).map(|ca| {
             ca.into_datetime(self.0.time_unit(), self.0.time_zone().clone())