diff --git a/polars/polars-core/src/chunked_array/ops/extend.rs b/polars/polars-core/src/chunked_array/ops/extend.rs index daaefb124ac3..f1dceae199f3 100644 --- a/polars/polars-core/src/chunked_array/ops/extend.rs +++ b/polars/polars-core/src/chunked_array/ops/extend.rs @@ -26,7 +26,7 @@ where /// However if this does not cause a reallocation, the resulting data structure will not have any extra chunks /// and thus will yield faster queries. /// - /// Prefer `extend` over `append` when you want do a query after a single append. For instance during + /// Prefer `extend` over `append` when you want to do a query after a single append. For instance during /// online operations where you add `n` rows and rerun a query. /// /// Prefer `append` over `extend` when you want to append many times before doing a query. For instance diff --git a/polars/polars-core/src/frame/arithmetic.rs b/polars/polars-core/src/frame/arithmetic.rs index 9661035d7abd..261a6c0490b7 100644 --- a/polars/polars-core/src/frame/arithmetic.rs +++ b/polars/polars-core/src/frame/arithmetic.rs @@ -126,10 +126,10 @@ impl DataFrame { let mut r = r.cast(&st)?; if diff_l > 0 { - l = l.extend(AnyValue::Null, diff_l)?; + l = l.extend_constant(AnyValue::Null, diff_l)?; }; if diff_r > 0 { - r = r.extend(AnyValue::Null, diff_r)?; + r = r.extend_constant(AnyValue::Null, diff_r)?; }; f(&l, &r) diff --git a/polars/polars-core/src/frame/mod.rs b/polars/polars-core/src/frame/mod.rs index ed0cd353b8bf..4b17759f68a6 100644 --- a/polars/polars-core/src/frame/mod.rs +++ b/polars/polars-core/src/frame/mod.rs @@ -775,36 +775,6 @@ impl DataFrame { Ok(df) } - // utility to test if we can vstack/extend the columns - fn can_extend(&self, left: &Series, right: &Series) -> Result<()> { - if left.dtype() != right.dtype() || left.name() != right.name() { - if left.dtype() != right.dtype() { - return Err(PolarsError::SchemaMisMatch( - format!( - "cannot vstack: because column datatypes (dtypes) in the two DataFrames do not match 
for \ - left.name='{}' with left.dtype={} != right.dtype={} with right.name='{}'", - left.name(), - left.dtype(), - right.dtype(), - right.name() - ) - .into(), - )); - } else { - return Err(PolarsError::SchemaMisMatch( - format!( - "cannot vstack: because column names in the two DataFrames do not match for \ - left.name='{}' != right.name='{}'", - left.name(), - right.name() - ) - .into(), - )); - } - }; - Ok(()) - } - /// Concatenate a DataFrame to this DataFrame /// /// If many `vstack` operations are done, it is recommended to call [`DataFrame::rechunk`]. @@ -855,8 +825,8 @@ impl DataFrame { self.columns .iter_mut() .zip(other.columns.iter()) - .try_for_each(|(left, right)| { - self.can_extend(left, right)?; + .try_for_each::<_, Result<_>>(|(left, right)| { + can_extend(left, right)?; left.append(right).expect("should not fail"); Ok(()) })?; @@ -871,7 +841,7 @@ impl DataFrame { /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks /// and thus will yield faster queries. /// - /// Prefer `extend` over `vstack` when you want do a query after a single append. For instance during + /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during /// online operations where you add `n` rows and rerun a query. /// /// Prefer `vstack` over `extend` when you want to append many times before doing a query. 
For instance @@ -887,8 +857,8 @@ impl DataFrame { self.columns .iter_mut() .zip(other.columns.iter()) - .try_for_each(|(left, right)| { - self.can_extend(left, right)?; + .try_for_each::<_, Result<_>>(|(left, right)| { + can_extend(left, right)?; left.extend(right).unwrap(); Ok(()) })?; @@ -2887,6 +2857,36 @@ impl From for Vec { } } +// utility to test if we can vstack/extend the columns +fn can_extend(left: &Series, right: &Series) -> Result<()> { + if left.dtype() != right.dtype() || left.name() != right.name() { + if left.dtype() != right.dtype() { + return Err(PolarsError::SchemaMisMatch( + format!( + "cannot vstack: because column datatypes (dtypes) in the two DataFrames do not match for \ + left.name='{}' with left.dtype={} != right.dtype={} with right.name='{}'", + left.name(), + left.dtype(), + right.dtype(), + right.name() + ) + .into(), + )); + } else { + return Err(PolarsError::SchemaMisMatch( + format!( + "cannot vstack: because column names in the two DataFrames do not match for \ + left.name='{}' != right.name='{}'", + left.name(), + right.name() + ) + .into(), + )); + } + }; + Ok(()) +} + #[cfg(test)] mod test { use super::*; diff --git a/polars/polars-core/src/functions.rs b/polars/polars-core/src/functions.rs index 4a18ca5708c6..6736ff85179b 100644 --- a/polars/polars-core/src/functions.rs +++ b/polars/polars-core/src/functions.rs @@ -182,7 +182,7 @@ pub fn hor_concat_df(dfs: &[DataFrame]) -> Result { let diff = max_len - df.height(); df.columns .iter_mut() - .for_each(|s| *s = s.extend(AnyValue::Null, diff).unwrap()); + .for_each(|s| *s = s.extend_constant(AnyValue::Null, diff).unwrap()); } df }) diff --git a/polars/polars-core/src/series/implementations/categorical.rs b/polars/polars-core/src/series/implementations/categorical.rs index 97b7fb81fb61..56ab941a832e 100644 --- a/polars/polars-core/src/series/implementations/categorical.rs +++ b/polars/polars-core/src/series/implementations/categorical.rs @@ -153,7 +153,7 @@ impl SeriesTrait for 
SeriesWrap { if self.0.dtype() == other.dtype() { let other = other.categorical()?; self.0.append(other); - self.0.merge_categorical_map(other); + self.0.categorical_map = Some(self.0.merge_categorical_map(other)); Ok(()) } else { Err(PolarsError::SchemaMisMatch( @@ -165,7 +165,7 @@ impl SeriesTrait for SeriesWrap { if self.0.dtype() == other.dtype() { let other = other.categorical()?; self.0.extend(other); - self.0.merge_categorical_map(other); + self.0.categorical_map = Some(self.0.merge_categorical_map(other)); Ok(()) } else { Err(PolarsError::SchemaMisMatch( diff --git a/polars/polars-core/src/series/implementations/duration.rs b/polars/polars-core/src/series/implementations/duration.rs index b5d5baef360e..ab27a3399a48 100644 --- a/polars/polars-core/src/series/implementations/duration.rs +++ b/polars/polars-core/src/series/implementations/duration.rs @@ -301,7 +301,7 @@ impl SeriesTrait for SeriesWrap { fn extend(&mut self, other: &Series) -> Result<()> { if self.0.dtype() == other.dtype() { let other = other.to_physical_repr(); - self.0.append(other.as_ref().as_ref().as_ref()); + self.0.extend(other.as_ref().as_ref().as_ref()); Ok(()) } else { Err(PolarsError::SchemaMisMatch( diff --git a/polars/polars-core/src/series/implementations/mod.rs b/polars/polars-core/src/series/implementations/mod.rs index e4f38cff942a..40d83118be19 100644 --- a/polars/polars-core/src/series/implementations/mod.rs +++ b/polars/polars-core/src/series/implementations/mod.rs @@ -494,7 +494,7 @@ macro_rules! 
impl_dyn_series { fn extend(&mut self, other: &Series) -> Result<()> { if self.0.dtype() == other.dtype() { - self.0.append(other.as_ref().as_ref()); + self.0.extend(other.as_ref().as_ref()); Ok(()) } else { Err(PolarsError::SchemaMisMatch( diff --git a/polars/polars-core/src/series/ops/extend.rs b/polars/polars-core/src/series/ops/extend.rs index 4e0b5f3c2c7f..49668281f9cb 100644 --- a/polars/polars-core/src/series/ops/extend.rs +++ b/polars/polars-core/src/series/ops/extend.rs @@ -2,7 +2,7 @@ use crate::prelude::*; impl Series { /// Extend with a constant value. - pub fn extend(&self, value: AnyValue, n: usize) -> Result { + pub fn extend_constant(&self, value: AnyValue, n: usize) -> Result { use AnyValue::*; let s = match value { Float32(v) => Series::new("", vec![v]), diff --git a/py-polars/docs/source/reference/dataframe.rst b/py-polars/docs/source/reference/dataframe.rst index 0d5eae96baf3..37e0db816fab 100644 --- a/py-polars/docs/source/reference/dataframe.rst +++ b/py-polars/docs/source/reference/dataframe.rst @@ -107,6 +107,7 @@ Manipulation/ selection DataFrame.with_column DataFrame.hstack DataFrame.vstack + DataFrame.extend DataFrame.groupby DataFrame.groupby_dynamic DataFrame.groupby_rolling diff --git a/py-polars/docs/source/reference/expression.rst b/py-polars/docs/source/reference/expression.rst index 4ccd88294011..a23a9c1e224e 100644 --- a/py-polars/docs/source/reference/expression.rst +++ b/py-polars/docs/source/reference/expression.rst @@ -199,6 +199,7 @@ Manipulation/ selection Expr.reshape Expr.to_physical Expr.shuffle + Expr.extend_constant Expr.extend Column names diff --git a/py-polars/docs/source/reference/series.rst b/py-polars/docs/source/reference/series.rst index c265464242a1..38a607298999 100644 --- a/py-polars/docs/source/reference/series.rst +++ b/py-polars/docs/source/reference/series.rst @@ -178,6 +178,7 @@ Manipulation/ selection Series.reshape Series.to_dummies Series.shuffle + Series.extend_constant Series.extend Various diff 
--git a/py-polars/polars/internals/expr.py b/py-polars/polars/internals/expr.py index ee3ce44d93f5..1ab8f04c8856 100644 --- a/py-polars/polars/internals/expr.py +++ b/py-polars/polars/internals/expr.py @@ -2303,7 +2303,22 @@ def extend(self, value: Optional[Union[int, float, str, bool]], n: int) -> "Expr n The number of values to extend. """ - return wrap_expr(self._pyexpr.extend(value, n)) + return wrap_expr(self._pyexpr.extend_constant(value, n)) + + def extend_constant( + self, value: Optional[Union[int, float, str, bool]], n: int + ) -> "Expr": + """ + Extend the Series with given number of values. + + Parameters + ---------- + value + The value to extend the Series with. This value may be None to fill with nulls. + n + The number of values to extend. + """ + return wrap_expr(self._pyexpr.extend_constant(value, n)) # Below are the namespaces defined. Keep these at the end of the definition of Expr, as to not confuse mypy with # the type annotation `str` with the namespace "str" diff --git a/py-polars/polars/internals/frame.py b/py-polars/polars/internals/frame.py index 88efda91dbeb..49350f3b7946 100644 --- a/py-polars/polars/internals/frame.py +++ b/py-polars/polars/internals/frame.py @@ -3212,6 +3212,30 @@ def vstack(self, df: "DataFrame", in_place: bool = False) -> Optional["DataFrame else: return wrap_df(self._df.vstack(df._df)) + def extend(self, other: "DataFrame") -> None: + """ + Extend the memory backed by this `DataFrame` with the values from `other`. + + Different from `vstack` which adds the chunks from `other` to the chunks of this `DataFrame` + `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation. + + If this does not cause a reallocation, the resulting data structure will not have any extra chunks + and thus will yield faster queries. + + Prefer `extend` over `vstack` when you want to do a query after a single append. 
For instance during + online operations where you add `n` rows and rerun a query. + + Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance + when you read in multiple files and want to store them in a single `DataFrame`. + In the latter case, finish the sequence of `vstack` operations with a `rechunk`. + + Parameters + ---------- + other + DataFrame to vertically add. + """ + self._df.extend(other._df) + def drop(self, name: Union[str, List[str]]) -> "DataFrame": """ Remove column from DataFrame and return as new. diff --git a/py-polars/polars/internals/series.py b/py-polars/polars/internals/series.py index 1af3ee6a9b18..1572a5328b83 100644 --- a/py-polars/polars/internals/series.py +++ b/py-polars/polars/internals/series.py @@ -1127,7 +1127,7 @@ def slice(self, offset: int, length: int) -> "Series": """ return wrap_s(self._s.slice(offset, length)) - def append(self, other: "Series") -> None: + def append(self, other: "Series", append_chunks: bool = True) -> None: """ Append a Series to this one. @@ -1135,6 +1135,25 @@ def append(self, other: "Series") -> None: ---------- other Series to append. + append_chunks + If set to `True` the append operation will add the chunks from `other` to self. This is super cheap. + + If set to `False` the append operation will do the same as `DataFrame.extend` which: + extends the memory backed by this `Series` with the values from `other`. + + Different from `append_chunks`, `extend` appends the data from `other` to the underlying memory locations and + thus may cause a reallocation (which is expensive). + + If this does not cause a reallocation, the resulting data structure will not have any extra chunks + and thus will yield faster queries. + + Prefer `extend` over `append_chunks` when you want to do a query after a single append. For instance during + online operations where you add `n` rows and rerun a query. 
+ + Prefer `append_chunks` over `extend` when you want to append many times before doing a query. For instance + when you read in multiple files and want to store them in a single `Series`. + In the latter case, finish the sequence of `append_chunks` operations with a `rechunk`. + + Examples -------- @@ -1154,7 +1173,10 @@ def append(self, other: "Series") -> None: ] """ - self._s.append(other._s) + if append_chunks: + self._s.append(other._s) + else: + self._s.extend(other._s) def filter(self, predicate: Union["Series", list]) -> "Series": """ @@ -3385,10 +3407,28 @@ def ewm_var( .to_series() ) + def extend_constant( + self, value: Optional[Union[int, float, str, bool]], n: int + ) -> "Series": + """ + Extend the Series with given number of values. + + Parameters + ---------- + value + The value to extend the Series with. This value may be None to fill with nulls. + n + The number of values to extend. + """ + return wrap_s(self._s.extend_constant(value, n)) + def extend(self, value: Optional[Union[int, float, str, bool]], n: int) -> "Series": """ Extend the Series with given number of values. + .. deprecated:: 0.12.21 + use extend_constant + Parameters ---------- value @@ -3396,7 +3436,7 @@ def extend(self, value: Optional[Union[int, float, str, bool]], n: int) -> "Seri n The number of values to extend. 
""" - return wrap_s(self._s.extend(value, n)) + return self.extend_constant(value, n) @property def time_unit(self) -> Optional[str]: diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index e5f434fea02f..eb8b4e4b09b1 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -654,6 +654,11 @@ impl PyDataFrame { Ok(df.into()) } + pub fn extend(&mut self, df: &PyDataFrame) -> PyResult<()> { + self.df.extend(&df.df).map_err(PyPolarsEr::from)?; + Ok(()) + } + pub fn vstack_mut(&mut self, df: &PyDataFrame) -> PyResult<()> { self.df.vstack_mut(&df.df).map_err(PyPolarsEr::from)?; Ok(()) diff --git a/py-polars/src/lazy/dsl.rs b/py-polars/src/lazy/dsl.rs index f860535568c4..df5c9ad977a5 100644 --- a/py-polars/src/lazy/dsl.rs +++ b/py-polars/src/lazy/dsl.rs @@ -1246,7 +1246,7 @@ impl PyExpr { }; self.inner.clone().ewm_var(options).into() } - pub fn extend(&self, py: Python, value: Wrap, n: usize) -> Self { + pub fn extend_constant(&self, py: Python, value: Wrap, n: usize) -> Self { let value = value.into_py(py); self.inner .clone() @@ -1255,7 +1255,7 @@ impl PyExpr { let gil = Python::acquire_gil(); let py = gil.python(); let value = value.extract::>(py).unwrap().0; - s.extend(value, n) + s.extend_constant(value, n) }, GetOutput::same_type(), ) diff --git a/py-polars/src/series.rs b/py-polars/src/series.rs index 5f7a172c8b75..44e3a72bc79f 100644 --- a/py-polars/src/series.rs +++ b/py-polars/src/series.rs @@ -433,6 +433,13 @@ impl PySeries { Ok(()) } + pub fn extend(&mut self, other: &PySeries) -> PyResult<()> { + self.series + .extend(&other.series) + .map_err(PyPolarsEr::from)?; + Ok(()) + } + pub fn filter(&self, filter: &PySeries) -> PyResult { let filter_series = &filter.series; if let Ok(ca) = filter_series.bool() { @@ -1439,9 +1446,12 @@ impl PySeries { pub fn shuffle(&self, seed: u64) -> Self { self.series.shuffle(seed).into() } - pub fn extend(&self, value: Wrap, n: usize) -> PyResult { + pub fn extend_constant(&self, value: Wrap, 
n: usize) -> PyResult { let value = value.0; - let out = self.series.extend(value, n).map_err(PyPolarsEr::from)?; + let out = self + .series + .extend_constant(value, n) + .map_err(PyPolarsEr::from)?; Ok(out.into()) } diff --git a/py-polars/tests/test_df.py b/py-polars/tests/test_df.py index 06d73091c8f0..e574e73d81a2 100644 --- a/py-polars/tests/test_df.py +++ b/py-polars/tests/test_df.py @@ -635,6 +635,55 @@ def test_vstack(in_place: bool) -> None: assert out.frame_equal(expected) # type: ignore +def test_extend() -> None: + with pl.StringCache(): + df1 = pl.DataFrame( + { + "foo": [1, 2], + "bar": [True, False], + "ham": ["a", "b"], + "cat": ["A", "B"], + "dates": [datetime(2021, 1, 1), datetime(2021, 2, 1)], + } + ).with_columns( + [ + pl.col("cat").cast(pl.Categorical), + ] + ) + df2 = pl.DataFrame( + { + "foo": [3, 4], + "bar": [True, None], + "ham": ["c", "d"], + "cat": ["C", "B"], + "dates": [datetime(2022, 9, 1), datetime(2021, 2, 1)], + } + ).with_columns( + [ + pl.col("cat").cast(pl.Categorical), + ] + ) + + df1.extend(df2) + expected = pl.DataFrame( + { + "foo": [1, 2, 3, 4], + "bar": [True, False, True, None], + "ham": ["a", "b", "c", "d"], + "cat": ["A", "B", "C", "B"], + "dates": [ + datetime(2021, 1, 1), + datetime(2021, 2, 1), + datetime(2022, 9, 1), + datetime(2021, 2, 1), + ], + } + ).with_column( + pl.col("cat").cast(pl.Categorical), + ) + assert df1.frame_equal(expected) + + def test_drop() -> None: df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]}) df = df.drop("a") diff --git a/py-polars/tests/test_series.py b/py-polars/tests/test_series.py index 4a79c797ca58..8643d6b7e53a 100644 --- a/py-polars/tests/test_series.py +++ b/py-polars/tests/test_series.py @@ -211,6 +211,16 @@ def test_add_string() -> None: testing.assert_series_equal(result, pl.Series(["hello world", "weird world"])) +def test_append_extend() -> None: + a = pl.Series("a", [1, 2]) + b = pl.Series("b", [8, 9, None]) + a.append(b, append_chunks=False) + 
expected = pl.Series("a", [1, 2, 8, 9, None]) + assert a.series_equal(expected, null_equal=True) + assert a.chunk_lengths() == [5] + assert a.n_chunks() == 1 + + def test_various() -> None: a = pl.Series("a", [1, 2])