Skip to content

Commit

Permalink
extend series
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Feb 4, 2022
1 parent 554d571 commit eea2710
Show file tree
Hide file tree
Showing 15 changed files with 231 additions and 67 deletions.
41 changes: 21 additions & 20 deletions polars/polars-core/src/chunked_array/ops/extend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ fn extend_immutable(immutable: &dyn Array, chunks: &mut Vec<ArrayRef>, other_chu
}

impl<T> ChunkedArray<T>
where
T: PolarsNumericType,
where
T: PolarsNumericType,
{
/// Extend the memory backed by this array with the values from `other`.
///
Expand All @@ -30,13 +30,14 @@ impl<T> ChunkedArray<T>
/// online operations where you add `n` rows and rerun a query.
///
/// Prefer `append` over `extend` when you want to append many times before doing a query. For instance
/// when you read in multiple files and want to store them in a single `DataFrame`. Finish the sequence
/// of `append` operations with a [`rechunk`](Self::rechunk).
/// when you read in multiple files and want to store them in a single `DataFrame`.
/// In the latter case finish the sequence of `append` operations with a [`rechunk`](Self::rechunk).
pub fn extend(&mut self, other: &Self) {
// make sure that we are a single chunk already
// otherwise append and merge everything into a single chunk
if self.chunks.len() > 1 {
self.rechunk();
self.extend(other)
self.append(other);
*self = self.rechunk();
return;
}
// Depending on the state of the underlying arrow array we
// might be able to get a `MutablePrimitiveArray`
Expand All @@ -48,7 +49,7 @@ impl<T> ChunkedArray<T>
let arr = self.downcast_iter().next().unwrap();

// increments 1
let mut arr = arr.clone();
let arr = arr.clone();

// now we drop our owned ArrayRefs so that
// decrements 1
Expand Down Expand Up @@ -79,15 +80,15 @@ impl<T> ChunkedArray<T>
#[doc(hidden)]
impl Utf8Chunked {
pub fn extend(&mut self, other: &Self) {
// make sure that we are a single chunk already
if self.chunks.len() > 1 {
self.rechunk();
self.extend(other)
self.append(other);
*self = self.rechunk();
return;
}
let arr = self.downcast_iter().next().unwrap();

// increments 1
let mut arr = arr.clone();
let arr = arr.clone();

// now we drop our owned ArrayRefs so that
// decrements 1
Expand Down Expand Up @@ -117,13 +118,14 @@ impl BooleanChunked {
pub fn extend(&mut self, other: &Self) {
// make sure that we are a single chunk already
if self.chunks.len() > 1 {
self.rechunk();
self.extend(other)
self.append(other);
*self = self.rechunk();
return;
}
let arr = self.downcast_iter().next().unwrap();

// increments 1
let mut arr = arr.clone();
let arr = arr.clone();

// now we drop our owned ArrayRefs so that
// decrements 1
Expand Down Expand Up @@ -155,7 +157,6 @@ impl ListChunked {
// this is harder because we don't know the inner type of the list
self.append(other);
}

}

#[cfg(test)]
Expand Down Expand Up @@ -189,8 +190,8 @@ mod test {

#[test]
fn test_extend_utf8() {
let mut ca= Utf8Chunked::new("a", &["a", "b", "c"]);
let to_append= Utf8Chunked::new("a", &["a", "b", "e"]);
let mut ca = Utf8Chunked::new("a", &["a", "b", "c"]);
let to_append = Utf8Chunked::new("a", &["a", "b", "e"]);

ca.extend(&to_append);
let vals = ca.into_no_null_iter().collect::<Vec<_>>();
Expand All @@ -199,8 +200,8 @@ mod test {

#[test]
fn test_extend_bool() {
let mut ca= BooleanChunked::new("a", [true, false]);
let to_append= BooleanChunked::new("a", &[false, false]);
let mut ca = BooleanChunked::new("a", [true, false]);
let to_append = BooleanChunked::new("a", &[false, false]);

ca.extend(&to_append);
let vals = ca.into_no_null_iter().collect::<Vec<_>>();
Expand Down
107 changes: 73 additions & 34 deletions polars/polars-core/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,8 @@ impl DataFrame {

/// Concatenate a `DataFrame` to this `DataFrame` and return as newly allocated `DataFrame`.
///
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::rechunk`].
///
/// # Example
///
/// ```rust
Expand Down Expand Up @@ -767,14 +769,46 @@ impl DataFrame {
/// | Palladium | 1828.05 |
/// +-----------+-------------------+
/// ```
pub fn vstack(&self, columns: &DataFrame) -> Result<Self> {
pub fn vstack(&self, other: &DataFrame) -> Result<Self> {
let mut df = self.clone();
df.vstack_mut(columns)?;
df.vstack_mut(other)?;
Ok(df)
}

// Checks whether the column `right` may be stacked under / extended onto the column `left`.
// Both the dtype and the name must agree; a dtype mismatch is reported in preference
// to a name mismatch. Returns `Ok(())` when the pair is compatible.
fn can_extend(&self, left: &Series, right: &Series) -> Result<()> {
    // Guard 1: the data types differ.
    if left.dtype() != right.dtype() {
        let msg = format!(
            "cannot vstack: because column datatypes (dtypes) in the two DataFrames do not match for \
left.name='{}' with left.dtype={} != right.dtype={} with right.name='{}'",
            left.name(),
            left.dtype(),
            right.dtype(),
            right.name()
        );
        return Err(PolarsError::SchemaMisMatch(msg.into()));
    }

    // Guard 2: same dtype, but the column names differ.
    if left.name() != right.name() {
        let msg = format!(
            "cannot vstack: because column names in the two DataFrames do not match for \
left.name='{}' != right.name='{}'",
            left.name(),
            right.name()
        );
        return Err(PolarsError::SchemaMisMatch(msg.into()));
    }

    Ok(())
}

/// Concatenate a DataFrame to this DataFrame
///
/// If many `vstack` operations are done, it is recommended to call [`DataFrame::rechunk`].
///
/// # Example
///
/// ```rust
Expand Down Expand Up @@ -811,51 +845,56 @@ impl DataFrame {
/// | Palladium | 1828.05 |
/// +-----------+-------------------+
/// ```
pub fn vstack_mut(&mut self, df: &DataFrame) -> Result<&mut Self> {
if self.width() != df.width() {
pub fn vstack_mut(&mut self, other: &DataFrame) -> Result<&mut Self> {
if self.width() != other.width() {
return Err(PolarsError::ShapeMisMatch(
format!("Could not vertically stack DataFrame. The DataFrames appended width {} differs from the parent DataFrames width {}", self.width(), df.width()).into()
format!("Could not vertically stack DataFrame. The DataFrames appended width {} differs from the parent DataFrames width {}", self.width(), other.width()).into()
));
}

self.columns
.iter_mut()
.zip(df.columns.iter())
.zip(other.columns.iter())
.try_for_each(|(left, right)| {
if left.dtype() != right.dtype() || left.name() != right.name() {
if left.dtype() != right.dtype() {
return Err(PolarsError::SchemaMisMatch(
format!(
"cannot vstack: because column datatypes (dtypes) in the two DataFrames do not match for \
left.name='{}' with left.dtype={} != right.dtype={} with right.name='{}'",
left.name(),
left.dtype(),
right.dtype(),
right.name()
)
.into(),
));
}
else {
return Err(PolarsError::SchemaMisMatch(
format!(
"cannot vstack: because column names in the two DataFrames do not match for \
left.name='{}' != right.name='{}'",
left.name(),
right.name()
)
.into(),
));
}
}

self.can_extend(left, right)?;
left.append(right).expect("should not fail");
Ok(())
})?;
// don't rechunk here. Chunks in columns always match.
Ok(self)
}

/// Extend the memory backed by this [`DataFrame`] with the values from `other`.
///
/// Different from [`vstack`](Self::vstack), which adds the chunks from `other` to the chunks of this
/// [`DataFrame`], `extend` appends the data from `other` to the underlying memory locations and thus
/// may cause a reallocation.
///
/// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
/// and thus will yield faster queries.
///
/// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
/// online operations where you add `n` rows and rerun a query.
///
/// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
/// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case,
/// finish the sequence of `vstack` operations with a [`rechunk`](Self::rechunk).
///
/// # Errors
///
/// Returns `PolarsError::ShapeMisMatch` when the two frames have a different number of columns, and
/// `PolarsError::SchemaMisMatch` when a pair of columns differs in dtype or name. Every column pair
/// is checked before any column is modified, so a schema error does not leave `self` partially
/// extended.
pub fn extend(&mut self, other: &DataFrame) -> Result<()> {
    if self.width() != other.width() {
        // `other` is the frame being added, `self` is the parent; pass the widths in the
        // order the message describes them.
        return Err(PolarsError::ShapeMisMatch(
            format!(
                "Could not extend DataFrame. The DataFrames extended width {} differs from the parent DataFrames width {}",
                other.width(),
                self.width()
            )
            .into(),
        ));
    }

    // First pass (shared borrows only): validate all column pairs up front. Doing this inside
    // the mutating loop would both conflict with the mutable borrow of `self.columns` and allow
    // a late mismatch to leave earlier columns already extended.
    for (left, right) in self.columns.iter().zip(other.columns.iter()) {
        self.can_extend(left, right)?;
    }

    // Second pass: all pairs are compatible, now mutate. Any unexpected failure is propagated
    // to the caller instead of panicking.
    for (left, right) in self.columns.iter_mut().zip(other.columns.iter()) {
        left.extend(right)?;
    }
    Ok(())
}

/// Remove a column by name and return the column removed.
///
/// # Example
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-core/src/prelude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ pub use crate::{
ListPrimitiveChunkedBuilder, ListUtf8ChunkedBuilder, NewChunkedArray,
PrimitiveChunkedBuilder, Utf8ChunkedBuilder,
},
iterator::{PolarsIterator},
iterator::PolarsIterator,
ops::{aggregate::*, *},
ChunkedArray,
},
Expand Down
12 changes: 11 additions & 1 deletion polars/polars-core/src/series/implementations/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,6 @@ impl SeriesTrait for SeriesWrap<BooleanChunked> {

fn append(&mut self, other: &Series) -> Result<()> {
if self.0.dtype() == other.dtype() {
// todo! add object
self.0.append(other.as_ref().as_ref());
Ok(())
} else {
Expand All @@ -169,6 +168,17 @@ impl SeriesTrait for SeriesWrap<BooleanChunked> {
}
}

/// Extends the wrapped boolean array with the values of `other`.
///
/// Fails with `SchemaMisMatch` when the dtype of `other` differs from this series' dtype.
fn extend(&mut self, other: &Series) -> Result<()> {
    // Guard clause: refuse to mix data types.
    if self.0.dtype() != other.dtype() {
        return Err(PolarsError::SchemaMisMatch(
            "cannot extend Series; data types don't match".into(),
        ));
    }

    self.0.extend(other.as_ref().as_ref());
    Ok(())
}

fn filter(&self, filter: &BooleanChunked) -> Result<Series> {
ChunkFilter::filter(&self.0, filter).map(|ca| ca.into_series())
}
Expand Down
17 changes: 15 additions & 2 deletions polars/polars-core/src/series/implementations/categorical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,15 +151,28 @@ impl SeriesTrait for SeriesWrap<CategoricalChunked> {

fn append(&mut self, other: &Series) -> Result<()> {
if self.0.dtype() == other.dtype() {
// todo! add object
self.0.append(other.as_ref().as_ref());
let other = other.categorical()?;
self.0.append(other);
self.0.merge_categorical_map(other);
Ok(())
} else {
Err(PolarsError::SchemaMisMatch(
"cannot append Series; data types don't match".into(),
))
}
}
/// Extends the wrapped categorical array with the values of `other` and then merges
/// the categorical map of `other` into this one.
///
/// Fails with `SchemaMisMatch` when the dtype of `other` differs from this series' dtype.
fn extend(&mut self, other: &Series) -> Result<()> {
    // Guard clause: refuse to mix data types.
    if self.0.dtype() != other.dtype() {
        return Err(PolarsError::SchemaMisMatch(
            "cannot extend Series; data types don't match".into(),
        ));
    }

    let rhs = other.categorical()?;
    self.0.extend(rhs);
    self.0.merge_categorical_map(rhs);
    Ok(())
}

fn filter(&self, filter: &BooleanChunked) -> Result<Series> {
ChunkFilter::filter(&self.0, filter).map(|ca| ca.into_series())
Expand Down
23 changes: 21 additions & 2 deletions polars/polars-core/src/series/implementations/dates_time.rs
Original file line number Diff line number Diff line change
Expand Up @@ -298,15 +298,34 @@ macro_rules! impl_dyn_series {

fn append(&mut self, other: &Series) -> Result<()> {
if self.0.dtype() == other.dtype() {
let other = other.to_physical_repr().into_owned();
self.0.append(other.as_ref().as_ref());
let other = other.to_physical_repr();
// 3 refs
// ref Cow
// ref SeriesTrait
// ref ChunkedArray
self.0.append(other.as_ref().as_ref().as_ref());
Ok(())
} else {
Err(PolarsError::SchemaMisMatch(
"cannot append Series; data types don't match".into(),
))
}
}
/// Extends the wrapped array with the physical representation of `other`.
///
/// Fails with `SchemaMisMatch` when the dtype of `other` differs from this series' dtype.
fn extend(&mut self, other: &Series) -> Result<()> {
    // Guard clause: refuse to mix data types.
    if self.0.dtype() != other.dtype() {
        return Err(PolarsError::SchemaMisMatch(
            "cannot extend Series; data types don't match".into(),
        ));
    }

    let physical = other.to_physical_repr();
    // Three `as_ref` hops are needed to reach the chunked array:
    //   1. through the Cow
    //   2. through the SeriesTrait
    //   3. to the ChunkedArray
    self.0.extend(physical.as_ref().as_ref().as_ref());
    Ok(())
}

fn filter(&self, filter: &BooleanChunked) -> Result<Series> {
self.0
Expand Down
16 changes: 14 additions & 2 deletions polars/polars-core/src/series/implementations/datetime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -296,8 +296,8 @@ impl SeriesTrait for SeriesWrap<DatetimeChunked> {

fn append(&mut self, other: &Series) -> Result<()> {
if self.0.dtype() == other.dtype() {
let other = other.to_physical_repr().into_owned();
self.0.append(other.as_ref().as_ref());
let other = other.to_physical_repr();
self.0.append(other.as_ref().as_ref().as_ref());
Ok(())
} else {
Err(PolarsError::SchemaMisMatch(
Expand All @@ -306,6 +306,18 @@ impl SeriesTrait for SeriesWrap<DatetimeChunked> {
}
}

/// Extends the wrapped datetime array with the physical representation of `other`.
///
/// Fails with `SchemaMisMatch` when the dtype of `other` differs from this series' dtype.
fn extend(&mut self, other: &Series) -> Result<()> {
    // Guard clause: refuse to mix data types.
    if self.0.dtype() != other.dtype() {
        return Err(PolarsError::SchemaMisMatch(
            "cannot extend Series; data types don't match".into(),
        ));
    }

    let physical = other.to_physical_repr();
    self.0.extend(physical.as_ref().as_ref().as_ref());
    Ok(())
}

fn filter(&self, filter: &BooleanChunked) -> Result<Series> {
self.0.filter(filter).map(|ca| {
ca.into_datetime(self.0.time_unit(), self.0.time_zone().clone())
Expand Down
Loading

0 comments on commit eea2710

Please sign in to comment.