Skip to content

Commit

Permalink
feat: allow specifying schema in pl.scan_ndjson
Browse files Browse the repository at this point in the history
Relates to pola-rs#8279.

I'm not 100% sure about the Python schema type annotation; there are a
few different variations in this file, but this one seems to make the
most sense. Happy to adjust, though.
  • Loading branch information
sd2k committed Sep 7, 2023
1 parent 64bf7bf commit 617e751
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 5 deletions.
1 change: 1 addition & 0 deletions crates/polars-lazy/src/frame/ndjson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ impl LazyJsonLineReader {
}
/// Set the number of rows to use when inferring the JSON schema.
/// The default is 100 rows.
/// Ignored when the schema is specified explicitly using [`Self::with_schema`].
/// Setting to `None` will do a full table scan, very slow.
#[must_use]
pub fn with_infer_schema_length(mut self, num_rows: Option<usize>) -> Self {
Expand Down
5 changes: 5 additions & 0 deletions crates/polars-lazy/src/physical_plan/executors/scan/ndjson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ impl AnonymousScan for LazyJsonLineReader {
}

fn schema(&self, infer_schema_length: Option<usize>) -> PolarsResult<Schema> {
// Short-circuit schema inference if the schema has been explicitly provided.
if let Some(schema) = &self.schema {
return Ok(schema.clone());
}

let f = polars_utils::open_file(&self.path)?;
let mut reader = std::io::BufReader::new(f);

Expand Down
12 changes: 12 additions & 0 deletions py-polars/polars/io/ndjson.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def scan_ndjson(
rechunk: bool = True,
row_count_name: str | None = None,
row_count_offset: int = 0,
schema: SchemaDefinition | None = None,
) -> LazyFrame:
"""
Lazily read from a newline delimited JSON file or multiple files via glob patterns.
Expand All @@ -92,6 +93,16 @@ def scan_ndjson(
DataFrame
row_count_offset
Offset to start the row_count column (only use if the name is set)
schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
The DataFrame schema may be declared in several ways:
* As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
* As a list of column names; in this case types are automatically inferred.
* As a list of (name,type) pairs; this is equivalent to the dictionary form.
If you supply a list of column names that does not match the names in the
underlying data, the names given here will overwrite them. The number
of names given in the schema should match the underlying data dimensions.
"""
if isinstance(source, (str, Path)):
Expand All @@ -100,6 +111,7 @@ def scan_ndjson(
return pl.LazyFrame._scan_ndjson(
source,
infer_schema_length=infer_schema_length,
schema=schema,
batch_size=batch_size,
n_rows=n_rows,
low_memory=low_memory,
Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,7 @@ def _scan_ndjson(
source: str,
*,
infer_schema_length: int | None = None,
schema: SchemaDefinition | None = None,
batch_size: int | None = None,
n_rows: int | None = None,
low_memory: bool = False,
Expand All @@ -503,6 +504,7 @@ def _scan_ndjson(
self._ldf = PyLazyFrame.new_from_ndjson(
source,
infer_schema_length,
schema,
batch_size,
n_rows,
low_memory,
Expand Down
13 changes: 8 additions & 5 deletions py-polars/src/lazyframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,11 @@ impl PyLazyFrame {
#[staticmethod]
#[cfg(feature = "json")]
#[allow(clippy::too_many_arguments)]
#[pyo3(signature = (path, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_count))]
#[pyo3(signature = (path, infer_schema_length, schema, batch_size, n_rows, low_memory, rechunk, row_count))]
fn new_from_ndjson(
path: String,
infer_schema_length: Option<usize>,
schema: Option<Wrap<Schema>>,
batch_size: Option<usize>,
n_rows: Option<usize>,
low_memory: bool,
Expand All @@ -131,15 +132,17 @@ impl PyLazyFrame {
) -> PyResult<Self> {
let row_count = row_count.map(|(name, offset)| RowCount { name, offset });

let lf = LazyJsonLineReader::new(path)
let mut lf = LazyJsonLineReader::new(path)
.with_infer_schema_length(infer_schema_length)
.with_batch_size(batch_size)
.with_n_rows(n_rows)
.low_memory(low_memory)
.with_rechunk(rechunk)
.with_row_count(row_count)
.finish()
.map_err(PyPolarsErr::from)?;
.with_row_count(row_count);
if let Some(schema) = schema {
lf = lf.with_schema(schema.0);
}
let lf = lf.finish().map_err(PyPolarsErr::from)?;
Ok(lf.into())
}

Expand Down
18 changes: 18 additions & 0 deletions py-polars/tests/unit/io/test_lazy_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,24 @@ def test_scan_ndjson(foods_ndjson_path: Path) -> None:
assert df["foo"].to_list() == [10, 16, 21, 23, 24, 30, 35]


def test_scan_ndjson_with_schema(foods_ndjson_path: Path) -> None:
    """Verify that ``pl.scan_ndjson`` applies an explicitly supplied schema."""
    requested = {
        "category": pl.Categorical,
        "calories": pl.Int64,
        "fats_g": pl.Float64,
        "sugars_g": pl.Int64,
    }
    frame = pl.scan_ndjson(foods_ndjson_path, schema=requested).collect()
    # Every column must come back with exactly the dtype that was requested.
    for column, dtype in requested.items():
        assert frame[column].dtype == dtype

    # Requesting a different dtype for one column must be honoured on a re-scan.
    requested["sugars_g"] = pl.Float64
    frame = pl.scan_ndjson(foods_ndjson_path, schema=requested).collect()
    assert frame["sugars_g"].dtype == pl.Float64


@pytest.mark.write_disk()
def test_scan_with_projection(tmp_path: Path) -> None:
tmp_path.mkdir(exist_ok=True)
Expand Down

0 comments on commit 617e751

Please sign in to comment.