From 10a54dca28c35a1c6a87d262d839dd7ad8f74d8a Mon Sep 17 00:00:00 2001 From: universalmind303 Date: Mon, 9 May 2022 09:23:04 -0500 Subject: [PATCH] Fast json (#3324) --- nodejs-polars/__tests__/dataframe.test.ts | 4 +-- nodejs-polars/polars/dataframe.ts | 3 +++ nodejs-polars/polars/io.ts | 5 ++-- nodejs-polars/polars/lazy/expr.ts | 5 ++-- nodejs-polars/polars/series/series.ts | 3 +++ nodejs-polars/src/dataframe.rs | 32 +++++++++++++++++++++-- nodejs-polars/src/lazy/dsl.rs | 4 +-- nodejs-polars/src/series.rs | 6 +++-- 8 files changed, 50 insertions(+), 12 deletions(-) diff --git a/nodejs-polars/__tests__/dataframe.test.ts b/nodejs-polars/__tests__/dataframe.test.ts index 6c7831866..92bc28b6c 100644 --- a/nodejs-polars/__tests__/dataframe.test.ts +++ b/nodejs-polars/__tests__/dataframe.test.ts @@ -1386,7 +1386,7 @@ describe("io", () => { } }); df.writeJSON(writeStream, {format:"lines"}); - const newDF = pl.readJSON(body); + const newDF = pl.readJSON(body).select("foo", "bar"); expect(newDF).toFrameEqual(df); done(); }); @@ -1396,7 +1396,7 @@ describe("io", () => { pl.Series("bar", ["a", "b", "c"]) ]); df.writeJSON("./test.json", {format:"lines"}); - const newDF = pl.readJSON("./test.json"); + const newDF = pl.readJSON("./test.json").select("foo", "bar"); expect(newDF).toFrameEqual(df); fs.rmSync("./test.json"); done(); diff --git a/nodejs-polars/polars/dataframe.ts b/nodejs-polars/polars/dataframe.ts index 53d772045..70674a42a 100644 --- a/nodejs-polars/polars/dataframe.ts +++ b/nodejs-polars/polars/dataframe.ts @@ -1744,6 +1744,7 @@ export const _DataFrame = (_df: any): DataFrame => { return wrap("sampleN", 1, withReplacement, + false, seed ); } @@ -1754,6 +1755,7 @@ export const _DataFrame = (_df: any): DataFrame => { return wrap("sampleN", opts, withReplacement, + false, seed ); } @@ -1761,6 +1763,7 @@ export const _DataFrame = (_df: any): DataFrame => { return wrap("sampleFrac", frac, withReplacement, + false, seed ); } diff --git a/nodejs-polars/polars/io.ts b/nodejs-polars/polars/io.ts index df9dcf352..15304a366 100644 --- a/nodejs-polars/polars/io.ts +++ b/nodejs-polars/polars/io.ts @@ -232,6 +232,7 @@ export function scanCSV(path, options?) { export function readJSON(pathOrBody: string | Buffer, options?: any): DataFrame export function readJSON(pathOrBody, options = readJsonDefaultOptions) { options = {...readJsonDefaultOptions, ...options}; + let method = options.format === "lines" ? pli.readJsonLines : pli.readJson; const extensions = [".ndjson", ".json", ".jsonl"]; if (Buffer.isBuffer(pathOrBody)) { return _DataFrame(pli.readJson(pathOrBody, options)); @@ -240,9 +241,9 @@ export function readJSON(pathOrBody, options = readJsonDefaultOptions) { if (typeof pathOrBody === "string") { const inline = !isPath(pathOrBody, extensions); if (inline) { - return _DataFrame(pli.readJson(Buffer.from(pathOrBody, "utf-8"), options)); + return _DataFrame(method(Buffer.from(pathOrBody, "utf-8"), options)); } else { - return _DataFrame(pli.readJson(pathOrBody, options)); + return _DataFrame(method(pathOrBody, options)); } } else { throw new Error("must supply either a path or body"); diff --git a/nodejs-polars/polars/lazy/expr.ts b/nodejs-polars/polars/lazy/expr.ts index 0bf23b392..f3f6c18dc 100644 --- a/nodejs-polars/polars/lazy/expr.ts +++ b/nodejs-polars/polars/lazy/expr.ts @@ -927,11 +927,12 @@ export const _Expr = (_expr: any): Expr => { throw new Error("sample_n is not yet supported for expr"); } if(typeof frac === "number") { - return wrap("sampleFrac", { + return wrap("sampleFrac", frac, withReplacement, + false, seed - }); + ); } else { throw new TypeError("must specify either 'frac' or 'n'"); diff --git a/nodejs-polars/polars/series/series.ts b/nodejs-polars/polars/series/series.ts index 819d644ce..772f205bd 100644 --- a/nodejs-polars/polars/series/series.ts +++ b/nodejs-polars/polars/series/series.ts @@ -1571,6 +1571,7 @@ export function _Series(_s: any): Series { return wrap("sampleN", 1, withReplacement, + false, seed ); } @@ -1581,6 +1582,7 @@ export function _Series(_s: any): Series { return wrap("sampleN", opts, withReplacement, + false, seed ); } @@ -1588,6 +1590,7 @@ export function _Series(_s: any): Series { return wrap("sampleFrac", frac, withReplacement, + false, seed ); } diff --git a/nodejs-polars/src/dataframe.rs b/nodejs-polars/src/dataframe.rs index 9e529957b..163800bf0 100644 --- a/nodejs-polars/src/dataframe.rs +++ b/nodejs-polars/src/dataframe.rs @@ -157,6 +157,30 @@ pub struct WriteJsonOptions { pub format: String, } +#[napi] +pub fn read_json_lines( + path_or_buffer: Either, + options: ReadJsonOptions, +) -> napi::Result { + let infer_schema_length = options.infer_schema_length.unwrap_or(100) as usize; + let batch_size = options.batch_size.unwrap_or(10000) as usize; + + let df = match path_or_buffer { + Either::A(path) => JsonLineReader::from_path(path) + .expect("unable to read file") + .infer_schema_len(Some(infer_schema_length)) + .finish() + .map_err(JsPolarsErr::from)?, + Either::B(buf) => { + let cursor = Cursor::new(buf.as_ref()); + JsonLineReader::new(cursor) + .infer_schema_len(Some(infer_schema_length)) + .finish() + .map_err(JsPolarsErr::from)? + } + }; + Ok(df.into()) +} #[napi] pub fn read_json( path_or_buffer: Either, @@ -1013,11 +1037,13 @@ impl JsDataFrame { &self, n: i64, with_replacement: bool, + shuffle: bool, seed: Option, ) -> napi::Result { + let df = self .df - .sample_n(n as usize, with_replacement, seed.map(|s| s as u64)) + .sample_n(n as usize, with_replacement, shuffle, seed.map(|s| s as u64)) .map_err(JsPolarsErr::from)?; Ok(df.into()) } @@ -1027,11 +1053,13 @@ impl JsDataFrame { &self, frac: f64, with_replacement: bool, + shuffle: bool, seed: Option, ) -> napi::Result { + let df = self .df - .sample_frac(frac, with_replacement, seed.map(|s| s as u64)) + .sample_frac(frac, with_replacement, shuffle, seed.map(|s| s as u64)) .map_err(JsPolarsErr::from)?; Ok(df.into()) } diff --git a/nodejs-polars/src/lazy/dsl.rs b/nodejs-polars/src/lazy/dsl.rs index f169ac8e9..4b1cc6823 100644 --- a/nodejs-polars/src/lazy/dsl.rs +++ b/nodejs-polars/src/lazy/dsl.rs @@ -1184,11 +1184,11 @@ impl JsExpr { } #[napi] - pub fn sample_frac(&self, frac: f64, with_replacement: bool, seed: Option) -> JsExpr { + pub fn sample_frac(&self, frac: f64, with_replacement: bool, shuffle: bool, seed: Option) -> JsExpr { let seed = seed.map(|s| s as u64); self.inner .clone() - .sample_frac(frac, with_replacement, seed) + .sample_frac(frac, with_replacement, shuffle, seed) .into() } #[napi] diff --git a/nodejs-polars/src/series.rs b/nodejs-polars/src/series.rs index 814f804db..c20874ec9 100644 --- a/nodejs-polars/src/series.rs +++ b/nodejs-polars/src/series.rs @@ -548,6 +548,7 @@ impl JsSeries { &self, n: u32, with_replacement: bool, + shuffle: bool, seed: Option>, ) -> napi::Result { // Safety: @@ -555,7 +556,7 @@ impl JsSeries { let seed: Option = unsafe { std::mem::transmute(seed) }; let s = self .series - .sample_n(n as usize, with_replacement, seed) + .sample_n(n as usize, with_replacement, shuffle, seed) .map_err(JsPolarsErr::from)?; Ok(s.into()) } @@ -565,6 +566,7 @@ impl JsSeries { &self, frac: f64, with_replacement: bool, + shuffle: bool, seed: Option>, ) -> napi::Result { // Safety: @@ -572,7 +574,7 @@ impl JsSeries { let seed: Option = unsafe { std::mem::transmute(seed) }; let s = self .series - .sample_frac(frac, with_replacement, seed) + .sample_frac(frac, with_replacement, shuffle, seed) .map_err(JsPolarsErr::from)?; Ok(s.into()) }