use std::io::BufRead;

use fallible_streaming_iterator::FallibleStreamingIterator;
use indexmap::set::IndexSet as HashSet;
use serde_json;
use serde_json::Value;

use crate::{
    datatypes::DataType,
    error::{ArrowError, Result},
};

use super::super::super::json::read::{coerce_data_type, infer as infer_json};

/// Reads up to `limit` non-whitespace lines from `reader` into `rows` (at most `rows.len()` of them),
/// skipping blank lines, and returns the number of rows read.
fn read_rows<R: BufRead>(reader: &mut R, rows: &mut [String], limit: usize) -> Result<usize> {
    if limit == 0 {
        return Ok(0);
    }
    let mut row_number = 0;
    for row in rows.iter_mut() {
        loop {
            row.clear();
            let _ = reader.read_line(row).map_err(|e| {
                ArrowError::External(format!(" at line {}", row_number), Box::new(e))
            })?;
            if row.is_empty() {
                // EOF was reached.
                break;
            }
            if !row.trim().is_empty() {
                // A non-blank line was read; keep it.
                break;
            }
            // The line contained only whitespace; clear it and read the next one.
        }
        if row.is_empty() {
            // EOF: stop filling the remaining slots.
            break;
        }
        row_number += 1;
        if row_number == limit {
            break;
        }
    }
    Ok(row_number)
}

/// A [`FallibleStreamingIterator`] of NDJSON rows.
///
/// This iterator is used to read an NDJSON file in batches (chunks of rows).
/// Each batch it yields is guaranteed to contain at least one row.
/// # Implementation
/// Advancing this iterator is IO-bounded, but it does require parsing each byte to find line endings.
/// # Error
/// Advancing this iterator errors iff the reader errors.
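/// # Example
/// A minimal sketch of reading an in-memory reader in batches of up to two rows
/// (the import path of [`FileReader`] depends on how the crate re-exports this module):
/// ```ignore
/// use std::io::Cursor;
/// use fallible_streaming_iterator::FallibleStreamingIterator;
///
/// let data = "{\"a\": 1}\n{\"a\": 2}\n{\"a\": 3}\n";
/// // Two pre-allocated `String`s <=> batches of up to two rows.
/// let mut reader = FileReader::new(Cursor::new(data), vec![String::new(); 2], None);
/// while let Some(rows) = reader.next()? {
///     for row in rows {
///         // each `row` is a single NDJSON line, still ending with its newline
///         println!("{}", row.trim_end());
///     }
/// }
/// ```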
pub struct FileReader<R: BufRead> {
    reader: R,
    rows: Vec<String>,
    number_of_rows: usize,
    remaining: usize,
}

impl<R: BufRead> FileReader<R> {
    /// Creates a new [`FileReader`] from a reader and `rows`.
    ///
    /// The number of items in `rows` denotes the batch size.
    pub fn new(reader: R, rows: Vec<String>, limit: Option<usize>) -> Self {
        Self {
            reader,
            rows,
            remaining: limit.unwrap_or(usize::MAX),
            number_of_rows: 0,
        }
    }

    /// Deconstructs [`FileReader`] into the reader and the internal buffer.
    pub fn into_inner(self) -> (R, Vec<String>) {
        (self.reader, self.rows)
    }
}

impl<R: BufRead> FallibleStreamingIterator for FileReader<R> {
    type Error = ArrowError;
    type Item = [String];

    fn advance(&mut self) -> Result<()> {
        // Fill the internal buffer with the next batch, bounded by the remaining limit.
        self.number_of_rows = read_rows(&mut self.reader, &mut self.rows, self.remaining)?;
        self.remaining -= self.number_of_rows;
        Ok(())
    }

    fn get(&self) -> Option<&Self::Item> {
        // Only the first `number_of_rows` slots were (re-)written by `advance`.
        if self.number_of_rows > 0 {
            Some(&self.rows[..self.number_of_rows])
        } else {
            None
        }
    }
}

/// Infers the [`DataType`] from an NDJSON file, optionally only using `number_of_rows` rows.
///
/// # Implementation
/// This implementation reads the file line by line and infers the type of each line.
/// It performs both `O(N)` IO and CPU-bounded operations where `N` is the number of rows.
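/// # Example
/// A minimal sketch, inferring from an in-memory reader and inspecting at most 100 rows
/// (the import path of `infer` depends on how the crate re-exports this module):
/// ```ignore
/// use std::io::Cursor;
///
/// let data = "{\"a\": 1}\n{\"a\": 2.5}\n";
/// let mut reader = Cursor::new(data);
/// // the integer and float types observed for `a` are coerced into a single `DataType`
/// let data_type = infer(&mut reader, Some(100))?;
/// ```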
pub fn infer<R: std::io::BufRead>(
    reader: &mut R,
    number_of_rows: Option<usize>,
) -> Result<DataType> {
    if reader.fill_buf().map(|b| b.is_empty())? {
        return Err(ArrowError::ExternalFormat(
            "Cannot infer NDJSON types on empty reader because empty string is not a valid JSON value".to_string(),
        ));
    }

    let rows = vec!["".to_string(); 1]; // 1 <=> read row by row
    let mut reader = FileReader::new(reader, rows, number_of_rows);

    let mut data_types = HashSet::new();
    while let Some(rows) = reader.next()? {
        let value: Value = serde_json::from_str(&rows[0])?; // 0 because it is row by row
        let data_type = infer_json(&value)?;
        if data_type != DataType::Null {
            data_types.insert(data_type);
        }
    }

    let v: Vec<&DataType> = data_types.iter().collect();
    Ok(coerce_data_type(&v))
}
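
// A minimal smoke-test sketch showing how `infer` and [`FileReader`] fit together;
// the data and assertions below are illustrative only.
#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use fallible_streaming_iterator::FallibleStreamingIterator;

    use super::*;

    #[test]
    fn infer_then_read_in_batches() -> Result<()> {
        let data = "{\"a\": 1}\n{\"a\": 2}\n{\"a\": 3}\n";

        // Infer a type from at most two rows.
        let data_type = infer(&mut Cursor::new(data), Some(2))?;
        assert_ne!(data_type, DataType::Null);

        // Read all rows back in batches of up to two.
        let mut reader = FileReader::new(Cursor::new(data), vec![String::new(); 2], None);
        let mut total = 0;
        while let Some(rows) = reader.next()? {
            total += rows.len();
        }
        assert_eq!(total, 3);
        Ok(())
    }
}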