From 16b83de943a280b6c666707fbcfcbb30cfb9079f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Eustace?= Date: Mon, 19 Jun 2023 10:22:40 +0200 Subject: [PATCH] Reintroduce type accuracy parsing --- Cargo.lock | 24 +- Cargo.toml | 2 +- rust/parsing.rs | 342 +++++++++++++++++----------- rust/python/parsing.rs | 102 ++++++--- rust/python/types/timezone.rs | 13 +- tests/parsing/test_parse_iso8601.py | 31 ++- 6 files changed, 323 insertions(+), 191 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a067ca16..42aa1cc2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -59,9 +59,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memoffset" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" dependencies = [ "autocfg", ] @@ -131,9 +131,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.18.1" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06a3d8e8a46ab2738109347433cb7b96dffda2e4a218b03ef27090238886b147" +checksum = "cffef52f74ec3b1a1baf295d9b8fcc3070327aefc39a6d00656b13c1d0b8885c" dependencies = [ "cfg-if", "indoc", @@ -148,9 +148,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.18.1" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75439f995d07ddfad42b192dfcf3bc66a7ecfd8b4a1f5f6f046aa5c2c5d7677d" +checksum = "713eccf888fb05f1a96eb78c0dbc51907fee42b3377272dc902eb38985f418d5" dependencies = [ "once_cell", "target-lexicon", @@ -158,9 +158,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.18.1" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "839526a5c07a17ff44823679b68add4a58004de00512a95b6c1c98a6dcac0ee5" +checksum = "5b2ecbdcfb01cbbf56e179ce969a048fd7305a66d4cdf3303e0da09d69afe4c3" dependencies = [ "libc", "pyo3-build-config", @@ -168,9 +168,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.18.1" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd44cf207476c6a9760c4653559be4f206efafb924d3e4cbf2721475fc0d6cc5" +checksum = "b78fdc0899f2ea781c463679b20cb08af9247febc8d052de941951024cd8aea0" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -180,9 +180,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.18.1" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc1f43d8e30460f36350d18631ccf85ded64c059829208fe680904c65bcd0a4c" +checksum = "60da7b84f1227c3e2fe7593505de274dcf4c8928b4e0a1c23d551a14e4e80a0f" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 7d3b0fd0..737ec459 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ path = "rust/lib.rs" [dependencies] iso8601 = "0.6.1" nom = "7.1.3" -pyo3 = { version = "0.18.1", features = ["extension-module"] } +pyo3 = { version = "0.19.0", features = ["extension-module"] } [features] extension-module = ["pyo3/extension-module"] diff --git a/rust/parsing.rs b/rust/parsing.rs index f14ae9d9..2af5b976 100644 --- a/rust/parsing.rs +++ b/rust/parsing.rs @@ -33,6 +33,9 @@ pub struct ParsedDateTime { pub microsecond: u32, pub offset: Option, pub has_offset: bool, + pub has_date: bool, + pub has_time: bool, + pub extended_date_format: bool, pub time_is_midnight: bool, } @@ -48,6 +51,9 @@ impl<'a> ParsedDateTime { microsecond: 0, offset: None, has_offset: false, + has_date: false, + has_time: false, + extended_date_format: false, time_is_midnight: false, } } @@ -212,13 +218,67 @@ impl<'a> Parser<'a> { fn parse_datetime(&mut self, parsed: &mut Parsed) -> Result<(), ParseError> { let mut datetime = ParsedDateTime::new(); - let mut extended_date_format: bool = false; - datetime.year = self.parse_integer(4, "year")?; + if self.current == 'T' { + self.parse_time(&mut datetime, false)?; + + if !self.end() { + return Err(self.parse_error(format!("Unconverted data remains"))); + } + + match &parsed.datetime { + Some(_) => { + parsed.second_datetime = Some(datetime); + } + None => match &parsed.duration { + Some(_) => { + parsed.second_datetime = Some(datetime); + } + None => { + parsed.datetime = Some(datetime); + } + }, + } + + return Ok(()); + } + + datetime.year = self.parse_integer(2, "year")?; + + if self.current == ':' { + // Time in extended format + datetime.hour = datetime.year; + datetime.year = 0; + datetime.extended_date_format = true; + self.parse_time(&mut datetime, true)?; + + if !self.end() { + return Err(self.parse_error(format!("Unconverted data remains"))); + } + + match &parsed.datetime { + Some(_) => { + parsed.second_datetime = Some(datetime); + } + None => match &parsed.duration { + Some(_) => { + parsed.second_datetime = Some(datetime); + } + None => { + parsed.datetime = Some(datetime); + } + }, + } + + return Ok(()); + } + + datetime.has_date = true; + datetime.year = datetime.year * 100 + self.parse_integer(2, "year")?; if self.current == '-' { self.inc(); - extended_date_format = true; + datetime.extended_date_format = true; if self.current == 'W' { // ISO week and day in extended format (i.e. Www-D) @@ -327,138 +387,180 @@ impl<'a> Parser<'a> { } if !self.end() { - // Date/Time separator - if self.current != 'T' && self.current != ' ' { - return Err(self.parse_error(format!( - "Invalid character \"{}\" while parsing {}", - self.current, "date and time separator (\"T\" or \" \")" - ))); + self.parse_time(&mut datetime, false)?; + } + + if !self.end() { + if self.current == '/' && parsed.datetime.is_none() && parsed.duration.is_none() { + // Interval + parsed.datetime = Some(datetime); + + self.inc(); + + if self.current == 'P' { + // Duration + self.parse_duration(parsed)?; + } else { + self.parse_datetime(parsed)?; + } + + return Ok(()); } + return Err(self.parse_error(format!("Unconverted data remains"))); + } + + match &parsed.datetime { + Some(_) => { + parsed.second_datetime = Some(datetime); + } + None => match &parsed.duration { + Some(_) => { + parsed.second_datetime = Some(datetime); + } + None => { + parsed.datetime = Some(datetime); + } + }, + } + + Ok(()) + } + + fn parse_time( + &mut self, + datetime: &mut ParsedDateTime, + skip_hour: bool, + ) -> Result<(), ParseError> { + // Date/Time separator + if self.current != 'T' && self.current != ' ' && !skip_hour { + return Err(self.parse_error(format!( + "Invalid character \"{}\" while parsing {}", + self.current, "date and time separator (\"T\" or \" \")" + ))); + } + + datetime.has_time = true; + + if !skip_hour { self.inc(); // Hour datetime.hour = self.parse_integer(2, "hour")?; + } - if !self.end() && self.current != 'Z' && self.current != '+' && self.current != '-' { - // Optional minute and second - if self.current == ':' { - // Minute and second in extended format (mm:ss) - self.inc(); + if !self.end() && self.current != 'Z' && self.current != '+' && self.current != '-' { + // Optional minute and second + if self.current == ':' { + // Minute and second in extended format (mm:ss) + self.inc(); - // Minute - datetime.minute = self.parse_integer(2, "minute")?; - - if !self.end() - && self.current != 'Z' - && self.current != '+' - && self.current != '-' - { - // Optional second - if self.current != ':' { - return Err(self.parse_error(format!( - "Invalid character \"{}\" while parsing {}", - self.current, "time separator (\":\")" - ))); - } + // Minute + datetime.minute = self.parse_integer(2, "minute")?; - self.inc(); + if !self.end() && self.current != 'Z' && self.current != '+' && self.current != '-' + { + // Optional second + if self.current != ':' { + return Err(self.parse_error(format!( + "Invalid character \"{}\" while parsing {}", + self.current, "time separator (\":\")" + ))); + } - // Second - datetime.second = self.parse_integer(2, "second")?; + self.inc(); - if self.current == '.' || self.current == ',' { - // Optional fractional second - self.inc(); + // Second + datetime.second = self.parse_integer(2, "second")?; - datetime.microsecond = 0; - let mut i: u8 = 0; - - while i < 6 { - if self.current >= '0' && self.current <= '9' { - datetime.microsecond = datetime.microsecond * 10 - + self.current.to_digit(10).unwrap(); - } else if i == 0 { - // One digit minimum is required - return Err(self.unexpected_character_error("subsecond", 1)); - } else { - break; - } + if self.current == '.' || self.current == ',' { + // Optional fractional second + self.inc(); - self.inc(); - i += 1; + datetime.microsecond = 0; + let mut i: u8 = 0; + + while i < 6 { + if self.current >= '0' && self.current <= '9' { + datetime.microsecond = + datetime.microsecond * 10 + self.current.to_digit(10).unwrap(); + } else if i == 0 { + // One digit minimum is required + return Err(self.unexpected_character_error("subsecond", 1)); + } else { + break; } - // Drop extraneous digits - while self.current >= '0' && self.current <= '9' { - self.inc(); - } + self.inc(); + i += 1; + } - // Expand missing microsecond - while i < 6 { - datetime.microsecond *= 10; - i += 1; - } + // Drop extraneous digits + while self.current >= '0' && self.current <= '9' { + self.inc(); } - if !extended_date_format { - return Err(self.parse_error(format!("Cannot combine \"basic\" date format with \"extended\" time format (Should be either `YYYY-MM-DDThh:mm:ss` or `YYYYMMDDThhmmss`)."))); + // Expand missing microsecond + while i < 6 { + datetime.microsecond *= 10; + i += 1; } } - } else { - // Minute and second in compact format (mmss) - // Minute - datetime.minute = self.parse_integer(2, "minute")?; + if !datetime.extended_date_format { + return Err(self.parse_error(format!("Cannot combine \"basic\" date format with \"extended\" time format (Should be either `YYYY-MM-DDThh:mm:ss` or `YYYYMMDDThhmmss`)."))); + } + } + } else { + // Minute and second in compact format (mmss) - if !self.end() - && self.current != 'Z' - && self.current != '+' - && self.current != '-' - { - // Optional second + // Minute + datetime.minute = self.parse_integer(2, "minute")?; - datetime.second = self.parse_integer(2, "second")?; + if !self.end() && self.current != 'Z' && self.current != '+' && self.current != '-' + { + // Optional second - if self.current == '.' || self.current == ',' { - // Optional fractional second - self.inc(); + datetime.second = self.parse_integer(2, "second")?; - datetime.microsecond = 0; - let mut i: u8 = 0; - - while i < 6 { - if self.current >= '0' && self.current <= '9' { - datetime.microsecond = datetime.microsecond * 10 - + self.current.to_digit(10).unwrap(); - } else if i == 0 { - // One digit minimum is required - return Err(self.unexpected_character_error("subsecond", 1)); - } else { - break; - } + if self.current == '.' || self.current == ',' { + // Optional fractional second + self.inc(); - self.inc(); - i += 1; + datetime.microsecond = 0; + let mut i: u8 = 0; + + while i < 6 { + if self.current >= '0' && self.current <= '9' { + datetime.microsecond = + datetime.microsecond * 10 + self.current.to_digit(10).unwrap(); + } else if i == 0 { + // One digit minimum is required + return Err(self.unexpected_character_error("subsecond", 1)); + } else { + break; } - // Drop extraneous digits - while self.current >= '0' && self.current <= '9' { - self.inc(); - } + self.inc(); + i += 1; + } - // Expand missing microsecond - while i < 6 { - datetime.microsecond *= 10; - i += 1; - } + // Drop extraneous digits + while self.current >= '0' && self.current <= '9' { + self.inc(); } - } - if extended_date_format { - return Err(self.parse_error(format!("Cannot combine \"extended\" date format with \"basic\" time format (Should be either `YYYY-MM-DDThh:mm:ss` or `YYYYMMDDThhmmss`)."))); + // Expand missing microsecond + while i < 6 { + datetime.microsecond *= 10; + i += 1; + } } } + + if datetime.extended_date_format { + return Err(self.parse_error(format!("Cannot combine \"extended\" date format with \"basic\" time format (Should be either `YYYY-MM-DDThh:mm:ss` or `YYYYMMDDThhmmss`)."))); + } } } @@ -515,41 +617,7 @@ impl<'a> Parser<'a> { datetime.offset = Some(tzminute * 60); } - if !self.end() { - if self.current == '/' && parsed.datetime.is_none() && parsed.duration.is_none() { - // Interval - parsed.datetime = Some(datetime); - - self.inc(); - - if self.current == 'P' { - // Duration - self.parse_duration(parsed)?; - } else { - self.parse_datetime(parsed)?; - } - - return Ok(()); - } - - return Err(self.parse_error(format!("Unconverted data remains"))); - } - - match &parsed.datetime { - Some(_) => { - parsed.second_datetime = Some(datetime); - } - None => match &parsed.duration { - Some(_) => { - parsed.second_datetime = Some(datetime); - } - None => { - parsed.datetime = Some(datetime); - } - }, - } - - Ok(()) + return Ok(()); } fn parse_duration(&mut self, parsed: &mut Parsed) -> Result<(), ParseError> { diff --git a/rust/python/parsing.rs b/rust/python/parsing.rs index 60505f06..16113e86 100644 --- a/rust/python/parsing.rs +++ b/rust/python/parsing.rs @@ -1,7 +1,10 @@ use pyo3::exceptions; -use pyo3::{prelude::*, types::PyDateTime}; +use pyo3::prelude::*; +use pyo3::types::PyDate; +use pyo3::types::PyDateTime; +use pyo3::types::PyTime; -use crate::parsing::Parser; +use crate::parsing::{ParseError, Parser}; use crate::python::types::{Duration, FixedTimezone}; #[pyfunction] @@ -10,41 +13,84 @@ pub fn parse_iso8601(py: Python, input: &str) -> PyResult { match parsed { Ok(parsed) => match (parsed.datetime, parsed.duration, parsed.second_datetime) { - (Some(datetime), None, None) => match datetime.offset { - Some(offset) => { - let dt = PyDateTime::new( - py, - datetime.year as i32, - datetime.month as u8, - datetime.day as u8, - datetime.hour as u8, - datetime.minute as u8, - datetime.second as u8, - datetime.microsecond as u32, - Some( - Py::new(py, FixedTimezone::new(offset, None))? - .to_object(py) - .extract(py)?, - ), - )?; + (Some(datetime), None, None) => match (datetime.has_date, datetime.has_time) { + (true, true) => match datetime.offset { + Some(offset) => { + let dt = PyDateTime::new( + py, + datetime.year as i32, + datetime.month as u8, + datetime.day as u8, + datetime.hour as u8, + datetime.minute as u8, + datetime.second as u8, + datetime.microsecond as u32, + Some( + Py::new(py, FixedTimezone::new(offset, None))? + .to_object(py) + .extract(py)?, + ), + )?; - return Ok(dt.to_object(py)); - } - None => { - let dt = PyDateTime::new( + return Ok(dt.to_object(py)); + } + None => { + let dt = PyDateTime::new( + py, + datetime.year as i32, + datetime.month as u8, + datetime.day as u8, + datetime.hour as u8, + datetime.minute as u8, + datetime.second as u8, + datetime.microsecond as u32, + None, + )?; + + return Ok(dt.to_object(py)); + } + }, + (true, false) => { + let dt = PyDate::new( py, datetime.year as i32, datetime.month as u8, datetime.day as u8, - datetime.hour as u8, - datetime.minute as u8, - datetime.second as u8, - datetime.microsecond as u32, - None, )?; return Ok(dt.to_object(py)); } + (false, true) => match datetime.offset { + Some(offset) => { + let dt = PyTime::new( + py, + datetime.hour as u8, + datetime.minute as u8, + datetime.second as u8, + datetime.microsecond as u32, + Some( + Py::new(py, FixedTimezone::new(offset, None))? + .to_object(py) + .extract(py)?, + ), + )?; + + return Ok(dt.to_object(py)); + } + None => { + let dt = PyTime::new( + py, + datetime.hour as u8, + datetime.minute as u8, + datetime.second as u8, + datetime.microsecond as u32, + None, + )?; + + return Ok(dt.to_object(py)); + } + }, + (_, _) => Err(exceptions::PyValueError::new_err(format!("Parsing error"))), }, (None, Some(duration), None) => { return Ok(Py::new( diff --git a/rust/python/types/timezone.rs b/rust/python/types/timezone.rs index c0688fef..c3a5565a 100644 --- a/rust/python/types/timezone.rs +++ b/rust/python/types/timezone.rs @@ -1,7 +1,8 @@ use pyo3::prelude::*; -use pyo3::types::{PyDateTime, PyDelta, PyTzInfo}; +use pyo3::types::{PyDateTime, PyDelta, PyDict, PyTzInfo}; #[pyclass(module = "_pendulum", extends = PyTzInfo)] +#[derive(Clone)] pub struct FixedTimezone { offset: i32, name: Option, @@ -14,15 +15,15 @@ impl FixedTimezone { Self { offset, name } } - fn utcoffset<'p>(&self, py: Python<'p>, _dt: &PyDateTime) -> PyResult<&'p PyDelta> { + fn utcoffset<'p>(&self, py: Python<'p>, _dt: &PyAny) -> PyResult<&'p PyDelta> { PyDelta::new(py, 0, self.offset, 0, true) } - fn tzname(&self, _dt: &PyDateTime) -> String { + fn tzname(&self, _dt: &PyAny) -> String { self.__str__() } - fn dst<'p>(&self, py: Python<'p>, _dt: &PyDateTime) -> PyResult<&'p PyDelta> { + fn dst<'p>(&self, py: Python<'p>, _dt: &PyAny) -> PyResult<&'p PyDelta> { PyDelta::new(py, 0, 0, 0, true) } @@ -45,4 +46,8 @@ impl FixedTimezone { } } } + + fn __deepcopy__(&self, py: Python, _memo: &PyDict) -> PyResult> { + Py::new(py, self.clone()) + } } diff --git a/tests/parsing/test_parse_iso8601.py b/tests/parsing/test_parse_iso8601.py index 664ef8fa..28b85bbc 100644 --- a/tests/parsing/test_parse_iso8601.py +++ b/tests/parsing/test_parse_iso8601.py @@ -2,6 +2,7 @@ from datetime import date from datetime import datetime +from datetime import time import pytest @@ -16,17 +17,29 @@ @pytest.mark.parametrize( ["text", "expected"], [ - ("2016-10", datetime(2016, 10, 1)), - ("2016-10-06", datetime(2016, 10, 6)), + ("2016-10", date(2016, 10, 1)), + ("2016-10-06", date(2016, 10, 6)), # Ordinal date - ("2012-007", datetime(2012, 1, 7)), - ("2012007", datetime(2012, 1, 7)), - ("2017-079", datetime(2017, 3, 20)), + ("2012-007", date(2012, 1, 7)), + ("2012007", date(2012, 1, 7)), + ("2017-079", date(2017, 3, 20)), # Week date - ("2012-W05", datetime(2012, 1, 30)), - ("2008-W39-6", datetime(2008, 9, 27)), - ("2009-W53-7", datetime(2010, 1, 3)), - ("2009-W01-1", datetime(2008, 12, 29)), + ("2012-W05", date(2012, 1, 30)), + ("2008-W39-6", date(2008, 9, 27)), + ("2009-W53-7", date(2010, 1, 3)), + ("2009-W01-1", date(2008, 12, 29)), + # Time + ("12:34", time(12, 34, 0)), + ("12:34:56", time(12, 34, 56)), + ("12:34:56.123", time(12, 34, 56, 123000)), + ("12:34:56.123456", time(12, 34, 56, 123456)), + ("12:34+05:30", time(12, 34, 0, tzinfo=FixedTimezone(19800))), + ("12:34:56+05:30", time(12, 34, 56, tzinfo=FixedTimezone(19800))), + ("12:34:56.123+05:30", time(12, 34, 56, 123000, tzinfo=FixedTimezone(19800))), + ( + "12:34:56.123456+05:30", + time(12, 34, 56, 123456, tzinfo=FixedTimezone(19800)), + ), # Datetime ("2016-10-06T12:34:56.123456", datetime(2016, 10, 6, 12, 34, 56, 123456)), ("2016-10-06T12:34:56.123", datetime(2016, 10, 6, 12, 34, 56, 123000)),