Skip to content

Commit

Permalink
Add support for requiring base prefixes and suffixes.
Browse files Browse the repository at this point in the history
This makes base prefixes and suffixes mandatory when parsing, and also adds them to the output of our float and integer writers. This is useful for cases like hex floats, where the floats only make sense when they have a literal `0x` prefixing them.
  • Loading branch information
Alexhuszagh committed Jan 12, 2025
1 parent 933a8da commit fdef013
Show file tree
Hide file tree
Showing 18 changed files with 882 additions and 122 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added `build_checked` to our `Options` API (#204).
- Added `has_digit_separator` to `NumberFormat` (#204).
- Re-export `NumberFormat` to our other crates (#204).
- Add `Options::from_radix` for all options for similar APIs for each (#208).
- Added `Options::from_radix` for all options for similar APIs for each (#208).
- Added support for requiring both integer and fraction digits with exponents, that is, `1.e5` and `.1e5`, as opposed to just requiring `1e5` (#215).
- Added `supports_parsing_integers`, `supports_parsing_floats`, `supports_writing_integers`, and `supports_writing_floats` for our number formats (#215).
- Added `required_base_prefix` and `required_base_suffix` for our number formats, requiring base prefixes and/or suffixes when parsing, and allowing writing base prefixes and/or suffixes (#215).

### Changed

Expand Down
29 changes: 19 additions & 10 deletions lexical-parse-float/src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -536,25 +536,31 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>(
// INTEGER

// Check to see if we have a valid base prefix.
// NOTE: `lz_prefix` records whether we consumed a leading zero
// while checking for a base prefix: it does not indicate whether
// the base prefix itself is present.
#[allow(unused_variables)]
let mut is_prefix = false;
#[cfg(feature = "format")]
let mut lz_prefix = false;
#[cfg(all(feature = "format", feature = "power-of-two"))]
{
let base_prefix = format.base_prefix();
let mut has_prefix = false;
let mut iter = byte.integer_iter();
if base_prefix != 0 && iter.read_if_value_cased(b'0').is_some() {
// Check to see if the next character is the base prefix.
// We must have a format like `0x`, `0d`, `0o`.
// NOTE: The check for empty integer digits happens below so
// we don't need a redundant check here.
is_prefix = true;
if iter.read_if_value(base_prefix, format.case_sensitive_base_prefix()).is_some()
&& iter.is_buffer_empty()
&& format.required_integer_digits()
{
lz_prefix = true;
let prefix = iter.read_if_value(base_prefix, format.case_sensitive_base_prefix());
has_prefix = prefix.is_some();
if has_prefix && iter.is_buffer_empty() && format.required_integer_digits() {
return Err(Error::EmptyInteger(iter.cursor()));
}
}
if format.required_base_prefix() && !has_prefix {
return Err(Error::MissingBasePrefix(iter.cursor()));
}
}

// Parse our integral digits.
Expand Down Expand Up @@ -600,7 +606,7 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>(

// Check if integer leading zeros are disabled.
#[cfg(feature = "format")]
if !is_prefix && format.no_float_leading_zeros() {
if !lz_prefix && format.no_float_leading_zeros() {
if integer_digits.len() > 1 && integer_digits.first() == Some(&b'0') {
return Err(Error::InvalidLeadingZeros(start.cursor()));
}
Expand Down Expand Up @@ -741,11 +747,14 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>(
// that the first character **is not** a digit separator.
#[allow(unused_variables)]
let base_suffix = format.base_suffix();
#[cfg(feature = "format")]
#[cfg(all(feature = "format", feature = "power-of-two"))]
if base_suffix != 0 {
if byte.first_is(base_suffix, format.case_sensitive_base_suffix()) {
let is_suffix = byte.first_is(base_suffix, format.case_sensitive_base_suffix());
if is_suffix {
// SAFETY: safe since `byte.len() >= 1`.
unsafe { byte.step_unchecked() };
} else if format.required_base_suffix() {
return Err(Error::MissingBaseSuffix(byte.cursor()));
}
}

Expand Down
36 changes: 36 additions & 0 deletions lexical-parse-float/tests/api_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1307,3 +1307,39 @@ fn supported_test() {
let value = f64::from_lexical_partial_with_options::<FORMAT>(float.as_bytes(), &OPTIONS);
assert_eq!(value, Ok((12345.0, 7)));
}

/// Checks that a required base prefix (`0d`) and a required base suffix
/// (`z`) are enforced when parsing floats, for both unsigned and negative
/// inputs, and that the reported error index matches the expected position.
#[test]
#[cfg(all(feature = "format", feature = "power-of-two"))]
fn require_base_prefix_test() {
    use core::num::NonZeroU8;

    const OPTIONS: Options = Options::new();

    // Format with a mandatory `d` base prefix, i.e. digits must follow `0d`.
    const PREFIX: u128 = NumberFormatBuilder::new()
        .base_prefix(NonZeroU8::new(b'd'))
        .required_base_prefix(true)
        .build_strict();
    // Same format, additionally demanding a trailing `z` base suffix.
    const SUFFIX: u128 = NumberFormatBuilder::rebuild(PREFIX)
        .base_suffix(NonZeroU8::new(b'z'))
        .required_base_suffix(true)
        .build_strict();

    let with_prefix = |input: &[u8]| f64::from_lexical_with_options::<PREFIX>(input, &OPTIONS);
    let with_suffix = |input: &[u8]| f64::from_lexical_with_options::<SUFFIX>(input, &OPTIONS);

    // Required prefix: present => parses, absent => error at the first digit.
    assert_eq!(with_prefix(b"0d12345"), Ok(12345.0));
    assert_eq!(with_prefix(b"12345"), Err(Error::MissingBasePrefix(0)));

    // The sign precedes the prefix, so the error index shifts by one.
    assert_eq!(with_prefix(b"-0d12345"), Ok(-12345.0));
    assert_eq!(with_prefix(b"-12345"), Err(Error::MissingBasePrefix(1)));

    // Required suffix: present => parses, absent => error at the end of input.
    assert_eq!(with_suffix(b"0d12345z"), Ok(12345.0));
    assert_eq!(with_suffix(b"0d12345"), Err(Error::MissingBaseSuffix(7)));

    assert_eq!(with_suffix(b"-0d12345z"), Ok(-12345.0));
    assert_eq!(with_suffix(b"-0d12345"), Err(Error::MissingBaseSuffix(8)));
}
138 changes: 121 additions & 17 deletions lexical-parse-integer/src/algorithm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,19 +120,26 @@ macro_rules! into_error {
#[cfg(feature = "format")]
macro_rules! fmt_invalid_digit {
(
$value:ident, $iter:ident, $c:expr, $start_index:ident, $invalid_digit:ident, $is_end:expr
$value:ident,
$iter:ident,
$c:expr,
$start_index:ident,
$invalid_digit:ident,
$has_suffix:ident,
$is_end:expr $(,)?
) => {{
// NOTE: If we have non-contiguous iterators, we could have a skip character
// here at the boundary. This does not affect safety but it does affect
// correctness.
debug_assert!($iter.is_contiguous() || $is_end);

let base_suffix = NumberFormat::<FORMAT>::BASE_SUFFIX;
let uncased_base_suffix = NumberFormat::<FORMAT>::CASE_SENSITIVE_BASE_SUFFIX;
let format = NumberFormat::<FORMAT> {};
let base_suffix = format.base_suffix();
let uncased_base_suffix = format.case_sensitive_base_suffix();
// Need to check for a base suffix, if so, return a valid value.
// We can't have a base suffix at the first value (need at least
// 1 digit).
if base_suffix != 0 && $iter.cursor() - $start_index > 1 {
if cfg!(feature = "power-of-two") && base_suffix != 0 && $iter.cursor() - $start_index > 1 {
let is_suffix = if uncased_base_suffix {
$c == base_suffix
} else {
Expand All @@ -144,6 +151,7 @@ macro_rules! fmt_invalid_digit {
// contiguous iterators.
if is_suffix && $is_end && $iter.is_buffer_empty() {
// Break out of the loop, we've finished parsing.
$has_suffix = true;
break;
} else if !$iter.is_buffer_empty() {
// Haven't finished parsing, so we're going to call
Expand All @@ -165,7 +173,13 @@ macro_rules! fmt_invalid_digit {
#[cfg(not(feature = "format"))]
macro_rules! fmt_invalid_digit {
(
$value:ident, $iter:ident, $c:expr, $start_index:ident, $invalid_digit:ident, $is_end:expr
$value:ident,
$iter:ident,
$c:expr,
$start_index:ident,
$invalid_digit:ident,
$has_suffix:ident,
$is_end:expr $(,)?
) => {{
$invalid_digit!($value, $iter.cursor(), $iter.current_count());
}};
Expand Down Expand Up @@ -393,6 +407,7 @@ where
/// * `add_op` - The unchecked add/sub op.
/// * `start_index` - The offset where parsing started.
/// * `invalid_digit` - Behavior when an invalid digit is found.
/// * `has_suffix` - If a base suffix was found at the end of the buffer.
/// * `is_end` - If iter corresponds to the full input.
///
/// core: <https://doc.rust-lang.org/1.81.0/src/core/num/mod.rs.html#1480>
Expand All @@ -403,15 +418,24 @@ macro_rules! parse_1digit_unchecked {
$add_op:ident,
$start_index:ident,
$invalid_digit:ident,
$is_end:expr
$has_suffix:ident,
$is_end:expr $(,)?
) => {{
// This is a slower parsing algorithm, going 1 digit at a time, but doing it in
// an unchecked loop.
let radix = NumberFormat::<FORMAT>::MANTISSA_RADIX;
while let Some(&c) = $iter.next() {
let digit = match char_to_digit_const(c, radix) {
Some(v) => v,
None => fmt_invalid_digit!($value, $iter, c, $start_index, $invalid_digit, $is_end),
None => fmt_invalid_digit!(
$value,
$iter,
c,
$start_index,
$invalid_digit,
$has_suffix,
$is_end,
),
};
// multiply first since compilers are good at optimizing things out and will do
// a fused mul/add We must do this after getting the digit for
Expand All @@ -431,6 +455,7 @@ macro_rules! parse_1digit_unchecked {
/// * `add_op` - The checked add/sub op.
/// * `start_index` - The offset where parsing started.
/// * `invalid_digit` - Behavior when an invalid digit is found.
/// * `has_suffix` - If a base suffix was found at the end of the buffer.
/// * `overflow` - If the error is overflow or underflow.
///
/// core: <https://doc.rust-lang.org/1.81.0/src/core/num/mod.rs.html#1505>
Expand All @@ -441,15 +466,24 @@ macro_rules! parse_1digit_checked {
$add_op:ident,
$start_index:ident,
$invalid_digit:ident,
$overflow:ident
$has_suffix:ident,
$overflow:ident $(,)?
) => {{
// This is a slower parsing algorithm, going 1 digit at a time, but doing it in
// a checked loop.
let radix = NumberFormat::<FORMAT>::MANTISSA_RADIX;
while let Some(&c) = $iter.next() {
let digit = match char_to_digit_const(c, radix) {
Some(v) => v,
None => fmt_invalid_digit!($value, $iter, c, $start_index, $invalid_digit, true),
None => fmt_invalid_digit!(
$value,
$iter,
c,
$start_index,
$invalid_digit,
$has_suffix,
true,
),
};
// multiply first since compilers are good at optimizing things out and will do
// a fused mul/add
Expand Down Expand Up @@ -477,6 +511,7 @@ macro_rules! parse_1digit_checked {
/// * `start_index` - The offset where parsing started.
/// * `invalid_digit` - Behavior when an invalid digit is found.
/// * `no_multi_digit` - If to disable multi-digit optimizations.
/// * `has_suffix` - If a base suffix was found at the end of the buffer.
/// * `is_end` - If iter corresponds to the full input.
macro_rules! parse_digits_unchecked {
(
Expand All @@ -486,7 +521,8 @@ macro_rules! parse_digits_unchecked {
$start_index:ident,
$invalid_digit:ident,
$no_multi_digit:expr,
$is_end:expr
$has_suffix:ident,
$is_end:expr $(,)?
) => {{
let can_multi = can_try_parse_multidigits::<_, FORMAT>(&$iter);
let use_multi = can_multi && !$no_multi_digit;
Expand All @@ -510,7 +546,15 @@ macro_rules! parse_digits_unchecked {
$value = $value.wrapping_mul(radix4).$add_op(value);
}
}
parse_1digit_unchecked!($value, $iter, $add_op, $start_index, $invalid_digit, $is_end)
parse_1digit_unchecked!(
$value,
$iter,
$add_op,
$start_index,
$invalid_digit,
$has_suffix,
$is_end
)
}};
}

Expand All @@ -528,6 +572,7 @@ macro_rules! parse_digits_unchecked {
/// * `invalid_digit` - Behavior when an invalid digit is found.
/// * `overflow` - If the error is overflow or underflow.
/// * `no_multi_digit` - If to disable multi-digit optimizations.
/// * `has_suffix` - If a base suffix was found at the end of the buffer.
/// * `overflow_digits` - The number of digits before we need to consider
/// checked ops.
macro_rules! parse_digits_checked {
Expand All @@ -540,7 +585,8 @@ macro_rules! parse_digits_checked {
$invalid_digit:ident,
$overflow:ident,
$no_multi_digit:expr,
$overflow_digits:expr
$has_suffix:ident,
$overflow_digits:expr $(,)?
) => {{
// Can use the unchecked for the `max_digits` here. If we
// have a non-contiguous iterator, we could have a case like
Expand All @@ -557,13 +603,22 @@ macro_rules! parse_digits_checked {
$start_index,
$invalid_digit,
$no_multi_digit,
$has_suffix,
false
);
}
}

// NOTE: all our multi-digit optimizations have been done here: skip this
parse_1digit_checked!($value, $iter, $add_op, $start_index, $invalid_digit, $overflow)
parse_1digit_checked!(
$value,
$iter,
$add_op,
$start_index,
$invalid_digit,
$has_suffix,
$overflow
)
}};
}

Expand Down Expand Up @@ -650,6 +705,9 @@ macro_rules! algorithm {
}
}
}
if cfg!(all(feature = "format", feature = "power-of-two")) && format.required_base_prefix() && !is_prefix {
return Err(Error::MissingBasePrefix(iter.cursor()));
}

// If we have a format that doesn't accept leading zeros,
// check if the next value is invalid. It's invalid if the
Expand Down Expand Up @@ -684,14 +742,60 @@ macro_rules! algorithm {
// culminates in **way** slower performance overall for simple
// integers, and no improvement for large integers.
let mut value = T::ZERO;
#[allow(unused_mut)]
let mut has_suffix = false;
if cannot_overflow && is_negative {
parse_digits_unchecked!(value, iter, wrapping_sub, start_index, $invalid_digit, $no_multi_digit, true);
parse_digits_unchecked!(
value,
iter,
wrapping_sub,
start_index,
$invalid_digit,
$no_multi_digit,
has_suffix,
true,
);
} if cannot_overflow {
parse_digits_unchecked!(value, iter, wrapping_add, start_index, $invalid_digit, $no_multi_digit, true);
parse_digits_unchecked!(
value,
iter,
wrapping_add,
start_index,
$invalid_digit,
$no_multi_digit,
has_suffix,
true,
);
} else if is_negative {
parse_digits_checked!(value, iter, checked_sub, wrapping_sub, start_index, $invalid_digit, Underflow, $no_multi_digit, overflow_digits);
parse_digits_checked!(
value,
iter,
checked_sub,
wrapping_sub,
start_index,
$invalid_digit,
Underflow,
$no_multi_digit,
has_suffix,
overflow_digits,
);
} else {
parse_digits_checked!(value, iter, checked_add, wrapping_add, start_index, $invalid_digit, Overflow, $no_multi_digit, overflow_digits);
parse_digits_checked!(
value,
iter,
checked_add,
wrapping_add,
start_index,
$invalid_digit,
Overflow,
$no_multi_digit,
has_suffix,
overflow_digits,
);
}

if cfg!(all(feature = "format", feature = "power-of-two")) && format.required_base_suffix() && !has_suffix {
return Err(Error::MissingBaseSuffix(iter.cursor()));
}

$into_ok!(value, iter.buffer_length(), iter.current_count())
Expand Down
Loading

0 comments on commit fdef013

Please sign in to comment.