From cb436be2b3955343aa15f5f583feaf5c0dee0fd3 Mon Sep 17 00:00:00 2001 From: Lachlan Deakin Date: Tue, 7 Jan 2025 08:41:44 +1100 Subject: [PATCH] fix: make `pcodec` compatible with `numcodecs` (#122) This should be compatible with the next numcodecs release. Fixes #111 --- CHANGELOG.md | 5 + .../array_to_bytes/pcodec/pcodec_codec.rs | 69 +++-- zarrs_metadata/CHANGELOG.md | 3 + zarrs_metadata/src/v3/array/codec/pcodec.rs | 244 ++++-------------- 4 files changed, 104 insertions(+), 217 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eedf33d4..4e7c8cc0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add `Vlen{Array,Bytes,Utf8}Codec`, replacing `VlenV2Codec` - Add `ZstdCodecConfigurationNumCodecs` - Adds support for Zarr V2 `zstd` encoded data created with `numcodecs` < 0.13 +- Add support for pcodec `Auto`, `None`, and `TryLookback` delta specs ### Changed - **Breaking**: Seal `Array` extension traits: `ArraySharded[Readable]Ext` and `ArrayChunkCacheExt` @@ -26,6 +27,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Breaking**: Make `VlenV2Codec` private - Bump `itertools` to 0.14 +### Removed +- Remove support for pcodec `Try{FloatMult,FloatQuant,IntMult}` mode specs + - These may be reimplemented when supported by `zarr-python`/`numcodecs` + ### Fixed - Cleanup unnecessary lifetime constraints in partial decoders - Fix `clippy::useless_conversion` lint diff --git a/zarrs/src/array/codec/array_to_bytes/pcodec/pcodec_codec.rs b/zarrs/src/array/codec/array_to_bytes/pcodec/pcodec_codec.rs index 5925f64f..10c9f6db 100644 --- a/zarrs/src/array/codec/array_to_bytes/pcodec/pcodec_codec.rs +++ b/zarrs/src/array/codec/array_to_bytes/pcodec/pcodec_codec.rs @@ -1,6 +1,9 @@ use std::{borrow::Cow, sync::Arc}; use pco::{standalone::guarantee::file_size, ChunkConfig, DeltaSpec, ModeSpec, PagingSpec}; +use zarrs_metadata::v3::array::codec::pcodec::{ + PcodecDeltaSpecConfiguration, PcodecPagingSpecConfiguration, +}; use crate::{ array::{ @@ -31,13 +34,10 @@ pub struct PcodecCodec { chunk_config: ChunkConfig, } -fn mode_spec_config_to_pco(mode_spec: &PcodecModeSpecConfiguration) -> ModeSpec { +fn mode_spec_config_to_pco(mode_spec: PcodecModeSpecConfiguration) -> ModeSpec { match mode_spec { PcodecModeSpecConfiguration::Auto => ModeSpec::Auto, PcodecModeSpecConfiguration::Classic => ModeSpec::Classic, - PcodecModeSpecConfiguration::TryFloatMult(base) => ModeSpec::TryFloatMult(*base), - PcodecModeSpecConfiguration::TryFloatQuant(k) => ModeSpec::TryFloatQuant(*k), - PcodecModeSpecConfiguration::TryIntMult(base) => ModeSpec::TryIntMult(*base), } } @@ -45,27 +45,32 @@ fn mode_spec_pco_to_config(mode_spec: &ModeSpec) -> PcodecModeSpecConfiguration match mode_spec { ModeSpec::Auto => PcodecModeSpecConfiguration::Auto, ModeSpec::Classic => PcodecModeSpecConfiguration::Classic, - ModeSpec::TryFloatMult(base) => PcodecModeSpecConfiguration::TryFloatMult(*base), - ModeSpec::TryFloatQuant(k) => PcodecModeSpecConfiguration::TryFloatQuant(*k), - ModeSpec::TryIntMult(base) => PcodecModeSpecConfiguration::TryIntMult(*base), _ => unreachable!("Mode spec is not supported"), } } fn configuration_to_chunk_config(configuration: &PcodecCodecConfigurationV1) -> ChunkConfig { - let mode_spec = mode_spec_config_to_pco(&configuration.mode_spec); + let mode_spec = mode_spec_config_to_pco(configuration.mode_spec); + let delta_spec = match configuration.delta_spec { + PcodecDeltaSpecConfiguration::Auto => DeltaSpec::Auto, + PcodecDeltaSpecConfiguration::None => DeltaSpec::None, + PcodecDeltaSpecConfiguration::TryConsecutive => DeltaSpec::TryConsecutive( + configuration + .delta_encoding_order + .map_or(0, |o| o.as_usize()), + ), + PcodecDeltaSpecConfiguration::TryLookback => DeltaSpec::TryLookback, + }; + let paging_spec = match configuration.paging_spec { + PcodecPagingSpecConfiguration::EqualPagesUpTo => { + PagingSpec::EqualPagesUpTo(configuration.equal_pages_up_to) + } + }; ChunkConfig::default() .with_compression_level(configuration.level.as_usize()) .with_mode_spec(mode_spec) - .with_delta_spec( - configuration - .delta_encoding_order - .map(|delta_encoding_order| { - DeltaSpec::TryConsecutive(delta_encoding_order.as_usize()) - }) - .unwrap_or_default(), - ) - .with_paging_spec(PagingSpec::EqualPagesUpTo(configuration.equal_pages_up_to)) + .with_delta_spec(delta_spec) + .with_paging_spec(paging_spec) } impl PcodecCodec { @@ -80,21 +85,33 @@ impl PcodecCodec { impl CodecTraits for PcodecCodec { fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option { - let PagingSpec::EqualPagesUpTo(equal_pages_up_to) = self.chunk_config.paging_spec else { - unreachable!() + let mode_spec = mode_spec_pco_to_config(&self.chunk_config.mode_spec); + let (delta_spec, delta_encoding_order) = match self.chunk_config.delta_spec { + DeltaSpec::Auto => (PcodecDeltaSpecConfiguration::Auto, None), + DeltaSpec::None => (PcodecDeltaSpecConfiguration::None, None), + DeltaSpec::TryConsecutive(delta_encoding_order) => ( + PcodecDeltaSpecConfiguration::TryConsecutive, + Some(PcodecDeltaEncodingOrder::try_from(delta_encoding_order).expect("valid")), + ), + DeltaSpec::TryLookback => (PcodecDeltaSpecConfiguration::TryLookback, None), + _ => unimplemented!("unsupported pcodec delta spec"), + }; + let (paging_spec, equal_pages_up_to) = match self.chunk_config.paging_spec { + PagingSpec::EqualPagesUpTo(equal_pages_up_to) => ( + PcodecPagingSpecConfiguration::EqualPagesUpTo, + equal_pages_up_to, + ), + PagingSpec::Exact(_) => unimplemented!("pcodec exact paging spec not supported"), + _ => unimplemented!("unsupported pcodec paging spec"), }; - let delta_encoding_order = - if let DeltaSpec::TryConsecutive(order) = self.chunk_config.delta_spec { - Some(PcodecDeltaEncodingOrder::try_from(order).expect("validated on creation")) - } else { - None - }; let configuration = PcodecCodecConfiguration::V1(PcodecCodecConfigurationV1 { level: PcodecCompressionLevel::try_from(self.chunk_config.compression_level) .expect("validated on creation"), + mode_spec, + delta_spec, + paging_spec, delta_encoding_order, - mode_spec: mode_spec_pco_to_config(&self.chunk_config.mode_spec), equal_pages_up_to, }); diff --git a/zarrs_metadata/CHANGELOG.md b/zarrs_metadata/CHANGELOG.md index af622b3a..1c19bbc4 100644 --- a/zarrs_metadata/CHANGELOG.md +++ b/zarrs_metadata/CHANGELOG.md @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - **Breaking**: Rename `DataTypeMetadataV3::Binary` to `Bytes` for compatibility with `zarr-python` +- **Breaking**: Revise `PcodecCodecConfiguration` to match `numcodecs`: + - Adds `delta_spec: PcodecDeltaSpecConfiguration` and `paging_spec: PcodecPagingSpecConfiguration` + - Removes `PcodecModeSpecConfiguration::{TryFloatMult,TryFloatQuant,TryIntMult}` ### Removed - **Breaking**: Remove the `v3::array::codec::vlen_v2` module and all associated types diff --git a/zarrs_metadata/src/v3/array/codec/pcodec.rs b/zarrs_metadata/src/v3/array/codec/pcodec.rs index 818e2441..1d1d7f35 100644 --- a/zarrs_metadata/src/v3/array/codec/pcodec.rs +++ b/zarrs_metadata/src/v3/array/codec/pcodec.rs @@ -21,7 +21,7 @@ impl Default for PcodecCodecConfiguration { /// Configuration parameters for the `pcodec` codec (version 1.0 draft). /// -/// Based upon [`pco::ChunkConfig`](https://docs.rs/pco/latest/pco/struct.ChunkConfig.html). +/// This configuration matches the implementation in `numcodecs`. /// /// ### Example: encode with a compression level of 12 and otherwise default parameters /// ```rust @@ -33,21 +33,23 @@ impl Default for PcodecCodecConfiguration { /// # use zarrs_metadata::v3::array::codec::pcodec::PcodecCodecConfigurationV1; /// # let configuration: PcodecCodecConfigurationV1 = serde_json::from_str(JSON).unwrap(); // TODO: Examples for more advanced configurations -// TODO: Docs about valid usage with base/k -#[derive(Clone, PartialEq, Debug, Display)] +#[derive(Serialize, Deserialize, Clone, PartialEq, Debug, Display)] #[display("{}", serde_json::to_string(self).unwrap_or_default())] +#[serde(default)] // for compatibility with zarrs < 0.19 pub struct PcodecCodecConfigurationV1 { /// A compression level from 0-12, where 12 takes the longest and compresses the most. - /// - /// The default is 8. pub level: PcodecCompressionLevel, + /// The pcodec mode spec. + pub mode_spec: PcodecModeSpecConfiguration, + /// The delta encoding strategy. + pub delta_spec: PcodecDeltaSpecConfiguration, + /// The paging spec. + pub paging_spec: PcodecPagingSpecConfiguration, /// Either a delta encoding level from 0-7 or None. /// /// If set to None, pcodec will try to infer the optimal delta encoding order. /// The default is None. pub delta_encoding_order: Option, - /// The pcodec mode spec. - pub mode_spec: PcodecModeSpecConfiguration, /// The maximum number of values to encode per pcodec page. /// /// If set too high or too low, pcodec's compression ratio may drop. @@ -57,162 +59,64 @@ pub struct PcodecCodecConfigurationV1 { pub equal_pages_up_to: usize, } -/// Specifies how Pco should choose a [`mode`](https://docs.rs/pco/latest/pco/enum.Mode.html) to compress this -/// chunk of data. +impl Default for PcodecCodecConfigurationV1 { + fn default() -> Self { + Self { + level: PcodecCompressionLevel::default(), + mode_spec: PcodecModeSpecConfiguration::default(), + delta_spec: PcodecDeltaSpecConfiguration::default(), + paging_spec: PcodecPagingSpecConfiguration::default(), + delta_encoding_order: None, + equal_pages_up_to: default_equal_pages_up_to(), + } + } +} + +/// The [`pco::ModeSpec`](https://docs.rs/pco/latest/pco/enum.ModeSpec.html). /// -/// see [`pco::ModeSpec`](https://docs.rs/pco/latest/pco/enum.ModeSpec.html). -#[derive(Deserialize, Clone, Copy, PartialEq, Debug, Display)] +/// `TryFloatMult`, `TryFloatQuant`, and `TryIntMult` are not currently supported. +#[derive(Serialize, Deserialize, Default, Clone, Copy, PartialEq, Debug, Display)] +#[serde(rename_all = "snake_case")] pub enum PcodecModeSpecConfiguration { - /// Automatically detect a good mode. - /// - /// This works well most of the time, but costs some compression time and can - /// select a bad mode in adversarial cases. + /// See . + #[default] Auto, - /// Only use `Classic` mode. + /// See . Classic, - /// Try using `FloatMult` mode with a given `base`. - /// - /// Only applies to floating-point types. - TryFloatMult(f64), - /// Try using `FloatQuant` mode with `k` bits of quantization. - /// - /// Only applies to floating-point types. - TryFloatQuant(u32), - /// Try using `IntMult` mode with a given `base`. - /// - /// Only applies to integer types. - TryIntMult(u64), } -#[derive(Serialize, Deserialize, Debug)] -// #[serde(untagged)] +/// The [`pco::DeltaSpec`](https://docs.rs/pco/latest/pco/enum.DeltaSpec.html). +/// +/// The delta encoding order for `TryConsecutive` is serialised in a separate field. +#[derive(Serialize, Deserialize, Default, Debug, Clone, Copy, PartialEq, Display)] #[serde(rename_all = "snake_case")] -enum PcodecModeSpecConfigurationIntermediate { +pub enum PcodecDeltaSpecConfiguration { + /// See . + #[default] Auto, - Classic, - TryFloatMult, - TryFloatQuant, - TryIntMult, + /// See . + None, + /// See . + TryConsecutive, + /// See . + TryLookback, } -impl Default for PcodecModeSpecConfigurationIntermediate { - fn default() -> Self { - Self::Auto - } -} - -#[derive(Serialize, Deserialize, Debug)] -#[serde(untagged)] -enum UIntOrFloat { - UInt(u64), - Float(f64), -} - -#[derive(Serialize, Deserialize, Debug)] -#[serde(deny_unknown_fields)] -struct PcodecCodecConfigurationIntermediate { - #[serde(default)] - level: PcodecCompressionLevel, - #[serde(default)] - delta_encoding_order: Option, - #[serde(default)] - mode_spec: PcodecModeSpecConfigurationIntermediate, - #[serde(skip_serializing_if = "Option::is_none")] - base: Option, - #[serde(skip_serializing_if = "Option::is_none")] - k: Option, - #[serde(default = "default_equal_pages_up_to")] - equal_pages_up_to: usize, -} - -impl serde::Serialize for PcodecCodecConfigurationV1 { - fn serialize(&self, s: S) -> Result { - let (mode_spec, base, k) = match self.mode_spec { - PcodecModeSpecConfiguration::Auto => { - (PcodecModeSpecConfigurationIntermediate::Auto, None, None) - } - PcodecModeSpecConfiguration::Classic => { - (PcodecModeSpecConfigurationIntermediate::Classic, None, None) - } - PcodecModeSpecConfiguration::TryFloatMult(base) => ( - PcodecModeSpecConfigurationIntermediate::TryFloatMult, - Some(UIntOrFloat::Float(base)), - None, - ), - PcodecModeSpecConfiguration::TryFloatQuant(k) => ( - PcodecModeSpecConfigurationIntermediate::TryFloatQuant, - None, - Some(k), - ), - PcodecModeSpecConfiguration::TryIntMult(base) => ( - PcodecModeSpecConfigurationIntermediate::TryIntMult, - Some(UIntOrFloat::UInt(base)), - None, - ), - }; - - let config = PcodecCodecConfigurationIntermediate { - level: self.level, - delta_encoding_order: self.delta_encoding_order, - mode_spec, - base, - k, - equal_pages_up_to: self.equal_pages_up_to, - }; - config.serialize(s) - } -} - -impl<'de> serde::Deserialize<'de> for PcodecCodecConfigurationV1 { - fn deserialize>(d: D) -> Result { - let config = PcodecCodecConfigurationIntermediate::deserialize(d)?; - let mode_spec = match (config.mode_spec, config.base, config.k) { - (PcodecModeSpecConfigurationIntermediate::Auto, None, None) => { - Ok(PcodecModeSpecConfiguration::Auto) - } - (PcodecModeSpecConfigurationIntermediate::Classic, None, None) => { - Ok(PcodecModeSpecConfiguration::Classic) - } - ( - PcodecModeSpecConfigurationIntermediate::TryFloatMult, - Some(UIntOrFloat::Float(base)), - None, - ) => Ok(PcodecModeSpecConfiguration::TryFloatMult(base)), - (PcodecModeSpecConfigurationIntermediate::TryFloatQuant, None, Some(k)) => { - Ok(PcodecModeSpecConfiguration::TryFloatQuant(k)) - } - ( - PcodecModeSpecConfigurationIntermediate::TryIntMult, - Some(UIntOrFloat::UInt(base)), - None, - ) => Ok(PcodecModeSpecConfiguration::TryIntMult(base)), - _ => Err(serde::de::Error::custom( - "For requested mode_spec, base or k incompatible/missing", - )), - }?; - let config = Self { - level: config.level, - delta_encoding_order: config.delta_encoding_order, - mode_spec, - equal_pages_up_to: config.equal_pages_up_to, - }; - Ok(config) - } -} - -impl Default for PcodecCodecConfigurationV1 { - fn default() -> Self { - Self { - level: PcodecCompressionLevel::default(), - delta_encoding_order: None, - mode_spec: PcodecModeSpecConfiguration::Auto, - equal_pages_up_to: default_equal_pages_up_to(), - } - } +/// The [`pco::PagingSpec`](https://docs.rs/pco/latest/pco/enum.PagingSpec.html). +/// +/// The `Exact` paging spec is not supported. +#[derive(Serialize, Deserialize, Default, Debug, Clone, Copy, PartialEq, Display)] +#[serde(rename_all = "snake_case")] +pub enum PcodecPagingSpecConfiguration { + /// See . + #[default] + EqualPagesUpTo, } /// An integer from 0 to 12 controlling the compression level. /// +/// The default is 8. +/// /// See . /// /// - Level 0 achieves only a small amount of compression. @@ -403,48 +307,6 @@ mod tests { .unwrap(); } - #[test] - fn codec_pcodec_valid_try_float_mult() { - serde_json::from_str::( - r#"{ - "level": 8, - "delta_encoding_order": 2, - "mode_spec": "try_float_mult", - "base": 0.1, - "equal_pages_up_to": 262144 - }"#, - ) - .unwrap(); - } - - #[test] - fn codec_pcodec_valid_try_float_quant() { - serde_json::from_str::( - r#"{ - "level": 8, - "delta_encoding_order": 2, - "mode_spec": "try_float_quant", - "k": 1, - "equal_pages_up_to": 262144 - }"#, - ) - .unwrap(); - } - - #[test] - fn codec_pcodec_valid_try_int_mult() { - serde_json::from_str::( - r#"{ - "level": 8, - "delta_encoding_order": 2, - "mode_spec": "try_int_mult", - "base": 1, - "equal_pages_up_to": 262144 - }"#, - ) - .unwrap(); - } - #[test] fn codec_pcodec_invalid_level() { assert!(serde_json::from_str::(