From 01046b880a30d1b64b72c5a27378ae78794d9546 Mon Sep 17 00:00:00 2001 From: sg495 Date: Thu, 21 Jul 2022 13:29:57 +0100 Subject: [PATCH] v0.1.4 All dependencies relative to hash function implementation are now optional, dynamically imported only when the hash functions using them are used for the first time. Running `pip install --upgrade multiformats` will not install any of them, but they can be all installed by running `pip install --upgrade multiformats[full]`. In particular, this closes #4. Hash function implementations are loaded and registered transparently on first use, to reduce memory footprint and module loading times. Analogously, a number of multibases are created and registered transparently on first use. All hash functions with a readily available, well-supported existing Python implementation are now supported. Finally, closes #3. --- MULTIFORMATS-LICENSE => ADDITIONAL-LICENSES | 6 +- README.rst | 23 + docs/api/multiformats.multihash.raw.rst | 3 +- docs/getting-started.rst | 21 + docs/make-api.json | 8 +- docs/make-api.py | 10 +- docs/requirements.txt | 1 - multiformats/__init__.py | 2 +- multiformats/multiaddr/err.py | 3 +- multiformats/multibase/__init__.py | 18 +- multiformats/multibase/err.py | 2 - multiformats/multibase/multibase-table.csv | 51 +-- multiformats/multibase/multibase-table.json | 6 + multiformats/multibase/raw.py | 332 +++++++------- multiformats/multicodec/err.py | 2 - multiformats/multicodec/multicodec-table.csv | 13 +- multiformats/multicodec/multicodec-table.json | 91 +++- multiformats/multihash/__init__.py | 49 ++- multiformats/multihash/_hashfuns/__init__.py | 5 + multiformats/multihash/_hashfuns/blake.py | 46 ++ multiformats/multihash/_hashfuns/filecoin.py | 31 ++ .../multihash/_hashfuns/kangarootwelve.py | 27 ++ multiformats/multihash/_hashfuns/keccak.py | 29 ++ multiformats/multihash/_hashfuns/md.py | 42 ++ multiformats/multihash/_hashfuns/murmur3.py | 48 +++ multiformats/multihash/_hashfuns/sha.py | 86 ++++ 
multiformats/multihash/_hashfuns/skein.py | 31 ++ multiformats/multihash/_hashfuns/utils.py | 70 +++ multiformats/multihash/err.py | 2 - multiformats/multihash/raw.py | 241 ++++++++--- multiformats/varint/__init__.py | 31 +- report.py | 78 +++- report.txt | 404 ++++++++++-------- setup.cfg | 14 +- test/multihash-test-hex-vectors.csv | 12 + ...ors.csv => multihash-test-str-vectors.csv} | 1 + test/test_00_varint.py | 3 +- test/test_02_multibase.py | 2 +- test/test_03_multihash.py | 237 ++++++++-- tox.ini | 5 + update-multibase-table.py | 11 +- 41 files changed, 1547 insertions(+), 550 deletions(-) rename MULTIFORMATS-LICENSE => ADDITIONAL-LICENSES (79%) create mode 100644 multiformats/multihash/_hashfuns/__init__.py create mode 100644 multiformats/multihash/_hashfuns/blake.py create mode 100644 multiformats/multihash/_hashfuns/filecoin.py create mode 100644 multiformats/multihash/_hashfuns/kangarootwelve.py create mode 100644 multiformats/multihash/_hashfuns/keccak.py create mode 100644 multiformats/multihash/_hashfuns/md.py create mode 100644 multiformats/multihash/_hashfuns/murmur3.py create mode 100644 multiformats/multihash/_hashfuns/sha.py create mode 100644 multiformats/multihash/_hashfuns/skein.py create mode 100644 multiformats/multihash/_hashfuns/utils.py create mode 100644 test/multihash-test-hex-vectors.csv rename test/{multihash-test-vectors.csv => multihash-test-str-vectors.csv} (99%) diff --git a/MULTIFORMATS-LICENSE b/ADDITIONAL-LICENSES similarity index 79% rename from MULTIFORMATS-LICENSE rename to ADDITIONAL-LICENSES index f89bca1..dfc4c1c 100644 --- a/MULTIFORMATS-LICENSE +++ b/ADDITIONAL-LICENSES @@ -1,8 +1,10 @@ -The following items are subject to MIT License by Protocol Labs Inc: +The following items are subject to MIT License by Protocol Labs Inc, included below: - multibase table, downloaded from https://github.com/multiformats/multibase/raw/master/multibase.csv - multicodec table, downloaded from 
https://github.com/multiformats/multicodec/raw/master/table.csv -- test vectors for multihash, downloaded from https://github.com/multiformats/multihash/raw/master/tests/values/test_cases.csv on 14 Dec 2021 +- the test vectors for multihash in multihash-test-str-vectors.csv, downloaded from https://github.com/multiformats/multihash/raw/master/tests/values/test_cases.csv on 14 Dec 2021 + +Test vectors for murmur3 hash are public domain, courtesy of Ian Boyd https://stackoverflow.com/questions/14747343/murmurhash3-test-vectors#31929528 The MIT License (MIT) diff --git a/README.rst b/README.rst index 96d0be0..7d1c8fd 100644 --- a/README.rst +++ b/README.rst @@ -44,6 +44,27 @@ You can install the latest release from `PyPI `_, for backward compatibility of static typing. +- `typing-validation `_, for dynamic typechecking +- `bases `_, for implementation of base encodings used by Multibase + +The following are optional dependencies for this module: + +- `pysha3 `_, for the ``keccak`` hash functions. +- `blake3 `_, for the ``blake3`` hash function. +- `pyskein `_, for the ``skein`` hash functions. +- `mmh3 `_, for the ``murmur3`` hash functions. +- `pycryptodomex `_, for the ``ripemd-160`` hash function, \ + the ``kangarootwelve`` hash function and the ``sha2-512-224``/``sha2-512-256`` hash functions. + +You can install the latest release together with all optional dependencies as follows: + +.. code-block:: console + + $ pip install --upgrade multiformats[full] + Usage ----- @@ -311,3 +332,5 @@ License ------- `MIT © Hashberg Ltd. `_ + +See `additional Licenses `_ for licensing of the multicodec table, the multibase table and test vectors for multihashes. diff --git a/docs/api/multiformats.multihash.raw.rst b/docs/api/multiformats.multihash.raw.rst index f020830..258b45a 100644 --- a/docs/api/multiformats.multihash.raw.rst +++ b/docs/api/multiformats.multihash.raw.rst @@ -6,7 +6,8 @@ multiformats.multihash.raw Hashfun ------- -.. 
autodata:: multiformats.multihash.raw.Hashfun +.. autoclass:: multiformats.multihash.raw.Hashfun + :members: MultihashImpl ------------- diff --git a/docs/getting-started.rst b/docs/getting-started.rst index 86da765..11e9429 100644 --- a/docs/getting-started.rst +++ b/docs/getting-started.rst @@ -29,4 +29,25 @@ The above will import the following names: The first five are modules implementing the homonymous specifications, while :class:`~multiformats.cid.CID` is a class for Content IDentifiers. +The following are mandatory dependencies for this module: + +- `typing-extensions `_, for backward compatibility of static typing. +- `typing-validation `_, for dynamic typechecking +- `bases `_, for implementation of base encodings used by Multibase + +The following are optional dependencies for this module: + +- `pysha3 `_, for the ``keccak`` hash functions. +- `blake3 `_, for the ``blake3`` hash function. +- `pyskein `_, for the ``skein`` hash functions. +- `mmh3 `_, for the ``murmur3`` hash functions. +- `pycryptodomex `_, for the ``ripemd-160`` hash function, \ + the ``kangarootwelve`` hash function and the ``sha2-512-224``/``sha2-512-256`` hash functions. + +You can install the latest release together with all optional dependencies as follows: + +.. 
code-block:: console + + $ pip install --upgrade multiformats[full] + GitHub repo: https://github.com/hashberg-io/multiformats diff --git a/docs/make-api.json b/docs/make-api.json index b56e032..2643748 100644 --- a/docs/make-api.json +++ b/docs/make-api.json @@ -11,10 +11,12 @@ "exclude_members": { "multiformats.multicodec": ["build_multicodec_tables"], "multiformats.multibase": ["build_multibase_tables"], - "multiformats.multibase.raw": ["identity_raw_encoder", "identity_raw_decoder", "proquint_raw_encoder", "proquint_raw_decoder", "RawEncoder", "RawDecoder"], + "multiformats.multibase.raw": ["RawEncoder", "RawDecoder"], "multiformats.cid": ["CIDVersionNumbers", "byteslike"], "multiformats.multiaddr.raw": ["ip4_encoder", "ip4_decoder", "ip6_encoder", "ip6_decoder", "tcp_udp_encoder", "tcp_udp_decoder"] }, "include_modules": [], - "exclude_modules": [] -} \ No newline at end of file + "exclude_modules": [ + "multiformats.multihash._hashfuns" + ] +} diff --git a/docs/make-api.py b/docs/make-api.py index ef63898..79112bf 100644 --- a/docs/make-api.py +++ b/docs/make-api.py @@ -87,8 +87,12 @@ def make_apidocs() -> None: os.remove(apidoc_file) print() + mod_name_to_del: List[str] = [] + for mod_name, mod in modules_dict.items(): - if mod_name in exclude_modules: + if any(mod_name.startswith(name) for name in exclude_modules): + # if mod_name in exclude_modules: + mod_name_to_del.append(mod_name) continue filename = f"{apidocs_folder}/{mod_name}.rst" print(f"Writing API docfile {filename}") @@ -164,6 +168,10 @@ def make_apidocs() -> None: f.write("\n".join(lines)) print("") + + for mod_name in mod_name_to_del: + del modules_dict[mod_name] + toctable_lines = [ ".. 
toctree::", " :maxdepth: 2", diff --git a/docs/requirements.txt b/docs/requirements.txt index 19056eb..97f3c30 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,4 +4,3 @@ sphinx_autodoc_typehints bases typing-extensions typing-validation -pyskein diff --git a/multiformats/__init__.py b/multiformats/__init__.py index f386ba8..1f75ec1 100644 --- a/multiformats/__init__.py +++ b/multiformats/__init__.py @@ -15,7 +15,7 @@ while :class:`~multiformats.cid.CID` is a class for Content IDentifiers. """ -__version__ = "0.1.3" +__version__ = "0.1.4" from . import varint from . import multicodec diff --git a/multiformats/multiaddr/err.py b/multiformats/multiaddr/err.py index 3e9c8c1..f257ff8 100644 --- a/multiformats/multiaddr/err.py +++ b/multiformats/multiaddr/err.py @@ -6,8 +6,7 @@ class MultiaddrKeyError(builtins.KeyError): # pylint: disable = redefined-builtin """ Class for :mod:`~multiformats.multiaddr` key errors. """ - ... + class MultiaddrValueError(builtins.ValueError): # pylint: disable = redefined-builtin """ Class for :mod:`~multiformats.multiaddr` value errors. """ - ... diff --git a/multiformats/multibase/__init__.py b/multiformats/multibase/__init__.py index 2f12524..36f37bb 100644 --- a/multiformats/multibase/__init__.py +++ b/multiformats/multibase/__init__.py @@ -39,7 +39,7 @@ class Multibase: :param name: the multibase name :type name: :obj:`str` - :param code: the multibase code, as single-char string or ``0xYZ`` hex-string of a byte + :param code: the multibase code, as single-char string or ``0x...`` hex-string of a non-empty bytestring :type code: :obj:`str` :param status: the multibase status :type status: ``'draft'``, ``'candidate'`` or ``'default'``, *optional* @@ -91,20 +91,20 @@ def validate_code(code: str) -> str: MultibaseValueError: Multibase codes must be single-character strings or the hex digits '0xYZ' of a single byte. 
- :param code: the multibase code, as single character or ``0xYZ`` hex-string of a single byte + :param code: the multibase code, as single character or ``0x...`` hex-string of a non-empty bytestring :type code: :obj:`str` :raises ValueError: if the code is invalid """ validate(code, str) - if re.match(r"^0x[0-9a-zA-Z][0-9a-zA-Z]$", code): + if re.match(r"^0x([0-9a-zA-Z][0-9a-zA-Z])+$", code): ord_code = int(code, base=16) + if ord_code in range(0x20, 0x7F): + raise MultibaseValueError("Multibase codes in hex format cannot be printable ASCII characters.") code = chr(ord_code) elif len(code) != 1: - raise MultibaseValueError("Multibase codes must be single-character strings or the hex digits '0xYZ' of a single byte.") - if ord(code) not in range(0x00, 0x80): - raise MultibaseValueError("Multibase codes must be ASCII characters.") + raise MultibaseValueError("Multibase codes must be single-character strings or the hex digits '0x...' of a non-empty bytestring.") return code @staticmethod @@ -145,7 +145,9 @@ def code_printable(self) -> str: code = self.code ord_code = ord(code) if ord_code not in range(0x20, 0x7F): - return "0x"+base16.encode(bytes([ord_code])) + ord_code_num_bytes = max(1, math.ceil(ord_code.bit_length()/8)) + ord_code_bytes = ord_code.to_bytes(ord_code_num_bytes, byteorder="big") + return "0x"+base16.encode(ord_code_bytes) return code @property @@ -555,6 +557,6 @@ def build_multibase_tables(bases: Iterable[Multibase]) -> Tuple[Dict[str, Multib # Create the global code->multibase and name->multibase mappings. 
_code_table: Dict[str, Multibase] _name_table: Dict[str, Multibase] -with importlib_resources.open_text("multiformats.multibase", "multibase-table.json") as _table_f: +with importlib_resources.open_text("multiformats.multibase", "multibase-table.json", encoding="utf8") as _table_f: _table_json = json.load(_table_f) _code_table, _name_table = build_multibase_tables(Multibase(**row) for row in _table_json) diff --git a/multiformats/multibase/err.py b/multiformats/multibase/err.py index ef068d6..573eb8a 100644 --- a/multiformats/multibase/err.py +++ b/multiformats/multibase/err.py @@ -6,8 +6,6 @@ class MultibaseKeyError(builtins.KeyError): # pylint: disable = redefined-builtin """ Class for :mod:`~multiformats.multibase` key errors. """ - ... class MultibaseValueError(builtins.ValueError): # pylint: disable = redefined-builtin """ Class :mod:`~multiformats.multibase` value errors. """ - ... diff --git a/multiformats/multibase/multibase-table.csv b/multiformats/multibase/multibase-table.csv index 33f4f09..7c7549d 100644 --- a/multiformats/multibase/multibase-table.csv +++ b/multiformats/multibase/multibase-table.csv @@ -1,25 +1,26 @@ -encoding, code, description, status -identity, 0x00, 8-bit binary (encoder and decoder keeps data unmodified), default -base2, 0, binary (01010101), candidate -base8, 7, octal, draft -base10, 9, decimal, draft -base16, f, hexadecimal, default -base16upper, F, hexadecimal, default -base32hex, v, rfc4648 case-insensitive - no padding - highest char, candidate -base32hexupper, V, rfc4648 case-insensitive - no padding - highest char, candidate -base32hexpad, t, rfc4648 case-insensitive - with padding, candidate -base32hexpadupper, T, rfc4648 case-insensitive - with padding, candidate -base32, b, rfc4648 case-insensitive - no padding, default -base32upper, B, rfc4648 case-insensitive - no padding, default -base32pad, c, rfc4648 case-insensitive - with padding, candidate -base32padupper, C, rfc4648 case-insensitive - with padding, candidate 
-base32z, h, z-base-32 (used by Tahoe-LAFS), draft -base36, k, base36 [0-9a-z] case-insensitive - no padding, draft -base36upper, K, base36 [0-9a-z] case-insensitive - no padding, draft -base58btc, z, base58 bitcoin, default -base58flickr, Z, base58 flicker, candidate -base64, m, rfc4648 no padding, default -base64pad, M, rfc4648 with padding - MIME encoding, candidate -base64url, u, rfc4648 no padding, default -base64urlpad, U, rfc4648 with padding, default -proquint, p, PRO-QUINT https://arxiv.org/html/0901.4016, draft +encoding, code, description, status +identity, 0x00, 8-bit binary (encoder and decoder keeps data unmodified), default +base2, 0, binary (01010101), candidate +base8, 7, octal, draft +base10, 9, decimal, draft +base16, f, hexadecimal, default +base16upper, F, hexadecimal, default +base32hex, v, rfc4648 case-insensitive - no padding - highest char, candidate +base32hexupper, V, rfc4648 case-insensitive - no padding - highest char, candidate +base32hexpad, t, rfc4648 case-insensitive - with padding, candidate +base32hexpadupper, T, rfc4648 case-insensitive - with padding, candidate +base32, b, rfc4648 case-insensitive - no padding, default +base32upper, B, rfc4648 case-insensitive - no padding, default +base32pad, c, rfc4648 case-insensitive - with padding, candidate +base32padupper, C, rfc4648 case-insensitive - with padding, candidate +base32z, h, z-base-32 (used by Tahoe-LAFS), draft +base36, k, base36 [0-9a-z] case-insensitive - no padding, draft +base36upper, K, base36 [0-9a-z] case-insensitive - no padding, draft +base58btc, z, base58 bitcoin, default +base58flickr, Z, base58 flicker, candidate +base64, m, rfc4648 no padding, default +base64pad, M, rfc4648 with padding - MIME encoding, candidate +base64url, u, rfc4648 no padding, default +base64urlpad, U, rfc4648 with padding, default +proquint, p, PRO-QUINT https://arxiv.org/html/0901.4016, draft +base256emoji, 🚀, base256 with custom alphabet using variable-sized-codepoints, draft diff --git 
a/multiformats/multibase/multibase-table.json b/multiformats/multibase/multibase-table.json index 95f929d..85a3d0e 100644 --- a/multiformats/multibase/multibase-table.json +++ b/multiformats/multibase/multibase-table.json @@ -142,5 +142,11 @@ "code": "z", "status": "default", "description": "base58 bitcoin" + }, + { + "name": "base256emoji", + "code": "0x01F680", + "status": "draft", + "description": "base256 with custom alphabet using variable-sized-codepoints" } ] \ No newline at end of file diff --git a/multiformats/multibase/raw.py b/multiformats/multibase/raw.py index c41bba3..e265a38 100644 --- a/multiformats/multibase/raw.py +++ b/multiformats/multibase/raw.py @@ -29,8 +29,10 @@ """ import binascii +from itertools import product from types import MappingProxyType -from typing import Callable, Dict, List, Union +from typing import Any, Callable, Dict, List, Tuple, Union +from typing_extensions import Literal from typing_validation import validate from bases import (base2, base16, base8, base10, base36, base58btc, base58flickr, @@ -111,7 +113,8 @@ def get(name: str) -> RawEncoding: """ validate(name, str) if name not in _raw_encodings: - raise MultibaseKeyError(f"No raw encoding named {repr(name)}.") + if not _jit_register_encoding(name): + raise MultibaseKeyError(f"No raw encoding named {repr(name)}.") return _raw_encodings[name] @@ -129,7 +132,7 @@ def exists(name: str) -> bool: """ validate(name, str) - return name in _raw_encodings + return name in _raw_encodings or name in _jit_registered_encodings def register(name: str, enc: RawEncoding, *, overwrite: bool = False) -> None: @@ -183,154 +186,197 @@ def unregister(name: str) -> None: del _raw_encodings[name] -def identity_raw_encoder(b: BytesLike) -> str: - """ - Implementation of the raw identity encoder according to the `multibase spec `_. 
- """ - if isinstance(b, (bytes, bytearray)): - return b.decode("utf-8") # type: ignore - validate(b, memoryview) - return bytes(b).decode("utf-8") - -identity_raw_encoder.__repr__ = lambda: "identity_raw_encoder" # type: ignore - - -def identity_raw_decoder(s: str) -> bytes: - """ - Implementation of the raw identity decoder according to the `multibase spec `_. - """ - validate(s, str) - return s.encode("utf-8") - -identity_raw_decoder.__repr__ = lambda: "identity_raw_decoder" # type: ignore - - -_proquint_consonants = "bdfghjklmnprstvz" -_proquint_consonants_set = frozenset("bdfghjklmnprstvz") -_proquint_vowels = "aiou" -_proquint_vowels_set = frozenset("aiou") -_proquint_consonants_revdir = MappingProxyType({char: idx for idx, char in enumerate(_proquint_consonants)}) -_proquint_vowels_revdir = MappingProxyType({char: idx for idx, char in enumerate(_proquint_vowels)}) - - -def proquint_raw_encoder(b: BytesLike) -> str: - """ - Implementation of the proquint encoder according to the `proquint spec `_, - with additional 'ro-' prefix as prescribed by the `multibase spec `_ - and extended to include odd-length bytestrings (adding a final 3-letter block, using two zero pad bits). 
- """ - validate(b, BytesLike) - b = memoryview(b) # makes slicing cheap - consonants = _proquint_consonants - vowels = _proquint_vowels - char_blocks: List[str] = [] - for idx in range(0, len(b), 2): - byte_block = b[idx: idx+2] - i = int.from_bytes(byte_block, byteorder="big") - if len(byte_block) == 2: # ordinary byte pair - i, c2 = divmod(i, 16) # 4 bits - i, v1 = divmod(i, 4) # 2 bits - i, c1 = divmod(i, 16) # 4 bits - i, v0 = divmod(i, 4) # 2 bits - i, c0 = divmod(i, 16) # 4 bits - assert i == 0 - char_block = consonants[c0]+vowels[v0]+consonants[c1]+vowels[v1]+consonants[c2] - char_blocks.append(char_block) - else: # final byte for odd-length bytestrings - i <<= 2 # add 2 zero pad bits - i, c1 = divmod(i, 16) # 4 bits - i, v0 = divmod(i, 4) # 2 bits - i, c0 = divmod(i, 16) # 4 bits - assert i == 0 - char_block = consonants[c0]+vowels[v0]+consonants[c1] - char_blocks.append(char_block) - prefix = "ro-" # follows multibase code "p" to make "pro-", e.g. "pro-lusab-babad" - return prefix+"-".join(char_blocks) - -proquint_raw_encoder.__repr__ = lambda: "proquint_raw_encoder" # type: ignore - -def proquint_raw_decoder(s: str) -> bytes: - """ - Implementation of the proquint decoder according to the `proquint spec `_, - with additional 'ro-' prefix as prescribed by the `multibase spec `_ - and extended to include odd-length bytestrings (adding a final 3-letter block, using two zero pad bits). 
- """ - # pylint: disable = too-many-branches - validate(s, str) - consonants = _proquint_consonants - vowels = _proquint_vowels - consonants_set = _proquint_consonants_set - vowels_set = _proquint_vowels_set - consonants_revdir = _proquint_consonants_revdir - vowels_revdir = _proquint_vowels_revdir - # validate string - if not s.startswith("ro-"): - raise binascii.Error("Multibase proquint encoded strings must start with 'ro-'.") - # remove 'ro-' prefix, return empty bytestring if resultant string is empty - s = s[3:] - if len(s) == 0: - return b"" - # validate length for patterns cvcvc (len 5), cvcvc-...-cvc (len 6k+3) or cvcvc-...-cvcvc (len 6k+5) - if len(s) % 6 not in (3, 5): - raise binascii.Error("Proquint encoded string length must give remainder of 3 or 5 when divided by 6.") - # validate characters and convert encoded string into unsigned integer - i = 0 - for idx, char in enumerate(s): - if idx % 6 == 5: # separator - if char != "-": - raise binascii.Error(f"Incorrect char at position {idx}: expected '-', found {repr(char)}.") - elif idx % 2 == 0: # consonant - if char not in consonants_set: - raise binascii.Error(f"Incorrect char at position {idx}: expected consonant in {repr(consonants)}, " - f"found {repr(char)}.") - i <<= 4 # make space for 4 bits - i += consonants_revdir[char] # insert consonant bits - else: # vowel - if char not in vowels_set: - raise binascii.Error(f"Incorrect char at position {idx}: expected vowel in {repr(vowels)}, " - f"found {repr(char)}.") - i <<= 2 # make space for 2 bits - i += vowels_revdir[char] # insert vowel bits - # set number of bytes to number of quintuplets - nbytes = 2*((len(s)+1)//6) - # deal with the case of terminating tripled (odd bytestring length) - if len(s) % 6 == 3: - # ensure pad bits are zero - i, pad_bits = divmod(i, 4) - if pad_bits != 0: - raise binascii.Error(f"Expected pad bits to be 00, found {bin(pad_bits)[2:]} instead.") - # add an extra byte - nbytes += 1 - # convert unsigned integer to bytes 
and return - return i.to_bytes(nbytes, byteorder="big") - -proquint_raw_decoder.__repr__ = lambda: "proquint_raw_decoder" # type: ignore - - -# custom encodings -register("identity", CustomEncoding(identity_raw_encoder, identity_raw_decoder)) -register("proquint", CustomEncoding(proquint_raw_encoder, proquint_raw_decoder)) - -# base encodings +# register base encodings already instantiated by 'bases' v0.2.1 register("base2", base2) register("base8", base8) register("base10", base10) -register("base16", base16.lower()) register("base16upper", base16) -register("base32hex", base32hex.nopad().lower()) -register("base32hexupper", base32hex.nopad()) -register("base32hexpad", base32hex.lower()) -register("base32hexpadupper", base32hex) -register("base32", base32.nopad().lower()) -register("base32upper", base32.nopad()) -register("base32pad", base32.lower()) register("base32padupper", base32) +register("base32hexpadupper", base32hex) register("base32z", base32z) -register("base36", base36.lower()) register("base36upper", base36) register("base58btc", base58btc) register("base58flickr", base58flickr) -register("base64", base64.nopad()) register("base64pad", base64) -register("base64url", base64url.nopad()) register("base64urlpad", base64url) + + +def _jit_register_identity_encoding() -> None: + def identity_raw_encoder(b: BytesLike) -> str: + """ + Implementation of the raw identity encoder according to the `multibase spec `_. + """ + if isinstance(b, (bytes, bytearray)): + return b.decode("utf-8") # type: ignore + validate(b, memoryview) + return bytes(b).decode("utf-8") + identity_raw_encoder.__repr__ = lambda: "identity_raw_encoder" # type: ignore + def identity_raw_decoder(s: str) -> bytes: + """ + Implementation of the raw identity decoder according to the `multibase spec `_. 
+ """ + validate(s, str) + return s.encode("utf-8") + identity_raw_decoder.__repr__ = lambda: "identity_raw_decoder" # type: ignore + register("identity", CustomEncoding(identity_raw_encoder, identity_raw_decoder)) + +def _jit_register_proquint_encoding() -> None: + # pylint: disable = too-many-statements + _proquint_consonants = "bdfghjklmnprstvz" + _proquint_consonants_set = frozenset("bdfghjklmnprstvz") + _proquint_vowels = "aiou" + _proquint_vowels_set = frozenset("aiou") + _proquint_consonants_revdir = MappingProxyType({char: idx for idx, char in enumerate(_proquint_consonants)}) + _proquint_vowels_revdir = MappingProxyType({char: idx for idx, char in enumerate(_proquint_vowels)}) + def proquint_raw_encoder(b: BytesLike) -> str: + """ + Implementation of the proquint encoder according to the `proquint spec `_, + with additional 'ro-' prefix as prescribed by the `multibase spec `_ + and extended to include odd-length bytestrings (adding a final 3-letter block, using two zero pad bits). + """ + validate(b, BytesLike) + b = memoryview(b) # makes slicing cheap + consonants = _proquint_consonants + vowels = _proquint_vowels + char_blocks: List[str] = [] + for idx in range(0, len(b), 2): + byte_block = b[idx: idx+2] + i = int.from_bytes(byte_block, byteorder="big") + if len(byte_block) == 2: # ordinary byte pair + i, c2 = divmod(i, 16) # 4 bits + i, v1 = divmod(i, 4) # 2 bits + i, c1 = divmod(i, 16) # 4 bits + i, v0 = divmod(i, 4) # 2 bits + i, c0 = divmod(i, 16) # 4 bits + assert i == 0 + char_block = consonants[c0]+vowels[v0]+consonants[c1]+vowels[v1]+consonants[c2] + char_blocks.append(char_block) + else: # final byte for odd-length bytestrings + i <<= 2 # add 2 zero pad bits + i, c1 = divmod(i, 16) # 4 bits + i, v0 = divmod(i, 4) # 2 bits + i, c0 = divmod(i, 16) # 4 bits + assert i == 0 + char_block = consonants[c0]+vowels[v0]+consonants[c1] + char_blocks.append(char_block) + prefix = "ro-" # follows multibase code "p" to make "pro-", e.g. 
"pro-lusab-babad" + return prefix+"-".join(char_blocks) + proquint_raw_encoder.__repr__ = lambda: "proquint_raw_encoder" # type: ignore + def proquint_raw_decoder(s: str) -> bytes: + """ + Implementation of the proquint decoder according to the `proquint spec `_, + with additional 'ro-' prefix as prescribed by the `multibase spec `_ + and extended to include odd-length bytestrings (adding a final 3-letter block, using two zero pad bits). + """ + # pylint: disable = too-many-branches + validate(s, str) + consonants = _proquint_consonants + vowels = _proquint_vowels + consonants_set = _proquint_consonants_set + vowels_set = _proquint_vowels_set + consonants_revdir = _proquint_consonants_revdir + vowels_revdir = _proquint_vowels_revdir + # validate string + if not s.startswith("ro-"): + raise binascii.Error("Multibase proquint encoded strings must start with 'ro-'.") + # remove 'ro-' prefix, return empty bytestring if resultant string is empty + s = s[3:] + if len(s) == 0: + return b"" + # validate length for patterns cvcvc (len 5), cvcvc-...-cvc (len 6k+3) or cvcvc-...-cvcvc (len 6k+5) + if len(s) % 6 not in (3, 5): + raise binascii.Error("Proquint encoded string length must give remainder of 3 or 5 when divided by 6.") + # validate characters and convert encoded string into unsigned integer + i = 0 + for idx, char in enumerate(s): + if idx % 6 == 5: # separator + if char != "-": + raise binascii.Error(f"Incorrect char at position {idx}: expected '-', found {repr(char)}.") + elif idx % 2 == 0: # consonant + if char not in consonants_set: + raise binascii.Error(f"Incorrect char at position {idx}: expected consonant in {repr(consonants)}, " + f"found {repr(char)}.") + i <<= 4 # make space for 4 bits + i += consonants_revdir[char] # insert consonant bits + else: # vowel + if char not in vowels_set: + raise binascii.Error(f"Incorrect char at position {idx}: expected vowel in {repr(vowels)}, " + f"found {repr(char)}.") + i <<= 2 # make space for 2 bits + i += 
vowels_revdir[char] # insert vowel bits + # set number of bytes to number of quintuplets + nbytes = 2*((len(s)+1)//6) + # deal with the case of terminating tripled (odd bytestring length) + if len(s) % 6 == 3: + # ensure pad bits are zero + i, pad_bits = divmod(i, 4) + if pad_bits != 0: + raise binascii.Error(f"Expected pad bits to be 00, found {bin(pad_bits)[2:]} instead.") + # add an extra byte + nbytes += 1 + # convert unsigned integer to bytes and return + return i.to_bytes(nbytes, byteorder="big") + proquint_raw_decoder.__repr__ = lambda: "proquint_raw_decoder" # type: ignore + register("proquint", CustomEncoding(proquint_raw_encoder, proquint_raw_decoder)) + +def _jit_register_base_encoding(b: Literal[16, 32, 36, 64], + _hex: bool = False, + _pad: bool = False, + _upper: bool = False, + _url: bool = False) -> None: + if b == 64: + assert not _hex and not _pad and not _upper + if _url: + register("base64url", base64url.nopad()) + else: + register("base64", base64.nopad()) + return + assert not _url + if b in (16, 36): + assert not _hex and not _pad and not _upper + if b == 16: + register("base16", base16.lower()) + else: + register("base36", base36.lower()) + return + assert b == 32 and (not _pad or not _upper) + base = base32hex if _hex else base32 + if not _pad: + base = base.nopad() + if not _upper: + base = base.lower() + key = f"base32{'hex' if _hex else ''}{'pad' if _pad else ''}{'upper' if _upper else ''}" + register(key, base) + +_jit_registered_encodings: Dict[str, Tuple[Callable[..., Any], Any]] = { + "identity": (_jit_register_identity_encoding, tuple()), + "proquint": (_jit_register_proquint_encoding, tuple()), + **{ + f"base64{'url' if _url else ''}": ( + _jit_register_base_encoding, + (64, False, False, False, _url) + ) + for _url in (False, True) + }, + **{ + f"base{b}": ( + _jit_register_base_encoding, + (b,) + ) + for b in (16, 36) + }, + **{ + f"base32{'hex' if _hex else ''}{'pad' if _pad else ''}{'upper' if _upper else ''}": ( + 
_jit_register_base_encoding, + (32, _hex, _pad, _upper) + ) + for _hex, _pad, _upper in product((False, True), repeat=3) + } +} + +def _jit_register_encoding(name: str) -> bool: + if name not in _jit_registered_encodings: + return False + f, args = _jit_registered_encodings[name] + f(*args) + return True diff --git a/multiformats/multicodec/err.py b/multiformats/multicodec/err.py index b6174cb..f4903dd 100644 --- a/multiformats/multicodec/err.py +++ b/multiformats/multicodec/err.py @@ -6,8 +6,6 @@ class MulticodecKeyError(builtins.KeyError): # pylint: disable = redefined-builtin """ Class for :mod:`~multiformats.multicodec` key errors. """ - ... class MulticodecValueError(builtins.ValueError): # pylint: disable = redefined-builtin """ Class for :mod:`~multiformats.multicodec` value errors. """ - ... diff --git a/multiformats/multicodec/multicodec-table.csv b/multiformats/multicodec/multicodec-table.csv index 4ac2113..9a30c33 100644 --- a/multiformats/multicodec/multicodec-table.csv +++ b/multiformats/multicodec/multicodec-table.csv @@ -79,7 +79,6 @@ stellar-block, ipld, 0xd0, draft, Stell stellar-tx, ipld, 0xd1, draft, Stellar Tx md4, multihash, 0xd4, draft, md5, multihash, 0xd5, draft, -bmt, multihash, 0xd6, draft, Binary Merkle Tree Hash decred-block, ipld, 0xe0, draft, Decred Block decred-tx, ipld, 0xe1, draft, Decred Tx ipld-ns, namespace, 0xe2, draft, IPLD path @@ -88,6 +87,7 @@ swarm-ns, namespace, 0xe4, draft, Swarm ipns-ns, namespace, 0xe5, draft, IPNS path zeronet, namespace, 0xe6, draft, ZeroNet site address secp256k1-pub, key, 0xe7, draft, Secp256k1 public key (compressed) +dnslink, namespace, 0xe8, permanent, DNSLink path bls12_381-g1-pub, key, 0xea, draft, BLS12-381 public key in the G1 field bls12_381-g2-pub, key, 0xeb, draft, BLS12-381 public key in the G2 field x25519-pub, key, 0xec, draft, Curve25519 public key @@ -101,6 +101,7 @@ udp, multiaddr, 0x0111, draft, p2p-webrtc-star, multiaddr, 0x0113, draft, p2p-webrtc-direct, multiaddr, 0x0114, draft, 
p2p-stardust, multiaddr, 0x0115, draft, +webrtc, multiaddr, 0x0118, draft, WebRTC p2p-circuit, multiaddr, 0x0122, permanent, dag-json, ipld, 0x0129, permanent, MerkleDAG json udt, multiaddr, 0x012d, draft, @@ -117,6 +118,8 @@ garlic32, multiaddr, 0x01bf, draft, I2P b tls, multiaddr, 0x01c0, draft, noise, multiaddr, 0x01c6, draft, quic, multiaddr, 0x01cc, permanent, +webtransport, multiaddr, 0x01d1, draft, +certhash, multiaddr, 0x01d2, draft, TLS certificate's fingerprint as a multihash ws, multiaddr, 0x01dd, permanent, wss, multiaddr, 0x01de, permanent, p2p-websocket-star, multiaddr, 0x01df, permanent, @@ -124,10 +127,13 @@ http, multiaddr, 0x01e0, draft, swhid-1-snp, ipld, 0x01f0, draft, SoftWare Heritage persistent IDentifier version 1 snapshot json, ipld, 0x0200, permanent, JSON (UTF-8-encoded) messagepack, serialization, 0x0201, draft, MessagePack +car, serialization, 0x0202, draft, Content Addressable aRchive (CAR) libp2p-peer-record, libp2p, 0x0301, permanent, libp2p peer record type libp2p-relay-rsvp, libp2p, 0x0302, permanent, libp2p relay reservation voucher car-index-sorted, serialization, 0x0400, draft, CARv2 IndexSorted index format car-multihash-index-sorted, serialization, 0x0401, draft, CARv2 MultihashIndexSorted index format +transport-bitswap, transport, 0x0900, draft, Bitswap datatransfer +transport-graphsync-filecoinv1, transport, 0x0910, draft, Filecoin graphsync datatransfer sha2-256-trunc254-padded, multihash, 0x1012, permanent, SHA2-256 with the two most significant bits from the last byte zeroed (as via a mask with 0b00111111) - used for proving trees as in Filecoin sha2-224, multihash, 0x1013, permanent, aka SHA-224; as specified by FIPS 180-4. sha2-512-224, multihash, 0x1014, permanent, aka SHA-512/224; as specified by FIPS 180-4. @@ -144,6 +150,7 @@ p521-pub, key, 0x1202, draft, P-521 ed448-pub, key, 0x1203, draft, Ed448 public Key x448-pub, key, 0x1204, draft, X448 public Key rsa-pub, key, 0x1205, draft, RSA public key. 
DER-encoded ASN.1 type RSAPublicKey according to IETF RFC 8017 (PKCS #1) +sm2-pub, key, 0x1206, draft, SM2 public key (compressed) ed25519-priv, key, 0x1300, draft, Ed25519 private key secp256k1-priv, key, 0x1301, draft, Secp256k1 private key x25519-priv, key, 0x1302, draft, Curve25519 private key @@ -471,6 +478,9 @@ skein1024-1016, multihash, 0xb3df, draft, skein1024-1024, multihash, 0xb3e0, draft, poseidon-bls12_381-a2-fc1, multihash, 0xb401, permanent, Poseidon using BLS12-381 and arity of 2 with Filecoin parameters poseidon-bls12_381-a2-fc1-sc, multihash, 0xb402, draft, Poseidon using BLS12-381 and arity of 2 with Filecoin parameters - high-security variant +ssz, serialization, 0xb501, draft, SimpleSerialize (SSZ) serialization +ssz-sha2-256-bmt, multihash, 0xb502, draft, SSZ Merkle tree root using SHA2-256 as the hashing function and SSZ serialization for the block binary +iscc, softhash, 0xcc01, draft, ISCC (International Standard Content Code) - similarity preserving hash zeroxcert-imprint-256, zeroxcert, 0xce11, draft, 0xcert Asset Imprint (root hash) fil-commitment-unsealed, filecoin, 0xf101, permanent, Filecoin piece or sector data commitment merkle node/root (CommP & CommD) fil-commitment-sealed, filecoin, 0xf102, permanent, Filecoin sector data commitment merkle node/root - sealed and replicated (CommR) @@ -484,3 +494,4 @@ holochain-sig-v1, holochain, 0xa37124, draft, Holoc skynet-ns, namespace, 0xb19910, draft, Skynet Namespace arweave-ns, namespace, 0xb29910, draft, Arweave Namespace subspace-ns, namespace, 0xb39910, draft, Subspace Network Namespace +kumandra-ns, namespace, 0xb49910, draft, Kumandra Network Namespace diff --git a/multiformats/multicodec/multicodec-table.json b/multiformats/multicodec/multicodec-table.json index 301f862..b285409 100644 --- a/multiformats/multicodec/multicodec-table.json +++ b/multiformats/multicodec/multicodec-table.json @@ -559,13 +559,6 @@ "status": "draft", "description": "" }, - { - "name": "bmt", - "tag": 
"multihash", - "code": "0xd6", - "status": "draft", - "description": "Binary Merkle Tree Hash" - }, { "name": "decred-block", "tag": "ipld", @@ -622,6 +615,13 @@ "status": "draft", "description": "Secp256k1 public key (compressed)" }, + { + "name": "dnslink", + "tag": "namespace", + "code": "0xe8", + "status": "permanent", + "description": "DNSLink path" + }, { "name": "bls12_381-g1-pub", "tag": "key", @@ -713,6 +713,13 @@ "status": "draft", "description": "" }, + { + "name": "webrtc", + "tag": "multiaddr", + "code": "0x0118", + "status": "draft", + "description": "WebRTC" + }, { "name": "p2p-circuit", "tag": "multiaddr", @@ -818,6 +825,20 @@ "status": "permanent", "description": "" }, + { + "name": "webtransport", + "tag": "multiaddr", + "code": "0x01d1", + "status": "draft", + "description": "" + }, + { + "name": "certhash", + "tag": "multiaddr", + "code": "0x01d2", + "status": "draft", + "description": "TLS certificate's fingerprint as a multihash" + }, { "name": "ws", "tag": "multiaddr", @@ -867,6 +888,13 @@ "status": "draft", "description": "MessagePack" }, + { + "name": "car", + "tag": "serialization", + "code": "0x0202", + "status": "draft", + "description": "Content Addressable aRchive (CAR)" + }, { "name": "libp2p-peer-record", "tag": "libp2p", @@ -895,6 +923,20 @@ "status": "draft", "description": "CARv2 MultihashIndexSorted index format" }, + { + "name": "transport-bitswap", + "tag": "transport", + "code": "0x0900", + "status": "draft", + "description": "Bitswap datatransfer" + }, + { + "name": "transport-graphsync-filecoinv1", + "tag": "transport", + "code": "0x0910", + "status": "draft", + "description": "Filecoin graphsync datatransfer" + }, { "name": "sha2-256-trunc254-padded", "tag": "multihash", @@ -1007,6 +1049,13 @@ "status": "draft", "description": "RSA public key. 
DER-encoded ASN.1 type RSAPublicKey according to IETF RFC 8017 (PKCS #1)" }, + { + "name": "sm2-pub", + "tag": "key", + "code": "0x1206", + "status": "draft", + "description": "SM2 public key (compressed)" + }, { "name": "ed25519-priv", "tag": "key", @@ -3296,6 +3345,27 @@ "status": "draft", "description": "Poseidon using BLS12-381 and arity of 2 with Filecoin parameters - high-security variant" }, + { + "name": "ssz", + "tag": "serialization", + "code": "0xb501", + "status": "draft", + "description": "SimpleSerialize (SSZ) serialization" + }, + { + "name": "ssz-sha2-256-bmt", + "tag": "multihash", + "code": "0xb502", + "status": "draft", + "description": "SSZ Merkle tree root using SHA2-256 as the hashing function and SSZ serialization for the block binary" + }, + { + "name": "iscc", + "tag": "softhash", + "code": "0xcc01", + "status": "draft", + "description": "ISCC (International Standard Content Code) - similarity preserving hash" + }, { "name": "zeroxcert-imprint-256", "tag": "zeroxcert", @@ -3386,5 +3456,12 @@ "code": "0xb39910", "status": "draft", "description": "Subspace Network Namespace" + }, + { + "name": "kumandra-ns", + "tag": "namespace", + "code": "0xb49910", + "status": "draft", + "description": "Kumandra Network Namespace" } ] \ No newline at end of file diff --git a/multiformats/multihash/__init__.py b/multiformats/multihash/__init__.py index d68c8a7..00766a5 100644 --- a/multiformats/multihash/__init__.py +++ b/multiformats/multihash/__init__.py @@ -39,7 +39,7 @@ class Multihash: _cache: ClassVar[WeakValueDictionary] = WeakValueDictionary() # type: ignore _codec: Multicodec - _implementation: MultihashImpl + _implementation: Optional[MultihashImpl] __slots__ = ("__weakref__", "_codec", "_implementation") @@ -58,20 +58,26 @@ def __new__(cls, *, codec: Union[str, int, Multicodec]) -> "Multihash": # check that the codec is a multihash multicodec: if codec.tag != "multihash": raise MultihashValueError(f"Multicodec named {repr(codec.name)} exists, but 
is not a multihash.") - implementation: MultihashImpl = raw.get(codec.name) + if not raw.exists(codec.name): + raise MultihashKeyError(f"No implementation for multihash multicodec {repr(codec.name)}.") _cache = Multihash._cache if codec.name in _cache: # if a multihash instance with this name is already registered instance: Multihash = _cache[codec.name] - if instance.codec == codec and instance._implementation == implementation: - # nothing changed, can use the existing instance - return instance + if instance.codec == codec: + # same codec, check same implementation: + if instance._implementation is None: + # implementation not loaded yet, can use the existing instance + return instance + if codec.name in raw._hashfun and instance._implementation == raw._hashfun[codec.name]: + # nothing changed, can use the existing instance + return instance # otherwise remove the existing instance del _cache[codec.name] # create a fresh instance, register it and return it instance = super().__new__(cls) instance._codec = codec - instance._implementation = implementation + instance._implementation = None _cache[codec.name] = instance return instance @@ -133,7 +139,7 @@ def max_digest_size(self) -> Optional[int]: return max_digest_size @property - def implementation(self) ->MultihashImpl: + def implementation(self) -> MultihashImpl: """ Returns the implementation of a multihash multicodec, as a pair: @@ -154,7 +160,11 @@ def implementation(self) ->MultihashImpl: :rtype: :obj:`~multiformats.multihash.raw.MultihashImpl` """ - return self._implementation + implementation = self._implementation + if implementation is None: + implementation = raw.get(self.name) + self._implementation = implementation + return implementation def wrap(self, raw_digest: BytesLike) -> bytes: """ @@ -202,16 +212,23 @@ def digest(self, data: BytesLike, *, size: Optional[int] = None) -> bytes: :param data: the raw digest :type data: :obj:`~multiformats.varint.BytesLike` - :param size: optional truncated size 
for the raw digest (if :obj:`None`, raw digest is not truncated) + :param size: size for the raw digest, in bytes. If not :obj:`None`, raw digest is truncated to fit the given size. :type size: :obj:`int` or :obj:`None`, *optional* + :raises ValueError: if size parameter is not :obj:`None` and negative. + :raises ValueError: if size parameter is not :obj:`None`, max digest size is not :obj:`None` and given size exceeds max digest size. + :raises ValueError: if size parameter is :obj:`None` but a size is required for the hash function (e.g. for the KangarooTwelve XOF). + See :func:`digest` for more information. """ hf, _ = self.implementation - raw_digest = hf(data) - if size is not None: - raw_digest = raw_digest[:size] # truncate digest - size = len(raw_digest) + raw_digest = hf(data, size) + # if size is not None: + # raw_digest = raw_digest[:size] # truncate digest + if size is None: + size = len(raw_digest) + else: + assert size == len(raw_digest), f"Expected {size}B digest, found {len(raw_digest)}B digest." return self.codec.wrap(varint.encode(size)+raw_digest) def unwrap(self, digest: Union[BytesLike, BufferedIOBase]) -> bytes: @@ -451,9 +468,13 @@ def digest(data: BytesLike, hashfun: Union[str, int, Multihash], *, size: Option :type data: :obj:`~multiformats.varint.BytesLike` :param hashfun: the multihash function name, code or object :type hashfun: :obj:`str`, :obj:`int` or :class:`Multihash` - :param size: optional truncated size for the raw digest (if :obj:`None`, raw digest is not truncated) + :param size: size for the raw digest, in bytes. If not :obj:`None`, raw digest is truncated to fit the given size. :type size: :obj:`int` or :obj:`None`, *optional* + :raises ValueError: if size parameter is not :obj:`None` and negative. + :raises ValueError: if size parameter is not :obj:`None`, max digest size is not :obj:`None` and given size exceeds max digest size. 
+ :raises ValueError: if size parameter is :obj:`None` but a size is required for the hash function (e.g. for the KangarooTwelve XOF). + """ if not isinstance(hashfun, Multihash): hashfun = Multihash(codec=hashfun) diff --git a/multiformats/multihash/_hashfuns/__init__.py b/multiformats/multihash/_hashfuns/__init__.py new file mode 100644 index 0000000..0b82628 --- /dev/null +++ b/multiformats/multihash/_hashfuns/__init__.py @@ -0,0 +1,5 @@ +""" Implementations for specific hash functions. """ + +from .utils import Hashfun, validate_hashfun_args, repeat_hashfun + +__all__ = ["Hashfun", "validate_hashfun_args", "repeat_hashfun"] diff --git a/multiformats/multihash/_hashfuns/blake.py b/multiformats/multihash/_hashfuns/blake.py new file mode 100644 index 0000000..7cfcd0f --- /dev/null +++ b/multiformats/multihash/_hashfuns/blake.py @@ -0,0 +1,46 @@ +""" + Implementation for the ``blake2`` and ``blake3`` hash functions, using the optional dependency `blake3 `_. +""" + +import hashlib +from typing import Optional + +from multiformats.varint import BytesLike +from .utils import Hashfun, validate_hashfun_args + +def _hashlib_blake2(version: str, digest_bits: int) -> Hashfun: + h = getattr(hashlib, f"blake2{version}") + def hashfun(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, digest_bits//8) + m: hashlib._Hash = h(digest_size=digest_bits//8) # pylint: disable = no-member + m.update(data) + d = m.digest() + return d if size is None else d[:size] + return hashfun + +def _jit_register_blake2(m, register) -> bool: # type: ignore + blake2_version, digest_bits = (m[1], int(m[2])) + if digest_bits not in range(8, 513 if blake2_version == "b" else 257, 8): + return False + if register is not None: + register(f"blake2{blake2_version}-{digest_bits}", _hashlib_blake2(blake2_version, digest_bits), digest_bits//8) + return True + +def _blake3() -> Hashfun: + try: + from blake3 import blake3 # type: ignore # pylint: disable = 
import-outside-toplevel + except ImportError as e: + raise ImportError("Module 'blake3' must be installed to use 'blake3' hash function. Consider running 'pip install blake3'.") from e + def hashfun(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, None, size_required=True, name="blake3") + assert size is not None + m = blake3() # pylint: disable = not-callable + m.update(data) + d: bytes = m.digest(size) + return d + return hashfun + +def _jit_register_blake3(m, register) -> bool: # type: ignore + if register is not None: + register("blake3", _blake3(), None) + return True diff --git a/multiformats/multihash/_hashfuns/filecoin.py b/multiformats/multihash/_hashfuns/filecoin.py new file mode 100644 index 0000000..6892b8a --- /dev/null +++ b/multiformats/multihash/_hashfuns/filecoin.py @@ -0,0 +1,31 @@ +""" + Implementation for the ``sha2-256-trunc254-padded`` hash function, + using `hashlib `_. + + Future support planned for the ``poseidon-bls12_381-a2-fc1`` hash functions, + possibly using `poseidon-hash `_. 
+ Additional references on the Poseidon hash function: + + - https://www.poseidon-hash.info/ + - https://github.com/filecoin-project/neptune +""" + +import hashlib +from hashlib import sha256 +from typing import Optional + +from multiformats.varint import BytesLike +from .utils import validate_hashfun_args + +def _sha_256_trunc254_padded(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, 32) + m: hashlib._Hash = sha256() # pylint: disable = no-member + m.update(data) + d = m.digest() + d = d[:-1]+bytes([d[-1]&0x00111111]) + return d if size is None else d[:size] + +def _jit_register_sha_256_trunc254_padded(m, register) -> bool: # type: ignore + if register is not None: + register("sha2-256-trunc254-padded", _sha_256_trunc254_padded, 32) # 32B = 256 bits + return True diff --git a/multiformats/multihash/_hashfuns/kangarootwelve.py b/multiformats/multihash/_hashfuns/kangarootwelve.py new file mode 100644 index 0000000..58c1339 --- /dev/null +++ b/multiformats/multihash/_hashfuns/kangarootwelve.py @@ -0,0 +1,27 @@ +""" + Implementation for the ``kangarootwelve`` hash function, using `pycryptodomex `_. +""" + +from typing import Optional + +from multiformats.varint import BytesLike +from .utils import Hashfun, validate_hashfun_args + +def _kangarootwelve() -> Hashfun: + try: + from Cryptodome.Hash import KangarooTwelve # pylint: disable = import-outside-toplevel + except ImportError as e: + raise ImportError("Module 'Cryptodome' must be installed to use the 'kangarootwelve' hash function. 
" + "Consider running 'pip install pycryptodomex'.") from e + def hashfun(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, None, size_required=True, name="kangarootwelve") + assert size is not None + m = KangarooTwelve.new() + m.update(data) + return m.read(size) + return hashfun + +def _jit_register_kangarootwelve(m, register) -> bool: # type: ignore + if register is not None: + register("kangarootwelve", _kangarootwelve(), None) + return True diff --git a/multiformats/multihash/_hashfuns/keccak.py b/multiformats/multihash/_hashfuns/keccak.py new file mode 100644 index 0000000..69aec2b --- /dev/null +++ b/multiformats/multihash/_hashfuns/keccak.py @@ -0,0 +1,29 @@ +""" + Implementation for the ``keccak`` hash functions, using the optional dependency `pysha3 `_. +""" + +import hashlib +from typing import Optional + +from multiformats.varint import BytesLike +from .utils import Hashfun, validate_hashfun_args + +def _keccak(digest_bits: int) -> Hashfun: + try: + import sha3 # type: ignore # pylint: disable = import-outside-toplevel + except ImportError as e: + raise ImportError("Module 'sha3' must be installed to use 'keccak' hash functions. 
Consider running 'pip install pysha3'.") from e + h = getattr(sha3, f"keccak_{digest_bits}") + def hashfun(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, digest_bits//8) + m: hashlib._Hash = h() # pylint: disable = no-member + m.update(data) + d = m.digest() + return d if size is None else d[:size] + return hashfun + +def _jit_register_keccak(m, register) -> bool: # type: ignore + digest_bits = int(m[1]) + if register is not None: + register(f"keccak-{digest_bits}", _keccak(digest_bits), digest_bits//8) + return True diff --git a/multiformats/multihash/_hashfuns/md.py b/multiformats/multihash/_hashfuns/md.py new file mode 100644 index 0000000..53f39f5 --- /dev/null +++ b/multiformats/multihash/_hashfuns/md.py @@ -0,0 +1,42 @@ +""" + Implementation for the ``md5`` and ``ripemd`` hash functions, + using `hashlib `_ and `pycryptodomex `_. +""" + +import hashlib +from typing import Optional + +from multiformats.varint import BytesLike +from .utils import Hashfun, validate_hashfun_args + +def _md5(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, 16) + m: hashlib._Hash = hashlib.md5() # pylint: disable = no-member + m.update(data) + d = m.digest() + return d if size is None else d[:size] + +def _jit_register_md5(m, register) -> bool: # type: ignore + if register is not None: + register("md5", _md5, 16) + return True + +def _ripemd(digest_bits: int) -> Hashfun: + assert digest_bits == 160, "Only 'ripemd-160' is currently supported." + try: + from Cryptodome.Hash import RIPEMD160 # pylint: disable = import-outside-toplevel + except ImportError as e: + raise ImportError("Module 'Cryptodome' must be installed to use the 'ripemd-160' hash function. 
Consider running 'pip install pycryptodomex'.") from e + def hashfun(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, digest_bits//8) + m = RIPEMD160.new() + m.update(data) + d = m.digest() + return d if size is None else d[:size] + return hashfun + +def _jit_register_ripemd(m, register) -> bool: # type: ignore + digest_bits = int(m[1]) + if register is not None: + register(f"ripemd-{digest_bits}", _ripemd(digest_bits), digest_bits//8) + return True diff --git a/multiformats/multihash/_hashfuns/murmur3.py b/multiformats/multihash/_hashfuns/murmur3.py new file mode 100644 index 0000000..3f414e1 --- /dev/null +++ b/multiformats/multihash/_hashfuns/murmur3.py @@ -0,0 +1,48 @@ +""" + Implementation for the ``murmur3`` hash functions, using the optional dependency `mmh3 `_. +""" + +from typing import Optional + +from multiformats.varint import BytesLike +from .utils import Hashfun, validate_hashfun_args + +def _murmur3(version: str, digest_bits: int) -> Hashfun: + try: + import mmh3 # type: ignore # pylint: disable = import-outside-toplevel + except ImportError as e: + raise ImportError("Module 'mmh3' must be installed to use 'murmur3' hash functions. 
Consider running 'pip install mmh3'.") from e + if version == "32": + def hashfun(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, 4) + if not isinstance(data, bytes): + data = bytes(data) + d: bytes = mmh3.hash(data, signed=False).to_bytes(4, byteorder="big") # pylint: disable = c-extension-no-member + return d if size is None else d[:size] + elif digest_bits == 128: # version == "x64" + def hashfun(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, 16) + if not isinstance(data, bytes): + data = bytes(data) + d: bytes = mmh3.hash128(data, signed=False).to_bytes(16, byteorder="big") # pylint: disable = c-extension-no-member + return d if size is None else d[:size] + else: # version == "x64" + def hashfun(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, 8) + if not isinstance(data, bytes): + data = bytes(data) + d: bytes = mmh3.hash128(data, signed=False).to_bytes(16, byteorder="big") # pylint: disable = c-extension-no-member + return d[:8] if size is None else d[:size] + return hashfun + +def _jit_register_murmur3(m, register) -> bool: # type: ignore + if m[1] == "32": + if register is not None: + register("murmur3-32", _murmur3("32", 32), 32//8) + return True + # version == "x64" + assert m[2] == "x64" + digest_bits = int(m[3]) + if register is not None: + register(f"murmur3-x64-{digest_bits}", _murmur3("x64", digest_bits), digest_bits//8) + return True diff --git a/multiformats/multihash/_hashfuns/sha.py b/multiformats/multihash/_hashfuns/sha.py new file mode 100644 index 0000000..067a4d3 --- /dev/null +++ b/multiformats/multihash/_hashfuns/sha.py @@ -0,0 +1,86 @@ +""" + Implementation for the ``sha`` and ``shake`` hash functions, + using `hashlib `_ and `pycryptodomex `_. 
+""" + +import hashlib +from typing import Optional + +from multiformats.varint import BytesLike +from .utils import Hashfun, validate_hashfun_args + +def _sha(version: int, digest_bits: int) -> Hashfun: + name = ("sha1", f"sha{digest_bits}", f"sha3_{digest_bits}")[version-1] + h = getattr(hashlib, name) + def hashfun(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, digest_bits//8) + m: hashlib._Hash = h() # pylint: disable = no-member + m.update(data) + d = m.digest() + return d if size is None else d[:size] + return hashfun + +def _jit_register_sha1(m, register) -> bool: # type: ignore + if register is not None: + register("sha1", _sha(1, 160), 20) # 20B = 160 bits + return True + +def _shake(digest_bits: int) -> Hashfun: + assert digest_bits in (256, 512) + h = getattr(hashlib, f"shake_{digest_bits//2}") + def hashfun(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, digest_bits//8) + m: hashlib._Hash = h() # pylint: disable = no-member + m.update(data) + d = m.digest(digest_bits//8) # type: ignore + return d if size is None else d[:size] + return hashfun + +def _jit_register_sha23ke(m, register) -> bool: # type: ignore + if m[1] == "ke": + digest_bits = 2*int(m[2]) + if register is not None: + register(f"shake-{digest_bits//2}", _shake(digest_bits), digest_bits//8) + return True + sha_version, digest_bits = (int(m[1]), int(m[2])) + if register is not None: + register(f"sha{sha_version}-{digest_bits}", _sha(sha_version, digest_bits), digest_bits//8) + return True + +def _sha2_512(digest_bits: int) -> Hashfun: + assert digest_bits in (224, 256) + try: + from Cryptodome.Hash import SHA512 # pylint: disable = import-outside-toplevel + except ImportError as e: + raise ImportError("Module 'Cryptodome' must be installed to use the 'sha2-256' hash functions. 
" + "Consider running 'pip install pycryptodomex'.") from e + def hashfun(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, digest_bits//8) + m = SHA512.new(truncate=str(digest_bits)) + m.update(data) + d = m.digest() + return d if size is None else d[:size] + return hashfun + +def _jit_register_sha2_512(m, register) -> bool: # type: ignore + digest_bits = int(m[1]) + if register is not None: + register(f"sha2-512-{digest_bits}", _sha2_512(digest_bits), digest_bits//8) + return True + +def _dbl_sha23(version: int, digest_bits: int) -> Hashfun: + name = ("sha1", f"sha{digest_bits}", f"sha3_{digest_bits}")[version-1] + h = getattr(hashlib, name) + def hashfun(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, digest_bits//8) + m: hashlib._Hash = h() # pylint: disable = no-member + m.update(data) + d = m.digest() + return d if size is None else d[:size] + return hashfun + +def _jit_register_dbl_sha23(m, register) -> bool: # type: ignore + sha_version, digest_bits = (int(m[1]), int(m[2])) + if register is not None: + register(f"sha{sha_version}-{digest_bits}", _dbl_sha23(sha_version, digest_bits), digest_bits//8) + return True diff --git a/multiformats/multihash/_hashfuns/skein.py b/multiformats/multihash/_hashfuns/skein.py new file mode 100644 index 0000000..a7b3a09 --- /dev/null +++ b/multiformats/multihash/_hashfuns/skein.py @@ -0,0 +1,31 @@ +""" + Implementation for the ``skein`` hash functions, using the optional dependency `pyskein `_. +""" + +import hashlib +from typing import Optional + +from multiformats.varint import BytesLike +from .utils import Hashfun, validate_hashfun_args + +def _skein(version: int, digest_bits: int) -> Hashfun: + try: + import skein # type: ignore # pylint: disable = import-outside-toplevel + except ImportError as e: + raise ImportError("Module 'skein' must be installed to use 'skein' hash functions. 
Consider running 'pip install pyskein'.") from e + h = getattr(skein, f"skein{version}") + def hashfun(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, digest_bits//8) + m: hashlib._Hash = h(digest_bits=digest_bits) # pylint: disable = no-member + m.update(data) + d = m.digest() + return d if size is None else d[:size] + return hashfun + +def _jit_register_skein(m, register) -> bool: # type: ignore + skein_version, digest_bits = (int(m[1]), int(m[2])) + if digest_bits not in range(8, skein_version+1, 8): + return False + if register is not None: + register(f"skein{skein_version}-{digest_bits}", _skein(skein_version, digest_bits), digest_bits//8) + return True diff --git a/multiformats/multihash/_hashfuns/utils.py b/multiformats/multihash/_hashfuns/utils.py new file mode 100644 index 0000000..0e8ed18 --- /dev/null +++ b/multiformats/multihash/_hashfuns/utils.py @@ -0,0 +1,70 @@ +""" + Utilities for hash function implementation. +""" + +from typing import Optional +from typing_extensions import Literal, Protocol, runtime_checkable +from typing_validation import validate +from multiformats.varint import BytesLike +from multiformats.multihash.err import MultihashValueError + +@runtime_checkable +class Hashfun(Protocol): + """ + Protocol for raw hash functions. + + .. code-block:: python + + @runtime_checkable + class Hashfun(Protocol): + def __call__(self, data: BytesLike, size: Optional[int] = None) -> bytes: + ... + + """ + + def __call__(self, data: BytesLike, size: Optional[int] = None) -> bytes: + ... + +def validate_hashfun_args(data: BytesLike, + size: Optional[int], + max_digest_size: Optional[int], + *, + size_required: bool = False, + name: str = "") -> None: + """ + Utility function to validate the arguments passed to hash functions. 
+ """ + validate(data, BytesLike) + validate(size, Optional[int]) + if size is not None and size < 0: + raise MultihashValueError("If specified, digest size must be non-negative integer.") + if size is not None and max_digest_size is not None and size > max_digest_size: + raise MultihashValueError("If specified, digest size must not exceed maximum digest size for hash function.") + if size_required and size is None: + raise MultihashValueError(f"Digest size is mandatory for hash function{' '+name if name else ''}.") + +def repeat_hashfun(hashfun: Hashfun, + repeat: int = 1, + truncate: Literal["end", "always"] = "end") -> Hashfun: + """ + Utility function for repeated hashing. + """ + validate(hashfun, Hashfun) + validate(repeat, int) + validate(truncate, Literal["end", "always"]) + if repeat <= 0: + raise MultihashValueError("Argument 'repeat' must be positive integer.") + if repeat == 1: + return hashfun + if truncate == "always": + def repeated_hashfun(data: BytesLike, size: Optional[int]=None) -> bytes: + for _ in range(repeat): + data = hashfun(data, size) + return data + else: + def repeated_hashfun(data: BytesLike, size: Optional[int]=None) -> bytes: + for _ in range(repeat-1): + data = hashfun(data, None) + data = hashfun(data, size) + return data + return repeated_hashfun diff --git a/multiformats/multihash/err.py b/multiformats/multihash/err.py index 472552d..ab5526c 100644 --- a/multiformats/multihash/err.py +++ b/multiformats/multihash/err.py @@ -6,8 +6,6 @@ class MultihashKeyError(builtins.KeyError): # pylint: disable = redefined-builtin """ Class for :mod:`~multiformats.multihash` key errors. """ - ... class MultihashValueError(builtins.ValueError): # pylint: disable = redefined-builtin """ Class :mod:`~multiformats.multihash` value errors. """ - ... 
diff --git a/multiformats/multihash/raw.py b/multiformats/multihash/raw.py index c00a1bb..3dd20bc 100644 --- a/multiformats/multihash/raw.py +++ b/multiformats/multihash/raw.py @@ -1,10 +1,19 @@ """ Implementation of raw hash functions used by multihash multicodecs. - Hash functions are implemented using the following libraries: + Hash functions are implemented using the following modules: - - `hashlib `_ - - `pyskein `_ + - `hashlib `_, for the ``sha``/``shake`` hash functions and the ``blake2`` hash functions. + - `pysha3 `_, for the ``keccak`` hash functions. + - `blake3 `_, for the ``blake3`` hash function. + - `pyskein `_, for the ``skein`` hash functions. + - `mmh3 `_, for the ``murmur3`` hash functions. + - `pycryptodomex `_, for the ``ripemd-160`` hash function, \ + the ``kangarootwelve`` hash function and the ``sha2-512-224``/``sha2-512-256`` hash functions. + + All modules other than `hashlib `_ are optional dependencies. + The :func:`get` function attempts to dynamically import any optional dependencies required by desired multihash + implementation, raising :py:obj:`ImportError` if the dependency is not installed. Core functionality is provided by the :func:`exists` and :func:`get` functions, which can be used to check whether an implementatino with given name is known, and if so to get the corresponding pair @@ -22,18 +31,16 @@ can always be discounted as invalid. 
""" -import hashlib -from typing import Callable, Dict, Optional, Tuple +import re +from typing import Dict, Optional, Tuple from typing_validation import validate -import skein # type: ignore - from multiformats import multicodec from multiformats.varint import BytesLike from .err import MultihashKeyError, MultihashValueError +from ._hashfuns import Hashfun, validate_hashfun_args, repeat_hashfun -Hashfun = Callable[[BytesLike], bytes] -"""Type alias for raw hash functions.""" +__all__ = ["Hashfun"] _hashfun: Dict[str, Tuple[Hashfun, Optional[int]]] = {} @@ -57,9 +64,11 @@ def get(name: str) -> MultihashImpl: """ validate(name, str) if name not in _hashfun: - raise MultihashKeyError(f"No implementation for multihash multicodec {repr(name)}.") + if not _jit_register_hashfun(name): + raise MultihashKeyError(f"No implementation for multihash multicodec {repr(name)}.") return _hashfun[name] + def exists(name: str) -> bool: """ Checks whether the multihash multicodec with given name has an implementation. 
@@ -72,7 +81,9 @@ def exists(name: str) -> bool: """ validate(name, str) - return name in _hashfun + if name in _hashfun: + return True + return _jit_register_hashfun(name, check_only=True) def register(name: str, hashfun: Hashfun, digest_size: Optional[int], *, overwrite: bool = False) -> None: @@ -127,64 +138,156 @@ def unregister(name: str) -> None: raise MultihashKeyError(f"There is no implementation for multihash multicodec with name {repr(name)}.") del _hashfun[name] -def _identity(data: BytesLike) -> bytes: - validate(data, BytesLike) - return bytes(data) +# identity hash function is always registered + +def _identity(data: BytesLike, size: Optional[int] = None) -> bytes: + validate_hashfun_args(data, size, None) + d = bytes(data) + if size is None: + return d + if len(d) < size: + raise MultihashValueError("With 'identity' hash, size must be at most data lenght in bytes.") + return d[:size] register("identity", _identity, None) -def _hashlib_sha(version: int, digest_bits: Optional[int] = None) -> Hashfun: - name = ("sha1", f"sha{digest_bits}", f"sha3_{digest_bits}")[version-1] - h = getattr(hashlib, name) - def hashfun(data: BytesLike) -> bytes: - validate(data, BytesLike) - m: hashlib._Hash = h() # pylint: disable = no-member - m.update(data) - return m.digest() - return hashfun - -register("sha1", _hashlib_sha(1), 20) # 20B = 160 bits - -for _digest_bits in (256, 512): - register(f"sha2-{_digest_bits}", _hashlib_sha(2, _digest_bits), _digest_bits//8) - -for _digest_bits in (224, 256, 384, 512): - register(f"sha3-{_digest_bits}", _hashlib_sha(3, _digest_bits), _digest_bits//8) - -def _hashlib_shake(digest_bits: int) -> Hashfun: - h = getattr(hashlib, f"shake_{digest_bits//2}") - def hashfun(data: BytesLike) -> bytes: - validate(data, BytesLike) - m: hashlib._Hash = h() # pylint: disable = no-member - m.update(data) - return m.digest(digest_bits//8) # type: ignore - return hashfun - -for _digest_bits in (256, 512): - register(f"shake-{_digest_bits//2}", 
_hashlib_shake(_digest_bits), _digest_bits//8) - -def _hashlib_blake2(version: str, digest_bits: int) -> Hashfun: - h = getattr(hashlib, f"blake2{version}") - def hashfun(data: BytesLike) -> bytes: - validate(data, BytesLike) - m: hashlib._Hash = h(digest_size=digest_bits//8) # pylint: disable = no-member - m.update(data) - return m.digest() - return hashfun - -for _blake2_version in ("b", "s"): - for _digest_bits in range(8, 513 if _blake2_version == "b" else 257, 8): - register(f"blake2{_blake2_version}-{_digest_bits}", _hashlib_blake2(_blake2_version, _digest_bits), _digest_bits//8) - -def _skein(version: int, digest_bits: int) -> Hashfun: - h = getattr(skein, f"skein{version}") - def hashfun(data: BytesLike) -> bytes: - validate(data, BytesLike) - m: hashlib._Hash = h(digest_bits=digest_bits) # pylint: disable = no-member - m.update(data) - return m.digest() - return hashfun - -for _skein_version in (256, 512, 1024): - for _digest_bits in range(8, _skein_version+1, 8): - register(f"skein{_skein_version}-{_digest_bits}", _skein(_skein_version, _digest_bits), _digest_bits//8) +# just-in-time hash implementation registration functions + +_sha1_regex = re.compile(r"sha1") +_sha23_regex = re.compile(r"sha(2|3)-(224|256|384|512)") +_shake_regex = re.compile(r"sha(ke)-(128|256)") +_sha2_512_regex = re.compile(r"sha2-512-(224|256)") +_sha2_256_trunc254_padded_regex = re.compile(r"sha2-256-trunc254-padded") + +def _jit_register_hashfun_sha(name: str, check_only: bool = False) -> bool: + # 'sha' hash functions + m = re.fullmatch(_sha1_regex, name) + if m is not None: + from ._hashfuns.sha import _jit_register_sha1 # pylint: disable = import-outside-toplevel + return _jit_register_sha1(m, None if check_only else register) + m = re.fullmatch(_sha23_regex, name) + if m is None: + m = re.fullmatch(_shake_regex, name) + if m is not None: + from ._hashfuns.sha import _jit_register_sha23ke # pylint: disable = import-outside-toplevel + return _jit_register_sha23ke(m, None if 
check_only else register) + m = re.fullmatch(_sha2_512_regex, name) + if m is not None: + from ._hashfuns.sha import _jit_register_sha2_512 # pylint: disable = import-outside-toplevel + return _jit_register_sha2_512(m, None if check_only else register) + m = re.fullmatch(_sha2_256_trunc254_padded_regex, name) + if m is not None: + from ._hashfuns.filecoin import _jit_register_sha_256_trunc254_padded # pylint: disable = import-outside-toplevel + return _jit_register_sha_256_trunc254_padded(m, None if check_only else register) + return False + +_blake2_regex = re.compile(r"blake2([bs])-([89]|[1-9][0-9]|[1-5][0-9][0-9])") +_blake3_regex = re.compile(r"blake3") + +def _jit_register_hashfun_bla(name: str, check_only: bool = False) -> bool: + # 'blake' hash functions + m = re.fullmatch(_blake2_regex, name) + if m is not None: + from ._hashfuns.blake import _jit_register_blake2 # pylint: disable = import-outside-toplevel + return _jit_register_blake2(m, None if check_only else register) + m = re.fullmatch(_blake3_regex, name) + if m is not None: + from ._hashfuns.blake import _jit_register_blake3 # pylint: disable = import-outside-toplevel + return _jit_register_blake3(m, None if check_only else register) + return False + +_keccak_regex = re.compile(r"keccak-(224|256|384|512)") + +def _jit_register_hashfun_kec(name: str, check_only: bool = False) -> bool: + # 'keccak' hash function + m = re.fullmatch(_keccak_regex, name) + if m is not None: + from ._hashfuns.keccak import _jit_register_keccak # pylint: disable = import-outside-toplevel + return _jit_register_keccak(m, None if check_only else register) + return False + +_skein_regex = re.compile(r"skein(256|512|1024)-([89]|[1-9][0-9]|[1-9][0-9][0-9]|10[0-2][0-9])") + +def _jit_register_hashfun_ske(name: str, check_only: bool = False) -> bool: + # 'skein' hash function + m = re.fullmatch(_skein_regex, name) + if m is not None: + from ._hashfuns.skein import _jit_register_skein # pylint: disable = import-outside-toplevel + 
return _jit_register_skein(m, None if check_only else register) + return False + +_murmur3_regex = re.compile(r"murmur3-(32)|murmur3-(x64)-(64|128)") + +def _jit_register_hashfun_mur(name: str, check_only: bool = False) -> bool: + # 'murmur3' hash function + m = re.fullmatch(_murmur3_regex, name) + if m is not None: + from ._hashfuns.murmur3 import _jit_register_murmur3 # pylint: disable = import-outside-toplevel + return _jit_register_murmur3(m, None if check_only else register) + return False + +_md5_regex = re.compile("md5") + +def _jit_register_hashfun_md5(name: str, check_only: bool = False) -> bool: + # 'md5' hash function + m = re.fullmatch(_md5_regex, name) + if m is not None: + from ._hashfuns.md import _jit_register_md5 # pylint: disable = import-outside-toplevel + return _jit_register_md5(m, None if check_only else register) + return False + +_ripemd_regex = re.compile(r"ripemd-(160)") + +def _jit_register_hashfun_rip(name: str, check_only: bool = False) -> bool: + # 'ripemd' hash functions + m = re.fullmatch(_ripemd_regex, name) + if m is not None: + from ._hashfuns.md import _jit_register_ripemd # pylint: disable = import-outside-toplevel + return _jit_register_ripemd(m, None if check_only else register) + return False + +_kangarootwelve_regex = re.compile("kangarootwelve") + +def _jit_register_hashfun_kan(name: str, check_only: bool = False) -> bool: + # 'kangarootwelve' hash function + m = re.fullmatch(_kangarootwelve_regex, name) + if m is not None: + from ._hashfuns.kangarootwelve import _jit_register_kangarootwelve # pylint: disable = import-outside-toplevel + return _jit_register_kangarootwelve(m, None if check_only else register) + return False + +_dbl_sha2_regex = re.compile(r"dbl-sha2-(256)") + +def _jit_register_hashfun_dbl(name: str, check_only: bool = False) -> bool: + # 'dbl-sha2-256' hash function + m = re.fullmatch(_dbl_sha2_regex, name) + if m is not None: + sha2_256, _ = get("sha2-256") + assert sha2_256 is not None + dbl_sha2_256 = 
repeat_hashfun(sha2_256, repeat=2, truncate="end") + register("dbl-sha2-256", dbl_sha2_256, 32) + return True + return False + +# directory of just-in-time hash implementation registration functions + +_jit_register_hashfun_dir = { + "sha": _jit_register_hashfun_sha, + "bla": _jit_register_hashfun_bla, + "kec": _jit_register_hashfun_kec, + "ske": _jit_register_hashfun_ske, + "mur": _jit_register_hashfun_mur, + "md5": _jit_register_hashfun_md5, + "rip": _jit_register_hashfun_rip, + "kan": _jit_register_hashfun_kan, + "dbl": _jit_register_hashfun_dbl, +} + +def _jit_register_hashfun(name: str, check_only: bool = False) -> bool: + # pylint: disable = too-many-return-statements + if len(name) < 3: + return False + jit_reg_fun = _jit_register_hashfun_dir.get(name[:3], None) + if jit_reg_fun is None: + return False + return jit_reg_fun(name, check_only) diff --git a/multiformats/varint/__init__.py b/multiformats/varint/__init__.py index 4b7006b..19bb8bc 100644 --- a/multiformats/varint/__init__.py +++ b/multiformats/varint/__init__.py @@ -7,7 +7,7 @@ """ from io import BufferedIOBase -from typing import cast, List, overload, Tuple, Union, TypeVar +from typing import BinaryIO, cast, List, Optional, overload, Tuple, Union, TypeVar from typing_extensions import Final from typing_validation import validate @@ -52,7 +52,7 @@ def encode(x: int) -> bytes: return bytes(varint_bytelist) -def decode(b: Union[BytesLike, BufferedIOBase]) -> int: +def decode(b: Union[BytesLike, BufferedIOBase, BinaryIO]) -> int: """ Decodes an unsigned varint from a bytes-like object or a buffered binary stream. 
@@ -75,7 +75,7 @@ def decode(b: Union[BytesLike, BufferedIOBase]) -> int: b'\\x12\\xff\\x01' :param b: the bytes-like object or stream from which to decode a varint - :type b: :obj:`~multiformats.varint.BytesLike` or :obj:`~io.BufferedIOBase` + :type b: :obj:`~multiformats.varint.BytesLike`, :obj:`~io.BufferedIOBase` or :obj:`~typing.BinaryIO` :raises ValueError: if the input contains no bytes (from specs, the number 0 is encoded as ``0b00000000``) :raises ValueError: if the 9th byte of the input is a continuation byte (from specs, no number >= 2**63 is allowed) @@ -98,6 +98,7 @@ def _no_next_byte_error(num_bytes_read: int) -> ValueError: return ValueError(f"Byte #{num_bytes_read-1} was a continuation byte, but byte #{num_bytes_read} not available.") _BufferedIOT = TypeVar("_BufferedIOT", bound=BufferedIOBase) +_BinaryIOT = TypeVar("_BinaryIOT", bound=BinaryIO) @overload def decode_raw(b: BytesLike) -> Tuple[int, int, memoryview]: @@ -107,7 +108,11 @@ def decode_raw(b: BytesLike) -> Tuple[int, int, memoryview]: def decode_raw(b: _BufferedIOT) -> Tuple[int, int, _BufferedIOT]: ... -def decode_raw(b: Union[BytesLike, BufferedIOBase]) -> Tuple[int, int, Union[memoryview, BufferedIOBase]]: +@overload +def decode_raw(b: _BinaryIOT) -> Tuple[int, int, _BinaryIOT]: + ... + +def decode_raw(b: Union[BytesLike, BufferedIOBase, BinaryIO]) -> Tuple[int, int, Union[memoryview, BufferedIOBase, BinaryIO]]: """ Specialised version of :func:`~multiformats.varint.decode` for partial decoding, returning a pair ``(x, n)`` of the decoded varint ``x`` and the number ``n`` of bytes read from the start and/or consumed from the stream. 
@@ -150,23 +155,27 @@ def decode_raw(b: Union[BytesLike, BufferedIOBase]) -> Tuple[int, int, Union[mem # note: stream.read() consumed the bytes :param b: the bytes-like object or stream from which to decode a varint - :type b: :obj:`~multiformats.varint.BytesLike` or :obj:`~io.BufferedIOBase` + :type b: :obj:`~multiformats.varint.BytesLike`, :obj:`~io.BufferedIOBase` or :obj:`~typing.BinaryIO` :raises ValueError: same reasons as :func:`~multiformats.varint.decode`, except for the last (where no error is raised) """ + stream_mode: Optional[type] if isinstance(b, BufferedIOBase): - stream_mode = True + stream_mode = BufferedIOBase validate(b, BufferedIOBase) + elif isinstance(b, BinaryIO): + stream_mode = BinaryIO + validate(b, BinaryIO) else: - stream_mode = False + stream_mode = None validate(b, BytesLike) expect_next = True num_bytes_read = 0 x = 0 while expect_next: - if stream_mode: - _next_byte: bytes = cast(BufferedIOBase, b).read(1) + if stream_mode is not None: + _next_byte: bytes = cast(Union[BufferedIOBase, BinaryIO], b).read(1) if len(_next_byte) == 0: raise _no_next_byte_error(num_bytes_read) next_byte: int = _next_byte[0] @@ -181,6 +190,6 @@ def decode_raw(b: Union[BytesLike, BufferedIOBase]) -> Tuple[int, int, Union[mem raise ValueError(f"Varints must be at most {_max_num_bytes} bytes long.") if num_bytes_read > 1 and x < 2**(7*(num_bytes_read-1)): raise ValueError(f"Number {x} was not minimally encoded (as a {num_bytes_read} bytes varint).") - if stream_mode: - return x, num_bytes_read, cast(BufferedIOBase, b) + if stream_mode is not None: + return x, num_bytes_read, cast(Union[BufferedIOBase, BinaryIO], b) return x, num_bytes_read, memoryview(cast(BytesLike, b))[num_bytes_read:] diff --git a/report.py b/report.py index e4e7b25..60e9b2c 100644 --- a/report.py +++ b/report.py @@ -6,38 +6,43 @@ if __name__ != "__main__": raise RuntimeError("usage: report.py [-h] [-d]") - # == Memory profiling == +import gc # `psutil` is not a dependency for the 
`multiformats` library import psutil # type: ignore mem_usage = {} +gc.collect() baseline = psutil.Process().memory_full_info().uss / (1024 * 1024) prev = baseline -import typing_validation +import typing_extensions +gc.collect() diff = psutil.Process().memory_full_info().uss / (1024 * 1024)-prev -mem_usage["typing-validation"] = diff +mem_usage["typing-extensions"] = diff prev += diff -import bases +import typing_validation +gc.collect() diff = psutil.Process().memory_full_info().uss / (1024 * 1024)-prev -mem_usage["bases"] = diff +mem_usage["typing-validation"] = diff prev += diff -import skein # type: ignore +import bases +gc.collect() diff = psutil.Process().memory_full_info().uss / (1024 * 1024)-prev -mem_usage["pyskein"] = diff +mem_usage["bases"] = diff prev += diff import multiformats from multiformats import * +gc.collect() diff = psutil.Process().memory_full_info().uss / (1024 * 1024)-prev mem_usage["multiformats"] = diff @@ -49,6 +54,7 @@ import argparse from typing import Any, Callable, Collection, Dict, List, Optional, Tuple, Union +from typing_extensions import Literal # `rich` is not a dependency for the `multiformats` library from rich.console import Console @@ -71,7 +77,7 @@ version = get_version(root='.', version_scheme="post-release") -console = Console(record=True, width=104) +console = Console(record=True, width=110) console.print(Panel(f"Multiformats implementation report [bold blue]v{version}[white]")) @@ -95,14 +101,17 @@ # == Group multihash multicodecs together == -# TODO: introduce grouped multicodecs doing this directly, to reduce mem footprint +# TODO: consider introducing grouped multicodecs doing this directly, to reduce mem footprint (currently footprint is negligible) _multihash_indices: Dict[str, int] = {} -_grouped_multicodecs: List[Tuple[str, str, Optional[List[int]], List[int], List[bool], Literal["draft", "permanent"]]] = [] for codec in 
multicodec.table(tag="multihash"): is_implemented = multihash.is_implemented(codec.name) tokens = codec.name.split("-") - label = "-".join(tokens[:-1]) + if len(tokens) == 1: + label = codec.name + else: + label = "-".join(tokens[:-1]) max_digest_size: Optional[int] = None try: max_digest_size = multihash.raw.get(codec.name)[1] @@ -110,20 +119,20 @@ pass if max_digest_size is None: try: - max_digest_size = int(tokens[-1]) + max_digest_size = int(tokens[-1])//8 except ValueError: pass if max_digest_size is None: - _grouped_multicodecs.append((codec.name, codec.tag, None, [codec.code], [is_implemented])) + _grouped_multicodecs.append((codec.name, codec.tag, None, [codec.code], [is_implemented], codec.status)) continue bitsize = max_digest_size*8 if label not in _multihash_indices: _multihash_indices[label] = len(_grouped_multicodecs) - _grouped_multicodecs.append((codec.name, codec.tag, [bitsize], [codec.code], [is_implemented])) + _grouped_multicodecs.append((codec.name, codec.tag, [bitsize], [codec.code], [is_implemented], codec.status)) else: bitsize_list = _grouped_multicodecs[_multihash_indices[label]][2] if bitsize_list is None: - _grouped_multicodecs.append((codec.name, codec.tag, None, [codec.code], [is_implemented])) + _grouped_multicodecs.append((codec.name, codec.tag, None, [codec.code], [is_implemented], codec.status)) continue code_list = _grouped_multicodecs[_multihash_indices[label]][3] impl_list = _grouped_multicodecs[_multihash_indices[label]][4] @@ -162,9 +171,10 @@ def set_str(l: Collection[int], *, use_hex: bool = False, minlen: int = 4, maxle table.add_column("Name") table.add_column("Bitsize", style="bright_black") table.add_column("Implem.") +table.add_column("Status") num_implemented = 0 num_total = 0 -for name, tag, bitsize_list, code_list, impl_list in _grouped_multicodecs: +for name, tag, bitsize_list, code_list, impl_list, status in _grouped_multicodecs: num_total += len(impl_list) impl_status = "[red]no" if all(impl_list): @@ -174,17 
+184,19 @@ def set_str(l: Collection[int], *, use_hex: bool = False, minlen: int = 4, maxle num_impl = sum(1 if b else 0 for b in impl_list) impl_status = f"[yellow]{num_impl}/{len(impl_list)}" num_implemented += num_impl + codec_status = "[yellow]draft" if status == "draft" else "[green]perm." if bitsize_list is None: - table.add_row(code2str(code_list[0]), name, "", impl_status) + table.add_row(code2str(code_list[0]), name, "", impl_status, codec_status) continue if len(bitsize_list) <= 1: - table.add_row(code2str(code_list[0]), f"{name}-{bitsize_list[0]}", str(bitsize_list[0]), impl_status) + table.add_row(code2str(code_list[0]), f"{name}", str(bitsize_list[0]), impl_status, codec_status) else: label = "-".join(name.split("-")[:-1]) table.add_row(set_str(code_list, use_hex=hex_codes), f"{label}-[bright_black]Bitsize", set_str(bitsize_list), - impl_status) + impl_status, + codec_status) console.print(f"> Multihash functions implemented: [bold blue]{num_implemented}/{num_total}") console.print(table) @@ -196,6 +208,7 @@ def set_str(l: Collection[int], *, use_hex: bool = False, minlen: int = 4, maxle table.add_column("Code", style="bold blue") table.add_column("Name") table.add_column("Implem.") +table.add_column("Status") num_implemented = 0 num_total = 0 for codec in multicodec.table(tag="multiaddr"): @@ -203,10 +216,31 @@ def set_str(l: Collection[int], *, use_hex: bool = False, minlen: int = 4, maxle num_implemented += 1 if is_implemented else 0 num_total += 1 impl_status = "[green]yes" if is_implemented else "[red]no" - table.add_row(code2str(codec.code), codec.name, impl_status) + codec_status = "[yellow]draft" if codec.status == "draft" else "[green]perm." 
+ table.add_row(code2str(codec.code), codec.name, impl_status, codec_status) console.print(f"> Multiaddr protocols implemented: [bold blue]{num_implemented}/{num_total}") console.print(table) +# == Multibase table == + +console.rule("Multibases") +table = Table() +table.add_column("Code", style="bold blue") +table.add_column("Name") +table.add_column("Implem.") +table.add_column("Status") + +num_implemented = 0 +num_total = 0 +for base in multibase.table(): + is_implemented = multibase.raw.exists(base.name) + num_implemented += 1 if is_implemented else 0 + num_total += 1 + impl_status = "[green]yes" if is_implemented else "[red]no" + codec_status = "[yellow]draft" if base.status == "draft" else "[green]perm." + table.add_row(base.code_printable, base.name, impl_status, codec_status) +console.print(f"> Multibases implemented: [bold blue]{num_implemented}/{num_total}") +console.print(table) # == Other multicodecs table == @@ -215,9 +249,11 @@ def set_str(l: Collection[int], *, use_hex: bool = False, minlen: int = 4, maxle table.add_column("Code", style="bold blue") table.add_column("Name") table.add_column("Tag", style="magenta") +table.add_column("Status") for codec in multicodec.table(): if codec.tag not in ("multihash", "multiaddr"): - table.add_row(code2str(codec.code), codec.name, codec.tag) + codec_status = "[yellow]draft" if codec.status == "draft" else "[green]perm." 
+ table.add_row(code2str(codec.code), codec.name, codec.tag, codec_status) console.print(table) diff --git a/report.txt b/report.txt index 779bc09..4c6b3cc 100644 --- a/report.txt +++ b/report.txt @@ -1,185 +1,227 @@ -┌──────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ Multiformats implementation report v0.1.2.post3+g474dabd.d20211228 │ -└──────────────────────────────────────────────────────────────────────────────────────────────────────┘ -───────────────────────────────────────────── Memory Usage ───────────────────────────────────────────── -> python+psutil memory baseline: 7.3MiB -> multiformats memory total: 3.4MiB +┌────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Multiformats implementation report v0.1.3.post6+gb76a9ea.d20220721 │ +└────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +──────────────────────────────────────────────── Memory Usage ──────────────────────────────────────────────── +> python+psutil memory baseline: 7.4MiB +> multiformats memory total: 3.0MiB ┌───────────────────┬────────┬──────────┐ │ Component │ Memory │ Memory % │ ├───────────────────┼────────┼──────────┤ -│ typing-validation │ 168KiB │ 5% │ -│ bases │ 608KiB │ 17% │ -│ pyskein │ 88KiB │ 3% │ -│ multiformats │ 2.6MiB │ 75% │ +│ typing-extensions │ 180KiB │ 6% │ +│ typing-validation │ 0KiB │ 0% │ +│ bases │ 844KiB │ 28% │ +│ multiformats │ 2.0MiB │ 67% │ └───────────────────┴────────┴──────────┘ -───────────────────────────────────────── Multihash functions ────────────────────────────────────────── -> Multihash functions implemented: 330/356 -┌──────────────────────────────────┬──────────────────────────────┬──────────────────────────┬─────────┐ -│ Code │ Name │ Bitsize │ Implem. 
│ -├──────────────────────────────────┼──────────────────────────────┼──────────────────────────┼─────────┤ -│ 0x0 │ identity │ │ yes │ -│ 0x11 │ sha1-160 │ 160 │ yes │ -│ {0x12, 0x13, 0x20, 0x1013} │ sha2-Bitsize │ {256, 512, 1792, 3072} │ 2/4 │ -│ {0x14, 0x15, 0x16, 0x17} │ sha3-Bitsize │ {224, 256, 384, 512} │ yes │ -│ {0x18, 0x19} │ shake-Bitsize │ {256, 512} │ yes │ -│ {0x1a, 0x1b, 0x1c, 0x1d} │ keccak-Bitsize │ {1792, 2048, 3072, 4096} │ no │ -│ 0x1e │ blake3 │ │ no │ -│ {0x22, 0x1022} │ murmur3-x64-Bitsize │ {512, 1024} │ no │ -│ 0x23 │ murmur3-32-256 │ 256 │ no │ -│ 0x56 │ dbl-sha2-256-2048 │ 2048 │ no │ -│ 0xd4 │ md4 │ │ no │ -│ 0xd5 │ md5 │ │ no │ -│ 0xd6 │ bmt │ │ no │ -│ 0x1012 │ sha2-256-trunc254-padded │ │ no │ -│ {0x1014, 0x1015} │ sha2-512-Bitsize │ {1792, 2048} │ no │ -│ {0x1052, 0x1053, 0x1054, 0x1055} │ ripemd-Bitsize │ {1024, 1280, 2048, 2560} │ no │ -│ 0x1100 │ x11 │ │ no │ -│ 0x1d01 │ kangarootwelve │ │ no │ -│ 0x534d │ sm3-256-2048 │ 2048 │ no │ -│ {0xb201, 0xb202, ..., 0xb240} │ blake2b-Bitsize │ {8, 16, ..., 512} │ yes │ -│ {0xb241, 0xb242, ..., 0xb260} │ blake2s-Bitsize │ {8, 16, ..., 256} │ yes │ -│ {0xb301, 0xb302, ..., 0xb320} │ skein256-Bitsize │ {8, 16, ..., 256} │ yes │ -│ {0xb321, 0xb322, ..., 0xb360} │ skein512-Bitsize │ {8, 16, ..., 512} │ yes │ -│ {0xb361, 0xb362, ..., 0xb3e0} │ skein1024-Bitsize │ {8, 16, ..., 1024} │ yes │ -│ 0xb401 │ poseidon-bls12_381-a2-fc1 │ │ no │ -│ 0xb402 │ poseidon-bls12_381-a2-fc1-sc │ │ no │ -└──────────────────────────────────┴──────────────────────────────┴──────────────────────────┴─────────┘ -───────────────────────────────────────── Multiaddr protocols ────────────────────────────────────────── -> Multiaddr protocols implemented: 18/33 -┌──────────┬────────────────────┬─────────┐ -│ Code │ Name │ Implem. 
│ -├──────────┼────────────────────┼─────────┤ -│ 0x4 │ ip4 │ yes │ -│ 0x6 │ tcp │ yes │ -│ 0x21 │ dccp │ no │ -│ 0x29 │ ip6 │ yes │ -│ 0x2a │ ip6zone │ no │ -│ 0x35 │ dns │ no │ -│ 0x36 │ dns4 │ no │ -│ 0x37 │ dns6 │ no │ -│ 0x38 │ dnsaddr │ no │ -│ 0x84 │ sctp │ no │ -│ 0x111 │ udp │ yes │ -│ 0x113 │ p2p-webrtc-star │ yes │ -│ 0x114 │ p2p-webrtc-direct │ yes │ -│ 0x115 │ p2p-stardust │ yes │ -│ 0x122 │ p2p-circuit │ yes │ -│ 0x12d │ udt │ yes │ -│ 0x12e │ utp │ yes │ -│ 0x190 │ unix │ no │ -│ 0x196 │ thread │ no │ -│ 0x1a5 │ p2p │ no │ -│ 0x1bb │ https │ yes │ -│ 0x1bc │ onion │ no │ -│ 0x1bd │ onion3 │ no │ -│ 0x1be │ garlic64 │ no │ -│ 0x1bf │ garlic32 │ no │ -│ 0x1c0 │ tls │ yes │ -│ 0x1c6 │ noise │ yes │ -│ 0x1cc │ quic │ yes │ -│ 0x1dd │ ws │ yes │ -│ 0x1de │ wss │ yes │ -│ 0x1df │ p2p-websocket-star │ yes │ -│ 0x1e0 │ http │ yes │ -│ 0x706c61 │ plaintextv2 │ no │ -└──────────┴────────────────────┴─────────┘ -────────────────────────────────────────── Other Multicodecs ─────────────────────────────────────────── -┌──────────┬────────────────────────────┬───────────────┐ -│ Code │ Name │ Tag │ -├──────────┼────────────────────────────┼───────────────┤ -│ 0x1 │ cidv1 │ cid │ -│ 0x2 │ cidv2 │ cid │ -│ 0x3 │ cidv3 │ cid │ -│ 0x2f │ path │ namespace │ -│ 0x30 │ multicodec │ multiformat │ -│ 0x31 │ multihash │ multiformat │ -│ 0x32 │ multiaddr │ multiformat │ -│ 0x33 │ multibase │ multiformat │ -│ 0x50 │ protobuf │ serialization │ -│ 0x51 │ cbor │ ipld │ -│ 0x55 │ raw │ ipld │ -│ 0x60 │ rlp │ serialization │ -│ 0x63 │ bencode │ serialization │ -│ 0x70 │ dag-pb │ ipld │ -│ 0x71 │ dag-cbor │ ipld │ -│ 0x72 │ libp2p-key │ ipld │ -│ 0x78 │ git-raw │ ipld │ -│ 0x7b │ torrent-info │ ipld │ -│ 0x7c │ torrent-file │ ipld │ -│ 0x81 │ leofcoin-block │ ipld │ -│ 0x82 │ leofcoin-tx │ ipld │ -│ 0x83 │ leofcoin-pr │ ipld │ -│ 0x85 │ dag-jose │ ipld │ -│ 0x86 │ dag-cose │ ipld │ -│ 0x90 │ eth-block │ ipld │ -│ 0x91 │ eth-block-list │ ipld │ -│ 0x92 │ eth-tx-trie │ ipld │ -│ 0x93 
│ eth-tx │ ipld │ -│ 0x94 │ eth-tx-receipt-trie │ ipld │ -│ 0x95 │ eth-tx-receipt │ ipld │ -│ 0x96 │ eth-state-trie │ ipld │ -│ 0x97 │ eth-account-snapshot │ ipld │ -│ 0x98 │ eth-storage-trie │ ipld │ -│ 0x99 │ eth-receipt-log-trie │ ipld │ -│ 0x9a │ eth-reciept-log │ ipld │ -│ 0xa0 │ aes-128 │ key │ -│ 0xa1 │ aes-192 │ key │ -│ 0xa2 │ aes-256 │ key │ -│ 0xa3 │ chacha-128 │ key │ -│ 0xa4 │ chacha-256 │ key │ -│ 0xb0 │ bitcoin-block │ ipld │ -│ 0xb1 │ bitcoin-tx │ ipld │ -│ 0xb2 │ bitcoin-witness-commitment │ ipld │ -│ 0xc0 │ zcash-block │ ipld │ -│ 0xc1 │ zcash-tx │ ipld │ -│ 0xca │ caip-50 │ multiformat │ -│ 0xce │ streamid │ namespace │ -│ 0xd0 │ stellar-block │ ipld │ -│ 0xd1 │ stellar-tx │ ipld │ -│ 0xe0 │ decred-block │ ipld │ -│ 0xe1 │ decred-tx │ ipld │ -│ 0xe2 │ ipld-ns │ namespace │ -│ 0xe3 │ ipfs-ns │ namespace │ -│ 0xe4 │ swarm-ns │ namespace │ -│ 0xe5 │ ipns-ns │ namespace │ -│ 0xe6 │ zeronet │ namespace │ -│ 0xe7 │ secp256k1-pub │ key │ -│ 0xea │ bls12_381-g1-pub │ key │ -│ 0xeb │ bls12_381-g2-pub │ key │ -│ 0xec │ x25519-pub │ key │ -│ 0xed │ ed25519-pub │ key │ -│ 0xee │ bls12_381-g1g2-pub │ key │ -│ 0xf0 │ dash-block │ ipld │ -│ 0xf1 │ dash-tx │ ipld │ -│ 0xfa │ swarm-manifest │ ipld │ -│ 0xfb │ swarm-feed │ ipld │ -│ 0x129 │ dag-json │ ipld │ -│ 0x1f0 │ swhid-1-snp │ ipld │ -│ 0x200 │ json │ ipld │ -│ 0x201 │ messagepack │ serialization │ -│ 0x301 │ libp2p-peer-record │ libp2p │ -│ 0x302 │ libp2p-relay-rsvp │ libp2p │ -│ 0x400 │ car-index-sorted │ serialization │ -│ 0x401 │ car-multihash-index-sorted │ serialization │ -│ 0x1200 │ p256-pub │ key │ -│ 0x1201 │ p384-pub │ key │ -│ 0x1202 │ p521-pub │ key │ -│ 0x1203 │ ed448-pub │ key │ -│ 0x1204 │ x448-pub │ key │ -│ 0x1205 │ rsa-pub │ key │ -│ 0x1300 │ ed25519-priv │ key │ -│ 0x1301 │ secp256k1-priv │ key │ -│ 0x1302 │ x25519-priv │ key │ -│ 0xce11 │ zeroxcert-imprint-256 │ zeroxcert │ -│ 0xf101 │ fil-commitment-unsealed │ filecoin │ -│ 0xf102 │ fil-commitment-sealed │ filecoin │ -│ 0x807124 │ 
holochain-adr-v0 │ holochain │ -│ 0x817124 │ holochain-adr-v1 │ holochain │ -│ 0x947124 │ holochain-key-v0 │ holochain │ -│ 0x957124 │ holochain-key-v1 │ holochain │ -│ 0xa27124 │ holochain-sig-v0 │ holochain │ -│ 0xa37124 │ holochain-sig-v1 │ holochain │ -│ 0xb19910 │ skynet-ns │ namespace │ -│ 0xb29910 │ arweave-ns │ namespace │ -│ 0xb39910 │ subspace-ns │ namespace │ -└──────────┴────────────────────────────┴───────────────┘ +──────────────────────────────────────────── Multihash functions ───────────────────────────────────────────── +> Multihash functions implemented: 347/356 +┌──────────────────────────────────┬──────────────────────────────┬──────────────────────┬─────────┬────────┐ +│ Code │ Name │ Bitsize │ Implem. │ Status │ +├──────────────────────────────────┼──────────────────────────────┼──────────────────────┼─────────┼────────┤ +│ 0x0 │ identity │ │ yes │ perm. │ +│ 0x11 │ sha1 │ 160 │ yes │ perm. │ +│ {0x12, 0x13, 0x20, 0x1013} │ sha2-Bitsize │ {224, 256, 384, 512} │ yes │ perm. │ +│ {0x14, 0x15, 0x16, 0x17} │ sha3-Bitsize │ {224, 256, 384, 512} │ yes │ perm. │ +│ {0x18, 0x19} │ shake-Bitsize │ {256, 512} │ yes │ draft │ +│ {0x1a, 0x1b, 0x1c, 0x1d} │ keccak-Bitsize │ {224, 256, 384, 512} │ yes │ draft │ +│ 0x1e │ blake3 │ │ yes │ draft │ +│ {0x22, 0x1022} │ murmur3-x64-Bitsize │ {64, 128} │ yes │ perm. │ +│ 0x23 │ murmur3-32 │ 32 │ yes │ draft │ +│ 0x56 │ dbl-sha2-256 │ 256 │ yes │ draft │ +│ 0xd4 │ md4 │ │ no │ draft │ +│ 0xd5 │ md5 │ 128 │ yes │ draft │ +│ 0x1012 │ sha2-256-trunc254-padded │ 256 │ yes │ perm. │ +│ {0x1014, 0x1015} │ sha2-512-Bitsize │ {224, 256} │ yes │ perm. 
│ +│ {0x1052, 0x1053, 0x1054, 0x1055} │ ripemd-Bitsize │ {128, 160, 256, 320} │ 1/4 │ draft │ +│ 0x1100 │ x11 │ │ no │ draft │ +│ 0x1d01 │ kangarootwelve │ │ yes │ draft │ +│ 0x534d │ sm3-256 │ 256 │ no │ draft │ +│ {0xb201, 0xb202, ..., 0xb240} │ blake2b-Bitsize │ {8, 16, ..., 512} │ yes │ draft │ +│ {0xb241, 0xb242, ..., 0xb260} │ blake2s-Bitsize │ {8, 16, ..., 256} │ yes │ draft │ +│ {0xb301, 0xb302, ..., 0xb320} │ skein256-Bitsize │ {8, 16, ..., 256} │ yes │ draft │ +│ {0xb321, 0xb322, ..., 0xb360} │ skein512-Bitsize │ {8, 16, ..., 512} │ yes │ draft │ +│ {0xb361, 0xb362, ..., 0xb3e0} │ skein1024-Bitsize │ {8, 16, ..., 1024} │ yes │ draft │ +│ 0xb401 │ poseidon-bls12_381-a2-fc1 │ │ no │ perm. │ +│ 0xb402 │ poseidon-bls12_381-a2-fc1-sc │ │ no │ draft │ +│ 0xb502 │ ssz-sha2-256-bmt │ │ no │ draft │ +└──────────────────────────────────┴──────────────────────────────┴──────────────────────┴─────────┴────────┘ +──────────────────────────────────────────── Multiaddr protocols ───────────────────────────────────────────── +> Multiaddr protocols implemented: 18/36 +┌──────────┬────────────────────┬─────────┬────────┐ +│ Code │ Name │ Implem. │ Status │ +├──────────┼────────────────────┼─────────┼────────┤ +│ 0x4 │ ip4 │ yes │ perm. │ +│ 0x6 │ tcp │ yes │ perm. │ +│ 0x21 │ dccp │ no │ draft │ +│ 0x29 │ ip6 │ yes │ perm. │ +│ 0x2a │ ip6zone │ no │ draft │ +│ 0x35 │ dns │ no │ perm. │ +│ 0x36 │ dns4 │ no │ perm. │ +│ 0x37 │ dns6 │ no │ perm. │ +│ 0x38 │ dnsaddr │ no │ perm. │ +│ 0x84 │ sctp │ no │ draft │ +│ 0x111 │ udp │ yes │ draft │ +│ 0x113 │ p2p-webrtc-star │ yes │ draft │ +│ 0x114 │ p2p-webrtc-direct │ yes │ draft │ +│ 0x115 │ p2p-stardust │ yes │ draft │ +│ 0x118 │ webrtc │ no │ draft │ +│ 0x122 │ p2p-circuit │ yes │ perm. │ +│ 0x12d │ udt │ yes │ draft │ +│ 0x12e │ utp │ yes │ draft │ +│ 0x190 │ unix │ no │ perm. │ +│ 0x196 │ thread │ no │ draft │ +│ 0x1a5 │ p2p │ no │ perm. 
│ +│ 0x1bb │ https │ yes │ draft │ +│ 0x1bc │ onion │ no │ draft │ +│ 0x1bd │ onion3 │ no │ draft │ +│ 0x1be │ garlic64 │ no │ draft │ +│ 0x1bf │ garlic32 │ no │ draft │ +│ 0x1c0 │ tls │ yes │ draft │ +│ 0x1c6 │ noise │ yes │ draft │ +│ 0x1cc │ quic │ yes │ perm. │ +│ 0x1d1 │ webtransport │ no │ draft │ +│ 0x1d2 │ certhash │ no │ draft │ +│ 0x1dd │ ws │ yes │ perm. │ +│ 0x1de │ wss │ yes │ perm. │ +│ 0x1df │ p2p-websocket-star │ yes │ perm. │ +│ 0x1e0 │ http │ yes │ draft │ +│ 0x706c61 │ plaintextv2 │ no │ draft │ +└──────────┴────────────────────┴─────────┴────────┘ +───────────────────────────────────────────────── Multibases ───────────────────────────────────────────────── +> Multibases implemented: 24/25 +┌──────────┬───────────────────┬─────────┬────────┐ +│ Code │ Name │ Implem. │ Status │ +├──────────┼───────────────────┼─────────┼────────┤ +│ 0x00 │ identity │ yes │ perm. │ +│ 0 │ base2 │ yes │ perm. │ +│ 7 │ base8 │ yes │ draft │ +│ 9 │ base10 │ yes │ draft │ +│ B │ base32upper │ yes │ perm. │ +│ C │ base32padupper │ yes │ perm. │ +│ F │ base16upper │ yes │ perm. │ +│ K │ base36upper │ yes │ draft │ +│ M │ base64pad │ yes │ perm. │ +│ T │ base32hexpadupper │ yes │ perm. │ +│ U │ base64urlpad │ yes │ perm. │ +│ V │ base32hexupper │ yes │ perm. │ +│ Z │ base58flickr │ yes │ perm. │ +│ b │ base32 │ yes │ perm. │ +│ c │ base32pad │ yes │ perm. │ +│ f │ base16 │ yes │ perm. │ +│ h │ base32z │ yes │ draft │ +│ k │ base36 │ yes │ draft │ +│ m │ base64 │ yes │ perm. │ +│ p │ proquint │ yes │ draft │ +│ t │ base32hexpad │ yes │ perm. │ +│ u │ base64url │ yes │ perm. │ +│ v │ base32hex │ yes │ perm. │ +│ z │ base58btc │ yes │ perm. 
│ +│ 0x01F680 │ base256emoji │ no │ draft │ +└──────────┴───────────────────┴─────────┴────────┘ +───────────────────────────────────────────── Other Multicodecs ────────────────────────────────────────────── +┌──────────┬────────────────────────────────┬───────────────┬────────┐ +│ Code │ Name │ Tag │ Status │ +├──────────┼────────────────────────────────┼───────────────┼────────┤ +│ 0x1 │ cidv1 │ cid │ perm. │ +│ 0x2 │ cidv2 │ cid │ draft │ +│ 0x3 │ cidv3 │ cid │ draft │ +│ 0x2f │ path │ namespace │ perm. │ +│ 0x30 │ multicodec │ multiformat │ draft │ +│ 0x31 │ multihash │ multiformat │ draft │ +│ 0x32 │ multiaddr │ multiformat │ draft │ +│ 0x33 │ multibase │ multiformat │ draft │ +│ 0x50 │ protobuf │ serialization │ draft │ +│ 0x51 │ cbor │ ipld │ perm. │ +│ 0x55 │ raw │ ipld │ perm. │ +│ 0x60 │ rlp │ serialization │ draft │ +│ 0x63 │ bencode │ serialization │ draft │ +│ 0x70 │ dag-pb │ ipld │ perm. │ +│ 0x71 │ dag-cbor │ ipld │ perm. │ +│ 0x72 │ libp2p-key │ ipld │ perm. │ +│ 0x78 │ git-raw │ ipld │ perm. │ +│ 0x7b │ torrent-info │ ipld │ draft │ +│ 0x7c │ torrent-file │ ipld │ draft │ +│ 0x81 │ leofcoin-block │ ipld │ draft │ +│ 0x82 │ leofcoin-tx │ ipld │ draft │ +│ 0x83 │ leofcoin-pr │ ipld │ draft │ +│ 0x85 │ dag-jose │ ipld │ draft │ +│ 0x86 │ dag-cose │ ipld │ draft │ +│ 0x90 │ eth-block │ ipld │ perm. │ +│ 0x91 │ eth-block-list │ ipld │ perm. │ +│ 0x92 │ eth-tx-trie │ ipld │ perm. │ +│ 0x93 │ eth-tx │ ipld │ perm. │ +│ 0x94 │ eth-tx-receipt-trie │ ipld │ perm. │ +│ 0x95 │ eth-tx-receipt │ ipld │ perm. │ +│ 0x96 │ eth-state-trie │ ipld │ perm. │ +│ 0x97 │ eth-account-snapshot │ ipld │ perm. │ +│ 0x98 │ eth-storage-trie │ ipld │ perm. │ +│ 0x99 │ eth-receipt-log-trie │ ipld │ draft │ +│ 0x9a │ eth-reciept-log │ ipld │ draft │ +│ 0xa0 │ aes-128 │ key │ draft │ +│ 0xa1 │ aes-192 │ key │ draft │ +│ 0xa2 │ aes-256 │ key │ draft │ +│ 0xa3 │ chacha-128 │ key │ draft │ +│ 0xa4 │ chacha-256 │ key │ draft │ +│ 0xb0 │ bitcoin-block │ ipld │ perm. 
│ +│ 0xb1 │ bitcoin-tx │ ipld │ perm. │ +│ 0xb2 │ bitcoin-witness-commitment │ ipld │ perm. │ +│ 0xc0 │ zcash-block │ ipld │ perm. │ +│ 0xc1 │ zcash-tx │ ipld │ perm. │ +│ 0xca │ caip-50 │ multiformat │ draft │ +│ 0xce │ streamid │ namespace │ draft │ +│ 0xd0 │ stellar-block │ ipld │ draft │ +│ 0xd1 │ stellar-tx │ ipld │ draft │ +│ 0xe0 │ decred-block │ ipld │ draft │ +│ 0xe1 │ decred-tx │ ipld │ draft │ +│ 0xe2 │ ipld-ns │ namespace │ draft │ +│ 0xe3 │ ipfs-ns │ namespace │ draft │ +│ 0xe4 │ swarm-ns │ namespace │ draft │ +│ 0xe5 │ ipns-ns │ namespace │ draft │ +│ 0xe6 │ zeronet │ namespace │ draft │ +│ 0xe7 │ secp256k1-pub │ key │ draft │ +│ 0xe8 │ dnslink │ namespace │ perm. │ +│ 0xea │ bls12_381-g1-pub │ key │ draft │ +│ 0xeb │ bls12_381-g2-pub │ key │ draft │ +│ 0xec │ x25519-pub │ key │ draft │ +│ 0xed │ ed25519-pub │ key │ draft │ +│ 0xee │ bls12_381-g1g2-pub │ key │ draft │ +│ 0xf0 │ dash-block │ ipld │ draft │ +│ 0xf1 │ dash-tx │ ipld │ draft │ +│ 0xfa │ swarm-manifest │ ipld │ draft │ +│ 0xfb │ swarm-feed │ ipld │ draft │ +│ 0x129 │ dag-json │ ipld │ perm. │ +│ 0x1f0 │ swhid-1-snp │ ipld │ draft │ +│ 0x200 │ json │ ipld │ perm. │ +│ 0x201 │ messagepack │ serialization │ draft │ +│ 0x202 │ car │ serialization │ draft │ +│ 0x301 │ libp2p-peer-record │ libp2p │ perm. │ +│ 0x302 │ libp2p-relay-rsvp │ libp2p │ perm. 
│ +│ 0x400 │ car-index-sorted │ serialization │ draft │ +│ 0x401 │ car-multihash-index-sorted │ serialization │ draft │ +│ 0x900 │ transport-bitswap │ transport │ draft │ +│ 0x910 │ transport-graphsync-filecoinv1 │ transport │ draft │ +│ 0x1200 │ p256-pub │ key │ draft │ +│ 0x1201 │ p384-pub │ key │ draft │ +│ 0x1202 │ p521-pub │ key │ draft │ +│ 0x1203 │ ed448-pub │ key │ draft │ +│ 0x1204 │ x448-pub │ key │ draft │ +│ 0x1205 │ rsa-pub │ key │ draft │ +│ 0x1206 │ sm2-pub │ key │ draft │ +│ 0x1300 │ ed25519-priv │ key │ draft │ +│ 0x1301 │ secp256k1-priv │ key │ draft │ +│ 0x1302 │ x25519-priv │ key │ draft │ +│ 0xb501 │ ssz │ serialization │ draft │ +│ 0xcc01 │ iscc │ softhash │ draft │ +│ 0xce11 │ zeroxcert-imprint-256 │ zeroxcert │ draft │ +│ 0xf101 │ fil-commitment-unsealed │ filecoin │ perm. │ +│ 0xf102 │ fil-commitment-sealed │ filecoin │ perm. │ +│ 0x807124 │ holochain-adr-v0 │ holochain │ draft │ +│ 0x817124 │ holochain-adr-v1 │ holochain │ draft │ +│ 0x947124 │ holochain-key-v0 │ holochain │ draft │ +│ 0x957124 │ holochain-key-v1 │ holochain │ draft │ +│ 0xa27124 │ holochain-sig-v0 │ holochain │ draft │ +│ 0xa37124 │ holochain-sig-v1 │ holochain │ draft │ +│ 0xb19910 │ skynet-ns │ namespace │ draft │ +│ 0xb29910 │ arweave-ns │ namespace │ draft │ +│ 0xb39910 │ subspace-ns │ namespace │ draft │ +│ 0xb49910 │ kumandra-ns │ namespace │ draft │ +└──────────┴────────────────────────────────┴───────────────┴────────┘ diff --git a/setup.cfg b/setup.cfg index 0459e03..bf7a449 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,10 +27,9 @@ classifiers = packages = find: python_requires = >=3.7 install_requires = - bases typing-extensions typing-validation - pyskein + bases [options.package_data] * = py.typed, *.json @@ -44,3 +43,14 @@ dev = pylint pytest pytest-cov + blake3 + pysha3 + pyskein + mmh3 + pycryptodomex +full = + pysha3 + blake3 + pyskein + mmh3 + pycryptodomex diff --git a/test/multihash-test-hex-vectors.csv b/test/multihash-test-hex-vectors.csv new file 
mode 100644 index 0000000..7f0552f --- /dev/null +++ b/test/multihash-test-hex-vectors.csv @@ -0,0 +1,12 @@ +algorithm,bits,input,multihash +keccak-256,256,74657374696e67,1b205f16f4c7f149ac4f9510d9cf8cf384038ad348b3bcdc01915f95de12df9d1b02 +murmur3-32,32,,230400000000 +murmur3-32,32,ffffffff,230476293b50 +murmur3-32,32,21436587,2304f55b516b +murmur3-32,32,214365,23047e4a8634 +murmur3-32,32,2143,2304a0f7b07a +murmur3-32,32,21,230472661cf4 +murmur3-32,32,00000000,23042362f9de +murmur3-32,32,000000,230485f0b427 +murmur3-32,32,0000,230430f4c306 +murmur3-32,32,00,2304514e28b7 diff --git a/test/multihash-test-vectors.csv b/test/multihash-test-str-vectors.csv similarity index 99% rename from test/multihash-test-vectors.csv rename to test/multihash-test-str-vectors.csv index ecfeb34..6abfc57 100644 --- a/test/multihash-test-vectors.csv +++ b/test/multihash-test-str-vectors.csv @@ -259,3 +259,4 @@ sha3-512,80,444a8ca03cce5b47991b3baf807f4eaa1d8175a72e,140a5541d5148a03e5ba0a91 sha3-512,160,444a8ca03cce5b47991b3baf807f4eaa1d8175a72e,14145541d5148a03e5ba0a916b8f91175219b15e260f sha3-512,256,444a8ca03cce5b47991b3baf807f4eaa1d8175a72e,14205541d5148a03e5ba0a916b8f91175219b15e260f0b26c4fdfa24b24bc54ac640 sha3-512,512,444a8ca03cce5b47991b3baf807f4eaa1d8175a72e,14405541d5148a03e5ba0a916b8f91175219b15e260f0b26c4fdfa24b24bc54ac64045ea813027252d05c43a3f0d5def492937a33fbc04c3eaf7ad61af5e16bb7c3f +dbl-sha2-256,256,hello,56209595c9df90075148eb06860365df33584b75bff782a510c6cd4883a419833d50 diff --git a/test/test_00_varint.py b/test/test_00_varint.py index 62ad58f..77d3e1a 100644 --- a/test/test_00_varint.py +++ b/test/test_00_varint.py @@ -1,9 +1,10 @@ """ Tests for the `multiformats.varint` module. 
""" -import pytest from io import BytesIO from random import Random + +import pytest from multiformats import varint random = Random(0) diff --git a/test/test_02_multibase.py b/test/test_02_multibase.py index 1152bf8..17ac04b 100644 --- a/test/test_02_multibase.py +++ b/test/test_02_multibase.py @@ -151,7 +151,7 @@ def test_api_failure_modes() -> None: except ValueError: pass try: - Multibase(name="my-codec", code="0x80") + Multibase(name="my-codec", code="0x79") assert False, "Codes in hex format must be non-printable ASCII characters." except ValueError: pass diff --git a/test/test_03_multihash.py b/test/test_03_multihash.py index 1e0f519..bfdc5ce 100644 --- a/test/test_03_multihash.py +++ b/test/test_03_multihash.py @@ -3,51 +3,123 @@ import csv import hashlib import importlib.resources as importlib_resources -from typing import Dict +from typing import Dict, Optional import pytest import skein # type: ignore +from blake3 import blake3 # type: ignore +import sha3 # type: ignore +import mmh3 # type: ignore +from Cryptodome.Hash import RIPEMD160, KangarooTwelve, SHA512 from multiformats import multihash from multiformats.multihash import wrap, digest, unwrap -def id_digest(data: bytes) -> bytes: - return data +def id_digest(data: bytes, size: Optional[int]) -> bytes: + return data if size is None else data[:size] -def sha1_digest(data: bytes) -> bytes: +def sha1_digest(data: bytes, size: Optional[int]) -> bytes: m = hashlib.sha1() m.update(data) - return m.digest() + d = m.digest() + return d if size is None else d[:size] -def sha2_digest(data: bytes, digest_bits: int) -> bytes: +def sha2_digest(data: bytes, digest_bits: int, size: Optional[int]) -> bytes: m: hashlib._Hash = getattr(hashlib, f"sha{digest_bits}")() m.update(data) - return m.digest() + d = m.digest() + return d if size is None else d[:size] -def sha3_digest(data: bytes, digest_bits: int) -> bytes: +def dbl_sha2_digest(data: bytes, digest_bits: int, size: Optional[int]) -> bytes: + m: hashlib._Hash = 
hashlib.sha256() + m.update(data) + n: hashlib._Hash = hashlib.sha256() + n.update(m.digest()) + d = n.digest() + return d if size is None else d[:size] + +def sha3_digest(data: bytes, digest_bits: int, size: Optional[int]) -> bytes: m: hashlib._Hash = getattr(hashlib, f"sha3_{digest_bits}")() m.update(data) - return m.digest() + d = m.digest() + return d if size is None else d[:size] + +def sha2_512_digest(data: bytes, digest_bits: int, size: Optional[int]) -> bytes: + m = SHA512.new(truncate=str(digest_bits)) + m.update(data) + d = m.digest() + return d if size is None else d[:size] -def shake_digest(data: bytes, digest_bits: int) -> bytes: +def shake_digest(data: bytes, digest_bits: int, size: Optional[int]) -> bytes: m: hashlib._Hash = getattr(hashlib, f'shake_{digest_bits//2}')() m.update(data) - return m.digest(digest_bits//8) # type: ignore + d = m.digest(digest_bits//8) # type: ignore + return d if size is None else d[:size] -def blake2_digest(data: bytes, variant: str, digest_bits: int) -> bytes: +def blake2_digest(data: bytes, variant: str, digest_bits: int, size: Optional[int]) -> bytes: assert variant in ('b', 's') m: hashlib._Hash = getattr(hashlib, 'blake2{}'.format(variant))(digest_size=digest_bits//8) m.update(data) - return m.digest() + d = m.digest() + return d if size is None else d[:size] + +def blake3_digest(data: bytes, size: int) -> bytes: + m = blake3() + m.update(data) + d: bytes = m.digest(size) + return d -def skein_digest(data: bytes, variant: int, digest_bits: int) -> bytes: +def skein_digest(data: bytes, variant: int, digest_bits: int, size: Optional[int]) -> bytes: assert variant in (256, 512, 1024) m: hashlib._Hash = getattr(skein, 'skein{}'.format(variant))(digest_bits=digest_bits) m.update(data) - return m.digest() + d = m.digest() + return d if size is None else d[:size] + +def keccak_digest(data: bytes, digest_bits: int, size: Optional[int]) -> bytes: + m: hashlib._Hash = getattr(sha3, f"keccak_{digest_bits}")() + m.update(data) 
+ d = m.digest() + return d if size is None else d[:size] + +def murmur3_digest(data: bytes, variant: str, digest_bits: int, size: Optional[int]) -> bytes: + assert variant in ("32", "x64") + if variant == "32": + d: bytes = mmh3.hash(data, signed=False).to_bytes(4, byteorder="big") # pylint: disable = c-extension-no-member + return d if size is None else d[:size] + d = mmh3.hash128(data, signed=False).to_bytes(16, byteorder="big") # pylint: disable = c-extension-no-member + d = d[:(digest_bits//8)] + return d if size is None else d[:size] + +def md5_digest(data: bytes, size: Optional[int]) -> bytes: + m = hashlib.md5() + m.update(data) + d = m.digest() + return d if size is None else d[:size] -def _test(hash_fn: str, data: bytes, hash_digest: bytes) -> None: - multihash_digest = digest(data, hash_fn) +def ripemd_digest(data: bytes, digest_bits: int, size: Optional[int]) -> bytes: + assert digest_bits == 160 + m = RIPEMD160.new() + m.update(data) + d = m.digest() + return d if size is None else d[:size] + +def kangarootwelve_digest(data: bytes, size: int) -> bytes: + m = KangarooTwelve.new() + m.update(data) + return m.read(size) + +def sha2_256_trunc254_padded_digest(data: bytes, size: Optional[int]) -> bytes: + m: hashlib._Hash = hashlib.sha256() + m.update(data) + d = m.digest() + d = d[:-1]+bytes([d[-1]&0x00111111]) + return d if size is None else d[:size] + +def _test(hash_fn: str, data: bytes, hash_digest: bytes, size: Optional[int] = None) -> None: + if size is not None: + assert len(hash_digest) == size + multihash_digest = digest(data, hash_fn, size=size) assert multihash.exists(hash_fn) codec = multihash.get(hash_fn) assert hash_fn == codec.name @@ -59,72 +131,155 @@ def _test(hash_fn: str, data: bytes, hash_digest: bytes) -> None: trunc_multihash_digest = wrap(trunc_hash_digest, hash_fn) assert trunc_hash_digest == unwrap(trunc_multihash_digest) assert trunc_hash_digest == unwrap(trunc_multihash_digest, hash_fn) - multihash_digest = 
digest(bytearray(data), hash_fn) + multihash_digest = digest(bytearray(data), hash_fn, size=size) assert wrap(bytearray(hash_digest), hash_fn) == multihash_digest assert hash_digest == unwrap(bytearray(multihash_digest)) - multihash_digest = digest(memoryview(data), hash_fn) + multihash_digest = digest(memoryview(data), hash_fn, size=size) assert wrap(memoryview(hash_digest), hash_fn) == multihash_digest assert hash_digest == unwrap(memoryview(multihash_digest)) data_samples = [ b"", - b"Test data to be wrapd.", - b"Test data to be wrapd."*100, + b"Test data to be wrapped.", + b"Test data to be wrapped."*100, ] @pytest.mark.parametrize("data", data_samples) -def test_id(data: bytes) -> None: +@pytest.mark.parametrize("size", (None, 8, 16)) +def test_id(data: bytes, size: Optional[int]) -> None: hash_fn = 'identity' - _test(hash_fn, data, id_digest(data)) + if size is None or len(data) >= size: + _test(hash_fn, data, id_digest(data, size), size) @pytest.mark.parametrize("data", data_samples) -def test_sha1(data: bytes) -> None: +@pytest.mark.parametrize("size", (None, 10, 20)) +def test_sha1(data: bytes, size: Optional[int]) -> None: hash_fn = 'sha1' - _test(hash_fn, data, sha1_digest(data)) + _test(hash_fn, data, sha1_digest(data, size), size) @pytest.mark.parametrize("digest_bits", (256, 512)) @pytest.mark.parametrize("data", data_samples) -def test_sha2(data: bytes, digest_bits: int) -> None: +@pytest.mark.parametrize("size", (None, 16, 32)) +def test_sha2(data: bytes, digest_bits: int, size: Optional[int]) -> None: hash_fn = f"sha2-{digest_bits}" - _test(hash_fn, data, sha2_digest(data, digest_bits)) + _test(hash_fn, data, sha2_digest(data, digest_bits, size), size) + +@pytest.mark.parametrize("digest_bits", (256,)) +@pytest.mark.parametrize("data", data_samples) +@pytest.mark.parametrize("size", (None, 16, 32)) +def test_dbl_sha2(data: bytes, digest_bits: int, size: Optional[int]) -> None: + hash_fn = f"dbl-sha2-{digest_bits}" + _test(hash_fn, data, 
dbl_sha2_digest(data, digest_bits, size), size) @pytest.mark.parametrize("digest_bits", (224, 256, 384, 512)) @pytest.mark.parametrize("data", data_samples) -def test_sha3(data: bytes, digest_bits: int) -> None: +@pytest.mark.parametrize("size", (None, 16, 28)) +def test_sha3(data: bytes, digest_bits: int, size: Optional[int]) -> None: hash_fn = f"sha3-{digest_bits}" - _test(hash_fn, data, sha3_digest(data, digest_bits)) + _test(hash_fn, data, sha3_digest(data, digest_bits, size), size) + +@pytest.mark.parametrize("digest_bits", (224, 256)) +@pytest.mark.parametrize("data", data_samples) +@pytest.mark.parametrize("size", (None, 16, 28)) +def test_sha2_512(data: bytes, digest_bits: int, size: Optional[int]) -> None: + hash_fn = f"sha2-512-{digest_bits}" + _test(hash_fn, data, sha2_512_digest(data, digest_bits, size), size) @pytest.mark.parametrize("digest_bits", (256, 512)) @pytest.mark.parametrize("data", data_samples) -def test_shake(data: bytes, digest_bits: int) -> None: +@pytest.mark.parametrize("size", (None, 16, 32)) +def test_shake(data: bytes, digest_bits: int, size: Optional[int]) -> None: hash_fn = f"shake-{digest_bits//2}" - _test(hash_fn, data, shake_digest(data, digest_bits)) + _test(hash_fn, data, shake_digest(data, digest_bits, size), size) @pytest.mark.parametrize("version", ("b", "s")) @pytest.mark.parametrize("data", data_samples) -def test_blake2(data: bytes, version: str) -> None: +@pytest.mark.parametrize("size", (None, 1)) +def test_blake2(data: bytes, version: str, size: Optional[int]) -> None: for digest_bits in range(8, (512 if version == "b" else 256)+1, 8): hash_fn = f"blake2{version}-{digest_bits}" - _test(hash_fn, data, blake2_digest(data, version, digest_bits)) + _test(hash_fn, data, blake2_digest(data, version, digest_bits, size), size) + +@pytest.mark.parametrize("data", data_samples) +@pytest.mark.parametrize("size", (16, 32, 64)) +def test_blake3(data: bytes, size: int) -> None: + hash_fn = 'blake3' + _test(hash_fn, data, 
blake3_digest(data, size), size) @pytest.mark.parametrize("version", (256, 512, 1024)) @pytest.mark.parametrize("data", data_samples) -def test_skein(data: bytes, version: int) -> None: +@pytest.mark.parametrize("size", (None, 1)) +def test_skein(data: bytes, version: int, size: Optional[int]) -> None: for digest_bits in range(8, version+1, 8): hash_fn = f"skein{version}-{digest_bits}" - _test(hash_fn, data, skein_digest(data, version, digest_bits)) + _test(hash_fn, data, skein_digest(data, version, digest_bits, size), size) + +@pytest.mark.parametrize("digest_bits", (224, 256, 384, 512)) +@pytest.mark.parametrize("data", data_samples) +@pytest.mark.parametrize("size", (None, 16, 28)) +def test_keccak(data: bytes, digest_bits: int, size: Optional[int]) -> None: + hash_fn = f"keccak-{digest_bits}" + _test(hash_fn, data, keccak_digest(data, digest_bits, size), size) -with importlib_resources.open_text("test", "multihash-test-vectors.csv") as csv_table: - multihash_test_vectors = list(csv.DictReader(csv_table)) +@pytest.mark.parametrize("version", ("32", "x64")) +@pytest.mark.parametrize("data", data_samples) +@pytest.mark.parametrize("size", (None, 4)) +def test_murmur3(data: bytes, version: str, size: Optional[int]) -> None: + if version == "32": + hash_fn = f"murmur3-{version}" + _test(hash_fn, data, murmur3_digest(data, version, 32, size), size) + else: + for digest_bits in (64, 128): + hash_fn = f"murmur3-{version}-{digest_bits}" + _test(hash_fn, data, murmur3_digest(data, version, digest_bits, size), size) -# with open("multihash_test_vectors.csv") as f: -# multihash_test_vectors = list(csv.DictReader(f)) +@pytest.mark.parametrize("data", data_samples) +@pytest.mark.parametrize("size", (None, 8, 16)) +def test_md5(data: bytes, size: Optional[int]) -> None: + hash_fn = 'md5' + _test(hash_fn, data, md5_digest(data, size), size) -@pytest.mark.parametrize("test_vector", multihash_test_vectors) -def test_vectors(test_vector: Dict[str, str]) -> None: 
+@pytest.mark.parametrize("digest_bits", (160, )) +@pytest.mark.parametrize("data", data_samples) +@pytest.mark.parametrize("size", (None, 16, 20)) +def test_ripemd(data: bytes, digest_bits: int, size: Optional[int]) -> None: + hash_fn = f"ripemd-{digest_bits}" + _test(hash_fn, data, ripemd_digest(data, digest_bits, size), size) + +@pytest.mark.parametrize("data", data_samples) +@pytest.mark.parametrize("size", (16, 32, 64)) +def test_kangarootwelve(data: bytes, size: int) -> None: + hash_fn = "kangarootwelve" + _test(hash_fn, data, kangarootwelve_digest(data, size), size) + +@pytest.mark.parametrize("data", data_samples) +@pytest.mark.parametrize("size", (None, 16, 32)) +def test_sha_256_trunc254_padded(data: bytes, size: Optional[int]) -> None: + hash_fn = "sha2-256-trunc254-padded" + _test(hash_fn, data, sha2_256_trunc254_padded_digest(data, size), size) + +# specific test vectors + +with importlib_resources.open_text("test", "multihash-test-str-vectors.csv") as csv_table: + multihash_test_str_vectors = list(csv.DictReader(csv_table)) + +@pytest.mark.parametrize("test_vector", multihash_test_str_vectors) +def test_str_vectors(test_vector: Dict[str, str]) -> None: hash_fn = test_vector["algorithm"] digest_size = int(test_vector["bits"])//8 data = test_vector["input"].encode("utf-8") multihash_digest = bytes.fromhex(test_vector["multihash"]) assert hash_fn == multihash.from_digest(multihash_digest).name assert digest(data, hash_fn, size=digest_size) == multihash_digest + +with importlib_resources.open_text("test", "multihash-test-hex-vectors.csv") as csv_table: + multihash_test_hex_vectors = list(csv.DictReader(csv_table)) + +@pytest.mark.parametrize("test_vector", multihash_test_hex_vectors) +def test_hex_vectors(test_vector: Dict[str, str]) -> None: + hash_fn = test_vector["algorithm"] + digest_size = int(test_vector["bits"])//8 + data = bytes.fromhex(test_vector["input"]) + multihash_digest = bytes.fromhex(test_vector["multihash"]) + assert hash_fn == 
multihash.from_digest(multihash_digest).name + assert digest(data, hash_fn, size=digest_size) == multihash_digest diff --git a/tox.ini b/tox.ini index ace7042..2691c31 100644 --- a/tox.ini +++ b/tox.ini @@ -10,6 +10,11 @@ deps = pylint pytest pytest-cov + pysha3 + blake3 + pyskein + mmh3 + pycryptodomex setenv = PYTHONPATH = {toxinidir} diff --git a/update-multibase-table.py b/update-multibase-table.py index e8a35d7..5a993a9 100644 --- a/update-multibase-table.py +++ b/update-multibase-table.py @@ -31,7 +31,7 @@ # Loads and validates the current multibase table: print("Building current multibase table...") -with open("multiformats/multibase/multibase-table.csv", "r") as f: +with open("multiformats/multibase/multibase-table.csv", "r", encoding="utf8") as f: current_text = f.read() reader = csv.DictReader(io.StringIO(current_text)) clean_rows = ({k.strip(): v.strip() for k, v in row.items()} for row in reader) @@ -92,15 +92,10 @@ if added or removed or changed: answer = input("Would you like to update the multibase table? (y/n) ") if answer.lower().startswith("y"): - with open("multiformats/multibase/multibase-table.csv", "w") as f: + with open("multiformats/multibase/multibase-table.csv", "w", encoding="utf8") as f: f.write(new_text) - with open("multiformats/multibase/multibase-table.json", "w") as f: + with open("multiformats/multibase/multibase-table.json", "w", encoding="utf8") as f: table = [new_table[code].to_json() for code in sorted(new_table.keys())] json.dump(table, f, indent=4) else: print("Nothing to update, exiting.") - - -with open("multiformats/multibase/multibase-table.json", "w") as f: - table = [new_table[code].to_json() for code in sorted(new_table.keys())] - json.dump(table, f, indent=4)