diff --git a/LICENSE.txt b/LICENSE similarity index 100% rename from LICENSE.txt rename to LICENSE diff --git a/README.md b/README.md index 110b7af..4afa889 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,10 @@ # Balsa (wsp-balsa) +[![Conda Latest Release](https://anaconda.org/wsp_sap/wsp-balsa/badges/version.svg)](https://anaconda.org/wsp_sap/wsp-balsa) +[![Conda Last Updated](https://anaconda.org/wsp_sap/wsp-balsa/badges/latest_release_date.svg)](https://anaconda.org/wsp_sap/wsp-balsa) +[![Platforms](https://anaconda.org/wsp_sap/wsp-balsa/badges/platforms.svg)](https://anaconda.org/wsp_sap/wsp-balsa) +[![License](https://anaconda.org/wsp_sap/wsp-balsa/badges/license.svg)](https://github.com/wsp-sag/balsa/blob/master/LICENSE) + Balsa is a collection of functions and tools for Python to facilitate travel demand forecasting applications and analyses. It is designed to work the the “scientific stack” of Python, namely NumPy, Pandas, and Matplotlib; which are optimized for speed and usability. Most of balsa consists of standalone functions - for input/output, for analysis, etc. - as well as a few lightweight class-based data structures for specific applications. Balsa is owned and published by WSP Canada's Systems Analytics for Policy group. @@ -13,26 +18,28 @@ Balsa is owned and published by WSP Canada's Systems Analytics for Policy group. - Management of JSON configuration files, including comments. - and more! -Balsa is compatible with Python 2.7 and 3.5+ +Balsa is compatible with Python 3.5+ -## Current Status +## Installation -*Version 1.0* is the latest release of balsa. +> **For TRESO users:** TRESO is only compatible with the [`v0.6.1`](https://github.com/wsp-sag/balsa/releases/tag/v0.6.1) release of Balsa, which can only be installed directly from GitHub using `pip`. -**For TRESO users:** TRESO is only compatible with the [`v0.5.0`](https://github.com/wsp-sag/balsa/tree/v0.5.0) release of balsa. 
+### With `conda` -## Installation +Balsa can be installed with conda by running the following command: + +```batch +conda install -c wsp_sap wsp-balsa +``` ### With `pip` -As a private package, Balsa **is not hosted on PyPI or other services that do not permit private code**. Currently the best way to install Balsa is using `pip` to install directly from GitHub: +Balsa can be installed directly from GitHub using `pip` by running the following command: ```batch pip install git+https://github.com/wsp-sag/balsa.git ``` -Git will prompt you to login to your account (also works with 2FA) before installing. This requires you to download and install a [standalone Git client](https://git-scm.com/downloads) to communicate with GitHub. - > **Windows Users:** It is recommended to install Balsa from inside an activated Conda environment. Balsa uses several packages (NumPy, Pandas, etc.) that will otherwise not install correctly from `pip` otherwise. For example: ```batch @@ -42,20 +49,6 @@ C:\> conda activate base ... ``` -### With `conda` - -Balsa can be installed with Conda, but requires you to install it from a local Conda channel. This can be done by using [conda-build](https://github.com/conda/conda-build), which will create a Conda package for Balsa (that has been cloned from GitHub onto your machine) and set up a local Conda channel (i.e. `conda-bld`) in your Conda installation folder. conda-build must be installed in your base Conda environment. Once the Conda package is built, you can install it to your Conda environment of choice using `conda install`. - -The following code block provides the commands to install Balsa using Conda. - -```batch -(base) C:\> conda build "/conda_recipe" - -... - -(base) C:\> conda install -c "/conda-bld" wsp-balsa -``` - ## Documentation HTML documentation is available upon request. 
diff --git a/balsa/__init__.py b/balsa/__init__.py index 14cd162..d3683f8 100644 --- a/balsa/__init__.py +++ b/balsa/__init__.py @@ -1,2 +1,2 @@ from .routines import * -from .configuration import Config, ConfigParseError, ConfigSpecificationError, ConfigTypeError +from .version import __version__ diff --git a/balsa/configuration.py b/balsa/configuration.py deleted file mode 100644 index 58a2b31..0000000 --- a/balsa/configuration.py +++ /dev/null @@ -1,400 +0,0 @@ -from __future__ import division, absolute_import, print_function, unicode_literals - -import json -import os -from collections import OrderedDict -from six import iteritems, StringIO -import re - -try: - from pathlib import Path - PATHLIB_LOADED = True -except ImportError: - PATHLIB_LOADED = False - -from .routines.general import is_identifier -from .routines.io.common import open_file - - -class ConfigParseError(IOError): - pass - - -class ConfigSpecificationError(AttributeError): - pass - - -class ConfigTypeError(ValueError): - pass - - -class ConfigValue(object): - """ - Wraps the value of a Config attribute to facilitate type-checking and pretty error messages. - """ - - def __init__(self, value, name, owner=None): - self.value = value - self._name = str(name) - self._owner = owner - - def __str__(self): return str(self.value) - - def __repr__(self): return "ConfigValue(%r)" % self.value - - @property - def namespace(self): - """ Dot-separated name of this config value""" - if self._owner is not None: - return self._owner.namespace + '.' + self._name - return self._name - - def as_type(self, type_): - """ - Attempts to cast the value to a specified type. - - Args: - type_ (type): The type (e.g. int, float, etc.) to try to cast - - Returns: - The value cast as type - - Raises: - ConfigTypeError: if the casting could not be performed. 
- - """ - - try: - return type_(self.value) - except (ValueError, TypeError): - - message = "Attribute <{}> = '{}' could not be converted to {}".format( - self.namespace, self.value, type_ - ) - raise ConfigTypeError(message) - - def as_bool(self): - """ - Resolves the value to bool - - Raises: - ConfigTypeError: If the value cannot be resolved to bool - """ - return self.as_type(bool) - - def as_int(self): - """ - Resolves the value to int - - Raises: - ConfigTypeError: If the value cannot be resolved to int - """ - return self.as_type(int) - - def as_float(self): - """ - Resolves the value to float - - Raises: - ConfigTypeError: If the value cannot be resolved to float - """ - return self.as_type(float) - - def as_str(self): - """ - Resolves the value to str - - Raises: - ConfigTypeError: If the value cannot be resolved to str - """ - return self.as_type(str) - - def as_list(self, sub_type=None): - """ - Resolves the value to a list. - - Args: - sub_type (type): Optional. Specifies the expected contiguous (uniform) type of the list to convert to. - - Returns: - list: The value, as a list - - """ - if sub_type is None: - return self.as_type(list) - - return [ - item.as_type(sub_type) - for item in self.as_type(list) - ] - - if PATHLIB_LOADED: - def as_path(self, parent=None): - """ - Resolves the value to Path type (available only when using Python 3) - - Args: - parent: Optional parent folder if this is a relative path - - Raises: - ConfigTypeError: If the value cannot be resolved to Path - """ - if parent is not None: return Path(parent) / Path(self.as_str()) - return Path(self.as_str()) - - def as_set(self, sub_type=None): - """ - Converts the value to a set. - - Args: - sub_type (type): Optional. Specifies the expected contiguous (uniform) type of the set to convert to. 
- - Returns: - set: The value, as a set - - """ - if sub_type is None: return self.as_type(set) - - return { - item.as_type(sub_type) - for item in self.as_type(set) - } - - def serialize(self): - if isinstance(self.value, list): - return [x.serialize() for x in self.value] - return self.value - - -class Config(object): - """ - Represents a model configuration, usually stored in JSON format with the order of items preserved and comments - (beginning with '//') stripped out. Keys in the JSON file which conform to Python variable names (e.g. - "my_attribute" but not "My Attribute") become *attributes* of the Config object (e.g. instance.my_attribute). - - Value attributes (e.g. ``value`` in ``{"key": value}``) are stored as ConfigValue objects to facilitate type - conversion and checking. So to access the raw value, write "instance.my_attribute.value" or, to convert it to a - specified type, write ``instance.my_attribute.as_bool()``. - - This all facilitates "pretty" error message generation, to provide the end-user with as much information about the - source of an error as these are common when specifying a model. - - A `Config` can be constructed from three static methods: - - - ``from_file()`` to construct from a JSON file on-disk - - ``from_string()`` to construct from a JSON-formatted string in-memory - - ``from_dict()`` to construct from a dictionary in-memory - - Note: - - Config implements ``__contains__`` for testing if a name is 'in' the set of attributes. - - To use ``__getitem__``, ``__setitem__`` (like a Dictionary), use the ``as_dict()`` method to convert to a dictionary representation. This also exposes dictionary iteration methods. 
- """ - - def __init__(self, config_dict, name=None, parent=None, file_=None): - self._contents = {} - self._name = name - self._parent = parent - self._file = file_ - - for key, original_value in iteritems(config_dict): - if isinstance(original_value, dict): - value = Config(original_value, name=key, parent=self, file_=file_) - elif isinstance(original_value, (list, set)): - value_list = [] - for (i, item) in enumerate(original_value): - if isinstance(item, dict): - value_list.append(Config(item, name=key + "[%s]" % i, parent=self, file_=file_)) - else: - value_list.append(ConfigValue(item, key + "[%s]" % i, owner=self)) - value = ConfigValue(value_list, key, owner=self) - elif original_value is None: - value = None - else: - value = ConfigValue(original_value, key, owner=self) - - if is_identifier(key): - try: - setattr(self, key, value) - except AttributeError: - print("WARNING: Config key '%s' conflicts with reserved properties" % key) - self._contents[key] = value - - @property - def name(self): - """Short name of each part of the config. 
For non-root Configs, this will be the name of the attribute used - to access this Config from the parent.""" - return self._name - - @property - def parent(self): - """Pointer to the parent of non-root Configs.""" - return self._parent - - @property - def namespace(self): - """The dot-separated namespace of this part of the full Config.""" - name = self._name if self._name is not None else '' - if self._parent is None: - return name - return '.'.join([self._parent.namespace, name]) - - def __str__(self): - if self._parent is None: - return "Config @%s" % self._file - - return "Config(%s) @%s" % (self.namespace, self._file) - - def __getattr__(self, item): - raise ConfigSpecificationError("Item '%s' is missing from config <%s>" % (item, self.namespace)) - - def __contains__(self, item): return item in self._contents - - def __getitem__(self, item): - if item not in self: - raise ConfigSpecificationError("Item '%s' is missing from config <%s>" % (item, self.namespace)) - return self._contents[item] - - def as_dict(self, key_type=None, value_type=None): - """ - Converts this entry to a primitive dictionary, using specified types for the keys and values. - - Args: - key_type (type, optional): Defaults to ``None``. The type to which the keys will be cast, or None to ignore - casting. - value_type (type, optional): Defaults to ``None``. The type to which the values will be cast, or None to - ignore casting. 
- - Returns: - dict: A dictionary containing the entry's keys and values - - """ - - if key_type is None and value_type is None: - return self._contents.copy() - - def any_type(val): return val - - if key_type is None: key_type = any_type - if value_type is None: value_type = any_type - - retval = OrderedDict() - for key, val in iteritems(self._contents): - try: - key = key_type(key) - except ValueError: - message = "Key <{}> = '{}' could not be converted to {}".format( - self.namespace, key, key_type - ) - raise ConfigTypeError(message) - - try: - val = val.as_type(value_type) - except ValueError: - message = "Value <{}.{}> = '{}' could not be converted to {}".format( - self.namespace, key, val, key_type - ) - raise ConfigTypeError(message) - retval[key] = val - return retval - - def serialize(self): - """Recursively converts the Config back to primitive dictionaries""" - child_dict = OrderedDict() - for attr, item in iteritems(self._contents): - child_dict[attr] = item.serialize() - return child_dict - - def to_file(self, fp): - """ - Writes the Config to a JSON file. - - Args: - fp (str): File path to the output files - - """ - dict_ = self.serialize() - with open_file(fp, mode='w') as writer: - json.dump(dict_, writer, indent=2) - - @classmethod - def from_file(cls, fp): - """ - Reads a Config from a JSON file. Comments beginning with '//' are ignored. - - Args: - fp (str): The path to the JSON file - - Returns: - Config: The Config object representing the JSON file. 
- - Raises: - ConfigParseError: if there's a problem parsing the JSON file - - """ - with open_file(fp, mode='r') as reader: - try: - dict_ = json.loads(cls._parse_comments(reader), object_pairs_hook=OrderedDict) - except ValueError as ve: - # If there's an error reading the JSON file, re-raise it as a ConfigParseError for clarity - raise ConfigParseError(str(ve)) - - root_name = os.path.splitext(os.path.basename(fp))[0] - return Config(dict_, name=root_name, file_=fp) - - @classmethod - def from_string(cls, s, file_name='', root_name=''): - """ - Reads a Config from a JSON file as a string. Comments beginning with '//' are ignored. - - Args: - s (str): The string containing the Config data, in JSON format. - file_name (str): Optional 'file' name for display purposes. - root_name (str): Optional root name for display purposes. - - Returns: - Config: - The Config object representing the JSON file. - - Raises: - ConfigParseError: if there's a problem parsing the JSON file - - """ - sio = StringIO(s) - try: - dict_ = json.loads(cls._parse_comments(sio), object_pairs_hook=OrderedDict) - except ValueError as ve: - raise ConfigParseError(str(ve)) - - return Config(dict_, name=root_name, file_=file_name) - - @staticmethod - def from_dict(dict_, file_name='', root_name=''): - """ - Converts a raw dictionary to a Config object. 
- - Args: - dict_ (dict): The dictionary to create a Config from - file_name: - root_name: - - Returns: - Config - - """ - return Config(dict_, name=root_name, file_=file_name) - - @staticmethod - def _parse_comments(reader): - """Removes comments beginning with '//' from the stream""" - regex = r'\s*(#|\/{2}).*$' - regex_inline = r'(:?(?:\s)*([A-Za-z\d\.{}]*)|((?<=\").*\"),?)(?:\s)*(((#|(\/{2})).*)|)$' - - pipe = [] - for line in reader: - if re.search(regex, line): - if re.search(r'^' + regex, line, re.IGNORECASE): continue - elif re.search(regex_inline, line): - pipe.append(re.sub(regex_inline, r'\1', line)) - else: - pipe.append(line) - return "\n".join(pipe) diff --git a/balsa/configuration.pyi b/balsa/configuration.pyi deleted file mode 100644 index d58d85a..0000000 --- a/balsa/configuration.pyi +++ /dev/null @@ -1,86 +0,0 @@ -from io import FileIO -from typing import Union, Optional, Dict, Any, List -from six import string_types - -try: - from pathlib import Path - PATHLIB_LOADED = True - file_types = Union[string_types, Path, FileIO] -except ImportError: - Path = None - PATHLIB_LOADED = False - file_types = Union[string_types, FileIO] - - -class ConfigParseError(IOError): - pass - - -class ConfigSpecificationError(AttributeError): - pass - - -class ConfigTypeError(ValueError): - pass - - -class ConfigValue: - - value: Union[str, List[Union[str, ConfigValue]]] - _name: str - _owner: Config - - def namespace(self) -> str: - pass - - def as_type(self, type_): pass - - def as_bool(self) -> bool: pass - - def as_int(self) -> int: pass - - def as_float(self) -> float: pass - - def as_str(self) -> str: pass - - def as_list(self, sub_type=None) -> list: pass - - if PATHLIB_LOADED: - def as_path(self, parent: Optional[Path]=None) -> Path: - pass - - def as_set(self, sub_type=None) -> set: pass - - def serialize(self) -> Union[list, Any]: pass - - -class Config: - - _contents: dict - _name: str - _parent = Optional['Config'] - _file = file_types - - def name(self) -> 
string_types: pass - - def parent(self) -> Optional['Config']: pass - - def namespace(self) -> string_types: pass - - def as_dict(self, key_type: type=None, value_type: type=None) -> dict: pass - - def serialize(self) -> string_types: pass - - def to_file(self, fp: file_types): pass - - @classmethod - def from_file(cls, fp: file_types) -> 'Config': pass - - @classmethod - def from_string(cls, s: string_types, file_name: string_types, root_name: string_types) -> 'Config': pass - - @staticmethod - def from_dict(dict_: Dict[string_types, Any], file_name: string_types, root_name: string_types) -> 'Config': pass - - @staticmethod - def _parse_comments(reader): pass diff --git a/balsa/logging.py b/balsa/logging.py index 8c3ed69..d0eec9f 100644 --- a/balsa/logging.py +++ b/balsa/logging.py @@ -1,10 +1,7 @@ -from __future__ import division, absolute_import, print_function, unicode_literals - import logging import sys from contextlib import contextmanager import traceback as tb -import six from json import dumps as json_to_str from enum import Enum @@ -51,7 +48,7 @@ def make_formatter(item): return logging.Formatter(item) if isinstance(item, str) else item self._default = make_formatter(default_format) - self._formats = {lvl: make_formatter(f) for lvl, f in six.iteritems(level_formats)} + self._formats = {lvl: make_formatter(f) for lvl, f in level_formats.items()} def format(self, record): level = record.levelno @@ -72,9 +69,7 @@ def format(self, record): class ModelLogger(logging.Logger): - """ - Extends the standard Python Logger object, adding additional logging statements such as ``.report()``. - """ + """Extends the standard Python Logger object, adding additional logging statements such as ``.report()``""" def report(self, msg, *args, **kwargs): """Report useful model statistics or results to the user. 
Distinct from ``.info()`` which provides status diff --git a/balsa/routines/__init__.py b/balsa/routines/__init__.py index fa2b6b0..5554224 100644 --- a/balsa/routines/__init__.py +++ b/balsa/routines/__init__.py @@ -2,7 +2,7 @@ from .io import * from .matrices import (fast_unstack, fast_stack, aggregate_matrix, matrix_balancing_1d, matrix_balancing_2d, matrix_bucket_rounding, split_zone_in_matrix, disaggregate_matrix) -from .modelling import (tlfd, distance_array, distance_matrix) +from .modelling import tlfd, distance_array, distance_matrix try: from .plotting import trumpet_diagram, convergence_boxplot, location_summary diff --git a/balsa/routines/best_intermediates.py b/balsa/routines/best_intermediates.py index c7ef536..1109a50 100644 --- a/balsa/routines/best_intermediates.py +++ b/balsa/routines/best_intermediates.py @@ -33,9 +33,11 @@ def _update_heap(utilities: ndarray, zones: ndarray, new_u: float, new_zone: int top = len(utilities) while i < top: current_u = utilities[i] - if new_u < current_u: break + if new_u < current_u: + break i += 1 - if i <= 0: return + if i <= 0: + return for j in range(i - 1): utilities[j] = utilities[j + 1] zones[j] = zones[j + 1] @@ -71,7 +73,8 @@ def _nbf_twopart_worker(access_utils: ndarray, egress_utils: ndarray, result_uti # In general, for problems where (n_origins * n_destinations) >> k, most values will not be in the top k. 
# So quickly check against the lowest utility in the heap to avoid calling the updater func - if interim_util < util_heap[0] or interim_util == _NEG_INF: continue + if interim_util < util_heap[0] or interim_util == _NEG_INF: + continue _update_heap(util_heap, zones_heap, interim_util, interim_zone) result_utils[origin_zone, destination_zone, :] = util_heap @@ -100,8 +103,7 @@ def best_intermediate_zones(access_table: DataFrame, egress_table: DataFrame, co intermediate_name: str = "intermediate_zone", maximize=True, availability_column: str = "available", null_index=0 ) -> Union[DataFrame, Dict[int, DataFrame]]: - """ - Numba-accelerated. + """Numba-accelerated. Triple-index operation for two matrices, finding the most- or least-cost intermediate zones. Takes an access matrix of the shape (O, I) and an egress matrix of the shape (I, D) to produce a combined matrix of the shape (O, D), with @@ -116,7 +118,7 @@ def best_intermediate_zones(access_table: DataFrame, egress_table: DataFrame, co When constructing the result tables, columns in the access and egress tables are "carried forward" such that the results columns will be the union of columns in the input tables. Columns in one table only will be carried forward - unmodified and retain their dtype. Columns in both tables will be added together, and thus MUST be numeric. + unmodified and retain their data type. Columns in both tables will be added together, and thus MUST be numeric. In the specified cost column, a value of `-inf` (or `inf` when minimizing) is respected as the sentinel value for unavailable. (O, I) or (I, D) interchanges with this sentinel value will not be considered. @@ -147,12 +149,12 @@ def best_intermediate_zones(access_table: DataFrame, egress_table: DataFrame, co Dict[int, DataFrame]: If k > 1. The keys represent the ranks, so result[1] is the best intermediate zone, result[2] is the second-best, etc. 
The value DataFrames are in the same format as if k == 1, just with different intermediate zones chosen. - """ # Check inputs k = max(1, k) - if n_threads is None: n_threads = cpu_count() + if n_threads is None: + n_threads = cpu_count() origins, intermediates, destinations = _validate_access_egress_tables(access_table, egress_table) n_origins, n_intermediate, n_destinations = len(origins), len(intermediates), len(destinations) @@ -176,8 +178,10 @@ def best_intermediate_zones(access_table: DataFrame, egress_table: DataFrame, co ]) for start, stop in breaks ] - for t in threads: t.start() - for t in threads: t.join() + for t in threads: + t.start() + for t in threads: + t.join() # Construct composite result tables if other_columns: diff --git a/balsa/routines/general.py b/balsa/routines/general.py index 8dd7386..a3eb0a8 100644 --- a/balsa/routines/general.py +++ b/balsa/routines/general.py @@ -1,16 +1,10 @@ -from __future__ import division, absolute_import, print_function, unicode_literals - from keyword import kwlist - from pandas import DataFrame, Series, Index, MultiIndex -from six import iteritems -import six -import re -import tokenize - +from typing import Union, List, Dict, Iterable -def reindex_series(series, target_series, source_levels=None, target_levels=None, fill_value=None): +def reindex_series(series: Series, target_series: Series, source_levels: List[int] = None, + target_levels: List[int] = None, fill_value: Union[int, float] = None) -> Series: # Make shallow copies of the source and target series in case their indexes need to be changed series = series.copy(deep=False) target_series = target_series.copy(deep=False) @@ -29,9 +23,8 @@ def reindex_series(series, target_series, source_levels=None, target_levels=None return reindexed -def align_categories(iterable): - """ - Pre-processing step for ``pd.concat()`` which attempts to align any Categorical series in the sequence to using +def align_categories(iterable: Union[Series, DataFrame]): + 
"""Pre-processing step for ``pd.concat()`` which attempts to align any Categorical series in the sequence to using the same set of categories. It passes through the sequence twice: once to accumulate the complete set of all categories used in the sequence; and a second time to modify the sequence's contents to use this full set. The contents of the sequence are modified in-place. @@ -42,14 +35,16 @@ def align_categories(iterable): Args: iterable (Union[pandas.Series, pandas.DataFrame]): Any iterable of Series or DataFrame objects (anything that is acceptable to ``pandas.concat()``) - """ iterable_type = None for item in iterable: if iterable_type is None: - if isinstance(item, DataFrame): iterable_type = DataFrame - elif isinstance(item, Series): iterable_type = Series - else: raise TypeError(type(item)) + if isinstance(item, DataFrame): + iterable_type = DataFrame + elif isinstance(item, Series): + iterable_type = Series + else: + raise TypeError(type(item)) else: assert isinstance(item, iterable_type) @@ -59,10 +54,8 @@ def align_categories(iterable): column_categories = _enumerate_frame_categories(iterable) _align_frame_categories(iterable, column_categories) - return - -def _align_series_categories(series_list): +def _align_series_categories(series_list: Series): all_categories = set() for series in series_list: if not hasattr(series, 'cat'): @@ -77,11 +70,12 @@ def _align_series_categories(series_list): series.cat.reorder_categories(sorted_categories, inplace=True) -def _enumerate_frame_categories(frames): +def _enumerate_frame_categories(frames: DataFrame) -> Dict[str, set]: column_categories = {} for frame in frames: for col_name, series in frame.items(): - if not hasattr(series, 'cat'): continue + if not hasattr(series, 'cat'): + continue categories = set(series.cat.categories) if col_name not in column_categories: @@ -91,11 +85,12 @@ def _enumerate_frame_categories(frames): return column_categories -def _align_frame_categories(frames, column_categories): - 
for col_name, all_categories in iteritems(column_categories): +def _align_frame_categories(frames: DataFrame, column_categories: Dict[str, set]): + for col_name, all_categories in column_categories.items(): sorted_categories = sorted(all_categories) for frame in frames: - if col_name not in frame: continue + if col_name not in frame: + continue s = frame[col_name] missing_categories = all_categories.difference(s.cat.categories) if missing_categories: @@ -103,19 +98,17 @@ def _align_frame_categories(frames, column_categories): s.cat.reorder_categories(sorted_categories, inplace=True) -def sum_df_sequence(seq, fill_value=0): - """ - Sums over a sequence of DataFrames, even if they have different indexes or columns, filling in 0 (or a value of your - choice) for missing rows or columns. Useful when you have a sequence of DataFrames which are supposed to have +def sum_df_sequence(seq: Iterable[DataFrame], fill_value: Union[int, float] = 0) -> DataFrame: + """Sums over a sequence of DataFrames, even if they have different indexes or columns, filling in 0 (or a value of + your choice) for missing rows or columns. Useful when you have a sequence of DataFrames which are supposed to have the same indexes and columns but might be missing a few values. Args: seq (Iterable[pandas.DataFrame]): Any iterable of DataFrame type, ordered or unordered. - fill_value: Defaults to ``0``. The value to use for missing cells. Preferably a number to avoid errors. + fill_value (Union[int, float], optional): Defaults to ``0``. The value to use for missing cells. Returns: pandas.DataFrame: The sum over all items in seq. 
- """ common_index = Index([]) common_columns = Index([]) @@ -134,32 +127,14 @@ def sum_df_sequence(seq, fill_value=0): return accumulator -if six.PY3: - def is_identifier(name): - """ - Tests that the name is a valid Python variable name and does not collide with reserved keywords +def is_identifier(name: str) -> bool: + """Tests that the name is a valid Python variable name and does not collide with reserved keywords - Args: - name (str): Name to test - - Returns: - bool: If the name is 'Pythonic' - - """ - - return name.isidentifier() and name not in kwlist -else: - def is_identifier(name): - """ - Tests that the name is a valid Python variable name and does not collide with reserved keywords - - Args: - name (str): Name to test - - Returns: - bool: If the name is 'Pythonic' - - """ + Args: + name (str): Name to test - return bool(re.match(tokenize.Name + '$', name)) and name not in kwlist + Returns: + bool: If the name is 'Pythonic' + """ + return name.isidentifier() and name not in kwlist diff --git a/balsa/routines/general.pyi b/balsa/routines/general.pyi deleted file mode 100644 index 884e7f9..0000000 --- a/balsa/routines/general.pyi +++ /dev/null @@ -1,15 +0,0 @@ -import pandas as pd -from typing import Union, Iterable - - -def reindex_series(series, target_series, source_levels=None, target_levels=None, fill_value=None): - pass - -def align_categories(iterable: Iterable[Union[pd.Series, pd.DataFrame]]) -> None: - pass - -def sum_df_sequence(seq: Iterable[pd.DataFrame], fill_value: Union[int, float]=0) -> pd.DataFrame: - pass - -def is_identifier(name: str) -> bool: - pass diff --git a/balsa/routines/io/common.py b/balsa/routines/io/common.py index dbf617a..2a93b0d 100644 --- a/balsa/routines/io/common.py +++ b/balsa/routines/io/common.py @@ -1,27 +1,22 @@ +from contextlib import contextmanager +from io import FileIO import numpy as np import pandas as pd -from contextlib import contextmanager - -from six import string_types - -try: - from pathlib import 
Path -except ImportError: - Path = None +from pathlib import Path +from typing import Union -def coerce_matrix(matrix, allow_raw=True, force_square=True): - """ - Infers a NumPy array from given input +def coerce_matrix(matrix: Union[np.ndarray, pd.DataFrame, pd.Series], allow_raw: bool = True, + force_square: bool = True) -> np.ndarray: + """Infers a NumPy array from given input Args: - matrix: + matrix (Union[numpy.ndarray, pandas.DataFrame, pandas.Series]): allow_raw (bool, optional): Defaults to ``True``. force_square (bool, optional): Defaults to ``True``. Returns: - numpy.ndarray: - A 2D ndarray of type float32 + numpy.ndarray: A 2D ndarray of type float32 """ if isinstance(matrix, pd.DataFrame): if force_square: @@ -46,9 +41,8 @@ def coerce_matrix(matrix, allow_raw=True, force_square=True): return matrix -def expand_array(a, n, axis=None): - """ - Expands an array across all dimensions by a set amount +def expand_array(a: np.ndarray, n: np.ndarray, axis: int = None) -> np.ndarray: + """Expands an array across all dimensions by a set amount Args: a (numpy.ndarray): The array to expand @@ -56,11 +50,11 @@ def expand_array(a, n, axis=None): axis (int, optional): Defaults to ``None``. The axis to expand along, or None to expand along all axes. Returns: - numpy.ndarray: - The expanded array + numpy.ndarray: The expanded array """ - if axis is None: new_shape = [dim + n for dim in a.shape] + if axis is None: + new_shape = [dim + n for dim in a.shape] else: new_shape = [] for i, dim in enumerate(a.shape): @@ -76,22 +70,20 @@ def expand_array(a, n, axis=None): @contextmanager -def open_file(file_handle, **kwargs): - """ - Context manager for opening files provided as several different types. Supports a file handler as a str, unicode, +def open_file(file_handle: Union[str, Path, FileIO], **kwargs): + """Context manager for opening files provided as several different types. Supports a file handler as a str, unicode, ``pathlib.Path``, or an already-opened handler. 
Args: - file_handle (Union[str, unicode, Path, File]): The item to be opened or is already open. + file_handle (Union[str, Path, FileIO]): The item to be opened or is already open. **kwargs: Keyword args passed to ``open()``. Usually mode='w'. Yields: File: The opened file handler. Automatically closed once out of context. - """ opened = False - if isinstance(file_handle, string_types): + if isinstance(file_handle, str): f = open(file_handle, **kwargs) opened = True elif Path is not None and isinstance(file_handle, Path): diff --git a/balsa/routines/io/common.pyi b/balsa/routines/io/common.pyi deleted file mode 100644 index 95824d5..0000000 --- a/balsa/routines/io/common.pyi +++ /dev/null @@ -1,17 +0,0 @@ -from typing import Union -from io import FileIO - -import numpy as np - -try: - from pathlib import Path - file_type = Union[str, FileIO, Path] -except ImportError: - Path = None - file_type = Union[str, FileIO] - -def coerce_matrix(matrix, allow_raw=True, force_square=True): pass - -def expand_array(a: np.ndarray, n: int, axis: int=None) -> np.ndarray: pass - -def open_file(file_handle, **kwargs): pass diff --git a/balsa/routines/io/fortran.py b/balsa/routines/io/fortran.py index ebf0661..bf6fd0e 100644 --- a/balsa/routines/io/fortran.py +++ b/balsa/routines/io/fortran.py @@ -1,5 +1,8 @@ -import pandas as pd +from io import FileIO import numpy as np +import pandas as pd +from pathlib import Path +from typing import Union, Iterable from .common import coerce_matrix, open_file, expand_array @@ -11,21 +14,23 @@ def _infer_fortran_zones(n_words): return n -def read_fortran_rectangle(file, n_columns, zones=None, tall=False, reindex_rows=False, fill_value=None): - """ - Reads a FORTRAN-friendly .bin file (a.k.a. 'simple binary format') which is known to NOT be square. Also works with - square matrices. 
+def read_fortran_rectangle(file: Union[str, FileIO, Path], n_columns: int, + zones: Union[int, Iterable[int], pd.Index] = None, tall: bool = False, + reindex_rows: bool = False, fill_value: Union[int, float] = None + ) -> Union[np.ndarray, pd.DataFrame, pd.Series]: + """Reads a FORTRAN-friendly .bin file (a.k.a. 'simple binary format') which is known to NOT be square. Also works + with square matrices. - This file format is an array of 4-bytes, where each row is prefaced by an integer referring to the 1-based positional - index that FORTRAN uses. The rest of the data are in 4-byte floats. To read this, the number of columns present - must be known, since the format does not self-specify. + This file format is an array of 4-bytes, where each row is prefaced by an integer referring to the 1-based + positional index that FORTRAN uses. The rest of the data are in 4-byte floats. To read this, the number of columns + present must be known, since the format does not self-specify. Args: - file(Union[str, File, Path]): The file to read. + file(Union[str, FileIO, Path]): The file to read. n_columns (int): The number of columns in the matrix. - zones (Union[int, pandas.Index], optional): Defaults to ``None``. An Index or Iterable will be interpreted as - the zone labels for the matrix rows and columns; returning a DataFrame or Series (depending on `tall`). If - an integer is provided, the returned ndarray will be truncated to this 'number of zones'. + zones (Union[int, Iterable[int], pandas.Index], optional): Defaults to ``None``. An Index or Iterable will be + interpreted as the zone labels for the matrix rows and columns; returning a DataFrame or Series (depending + on `tall`). If an integer is provided, the returned ndarray will be truncated to this 'number of zones'. tall (bool, optional): Defaults to ``False``. If true, a 'tall' version of the matrix will be returned. reindex_rows (bool, optional): Defaults to ``False``. 
If true, and zones is an Index, the returned DataFrame will be reindexed to fill-in any missing rows. @@ -76,26 +81,25 @@ def read_fortran_rectangle(file, n_columns, zones=None, tall=False, reindex_rows return matrix -def read_fortran_square(file, zones=None, tall=False): - """ - Reads a FORTRAN-friendly .bin file (a.k.a. 'simple binary format') which is known to be square. +def read_fortran_square(file: Union[str, FileIO, Path], zones: Union[int, Iterable[int], pd.Index] = None, + tall: bool = False) -> Union[np.ndarray, pd.DataFrame, pd.Series]: + """Reads a FORTRAN-friendly .bin file (a.k.a. 'simple binary format') which is known to be square. This file format is an array of 4-bytes, where each row is prefaced by an integer referring to the 1-based positional index that FORTRAN uses. The rest of the data are in 4-byte floats. To read this, the number of columns present must be known, since the format does not self-specify. This method can infer the shape if it is square. Args: - file (Union[str, File, Path]): The file to read. - zones (Union[pandas.Index, int], optional): Defaults to ``None``. An Index or Iterable will be interpreted as - the zone labels for the matrix rows and columns; returning a DataFrame or Series (depending on ``tall``). - If an integer is provided, the returned ndarray will be truncated to this 'number of zones'. Otherwise, the - returned ndarray will be size to the maximum number of zone dimensioned by the Emmebank. + file (Union[str, FileIO, Path]): The file to read. + zones (Union[int, Iterable[int], pandas.Index], optional): Defaults to ``None``. An Index or Iterable will be + interpreted as the zone labels for the matrix rows and columns; returning a DataFrame or Series (depending + on ``tall``). If an integer is provided, the returned ndarray will be truncated to this 'number of zones'. + Otherwise, the returned ndarray will be sized to the maximum number of zones dimensioned by the Emmebank.
tall (bool, optional): Defaults to ``False``. If True, a 1D data structure will be returned. If ``zone_index`` is provided, a Series will be returned, otherwise a 1D ndarray. Returns: - pandas.DataFrame, pandas.Series or numpy.ndarray - + numpy.ndarray, pandas.DataFrame, or pandas.Series """ with open_file(file, mode='rb') as reader: floats = np.fromfile(reader, dtype=np.float32) @@ -131,20 +135,19 @@ def read_fortran_square(file, zones=None, tall=False): return matrix.stack() if tall else matrix -def to_fortran(matrix, file, n_columns=None, min_index=1, force_square=True): - """ - Reads a FORTRAN-friendly .bin file (a.k.a. 'simple binary format'), in a square format. +def to_fortran(matrix: Union[np.ndarray, pd.DataFrame, pd.Series], file: Union[str, FileIO], n_columns: int = None, + min_index: int = 1, force_square: bool = True): + """Writes a FORTRAN-friendly .bin file (a.k.a. 'simple binary format'), in a square format. Args: matrix (Union[pandas.DataFrame, pandas.Series, numpy.ndarray]): The matrix to write to disk. If a Series is given, it MUST have a MultiIndex with exactly 2 levels to unstack. - file (Union[basestring, File]): The path or file handler to write to. + file (Union[str, FileIO]): The path or file handler to write to. n_columns (int, optional): Defaults to ``None``. Specifies a desired "width" of the matrix file. For example, ``n_columns=4000`` on a 3500x3500 matrix will pad the width with 500 extra columns containing 0. If ``None`` is provided or the value is <= the width of the given matrix, no padding will be performed. min_index (int, optional): Defaults to ``1``. The lowest numbered row. Used when slicing matrices force_square (bool, optional): Defaults to ``True``. 
- """ assert min_index >= 1 array = coerce_matrix(matrix, force_square=force_square) diff --git a/balsa/routines/io/fortran.pyi b/balsa/routines/io/fortran.pyi deleted file mode 100644 index a22f9d5..0000000 --- a/balsa/routines/io/fortran.pyi +++ /dev/null @@ -1,18 +0,0 @@ -from typing import Any, Union - -import pandas as pd -import numpy as np - -from .common import file_type - - -def read_fortran_rectangle(file: file_type, n_columns: int, zones: pd.Index=None, tall: bool=False, - reindex_rows: bool=False, fill_value: Any=None - ) -> Union[pd.Series, pd.DataFrame, np.ndarray]: pass - -def read_fortran_square(file: file_type, zones: pd.Index=None, tall: bool=False - ) -> Union[pd.Series, pd.DataFrame, np.ndarray]: pass - - -def to_fortran(matrix: Union[pd.Series, pd.DataFrame, np.ndarray], file: file_type, n_columns: int=None, - min_index: int=1, forec_square: bool=True): pass \ No newline at end of file diff --git a/balsa/routines/io/inro.py b/balsa/routines/io/inro.py index 779b95a..fc2e5b8 100644 --- a/balsa/routines/io/inro.py +++ b/balsa/routines/io/inro.py @@ -1,23 +1,26 @@ +from io import FileIO import numpy as np import pandas as pd +from pathlib import Path +from typing import Union, List, Iterable from .common import open_file, coerce_matrix -def read_mdf(file, raw=False, tall=False): - """ - Reads Emme's official matrix "binary serialization" format, created using ``inro.emme.matrix.MatrixData.save()``. +def read_mdf(file: Union[str, FileIO, Path], raw: bool = False, tall: bool = False + ) -> Union[np.ndarray, pd.DataFrame, pd.Series]: + """Reads Emme's official matrix "binary serialization" format, created using ``inro.emme.matrix.MatrixData.save()``. There is no official extension for this type of file; '.mdf' is recommended. '.emxd' is also sometimes encountered. Args: - file (Union[str, File, Path]): The file to read. + file (Union[str, FileIO, Path]): The file to read. raw (bool, optional): Defaults to ``False``. 
If ``True``, returns an unlabelled ndarray. Otherwise, a DataFrame will be returned. tall (bool, optional): Defaults to ``False``. If ``True``, a 1D data structure will be returned. If ``raw=False``, a Series will be returned, otherwise a 1D ndarray. + Returns: - numpy.ndarray or pandas.DataFrame: - The matrix stored in the file. + numpy.ndarray, pandas.DataFrame, or pandas.Series: The matrix stored in the file. """ with open_file(file, mode='rb') as file_handler: magic, version, dtype_index, ndim = np.fromfile(file_handler, np.uint32, count=4) @@ -37,11 +40,13 @@ def read_mdf(file, raw=False, tall=False): flat_length = shape.prod() # Multiply the shape tuple matrix = np.fromfile(file_handler, dtype, count=flat_length) - if raw and tall: return matrix + if raw and tall: + return matrix matrix.shape = shape - if raw: return matrix + if raw: + return matrix if ndim == 1: return pd.Series(matrix, index=index_list[0]) @@ -53,15 +58,14 @@ def read_mdf(file, raw=False, tall=False): raise NotImplementedError() # This should never happen -def to_mdf(matrix, file): - """ - Writes a matrix to Emme's official "binary serialization" format, to load using +def to_mdf(matrix: Union[pd.DataFrame, pd.Series], file: Union[str, FileIO, Path]): + """Writes a matrix to Emme's official "binary serialization" format, which can be loaded in Emme using ``inro.emme.matrix.MatrixData.load()``. There is no official extension for this type of file; '.mdf' is recommended. Args: matrix (Union[pandas.DataFrame, panda.Series]): The matrix to write to disk. If a Series is given, it MUST have a MultiIndex with exactly 2 levels to unstack. - file (Union[basestring, File, Path]): The path or file handler to write to. + file (Union[str, FileIO, Path]): The path or file handler to write to.
""" if isinstance(matrix, pd.Series): row_index = matrix.index.get_level_values(0).unique() @@ -84,20 +88,16 @@ def to_mdf(matrix, file): data.tofile(writer) -def peek_mdf(file, as_index=True): - """ - Partially opens an MDF file to get the zone system of its rows and its columns. +def peek_mdf(file: Union[str, FileIO, Path], as_index: bool = True) -> Union[List[List[int]], List[pd.Index]]: + """Partially opens an MDF file to get the zone system of its rows and its columns. Args: - file (Union[str, File, Path]): The file to read. + file (Union[str, FileIO, Path]): The file to read. as_index (bool, optional): Defaults to ``True``. Set to ``True`` to return a pandas.Index object rather than List[int] Returns: - List[int] or pandas.Index: - One item for each dimension. If ``as_index=True``, the items will be pandas.Index objects, otherwise they - will be List[int] - + List[int] or List[pandas.Index]: One item for each dimension. If ``as_index=True``, the items will be pandas.Index objects, otherwise they will be List[int] """ with open_file(file, mode='rb') as file_handler: magic, version, dtype_index, ndim = np.fromfile(file_handler, np.uint32, count=4) @@ -113,45 +113,46 @@ def peek_mdf(file, as_index=True): indices = np.fromfile(file_handler, np.int32, n_items) index_list.append(indices) - if not as_index: return index_list + if not as_index: + return index_list return [pd.Index(zones) for zones in index_list] -def read_emx(file, zones=None, tall=False): - """ - Reads an "internal" Emme matrix (found in `/Database/emmemat`); with an '.emx' extension. This data +def read_emx(file: Union[str, FileIO, Path], zones: Union[int, Iterable[int], pd.Index] = None, + tall: bool = False) -> Union[np.ndarray, pd.DataFrame, pd.Series]: + """Reads an "internal" Emme matrix (found in `/Database/emmemat`); with an '.emx' extension. This data format does not contain information about zones. 
Its size is determined by the dimensions of the Emmebank (``Emmebank.dimensions['centroids']``), regardless of the number of zones actually used in all scenarios. Args: file (Union[str, File, Path]): The file to read. - zones (Union[pandas.Index, int], optional): Defaults to ``None``. An Index or Iterable will be interpreted as - the zone labels for the matrix rows and columns; returning a DataFrame or Series (depending on ``tall``). If - an integer is provided, the returned ndarray will be truncated to this 'number of zones'. Otherwise, the - returned ndarray will be size to the maximum number of zone dimensioned by the Emmebank. + zones (Union[int, Iterable[int], pandas.Index], optional): Defaults to ``None``. An Index or Iterable will be + interpreted as the zone labels for the matrix rows and columns; returning a DataFrame or Series (depending + on ``tall``). If an integer is provided, the returned ndarray will be truncated to this 'number of zones'. + Otherwise, the returned ndarray will be sized to the maximum number of zones dimensioned by the Emmebank. tall (bool, optional): Defaults to ``False``. If True, a 1D data structure will be returned. If ``zone_index`` is provided, a Series will be returned, otherwise a 1D ndarray. Returns: - DataFrame or Series or ndarray. + numpy.ndarray, pandas.DataFrame, or pandas.Series.
Examples: For a project with 20 zones: - >>> matrix = from_emx("Database/emmemat/mf1.emx") + >>> matrix = read_emx("Database/emmemat/mf1.emx") >>> print type(matrix), matrix.shape (numpy.ndarray, (20, 20)) - >>> matrix = from_emx("Database/emmemat/mf1.emx", zones=10) + >>> matrix = read_emx("Database/emmemat/mf1.emx", zones=10) >>> print type(matrix), matrix.shape (numpy.ndarray, (10, 10)) - >>> matrix = from_emx("Database/emmemat/mf1.emx", zones=range(10)) + >>> matrix = read_emx("Database/emmemat/mf1.emx", zones=range(10)) >>> print type(matrix), matrix.shape (10, 10) - >>> matrix = from_emx("Database/emmemat/mf1.emx", zones=range(10), tall=True) + >>> matrix = read_emx("Database/emmemat/mf1.emx", zones=range(10), tall=True) >>> print type(matrix), matrix.shape 100 @@ -186,13 +187,12 @@ def read_emx(file, zones=None, tall=False): return matrix.stack() if tall else matrix -def to_emx(matrix, file, emmebank_zones): - """ - Writes an "internal" Emme matrix (found in `/Database/emmemat`); with an '.emx' extension. The number - of zones that the Emmebank is dimensioned for must be known in order for the file to be written correctly. +def to_emx(matrix: Union[pd.DataFrame, pd.Series, np.ndarray], file: Union[str, FileIO, Path], emmebank_zones: int): + """Writes an "internal" Emme matrix (found in `/Database/emmemat`); with an '.emx' extension. The + number of zones that the Emmebank is dimensioned for must be known in order for the file to be written correctly. Args: - matrix (Union[pandas.DataFrame, pandas.Series, pandas.ndarray]): The matrix to write to disk. If a Series is + matrix (Union[pandas.DataFrame, pandas.Series, numpy.ndarray]): The matrix to write to disk. If a Series is given, it MUST have a MultiIndex with exactly 2 levels to unstack. file (Union[basestring, File]): The path or file handler to write to. emmebank_zones (int): The number of zones the target Emmebank is dimensioned for. 
diff --git a/balsa/routines/io/inro.pyi b/balsa/routines/io/inro.pyi deleted file mode 100644 index 73bd7a6..0000000 --- a/balsa/routines/io/inro.pyi +++ /dev/null @@ -1,17 +0,0 @@ -from typing import Union, List, Any, Iterable, Dict, Tuple - -import pandas as pd -import numpy as np - -from .common import file_type - -def read_mdf(file: file_type, raw: bool=False, tall: bool=False) -> Union[pd.DataFrame, np.ndarray]: pass - -def to_mdf(matrix: Union[pd.Series, pd.DataFrame], file: file_type): pass - -def peek_mdf(file: file_type, as_index: bool=True) -> Union[List[pd.Index], List[List[int]]]: pass - -def read_emx(file: file_type, zones: pd.Index=None, tall: bool=False) -> Union[pd.Series, pd.DataFrame, np.ndarray]: - pass - -def to_emx(matrix: Union[pd.Series, pd.DataFrame, np.ndarray], file: file_type, emmebank_zones: int): pass diff --git a/balsa/routines/io/omx.py b/balsa/routines/io/omx.py index ab0c775..c7997f9 100644 --- a/balsa/routines/io/omx.py +++ b/balsa/routines/io/omx.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from six import iteritems, itervalues, iterkeys +from typing import Union, Iterable, Dict from ..matrices import fast_unstack @@ -17,9 +17,12 @@ except ImportError: omx = None +MATRIX_TYPES = Union[pd.DataFrame, pd.Series, np.ndarray] + if omx is not None: - def read_omx(file, matrices=None, mapping=None, raw=False, tall=False, squeeze=True): + def read_omx(file: str, matrices: Iterable[str] = None, mapping: str = None, raw: bool = False, + tall: bool = False, squeeze: bool = True) -> Union[MATRIX_TYPES, Dict[str, MATRIX_TYPES]]: """ Reads Open Matrix (OMX) files. An OMX file can contain multiple matrices, so this function typically returns a Dict. 
@@ -77,9 +80,9 @@ def read_omx(file, matrices=None, mapping=None, raw=False, tall=False, squeeze=T return return_value - def to_omx(file, matrices, zone_index=None, title='', descriptions=None, attrs=None, mapping='zone_numbers'): - """ - Creates a new (or overwrites an old) OMX file with a collection of matrices. + def to_omx(file: str, matrices: Dict[str, MATRIX_TYPES], zone_index: pd.Index = None, title: str = '', + descriptions: Dict[str, str] = None, attrs: Dict[str, dict] = None, mapping: str = 'zone_numbers'): + """Creates a new (or overwrites an old) OMX file with a collection of matrices. Args: file: OMX to write. @@ -96,15 +99,15 @@ def to_omx(file, matrices, zone_index=None, title='', descriptions=None, attrs=N matrices, zone_index = _prep_matrix_dict(matrices, zone_index) if descriptions is None: - descriptions = {name: '' for name in iterkeys(matrices)} + descriptions = {name: '' for name in matrices.keys()} if attrs is None: - attrs = {name: None for name in iterkeys(matrices)} + attrs = {name: None for name in matrices.keys()} file = str(file) # Converts from Path with omx.open_file(file, mode='w', title=title) as omx_file: omx_file.create_mapping(mapping, zone_index.tolist()) - for name, array in iteritems(matrices): + for name, array in matrices.items(): description = descriptions[name] attr = attrs[name] @@ -131,12 +134,14 @@ def _prep_matrix_dict(matrices, desired_zone_index): return checked, zone_index def _check_types(matrices): - gen = iter(itervalues(matrices)) + gen = iter(matrices.values()) first = next(gen) item_type = 'RAW' - if isinstance(first, pd.Series): item_type = 'SERIES' - elif isinstance(first, pd.DataFrame): item_type = 'FRAME' + if isinstance(first, pd.Series): + item_type = 'SERIES' + elif isinstance(first, pd.DataFrame): + item_type = 'FRAME' msg = "All items must be the same type" @@ -151,7 +156,7 @@ def _check_types(matrices): return item_type def _check_raw_matrices(matrices): - gen = iter(iteritems(matrices)) + gen = 
iter(matrices.items()) name, matrix = next(gen) n_dim = len(matrix.shape) @@ -183,7 +188,7 @@ def _check_raw_matrices(matrices): return retval, n def _check_matrix_series(matrices): - gen = iter(iteritems(matrices)) + gen = iter(matrices.items()) name, matrix = next(gen) tall_index = matrix.index @@ -200,7 +205,7 @@ def _check_matrix_series(matrices): return retval, zone_index def _check_matrix_frames(matrices): - gen = iter(iteritems(matrices)) + gen = iter(matrices.items()) name, matrix = next(gen) zone_index = matrix.index diff --git a/balsa/routines/io/omx.pyi b/balsa/routines/io/omx.pyi deleted file mode 100644 index 9988f9d..0000000 --- a/balsa/routines/io/omx.pyi +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Union, Iterable, Dict - -import pandas as pd -import numpy as np - -from .common import file_type - -MATRIX_TYPES = Union[pd.DataFrame, pd.Series, np.ndarray] - -def read_omx(file: file_type, matrices: Iterable[str]=None, mapping: str=None, raw=False, tall=False, - squeeze=True) -> Union[MATRIX_TYPES, Dict[str, MATRIX_TYPES]]: pass - -def to_omx(file: str, matrices: Dict[str, MATRIX_TYPES], zone_index: pd.Index=None, title: str='', - descriptions: Dict[str, str]=None, attrs: Dict[str, dict]=None, mapping: str='zone_numbers'): pass \ No newline at end of file diff --git a/balsa/routines/matrices.py b/balsa/routines/matrices.py index d78e3ca..442f0d9 100644 --- a/balsa/routines/matrices.py +++ b/balsa/routines/matrices.py @@ -1,18 +1,15 @@ -from __future__ import division as _division - -import multiprocessing as _mp -import numba as _nb -import numpy as _np -import pandas as _pd -from pandas import Series, DataFrame, Index -import numexpr as _ne +from multiprocessing import cpu_count +import numba as nb +import numexpr as ne +import numpy as np +import pandas as pd +from typing import Tuple, Union, List, Callable, Iterable EPS = 1.0e-7 -def matrix_balancing_1d(m, a, axis): - """ - Balances a matrix using a single constraint. 
+def matrix_balancing_1d(m: np.ndarray, a: np.ndarray, axis: int) -> np.ndarray: + """Balances a matrix using a single constraint. Args: m (numpy.ndarray): The matrix (a 2-dimensional ndarray) to be balanced @@ -21,23 +18,24 @@ def matrix_balancing_1d(m, a, axis): Return: numpy.ndarray: A balanced matrix - """ assert axis in [0, 1], "axis must be either 0 or 1" assert m.ndim == 2, "`m` must be a two-dimensional matrix" assert a.ndim == 1, "`a` must be an one-dimensional vector" - assert m.shape[axis] == a.shape[0], "axis %d of matrices 'm' and 'a' must be the same." % axis + assert np.all(m.shape[axis] == a.shape[0]), "axis %d of matrices 'm' and 'a' must be the same." % axis return _balance(m, a, axis) -def matrix_balancing_2d(m, a, b, totals_to_use='raise', max_iterations=1000, rel_error=0.0001, n_procs=1): - """ - Balances a two-dimensional matrix using iterative proportional fitting. +def matrix_balancing_2d(m: Union[np.ndarray, pd.DataFrame], a: np.ndarray, b: np.ndarray, totals_to_use: str = 'raise', + max_iterations: int = 1000, rel_error: float = 0.0001, + n_procs: int = 1) -> Tuple[Union[np.ndarray, pd.DataFrame], float, int]: + """Balances a two-dimensional matrix using iterative proportional fitting. Args: - m (numpy.ndarray): The matrix (a 2-dimensional ndarray) to be balanced + m (Union[numpy.ndarray, pandas.DataFrame]): The matrix (a 2-dimensional ndarray) to be balanced. If a DataFrame + is supplied, the output will be returned as a DataFrame. a (numpy.ndarray): The row totals (a 1-dimensional ndarray) to use for balancing b (numpy.ndarray): The column totals (a 1-dimensional ndarray) to use for balancing totals_to_use (str, optional): Defaults to ``'raise'``. Describes how to scale the row and column totals if @@ -51,21 +49,22 @@ def matrix_balancing_2d(m, a, b, totals_to_use='raise', max_iterations=1000, rel n_procs (int, optional): Defaults to ``1``. Number of processors for parallel computation. 
(Not used) Return: - Tuple[numpy.ndarray, float, int]: The balanced matrix, residual, and n_iterations + Tuple[Union[numpy.ndarray, pandas.DataFrame], float, int]: The balanced matrix, residual, and n_iterations """ max_iterations = int(max_iterations) n_procs = int(n_procs) # Test if matrix is Pandas DataFrame data_type = '' - if isinstance(m, _pd.DataFrame): + m_pd = None + if isinstance(m, pd.DataFrame): data_type = 'pd' m_pd = m m = m_pd.values - if isinstance(a, _pd.Series) or isinstance(a, _pd.DataFrame): + if isinstance(a, pd.Series) or isinstance(a, pd.DataFrame): a = a.values - if isinstance(b, _pd.Series) or isinstance(b, _pd.DataFrame): + if isinstance(b, pd.Series) or isinstance(b, pd.DataFrame): b = b.values # ################################################################################## @@ -85,23 +84,23 @@ def matrix_balancing_2d(m, a, b, totals_to_use='raise', max_iterations=1000, rel assert totals_to_use in valid_totals_to_use, "totals_to_use must be one of %s" % valid_totals_to_use assert max_iterations >= 1, "max_iterations must be integer >= 1" assert 0 < rel_error < 1.0, "rel_error must be float between 0.0 and 1.0" - assert 1 <= n_procs <= _mp.cpu_count(), \ - "n_procs must be integer between 1 and the number of processors (%d) " % _mp.cpu_count() + assert 1 <= n_procs <= cpu_count(), \ + "n_procs must be integer between 1 and the number of processors (%d) " % cpu_count() if n_procs > 1: raise NotImplementedError("Multiprocessing capability is not implemented yet.") # Scale row and column totals, if required a_sum = a.sum() b_sum = b.sum() - if not _np.isclose(a_sum, b_sum): + if not np.isclose(a_sum, b_sum): if totals_to_use == 'rows': - b = _np.multiply(b, a_sum / b_sum) + b = np.multiply(b, a_sum / b_sum) elif totals_to_use == 'columns': - a = _np.multiply(a, b_sum / a_sum) + a = np.multiply(a, b_sum / a_sum) elif totals_to_use == 'average': avg_sum = 0.5 * (a_sum + b_sum) - a = _np.multiply(a, avg_sum / a_sum) - b = _np.multiply(b, 
avg_sum / b_sum) + a = np.multiply(a, avg_sum / a_sum) + b = np.multiply(b, avg_sum / b_sum) else: raise RuntimeError("a and b vector totals do not match.") @@ -119,15 +118,14 @@ def matrix_balancing_2d(m, a, b, totals_to_use='raise', max_iterations=1000, rel i += 1 if data_type == 'pd': - new_df = _pd.DataFrame(m, index=m_pd.index, columns=m_pd.columns) + new_df = pd.DataFrame(m, index=m_pd.index, columns=m_pd.columns) return new_df, err, i else: return m, err, i -def _balance(matrix, tot, axis): - """ - Balances a matrix using a single constraint. +def _balance(matrix: np.ndarray, tot: np.ndarray, axis: int) -> np.ndarray: + """Balances a matrix using a single constraint. Args: matrix (numpy.ndarray): The matrix to be balanced @@ -136,39 +134,37 @@ def _balance(matrix, tot, axis): Return: numpy.ndarray: The balanced matrix - """ sc = tot / (matrix.sum(axis) + EPS) - sc = _np.nan_to_num(sc) # replace divide by 0 errors from the prev. line + sc = np.nan_to_num(sc) # replace divide by 0 errors from the prev. line if axis: # along rows - matrix = _np.multiply(matrix.T, sc).T + matrix = np.multiply(matrix.T, sc).T else: # along columns - matrix = _np.multiply(matrix, sc) + matrix = np.multiply(matrix, sc) return matrix def _calc_error(m, a, b): - row_sum = _np.absolute(a - m.sum(1)).sum() - col_sum = _np.absolute(b - m.sum(0)).sum() + row_sum = np.absolute(a - m.sum(1)).sum() + col_sum = np.absolute(b - m.sum(0)).sum() return row_sum + col_sum -@_nb.jit(_nb.float64[:, :](_nb.float64[:, :], _nb.int64)) +@nb.jit(nb.float64[:, :](nb.float64[:, :], nb.int64)) def _nbf_bucket_round(a_, decimals=0): a = a_.ravel() - b = _np.copy(a) + b = np.copy(a) residual = 0 for i in range(0, len(b)): - b[i] = _np.round(a[i] + residual, decimals) + b[i] = np.round(a[i] + residual, decimals) residual += a[i] - b[i] return b.reshape(a_.shape) -def matrix_bucket_rounding(m, decimals=0): - """ - Bucket rounds to the given number of decimals. 
+def matrix_bucket_rounding(m: Union[np.ndarray, pd.DataFrame], decimals: int = 0) -> Union[np.ndarray, pd.DataFrame]: + """Bucket rounds to the given number of decimals. Args: m (Union[numpy.ndarray, pandas.DataFrame]): The matrix to be rounded @@ -176,37 +172,36 @@ def matrix_bucket_rounding(m, decimals=0): specifies the number of positions to the left of the decimal point. Return: - numpy.ndarray: The rounded matrix - + Union[numpy.ndarray, pandas.DataFrame]: The rounded matrix """ # Test if matrix is Pandas DataFrame data_type = '' - if isinstance(m, _pd.DataFrame): + m_pd = None + if isinstance(m, pd.DataFrame): data_type = 'pd' m_pd = m m = m_pd.values decimals = int(decimals) - # I really can't think of a way to vectorize bucket rounding, - # so here goes the slow for loop + # I really can't think of a way to vectorize bucket rounding, so here goes the slow for loop b = _nbf_bucket_round(m, decimals) if decimals <= 0: - b = b.astype(_np.int32) + b = b.astype(np.int32) if data_type == 'pd': - new_df = _pd.DataFrame(b.reshape(m.shape), index=m_pd.index, columns=m_pd.columns) + new_df = pd.DataFrame(b.reshape(m.shape), index=m_pd.index, columns=m_pd.columns) return new_df else: return b.reshape(m.shape) -def split_zone_in_matrix(base_matrix, old_zone, new_zones, proportions): - """ - Takes a zone in a matrix (as a DataFrame) and splits it into several new zones, prorating affected cells by a vector - of proportions (one value for each new zone). The old zone is removed. +def split_zone_in_matrix(base_matrix: pd.DataFrame, old_zone: int, new_zones: List[int], + proportions: List[float]) -> pd.DataFrame: + """Takes a zone in a matrix (as a DataFrame) and splits it into several new zones, prorating affected cells by a + vector of proportions (one value for each new zone). The old zone is removed. 
Args: base_matrix (pandas.DataFrame): The matrix to re-shape @@ -217,28 +212,28 @@ def split_zone_in_matrix(base_matrix, old_zone, new_zones, proportions): Returns: pandas.DataFrame: The re-shaped matrix - """ - assert isinstance(base_matrix, _pd.DataFrame), "Base matrix must be a DataFrame" + assert isinstance(base_matrix, pd.DataFrame), "Base matrix must be a DataFrame" old_zone = int(old_zone) - new_zones = _np.array(new_zones, dtype=_np.int32) - proportions = _np.array(proportions, dtype=_np.float64) + new_zones = np.array(new_zones, dtype=np.int32) + proportions = np.array(proportions, dtype=np.float64) assert len(new_zones) == len(proportions), "Proportion array must be the same length as the new zone array" assert len(new_zones.shape) == 1, "New zones must be a vector" assert base_matrix.index.equals(base_matrix.columns), "DataFrame is not a matrix" - assert _np.isclose(proportions.sum(), 1.0), "Proportions must sum to 1.0 " + assert np.isclose(proportions.sum(), 1.0), "Proportions must sum to 1.0 " n_new_zones = len(new_zones) intersection_index = base_matrix.index.drop(old_zone) new_index = intersection_index - for z in new_zones: new_index = new_index.insert(-1, z) - new_index = _pd.Index(sorted(new_index)) + for z in new_zones: + new_index = new_index.insert(-1, z) + new_index = pd.Index(sorted(new_index)) - new_matrix = _pd.DataFrame(0, index=new_index, columns=new_index, dtype=base_matrix.dtypes.iat[0]) + new_matrix = pd.DataFrame(0, index=new_index, columns=new_index, dtype=base_matrix.dtypes.iat[0]) # 1. 
Copy over the values from the regions of the matrix not being updated new_matrix.loc[intersection_index, intersection_index] = base_matrix @@ -247,10 +242,10 @@ def split_zone_in_matrix(base_matrix, old_zone, new_zones, proportions): # This section (and the next) works with the underlying Numpy arrays, since they handle # broadcasting better than Pandas does original_row = base_matrix.loc[old_zone, intersection_index] - original_row = original_row.values[:] # Make a shallow copy to preserve shape of the original data + original_row = original_row.values[:] # Make a shallow copy to preserve shape of the original data original_row.shape = 1, len(intersection_index) proportions.shape = n_new_zones, 1 - result = _pd.DataFrame(original_row * proportions, index=new_zones, columns=intersection_index) + result = pd.DataFrame(original_row * proportions, index=new_zones, columns=intersection_index) new_matrix.loc[result.index, result.columns] = result # 3. Proprate the column corresponding to the dropped zone @@ -258,40 +253,45 @@ def split_zone_in_matrix(base_matrix, old_zone, new_zones, proportions): original_column = original_column.values[:] original_column.shape = len(intersection_index), 1 proportions.shape = 1, n_new_zones - result = _pd.DataFrame(original_column * proportions, index=intersection_index, columns=new_zones) + result = pd.DataFrame(original_column * proportions, index=intersection_index, columns=new_zones) new_matrix.loc[result.index, result.columns] = result # 4. 
Expand the old intrazonal - proportions_copy = proportions[:,:] + proportions_copy = proportions[:, :] proportions_copy.shape = 1, n_new_zones proportions.shape = n_new_zones, 1 intrzonal_matrix = proportions * proportions_copy intrazonal_scalar = base_matrix.at[old_zone, old_zone] - result = _pd.DataFrame(intrazonal_scalar * intrzonal_matrix, index=new_zones, columns=new_zones) + result = pd.DataFrame(intrazonal_scalar * intrzonal_matrix, index=new_zones, columns=new_zones) new_matrix.loc[result.index, result.columns] = result return new_matrix -def aggregate_matrix(matrix, groups=None, row_groups=None, col_groups=None, aggfunc=_np.sum): - """ - Aggregates a matrix based on mappings provided for each axis, using a specified aggregation function. +def aggregate_matrix(matrix: Union[pd.DataFrame, pd.Series], groups: Union[pd.Series, np.ndarray] = None, + row_groups: Union[pd.Series, np.ndarray] = None, col_groups: Union[pd.Series, np.ndarray] = None, + aggfunc: Callable[[Iterable[Union[int, float]]], Union[int, float]] = np.sum + ) -> Union[pd.DataFrame, pd.Series]: + """Aggregates a matrix based on mappings provided for each axis, using a specified aggregation function. Args: matrix (Union[pandas.DataFrame, pandas.Series]): Matrix data to aggregate. DataFrames and Series with 2-level indices are supported - groups: Syntactic sugar to specify both row_groups and col_groups to use the same grouping series. - row_groups: Groups for the rows. If aggregating a DataFrame, this must match the index of the matrix. For a - "tall" matrix, this series can match either the "full" index of the series, or it can match the first level - of the matrix (it would be the same as if aggregating a DataFrame). Alternatively, an array can be provided, - but it must be the same length as the DataFrame's index, or the full length of the Series. - col_groups: Groups for the columns. If aggregating a DataFrame, this must match the columns of the matrix. 
For a - "tall" matrix, this series can match either the "full" index of the series, or it can match the second level - of the matrix (it would be the same as if aggregating a DataFrame). Alternatively, an array can be provided, - but it must be the same length as the DataFrame's columns, or the full length of the Series. - aggfunc: The aggregation function to use. Default is sum. + groups (Union[pandas.Series, numpy.ndarray], optional): Syntactic sugar to specify both row_groups and + col_groups to use the same grouping series. + row_groups (Union[pandas.Series, numpy.ndarray], optional): Groups for the rows. If aggregating a DataFrame, + this must match the index of the matrix. For a "tall" matrix, this series can match either the "full" index + of the series, or it can match the first level of the matrix (it would be the same as if aggregating a + DataFrame). Alternatively, an array can be provided, but it must be the same length as the DataFrame's + index, or the full length of the Series. + col_groups (Union[pandas.Series, numpy.ndarray], optional): Groups for the columns. If aggregating a DataFrame, + this must match the columns of the matrix. For a "tall" matrix, this series can match either the "full" + index of the series, or it can match the second level of the matrix (it would be the same as if aggregating + a DataFrame). Alternatively, an array can be provided, but it must be the same length as the DataFrame's + columns, or the full length of the Series. + aggfunc: The aggregation function to use. Default is np.sum. 
Returns: pandas.Series or pandas.DataFrame: @@ -340,7 +340,7 @@ def aggregate_matrix(matrix, groups=None, row_groups=None, col_groups=None, aggf ``new_matrix = aggregate_matrix(matrix, groups=groups)`` new_matrix: - + +-------+----+----+----+ | | A | B | C | +=======+====+====+====+ @@ -359,12 +359,12 @@ def aggregate_matrix(matrix, groups=None, row_groups=None, col_groups=None, aggf assert row_groups is not None, "Row groups must be specified" assert col_groups is not None, "Column groups must be specified" - if isinstance(matrix, _pd.DataFrame): + if isinstance(matrix, pd.DataFrame): row_groups = _prep_square_index(matrix.index, row_groups) col_groups = _prep_square_index(matrix.columns, col_groups) return _aggregate_frame(matrix, row_groups, col_groups, aggfunc) - elif isinstance(matrix, _pd.Series): + elif isinstance(matrix, pd.Series): assert matrix.index.nlevels == 2 row_groups, col_groups = _prep_tall_index(matrix.index, row_groups, col_groups) @@ -375,7 +375,7 @@ def aggregate_matrix(matrix, groups=None, row_groups=None, col_groups=None, aggf def _prep_tall_index(target_index, row_aggregator, col_aggregator): - if isinstance(row_aggregator, _pd.Series): + if isinstance(row_aggregator, pd.Series): if row_aggregator.index.equals(target_index): row_aggregator = row_aggregator.values else: @@ -384,9 +384,9 @@ def _prep_tall_index(target_index, row_aggregator, col_aggregator): row_aggregator = reindexed.values else: assert len(row_aggregator) == len(target_index) - row_aggregator = _np.array(row_aggregator) + row_aggregator = np.array(row_aggregator) - if isinstance(col_aggregator, _pd.Series): + if isinstance(col_aggregator, pd.Series): if col_aggregator.index.equals(target_index): col_aggregator = col_aggregator.values else: @@ -395,18 +395,18 @@ def _prep_tall_index(target_index, row_aggregator, col_aggregator): col_aggregator = reindexed.values else: assert len(col_aggregator) == len(target_index) - col_aggregator = _np.array(col_aggregator) + col_aggregator 
= np.array(col_aggregator) return row_aggregator, col_aggregator def _prep_square_index(index, aggregator): - if isinstance(aggregator, _pd.Series): + if isinstance(aggregator, pd.Series): assert aggregator.index.equals(index) return aggregator.values else: assert len(aggregator) == len(index) - return _np.array(aggregator) + return np.array(aggregator) def _aggregate_frame(matrix, row_aggregator, col_aggregator, aggfunc): @@ -417,10 +417,9 @@ def _aggregate_series(matrix, row_aggregator, col_aggregator, aggfunc): return matrix.groupby([row_aggregator, col_aggregator]).aggregate(aggfunc) -def fast_stack(frame, multi_index, deep_copy=True): - """ - Performs the same action as ``DataFrame.stack()``, but provides better performance when the target stacked index is - known before hand. Useful in converting a lot of matrices from "wide" to "tall" format. The inverse of +def fast_stack(frame: pd.DataFrame, multi_index: pd.MultiIndex, deep_copy: bool = True) -> pd.Series: + """Performs the same action as ``DataFrame.stack()``, but provides better performance when the target stacked index + is known before hand. Useful in converting a lot of matrices from "wide" to "tall" format. The inverse of ``fast_unstack()``. Notes: @@ -430,7 +429,7 @@ def fast_stack(frame, multi_index, deep_copy=True): Args: frame (pandas.DataFrame): The DataFrame to stack. - multi_index (pandas.Index): The 2-level MultiIndex known ahead-of-time. + multi_index (pandas.MultiIndex): The 2-level MultiIndex known ahead-of-time. deep_copy (bool, optional): Defaults to ``True``. A flag indicating if the returned Series should be a view of the underlying data (deep_copy=False) or a copy of it (deep_copy=True). A deep copy takes a little longer to convert and takes up more memory but preserves the original data of the DataFrame. The default value of True @@ -438,25 +437,23 @@ def fast_stack(frame, multi_index, deep_copy=True): Returns: pandas.Series: The stacked data. 
- """ assert multi_index.nlevels == 2, "Target index must be a MultiIndex with exactly 2 levels" assert len(multi_index) == len(frame.index) * len(frame.columns), "Target index and source index and columns do " \ "not have compatible lengths" - array = _np.ascontiguousarray(frame.values) + array = np.ascontiguousarray(frame.values) array = array.copy() if deep_copy else array[:, :] array.shape = len(frame.index) * len(frame.columns) - return Series(array, index=multi_index) + return pd.Series(array, index=multi_index) -def fast_unstack(series, index, columns, deep_copy=True): - """ - Performs the same action as ``DataFrame.unstack()``, but provides better performance when the target unstacked index - and columns are known before hand. Useful in converting a lot of matrices from "tall" to "wide" format. The inverse - of ``fast_stack()``. +def fast_unstack(series: pd.Series, index: pd.Index, columns: pd.Index, deep_copy: bool = True) -> pd.DataFrame: + """Performs the same action as ``DataFrame.unstack()``, but provides better performance when the target unstacked + index and columns are known before hand. Useful in converting a lot of matrices from "tall" to "wide" format. The + inverse of ``fast_stack()``. Notes: This function does not check that the entries in index and columns are compatible with the MultiIndex of the @@ -473,8 +470,7 @@ def fast_unstack(series, index, columns, deep_copy=True): recommended for most uses. 
Returns: - pandas.DataFrame: The unstacked data - + pandas.DataFrame: The unstacked data """ assert series.index.nlevels == 2, "Source Series must have an index with exactly 2 levels" @@ -484,10 +480,10 @@ def fast_unstack(series, index, columns, deep_copy=True): array = series.values.copy() if deep_copy else series.values[:] array.shape = len(index), len(columns) - return DataFrame(array, index=index, columns=columns) + return pd.DataFrame(array, index=index, columns=columns) -def _check_disaggregation_input(mapping: Series, proportions: Series) -> _np.ndarray: +def _check_disaggregation_input(mapping: pd.Series, proportions: pd.Series) -> np.ndarray: assert mapping is not None assert proportions is not None assert mapping.index.equals(proportions.index) @@ -503,25 +499,26 @@ def _check_disaggregation_input(mapping: Series, proportions: Series) -> _np.nda return proportions.values / parent_totals -def disaggregate_matrix(matrix, mapping=None, proportions=None, row_mapping=None, row_proportions=None, - col_mapping=None, col_proportions=None): - """ - Split multiple rows and columns in a matrix all at once. The cells in the matrix MUST be numeric, but the row and - column labels do not. +def disaggregate_matrix(matrix: pd.DataFrame, mapping: pd.Series = None, proportions: pd.Series = None, + row_mapping: pd.Series = None, row_proportions: pd.Series = None, col_mapping: pd.Series = None, + col_proportions: pd.Series = None) -> pd.DataFrame: + """ Split multiple rows and columns in a matrix all at once. The cells in the matrix MUST be numeric, but the row + and column labels do not. Args: - matrix: The input matrix to disaggregate - mapping: Dict-like Series of "New label" : "Old label". Sets both the row_mapping and col_mapping variables if - provided (resulting in a square matrix). - proportions: Dict-like Series of "New label": "Proportion of old label". Its index must match the index of - the mapping argument.
Sets both the row_proportions and col_proportions arguments if provided. - row_mapping: Same as mapping, except applied only to the rows. - row_proportions: Same as proportions, except applied only to the rows - col_mapping: Same as mapping, except applied only to the columns. - col_proportions: Same as proportions, except applied only to the columns + matrix (pandas.DataFrame): The input matrix to disaggregate + mapping (pandas.Series, optional): Dict-like Series of "New label" : "Old label". Sets both the row_mapping and + col_mapping variables if provided (resulting in a square matrix). + proportions (pandas.Series, optional): Dict-like Series of "New label": "Proportion of old label". Its index + must match the index of the mapping argument. Sets both the row_proportions and col_proportions arguments + if provided. + row_mapping (pandas.Series, optional): Same as mapping, except applied only to the rows. + row_proportions (pandas.Series, optional): Same as proportions, except applied only to the rows + col_mapping (pandas.Series, optional): Same as mapping, except applied only to the columns. + col_proportions (pandas.Series, optional): Same as proportions, except applied only to the columns Returns: - An expanded DataFrame with the new indices. The new matrix will sum to the same total as the original. + pandas.DataFrame: An expanded DataFrame with the new indices. The new matrix will sum to the same total as the original. 
Examples: @@ -591,16 +588,16 @@ def disaggregate_matrix(matrix, mapping=None, proportions=None, row_mapping=None new_cols = col_mapping.index # Get raw indexers for NumPy & lookup the value in each parent cell - row_indexer = matrix.index.get_indexer(row_mapping)[:, _np.newaxis] - col_indexer = matrix.columns.get_indexer(col_mapping)[_np.newaxis, :] + row_indexer = matrix.index.get_indexer(row_mapping)[:, np.newaxis] + col_indexer = matrix.columns.get_indexer(col_mapping)[np.newaxis, :] parent_cells = matrix.values[row_indexer, col_indexer] # Convert proportions to 2D vectors - row_proportions = row_proportions[:, _np.newaxis] - col_proportions = col_proportions[_np.newaxis, :] + row_proportions = row_proportions[:, np.newaxis] + col_proportions = col_proportions[np.newaxis, :] # Multiply each parent cell by its disaggregation proportion & return - result_matrix = _ne.evaluate("parent_cells * row_proportions * col_proportions") + result_matrix = ne.evaluate("parent_cells * row_proportions * col_proportions") - result_matrix = DataFrame(result_matrix, index=new_rows, columns=new_cols) + result_matrix = pd.DataFrame(result_matrix, index=new_rows, columns=new_cols) return result_matrix diff --git a/balsa/routines/matrices.pyi b/balsa/routines/matrices.pyi deleted file mode 100644 index 6c81f37..0000000 --- a/balsa/routines/matrices.pyi +++ /dev/null @@ -1,40 +0,0 @@ -from typing import Union, Callable, Tuple, Iterable, List -import numpy as np -import pandas as pd - - -def matrix_balancing_1d(m: np.ndarray, a: np.ndarray, axis: int) -> np.ndarray: pass - - -def matrix_balancing_2d(m: np.ndarray, a: np.ndarray, b: np.ndarray, max_iterations: int=1000, rel_error: float=0.0001, - n_procs: int=1) -> np.ndarray: pass - - -def matrix_bucket_rounding(m: Union[np.ndarray, pd.DataFrame], decimals: int=0) -> Union[np.ndarray, pd.DataFrame]: - pass - - -def split_zone_in_matrix(base_matrix: pd.DataFrame, old_zone: int, new_zones: List[int], proportions: List[float] - ) -> 
pd.DataFrame: - pass - -Num = Union[int, float] -Vector = Union[pd.Series, np.ndarray] - -def aggregate_matrix(matrix: Union[pd.DataFrame, pd.Series], - groups: Vector=None, row_groups: Vector=None, col_groups: Vector=None, - aggfunc: Callable[[Iterable[Num]], Num]=np.sum) -> Union[pd.DataFrame, pd.Series]: - pass - - -def fast_stack(frame: pd.DataFrame, multi_index: pd.MultiIndex, deep_copy: bool=True) -> pd.Series: - pass - - -def fast_unstack(series: pd.Series, index: pd.Index, columns: pd.Index, deep_copy: bool=True) -> pd.DataFrame: - pass - -def disaggregate_matrix(matrix: pd.DataFrame, mapping: pd.Series = None, proportions: pd.Series = None, - row_mapping: pd.Series = None, row_proportions: pd.Series = None, col_mapping: pd.Series = None, - col_proportions: pd.Series = None) -> pd.DataFrame: - pass \ No newline at end of file diff --git a/balsa/routines/modelling.py b/balsa/routines/modelling.py index 9cafeba..ae0808b 100644 --- a/balsa/routines/modelling.py +++ b/balsa/routines/modelling.py @@ -1,16 +1,14 @@ -from typing import Tuple, Optional, Union - -import pandas as pd -import numpy as np import numexpr as ne -from six import iteritems +import numpy as np +import pandas as pd +from typing import Union, Tuple, Dict, Any, Iterable -def tlfd(values, bin_start=0, bin_end=200, bin_step=2, weights=None, intrazonal=None, label_type='MULTI', - include_top=False): - """ - Generates a Trip Length Frequency Distribution (i.e. a histogram) from given data. Produces a "pretty" Pandas object - suitable for charting. +def tlfd(values: Union[np.ndarray, pd.Series], bin_start: int = 0, bin_end: int = 200, bin_step: int = 2, + weights: Union[np.ndarray, pd.Series] = None, intrazonal: Union[np.ndarray, pd.Series] = None, + label_type: str = 'MULTI', include_top: bool = False) -> pd.Series: + """Generates a Trip Length Frequency Distribution (i.e. a histogram) from given data. Produces a "pretty" Pandas + object suitable for charting. 
Args: values (Union[numpy.ndarray, pandas.Series]): A vector of trip lengths, with a length of "N". Can be provided @@ -40,6 +38,7 @@ def tlfd(values, bin_start=0, bin_end=200, bin_step=2, weights=None, intrazonal= """ bins = list(range(bin_start, bin_end + bin_step, bin_step)) + iz_total = None if intrazonal is not None: if weights is not None: iz_total = weights.loc[intrazonal].sum() @@ -55,8 +54,10 @@ def tlfd(values, bin_start=0, bin_end=200, bin_step=2, weights=None, intrazonal= hist, _ = np.histogram(values, bins=bins) new_len = len(hist) - if intrazonal is not None: new_len += 1 - if include_top: new_len += 1 + if intrazonal is not None: + new_len += 1 + if include_top: + new_len += 1 new_hist = np.zeros(shape=new_len, dtype=hist.dtype) lower_index = 0 upper_index = new_len @@ -84,9 +85,9 @@ def tlfd(values, bin_start=0, bin_end=200, bin_step=2, weights=None, intrazonal= elif label_type == 'BOTTOM': index = pd.Index(bins[:-1]) elif label_type == 'TEXT': - s0 = pd.Series(bins[:-1], dtype=str).astype(str) - s1 = pd.Series(bins[1:], dtype=str).astype(str) - index = pd.Index(s0 + ' to ' + s1) + s0 = pd.Series(bins[:-1]).astype(str) + s1 = pd.Series(bins[1:]).astype(str) + index = pd.Index(s0.str.cat(s1, sep=' to ')) else: raise NotImplementedError(label_type) @@ -95,7 +96,7 @@ def tlfd(values, bin_start=0, bin_end=200, bin_step=2, weights=None, intrazonal= return new_hist -def _get_distance_equation(method): +def _get_distance_equation(method: str) -> str: if method.lower() == 'euclidean': expr = "sqrt((x0 - x1)**2 + (y0 - y1) ** 2) * coord_unit" elif method.lower() == 'manhattan': @@ -114,15 +115,16 @@ def _get_distance_equation(method): return expr -def _prepare_distance_kwargs(kwargs): +def _prepare_distance_kwargs(kwargs: Dict[str, Any]): defaults = {'coord_unit': 1.0, 'earth_radius_factor': 1.0, 'pi': np.pi} - for key, val in iteritems(defaults): + for key, val in defaults.items(): if key not in kwargs: kwargs[key] = val def _check_vectors(description: str, 
*vectors): - if len(vectors) < 1: return [] + if len(vectors) < 1: + return [] first = vectors[0] retval = [] @@ -145,7 +147,11 @@ def _check_vectors(description: str, *vectors): return common_index, retval -def distance_matrix(x0, y0, tall=False, method='EUCLIDEAN', labels0=None, x1=None, y1=None, labels1=None, **kwargs): +def distance_matrix(x0: Union[np.ndarray, pd.Series], y0: Union[np.ndarray, pd.Series], tall: bool = False, + method: str = 'EUCLIDEAN', labels0: Union[Iterable, pd.Index] = None, + x1: Union[np.ndarray, pd.Series] = None, y1: Union[np.ndarray, pd.Series] = None, + labels1: Union[np.ndarray, pd.Series] = None, + **kwargs) -> Union[pd.Series, pd.DataFrame, np.ndarray]: """ Fastest method of computing a distance matrix from vectors of coordinates, using the NumExpr package. Supports several equations for computing distances. @@ -155,8 +161,10 @@ def distance_matrix(x0, y0, tall=False, method='EUCLIDEAN', labels0=None, x1=Non will be the 2D product of the first and second vector (vector0 * vector1). Args: - x0 (Union[numpy.ndarray, pandas.Series]): Vector of x-coordinates, of length N0. Can be a Series to specify labels. - y0 (Union[numpy.ndarray, pandas.Series]): Vector of y-coordinates, of length N0. Can be a Series to specify labels. + x0 (Union[numpy.ndarray, pandas.Series]): Vector of x-coordinates, of length N0. Can be a Series to specify + labels. + y0 (Union[numpy.ndarray, pandas.Series]): Vector of y-coordinates, of length N0. Can be a Series to specify + labels. tall (bool, optional): Defaults to ``False``. If True, returns a vector whose shape is N0 x N1. Otherwise, returns a matrix whose shape is (N0, N1). method (str, optional): Defaults to ``'EUCLIDEAN'``. Specifies the method by which to compute distance. Valid @@ -164,12 +172,12 @@ def distance_matrix(x0, y0, tall=False, method='EUCLIDEAN', labels0=None, x1=Non ``'EUCLIDEAN'``: Computes straight-line, 'as-the-crow flies' distance. 
``'MANHATTAN'``: Computes the Manhattan distance ``'HAVERSINE'``: Computes distance based on lon/lat. - labels0 (pandas.Index-like, optional): Defaults to ``None``. Override set of labels to use if x0 and y0 are both raw - Numpy arrays - x1 (Union[numpy.ndarray, pandas.Series], optional): Defaults to ``None``. A second vector of x-coordinates, of length - N1. Can be a Series to specify labels - y1 (Union[numpy.ndarray, pandas.Series], optional): Defaults to ``None``. A second vector of y-coordinates, of length - N1. Can be a Series to specify labels + labels0 (pandas.Index-like, optional): Defaults to ``None``. Override set of labels to use if x0 and y0 are both + raw Numpy arrays + x1 (Union[numpy.ndarray, pandas.Series], optional): Defaults to ``None``. A second vector of x-coordinates, of + length N1. Can be a Series to specify labels + y1 (Union[numpy.ndarray, pandas.Series], optional): Defaults to ``None``. A second vector of y-coordinates, of + length N1. Can be a Series to specify labels labels1 (pandas.Index-like): Override set of labels to use if x1 and y1 are both raw Numpy arrays **kwargs: Additional scalars to pass into the evaluation context @@ -195,18 +203,19 @@ def distance_matrix(x0, y0, tall=False, method='EUCLIDEAN', labels0=None, x1=Non Otherwise, the function will try and infer the labels from the `x` and `y` objects, if one or both of them are provided as Series. - """ second_coords = x1 is not None and y1 is not None descr = "first coordinate" if second_coords else "coordinate" temp_labels, (x_array0, y_array0) = _check_vectors(descr, x0, y0) - if labels0 is None: labels0 = temp_labels + if labels0 is None: + labels0 = temp_labels if second_coords: temp_labels, (x_array1, y_array1) = _check_vectors("second coordinate", x1, y1) - if labels1 is None: labels1 = temp_labels + if labels1 is None: + labels1 = temp_labels else: x_array1 = x_array0[...] y_array1 = y_array0[...] 
@@ -232,7 +241,8 @@ def distance_matrix(x0, y0, tall=False, method='EUCLIDEAN', labels0=None, x1=Non if tall: raw_matrix.shape = n0 * n1 - if not labelled_result: return raw_matrix + if not labelled_result: + return raw_matrix mi = pd.MultiIndex.from_product([labels0, labels1]) return pd.Series(raw_matrix, index=mi) @@ -242,7 +252,9 @@ def distance_matrix(x0, y0, tall=False, method='EUCLIDEAN', labels0=None, x1=Non return pd.DataFrame(raw_matrix, index=labels0, columns=labels1) -def distance_array(x0, y0, x1, y1, method='euclidean', **kwargs): +def distance_array(x0: Union[np.ndarray, pd.Series], y0: Union[np.ndarray, pd.Series], + x1: Union[np.ndarray, pd.Series], y1: Union[np.ndarray, pd.Series], method: str = 'euclidean', + **kwargs) -> Union[np.ndarray, pd.Series]: """ Fast method to compute distance between 2 (x, y) points, represented by 4 separate arrays, using the NumExpr package. Supports several equations for computing distances @@ -271,7 +283,6 @@ def distance_array(x0, y0, x1, y1, method='euclidean', **kwargs): numpy.ndarray or pandas.Series: Distance from the vectors of first points to the vectors of second points. 
A Series is returned when one or more coordinate arrays are given as a Series object - """ labels, (x0, y0, x1, y1) = _check_vectors("coordinate", x0, y0, x1, y1) @@ -292,7 +303,8 @@ def distance_array(x0, y0, x1, y1, method='euclidean', **kwargs): return result_array -def indexers_for_map_matrix(row_labels, col_labels, superset, check=True): +def indexers_for_map_matrix(row_labels: pd.Series, col_labels: pd.Series, superset: pd.Index, + check: bool = True) -> Tuple[np.ndarray, np.ndarray]: if check: assert np.all(row_labels.isin(superset)) assert np.all(col_labels.isin(superset)) @@ -303,8 +315,10 @@ def indexers_for_map_matrix(row_labels, col_labels, superset, check=True): return row_offsets, col_offsets -def map_to_matrix(values, super_labels, fill_value=0, row_col_labels=None, row_col_offsets=None, out=None, - grouper_func='sum', out_operand='+'): +def map_to_matrix(values: pd.Series, super_labels: pd.Index, fill_value: float = 0, + row_col_labels: Tuple[pd.Series, pd.Series] = None, + row_col_offsets: Tuple[np.ndarray, np.ndarray] = None, out: Union[pd.DataFrame, np.ndarray] = None, + grouper_func: str = 'sum', out_operand: str = '+') -> pd.DataFrame: # TODO: Check that `values` dtype is numeric, or at least, add-able diff --git a/balsa/routines/modelling.pyi b/balsa/routines/modelling.pyi deleted file mode 100644 index 5d06524..0000000 --- a/balsa/routines/modelling.pyi +++ /dev/null @@ -1,37 +0,0 @@ -from typing import Iterable, Union, Tuple, Optional -from pandas import DataFrame, Series, Index -from numpy import ndarray - - -_vector_type = Union[ndarray, Series] - - -def tlfd(values: _vector_type, bin_start: int=0, bin_end: int=200, bin_step: int=2, weights: _vector_type=None, - intrazonal: _vector_type=None, label_type: str='MULTI', include_top: bool=False - ) -> Series: - pass - - -def distance_matrix(x: _vector_type, y: _vector_type, tall: bool=False, method: str='euclidean', - labels0: Union[Iterable, Index]=None, x1: _vector_type=None, y1: 
_vector_type=None, - labels1: Union[Iterable, Index]=None, earth_radius_factor: float=1.0, coord_unit: float=1.0 - ) -> Union[ndarray, Series, DataFrame]: - pass - - -def distance_array(x0: _vector_type, y0: _vector_type, x1: _vector_type, y1: _vector_type, method: str='euclidean', - earth_radius_factor: float=1.0, coord_unit: float=1.0): - pass - -def indexers_for_map_matrix(row_labels: Series, col_labels: Series, superset: Index, check=True - ) -> Tuple[ndarray, ndarray]: - pass - - -def map_to_matrix(values: Series, super_labels: Index, fill_value=0, *, - row_col_labels: Optional[Tuple[Series, Series]] = None, - row_col_offsets: Optional[Tuple[ndarray, ndarray]] = None, - out: Optional[Union[DataFrame, ndarray]] = None, - grouper_func='sum', out_operand: str='+' - ) -> DataFrame: - pass diff --git a/balsa/routines/plotting.py b/balsa/routines/plotting.py index d10db06..ae2c7d5 100644 --- a/balsa/routines/plotting.py +++ b/balsa/routines/plotting.py @@ -1,30 +1,32 @@ -import pandas as pd -import numpy as np from matplotlib import pyplot as plt from matplotlib.axes import Axes -from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter, FuncFormatter +from matplotlib.ticker import FuncFormatter +import numpy as np +import pandas as pd +from pathlib import Path +from typing import Callable, Tuple, Union, List, Dict -def convergence_boxplot(targets, results, filter_func, adjust_target=True, percentage=True, band=None, - simple_labels=True, ax=None, fp=None, title=None): - """ - Measures convergence of constrained location-choice models (such as work-location choice). Can be used to - produce multiple boxplots for different sub-sets of zones, usually based on size. 
+def convergence_boxplot(targets: pd.DataFrame, results: pd.DataFrame, filter_func: Callable[[pd.Series], pd.Series], + adjust_target: bool = True, percentage: bool = True, band: Tuple[float, float] = None, + simple_labels: bool = True, ax: Axes = None, fp: Path = None, title: str = None) -> Axes: + """Measures convergence of constrained location-choice models (such as work-location choice). Can be used to + produce multiple box plots for different sub-sets of zones, usually based on size. Args: - targets: - results: - filter_func: - adjust_target: - percentage: - band: - simple_labels: - ax: - fp: - title: + targets (pandas.DataFrame): + results (pandas.DataFrame): + filter_func (Callable[[pandas.Series], pandas.Series]): + adjust_target (bool, optional): + percentage (bool, optional): + band (Tuple[float, float], optional): + simple_labels (bool, optional): + ax (Axes, optional): + fp (Path, optional): + title (str, optional): Returns: - + matplotlib.Axes """ assert results.columns.equals(targets.columns) @@ -53,7 +55,8 @@ def convergence_boxplot(targets, results, filter_func, adjust_target=True, perce target_sums.append(target_vector.sum()) err = model_vector - target_vector - if percentage: err /= target_vector + if percentage: + err /= target_vector unlabelled_zones[:m, i] = err.values @@ -84,7 +87,8 @@ def convergence_boxplot(targets, results, filter_func, adjust_target=True, perce ax.axhline(lower, color='black', linewidth=1, alpha=0.5) ax.axhline(upper, color='black', linewidth=1, alpha=0.5) - if title: ax.set_title(title) + if title: + ax.set_title(title) if fp is not None: plt.savefig(str(fp)) @@ -92,18 +96,18 @@ def convergence_boxplot(targets, results, filter_func, adjust_target=True, perce return ax -def location_summary(model, target, ensemble_names, title='', fp=None, dpi=150, district_name='Ensemble'): - """ - Creates a compound plot showing total attractions to specified locations +def location_summary(model: pd.DataFrame, target: pd.DataFrame, 
ensemble_names: pd.Series, title: str = '', + fp: Path = None, dpi: int = 150, district_name: str = 'Ensemble') -> Axes: + """Creates a compound plot showing total attractions to specified locations Args: - model: - target: - ensemble_names: - title: - fp: - dpi: - district_name: + model (pandas.DataFrame): + target (pandas.DataFrame): + ensemble_names (pandas.Series): + title (str, optional): + fp (Path, optional): + dpi (int, optional): + district_name (str, optional): Returns: matplotlib.Axes @@ -164,20 +168,21 @@ def location_summary(model, target, ensemble_names, title='', fp=None, dpi=150, return ax -def trumpet_diagram(counts, model_volume, categories=None, category_colours=None, category_markers=None, - label_format=None, title='', y_bounds=(-2, 2), ax=None, x_label="Count volume", legend=True, - **kwargs): - """ - Plots a auto volumes "trumpet" diagram of relative error vs. target count, and will draw min/max error curves based - on FHWA guidelines. Can be used to plot different categories of count locations. +def trumpet_diagram(counts: pd.Series, model_volume: pd.Series, categories: Union[pd.Series, List[pd.Series]] = None, + category_colours: Dict[Union[str, tuple], str] = None, + category_markers: Dict[Union[str, tuple], str] = None, label_format: str = None, title: str = '', + y_bounds: Tuple[float, float] = (-2, 2), ax: Axes = None, x_label: str = "Count volume", + legend: bool = True, **kwargs) -> Axes: + """Plots an auto volumes "trumpet" diagram of relative error vs. target count, and will draw min/max error curves + based on FHWA guidelines. Can be used to plot different categories of count locations. Args: - counts (pandas.Series): Target counts. Each item represents a different count location. Index does not need to be - unique. + counts (pandas.Series): Target counts. Each item represents a different count location. Index does not need to + be unique. model_volume (pandas.Series): Modelled volumes for each location. 
The index must match the counts Series. - categories (Union[pandas.Series, List[pandas.Series]], optional): Defaults to ``None``. Optional classification of each - count location. Must match the index of the count Series. Can be provided as a List of Series (which all - must match the count index) to enable tuple-based categorization. + categories (Union[pandas.Series, List[pandas.Series]], optional): Defaults to ``None``. Optional classification + of each count location. Must match the index of the count Series. Can be provided as a List of Series (which + all must match the count index) to enable tuple-based categorization. category_colours (Dict[Union[str, tuple], str], optional): Defaults to ``None``. Mapping of each category to a colour, specified as a hex string. Only used when categories are provided. Missing categories revert to ``None``, using the default colour for the style. @@ -199,7 +204,6 @@ def trumpet_diagram(counts, model_volume, categories=None, category_colours=None Returns: matplotlib.Axes: The Axes object generated from the plot. For most use cases, this is not really needed. 
- """ assert model_volume.index.equals(counts.index) @@ -207,17 +211,22 @@ def trumpet_diagram(counts, model_volume, categories=None, category_colours=None n_categories = 0 if categories is not None: if isinstance(categories, list): - for s in categories: assert s.index.equals(model_volume.index) - if label_format is None: label_format = '-'.join(['%s'] * len(categories)) + for s in categories: + assert s.index.equals(model_volume.index) + if label_format is None: + label_format = '-'.join(['%s'] * len(categories)) categories = pd.MultiIndex.from_arrays(categories) n_categories = len(categories.unique()) else: assert categories.index.equals(model_volume.index) n_categories = categories.nunique() - if category_colours is None: category_colours = {} - if category_markers is None: category_markers = {} - if label_format is None: label_format = "%s" + if category_colours is None: + category_colours = {} + if category_markers is None: + category_markers = {} + if label_format is None: + label_format = "%s" df = pd.DataFrame({'Model Volume': model_volume, 'Count Volume': counts}) df['Error'] = df['Model Volume'] - df['Count Volume'] @@ -252,6 +261,7 @@ def trumpet_diagram(counts, model_volume, categories=None, category_colours=None ax.set_title(title) ax.set_ylabel("Relative Error") ax.set_xlabel(x_label) - if legend: ax.legend() + if legend: + ax.legend() return ax diff --git a/balsa/routines/plotting.pyi b/balsa/routines/plotting.pyi deleted file mode 100644 index 38e4422..0000000 --- a/balsa/routines/plotting.pyi +++ /dev/null @@ -1,53 +0,0 @@ -from typing import List, Union, Dict, Callable, Tuple, Any, Optional - -from pandas import DataFrame, Series -from matplotlib.axes import Axes - -import six -if six.PY3: - from pathlib import Path as PathType -else: - PathType = str - - -def convergence_boxplot( - targets: DataFrame, - results: DataFrame, - filter_func: Callable[[Series], Series], - adjust_target: bool=True, - percentage: bool=True, - band: Tuple[float, 
float]=None, - simple_labels: bool=True, - ax=None, - fp: str=None, - title: str=None - ) -> Axes: - pass - - -def location_summary( - model: DataFrame, - target: DataFrame, - ensemble_names: Series, - title: str='', - fp: PathType=None, - dpi: int=150, - district_name: str='Ensemble' - ) -> Axes: - pass - - -def trumpet_diagram( - counts: Series, - model_volume: Series, - categories: Union[Series, List[Series]]=None, - category_colours: Dict[Union[Any, tuple]]=None, - category_markers: Dict[Union[Any, tuple]]=None, - label_format: str=None, - title: str='', - y_bounds: Tuple[float, float]=(-2, 2), - ax: Optional[Axes]=None, - alpha: float=1.0, - x_label: str="Count volume" - ) -> Axes: - pass diff --git a/balsa/test/matrices/io/__init__.py b/balsa/test/matrices/io/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/balsa/test/matrices/__init__.py b/balsa/test/routines/__init__.py similarity index 100% rename from balsa/test/matrices/__init__.py rename to balsa/test/routines/__init__.py diff --git a/balsa/test/matrices/routines.py b/balsa/test/routines/test_matrices.py similarity index 73% rename from balsa/test/matrices/routines.py rename to balsa/test/routines/test_matrices.py index 80b4799..d114380 100644 --- a/balsa/test/matrices/routines.py +++ b/balsa/test/routines/test_matrices.py @@ -4,11 +4,7 @@ import pandas as pd from pandas import testing as pdt -from balsa.matrices.routines import matrix_bucket_rounding, aggregate_matrix - -if __name__ == '__main__': - unittest.main() - +from ...routines import matrix_balancing_1d, matrix_balancing_2d, matrix_bucket_rounding, aggregate_matrix class TestMatrixBucketRounding(unittest.TestCase): @@ -26,10 +22,10 @@ def test_return_type(self): # first test, float return b = matrix_bucket_rounding(a, decimals=2) - assert b.dtype == a.dtype + self.assertEqual(b.dtype, a.dtype, "dtype of bucket rounded matrix is not equal to dtype of input matrix") # second test, int return b = matrix_bucket_rounding(a, 
decimals=0) - assert b.dtype == np.dtype('int32') + self.assertEqual(b.dtype, np.dtype('int32'), "dtype of bucket rounded matrix is not integer") def test_large(self): """ Test bucket rounding routine on a large matrix to various levels of rounding precision. """ @@ -47,19 +43,19 @@ def test_pandas_import(self): df_rnd = matrix_bucket_rounding(df, decimals=decimals) self._compare_matrix_sums(df.values, df_rnd.values, decimals) self._compare_matrix_values(df.values, df_rnd.values, decimals) - assert type(df_rnd) == pd.DataFrame + self.assertEqual(type(df_rnd), pd.DataFrame, "dtype of returned matrix is a Pandas DataFrame") def _compare_matrix_sums(self, a, b, decimal): max_error = 0.5*(10.0 ** (-decimal)) a_sum = np.sum(a) b_sum = np.sum(b) - self.assertLessEqual(a_sum, b_sum + max_error) - self.assertGreaterEqual(a_sum, b_sum - max_error) + self.assertLessEqual(a_sum, b_sum + max_error, "Bucket rounded matrix is not within a small margin of error") + self.assertGreaterEqual(a_sum, b_sum - max_error, "Bucket rounded matrix is not within a small margin of error") def _compare_matrix_values(self, a, b, decimal): max_error = 10.0 ** (-decimal) - np.testing.assert_allclose(a, b, atol=max_error, rtol=0.0) - + np.testing.assert_allclose(a, b, atol=max_error, rtol=0.0, + err_msg="Bucket rounded matrix values are not within %f" % (max_error)) class TestAggregateMatrix(unittest.TestCase): @@ -195,5 +191,46 @@ def test_tall_dmatrix(self): row_groups=tall_row_grouper.values, col_groups=tall_col_grouper.values) pdt.assert_series_equal(expected_result, test3, check_dtype=False, check_names=False) +class TestMatrixBalancing(unittest.TestCase): + def setUp(self): + self._square_matrix = np.random.uniform(0, 1000, (5, 5)) + self._1darray = np.random.uniform(0, 1000, 5) + + def test_1d_balance(self): + axes = [0, 1] + for ax in axes: + test = matrix_balancing_1d(self._square_matrix, self._1darray, ax) + self.assertAlmostEqual(test.sum(), self._1darray.sum(), places=5) + 
pdt.assert_series_equal(pd.Series(np.sum(test, ax)), pd.Series(self._1darray)) + + def test_2d_balance_matched_total(self): + row = self._1darray + column = np.roll(self._1darray, 2) + + test = matrix_balancing_2d(self._square_matrix, row, column, rel_error=0.000001) + pdt.assert_series_equal(pd.Series(np.sum(test[0], 1)), pd.Series(row), check_less_precise=True) + pdt.assert_series_equal(pd.Series(np.sum(test[0], 0)), pd.Series(column), check_less_precise=True) + + def test_2d_balance_average_total(self): + row = self._1darray + column = np.roll(np.sqrt(row), 2) + + test = matrix_balancing_2d(self._square_matrix, row, column, rel_error=0.000001, totals_to_use='average') + self.assertAlmostEqual(test[0].sum().sum(), (row.sum() + column.sum())/2, places=5) + + def test_2d_balance_row_total(self): + row = self._1darray + column = np.sqrt(row) + + test = matrix_balancing_2d(self._square_matrix, row, column, rel_error=0.000001, totals_to_use='rows') + pdt.assert_series_equal(pd.Series(np.sum(test[0], 1)), pd.Series(row), check_less_precise=True) + + def test_2d_balance_col_total(self): + row = self._1darray + column = np.sqrt(row) + + test = matrix_balancing_2d(self._square_matrix, row, column, rel_error=0.000001, totals_to_use='columns') + pdt.assert_series_equal(pd.Series(np.sum(test[0], 0)), pd.Series(column), check_less_precise=True) + if __name__ == '__main__': unittest.main() diff --git a/balsa/version.py b/balsa/version.py new file mode 100644 index 0000000..1a72d32 --- /dev/null +++ b/balsa/version.py @@ -0,0 +1 @@ +__version__ = '1.1.0' diff --git a/conda_recipe/meta.yaml b/conda_recipe/meta.yaml index 79448e4..98d25fa 100644 --- a/conda_recipe/meta.yaml +++ b/conda_recipe/meta.yaml @@ -1,6 +1,8 @@ +{% set data = load_setup_py_data() %} + package: name: wsp-balsa - version: "1.0.0" + version: {{ data.get('version') }} source: path: ../ @@ -10,17 +12,21 @@ build: noarch: python requirements: - build: + host: - python - - setuptools run: - - pandas>=0.21,<0.24 - 
- numpy>=1.15 - numba>=0.35 - numexpr>=2.6 - six>=1.10 - matplotlib>=3.0 + - python + - pandas >=0.21 + - numpy >=1.15 + - numba >=0.35 + - numexpr >=2.6 about: home: https://github.com/wsp-sag/balsa - summary: A collection of Python functions and tools to facilitate travel demand forecasting applications and analyses + license: MIT + summary: Python tools for travel demand forecasting applications and analyses + +extra: + maintainers: + - Brian Cheung + - Peter Kucirek diff --git a/doc/index.rst b/doc/index.rst index 5c85276..503f0d8 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -1,16 +1,21 @@ -Balsa: Common modelling tools -============================= +wsp-balsa: Common modelling tools +================================= -Balsa is a collection of functions and tools for Python to facilitate travel demand forecasting applications and analyses. It is designed to work the the "scientific stack" of Python, namely NumPy, Pandas, and Matplotlib; which are optimized for speed and usability. Most of `balsa` consists of standalone functions; for input/output, for analysis, etc.; as well as a few lightweight class-based data structures for specific applications. +``wsp-balsa`` is a collection of functions and tools for Python to facilitate travel demand forecasting applications and analyses. It is designed to work with the "scientific stack" of Python, namely NumPy, Pandas, and Matplotlib; which are optimized for speed and usability. The package mostly consists of standalone functions; for input/output, for analysis, etc.; as well as a few lightweight class-based data structures for specific applications. -Balsa is published by the Systems Analytics for Policy group inside WSP Canada. +The import statement for the ``wsp-balsa`` package is as follows: + +.. code-block:: python + + import balsa + +``wsp-balsa`` is published by the Systems Analytics for Policy group inside WSP Canada. .. 
toctree:: :maxdepth: 2 :caption: Contents: modules/balsa.routines - modules/balsa.configuration modules/balsa.logging Indices and tables diff --git a/doc/modules/balsa.configuration.rst b/doc/modules/balsa.configuration.rst deleted file mode 100644 index c54e97c..0000000 --- a/doc/modules/balsa.configuration.rst +++ /dev/null @@ -1,10 +0,0 @@ -Configuration -============= - -Contains the `Config` class for working with JSON model configuration files. Designed to allow modellers to use code to specify JSON contents, such as name and types of variables, raising errors if the JSON file is not formatted correctly. Also allows such files to include comments. - -Contents --------- - -.. automodule:: balsa.configuration - :members: diff --git a/setup.py b/setup.py index b0de9fa..21169ab 100644 --- a/setup.py +++ b/setup.py @@ -1,19 +1,31 @@ +from os import path +from pkg_resources import safe_version from setuptools import setup, find_packages +version = {} +with open(path.join(path.dirname(path.realpath(__file__)), 'balsa', 'version.py')) as fp: + exec(fp.read(), {}, version) +version_string = safe_version(version['__version__']) + setup( name='wsp-balsa', - author='wsp', - maintatiner='Peter Kucirek', - maintainer_email='peter.kucirek@wsp.com', - version='1.0', + version=version_string, + description='Python tools for travel demand forecasting applications and analyses', + url='https://github.com/wsp-sag/balsa', + author='WSP', + maintainer='Brian Cheung', + maintainer_email='brian.cheung@wsp.com', + classifiers=[ + 'License :: OSI Approved :: MIT License' + ], packages=find_packages(), install_requires=[ - 'pandas>=0.21, <0.24', + 'pandas>=0.21', 'numpy>=1.15', 'numba>=0.35', - 'numexpr>=2.6', - 'six>=1.10' + 'numexpr>=2.6' ], + python_requires='>=3.5', extras_require={ 'plotting': 'matplotlib>=3.0' }