diff --git a/EMBEDR/_affinity.py b/EMBEDR/_affinity.py index 1e68fd9..2c73fb9 100644 --- a/EMBEDR/_affinity.py +++ b/EMBEDR/_affinity.py @@ -210,4 +210,20 @@ def GaussianAff_fromPrec(distances, return P - +@jit(nopython=True) +def calculate_kEff(P, row_idx, alpha_nu=0.02): + + kEff_arr = np.zeros_like(row_idx)[:-1] + + for ii in range(len(kEff_arr)): + ## Find the max value in the row. + P_row_max = -1 + for jj in range(row_idx[ii], row_idx[ii + 1]): + if P[jj] > P_row_max: + P_row_max = P[jj] + ## Count the number larger + for jj in range(row_idx[ii], row_idx[ii + 1]): + if P[jj] > (P_row_max * alpha_nu): + kEff_arr[ii] += 1 + + return kEff_arr diff --git a/EMBEDR/callbacks.py b/EMBEDR/callbacks.py index 1aab53b..a34a894 100644 --- a/EMBEDR/callbacks.py +++ b/EMBEDR/callbacks.py @@ -128,7 +128,7 @@ class QuitNoExaggerationPhase(Callback): gradient descent when the relative changes in DKL are less than a specified tolerance. """ - def __init__(self, rel_err_tol=1.e-5, min_iter=250, verbose=False): + def __init__(self, rel_err_tol=1.e-6, min_iter=250, verbose=False): self.iter_count = 0 self.last_log_time = None self.init_DKL = -1.0 diff --git a/EMBEDR/embedr.py b/EMBEDR/embedr.py index 7c74a8b..1cf276e 100644 --- a/EMBEDR/embedr.py +++ b/EMBEDR/embedr.py @@ -1,8 +1,10 @@ import EMBEDR.affinity as aff +from EMBEDR._affinity import calculate_kEff as _calc_kEff_from_sparse import EMBEDR.callbacks as cb import EMBEDR.ees as ees import EMBEDR.nearest_neighbors as nn +import EMBEDR.plotting_utility as putl from EMBEDR.tsne import tSNE_Embed from EMBEDR.umap import _initialize_UMAP_embed as initUMAP import EMBEDR.utility as utl @@ -39,6 +41,7 @@ def __init__(self, # Affinity matrix parameters aff_type="fixed_entropy_gauss", aff_params={}, + kEff_alpha=0.02, # Dimensionality reduction parameters n_components=2, DRA='tsne', @@ -67,20 +70,21 @@ def __init__(self, ## kNN graph parameters self.kNN_metric = kNN_metric self.kNN_alg = kNN_alg - self.kNN_params = kNN_params + self.kNN_params = kNN_params.copy() ## Affinity matrix parameters self.aff_type = aff_type - self.aff_params = aff_params + self.aff_params = aff_params.copy() + self.kEff_alpha = kEff_alpha ## Dimensionality reduction parameters self.n_components = n_components self.DRA = DRA - self.DRA_params = DRA_params + self.DRA_params = DRA_params.copy() ## Embedding statistic parameters self.EES_type = EES_type - self.EES_params = EES_params + self.EES_params = EES_params.copy() self.pVal_type = pVal_type ## Runtime Parameters @@ -139,6 +143,12 @@ def _validate_parameters_without_data(self): if not isinstance(self.aff_params, dict): err_str = f"Input argument `aff_params` must be a dictionary." raise ValueError(err_str) + try: + self.kEff_alpha = float(self.kEff_alpha) + assert self.kEff_alpha > 0 + except (ValueError, AssertionError): + err_str = f"`kEff_alpha` must be a nonnegative float!" + raise ValueError(err_str) ## Dimensionality reduction parameters try: @@ -507,24 +517,14 @@ def _fit(self, null_fit=False): ## If we're using t-SNE to embed or DKL as the EES, we need an ## affinity matrix. if (self.DRA in ['tsne', 't-sne']): - self.data_P = self.get_affinity_matrix(X=self.data_X, - kNN_graph=self.data_kNN) + dP = self.get_affinity_matrix(X=self.data_X, + kNN_graph=self.data_kNN) + self.data_P = dP elif (self.DRA in ['umap']) and (self.EES_type == 'dkl'): - if 'normalization' not in self.aff_params: - self.aff_params['normalization'] = 'local' - if self.verbose >= 5: - print(f"Affinity matrix normalization wasn't set, " - f"setting to be 'local' for use with EES.") - if self.aff_params['normalization'] != 'local': - warn_str = f"WARNING: Setting affinity matrix" - warn_str += f" normalization to 'global' may result in" - warn_str += f" illegal values for the EES. Use aff_params" - warn_str += f"['normalization'] = 'local' instead." - warnings.warn(warn_str) - - self.data_P = self.get_affinity_matrix(X=self.data_X, - kNN_graph=self.data_kNN) + dP = self._get_asym_local_affmat(X=self.data_X, + kNN_graph=self.data_kNN) + self.data_P = dP ## We then need to get the requested embeddings. if (self.DRA in ['tsne', 't-sne']): @@ -581,22 +581,10 @@ def _fit(self, null_fit=False): self.null_P[nNo] = nP elif (self.DRA in ['umap']) and (self.EES_type == 'dkl'): - if 'normalization' not in self.aff_params: - self.aff_params['normalization'] = 'local' - if self.verbose >= 5: - print(f"Affinity matrix normalization wasn't set, " - f"setting to be 'local' for use with EES.") - if self.aff_params['normalization'] != 'local': - warn_str = f"WARNING: Setting affinity matrix" - warn_str += f" normalization to 'global' may result in" - warn_str += f" illegal values for the EES. Use" - warn_str += f" aff_params['normalization'] = 'local'." - if self.verbose >= 4: - warnings.warn(warn_str) - nP = self.get_affinity_matrix(null_X, - kNN_graph=nKNN, - null_fit=null_fit) + nP = self._get_asym_local_affmat(null_X, + kNN_graph=nKNN, + null_fit=null_fit) self.null_P[nNo] = nP ## We then need to get the requested embeddings. @@ -606,7 +594,6 @@ def _fit(self, null_fit=False): aff_mat=nP, null_fit=null_fit) elif (self.DRA in ['umap']): - print(f"WARNING: UMAP has not been implemented!") nY, nEES = self.get_UMAP_embedding(null_X, kNN_graph=nKNN, aff_mat=nP, @@ -1113,6 +1100,32 @@ def _match_affinity_matrix(self, affObj, kNNObj, aff_hdr): aff_name = matching_aff.split("/")[-1] return os.path.join(self.project_dir, self.project_subdir, aff_name) + def _get_asym_local_affmat(self, X, kNN_graph=None, null_fit=False): + + old_aff_params = {} + if self.aff_params is not None: + old_aff_params = self.aff_params.copy() + + ## Force current normalization to be 'local' and symmetry to be False. + self.aff_params.update({"normalization": 'local', + "symmetrize": False}) + + aff_mat = self.get_affinity_matrix(X, kNN_graph=kNN_graph, + null_fit=null_fit) + + ## Return the old affinity matrix parameters + self.aff_params = old_aff_params.copy() + + return aff_mat + + def _recalculate_P(self, affObj, kNN_graph): + if not hasattr(affObj, "P"): + try: + affObj.P = affObj.calculate_affinities(kNN_graph, recalc=True) + except AttributeError: + affObj.P = affObj.calculate_affinities(kNN_graph, recalc=False) + return affObj + def get_tSNE_embedding(self, X, kNN_graph=None, @@ -1133,17 +1146,21 @@ def get_tSNE_embedding(self, aff_mat = self.get_affinity_matrix(X, kNN_graph=kNN_graph, null_fit=null_fit) - # Make sure that an affinity matrix is loaded... - if not hasattr(aff_mat, "P"): - try: - aff_mat.P = aff_mat.calculate_affinities(kNN_graph, - recalc=True) - except AttributeError: - aff_mat.P = aff_mat.calculate_affinities(kNN_graph, - recalc=False) + ## Make sure that an affinity matrix is loaded... + aff_mat = self._recalculate_P(aff_mat, kNN_graph) + ## Initialize the t-SNE object. embObj = self._initialize_tSNE_embed(aff_mat) + ## Get a locally-normed and asymmetric affinity matrix for EES... + local_aff_mat = self._get_asym_local_affmat(X, kNN_graph=kNN_graph, + null_fit=null_fit) + local_aff_mat = self._recalculate_P(local_aff_mat, kNN_graph) + + ## If we haven't calculated kEff, do that here. + if (not hasattr(self, "kEff")) and (not null_fit): + self._calculate_kEff(local_aff_mat) + ## If we're doing file caching... if self.do_cache: @@ -1190,38 +1207,6 @@ def get_tSNE_embedding(self, tmp_embed_arr[ii] = embObj.embedding[:] - ## If the current affinity matrix is not locally normalized... - if aff_mat.normalization != 'local': - - ## Make copy of current affinity parameters - old_aff_params = {} - if self.aff_params is not None: - old_aff_params = self.aff_params.copy() - ## Force current normalization to be 'local' - self.aff_params.update({"normalization": 'local'}) - - ## Load the correct affinity matrix... - local_aff_mat = self.get_affinity_matrix(X, - kNN_graph=kNN_graph, - null_fit=null_fit) - - # Make sure that an affinity matrix is loaded... - if not hasattr(local_aff_mat, "P"): - try: - P = local_aff_mat.calculate_affinities(kNN_graph, - recalc=True) - except AttributeError: - P = local_aff_mat.calculate_affinities(kNN_graph, - recalc=False) - local_aff_mat.P = P - - ## Return the old affinity matrix parameters - self.aff_params = old_aff_params.copy() - - ## ... if it is locally normalized... - else: - local_aff_mat = aff_mat - ## Calculate the EES tmp_EES_arr = self.calculate_EES(local_aff_mat.P, tmp_embed_arr) @@ -1339,7 +1324,7 @@ def load_tSNE_embedding(self, data_type = "Null" if null_fit else "Data" if self.verbose >= 3: print(f"Looking for matching t-SNE embeddings in" - f"the '{data_type}' cache.") + f" the '{data_type}' cache.") emb_hdr = self.project_hdr['Embed_tSNE'][data_type] ## Look in the cache for a matching embedding. @@ -1457,13 +1442,11 @@ def get_UMAP_embedding(self, embObj = self._initialize_UMAP_embed(X, seed) # Make sure that an affinity matrix is loaded... - if not hasattr(aff_mat, "P"): - try: - aff_mat.P = aff_mat.calculate_affinities(kNN_graph, - recalc=True) - except AttributeError: - aff_mat.P = aff_mat.calculate_affinities(kNN_graph, - recalc=False) + aff_mat = self._recalculate_P(aff_mat, kNN_graph) + + ## If we haven't calculated kEff, do that here. + if (not hasattr(self, "kEff")) and (not null_fit): + self._calculate_kEff(aff_mat) ## If we're doing file caching try to load previous embeddings! if self.do_cache: @@ -1761,6 +1744,13 @@ def _match_UMAP_embeds(self, embObj, kNNObj, emb_hdr, null_fit=False): emb_name = matching_embed.split("/")[-1] return os.path.join(self.project_dir, self.project_subdir, emb_name) + def _calculate_kEff(self, affObj): + kEff_arr = _calc_kEff_from_sparse(affObj.P.data, + affObj.P.indptr, + alpha_nu=self.kEff_alpha) + self._kEff = kEff_arr.copy() + self.kEff = np.median(kEff_arr.astype(float)) + def calculate_EES(self, P, Y): if self.EES_type == 'dkl': @@ -1954,7 +1944,7 @@ def plot(self, Y = self.null_Y[embed_2_show] [pVal_cmap, - pVal_cnorm] = utl.make_categ_cmap(change_points=pVal_clr_change) + pVal_cnorm] = putl.make_categ_cmap(change_points=pVal_clr_change) color_bounds = np.linspace(pVal_clr_change[0], pVal_clr_change[-1], @@ -2026,6 +2016,7 @@ def __init__(self, # Affinity matrix parameters aff_type="fixed_entropy_gauss", aff_params={}, + kEff_alpha=0.02, # Dimensionality reduction parameters n_components=2, DRA='tsne', @@ -2142,20 +2133,21 @@ def __init__(self, ## kNN graph parameters self.kNN_metric = kNN_metric self.kNN_alg = kNN_alg.lower() - self.kNN_params = kNN_params + self.kNN_params = kNN_params.copy() ## Affinity matrix parameters self.aff_type = aff_type.lower() - self.aff_params = aff_params + self.aff_params = aff_params.copy() + self.kEff_alpha = kEff_alpha ## Dimensionality reduction parameters self.n_components = int(n_components) self.DRA = DRA.lower() - self.DRA_params = DRA_params + self.DRA_params = DRA_params.copy() ## Embedding statistic parameters self.EES_type = EES_type.lower() - self.EES_params = EES_params + self.EES_params = EES_params.copy() self.pVal_type = pVal_type.lower() self._use_t_stat = bool(use_t_stat_for_opt) @@ -2223,6 +2215,8 @@ def fit(self, X): self.pValues = {} self.data_EES = {} self.null_EES = {} + self.kEff = {} + self._kEff = {} ## Loop over the values of the hyperparameters! for ii, hp in enumerate(self.sweep_values): @@ -2236,6 +2230,7 @@ def fit(self, X): perp = hp else: knn = hp + embObj = EMBEDR(perplexity=perp, n_neighbors=knn, kNN_metric=self.kNN_metric, @@ -2243,6 +2238,7 @@ def fit(self, X): kNN_params=self.kNN_params, aff_type=self.aff_type, aff_params=self.aff_params, + kEff_alpha=self.kEff_alpha, n_components=self.n_components, DRA=self.DRA, DRA_params=self.DRA_params, @@ -2269,6 +2265,8 @@ def fit(self, X): self.data_EES[hp] = embObj.data_EES.copy() self.null_EES[hp] = embObj.null_EES.copy() self.pValues[hp] = embObj.pValues.copy() + self.kEff[hp] = embObj.kEff + self._kEff[hp] = embObj._kEff return @@ -2509,10 +2507,32 @@ def fit_samplewise_optimal(self): optObj.fit(self.data_X) - - self.opt_obj = optObj self.opt_embed = optObj.data_Y[:] self.opt_embed_data_EES = optObj.data_EES[:] self.opt_embed_null_EES = optObj.null_EES[:] self.opt_embed_pValues = optObj.pValues[:] + + def _get_kEff_from_hp(self, hp_value): + sort_idx = np.argsort(self.sweep_values) + x_coords = self.sweep_values[sort_idx] + y_coords = [self.kEff[self.sweep_values[ii]] for ii in sort_idx] + + try: + _ = (hp for hp in hp_value) + except TypeError: + hp_value = [hp_value] + + return utl.interpolate(x_coords, y_coords, np.asarray(perp)).squeeze() + + def _get_hp_from_kEff(self, kEff): + sort_idx = np.argsort(self.sweep_values) + y_coords = self.sweep_values[sort_idx] + x_coords = [self.kEff[self.sweep_values[ii]] for ii in sort_idx] + + try: + _ = (kE for kE in kEff) + except TypeError: + kEff = [kEff] + + return interpolate(x_coords, y_coords, np.asarray(nn)).squeeze() diff --git a/EMBEDR/human_round.py b/EMBEDR/human_round.py index 692d7e3..aed0e5f 100644 --- a/EMBEDR/human_round.py +++ b/EMBEDR/human_round.py @@ -4,6 +4,13 @@ import numpy as np +## The thresholds at which to modify rounding in `human_round` +round_levels = { 5: 5, + 20: 10, + 100: 50, + 500: 100} + + def is_iterable(x): """Check if a variable is iterable""" @@ -13,11 +20,12 @@ def is_iterable(x): except TypeError: return False -## The thresholds at which to modify rounding in `human_round` -round_levels = { 5: 5, - 20: 10, - 100: 50, - 500: 100} + +def unique_logspace(lb, ub, n, dtype=int): + """Return array of up to `n` unique log-spaced ints from `lb` to `ub`.""" + lb, ub = np.log10(lb), np.log10(ub) + return np.unique(np.logspace(lb, ub, n).astype(dtype)) + def human_round(x, round_levels=round_levels, @@ -187,4 +195,4 @@ def generate_perp_arr(n_samples, n_perp=30, round_levels=round_levels, if counter >= 50: break - return perp_arr \ No newline at end of file + return perp_arr diff --git a/EMBEDR/plotting_utility.py b/EMBEDR/plotting_utility.py new file mode 100644 index 0000000..6b51954 --- /dev/null +++ b/EMBEDR/plotting_utility.py @@ -0,0 +1,466 @@ +""" +############################################################################### + Plotting Utility +############################################################################### + + Author: Eric Johnson + Date Created: Monday, March 8, 2021 + Date Edited: Wednesday, October 27 2021 + Email: ericjohnson1.2015@u.northwestern.edu + +############################################################################### + + This file contains a collection of functions that facilitate the + construction of figures for the EMBEDR manuscript. + +############################################################################### +""" +import matplotlib +import matplotlib.pyplot as plt +import matplotlib.colors as mcl +import numpy as np +import os +from os import path +import pandas as pd +import pickle as pkl +import seaborn as sns +import time + + +############################################################################### +## Functions for Figure Objects +############################################################################### + + +def save_figure(fig, + fig_name, + fig_dir="./", + formats=['pdf', 'png', 'svg'], + tight_layout_pad=None, + bbox_inches=None, + transparent=True, + dpi=600, + verbose=True): + """Wrapper to save a given figure using several formats""" + + if verbose: + fmt_str = ", ".join([f"{fmt}" for fmt in formats]) + print(f"\nSaving {fig_name} to file (" + fmt_str + ")") + + if tight_layout_pad is not None: + fig.tight_layout(pad=tight_layout_pad) + + fig_path = path.join(fig_dir, fig_name) + + for fmt in formats: + fig_fmt_path = fig_path + "." + fmt + + if verbose: + print(f"Saving {fig_fmt_path}") + + fig.savefig(fig_fmt_path, + format=fmt, + bbox_inches=bbox_inches, + transparent=transparent, + dpi=dpi) + return + + +def update_tight_bounds(fig, + inner_gs, + outer_gs, + rect=None, + inner_pad=1.08, + w_pad=0, + h_pad=0, + fig_pad=0): + """Update the figure layout. Layout a gridspec inside another gridspec.""" + + fig.tight_layout(pad=fig_pad) + + out_crds = outer_gs.get_position(fig).bounds + + if rect is None: + rect = [out_crds[0], + out_crds[1], + out_crds[0] + out_crds[2], + out_crds[1] + out_crds[3]] + + inner_gs.tight_layout(fig, + rect=rect, + pad=inner_pad, + w_pad=w_pad, + h_pad=h_pad) + + return + +############################################################################### +## Functions for Axes Objects +############################################################################### + + +def make_border_axes(axis, + patch_alpha=0, + xticks=[], + yticks=[], + xticklabels=[], + yticklabels=[], + spine_width=1.25, + spine_color='0.8', + spine_alpha=1.0, + spines_2_show='all', + visible=True): + """Initialize axes with a border.""" + + valid_spines = ['top', 'left', 'bottom', 'right'] + if spines_2_show == 'all': + spines_2_show = valid_spines + + for spine in spines_2_show: + if spine not in valid_spines: + err_str = f"Spine '{spine}' is not a valid spine label! ('top'," + err_str += f" 'left', 'bottom', 'right')" + raise ValueError(err_str) + + for spine in axis.spines: + if spine in spines_2_show: + axis.spines[spine].set_alpha(spine_alpha) + axis.spines[spine].set_linewidth(spine_width) + axis.spines[spine].set_color(spine_color) + else: + axis.spines[spine].set_alpha(0) + axis.spines[spine].set_linewidth(0) + axis.spines[spine].set_color('w') + + axis.patch.set_alpha(patch_alpha) + + if xticks is not None: + axis.set_xticks(xticks) + + if xticklabels is not None: + axis.set_xticks(xticklabels) + + if yticks is not None: + axis.set_yticks(yticks) + + if yticklabels is not None: + axis.set_yticks(yticklabels) + + axis.set_visible(visible) + + return axis + + +def add_panel_number(axis, + number, + edge_pad=15, + number_loc=(None, None), + hw_ratio=None, + corner=('left', 'top'), + **text_kws): + """Add a panel number to the corner of an axis object.""" + + default_kws = {'ha': 'left', + 'va': 'bottom', + 'fontsize': 10, + 'bbox': {'boxstyle': 'round', + 'facecolor': 'w', + 'alpha': 0.5, + 'edgecolor': 'k'}, + 'fontweight': 'bold'} + + ## Get axes position in figure fraction + ax_X0, ax_Y0, ax_wid, ax_hgt = axis.get_position().bounds + + ## Get the axis' figure + fig = axis.get_figure() + + ## Get the coordinates of the axes in DISPLAY UNITS + disp_X0, disp_Y0 = fig.transFigure.transform([ax_X0, ax_Y0]) + + ## Set the height and width of the panel number box using DISPLAY UNITS + width = edge_pad + if hw_ratio is None: + height = width + else: + height = hw_ratio * width + + ## Get the position in AXES FRACTION + p_X0, p_Y0 = axis.transAxes.inverted().transform([width + disp_X0, + height + disp_Y0]) + + ## Depending on the corner, adjust the position and text alignment + if corner[0] == 'right': + p_X0 = 1 - p_X0 + default_kws['ha'] = 'right' + elif corner[0] != 'left': + raise ValueError("corner[0] must be 'left' or 'right'!") + + if corner[1] == 'top': + p_Y0 = 1 - p_Y0 + default_kws['va'] = 'top' + elif corner[1] != 'bottom': + raise ValueError("corner[0] must be 'top' or 'bottom'!") + + ## Get the actual text coordinates + x, y = number_loc + if x is None: + x = 0 + if y is None: + y = 0 + + p_X0 += x + p_Y0 += y + + default_kws.update(text_kws) + + axis.text(p_X0, p_Y0, number, transform=axis.transAxes, **default_kws) + + return axis + + +############################################################################### +## Functions for Colorbars +############################################################################### + + +def make_categ_cmap(change_points=[0, 2, 3, 4, 5], + categorical_cmap=None, + cmap_idx=None, + cmap_dx=0.001, + reverse_last_interval=True): + """Make categorical colormap that fades between colors at specific values. + + This function takes in a list of end points + interior points to set as + the edge of regions on a colormap. The function then returns a new + continuous colormap that transitions between these regions (fades to white, + then changes colors). + + Parameters + ---------- + change_points: Iterable (optional) + The values at which to change between categories. The end-points (the + max and min values to be shown on the colormap) must be supplied so + that if 4 categories are desired, `change_points` must contain 4 + 1 + values. + + categorical_cmap: Seaborn colormap object (optional) + A categorical colormap (list of tuples) to which the intervals between + `change_points` will be mapped. + + cmap_idx: Iterable (optional) + A list of indices that maps the colors in the colormap to the correct + interval. This allows for preset colormaps to be remapped by changing + `cmap_idx` from [0, 1, 2, 3] to [2, 3, 1, 0], for example. + + cmap_dx: float (optional, default=0.001) + Interval at which to interpolate colors. Smaller will make the + colormap seem more continuous, but may have trouble rendering on some + computers. + + reverse_last_interval: bool (optional, default=True) + Flag indicating whether to reverse the interpolation direction on the + last interval. This can be useful to set up a maximal contrast in one + part of the colormap. + """ + + if categorical_cmap is None: + import seaborn as sns + categorical_cmap = sns.color_palette('colorblind') + + ## Set the list of indices to use from the colormap + if cmap_idx is None: + cmap_idx = [4, 0, 3, 2] + list(range(4, len(change_points) - 1)) + + ## Set the base colors for regions of the colormap + colors = [categorical_cmap[idx] for idx in cmap_idx] + + ## Make an appropriate grid of points on which to set colors. + color_grid = [] + for intNo, end in enumerate(change_points[1:]): + color_grid += list(np.arange(change_points[intNo], end, cmap_dx)) + color_grid += [change_points[-1]] + color_grid = np.sort(np.unique(np.asarray(color_grid)).squeeze()) + + ## Initialize the RGB+ array. + out_colors = np.ones((len(color_grid), 4)) + + ## Iterate through the grid, setting interpolated colors for each region. + start_idx = 0 + for intNo, start in enumerate(change_points[:-1]): + + ## Get the number of grid points in this interval + N_ticks = int((change_points[intNo + 1] - start) / cmap_dx) + ## If it's the last interval, add an extra. + if intNo == (len(change_points) - 2): + N_ticks += 1 + + ## Iterate through each of the RGB values. + for jj in range(3): + + ## Base color for each interval + base_color = colors[intNo][jj] + + ## Maximum divergence from the base color. + upper_bound = 0.75 * (1 - base_color) + base_color + + ## Interpolated grid for the interval. + intv_color_grid = np.linspace(base_color, upper_bound, N_ticks) + + ## If we're in the last interval, can reverse the direction + if (intNo == (len(change_points) - 2)) and reverse_last_interval: + intv_color_grid = intv_color_grid[::-1] + + ## Set the colors! + out_colors[start_idx:start_idx + N_ticks, jj] = intv_color_grid + + start_idx += N_ticks + + ## Convert the grids and colors to matplotlib colormaps. + import matplotlib.colors as mcl + out_cmap = mcl.ListedColormap(out_colors) + out_cnorm = mcl.BoundaryNorm(color_grid, out_cmap.N) + + return out_cmap, out_cnorm + + +def make_seq_cmap(color_1, color_2, n_colors=10): + """Make a sequential colormap between two arbitrary colors.""" + + out_colors = np.zeros((n_colors, 3)) + for ii in range(3): + out_colors[:, ii] = np.linspace(color_1[ii], color_2[ii], n_colors) + + return out_colors + + +def cyclic_perm(a): + n = len(a) + b = [[a[i - j] for i in range(n)] for j in range(n)] + return b + + +def wrap_strings(str_arr, line_len=26): + """Automatically break strings into multiple lines.""" + + out_list = [] + for line in str_arr: + + total_char = len(line) + + opt_breaks = int(total_char / line_len) + + words = line.split(" ") + alt_words = [word.split("-") for word in words] + + words = [wrd for word in alt_words for wrd in word] + + front_sum = np.cumsum([len(word) for word in words]) + + new_words = [] + for ii, word in enumerate(words[:-1]): + new_words.append(word + line[front_sum[ii] + ii]) + words = new_words + [words[-1]] + + word_lens = [len(word.strip()) for word in words] + if np.all([wl > line_len for wl in word_lens]): + out_list.append("\n".join(words)) + + out_lines = [] + + for fudgeNo in range(opt_breaks + 1): + out_line = "" + curr_line = "" + break_counter = 0 + for idx, word in enumerate(words): + + if break_counter < opt_breaks: + + if (len(word.strip()) + len(curr_line)) <= line_len: + curr_line += word + out_line += word + + elif break_counter == fudgeNo - 1: + curr_line += word + out_line += word + fudgeNo = -1 + + else: + out_line = out_line.strip() + "\n" + word + curr_line = word + break_counter += 1 + + else: + out_line += word + + if len(out_line.split("\n")) < (opt_breaks + 1): + out_line += (opt_breaks + 1 - len(out_line.split("\n"))) * "\n" + + out_lines.append(out_line) + + words[-1] += " " + for fudgeNo in range(opt_breaks + 1): + out_line = "" + curr_line = "" + break_counter = 0 + for idx, word in enumerate(words[::-1]): + + word = word[::-1] + + if break_counter < opt_breaks: + + if (len(word.strip()) + len(curr_line)) <= line_len: + curr_line += word + out_line += word + + elif break_counter == fudgeNo: + curr_line += word + out_line += word + fudgeNo = -1 + + else: + out_line = out_line.strip() + "\n" + word + curr_line = word + break_counter += 1 + + else: + out_line += word + + if len(out_line.split("\n")) < (opt_breaks + 1): + out_line += (opt_breaks + 1 - len(out_line.split("\n"))) * "\n" + + out_line = out_line[::-1] + + out_line = "\n".join([ll.strip() for ll in out_line.split("\n")]) + + out_lines.append(out_line) + + print(out_lines) + + out_lines = np.asarray(out_lines) + line_lens = np.asarray([[len(word) for word in out_line.split('\n')] + for out_line in out_lines]) + + print(line_lens, line_len) + + all_good = np.asarray([np.all(ll < line_len) for ll in line_lens]) + + if np.sum(all_good) == 1: + out_list.append(out_lines[all_good][0]) + + elif np.any(all_good): + len_col = np.ones((sum(all_good), 1)) * line_len + SSR = np.sum((line_lens[all_good] - len_col)**2, axis=1) + best_idx = np.argmin(SSR) + + out_list.append(out_lines[all_good][best_idx]) + + else: + len_col = np.ones((len(out_lines), 1)) * line_len + SSR = np.sum((line_lens - len_col)**2, axis=1) + best_idx = np.argmin(SSR) + + out_list.append(out_lines[best_idx]) + + return out_list + diff --git a/EMBEDR/utility.py b/EMBEDR/utility.py index 7065c2f..4dda9a2 100644 --- a/EMBEDR/utility.py +++ b/EMBEDR/utility.py @@ -1,12 +1,11 @@ from collections import Counter import numpy as np +import scanpy as sc from time import time - -def unique_logspace(lb, ub, n, dtype=int): - """Return array of up to `n` unique log-spaced ints from `lb` to `ub`.""" - lb, ub = np.log10(lb), np.log10(ub) - return np.unique(np.logspace(lb, ub, n).astype(dtype)) +############################################################################### +## Output logging utilities. +############################################################################### class Timer: @@ -28,6 +27,59 @@ def __exit__(self): print(f"---> Time Elapsed: {dt:.2g} seconds!") +############################################################################### +## Data I/O Functions +############################################################################### +tabula_muris_tissues = ['marrow', 'diaphragm', 'brain_non-myeloid'] + + +def load_data(data_name, + data_dir=None, + dtype=float, + load_metadata=True): + + metadata = None + if data_name.lower() == 'mnist': + + if data_dir is None: + data_dir = "./data/" + + data_path = os.path.join(data_dir, "mnist2500_X.txt") + + X = np.loadtxt(data_path).astype(dtype) + + if load_metadata: + metadata_path = path.join(data_dir, "mnist2500_labels.txt") + metadata = np.loadtxt(metadata_path).astype(int) + + elif data_name.lower() in tabula_muris_tissues: + + if data_dir is None: + data_dir = "./data/tabula-muris/04_facs_processed_data/FACS/" + + data_file = f"Processed_{data_name.title()}.h5ad" + data_path = os.path.join(data_dir, data_file) + + X = sc.read_h5ad(data_path) + + elif data_name.lower() == "ATAC": + + if data_dir is None: + data_dir = "./data/10kPBMC_scATAC/02_processed_data/" + + data_file = f"atac_pbmc_10k_nextgem_preprocessed_data.h5ad" + data_path = os.path.join(data_dir, data_file) + + X = sc.read_h5ad(data_path) + + return X, metadata + + +############################################################################### +## Functions for eCDFs +############################################################################### + + def calculate_eCDF(data, extend=False): """Calculate the x- and y-coordinates of an empirical CDF curve. @@ -55,123 +107,51 @@ def calculate_eCDF(data, extend=False): return vals, CDF -def make_categ_cmap(change_points=[0, 2, 3, 4, 5], - categorical_cmap=None, - cmap_idx=None, - cmap_dx=0.001, - reverse_last_interval=True): - """Make categorical colormap that fades between colors at specific values. - - This function takes in a list of end points + interior points to set as - the edge of regions on a colormap. The function then returns a new - continuous colormap that transitions between these regions (fades to white, - then changes colors). +def get_QQ_vals(data1, data2): + """Align 2 datasets' eCDFs for plotting on QQ-plot.""" + + vals1, CDF1 = get_eCDF(data1, extend=True) + vals2, CDF2 = get_eCDF(data2, extend=True) + + joint_vals = np.msort(np.unique(np.hstack((vals1, vals2)))) + + joint_CDF1 = np.zeros_like(joint_vals) + joint_CDF2 = np.zeros_like(joint_vals) + + id1, id2 = 0, 0 + for ii, val in enumerate(joint_vals): + + joint_CDF1[ii] = CDF1[id1] + if (val in vals1) and (id1 + 1 < len(vals1)): + id1 += 1 + + joint_CDF2[ii] = CDF2[id2] + if (val in vals2) and (id2 + 1 < len(vals2)): + id2 += 1 + + return joint_vals, joint_CDF1, joint_CDF2 + + +def interpolate(x_coords, y_coords, x_arr): + """Interpolate between coordinates to find the y-values at `x_arr`.""" + + x_out = np.zeros_like(x_arr).astype(float) + for ii, x in enumerate(x_arr): + if (x < x_coords[0]) or (x > x_coords[-1]): + err_str = f"Value {x} is outside interpolation regime!" + raise ValueError(err_str) + + if np.any(np.isclose(x, x_coords)): + idx = np.where(np.isclose(x, x_coords))[0] + if len(idx) > 1: + idx = idx[0] + x_out[ii] = (y_coords[idx]).squeeze() + continue + + grid_id = np.searchsorted(x_coords, x, side='right') + x_diff = (x_coords[grid_id] - x_coords[grid_id - 1]) + frac = (x - x_coords[grid_id - 1]) + slope = (y_coords[grid_id] - y_coords[grid_id - 1]) / x_diff + x_out[ii] = (y_coords[grid_id - 1] + slope * frac).squeeze() - Parameters - ---------- - change_points: Iterable (optional) - The values at which to change between categories. The end-points (the - max and min values to be shown on the colormap) must be supplied so - that if 4 categories are desired, `change_points` must contain 4 + 1 - values. - - categorical_cmap: Seaborn colormap object (optional) - A categorical colormap (list of tuples) to which the intervals between - `change_points` will be mapped. - - cmap_idx: Iterable (optional) - A list of indices that maps the colors in the colormap to the correct - interval. This allows for preset colormaps to be remapped by changing - `cmap_idx` from [0, 1, 2, 3] to [2, 3, 1, 0], for example. - - cmap_dx: float (optional, default=0.001) - Interval at which to interpolate colors. Smaller will make the - colormap seem more continuous, but may have trouble rendering on some - computers. - - reverse_last_interval: bool (optional, default=True) - Flag indicating whether to reverse the interpolation direction on the - last interval. This can be useful to set up a maximal contrast in one - part of the colormap. - """ - - if categorical_cmap is None: - import seaborn as sns - categorical_cmap = sns.color_palette('colorblind') - - ## Set the list of indices to use from the colormap - if cmap_idx is None: - cmap_idx = [4, 0, 3, 2] + list(range(4, len(change_points) - 1)) - - ## Set the base colors for regions of the colormap - colors = [categorical_cmap[idx] for idx in cmap_idx] - - ## Make an appropriate grid of points on which to set colors. - color_grid = [] - for intNo, end in enumerate(change_points[1:]): - color_grid += list(np.arange(change_points[intNo], end, cmap_dx)) - color_grid += [change_points[-1]] - color_grid = np.sort(np.unique(np.asarray(color_grid)).squeeze()) - - ## Initialize the RGB+ array. - out_colors = np.ones((len(color_grid), 4)) - - ## Iterate through the grid, setting interpolated colors for each region. - start_idx = 0 - for intNo, start in enumerate(change_points[:-1]): - - ## Get the number of grid points in this interval - N_ticks = int((change_points[intNo + 1] - start) / cmap_dx) - ## If it's the last interval, add an extra. - if intNo == (len(change_points) - 2): - N_ticks += 1 - - ## Iterate through each of the RGB values. - for jj in range(3): - - ## Base color for each interval - base_color = colors[intNo][jj] - - ## Maximum divergence from the base color. - upper_bound = 0.75 * (1 - base_color) + base_color - - ## Interpolated grid for the interval. - intv_color_grid = np.linspace(base_color, upper_bound, N_ticks) - - ## If we're in the last interval, can reverse the direction - if (intNo == (len(change_points) - 2)) and reverse_last_interval: - intv_color_grid = intv_color_grid[::-1] - - ## Set the colors! - out_colors[start_idx:start_idx + N_ticks, jj] = intv_color_grid - - start_idx += N_ticks - - ## Convert the grids and colors to matplotlib colormaps. - import matplotlib.colors as mcl - out_cmap = mcl.ListedColormap(out_colors) - out_cnorm = mcl.BoundaryNorm(color_grid, out_cmap.N) - - return out_cmap, out_cnorm - - -def save_figure(fig, - fig_name, - fig_dir="./Figures", - formats=['pdf', 'png', 'svg'], - do_tight_layout=True, - bbox_inches=None, - transparent=True, - dpi=300): - - if do_tight_layout: - fig.tight_layout() - - fig_path = os.path.join(fig_dir, fig_name) - - for fmt in formats: - fig.savefig(fig_path + "." + fmt, - format=fmt, - bbox_inches=bbox_inches, - transparent=transparent, - dpi=dpi) + return np.asarray(x_out, dtype=float).squeeze()