From 385c37be79209e63d532d2869e1e6124905b835c Mon Sep 17 00:00:00 2001 From: ejohnson643 Date: Wed, 3 Nov 2021 09:16:02 -0500 Subject: [PATCH] Added plotting function for Sweep objects --- EMBEDR/embedr.py | 259 ++++++++++++- EMBEDR/human_round.py | 3 +- .../EMBEDR_Figure_01v1_DimRed_Zoology.py | 348 ++++++++++++++++++ EMBEDR/plotting_utility.py | 266 +++++++++---- EMBEDR/utility.py | 16 +- 5 files changed, 816 insertions(+), 76 deletions(-) diff --git a/EMBEDR/embedr.py b/EMBEDR/embedr.py index d75bcc3..e307c83 100644 --- a/EMBEDR/embedr.py +++ b/EMBEDR/embedr.py @@ -3,6 +3,7 @@ from EMBEDR._affinity import calculate_kEff as _calc_kEff_from_sparse import EMBEDR.callbacks as cb import EMBEDR.ees as ees +from EMBEDR.human_round import * import EMBEDR.nearest_neighbors as nn import EMBEDR.plotting_utility as putl from EMBEDR.tsne import tSNE_Embed @@ -1945,12 +1946,7 @@ def plot(self, else: Y = self.null_Y[embed_2_show] - [pVal_cmap, - pVal_cnorm] = putl.make_categ_cmap(change_points=pVal_clr_change) - - color_bounds = np.linspace(pVal_clr_change[0], - pVal_clr_change[-1], - pVal_cmap.N) + pVal_cmap = putl.CategoricalFadingCMap(change_points=pVal_clr_change) pVals = -np.log10(self.pValues) @@ -1959,8 +1955,8 @@ def plot(self, h_ax = ax.scatter(*Y[sort_idx].T, s=scatter_s, c=pVals[sort_idx], - cmap=pVal_cmap, - norm=pVal_cnorm, + cmap=pVal_cmap.cmap, + norm=pVal_cmap.cnorm, alpha=scatter_alpha, **scatter_kwds) @@ -1977,13 +1973,13 @@ def plot(self, cbar_ax = fig.colorbar(h_ax, ax=ax, cax=cax, - boundaries=color_bounds, + boundaries=pVal_cmap.cnorm.boundaries, ticks=[], **cbar_kwds) cbar_ax.ax.invert_yaxis() if cbar_ticks is None: - cbar_ticks = pVal_clr_change + cbar_ticks = pVal_cmap.change_points cbar_ax.set_ticks(cbar_ticks) if cbar_ticklabels is None: @@ -2538,3 +2534,246 @@ def _get_hp_from_kEff(self, kEff): kEff = [kEff] return interpolate(x_coords, y_coords, np.asarray(nn)).squeeze() + + def plot_sweep(self, + fig=None, + gridspec=None, + box_widths=None, + box_positions=None, + box_notch=True, + box_bootstrap=100, + box_whiskers=(1, 99), + box_color='grey', + box_fliers=None, + box_props=None, + box_hl_color='grey', + box_hl_props=None, + values_2_highlight=None, + xLabel_idx=None, + xLabel=None, + xLabel_size=16, + xLim=None, + pVal_cmap=None, + fig_size=(12, 5), + fig_pad=0.4, + fig_ppad=0.01, + show_borders=False, + bot_wpad=0.0, + bot_hpad=0.0, + cax_ticklabels=None, + cax_width_frac=1.3, + cax_w2h_ratio=0.1): + """Generates scatter plot of embedded data colored by EMBEDR p-value + + Parameters + ---------- + """ + + import matplotlib.gridspec as gs + import matplotlib.pyplot as plt + + hp_array = np.sort(self.sweep_values) + + if box_fliers is None: + box_fliers = {'marker': ".", + 'markeredgecolor': box_color, + 'markersize': 2, + 'alpha': 0.5} + + if box_props is None: + box_props = {'alpha': 0.5, + 'color': box_color, + 'fill': True} + + if box_hl_props is None: + box_hl_props = box_props.copy() + box_hl_props.update({"alpha": 0.9, "color": box_hl_color}) + + if values_2_highlight is None: + values_2_highlight = [] + + box_patches = ['boxes', 'whiskers', 'fliers', 'caps', 'medians'] + + if fig is None: + fig = plt.figure(figsize=fig_size) + + if gridspec is None: + gridspec = fig.add_gridspec(1, 1) + + if pVal_cmap is None: + pVal_cmap = putl.CategoricalFadingCMap() + + ## Set up large axes + spine_alpha = 1 if show_borders else 0 + bot_ax = fig.add_subplot(gridspec[0]) + bot_ax = putl.make_border_axes(bot_ax, xticks=[], yticks=[], + spine_alpha=spine_alpha) + + ## Set up floating bottom gridspec + bot_gs = gs.GridSpec(nrows=1, ncols=1, + wspace=bot_wpad, hspace=bot_hpad) + + ax = putl.make_border_axes(fig.add_subplot(bot_gs[0]), + yticklabels=[], + yticks=-np.sort(pVal_cmap.change_points), + spine_alpha=1) + + putl.update_tight_bounds(fig, bot_gs, gridspec[0], w_pad=bot_wpad, + h_pad=bot_hpad, fig_pad=fig_pad) + + hl_boxes = {} + hl_idx = [] + for hpNo, hpVal in enumerate(hp_array): + + if box_widths is not None: + try: + box_wid = box_widths[hpNo] + except TypeError as err: + box_wid = box_widths + else: + box_wid = 0.8 + + if box_positions is not None: + try: + box_pos = [box_positions[hpNo]] + except TypeError as err: + box_pos = [box_positions] + else: + box_pos = [hpNo] + + if hpVal in values_2_highlight: + box_pps = box_hl_props.copy() + box_col = box_color + hl_idx.append(hpNo) + else: + box_pps = box_props.copy() + box_col = box_hl_color + + box = ax.boxplot(np.log10(self.pValues[hpVal]), + widths=box_wid, + positions=box_pos, + notch=box_notch, + bootstrap=box_bootstrap, + patch_artist=True, + whis=box_whiskers, + boxprops=box_pps, + flierprops=box_fliers) + + for item in box_patches: + plt.setp(box[item], color=box_col) + + if hpVal in values_2_highlight: + hl_boxes[hpVal] = box['boxes'][0] + + if xLabel_idx is None: + if values_2_highlight: + xLabel_idx = [0] + hl_idx + [hpNo] + else: + if len(hp_array) <= 5: + xLabel_idx = np.arange(len(hp_array)) + else: + xLabel_idx = np.linspace(0, len(hp_array), 5) + xLabel_idx = human_round(xLabel_idx) + xLabel_idx = np.asarray(xLabel_idx).astype(int) + + ax.set_xticks(xLabel_idx) + xticks = [f"{int(self.kEff[hp_array[idx]])}" for idx in xLabel_idx] + xticks = human_round(np.asarray(xticks).squeeze()) + ax.grid(which='major', axis='x', alpha=0) + ax.set_xticklabels(xticks) + + if xLim is None: + xLim = [-1, len(hp_array)] + + ax.set_xlabel(r"$ k_{Eff}$", fontsize=xLabel_size, labelpad=0) + ax.set_xlim(*xLim) + + # ax.set_yticks(-np.sort(pVal_cmap.change_points)) + # ax.set_yticklabels([]) + + ax.set_ylim(-pVal_cmap.change_points.max(), + -pVal_cmap.change_points.min()) + + ax.tick_params(pad=-3) + + ## Update the figure again... + putl.update_tight_bounds(fig, bot_gs, gridspec[0], w_pad=bot_wpad, + h_pad=bot_hpad, fig_pad=fig_pad) + + ## Colorbar parameters + if cax_ticklabels is None: + cax_ticklabels = [f"{10.**(-cp):.1e}" + for cp in pVal_cmap.change_points] + + inv_ax_trans = ax.transAxes.inverted() + fig_trans = fig.transFigure + + ## Convert from data to display + min_pVal = np.min([np.log10(self.pValues[hp].min()) + for hp in self.sweep_values]) + min_pVal = np.min([min_pVal, -pVal_cmap.change_points.max()]) + max_pVal = np.max([np.log10(self.pValues[hp].max()) + for hp in self.sweep_values]) + max_pVal = np.min([max_pVal, -pVal_cmap.change_points.min()]) + min_pVal_crds = ax.transData.transform([xLim[0], min_pVal]) + max_pVal_crds = ax.transData.transform([xLim[0], max_pVal]) + + # print(f"min_pVal_crds: {min_pVal_crds}") + # print(f"max_pVal_crds: {max_pVal_crds}") + + ## Convert from display to figure coordinates + cFigX0, cFigY0 = fig.transFigure.inverted().transform(min_pVal_crds) + cFigX1, cFigY1 = fig.transFigure.inverted().transform(max_pVal_crds) + + # print(f"cFig0: {cFigX0:.4f}, {cFigY0:.4f}") + # print(f"cFig1: {cFigX1:.4f}, {cFigY1:.4f}") + + cFig_height = np.abs(cFigY1 - cFigY0) + cFig_width = cax_w2h_ratio * cFig_height + + # print(f"The color bar will be {cFig_width:.4f} x {cFig_height:.4f}") + + cAxX0, cAxY0 = cFigX0 - cax_width_frac * cFig_width, cFigY0 + cAxX1, cAxY1 = cAxX0 + cFig_width, cFigY0 + cFig_height + + ## Convert from Figure back into Axes + [cAxX0, + cAxY0] = inv_ax_trans.transform(fig_trans.transform([cAxX0, cAxY0])) + [cAxX1, + cAxY1] = inv_ax_trans.transform(fig_trans.transform([cAxX1, cAxY1])) + + # print(f"cAx0: {cAxX0:.4f}, {cAxY0:.4f}") + # print(f"cAx1: {cAxX1:.4f}, {cAxY1:.4f}") + + cAx_height = np.abs(cAxY1 - cAxY0) + cAx_width = np.abs(cAxX1 - cAxX0) + + # print(f"The color bar will be {cAx_width:.4f} x {cAx_height:.4f}") + + caxIns = ax.inset_axes([cAxX0, cAxY0, cAx_width, cAx_height]) + caxIns = putl.make_border_axes(caxIns, spine_alpha=0) + + hax = plt.scatter([], [], c=[], s=[], cmap=pVal_cmap.cmap, + norm=pVal_cmap.cnorm) + cAx = fig.colorbar(hax, cax=caxIns, ticks=[], + boundaries=pVal_cmap.cnorm.boundaries) + cAx.ax.invert_yaxis() + + cAx.set_ticks(pVal_cmap.change_points) + cAx.set_ticklabels(cax_ticklabels) + cAx.ax.tick_params(length=0) + cAx.ax.yaxis.set_ticks_position('left') + + cAx.ax.set_ylabel(r"EMBEDR $p$-Value", + fontsize=xLabel_size, + labelpad=2) + cAx.ax.yaxis.set_label_position('left') + + ## Update the figure again... + putl.update_tight_bounds(fig, bot_gs, gridspec[0], w_pad=bot_wpad, + h_pad=bot_hpad, fig_pad=fig_pad) + + return ax + + + diff --git a/EMBEDR/human_round.py b/EMBEDR/human_round.py index aed0e5f..2d66b20 100644 --- a/EMBEDR/human_round.py +++ b/EMBEDR/human_round.py @@ -54,8 +54,7 @@ def human_round(x, Options are 'up', 'down', and 'none'. """ - if not is_iterable(x): - x = np.asarray([x]) + x = np.asarray([x]).squeeze().astype(float) if not inplace: x = x.copy() diff --git a/EMBEDR/plots/EMBEDR_Figure_01v1_DimRed_Zoology.py b/EMBEDR/plots/EMBEDR_Figure_01v1_DimRed_Zoology.py index 17d95c0..59f36c6 100644 --- a/EMBEDR/plots/EMBEDR_Figure_01v1_DimRed_Zoology.py +++ b/EMBEDR/plots/EMBEDR_Figure_01v1_DimRed_Zoology.py @@ -25,9 +25,13 @@ ############################################################################### """ from EMBEDR.embedr import EMBEDR, EMBEDR_sweep +from EMBEDR.human_round import human_round import EMBEDR.plotting_utility as putl +import EMBEDR.utility as utl +import anndata as ad import matplotlib +import matplotlib.gridspec as gs import matplotlib.pyplot as plt import numpy as np import os @@ -37,6 +41,350 @@ warnings.filterwarnings("ignore", message="This figure includes Axes that") warnings.filterwarnings("ignore", message="tight_layout not applied: ") +warnings.filterwarnings("ignore", message="Creating an ndarray from ragged") + + +def _make_figure_grid(fig_size=(7.2, 5.76), + n_rows=2, + n_cols=2, + show_all_borders=False, + wspace=0.005, + hspace=0.01, + spines_2_show='all', + spine_alpha=0.5, + spine_width=1.0): + + back_spine_alpha = 0 + if show_all_borders: + back_spine_alpha = 1 + + fig = plt.figure(figsize=fig_size) + + back_axis = fig.add_subplot(111) + back_axis = putl.make_border_axes(back_axis, spine_alpha=back_spine_alpha) + + main_gs = fig.add_gridspec(nrows=n_rows, ncols=n_cols, + wspace=wspace, hspace=hspace) + + main_axes = [] + for rowNo in range(n_rows): + axes_row = [] + for colNo in range(n_cols): + ax = fig.add_subplot(main_gs[rowNo, colNo]) + ax = putl.make_border_axes(ax, spines_2_show=spines_2_show, + spine_alpha=spine_alpha, + spine_width=spine_width) + axes_row.append(ax) + main_axes.append(axes_row) + + return fig, back_axis, main_gs, main_axes + + +def _add_plot_colored_by_cluster(Y, + labels, + axis, + colors, + sizes, + labels_2_hl, + scatter_alpha=0.2): + + hax = axis.scatter(*Y.T, c=colors, s=sizes, alpha=scatter_alpha) + + for lNo, lab in enumerate(labels_2_hl): + good_idx = (labels == lab).squeeze() + + label_median = np.median(Y[good_idx], axis=0) + + axis.text(*label_median, "{}".format(lNo + 1), fontsize=12, + fontweight='bold', va='center', ha='center') + + return hax + + +def _add_plot_colored_by_var(Y, + labels, + axis, + sizes, + scatter_alpha=0.2, + reverse_label=False): + + sort_idx = np.argsort(labels) + if reverse_label: + sort_idx = sort_idx[::-1] + + hax = axis.scatter(*Y[sort_idx].T, c=labels[sort_idx], s=sizes[sort_idx], + alpha=scatter_alpha) + + return hax + + + + + +def EMBEDR_Figure_01(X, + metadata=None, + data_dir=None, + embedding_params=None, + EMBEDR_params=None, + project_dir="./", + project_name="EMBEDR_Figure_01v1_DimRedZoology", + color_by_cluster=True, + label_name="cell_ontology_class", + labels_2_hl=None, + label_colors=None, + label_sizes=None, + label_params=None, + grid_params=None, + n_rows=2, + n_cols=2, + scatter_alpha=0.2, + title_size=14, + title_pad=-15, + add_panel_numbers=False, + fig_dir="./", + fig_pad=None): + + if metadata is None: + load_metadata = True + + data_name = "" + if isinstance(X, str): + data_name = X.title() + if load_metadata: + X, metadata = utl.load_data(X, data_dir=data_dir) + else: + X = utl.load_data(X, data_dir=data_dir, load_metadata=False) + + if metadata is None: + err_str = f"Metadata must be either loadable with `utl.load_data`" + err_str += f" or provided. Metadata is currently `None`..." + raise ValueError(err_str) + + if embedding_params is None: + embedding_params = [('tSNE', 9), ('UMAP', 15), + ('tSNE', 350), ('UMAP', 400)] + + if EMBEDR_params is None: + EMBEDR_params = {} + + if color_by_cluster: + if label_params is None: + label_params = {} + + [labels, + label_counts, + long_labels, + lab_2_idx_map, + label_cmap] = putl.process_categorical_label(metadata, + label_name, + **label_params) + + if labels_2_hl is None: + labels_2_hl = label_counts.index.values[:10] + + if label_colors is None: + label_colors = [label_cmap[lab_2_idx_map[ll]] if ll in labels_2_hl + else 'lightgrey'for ll in labels] + + if label_sizes is None: + label_sizes = [3 if ll in labels_2_hl else 1 for ll in labels] + + elif label_sizes is None: + label_sizes = 3 * np.ones((len(X))) + + if grid_params is None: + grid_params = {} + + [fig, + back_axis, + main_gs, + main_axes] = _make_figure_grid(n_rows=n_rows, n_cols=n_cols, + **grid_params) + + if EMBEDR_params is None: + EMBEDR_params = {'verbose': 1} + + for algNo, (alg, param) in enumerate(embedding_params): + print(f"Plotting data embedded by {alg} (param = {param})") + + if alg.lower() in ['tsne', 't-sne']: + embObj = EMBEDR(X=X, + perplexity=param, + DRA='tsne', + n_data_embed=1, + n_jobs=-1, + project_name=project_name, + project_dir=project_dir) + Y, _ = embObj.get_tSNE_embedding(X) + kEff = human_round(embObj.kEff) + title = f"t-SNE: " + r"$k_{Eff} \approx $" + f"{kEff:.0f}" + + if not color_by_cluster: + labels = np.log10(embObj._kEff) + + if alg.lower() in ['umap']: + embObj = EMBEDR(X=X, + n_neighbors=param, + DRA='umap', + n_data_embed=1, + n_jobs=-1, + project_name=project_name, + project_dir=project_dir, + **EMBEDR_params) + Y, _ = embObj.get_UMAP_embedding(X) + title = f"UMAP: " + r"$k = $" + f"{param:.0f}" + + if not color_by_cluster: + kNN_graph = embObj.get_kNN_graph(X) + labels = np.log10(kNN_graph.kNN_dst[:, param - 1]) + + rowNo = int(algNo / n_cols) + colNo = int(algNo % n_cols) + axis = main_axes[rowNo][colNo] + + if color_by_cluster: + hax = _add_plot_colored_by_cluster(Y[0], labels, axis, + label_colors, label_sizes, + labels_2_hl, + scatter_alpha=scatter_alpha) + + else: + hax = _add_plot_colored_by_var(Y[0], labels, axis, label_sizes, + scatter_alpha=scatter_alpha) + + cax = fig.colorbar(hax, ax=axis, pad=-0.002, + drawedges=False) + + c_ticks = cax.get_ticks() + cax.set_ticks(c_ticks, ) + c_ticklabels = [f"{int(human_round(10**tck))}" for tck in c_ticks] + cax.set_ticklabels(c_ticklabels) + cax.ax.yaxis.set_tick_params(pad=-0.5) + + if alg.lower() == 'tsne': + cax.set_label(r"Effective Nearest Neighbors, $k_{Eff}$", + labelpad=-3) + if alg.lower() == 'umap': + cax.set_label(r"Distance to $k^{th}$ Neighbor", + labelpad=-3) + + cax.solids.set_edgecolor('face') + fig.canvas.draw() + + axis.set_title(title, fontsize=title_size, pad=title_pad) + ylim = axis.get_ylim() + axis.set_ylim(ylim[0], ylim[1] + 0.1 * (ylim[1] - ylim[0])) + + if color_by_cluster: + text_off = 0 + text_h = 0 + + ax_width = back_axis.get_window_extent().width + ax_height = back_axis.get_window_extent().height / fig.dpi + + pad = 3 + + rect_width = axis.get_window_extent().width / ax_width + + for lNo, lab in enumerate(labels_2_hl): + + if lNo < 5: + x_loc = rect_width / 2 + rect_x = 0 + else: + x_loc = 1 - (rect_width / 2) + rect_x = 1 - rect_width + + if "Slamf1-Negative" in lab: + lab = " ".join(lab.split(" Multipotent ")) + label_str = f"{lNo + 1}: " + lab.title() + + bb = axis.text(x_loc, -0.013 - (lNo % 5) * text_h, + label_str, ha='center', va='top', fontsize=10, + transform=back_axis.transAxes) + + text_h = (bb.get_size() + 2 * pad) / 72. / ax_height + + # if (cNo % 2) == 1: + # text_off -= text_h + + # rect_y = (int(cNo / 2) + 1) * text_h + rect_y = ((lNo % 5) + 1) * text_h + + label_color = label_cmap[lab_2_idx_map[lab]] + + rect = plt.Rectangle((rect_x, -rect_y), + width=rect_width, + height=text_h, + transform=back_axis.transAxes, + zorder=3, + fill=True, + facecolor=label_color, + clip_on=False, + alpha=0.8, + edgecolor='0.8') + + back_axis.add_patch(rect) + + fig.tight_layout() + + if add_panel_numbers: + + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + for rowNo in range(n_rows): + for colNo in range(n_cols): + + axis = main_axes[rowNo][colNo] + letter = letters[rowNo * n_cols + colNo] + + _ = putl.add_panel_number(axis, letter, edge_pad=10) + + fig.tight_layout() + + if color_by_cluster: + fig_base = project_name + f"_{data_name}" + "_ColoredByCluster" + fig_pad = 0.5 if fig_pad is not None else fig_pad + else: + fig_base = project_name + f"_{data_name}" + "_ColoredByVariable" + fig_pad = 3 if fig_pad is not None else fig_pad + + print(fig_base) + ## SAVE FIGURE HERE + putl.save_figure(fig, + fig_base, + fig_dir=fig_dir, + tight_layout_pad=fig_pad) + + return fig + + + + + + + + + + + + + + + + + + + + + + + + + + + + + def make_figure(X, cluster_labels, clusters_2_label=None, label_colors=None, diff --git a/EMBEDR/plotting_utility.py b/EMBEDR/plotting_utility.py index 6b51954..b1fafde 100644 --- a/EMBEDR/plotting_utility.py +++ b/EMBEDR/plotting_utility.py @@ -220,35 +220,42 @@ def add_panel_number(axis, ############################################################################### -## Functions for Colorbars +## Functions for Figure Aesthetics ############################################################################### -def make_categ_cmap(change_points=[0, 2, 3, 4, 5], - categorical_cmap=None, - cmap_idx=None, - cmap_dx=0.001, - reverse_last_interval=True): +class CategoricalFadingCMap(object): """Make categorical colormap that fades between colors at specific values. - This function takes in a list of end points + interior points to set as - the edge of regions on a colormap. The function then returns a new - continuous colormap that transitions between these regions (fades to white, - then changes colors). + This class creates a blended categorical-continuous colormap in which + designated colors are faded to black or white in descrete regions. As an + example, this class is used to create the p-value colorbars in the EMBEDR + plotting functions, where different levels of p-values are given different + colors, but within each level, the color also fades as a p-value goes from + one end of the category to the other. This is useful for situations in + which values have discrete bins into which they can be mapped, but we still + want to see the individual variation in points. + + The default arguments are set to that used by EMBEDR for p-value colorbars. Parameters ---------- - change_points: Iterable (optional) + change_points: Iterable (optional, default=[0, 2, 3, 4, 5]) The values at which to change between categories. The end-points (the max and min values to be shown on the colormap) must be supplied so that if 4 categories are desired, `change_points` must contain 4 + 1 - values. - - categorical_cmap: Seaborn colormap object (optional) - A categorical colormap (list of tuples) to which the intervals between - `change_points` will be mapped. - - cmap_idx: Iterable (optional) + values. These values are in units of the measurement being used to + assign colors to points, i.e. if height is being used to color points, + then change_points might be [1ft, 2ft, 4ft, 6ft], so that there are 3 + height categories: 1-2ft, 2-4ft, and 4-6ft. All values outside this + range will be mapped to the minimum and maximum color of the range + (i.e. a 7ft person would have the same color as a 6ft person). + + base_cmap: Union[str, Iterable of tuples] (optional, default='colorblind') + A categorical colormap (list of tuples) or the name of a Seaborn + colormap to which the intervals between `change_points` will be mapped. + + cmap_idx: Iterable (optional, default=[4, 0, 3, 2]) A list of indices that maps the colors in the colormap to the correct interval. This allows for preset colormaps to be remapped by changing `cmap_idx` from [0, 1, 2, 3] to [2, 3, 1, 0], for example. @@ -256,72 +263,181 @@ def make_categ_cmap(change_points=[0, 2, 3, 4, 5], cmap_dx: float (optional, default=0.001) Interval at which to interpolate colors. Smaller will make the colormap seem more continuous, but may have trouble rendering on some - computers. + computers. If `cmap_dx` < 1, this will be interpreted as an interval + size in the units of `change_points`. If `cmap_dx` > 1, this will + be interpreted as the number of interpolation intervals to calculate + across `change_points`. + + cmap_kwds: dict (optional, default={}) + Other keywords to pass to `matplotlib.colors.ListedColormap` object. + + max_divergence: float (optional, default=0.75) + Maximal distance between white/black and the category color to allow + in each region. Setting to 0 will keep the colors constant in each + category, while setting to 1 will allow the colors to fade entirely to + black or white. reverse_last_interval: bool (optional, default=True) Flag indicating whether to reverse the interpolation direction on the last interval. This can be useful to set up a maximal contrast in one part of the colormap. + + fade_to_white: bool (optional, default=True) + Flag indicating whether the colors should fade to white or black within + a category. """ - if categorical_cmap is None: - import seaborn as sns - categorical_cmap = sns.color_palette('colorblind') + def __init__(self, + change_points=[0, 2, 3, 4, 5], + base_cmap='colorblind', + cmap_idx=None, + cmap_dx=0.001, + cmap_kwds=None, + max_divergence=0.75, + reverse_last_interval=True, + fade_to_white=True): + + self.change_points = change_points + self.base_cmap = base_cmap + self.cmap_idx = cmap_idx + self.cmap_dx = cmap_dx + self.cmap_kwds = cmap_kwds + self.max_divergence = max_divergence + + ## Optional flags + self.reverse_last_interval = reverse_last_interval + self.fade_to_white = fade_to_white + + self._validate_parameters() + + self.cmap, self.cnorm = self.make_cmap() + + def _validate_parameters(self): + + try: + self.change_points = np.unique([el for el in self.change_points]) + self.change_points = np.sort(self.change_points).squeeze() + self.n_categ = len(self.change_points) - 1 + except TypeError as te: + err_str = "Input argument `change_points` is not iterable!" + raise TypeError(err_str) + + if isinstance(self.base_cmap, str): + self.base_cmap = sns.color_palette(self.base_cmap) + else: + try: + _ = self.base_cmap[0] + except TypeError as err: + err_str = err.args[0] + f"\n\n\t Input `base_cmap` could" + err_str += f" not be indexed (_ = cmap[0] failed). Make sure" + err_str += f" `base_cmap` is either a subscriptable colormap" + err_str += f" or an iterable containing colors from which to" + err_str += f" create the categorical colormap." + raise TypeError(err_str) + self._n_base_colors = len(self.base_cmap) + + if self.cmap_idx is None: + self.cmap_idx = [4, 0, 3, 2] + list(range(4, self.n_categ)) + + try: + [el for el in self.cmap_idx] + assert len(self.cmap_idx) == self.n_categ + except TypeError as te: + err_str = "Input argument `change_points` is not iterable!" + raise TypeError(err_str) + except AssertionError as ae: + err_str = f"Input size of `cmap_idx` does not map the number of" + err_str += f" categories indicated by `change_points`" + err_str += f" ({self.n_categ} != {len(self.cmap_idx)}). There must" + err_str += f" be one index in `cmap_idx` for each category." + raise ValueError(err_str) - ## Set the list of indices to use from the colormap - if cmap_idx is None: - cmap_idx = [4, 0, 3, 2] + list(range(4, len(change_points) - 1)) + try: + self.cmap_dx = float(self.cmap_dx) + assert self.cmap_dx > 0 + except (AssertionError, ValueError) as err: + err_str = f"Input argument `cmap_dx` must be a positive float." + raise ValueError(err_str) + + if self.cmap_dx > 1: + self.cmap_dx = ((self.change_points[-1] - self.change_points[0]) + / self.cmap_dx) + + if self.cmap_kwds is None: + self.cmap_kwds = {'name': "EMBEDR p-Values (-log10)"} + err_str = f"Input argument `cmap_kwds` must be a dictionary!" + assert isinstance(self.cmap_kwds, dict), err_str + self.cmap_kwds = self.cmap_kwds.copy() + + try: + self.max_divergence = float(self.max_divergence) + assert 1 > self.max_divergence > 0 + except (AssertionError, ValueError) as err: + err_str = f"Input argument `max_divergence` must be in [0, 1]." + raise ValueError(err_str) - ## Set the base colors for regions of the colormap - colors = [categorical_cmap[idx] for idx in cmap_idx] + self.reverse_last_interval = bool(self.reverse_last_interval) + self.fade_to_white = bool(self.fade_to_white) - ## Make an appropriate grid of points on which to set colors. - color_grid = [] - for intNo, end in enumerate(change_points[1:]): - color_grid += list(np.arange(change_points[intNo], end, cmap_dx)) - color_grid += [change_points[-1]] - color_grid = np.sort(np.unique(np.asarray(color_grid)).squeeze()) + def make_cmap(self): - ## Initialize the RGB+ array. - out_colors = np.ones((len(color_grid), 4)) + ## Get the base colors of the categories. + self.base_colors = [self.base_cmap[idx] for idx in self.cmap_idx] - ## Iterate through the grid, setting interpolated colors for each region. - start_idx = 0 - for intNo, start in enumerate(change_points[:-1]): + ## Make an appropriate grid of points on which to set colors. + color_grid = [] + for start, end in zip(self.change_points[:-1], self.change_points[1:]): + color_grid += list(np.arange(start, end, self.cmap_dx)) + color_grid += [self.change_points[-1]] + ## This checks that we didn't double up any grid points. + color_grid = np.sort(np.unique(np.asarray(color_grid)).squeeze()) - ## Get the number of grid points in this interval - N_ticks = int((change_points[intNo + 1] - start) / cmap_dx) - ## If it's the last interval, add an extra. - if intNo == (len(change_points) - 2): - N_ticks += 1 + ## Initialize the RGB+ array. + final_colors = np.ones((len(color_grid), 4)) - ## Iterate through each of the RGB values. - for jj in range(3): + ## Iterate through the category boundaries, setting interpolated colors + ## for each category. + cat_idx = 0 + for catNo, [start, end] in enumerate(zip(self.change_points[:-1], + self.change_points[1:])): + ## Get the number of grid points in this interval + n_ticks = int((end - start) / self.cmap_dx) + ## If it's the last interval, add an extra. + if end == self.change_points[-1]: + n_ticks += 1 - ## Base color for each interval - base_color = colors[intNo][jj] + ## Iterate through each of the RGB values. + for jj in range(3): - ## Maximum divergence from the base color. - upper_bound = 0.75 * (1 - base_color) + base_color + ## Base color for each interval + base_color = self.base_colors[catNo][jj] - ## Interpolated grid for the interval. - intv_color_grid = np.linspace(base_color, upper_bound, N_ticks) + ## Maximum divergence from the base color. + top = 1 if self.fade_to_white else 0 + color_diff = self.max_divergence * (top - base_color) + upper_bound = color_diff + base_color - ## If we're in the last interval, can reverse the direction - if (intNo == (len(change_points) - 2)) and reverse_last_interval: - intv_color_grid = intv_color_grid[::-1] + ## Interpolated grid for the interval. + intv_color_grid = np.linspace(base_color, upper_bound, n_ticks) - ## Set the colors! - out_colors[start_idx:start_idx + N_ticks, jj] = intv_color_grid + ## If we're in the last interval, can reverse the direction + if catNo == (self.n_categ - 1): + if self.reverse_last_interval: + intv_color_grid = intv_color_grid[::-1] - start_idx += N_ticks + ## Set the colors! + final_colors[cat_idx:cat_idx + n_ticks, jj] = intv_color_grid - ## Convert the grids and colors to matplotlib colormaps. - import matplotlib.colors as mcl - out_cmap = mcl.ListedColormap(out_colors) - out_cnorm = mcl.BoundaryNorm(color_grid, out_cmap.N) + cat_idx += n_ticks - return out_cmap, out_cnorm + ## Convert the grids and colors to matplotlib colormaps. + cmap = mcl.ListedColormap(final_colors, **self.cmap_kwds) + cmap.set_extremes(bad='lightgrey', + under=self.base_colors[0], + over=self.base_colors[-1]) + cnorm = mcl.BoundaryNorm(color_grid, cmap.N) + + return cmap, cnorm def make_seq_cmap(color_1, color_2, n_colors=10): @@ -464,3 +580,29 @@ def wrap_strings(str_arr, line_len=26): return out_list + +def process_categorical_label(metadata, label, cmap='colorblind', + alphabetical_sort=False): + + ## Extract the raw labels + raw_labels = metadata[label].values.copy() + + ## Get the unique labels and their counts + label_counts = metadata[label].value_counts() + unique_labels = label_counts.index.values + + if alphabetical_sort: + unique_labels = np.sort(unique_labels) + + ## Make some nice long labels. + long_labels = np.asarray([f"{ll} (N = {label_counts.loc[ll]:d})" + for ll in unique_labels]) + + ## Make a colormap + if isinstance(cmap, str): + label_cmap = sns.color_palette(cmap, len(unique_labels)) + + lab_2_idx_map = {ll: ii for ii, ll in enumerate(unique_labels)} + + return raw_labels, label_counts, long_labels, lab_2_idx_map, label_cmap + diff --git a/EMBEDR/utility.py b/EMBEDR/utility.py index 4dda9a2..ebbd5ff 100644 --- a/EMBEDR/utility.py +++ b/EMBEDR/utility.py @@ -1,5 +1,7 @@ from collections import Counter import numpy as np +import os +import pandas as pd import scanpy as sc from time import time @@ -49,8 +51,9 @@ def load_data(data_name, X = np.loadtxt(data_path).astype(dtype) if load_metadata: - metadata_path = path.join(data_dir, "mnist2500_labels.txt") + metadata_path = os.path.join(data_dir, "mnist2500_labels.txt") metadata = np.loadtxt(metadata_path).astype(int) + metadata = pd.DataFrame(metadata, columns=['label']) elif data_name.lower() in tabula_muris_tissues: @@ -61,6 +64,9 @@ def load_data(data_name, data_path = os.path.join(data_dir, data_file) X = sc.read_h5ad(data_path) + metadata = X.obs.copy() + + X = X.obsm['X_pca'] elif data_name.lower() == "ATAC": @@ -71,8 +77,14 @@ def load_data(data_name, data_path = os.path.join(data_dir, data_file) X = sc.read_h5ad(data_path) + metadata = X.obs.copy() + + X = X.obsm['lsi'] - return X, metadata + if load_metadata: + return X, metadata + else: + return X ###############################################################################