Merge pull request ME-ICA#2 from smoia/OLS_stats_modification

Modified get_coeffs into get_ls_coeffs, corrected its use in the code.
smoia · Nov 29, 2019 · 2ec3b3c · 2ec3b3c
2 parents 1def734 + ba3aa26
commit 2ec3b3c
Show file tree

Hide file tree

Showing 7 changed files with 92 additions and 71 deletions.
diff --git a/docs/api.rst b/docs/api.rst
@@ -177,7 +177,7 @@ API
    :toctree: generated/
    :template: function.rst
 
-   tedana.stats.get_coeffs
+   tedana.stats.get_ls_coeffs
    tedana.stats.computefeats2
    tedana.stats.getfbounds
 

diff --git a/tedana/io.py b/tedana/io.py
@@ -13,7 +13,7 @@
 from nilearn.image import new_img_like
 
 from tedana import utils
-from tedana.stats import computefeats2, get_coeffs
+from tedana.stats import computefeats2, get_ls_coeffs
 
 LGR = logging.getLogger(__name__)
 RepLGR = logging.getLogger('REPORT')
@@ -47,8 +47,8 @@ def split_ts(data, mmix, mask, comptable):
     """
     acc = comptable[comptable.classification == 'accepted'].index.values
 
-    cbetas = get_coeffs(data - data.mean(axis=-1, keepdims=True),
-                        mmix, mask)
+    cbetas = get_ls_coeffs(data - data.mean(axis=-1, keepdims=True),
+                           mmix, mask)
     betas = cbetas[mask]
     if len(acc) != 0:
         hikts = utils.unmask(betas[:, acc].dot(mmix.T[acc, :]), mask)
@@ -104,7 +104,7 @@ def write_split_ts(data, mmix, mask, comptable, ref_img, suffix=''):
     dmdata = mdata.T - mdata.T.mean(axis=0)
 
     # get variance explained by retained components
-    betas = get_coeffs(dmdata.T, mmix, mask=None)
+    betas = get_ls_coeffs(dmdata.T, mmix, mask=None)
     varexpl = (1 - ((dmdata.T - betas.dot(mmix.T))**2.).sum() /
                (dmdata**2.).sum()) * 100
     LGR.info('Variance explained by ICA decomposition: {:.02f}%'.format(varexpl))
@@ -223,7 +223,7 @@ def writeresults(ts, mask, comptable, mmix, n_vols, ref_img):
 
     write_split_ts(ts, mmix, mask, comptable, ref_img, suffix='OC')
 
-    ts_B = get_coeffs(ts, mmix, mask)
+    ts_B = get_ls_coeffs(ts, mmix, mask)
     fout = filewrite(ts_B, 'betas_OC', ref_img)
     LGR.info('Writing full ICA coefficient feature set: {}'.format(op.abspath(fout)))
 

diff --git a/tedana/metrics/__init__.py b/tedana/metrics/__init__.py
@@ -2,8 +2,8 @@
 # ex: set sts=4 ts=4 sw=4 et:
 
 from .kundu_fit import (
-    dependence_metrics, kundu_metrics, get_coeffs, computefeats2
+    dependence_metrics, kundu_metrics, get_ls_coeffs, computefeats2
 )
 
 __all__ = [
-    'dependence_metrics', 'kundu_metrics', 'get_coeffs', 'computefeats2']
+    'dependence_metrics', 'kundu_metrics', 'get_ls_coeffs', 'computefeats2']
diff --git a/tedana/metrics/kundu_fit.py b/tedana/metrics/kundu_fit.py
@@ -9,7 +9,7 @@
 from scipy import stats
 
 from tedana import io, utils
-from tedana.stats import getfbounds, computefeats2, get_coeffs
+from tedana.stats import getfbounds, computefeats2, get_ls_coeffs
 
 
 LGR = logging.getLogger(__name__)
@@ -107,7 +107,7 @@ def dependence_metrics(catd, tsoc, mmix, t2s, tes, ref_img,
     WTS = computefeats2(tsoc, mmixN, mask=None, normalize=False)
 
     # compute PSC dataset - shouldn't have to refit data
-    tsoc_B = get_coeffs(tsoc_dm, mmix, mask=None)
+    tsoc_B = get_ls_coeffs(tsoc_dm, mmix, mask=None)
     del tsoc_dm
     tsoc_Babs = np.abs(tsoc_B)
     PSC = tsoc_B / tsoc.mean(axis=-1, keepdims=True) * 100
@@ -124,9 +124,9 @@ def dependence_metrics(catd, tsoc, mmix, t2s, tes, ref_img,
     totvar_norm = (WTS**2).sum()
 
     # compute Betas and means over TEs for TE-dependence analysis
-    betas = get_coeffs(utils.unmask(catd, mask),
-                       mmix,
-                       np.repeat(mask[:, np.newaxis], len(tes), axis=1))
+    betas = get_ls_coeffs(utils.unmask(catd, mask),
+                          mmix,
+                          np.repeat(mask[:, np.newaxis], len(tes), axis=1))
     betas = betas[mask, ...]
     n_voxels, n_echos, n_components = betas.shape
     mu = catd.mean(axis=-1, dtype=float)

diff --git a/tedana/stats.py b/tedana/stats.py
@@ -7,19 +7,37 @@
 from scipy import stats
 
 from tedana import utils
+from tedana.due import due, BibTeX, Doi
 
 LGR = logging.getLogger(__name__)
 RepLGR = logging.getLogger('REPORT')
 RefLGR = logging.getLogger('REFERENCES')
 
-@due.dcite(references.T2Z_TRANSFORM,
+T2Z_TRANSFORM = BibTeX("""
+    @article{hughett2007accurate,
+      title={Accurate Computation of the F-to-z and t-to-z Transforms
+             for Large Arguments},
+      author={Hughett, Paul and others},
+      journal={Journal of Statistical Software},
+      volume={23},
+      number={1},
+      pages={1--5},
+      year={2007},
+      publisher={Foundation for Open Access Statistics}
+    }
+    """)
+T2Z_IMPLEMENTATION = Doi('10.5281/zenodo.32508')
+
+
+@due.dcite(T2Z_TRANSFORM,
            description='Introduces T-to-Z transform.')
-@due.dcite(references.T2Z_IMPLEMENTATION,
+@due.dcite(T2Z_IMPLEMENTATION,
            description='Python implementation of T-to-Z transform.')
-
 def t_to_z(t_values, dof):
     """
     From Vanessa Sochat's TtoZ package.
+    Copyright (c) 2015 Vanessa Sochat
+    MIT Licensed
     """
 
     # check if t_values is np.array, and convert if required
@@ -53,6 +71,7 @@ def t_to_z(t_values, dof):
     out[t_values != 0] = z_values
     return out
 
+
 def getfbounds(n_echos):
     """
     Gets F-statistic boundaries based on number of echos
@@ -96,12 +115,14 @@ def computefeats2(data, mmix, mask=None, normalize=True):
         Data in component space
     """
     if data.ndim != 2:
-        raise ValueError('Parameter data should be 2d, not {0}d'.format(data.ndim))
+        raise ValueError('Parameter data should be 2d, not '
+                         '{0}d'.format(data.ndim))
     elif mmix.ndim not in [2]:
         raise ValueError('Parameter mmix should be 2d, not '
                          '{0}d'.format(mmix.ndim))
     elif (mask is not None) and (mask.ndim != 1):
-        raise ValueError('Parameter mask should be 1d, not {0}d'.format(mask.ndim))
+        raise ValueError('Parameter mask should be 1d, not '
+                         '{0}d'.format(mask.ndim))
     elif (mask is not None) and (data.shape[0] != mask.shape[0]):
         raise ValueError('First dimensions (number of samples) of data ({0}) '
                          'and mask ({1}) do not match.'.format(data.shape[0],
@@ -119,22 +140,15 @@ def computefeats2(data, mmix, mask=None, normalize=True):
 
     # get betas and z-values of `data`~`mmix`
     # mmix is normalized internally
-    data_R, data_Z = get_coeffs(data_vn, mmix, mask=None, add_const=False, compute_zvalues=True)
+    _, data_Z = get_ls_coeffs(data_vn, mmix, mask=None, add_const=False,
+                              compute_zvalues=True)
     if data_Z.ndim == 1:
         data_Z = np.atleast_2d(data_Z).T
 
-    # normalize data (only division by std)
-    if normalize:
-        # minus mean and divided by std
-        data_Zm = stats.zscore(data_Z, axis=0)
-        # adding back the mean
-        data_Z = data_Zm + (data_Z.mean(axis=0, keepdims=True) /
-                            data_Z.std(axis=0, keepdims=True))
-
     return data_Z
 
 
-def get_coeffs(data, X, mask=None, add_const=False, compute_zvalues=False, min_df=1):
+def get_ls_coeffs(data, X, mask=None, add_const=False, compute_zvalues=False, min_df=1):
     """
     Performs least-squares fit of `X` against `data`
 
@@ -162,21 +176,23 @@ def get_coeffs(data, X, mask=None, add_const=False, compute_zvalues=False, min_d
 
     """
     if data.ndim not in [2, 3]:
-        raise ValueError('Parameter data should be 2d or 3d, not {0}d'.format(data.ndim))
+        raise ValueError('Parameter data should be 2d or 3d, not '
+                         '{0}d'.format(data.ndim))
     elif X.ndim not in [2]:
         raise ValueError('Parameter X should be 2d, not {0}d'.format(X.ndim))
     elif data.shape[-1] != X.shape[0]:
-        raise ValueError('Last dimension (dimension {0}) of data ({1}) does not '
-                         'match first dimension of '
-                         'X ({2})'.format(data.ndim, data.shape[-1], X.shape[0]))
+        raise ValueError('Last dimension (dimension {0}) of data ({1}) does '
+                         'not match first dimension of X '
+                         '({2})'.format(data.ndim, data.shape[-1], X.shape[0]))
 
     # mask data and flip (time x samples)
     if mask is not None:
         if mask.ndim not in [1, 2]:
-            raise ValueError('Parameter data should be 1d or 2d, not {0}d'.format(mask.ndim))
+            raise ValueError('Parameter data should be 1d or 2d, not '
+                             '{0}d'.format(mask.ndim))
         elif data.shape[0] != mask.shape[0]:
-            raise ValueError('First dimensions of data ({0}) and mask ({1}) do not '
-                             'match'.format(data.shape[0], mask.shape[0]))
+            raise ValueError('First dimensions of data ({0}) and mask ({1}) do'
+                             ' not match'.format(data.shape[0], mask.shape[0]))
         mdata = data[mask, :].T
     else:
         mdata = data.T
@@ -190,25 +206,32 @@ def get_coeffs(data, X, mask=None, add_const=False, compute_zvalues=False, min_d
     if add_const:  # add intercept, if specified
         X = np.column_stack([X, np.ones((len(X), 1))])
 
-    # least squares estimation
-    betas = np.dot(np.linalg.pinv(X),mdata)
+    # least squares estimation: beta = (X^T * X)^(-1) * X^T * mdata
+    # betas is transposed due to backward compatibility with rest of code.
+    betas = np.dot(np.linalg.pinv(X), mdata).T
 
     if compute_zvalues:
         # compute t-values of betas (estimates) and then convert to z-values
         # first compute number of degrees of freedom
         df = mdata.shape[0] - X.shape[1]
         if df == 0:
-            LGR.error('ERROR: No degrees of freedom left in least squares calculation. Stopping!!')
-        else:
-            elif df <= min_df:
-                LGR.warning('Number of degrees of freedom in least-square estimation is less than {}'.format(min_df+1))
-            # compute residual sum of squares (RSS)
-            RSS = np.sum(np.power(mdata - np.dot(X, betas.T),2),axis=0)/df
-            RSS = RSS[:,np.newaxis]
-            C = np.diag(np.linalg.pinv(np.dot(X.T,X)))
-            C = C[:,np.newaxis]
-            std_betas = np.sqrt(np.dot(RSS,C.T))
-            z_values = t_to_z(betas / std_betas,df)
+            LGR.error('ERROR: No degrees of freedom left in least squares '
+                      'calculation. Stopping!!')
+        elif df <= min_df:
+            LGR.warning('Number of degrees of freedom in least-square '
+                        'estimation is less than {}'.format(min_df + 1))
+        # compute sigma:
+        # RSS = sum{[mdata - (X * betas)]^2}
+        # sigma = RSS / Degrees_of_Freedom
+        sigma = np.sum(np.power(mdata - np.dot(X, betas.T), 2), axis=0) / df
+        sigma = sigma[:, np.newaxis]
+        # Copmute std of betas:
+        # C = (X^T * X)_ii^(-1)
+        # std(betas) = sqrt(sigma * C)
+        C = np.diag(np.linalg.pinv(np.dot(X.T, X)))
+        C = C[:, np.newaxis]
+        std_betas = np.sqrt(np.dot(sigma, C.T))
+        z_values = t_to_z(betas / std_betas, df)
 
     if add_const:  # drop beta for intercept, if specified
         betas = betas[:, :-1]
@@ -224,5 +247,3 @@ def get_coeffs(data, X, mask=None, add_const=False, compute_zvalues=False, min_d
         return betas, z_values
     else:
         return betas
-
-
diff --git a/tedana/tests/test_stats.py b/tedana/tests/test_stats.py
@@ -6,7 +6,7 @@
 import random
 
 from tedana.stats import computefeats2
-from tedana.stats import get_coeffs
+from tedana.stats import get_ls_coeffs
 from tedana.stats import getfbounds
 
 
@@ -58,7 +58,7 @@ def test_smoke_computefeats2():
     assert computefeats2(data, mmix, normalize=False) is not None
 
 
-def test_get_coeffs():
+def test_get_ls_coeffs():
     """
     Check least squares coefficients.
     """
@@ -69,26 +69,26 @@ def test_get_coeffs():
     X = np.arange(0, 40)[:, np.newaxis]
     mask = np.array([True, False])
 
-    betas = get_coeffs(data, X, mask=None, add_const=False)
+    betas = get_ls_coeffs(data, X, mask=None, add_const=False)
     betas = np.squeeze(betas)
     assert np.allclose(betas, np.array([5., 5.]))
 
-    betas = get_coeffs(data, X, mask=None, add_const=True)
+    betas = get_ls_coeffs(data, X, mask=None, add_const=True)
     betas = np.squeeze(betas)
     assert np.allclose(betas, np.array([5., 5.]))
 
-    betas = get_coeffs(data, X, mask=mask, add_const=False)
+    betas = get_ls_coeffs(data, X, mask=mask, add_const=False)
     betas = np.squeeze(betas)
     assert np.allclose(betas, np.array([5, 0]))
 
-    betas = get_coeffs(data, X, mask=mask, add_const=True)
+    betas = get_ls_coeffs(data, X, mask=mask, add_const=True)
     betas = np.squeeze(betas)
     assert np.allclose(betas, np.array([5, 0]))
 
 
-def test_break_get_coeffs():
+def test_break_get_ls_coeffs():
     """
-    Ensure that get_coeffs fails when input data do not have the right
+    Ensure that get_ls_coeffs fails when input data do not have the right
     shapes.
     """
     n_samples, n_echos, n_vols, n_comps = 10000, 5, 100, 50
@@ -98,41 +98,41 @@ def test_break_get_coeffs():
 
     data = np.empty((n_samples))
     with pytest.raises(ValueError):
-        get_coeffs(data, X, mask, add_const=False)
+        get_ls_coeffs(data, X, mask, add_const=False)
 
     data = np.empty((n_samples, n_vols))
     X = np.empty((n_vols))
     with pytest.raises(ValueError):
-        get_coeffs(data, X, mask, add_const=False)
+        get_ls_coeffs(data, X, mask, add_const=False)
 
     data = np.empty((n_samples, n_echos, n_vols + 1))
     X = np.empty((n_vols, n_comps))
     with pytest.raises(ValueError):
-        get_coeffs(data, X, mask, add_const=False)
+        get_ls_coeffs(data, X, mask, add_const=False)
 
     data = np.empty((n_samples, n_echos, n_vols))
     mask = np.empty((n_samples, n_echos, n_vols))
     with pytest.raises(ValueError):
-        get_coeffs(data, X, mask, add_const=False)
+        get_ls_coeffs(data, X, mask, add_const=False)
 
     mask = np.empty((n_samples + 1, n_echos))
     with pytest.raises(ValueError):
-        get_coeffs(data, X, mask, add_const=False)
+        get_ls_coeffs(data, X, mask, add_const=False)
 
 
-def test_smoke_get_coeffs():
+def test_smoke_get_ls_coeffs():
     """
-    Ensure that get_coeffs returns outputs with different inputs and optional paramters
+    Ensure that get_ls_coeffs returns outputs with different inputs and optional paramters
     """
     n_samples, _, n_times, n_components = 100, 5, 20, 6
     data_2d = np.random.random((n_samples, n_times))
     x = np.random.random((n_times, n_components))
     mask = np.random.randint(2, size=n_samples)
 
-    assert get_coeffs(data_2d, x) is not None
-    # assert get_coeffs(data_3d, x) is not None TODO: submit an issue for the bug
-    assert get_coeffs(data_2d, x, mask=mask) is not None
-    assert get_coeffs(data_2d, x, add_const=True) is not None
+    assert get_ls_coeffs(data_2d, x) is not None
+    # assert get_ls_coeffs(data_3d, x) is not None TODO: submit an issue for the bug
+    assert get_ls_coeffs(data_2d, x, mask=mask) is not None
+    assert get_ls_coeffs(data_2d, x, add_const=True) is not None
 
 
 def test_getfbounds():

diff --git a/tedana/viz.py b/tedana/viz.py
@@ -9,7 +9,7 @@
 matplotlib.use('AGG')
 import matplotlib.pyplot as plt
 
-from tedana import metrics
+from tedana.stats import get_ls_coeffs
 from tedana.utils import get_spectrum
 
 LGR = logging.getLogger(__name__)
@@ -79,7 +79,7 @@ def write_comp_figs(ts, mask, comptable, mmix, ref_img, out_dir,
         LGR.warning('Provided colormap is not recognized, proceeding with default')
         png_cmap = 'coolwarm'
     # regenerate the beta images
-    ts_B = metrics.get_coeffs(ts, mmix, mask)
+    ts_B = get_ls_coeffs(ts, mmix, mask)
     ts_B = ts_B.reshape(ref_img.shape[:3] + ts_B.shape[1:])
     # trim edges from ts_B array
     ts_B = trim_edge_zeros(ts_B)