diff --git a/python/test/perf_metric_test.py b/python/test/perf_metric_test.py
index 0eddb9c50..74abfb1cc 100644
--- a/python/test/perf_metric_test.py
+++ b/python/test/perf_metric_test.py
@@ -97,11 +97,32 @@ def test_auc_perf_metric(self):
         predictions = [1, 2, 3, 4]
         metric = AucPerfMetric(groundtruths, predictions)
         result = metric.evaluate()
-        self.assertAlmostEqual(result['score'], 0.9375, places=6)
+        self.assertAlmostEqual(result['score'], 0.9999999999999999, places=6)
         self.assertAlmostEqual(result['AUC_BW'], 0.9999999999999999, places=6)
         self.assertAlmostEqual(result['AUC_DS'], 0.9375, places=6)
         self.assertAlmostEqual(result['CC_0'], 1.0, places=6)
-        self.assertAlmostEqual(result['THR'], 3.0, places=6)
+        self.assertAlmostEqual(result['THR'], 1.0, places=6)
+
+    @unittest.skipIf(sys.version_info < (3,), reason="For py3 only: py2 uses a different random seed.")
+    def test_auc_perf_multiple_metrics(self):
+        np.random.seed(1)
+        groundtruths = np.random.normal(0, 1.0, [4, 10]) + np.tile(np.array([1, 2, 3, 4]), [10, 1]).T
+        predictions = [[1, 2, 3, 4], [3, 1, 2, 4]]
+        metric = AucPerfMetric(groundtruths, predictions)
+        result = metric.evaluate()
+        self.assertAlmostEqual(result['score'][0], 0.9999999999999999, places=6)
+        self.assertAlmostEqual(result['AUC_BW'][0], 0.9999999999999999, places=6)
+        self.assertAlmostEqual(result['AUC_DS'][0], 0.9375, places=6)
+        self.assertAlmostEqual(result['CC_0'][0], 1.0, places=6)
+        self.assertAlmostEqual(result['THR'][0], 1.0, places=6)
+        self.assertAlmostEqual(result['score'][1], 0.8125, places=6)
+        self.assertAlmostEqual(result['AUC_BW'][1], 0.8125, places=6)
+        self.assertAlmostEqual(result['AUC_DS'][1], 0.6250, places=6)
+        self.assertAlmostEqual(result['CC_0'][1], 0.75, places=6)
+        self.assertAlmostEqual(result['THR'][1], 2, places=6)
+        self.assertAlmostEqual(result['pDS_DL'][0, 1], 0.02746864, places=6)
+        self.assertAlmostEqual(result['pBW_DL'][0, 1], 0.06136883, places=6)
+        self.assertAlmostEqual(result['pCC0_b'][0, 1], 0.03250944, places=6)
 
     def test_auc_metrics_performance(self):
         mat_filepath = VmafConfig.test_resource_path('data_Toyama.mat')
@@ -110,7 +131,7 @@ def test_auc_metrics_performance(self):
         self.assertAlmostEqual(np.float(np.mean(results['AUC_DS'])), 0.69767003960902052, places=6)
         self.assertAlmostEqual(np.float(np.mean(results['AUC_BW'])), 0.94454700301894534, places=6)
         self.assertAlmostEqual(np.float(np.mean(results['CC_0'])), 0.88105386206276415, places=6)
-        self.assertAlmostEqual(np.float(np.mean(results['THR'])), 6.2392849606450556, places=6)
+        self.assertAlmostEqual(np.float(np.mean(results['THR'])), 3.899105581509778, places=6)
 
     def test_respow_perf_metric(self):
         np.random.seed(0)
diff --git a/python/vmaf/core/perf_metric.py b/python/vmaf/core/perf_metric.py
index 41f2eb1a9..3ab10b894 100644
--- a/python/vmaf/core/perf_metric.py
+++ b/python/vmaf/core/perf_metric.py
@@ -173,16 +173,16 @@ def _metrics_performance(objScoDif, signif):
         # end
         pDS_DL = np.ones([M, M])
         for i in range(1, M):
-            for j in range(i+1, M+1):
+            for j in range(i + 1, M + 1):
                 # http://stackoverflow.com/questions/4257394/slicing-of-a-numpy-2d-array-or-how-do-i-extract-an-mxm-submatrix-from-an-nxn-ar
-                pDS_DL[i-1, j-1] = calpvalue(AUC_DS[[i-1, j-1]], C[[[i-1],[j-1]],[i-1, j-1]])
-                pDS_DL[j-1, i-1] = pDS_DL[i-1, j-1]
+                pDS_DL[i - 1, j - 1] = calpvalue(AUC_DS[[i - 1, j - 1]], C[[[i - 1], [j - 1]], [i - 1, j - 1]])
+                pDS_DL[j - 1, i - 1] = pDS_DL[i - 1, j - 1]
 
-        # [pDS_HM,CI_DS] = significanceHM(S, D, AUC_DS);
-        pDS_HM, CI_DS = significanceHM(S, D, AUC_DS)
+        ## [pDS_HM,CI_DS] = significanceHM(S, D, AUC_DS);
+        # pDS_HM, CI_DS = significanceHM(S, D, AUC_DS)
 
         # THR = prctile(D',95);
-        THR = np.percentile(D, 95, axis=1)
+        THR = np.percentile(S, 95, axis=1)
 
 
         # %%%%%%%%%%%%%%%%%%%%%%% Better / Worse %%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -213,7 +213,7 @@ def _metrics_performance(objScoDif, signif):
         L = B.shape[1] + W.shape[1]
         CC_0 = np.zeros(M)
         for m in range(M):
-            CC_0[m] = float(np.sum(B[m,:] > 0) + np.sum(W[m,:] < 0)) / L
+            CC_0[m] = float(np.sum(B[m, :] > 0) + np.sum(W[m, :] < 0)) / L
 
         # % significance calculation
 
@@ -236,18 +236,18 @@ def _metrics_performance(objScoDif, signif):
         pCC0_b = np.ones([M, M])
         # pCC0_F = np.ones([M, M])
         for i in range(1, M):
-            for j in range(i+1, M+1):
-                pBW_DL[i-1, j-1] = calpvalue(AUC_BW[[i-1, j-1]], C[[[i-1],[j-1]],[i-1, j-1]])
-                pBW_DL[j-1, i-1] = pBW_DL[i-1, j-1]
+            for j in range(i + 1, M + 1):
+                pBW_DL[i - 1, j - 1] = calpvalue(AUC_BW[[i - 1, j - 1]], C[[[i - 1], [j - 1]], [i - 1, j - 1]])
+                pBW_DL[j - 1, i - 1] = pBW_DL[i - 1, j - 1]
 
-                pCC0_b[i-1, j-1] = significanceBinomial(CC_0[i-1], CC_0[j-1], L)
-                pCC0_b[j-1, i-1] = pCC0_b[i-1, j-1]
+                pCC0_b[i - 1, j - 1] = significanceBinomial(CC_0[i - 1], CC_0[j - 1], L)
+                pCC0_b[j - 1, i - 1] = pCC0_b[i - 1, j - 1]
 
                 # pCC0_F[i-1, j-1] = fexact(CC_0[i-1]*L, 2*L, CC_0[i-1]*L + CC_0[j-1]*L, L, 'tail', 'b') / 2.0
                 # pCC0_F[j-1, i-1] = pCC0_F[i-1,j]
 
-        # [pBW_HM,CI_BW] = significanceHM(B, W, AUC_BW);
-        pBW_HM,CI_BW = significanceHM(B, W, AUC_BW)
+        # # [pBW_HM,CI_BW] = significanceHM(B, W, AUC_BW);
+        # pBW_HM, CI_BW = significanceHM(B, W, AUC_BW)
 
 
         # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -266,10 +266,10 @@ def _metrics_performance(objScoDif, signif):
         result = {
             'AUC_DS': AUC_DS,
             'pDS_DL': pDS_DL,
-            'pDS_HM': pDS_HM,
+            # 'pDS_HM': pDS_HM,
             'AUC_BW': AUC_BW,
             'pBW_DL': pBW_DL,
-            'pBW_HM': pBW_HM,
+            # 'pBW_HM': pBW_HM,
             'CC_0': CC_0,
             'pCC0_b': pCC0_b,
             # 'pCC0_F': pCC0_F,
@@ -304,7 +304,7 @@ def _signif(a, b):
             n_b = len(b)
             var_a = np.var(a, ddof=1)
             var_b = np.var(b, ddof=1)
-            den = var_a/n_a + var_b/n_b
+            den = var_a / n_a + var_b / n_b
             if den == 0.0:
                 den = 1e-8
             z = (mos_a - mos_b) / np.sqrt(den)
@@ -317,19 +317,41 @@ def _signif(a, b):
         # generate pairs
         N = len(groundtruths)
-        objscodif_mtx = np.zeros([N, N])
         signif_mtx = np.zeros([N, N])
         i = 0
-        for groundtruth, prediction in zip(groundtruths, predictions):
+        for groundtruth in groundtruths:
             j = 0
-            for groundtruth2, prediction2 in zip(groundtruths, predictions):
-                objscodif = prediction - prediction2
+            for groundtruth2 in groundtruths:
                 signif = _signif(groundtruth, groundtruth2)
-                objscodif_mtx[i, j] = objscodif
                 signif_mtx[i, j] = signif
                 j += 1
             i += 1
+        if isinstance(predictions[0], list):
+            M = len(predictions)
+        else:
+            M = 1
+
+        objscodif_all = np.zeros([M, N * N])
+        for metric_idx in range(M):
+            objscodif_mtx = np.zeros([N, N])
+
+            if M > 1:
+                metric_predictions = predictions[metric_idx]
+            else:
+                metric_predictions = predictions
+
+            i = 0
+            for prediction in metric_predictions:
+                j = 0
+                for prediction2 in metric_predictions:
+                    objscodif = prediction - prediction2
+                    objscodif_mtx[i, j] = objscodif
+                    j += 1
+                i += 1
+
+            objscodif_all[metric_idx, :] = objscodif_mtx.reshape(1, N * N)
+
         # import matplotlib.pyplot as plt
         # plt.figure()
         # plt.imshow(objscodif_mtx, interpolation='nearest')
         # plt.colorbar()
         # DisplayConfig.show()
@@ -341,18 +363,26 @@ def _signif(a, b):
         # plt.colorbar()
         # DisplayConfig.show()
 
-        results = cls._metrics_performance(objscodif_mtx.reshape(1, N*N), signif_mtx.reshape(1, N*N))
-
-        # _metrics_performance allows processing multiple objective quality
-        # metrics together. Here we just process one:
-        result = {}
-        for key in results:
-            result[key] = results[key][0]
+        results = cls._metrics_performance(objscodif_all, signif_mtx.reshape(1, N * N))
+        results['score'] = results['AUC_BW']
 
-        result['score'] = result['AUC_DS']
+        if isinstance(predictions[0], list):
+            return results
+        else:
+            result = {}
+            for key in results:
+                result[key] = results[key][0]
+            return result
 
-        return result
+    def _assert_args(self):
+        if isinstance(self.predictions[0], list):
+            for metric in self.predictions:
+                assert len(self.groundtruths) == len(metric), 'The lengths of groundtruth labels and predictions do not match.'
+                for score in metric:
+                    assert isinstance(score, float) or isinstance(score, int), 'Predictions need to be a list of lists of numbers.'
+        else:
+            assert len(self.groundtruths) == len(self.predictions), 'The lengths of groundtruth labels and predictions do not match.'
 
 
 class ResolvingPowerPerfMetric(RawScorePerfMetric):
     """
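Usage sketch (not part of the patch; names and values are taken from the new test_auc_perf_multiple_metrics test above): with this change, AucPerfMetric accepts either a flat list of prediction scores or a list of such lists, one inner list per objective quality metric. In the multi-metric case each result entry is indexed by metric, and the pairwise significance values (pDS_DL, pBW_DL, pCC0_b) become M x M matrices.

    # Evaluate two objective metrics against the same ground-truth scores.
    import numpy as np
    from vmaf.core.perf_metric import AucPerfMetric

    np.random.seed(1)
    # 4 stimuli x 10 subjects of noisy subjective scores centered on [1, 2, 3, 4]
    groundtruths = np.random.normal(0, 1.0, [4, 10]) + np.tile(np.array([1, 2, 3, 4]), [10, 1]).T
    predictions = [[1, 2, 3, 4], [3, 1, 2, 4]]  # one inner list per metric

    result = AucPerfMetric(groundtruths, predictions).evaluate()
    print(result['score'])         # per-metric score (AUC_BW), e.g. ~[1.0, 0.8125] per the test
    print(result['pBW_DL'][0, 1])  # significance of the difference between metric 0 and metric 1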