From e6328c58f5169fcb659359e6fcb0838062d0dc42 Mon Sep 17 00:00:00 2001 From: John Welsh Date: Sat, 29 Feb 2020 13:24:00 -0800 Subject: [PATCH 01/16] find peaks cpp --- trt_pose/plugins/find_peaks.cpp | 141 ++++++++++++++++---------------- trt_pose/plugins/find_peaks.hpp | 22 ++++- trt_pose/plugins/plugins.cpp | 61 ++++++++++++-- trt_pose/plugins/test_all.cpp | 48 +++++++++++ 4 files changed, 189 insertions(+), 83 deletions(-) create mode 100644 trt_pose/plugins/test_all.cpp diff --git a/trt_pose/plugins/find_peaks.cpp b/trt_pose/plugins/find_peaks.cpp index b732d09..7a09b17 100644 --- a/trt_pose/plugins/find_peaks.cpp +++ b/trt_pose/plugins/find_peaks.cpp @@ -1,78 +1,77 @@ #include "find_peaks.hpp" +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) -void find_peaks_out(torch::Tensor counts, torch::Tensor peaks, torch::Tensor input, float threshold, int window_size, int max_count) -{ - auto counts_a = counts.accessor(); - auto peaks_a = peaks.accessor(); - auto input_a = input.accessor(); - - int w = window_size / 2; - int width = input.size(3); - int height = input.size(2); - - for (int b = 0; b < input.size(0); b++) - { - for (int c = 0; c < input.size(1); c++) - { - int count = 0; - auto peaks_a_bc = peaks_a[b][c]; - auto input_a_bc = input_a[b][c]; - - for (int i = 0; i < height && count < max_count; i++) - { - for (int j = 0; j < width && count < max_count; j++) - { - float value = input_a_bc[i][j]; - - if (value < threshold) - continue; - - int ii_min = i - w; - int jj_min = j - w; - int ii_max = i + w + 1; - int jj_max = j + w + 1; - - if (ii_min < 0) ii_min = 0; - if (ii_max > height) ii_max = height; - if (jj_min < 0) jj_min = 0; - if (jj_max > width) jj_max = width; - - // get max - bool is_peak = true; - for (int ii = ii_min; ii < ii_max; ii++) - { - for (int jj = jj_min; jj < jj_max; jj++) - { - if (input_a_bc[ii][jj] > value) { - is_peak = false; - } - } - } - - if (is_peak) { - peaks_a_bc[count][0] = i; - peaks_a_bc[count][1] = j; - count++; - } - } - } - - counts_a[b][c] = count; +void find_peaks_out_hw(int *counts, // 1 + int *peaks, // Mx2 + const float *input, // HxW + const int H, const int W, const int M, + const float threshold, const int window_size) { + int win = window_size / 2; + int count = 0; + + for (int i = 0; i < H && count < M; i++) { + for (int j = 0; j < W && count < M; j++) { + float val = input[i * W + j]; + + // skip if below threshold + if (val < threshold) + continue; + + // compute window bounds + int ii_min = MAX(i - win, 0); + int jj_min = MAX(j - win, 0); + int ii_max = MIN(i + win + 1, H); + int jj_max = MIN(j + win + 1, W); + + // search for larger value in window + bool is_peak = true; + for (int ii = ii_min; ii < ii_max; ii++) { + for (int jj = jj_min; jj < jj_max; jj++) { + if (input[ii * W + jj] > val) { + is_peak = false; + } } + } + + // add peak + if (is_peak) { + peaks[count * 2] = i; + peaks[count * 2 + 1] = j; + count++; + } } + } + + *counts = count; +} + +void find_peaks_out_chw(int *counts, // C + int *peaks, // CxMx2 + const float *input, // CxHxW + const int C, const int H, const int W, const int M, + const float threshold, const int window_size) { + for (int c = 0; c < C; c++) { + int *counts_c = &counts[c]; + int *peaks_c = &peaks[c * M * 2]; + const float *input_c = &input[c * H * W]; + find_peaks_out_hw(counts_c, peaks_c, input_c, H, W, M, threshold, + window_size); + } } -std::vector find_peaks(torch::Tensor input, float threshold, int window_size, int max_count) -{ - auto options = 
torch::TensorOptions()
-        .dtype(torch::kInt32)
-        .layout(torch::kStrided)
-        .device(torch::kCPU)
-        .requires_grad(false);
-    
-    auto counts = torch::zeros({input.size(0), input.size(1)}, options);
-    auto peaks = torch::zeros({input.size(0), input.size(1), max_count, 2}, options);   // valid, i, j
-    find_peaks_out(counts, peaks, input, threshold, window_size, max_count);
-    return {counts, peaks};
-}
\ No newline at end of file
+void find_peaks_out_nchw(int *counts,        // NxC
+                         int *peaks,         // NxCxMx2
+                         const float *input, // NxCxHxW
+                         const int N, const int C, const int H, const int W,
+                         const int M, const float threshold,
+                         const int window_size) {
+  for (int n = 0; n < N; n++) {
+    int *counts_n = &counts[n * C];
+    int *peaks_n = &peaks[n * C * M * 2];
+    const float *input_n = &input[n * C * H * W];
+    find_peaks_out_chw(counts_n, peaks_n, input_n, C, H, W, M, threshold,
+                       window_size);
+  }
+}
diff --git a/trt_pose/plugins/find_peaks.hpp b/trt_pose/plugins/find_peaks.hpp
index e489051..f596bdd 100644
--- a/trt_pose/plugins/find_peaks.hpp
+++ b/trt_pose/plugins/find_peaks.hpp
@@ -1,6 +1,20 @@
-#include <torch/extension.h>
-#include <vector>
+#pragma once
 
+void find_peaks_out_hw(int *counts,        // 1
+                       int *peaks,         // Mx2
+                       const float *input, // HxW
+                       const int H, const int W, const int M,
+                       const float threshold, const int window_size);
 
-void find_peaks_out(torch::Tensor counts, torch::Tensor peaks, torch::Tensor input, float threshold, int window_size, int max_count);
-std::vector<torch::Tensor> find_peaks(torch::Tensor input, float threshold, int window_size, int max_count);
\ No newline at end of file
+void find_peaks_out_chw(int *counts,        // C
+                        int *peaks,         // CxMx2
+                        const float *input, // CxHxW
+                        const int C, const int H, const int W, const int M,
+                        const float threshold, const int window_size);
+
+void find_peaks_out_nchw(int *counts,        // NxC
+                         int *peaks,         // NxCxMx2
+                         const float *input, // NxCxHxW
+                         const int N, const int C, const int H, const int W,
+                         const int M, const float threshold,
+                         const int window_size);
diff --git a/trt_pose/plugins/plugins.cpp b/trt_pose/plugins/plugins.cpp
index 720fc8d..9c0f82f 100644
--- a/trt_pose/plugins/plugins.cpp
+++ b/trt_pose/plugins/plugins.cpp
@@ -1,16 +1,61 @@
-#include <torch/extension.h>
-#include "find_peaks.hpp"
-#include "paf_score_graph.hpp"
-#include "refine_peaks.hpp"
-#include "munkres.hpp"
 #include "connect_parts.hpp"
+#include "find_peaks.hpp"
 #include "generate_cmap.hpp"
 #include "generate_paf.hpp"
+#include "munkres.hpp"
+#include "paf_score_graph.hpp"
+#include "refine_peaks.hpp"
+#include <torch/extension.h>
+#include <vector>
+
+void find_peaks_out_torch(torch::Tensor counts, torch::Tensor peaks,
+                          torch::Tensor input, const float threshold,
+                          const int window_size, const int max_count) {
+  const int N = input.size(0);
+  const int C = input.size(1);
+  const int H = input.size(2);
+  const int W = input.size(3);
+  const int M = max_count;
+
+  // get pointers to tensor data
+  int *counts_ptr = (int *)counts.data_ptr();
+  int *peaks_ptr = (int *)peaks.data_ptr();
+  const float *input_ptr = (const float *)input.data_ptr();
+
+  // find peaks
+  find_peaks_out_nchw(counts_ptr, peaks_ptr, input_ptr, N, C, H, W, M,
+                      threshold, window_size);
+}
+
+std::vector<torch::Tensor> find_peaks_torch(torch::Tensor input,
+                                            const float threshold,
+                                            const int window_size,
+                                            const int max_count) {
+  auto options = torch::TensorOptions()
+                     .dtype(torch::kInt32)
+                     .layout(torch::kStrided)
+                     .device(torch::kCPU)
+                     .requires_grad(false);
+
+  const int N = input.size(0);
+  const int C = input.size(1);
+  const int H = input.size(2);
+  const int W = input.size(3);
+  const int M = 
max_count; + + // create output tensors + auto counts = torch::zeros({N, C}, options); + auto peaks = torch::zeros({N, C, M, 2}, options); + + // find peaks + find_peaks_out_torch(counts, peaks, input, threshold, window_size, max_count); + return {counts, peaks}; +} PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("find_peaks", &find_peaks, "find_peaks"); - m.def("find_peaks_out", &find_peaks_out, "find_peaks_out"); + m.def("find_peaks", &find_peaks_torch, "find_peaks"); + m.def("find_peaks_out", &find_peaks_out_torch, "find_peaks_out"); m.def("paf_score_graph", &paf_score_graph, "paf_score_graph"); m.def("paf_score_graph_out", &paf_score_graph_out, "paf_score_graph_out"); m.def("refine_peaks", &refine_peaks, "refine_peaks"); @@ -21,4 +66,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("assignment_out", &assignment_out, "assignment_out"); m.def("generate_cmap", &generate_cmap, "generate_cmap"); m.def("generate_paf", &generate_paf, "generate_paf"); -} \ No newline at end of file +} diff --git a/trt_pose/plugins/test_all.cpp b/trt_pose/plugins/test_all.cpp new file mode 100644 index 0000000..aaab2bd --- /dev/null +++ b/trt_pose/plugins/test_all.cpp @@ -0,0 +1,48 @@ +#include "find_peaks.hpp" +#include + + +void test_find_peaks_out_hw() +{ + const int N = 1; + const int C = 2; + const int H = 4; + const int W = 4; + const int M = 10; + const float threshold = 2.0; + const int window_size = 3; + + int counts[N * C]; + int peaks[N * C * M * 2]; + float input[N * C * H * W] = { + + 0., 0., 0., 0., + 0., 0., 3., 0., + 0., 0., 0., 0., + 1., 0., 0., 0., + + 0., 0., 0., 0., + 0., 0., 0., 0., + 0., 0., 0., 0., + 0., 0., 0., 0. + + }; + + find_peaks_out_nchw(counts, peaks, input, N, C, H, W, M, threshold, window_size); + + if (counts[0] != 1) { + throw std::runtime_error("Number of peaks should be 1."); + } + if (peaks[0] != 1) { + throw std::runtime_error("Peak i coordinate should be 1."); + } + if (peaks[1] != 2) { + throw std::runtime_error("Peak j coordinate should be 2,"); + } +} + +int main() +{ + test_find_peaks_out_hw(); + return 0; +} From daceb807b236d6a4a4625e262eecde65e0519122 Mon Sep 17 00:00:00 2001 From: John Welsh Date: Sat, 29 Feb 2020 14:26:04 -0800 Subject: [PATCH 02/16] trt_pose cpp --- trt_pose/plugins/refine_peaks.cpp | 140 +++++++++++++++--------------- trt_pose/plugins/refine_peaks.hpp | 40 ++++++++- trt_pose/plugins/test_all.cpp | 58 ++++++++++--- 3 files changed, 149 insertions(+), 89 deletions(-) diff --git a/trt_pose/plugins/refine_peaks.cpp b/trt_pose/plugins/refine_peaks.cpp index 0aa637b..44313c1 100644 --- a/trt_pose/plugins/refine_peaks.cpp +++ b/trt_pose/plugins/refine_peaks.cpp @@ -1,78 +1,76 @@ #include "refine_peaks.hpp" +inline int reflect(int idx, int min, int max) { + if (idx < min) { + return -idx; + } else if (idx >= max) { + return max - (idx - max) - 2; + } else { + return idx; + } +} + +void refine_peaks_out_hw(float *refined_peaks, // Mx2 + const int *counts, // 1 + const int *peaks, // Mx2 + const float *cmap, // HxW + const int H, const int W, const int M, + const int window_size) { + int count = *counts; + int win = window_size / 2; + + for (int m = 0; m < count; m++) { + float *refined_peak = &refined_peaks[m * 2]; + refined_peak[0] = 0.; + refined_peak[1] = 0.; + const int *peak = &peaks[m * 2]; + + int i = peak[0]; + int j = peak[1]; + float weight_sum = 0.; -void refine_peaks_out(torch::Tensor refined_peaks, torch::Tensor counts, torch::Tensor peaks, torch::Tensor cmap, int window_size) -{ - auto refined_peaks_a = refined_peaks.accessor(); - 
auto counts_a = counts.accessor(); - auto peaks_a = peaks.accessor(); - auto cmap_a = cmap.accessor(); - - int w = window_size / 2; - int width = cmap.size(3); - int height = cmap.size(2); - - for (int b = 0; b < cmap.size(0); b++) - { - for (int c = 0; c < cmap.size(1); c++) - { - int count = counts_a[b][c]; - auto refined_peaks_a_bc = refined_peaks_a[b][c]; - auto peaks_a_bc = peaks_a[b][c]; - auto cmap_a_bc = cmap_a[b][c]; - - for (int p = 0; p < count; p++) - { - auto refined_peak = refined_peaks_a_bc[p]; - auto peak = peaks_a_bc[p]; - - int i = peak[0]; - int j = peak[1]; - float weight_sum = 0.0f; - - for (int ii = i - w; ii < i + w + 1; ii++) - { - int ii_idx = ii; - - // reflect index at border - if (ii < 0) ii_idx = -ii; - else if (ii >= height) ii_idx = height - (ii - height) - 2; - - for (int jj = j - w; jj < j + w + 1; jj++) - { - int jj_idx = jj; + for (int ii = i - win; ii < i + win + 1; ii++) { + int ii_idx = reflect(ii, 0, H); + for (int jj = j - win; jj < j + win + 1; jj++) { + int jj_idx = reflect(jj, 0, W); - // reflect index at border - if (jj < 0) jj_idx = -jj; - else if (jj >= width) jj_idx = width - (jj - width) - 2; - - float weight = cmap_a_bc[ii_idx][jj_idx]; - refined_peak[0] += weight * ii; - refined_peak[1] += weight * jj; - weight_sum += weight; - } - } - - refined_peak[0] /= weight_sum; - refined_peak[1] /= weight_sum; - refined_peak[0] += 0.5; - refined_peak[1] += 0.5; - refined_peak[0] /= height; - refined_peak[1] /= width; - } - } + float weight = cmap[ii_idx * W + jj_idx]; + refined_peak[0] += weight * ii; + refined_peak[1] += weight * jj; + weight_sum += weight; + } } + + refined_peak[0] /= weight_sum; + refined_peak[1] /= weight_sum; + refined_peak[0] += 0.5; // center pixel + refined_peak[1] += 0.5; // center pixel + refined_peak[0] /= H; // normalize coordinates + refined_peak[1] /= W; // normalize coordinates + } +} + +void refine_peaks_out_chw(float *refined_peaks, // CxMx2 + const int *counts, // C + const int *peaks, // CxMx2 + const float *cmap, const int C, const int H, + const int W, const int M, const int window_size) { + for (int c = 0; c < C; c++) { + refine_peaks_out_hw(&refined_peaks[c * M * 2], &counts[c], + &peaks[c * M * 2], &cmap[c * H * W], H, W, M, + window_size); + } } -torch::Tensor refine_peaks(torch::Tensor counts, torch::Tensor peaks, torch::Tensor cmap, int window_size) -{ - auto options = torch::TensorOptions() - .dtype(torch::kFloat32) - .layout(torch::kStrided) - .device(torch::kCPU) - .requires_grad(false); - - auto refined_peaks = torch::zeros({peaks.size(0), peaks.size(1), peaks.size(2), peaks.size(3)}, options); - refine_peaks_out(refined_peaks, counts, peaks, cmap, window_size); - return refined_peaks; -} \ No newline at end of file +void refine_peaks_out_nchw(float *refined_peaks, // NxCxMx2 + const int *counts, // NxC + const int *peaks, // NxCxMx2 + const float *cmap, const int N, const int C, + const int H, const int W, const int M, + const int window_size) { + for (int n = 0; n < N; n++) { + refine_peaks_out_chw(&refined_peaks[n * C * M * 2], &counts[n * C], + &peaks[n * C * M * 2], &cmap[n * C * H * W], C, H, W, + M, window_size); + } +} diff --git a/trt_pose/plugins/refine_peaks.hpp b/trt_pose/plugins/refine_peaks.hpp index 0009d94..ddd86fe 100644 --- a/trt_pose/plugins/refine_peaks.hpp +++ b/trt_pose/plugins/refine_peaks.hpp @@ -1,6 +1,38 @@ -#include -#include +#pragma once +void refine_peaks_out_hw( + float *refined_peaks, + const int *counts, + const int *peaks, + const float *cmap, + const int H, + const int 
W, + const int M, + const int window_size +); -void refine_peaks_out(torch::Tensor refined_peaks, torch::Tensor counts, torch::Tensor peaks, torch::Tensor cmap, int window_size); -torch::Tensor refine_peaks(torch::Tensor counts, torch::Tensor peaks, torch::Tensor cmap, int window_size); \ No newline at end of file +void refine_peaks_out_chw( + float *refined_peaks, + const int *counts, + const int *peaks, + const float *cmap, + const int C, + const int H, + const int W, + const int M, + const int window_size +); + + +void refine_peaks_out_nchw( + float *refined_peaks, + const int *counts, + const int *peaks, + const float *cmap, + const int N, + const int C, + const int H, + const int W, + const int M, + const int window_size +); diff --git a/trt_pose/plugins/test_all.cpp b/trt_pose/plugins/test_all.cpp index aaab2bd..190dc5c 100644 --- a/trt_pose/plugins/test_all.cpp +++ b/trt_pose/plugins/test_all.cpp @@ -1,36 +1,31 @@ -#include "find_peaks.hpp" #include +#include "find_peaks.hpp" +#include "refine_peaks.hpp" +#define ABS(x) ((x) > 0 ? (x) : (-x)) void test_find_peaks_out_hw() { - const int N = 1; - const int C = 2; const int H = 4; const int W = 4; const int M = 10; const float threshold = 2.0; const int window_size = 3; - int counts[N * C]; - int peaks[N * C * M * 2]; - float input[N * C * H * W] = { + int counts; + int peaks[M * 2]; + const float input[H * W] = { 0., 0., 0., 0., 0., 0., 3., 0., 0., 0., 0., 0., - 1., 0., 0., 0., - - 0., 0., 0., 0., - 0., 0., 0., 0., - 0., 0., 0., 0., - 0., 0., 0., 0. + 1., 0., 0., 0. }; - find_peaks_out_nchw(counts, peaks, input, N, C, H, W, M, threshold, window_size); + find_peaks_out_hw(&counts, peaks, input, H, W, M, threshold, window_size); - if (counts[0] != 1) { + if (counts != 1) { throw std::runtime_error("Number of peaks should be 1."); } if (peaks[0] != 1) { @@ -41,8 +36,43 @@ void test_find_peaks_out_hw() } } +void test_refined_peaks_out_hw() +{ + const int H = 4; + const int W = 4; + const int M = 1; + const int window_size = 3; + + const int counts = 1; + const int peaks[M * 2] = { 1, 2 }; + const float cmap[H * W] = { + + 0., 0., 1., 0., + 0., 2., 3., 1., + 0., 0., 2., 0., + 0., 0., 0., 0. + + }; + const float i_true = (0.5 + (1. * 0 + 2. * 1 + 3. * 1 + 1. * 1 + 2. * 2) / 9.) / H; + const float j_true = (0.5 + (2. * 1 + 1. * 2 + 3. * 2 + 2. * 2 + 1. * 3) / 9.) 
/ W; + const float tolerance = 1e-5; + + float refined_peaks[M * 2]; + + refine_peaks_out_hw(refined_peaks, &counts, peaks, cmap, H, W, M, window_size); + + if (ABS(refined_peaks[0] - i_true) > tolerance) { + throw std::runtime_error("i coordinate incorrect"); + } + if (ABS(refined_peaks[1] - j_true) > tolerance) { + throw std::runtime_error("j coordinate incorrect"); + } + +} + int main() { test_find_peaks_out_hw(); + test_refined_peaks_out_hw(); return 0; } From 6d3b3fc7b7594c66bb8ffd76b40cc52cac2bce72 Mon Sep 17 00:00:00 2001 From: John Welsh Date: Sat, 29 Feb 2020 14:49:30 -0800 Subject: [PATCH 03/16] refine peaks cpp --- trt_pose/plugins/plugins.cpp | 39 ++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/trt_pose/plugins/plugins.cpp b/trt_pose/plugins/plugins.cpp index 9c0f82f..5985580 100644 --- a/trt_pose/plugins/plugins.cpp +++ b/trt_pose/plugins/plugins.cpp @@ -53,13 +53,48 @@ std::vector find_peaks_torch(torch::Tensor input, return {counts, peaks}; } +void refine_peaks_out_torch(torch::Tensor refined_peaks, torch::Tensor counts, torch::Tensor peaks, torch::Tensor cmap, int window_size) +{ + const int N = cmap.size(0); + const int C = cmap.size(1); + const int H = cmap.size(2); + const int W = cmap.size(3); + const int M = peaks.size(2); + + refine_peaks_out_nchw( + (float *) refined_peaks.data_ptr(), + (const int*) counts.data_ptr(), + (const int*) peaks.data_ptr(), + (const float*) cmap.data_ptr(), + N, + C, + H, + W, + M, + window_size + ); +} + +torch::Tensor refine_peaks_torch(torch::Tensor counts, torch::Tensor peaks, torch::Tensor cmap, int window_size) +{ + auto options = torch::TensorOptions() + .dtype(torch::kFloat32) + .layout(torch::kStrided) + .device(torch::kCPU) + .requires_grad(false); + + auto refined_peaks = torch::zeros({peaks.size(0), peaks.size(1), peaks.size(2), peaks.size(3)}, options); + refine_peaks_out_torch(refined_peaks, counts, peaks, cmap, window_size); + return refined_peaks; +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("find_peaks", &find_peaks_torch, "find_peaks"); m.def("find_peaks_out", &find_peaks_out_torch, "find_peaks_out"); m.def("paf_score_graph", &paf_score_graph, "paf_score_graph"); m.def("paf_score_graph_out", &paf_score_graph_out, "paf_score_graph_out"); - m.def("refine_peaks", &refine_peaks, "refine_peaks"); - m.def("refine_peaks_out", &refine_peaks_out, "refine_peaks_out"); + m.def("refine_peaks", &refine_peaks_torch, "refine_peaks"); + m.def("refine_peaks_out", &refine_peaks_out_torch, "refine_peaks_out"); m.def("munkres", &munkres, "munkres"); m.def("connect_parts", &connect_parts, "connect_parts"); m.def("assignment", &assignment, "assignment"); From 1a7ea4fa342dbe571c066106f9d13adeb7751920 Mon Sep 17 00:00:00 2001 From: John Welsh Date: Sat, 29 Feb 2020 14:51:35 -0800 Subject: [PATCH 04/16] reformatted refine peaks header --- trt_pose/plugins/refine_peaks.hpp | 51 +++++++++++-------------------- 1 file changed, 17 insertions(+), 34 deletions(-) diff --git a/trt_pose/plugins/refine_peaks.hpp b/trt_pose/plugins/refine_peaks.hpp index ddd86fe..fe13d5e 100644 --- a/trt_pose/plugins/refine_peaks.hpp +++ b/trt_pose/plugins/refine_peaks.hpp @@ -1,38 +1,21 @@ #pragma once -void refine_peaks_out_hw( - float *refined_peaks, - const int *counts, - const int *peaks, - const float *cmap, - const int H, - const int W, - const int M, - const int window_size -); +void refine_peaks_out_hw(float *refined_peaks, // Mx2 + const int *counts, // 1 + const int *peaks, // Mx2 + const float *cmap, // 
HxW + const int H, const int W, const int M, + const int window_size); -void refine_peaks_out_chw( - float *refined_peaks, - const int *counts, - const int *peaks, - const float *cmap, - const int C, - const int H, - const int W, - const int M, - const int window_size -); +void refine_peaks_out_chw(float *refined_peaks, // CxMx2 + const int *counts, // C + const int *peaks, // CxMx2 + const float *cmap, const int C, const int H, + const int W, const int M, const int window_size); - -void refine_peaks_out_nchw( - float *refined_peaks, - const int *counts, - const int *peaks, - const float *cmap, - const int N, - const int C, - const int H, - const int W, - const int M, - const int window_size -); +void refine_peaks_out_nchw(float *refined_peaks, // NxCxMx2 + const int *counts, // NxC + const int *peaks, // NxCxMx2 + const float *cmap, const int N, const int C, + const int H, const int W, const int M, + const int window_size); From fe9ad40d8c8ef079b0d188d5e632b60750131370 Mon Sep 17 00:00:00 2001 From: John Welsh Date: Sat, 29 Feb 2020 14:53:06 -0800 Subject: [PATCH 05/16] reformat --- .ipynb_checkpoints/README-checkpoint.md | 72 +++++++++++++++++++++++++ trt_pose/plugins/plugins.cpp | 40 ++++++-------- 2 files changed, 89 insertions(+), 23 deletions(-) create mode 100644 .ipynb_checkpoints/README-checkpoint.md diff --git a/.ipynb_checkpoints/README-checkpoint.md b/.ipynb_checkpoints/README-checkpoint.md new file mode 100644 index 0000000..2b172e2 --- /dev/null +++ b/.ipynb_checkpoints/README-checkpoint.md @@ -0,0 +1,72 @@ +# TensorRT Pose Estimation + +This project features multi-instance pose estimation accelerated by NVIDIA TensorRT. It is ideal for applications where low latency is necessary. It includes + +- Training scripts to train on any keypoint task data in MSCOCO format + +- A collection of models that may be easily optimized with TensorRT using [torch2trt](https://github.com/NVIDIA-AI-IOT/torch2trt) + +This project can be used easily for the task of human pose estimation, or extended for something new. + +If you run into any issues please [let us know](../../issues). + +## Tasks + +### Human pose estimation + + + +This task involves detecting human body pose using models trained on the MSCOCO dataset. + +#### Models + +Below are models pre-trained on the MSCOCO dataset. The throughput in FPS is shown for each platform + +| Model | Jetson Nano | Jetson Xavier | Weights | +|-------|-------------|---------------|---------| +| resnet18_baseline_att_224x224_A | 22 | 251 | [download (81MB)](https://drive.google.com/open?id=1XYDdCUdiF2xxx4rznmLb62SdOUZuoNbd) | +| densenet121_baseline_att_256x256_B | 12 | 101 | [download (84MB)](https://drive.google.com/open?id=13FkJkx7evQ1WwP54UmdiDXWyFMY1OxDU) | + + +#### Live demo + +To run the live Jupyter Notebook demo on real-time camera input, follow these steps + +1. Place the downloaded weights in the [tasks/human_pose](tasks/human_pose) directory + +2. 
Open and follow the [live_demo.ipynb](tasks/human_pose/live_demo.ipynb) notebook + + > You may need to modify the notebook, depending on which model you use + +## Setup + +To install trt_pose, call this command + +> We assume you have already installed PyTorch, torchvision, and TensorRT + +```bash +sudo pip3 install tqdm cython pycocotools +sudo apt-get install python3-matplotlib +git clone https://github.com/NVIDIA-AI-IOT/trt_pose +cd trt_pose +sudo python3 setup.py install +``` + +## See also + +- [torch2trt](http://github.com/NVIDIA-AI-IOT/torch2trt) - An easy to use PyTorch to TensorRT converter + +- [JetBot](http://github.com/NVIDIA-AI-IOT/jetbot) - An educational AI robot based on NVIDIA Jetson Nano +- [JetRacer](http://github.com/NVIDIA-AI-IOT/jetracer) - An educational AI racecar using NVIDIA Jetson Nano +- [JetCam](http://github.com/NVIDIA-AI-IOT/jetcam) - An easy to use Python camera interface for NVIDIA Jetson + +## References + +Cao, Zhe, et al. "Realtime multi-person 2d pose estimation using part affinity fields." Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2017. + +Xiao, Bin, Haiping Wu, and Yichen Wei. "Simple baselines for human pose estimation and tracking." Proceedings of the European Conference on Computer Vision (ECCV). 2018. diff --git a/trt_pose/plugins/plugins.cpp b/trt_pose/plugins/plugins.cpp index 5985580..ae3de67 100644 --- a/trt_pose/plugins/plugins.cpp +++ b/trt_pose/plugins/plugins.cpp @@ -53,8 +53,9 @@ std::vector find_peaks_torch(torch::Tensor input, return {counts, peaks}; } -void refine_peaks_out_torch(torch::Tensor refined_peaks, torch::Tensor counts, torch::Tensor peaks, torch::Tensor cmap, int window_size) -{ +void refine_peaks_out_torch(torch::Tensor refined_peaks, torch::Tensor counts, + torch::Tensor peaks, torch::Tensor cmap, + int window_size) { const int N = cmap.size(0); const int C = cmap.size(1); const int H = cmap.size(2); @@ -62,30 +63,23 @@ void refine_peaks_out_torch(torch::Tensor refined_peaks, torch::Tensor counts, t const int M = peaks.size(2); refine_peaks_out_nchw( - (float *) refined_peaks.data_ptr(), - (const int*) counts.data_ptr(), - (const int*) peaks.data_ptr(), - (const float*) cmap.data_ptr(), - N, - C, - H, - W, - M, - window_size - ); + (float *)refined_peaks.data_ptr(), (const int *)counts.data_ptr(), + (const int *)peaks.data_ptr(), (const float *)cmap.data_ptr(), N, C, H, W, + M, window_size); } -torch::Tensor refine_peaks_torch(torch::Tensor counts, torch::Tensor peaks, torch::Tensor cmap, int window_size) -{ +torch::Tensor refine_peaks_torch(torch::Tensor counts, torch::Tensor peaks, + torch::Tensor cmap, int window_size) { auto options = torch::TensorOptions() - .dtype(torch::kFloat32) - .layout(torch::kStrided) - .device(torch::kCPU) - .requires_grad(false); - - auto refined_peaks = torch::zeros({peaks.size(0), peaks.size(1), peaks.size(2), peaks.size(3)}, options); - refine_peaks_out_torch(refined_peaks, counts, peaks, cmap, window_size); - return refined_peaks; + .dtype(torch::kFloat32) + .layout(torch::kStrided) + .device(torch::kCPU) + .requires_grad(false); + + auto refined_peaks = torch::zeros( + {peaks.size(0), peaks.size(1), peaks.size(2), peaks.size(3)}, options); + refine_peaks_out_torch(refined_peaks, counts, peaks, cmap, window_size); + return refined_peaks; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { From 793708a2d3eec3c767a6967f5cd7b6bfcd74f7e0 Mon Sep 17 00:00:00 2001 From: John Welsh Date: Sat, 29 Feb 2020 16:06:16 -0800 Subject: [PATCH 06/16] paf_score_graph --- 
trt_pose/plugins/paf_score_graph.cpp | 182 ++++++++++----------------- trt_pose/plugins/paf_score_graph.hpp | 14 ++- trt_pose/plugins/test_all.cpp | 53 ++++++++ 3 files changed, 130 insertions(+), 119 deletions(-) diff --git a/trt_pose/plugins/paf_score_graph.cpp b/trt_pose/plugins/paf_score_graph.cpp index ed9dde5..fbccf54 100644 --- a/trt_pose/plugins/paf_score_graph.cpp +++ b/trt_pose/plugins/paf_score_graph.cpp @@ -1,120 +1,74 @@ #include "paf_score_graph.hpp" #include -#define EPS 1e-6 +#define EPS 1e-5 -void paf_score_graph_out(torch::Tensor score_graph, torch::Tensor paf, torch::Tensor topology, torch::Tensor counts, torch::Tensor peaks, int num_integral_samples) -{ - int N = paf.size(0); - int K = topology.size(0); - int M = peaks.size(2); - int H = paf.size(2); - int W = paf.size(3); - - auto score_graph_a = score_graph.accessor(); - auto paf_a = paf.accessor(); - auto topology_a = topology.accessor(); - auto counts_acc = counts.accessor(); - auto peaks_acc = peaks.accessor(); - - for (int n = 0; n < N; n++) - { - for (int k = 0; k < K; k++) - { - auto score_graph_nk = score_graph_a[n][k]; - auto paf_i_idx = topology_a[k][0]; - auto paf_j_idx = topology_a[k][1]; - auto cmap_a_idx = topology_a[k][2]; - auto cmap_b_idx = topology_a[k][3]; - auto paf_i = paf_a[n][paf_i_idx]; - auto paf_j = paf_a[n][paf_j_idx]; - - auto counts_a = counts_acc[n][cmap_a_idx]; - auto counts_b = counts_acc[n][cmap_b_idx]; - auto peaks_a = peaks_acc[n][cmap_a_idx]; - auto peaks_b = peaks_acc[n][cmap_b_idx]; - - for (int a = 0; a < counts_a; a++) - { - // compute point A - float pa_i = peaks_a[a][0] * H; - float pa_j = peaks_a[a][1] * W; - - for (int b = 0; b < counts_b; b++) - { - // compute point B - float pb_i = peaks_b[b][0] * H; - float pb_j = peaks_b[b][1] * W; - - // compute vector A->B - float pab_i = pb_i - pa_i; - float pab_j = pb_j - pa_j; - - // compute normalized vector A->B - float pab_norm = sqrtf(pab_i * pab_i + pab_j * pab_j) + EPS; - float uab_i = pab_i / pab_norm; - float uab_j = pab_j / pab_norm; - - float integral = 0.0; - float progress = 0.0; - float increment = 1.0f / num_integral_samples; - - for (int t = 0; t < num_integral_samples; t++) - { - // compute integral point T - float progress = (float) t / (float) num_integral_samples; - float pt_i = pa_i + progress * pab_i; //(1.0 - progress) * pa_i + progress * pb_i; - float pt_j = pa_j + progress * pab_j;//(1.0 - progress) * pa_j + progress * pb_j; - - // convert to int - int pt_i_int = (int) pt_i; - int pt_j_int = (int) pt_j; - - // skip point if out of bounds (will weaken integral) - if (pt_i_int < 0) continue; - if (pt_i_int > H) continue; - if (pt_j_int < 0) continue; - if (pt_j_int > W) continue; - - // get vector at integral point from PAF - float pt_paf_i = paf_i[pt_i_int][pt_j_int]; - float pt_paf_j = paf_j[pt_i_int][pt_j_int]; - - // compute dot product of normalized A->B with PAF vector at integral point - float dot = pt_paf_i * uab_i + pt_paf_j * uab_j; - integral += dot; - - progress += increment; - } - - // normalize integral by number of samples - integral /= num_integral_samples; - score_graph_nk[a][b] = integral; - } - } - } - } -} +void paf_score_graph_out_hw(float *score_graph, // MxM + const float *paf_i, // HxW + const float *paf_j, // HxW + const int counts_a, const int counts_b, + const float *peaks_a, // Mx2 + const float *peaks_b, // Mx2 + const int H, const int W, const int M, + const int num_integral_samples) { + for (int a = 0; a < counts_a; a++) { + // compute point A + float pa_i = peaks_a[a * 2] * H; + 
float pa_j = peaks_a[a * 2 + 1] * W; + + for (int b = 0; b < counts_b; b++) { + // compute point B + float pb_i = peaks_b[b * 2] * H; + float pb_j = peaks_b[b * 2 + 1] * W; + + // compute vector A->B + float pab_i = pb_i - pa_i; + float pab_j = pb_j - pa_j; + + // compute normalized vector A->B + float pab_norm = sqrtf(pab_i * pab_i + pab_j * pab_j) + EPS; + float uab_i = pab_i / pab_norm; + float uab_j = pab_j / pab_norm; + + float integral = 0.; + float progress = 0.; + float increment = 1.f / num_integral_samples; -// paf = Nx(2*K)xHxW -// topology = Kx4 --> (paf_i_idx, paf_j_idx, cmap_a_idx, cmap_b_idx) -// counts = NxC -// peaks = NxCxMx2 -// score_graph = NxKxMxM + for (int t = 0; t < num_integral_samples; t++) { + // compute integral point T + float progress = (float)t / (float)num_integral_samples; + float pt_i = pa_i + progress * pab_i; + float pt_j = pa_j + progress * pab_j; -torch::Tensor paf_score_graph(torch::Tensor paf, torch::Tensor topology, torch::Tensor counts, torch::Tensor peaks, int num_integral_samples) -{ - auto options = torch::TensorOptions() - .dtype(torch::kFloat32) - .layout(torch::kStrided) - .device(torch::kCPU) - .requires_grad(false); - - int N = peaks.size(0); - int K = topology.size(0); - int M = peaks.size(2); - - auto score_graph = torch::zeros({N, K, M, M}, options); - paf_score_graph_out(score_graph, paf, topology, counts, peaks, num_integral_samples); - return score_graph; -} \ No newline at end of file + // convert to int + // note: we do not need to subtract 0.5 when indexing, because + // round(x - 0.5) = int(x) + int pt_i_int = (int)pt_i; + int pt_j_int = (int)pt_j; + + // skip point if out of bounds (will weaken integral) + if (pt_i_int < 0) + continue; + if (pt_i_int >= H) + continue; + if (pt_j_int < 0) + continue; + if (pt_j_int >= W) + continue; + + // get vector at integral point from PAF + float pt_paf_i = paf_i[pt_i_int * W + pt_j_int]; + float pt_paf_j = paf_j[pt_i_int * W + pt_j_int]; + + // compute dot product of normalized A->B with PAF vector at integral + // point + float dot = pt_paf_i * uab_i + pt_paf_j * uab_j; + integral += dot; + progress += increment; + } + + integral /= num_integral_samples; + score_graph[a * M + b] = integral; + } + } +} diff --git a/trt_pose/plugins/paf_score_graph.hpp b/trt_pose/plugins/paf_score_graph.hpp index 8a3db79..c4f1f5d 100644 --- a/trt_pose/plugins/paf_score_graph.hpp +++ b/trt_pose/plugins/paf_score_graph.hpp @@ -1,6 +1,10 @@ -#include -#include +#pragma once - -void paf_score_graph_out(torch::Tensor score_graph, torch::Tensor paf, torch::Tensor topology, torch::Tensor counts, torch::Tensor peaks, int num_integral_samples); -torch::Tensor paf_score_graph(torch::Tensor paf, torch::Tensor topology, torch::Tensor counts, torch::Tensor peaks, int num_integral_samples); \ No newline at end of file +void paf_score_graph_out_hw(float *score_graph, // MxM + const float *paf_i, // HxW + const float *paf_j, // HxW + const int counts_a, const int counts_b, + const float *peaks_a, // Mx2 + const float *peaks_b, // Mx2 + const int H, const int W, const int M, + const int num_integral_samples); diff --git a/trt_pose/plugins/test_all.cpp b/trt_pose/plugins/test_all.cpp index 190dc5c..a68194e 100644 --- a/trt_pose/plugins/test_all.cpp +++ b/trt_pose/plugins/test_all.cpp @@ -1,4 +1,5 @@ #include +#include "paf_score_graph.hpp" #include "find_peaks.hpp" #include "refine_peaks.hpp" @@ -67,12 +68,64 @@ void test_refined_peaks_out_hw() if (ABS(refined_peaks[1] - j_true) > tolerance) { throw std::runtime_error("j 
coordinate incorrect"); } +} +void test_paf_score_graph_hw() +{ + const int M = 2; + const int H = 4; + const int W = 4; + const int counts_a = 2; + const int counts_b = 2; + const int num_integral_samples = 3; + + float score_graph[M * M]; + + // test points + // + // _ _ _ b + // _ _ _ | + // a - b | + // _ _ _ a + + const float paf_i[H * W] = { + 0., 0., 0., -1., + 0., 0., 0., -1., + 0., 0., 0., -1., + 0., 0., 0., -1. + }; + const float paf_j[H * W] = { + 0., 0., 0., 0., + 0., 0., 0., 0., + 1., 1., 1., 0., + 0., 0., 0., 0. + }; + const float peaks_a[M * 2] = { + 0.625, 0.125, // mid-left + 0.875, 0.875 // bot-right + }; + const float peaks_b[M * 2] = { + 0.625, 0.625, // mid-mid + 0.125, 0.875 // top-right + }; + + paf_score_graph_out_hw( + score_graph, + paf_i, + paf_j, + counts_a, + counts_b, + peaks_a, + peaks_b, + H, W, M, + num_integral_samples + ); } int main() { test_find_peaks_out_hw(); test_refined_peaks_out_hw(); + test_paf_score_graph_hw(); return 0; } From 67d230f9ee5e9bb7ce3fc30235d8d67877bb0524 Mon Sep 17 00:00:00 2001 From: John Welsh Date: Sat, 29 Feb 2020 16:29:15 -0800 Subject: [PATCH 07/16] paf batch/k methods --- trt_pose/plugins/paf_score_graph.cpp | 57 ++++++++++++++++++++++++---- trt_pose/plugins/paf_score_graph.hpp | 31 +++++++++++---- 2 files changed, 74 insertions(+), 14 deletions(-) diff --git a/trt_pose/plugins/paf_score_graph.cpp b/trt_pose/plugins/paf_score_graph.cpp index fbccf54..5e1ead4 100644 --- a/trt_pose/plugins/paf_score_graph.cpp +++ b/trt_pose/plugins/paf_score_graph.cpp @@ -4,13 +4,13 @@ #define EPS 1e-5 void paf_score_graph_out_hw(float *score_graph, // MxM - const float *paf_i, // HxW - const float *paf_j, // HxW - const int counts_a, const int counts_b, - const float *peaks_a, // Mx2 - const float *peaks_b, // Mx2 - const int H, const int W, const int M, - const int num_integral_samples) { + const float *paf_i, // HxW + const float *paf_j, // HxW + const int counts_a, const int counts_b, + const float *peaks_a, // Mx2 + const float *peaks_b, // Mx2 + const int H, const int W, const int M, + const int num_integral_samples) { for (int a = 0; a < counts_a; a++) { // compute point A float pa_i = peaks_a[a * 2] * H; @@ -72,3 +72,46 @@ void paf_score_graph_out_hw(float *score_graph, // MxM } } } + +void paf_score_graph_out_khw(float *score_graph, // KxMxM + const int *topology, // Kx4 + const float *paf, // 2KxHxW + const int *counts, // C + const float *peaks, // CxMx2 + const int K, const int C, const int H, const int W, + const int M, const int num_integral_samples) { + for (int k = 0; k < K; k++) { + float *score_graph_k = &score_graph[k * M * M]; + const int *tk = &topology[k * 4]; + const int paf_i_idx = tk[0]; + const int paf_j_idx = tk[1]; + const int cmap_a_idx = tk[2]; + const int cmap_b_idx = tk[3]; + const float *paf_i = &paf[paf_i_idx * H * W]; + const float *paf_j = &paf[paf_j_idx * H * W]; + + const int counts_a = counts[cmap_a_idx]; + const int counts_b = counts[cmap_b_idx]; + const float *peaks_a = &peaks[cmap_a_idx * M * 2]; + const float *peaks_b = &peaks[cmap_b_idx * M * 2]; + + paf_score_graph_out_hw(score_graph_k, paf_i, paf_j, counts_a, counts_b, + peaks_a, peaks_b, H, W, M, num_integral_samples); + } +} + +void paf_score_graph_out_nkhw(float *score_graph, // NxKxMxM + const int *topology, // Kx4 + const float *paf, // Nx2KxHxW + const int *counts, // NxC + const float *peaks, // NxCxMx2 + const int N, const int K, const int C, + const int H, const int W, const int M, + const int num_integral_samples) { + for (int n = 0; n < 
N; n++) { + paf_score_graph_out_khw(&score_graph[n * K * M * M], topology, + &paf[n * 2 * K * H * W], &counts[n * C], + &peaks[n * C * M * 2], K, C, H, W, M, + num_integral_samples); + } +} diff --git a/trt_pose/plugins/paf_score_graph.hpp b/trt_pose/plugins/paf_score_graph.hpp index c4f1f5d..dda9e61 100644 --- a/trt_pose/plugins/paf_score_graph.hpp +++ b/trt_pose/plugins/paf_score_graph.hpp @@ -1,10 +1,27 @@ #pragma once void paf_score_graph_out_hw(float *score_graph, // MxM - const float *paf_i, // HxW - const float *paf_j, // HxW - const int counts_a, const int counts_b, - const float *peaks_a, // Mx2 - const float *peaks_b, // Mx2 - const int H, const int W, const int M, - const int num_integral_samples); + const float *paf_i, // HxW + const float *paf_j, // HxW + const int counts_a, const int counts_b, + const float *peaks_a, // Mx2 + const float *peaks_b, // Mx2 + const int H, const int W, const int M, + const int num_integral_samples); + +void paf_score_graph_out_khw(float *score_graph, // KxMxM + const int *topology, // Kx4 + const float *paf, // 2KxHxW + const int *counts, // C + const float *peaks, // CxMx2 + const int K, const int C, const int H, const int W, + const int M, const int num_integral_samples); + +void paf_score_graph_out_nkhw(float *score_graph, // NxKxMxM + const int *topology, // Kx4 + const float *paf, // Nx2KxHxW + const int *counts, // NxC + const float *peaks, // NxCxMx2 + const int N, const int K, const int C, + const int H, const int W, const int M, + const int num_integral_samples); From 50b11c75671a678f4aabae5c21d783c1e703b20d Mon Sep 17 00:00:00 2001 From: John Welsh Date: Sat, 29 Feb 2020 18:22:55 -0800 Subject: [PATCH 08/16] added munkres cpp --- trt_pose/plugins/munkres.cpp | 359 +++++++++++++++------------------- trt_pose/plugins/munkres.hpp | 30 ++- trt_pose/plugins/test_all.cpp | 36 ++++ 3 files changed, 210 insertions(+), 215 deletions(-) diff --git a/trt_pose/plugins/munkres.cpp b/trt_pose/plugins/munkres.cpp index 26e1582..65fa36b 100644 --- a/trt_pose/plugins/munkres.cpp +++ b/trt_pose/plugins/munkres.cpp @@ -1,56 +1,51 @@ -#include "utils/PairGraph.hpp" -#include "utils/CoverTable.hpp" #include "munkres.hpp" +#include "utils/CoverTable.hpp" +#include "utils/PairGraph.hpp" - -void subMinRow(torch::TensorAccessor cost_graph, int nrows, int ncols) -{ - for (int i = 0; i < nrows; i++) - { +void subMinRow(float *cost_graph, const int M, const int nrows, + const int ncols) { + for (int i = 0; i < nrows; i++) { // find min - float min = cost_graph[i][0]; + float min = cost_graph[i * M]; for (int j = 0; j < ncols; j++) { - float val = cost_graph[i][j]; - if (val < min) { - min = val; - } + float val = cost_graph[i * M + j]; + if (val < min) { + min = val; + } } - + // subtract min for (int j = 0; j < ncols; j++) { - cost_graph[i][j] -= min; + cost_graph[i * M + j] -= min; } } } -void subMinCol(torch::TensorAccessor cost_graph, int nrows, int ncols) -{ - for (int j = 0; j < ncols; j++) - { +void subMinCol(float *cost_graph, const int M, const int nrows, + const int ncols) { + for (int j = 0; j < ncols; j++) { // find min - float min = cost_graph[0][j]; + float min = cost_graph[j]; for (int i = 0; i < nrows; i++) { - float val = cost_graph[i][j]; - if (val < min) { - min = val; - } + float val = cost_graph[i * M + j]; + if (val < min) { + min = val; + } } - + // subtract min for (int i = 0; i < nrows; i++) { - cost_graph[i][j] -= min; + cost_graph[i * M + j] -= min; } } } -void munkresStep1(torch::TensorAccessor cost_graph, PairGraph &star_graph, int 
nrows, int ncols) -{ - for (int i = 0; i < nrows; i++) - { - for (int j = 0; j < ncols; j++) - { - if (!star_graph.isRowSet(i) && !star_graph.isColSet(j) && (cost_graph[i][j] == 0)) - { +void munkresStep1(const float *cost_graph, const int M, PairGraph &star_graph, + const int nrows, const int ncols) { + for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) { + if (!star_graph.isRowSet(i) && !star_graph.isColSet(j) && + (cost_graph[i * M + j] == 0)) { star_graph.set(i, j); } } @@ -58,14 +53,12 @@ void munkresStep1(torch::TensorAccessor cost_graph, PairGraph &star_gr } // returns 1 if we should exit -bool munkresStep2(const PairGraph &star_graph, CoverTable &cover_table) -{ - int k = star_graph.nrows < star_graph.ncols ? star_graph.nrows : star_graph.ncols; +bool munkresStep2(const PairGraph &star_graph, CoverTable &cover_table) { + int k = + star_graph.nrows < star_graph.ncols ? star_graph.nrows : star_graph.ncols; int count = 0; - for (int j = 0; j < star_graph.ncols; j++) - { - if (star_graph.isColSet(j)) - { + for (int j = 0; j < star_graph.ncols; j++) { + if (star_graph.isColSet(j)) { cover_table.coverCol(j); count++; } @@ -73,22 +66,18 @@ bool munkresStep2(const PairGraph &star_graph, CoverTable &cover_table) return count >= k; } -bool munkresStep3(torch::TensorAccessor cost_graph, const PairGraph &star_graph, PairGraph &prime_graph, CoverTable &cover_table, std::pair &p, int nrows, int ncols) -{ - for (int i = 0; i < nrows; i++) - { - for (int j = 0; j < ncols; j++) - { - if (cost_graph[i][j] == 0 && !cover_table.isCovered(i, j)) - { +bool munkresStep3(const float *cost_graph, const int M, + const PairGraph &star_graph, PairGraph &prime_graph, + CoverTable &cover_table, std::pair &p, + const int nrows, const int ncols) { + for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) { + if (cost_graph[i * M + j] == 0 && !cover_table.isCovered(i, j)) { prime_graph.set(i, j); - if (star_graph.isRowSet(i)) - { + if (star_graph.isRowSet(i)) { cover_table.coverRow(i); cover_table.uncoverCol(star_graph.colForRow(i)); - } - else - { + } else { p.first = i; p.second = j; return 1; @@ -97,212 +86,170 @@ bool munkresStep3(torch::TensorAccessor cost_graph, const PairGraph &s } } return 0; -}; +}; -void munkresStep4(PairGraph &star_graph, PairGraph &prime_graph, CoverTable &cover_table, std::pair p) -{ +void munkresStep4(PairGraph &star_graph, PairGraph &prime_graph, + CoverTable &cover_table, std::pair p) { // repeat until no star found in prime's column - while (star_graph.isColSet(p.second)) - { - // find and reset star in prime's column - std::pair s = { star_graph.rowForCol(p.second), p.second }; + while (star_graph.isColSet(p.second)) { + // find and reset star in prime's column + std::pair s = {star_graph.rowForCol(p.second), p.second}; star_graph.reset(s.first, s.second); // set this prime to a star star_graph.set(p.first, p.second); // repeat for prime in cleared star's row - p = { s.first, prime_graph.colForRow(s.first) }; + p = {s.first, prime_graph.colForRow(s.first)}; } star_graph.set(p.first, p.second); cover_table.clear(); prime_graph.clear(); } -void munkresStep5(torch::TensorAccessor cost_graph, const CoverTable &cover_table, int nrows, int ncols) -{ +void munkresStep5(float *cost_graph, const int M, const CoverTable &cover_table, + const int nrows, const int ncols) { bool valid = false; float min; - for (int i = 0; i < nrows; i++) - { - for (int j = 0; j < ncols; j++) - { - if (!cover_table.isCovered(i, j)) - { - if (!valid) - { - min = cost_graph[i][j]; + 
for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) { + if (!cover_table.isCovered(i, j)) { + if (!valid) { + min = cost_graph[i * M + j]; valid = true; - } - else if (cost_graph[i][j] < min) - { - min = cost_graph[i][j]; + } else if (cost_graph[i * M + j] < min) { + min = cost_graph[i * M + j]; } } } } - for (int i = 0; i < nrows; i++) - { - if (cover_table.isRowCovered(i)) - { + for (int i = 0; i < nrows; i++) { + if (cover_table.isRowCovered(i)) { for (int j = 0; j < ncols; j++) { - cost_graph[i][j] += min; + cost_graph[i * M + j] += min; } -// cost_graph.addToRow(i, min); + // cost_graph.addToRow(i, min); } } - for (int j = 0; j < ncols; j++) - { - if (!cover_table.isColCovered(j)) - { + for (int j = 0; j < ncols; j++) { + if (!cover_table.isColCovered(j)) { for (int i = 0; i < nrows; i++) { - cost_graph[i][j] -= min; + cost_graph[i * M + j] -= min; } -// cost_graph.addToCol(j, -min); + // cost_graph.addToCol(j, -min); } } } - -void _munkres(torch::TensorAccessor cost_graph, PairGraph &star_graph, int nrows, int ncols) -{ - PairGraph prime_graph(nrows, ncols); +void _munkres(float *cost_graph, const int M, PairGraph &star_graph, + const int nrows, const int ncols) { + PairGraph prime_graph(nrows, ncols); CoverTable cover_table(nrows, ncols); prime_graph.clear(); cover_table.clear(); star_graph.clear(); - + int step = 0; - if (ncols >= nrows) - { - subMinRow(cost_graph, nrows, ncols); + if (ncols >= nrows) { + subMinRow(cost_graph, M, nrows, ncols); } - if (ncols > nrows) - { + if (ncols > nrows) { step = 1; } std::pair p; bool done = false; - while (!done) - { - switch(step) - { - case 0: - subMinCol(cost_graph, nrows, ncols); - case 1: - munkresStep1(cost_graph, star_graph, nrows, ncols); - case 2: - if(munkresStep2(star_graph, cover_table)) - { - done = true; - break; - } - case 3: - if (!munkresStep3(cost_graph, star_graph, prime_graph, cover_table, p, nrows, ncols)) - { - step = 5; - break; - } - case 4: - munkresStep4(star_graph, prime_graph, cover_table, p); - step = 2; + while (!done) { + switch (step) { + case 0: + subMinCol(cost_graph, M, nrows, ncols); + case 1: + munkresStep1(cost_graph, M, star_graph, nrows, ncols); + case 2: + if (munkresStep2(star_graph, cover_table)) { + done = true; break; - case 5: - munkresStep5(cost_graph, cover_table, nrows, ncols); - step = 3; + } + case 3: + if (!munkresStep3(cost_graph, M, star_graph, prime_graph, cover_table, p, + nrows, ncols)) { + step = 5; break; + } + case 4: + munkresStep4(star_graph, prime_graph, cover_table, p); + step = 2; + break; + case 5: + munkresStep5(cost_graph, M, cover_table, nrows, ncols); + step = 3; + break; } } } - -void munkres_out(torch::Tensor cost_graph_out, torch::Tensor cost_graph, torch::Tensor topology, torch::Tensor counts) -{ - int N = counts.size(0); - int K = topology.size(0); - - cost_graph_out.copy_(cost_graph); - auto topology_a = topology.accessor(); - auto counts_a = counts.accessor(); - auto cost_graph_out_a = cost_graph_out.accessor(); - - for (int n = 0; n < N; n++) - { - for (int k = 0; k < K; k++) - { - int cmap_a_idx = topology_a[k][2]; - int cmap_b_idx = topology_a[k][3]; - int nrows = counts_a[n][cmap_a_idx]; - int ncols = counts_a[n][cmap_b_idx]; - auto star_graph = PairGraph(nrows, ncols); - _munkres(cost_graph_out_a[n][k], star_graph, nrows, ncols); - } - } +std::size_t assignment_out_workspace(const int M) { + return sizeof(float) * M * M; } -torch::Tensor munkres(torch::Tensor cost_graph, torch::Tensor topology, torch::Tensor counts) -{ - auto cost_graph_out = 
torch::empty_like(cost_graph); - munkres_out(cost_graph_out, cost_graph, topology, counts); - return cost_graph_out; -} +void assignment_out(int *connections, // 2xM + const float *score_graph, // MxM + const int count_a, const int count_b, const int M, + const float score_threshold, void *workspace) { + const int nrows = count_a; + const int ncols = count_b; + + // compute cost graph (negate score graph) + float *cost_graph = (float *)workspace; + for (int i = 0; i < count_a; i++) { + for (int j = 0; j < count_b; j++) { + const int idx = i * M + j; + cost_graph[idx] = -score_graph[idx]; + } + } + // run munkres algorithm + auto star_graph = PairGraph(nrows, ncols); + _munkres(cost_graph, M, star_graph, nrows, ncols); -// assignment NxKx2xM -void assignment_out(torch::Tensor connections, torch::Tensor score_graph, torch::Tensor topology, torch::Tensor counts, float score_threshold) -{ - int N = counts.size(0); - int K = topology.size(0); - - auto cost_graph = -score_graph; - auto score_graph_a = score_graph.accessor(); - auto connections_a = connections.accessor(); - auto topology_a = topology.accessor(); - auto counts_a = counts.accessor(); - auto cost_graph_out_a = cost_graph.accessor(); - - for (int n = 0; n < N; n++) - { - for (int k = 0; k < K; k++) - { - int cmap_a_idx = topology_a[k][2]; - int cmap_b_idx = topology_a[k][3]; - int nrows = counts_a[n][cmap_a_idx]; - int ncols = counts_a[n][cmap_b_idx]; - auto star_graph = PairGraph(nrows, ncols); - auto cost_graph_out_a_nk = cost_graph_out_a[n][k]; - _munkres(cost_graph_out_a_nk, star_graph, nrows, ncols); - - auto connections_a_nk = connections_a[n][k]; - auto score_graph_a_nk = score_graph_a[n][k]; - - for (int i = 0; i < nrows; i++) { - for (int j = 0; j < ncols; j++) { - if (star_graph.isPair(i, j) && score_graph_a_nk[i][j] > score_threshold) { - connections_a_nk[0][i] = j; - connections_a_nk[1][j] = i; - } - } - } - } + // fill output connections + for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) { + if (star_graph.isPair(i, j) && score_graph[i * M + j] > score_threshold) { + connections[0 * M + i] = j; + connections[1 * M + j] = i; + } } + } } -torch::Tensor assignment(torch::Tensor score_graph, torch::Tensor topology, torch::Tensor counts, float score_threshold) -{ - auto options = torch::TensorOptions() - .dtype(torch::kInt32) - .layout(torch::kStrided) - .device(torch::kCPU) - .requires_grad(false); - - int N = counts.size(0); - int K = topology.size(0); - int M = score_graph.size(2); - - auto connections = torch::full({N, K, 2, M}, -1, options); - assignment_out(connections, score_graph, topology, counts, score_threshold); - return connections; -} \ No newline at end of file +void assignement_out_k(int *connections, // Kx2xM + const int *topology, // Kx4 + const float *score_graph, // KxMxM + const int *counts, // C + const int K, const int M, const float score_threshold, + void *workspace) { + for (int k = 0; k < K; k++) { + const int *tk = &topology[k * 4]; + const int cmap_idx_a = tk[2]; + const int cmap_idx_b = tk[3]; + const int count_a = counts[cmap_idx_a]; + const int count_b = counts[cmap_idx_b]; + assignment_out(&connections[k * 2 * M], &score_graph[k * M * M], count_a, + count_b, M, score_threshold, workspace); + } +} + +void assignement_out_nk(int *connections, // NxKx2xM + const int *topology, // Kx4 + const float *score_graph, // NxKxMxM + const int *counts, // NxC + const int N, const int C, const int K, const int M, + const float score_threshold, void *workspace) { + for (int n = 0; n < N; 
n++) { + assignement_out_k(&connections[n * K * 2 * M], topology, + &score_graph[n * K * M * M], &counts[n * C], K, M, + score_threshold, workspace); + } +} diff --git a/trt_pose/plugins/munkres.hpp b/trt_pose/plugins/munkres.hpp index be3e74c..6fafc7f 100644 --- a/trt_pose/plugins/munkres.hpp +++ b/trt_pose/plugins/munkres.hpp @@ -1,12 +1,24 @@ -#include -#include -#include "utils/PairGraph.hpp" +#pragma once +#include -void _munkres(torch::TensorAccessor cost_graph, PairGraph &star_graph, int nrows, int ncols); -void munkres_out(torch::Tensor cost_graph_out, torch::Tensor cost_graph, torch::Tensor topology, torch::Tensor counts); -torch::Tensor munkres(torch::Tensor cost_graph, torch::Tensor topology, torch::Tensor counts); +std::size_t assignment_out_workspace(const int M); -// assignment NxKx2xM -void assignment_out(torch::Tensor connections, torch::Tensor score_graph, torch::Tensor topology, torch::Tensor counts, float score_threshold); -torch::Tensor assignment(torch::Tensor score_graph, torch::Tensor topology, torch::Tensor counts, float score_threshold); \ No newline at end of file +void assignment_out(int *connections, // 2xM + const float *score_graph, // MxM + const int count_a, const int count_b, const int M, + const float score_threshold, void *workspace); + +void assignement_out_k(int *connections, // Kx2xM + const int *topology, // Kx4 + const float *score_graph, // KxMxM + const int *counts, // C + const int K, const int M, const float score_threshold, + void *workspace); + +void assignement_out_nk(int *connections, // NxKx2xM + const int *topology, // Kx4 + const float *score_graph, // NxKxMxM + const int *counts, // NxC + const int N, const int C, const int K, const int M, + const float score_threshold, void *workspace); diff --git a/trt_pose/plugins/test_all.cpp b/trt_pose/plugins/test_all.cpp index a68194e..8e6b018 100644 --- a/trt_pose/plugins/test_all.cpp +++ b/trt_pose/plugins/test_all.cpp @@ -2,6 +2,7 @@ #include "paf_score_graph.hpp" #include "find_peaks.hpp" #include "refine_peaks.hpp" +#include "munkres.hpp" #define ABS(x) ((x) > 0 ? 
(x) : (-x))
 
@@ -122,10 +123,45 @@ void test_paf_score_graph_hw()
   );
 }
 
+void test_assignment_out()
+{
+    const int M = 4;
+    const int count_a = 3;
+    const int count_b = 3;
+    const float score_threshold = 0.3;
+
+    std::size_t workspace_size = assignment_out_workspace(M);
+    void *workspace = (void *) malloc(workspace_size);
+
+    int connections[2 * M];
+    const float score_graph[M * M] = {
+        1., 3., 0., 0.,
+        1., 2., 1., 0.,
+        4., 3., 4., 0.,
+        0., 0., 0., 0.,
+    };
+
+    assignment_out(connections, score_graph, count_a, count_b, M, score_threshold, workspace);
+
+    if (connections[0] != 1) {
+        throw std::runtime_error("connections[0] should be 1.");
+    }
+    if (connections[1] != 0) {
+        throw std::runtime_error("connections[1] should be 0.");
+    }
+    if (connections[2] != 2) {
+        throw std::runtime_error("connections[2] should be 2.");
+    }
+
+    free(workspace);
+}
+
+
 int main()
 {
     test_find_peaks_out_hw();
     test_refined_peaks_out_hw();
     test_paf_score_graph_hw();
+    test_assignment_out();
     return 0;
 }
From 908154c4bdf07274c22c2e75635347a8dc4363fa Mon Sep 17 00:00:00 2001
From: John Welsh
Date: Sat, 29 Feb 2020 18:26:38 -0800
Subject: [PATCH 09/16] todo

---
 trt_pose/plugins/README.md | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/trt_pose/plugins/README.md b/trt_pose/plugins/README.md
index b7386fa..2264416 100644
--- a/trt_pose/plugins/README.md
+++ b/trt_pose/plugins/README.md
@@ -1,3 +1,16 @@
+## TODO
+
+- [x] find_peaks plain cpp
+- [x] find_peaks torch binding
+- [x] refine_peaks plain cpp
+- [x] refine_peaks torch binding
+- [x] paf_score_graph plain cpp
+- [ ] paf_score_graph torch binding
+- [x] munkres plain cpp
+- [ ] munkres torch binding
+- [ ] connect parts plain cpp
+- [ ] connect parts torch binding
+
 ## Terminology
 
 * ``N`` - int - Batch size
@@ -71,4 +84,4 @@ cmap = generate_cmap(peak_counts, normalized_peaks, height=46, width=46, stdev=1
 
 ```python
 paf = generate_paf(connections, topology, peak_counts, normalized_peaks, height=46, width=46, stdev=1)
-```
\ No newline at end of file
+```
From 3de7916b306a4e4fc8e9fc859d698e3bd18a1f41 Mon Sep 17 00:00:00 2001
From: John Welsh
Date: Sun, 1 Mar 2020 12:08:54 -0800
Subject: [PATCH 10/16] added torch binding for paf_score_graph

---
 trt_pose/plugins/README.md   |  2 +-
 trt_pose/plugins/plugins.cpp | 46 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/trt_pose/plugins/README.md b/trt_pose/plugins/README.md
index 2264416..c5cfaf1 100644
--- a/trt_pose/plugins/README.md
+++ b/trt_pose/plugins/README.md
@@ -5,7 +5,7 @@
 - [x] refine_peaks plain cpp
 - [x] refine_peaks torch binding
 - [x] paf_score_graph plain cpp
-- [ ] paf_score_graph torch binding
+- [x] paf_score_graph torch binding
 - [x] munkres plain cpp
 - [ ] munkres torch binding
 - [ ] connect parts plain cpp
 - [ ] connect parts torch binding
diff --git a/trt_pose/plugins/plugins.cpp b/trt_pose/plugins/plugins.cpp
index ae3de67..40cc7b5 100644
--- a/trt_pose/plugins/plugins.cpp
+++ b/trt_pose/plugins/plugins.cpp
@@ -82,17 +82,53 @@ torch::Tensor refine_peaks_torch(torch::Tensor counts, torch::Tensor peaks,
   return refined_peaks;
 }
 
+void paf_score_graph_out_torch(torch::Tensor score_graph, torch::Tensor paf, torch::Tensor topology, torch::Tensor counts, torch::Tensor peaks, const int num_integral_samples)
+{
+  const int N = paf.size(0);
+  const int K = topology.size(0);
+  const int C = peaks.size(1);
+  const int H = paf.size(2);
+  const int W = paf.size(3);
+  const int M = score_graph.size(3);
+
+  paf_score_graph_out_nkhw(
+    (float*) 
score_graph.data_ptr(), + (const int*) topology.data_ptr(), + (const float*) paf.data_ptr(), + (const int*) counts.data_ptr(), + (const float *) peaks.data_ptr(), + N, K, C, H, W, M, + num_integral_samples + ); +} + +torch::Tensor paf_score_graph_torch(torch::Tensor paf, torch::Tensor topology, torch::Tensor counts, torch::Tensor peaks, const int num_integral_samples) +{ + auto options = torch::TensorOptions() + .dtype(torch::kFloat32) + .layout(torch::kStrided) + .device(torch::kCPU) + .requires_grad(false); + const int N = peaks.size(0); + const int K = topology.size(0); + const int M = peaks.size(2); + + torch::Tensor score_graph = torch::zeros({N, K, M, M}, options); + paf_score_graph_out_torch(score_graph, paf, topology, counts, peaks, num_integral_samples); + return score_graph; +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("find_peaks", &find_peaks_torch, "find_peaks"); m.def("find_peaks_out", &find_peaks_out_torch, "find_peaks_out"); - m.def("paf_score_graph", &paf_score_graph, "paf_score_graph"); - m.def("paf_score_graph_out", &paf_score_graph_out, "paf_score_graph_out"); + m.def("paf_score_graph", &paf_score_graph_torch, "paf_score_graph"); + m.def("paf_score_graph_out", &paf_score_graph_out_torch, "paf_score_graph_out"); m.def("refine_peaks", &refine_peaks_torch, "refine_peaks"); m.def("refine_peaks_out", &refine_peaks_out_torch, "refine_peaks_out"); - m.def("munkres", &munkres, "munkres"); + //m.def("munkres", &munkres, "munkres"); m.def("connect_parts", &connect_parts, "connect_parts"); - m.def("assignment", &assignment, "assignment"); - m.def("assignment_out", &assignment_out, "assignment_out"); + //m.def("assignment", &assignment, "assignment"); + //m.def("assignment_out", &assignment_out, "assignment_out"); m.def("generate_cmap", &generate_cmap, "generate_cmap"); m.def("generate_paf", &generate_paf, "generate_paf"); } From 364ff0bee62138f75c22ebd1ea1b11028f7c8eb3 Mon Sep 17 00:00:00 2001 From: John Welsh Date: Sun, 1 Mar 2020 12:27:41 -0800 Subject: [PATCH 11/16] munkres torch binding --- trt_pose/plugins/README.md | 2 +- trt_pose/plugins/munkres.cpp | 30 ++++++------ trt_pose/plugins/munkres.hpp | 8 ++-- trt_pose/plugins/plugins.cpp | 92 +++++++++++++++++++++++++----------- 4 files changed, 84 insertions(+), 48 deletions(-) diff --git a/trt_pose/plugins/README.md b/trt_pose/plugins/README.md index c5cfaf1..840cc68 100644 --- a/trt_pose/plugins/README.md +++ b/trt_pose/plugins/README.md @@ -7,7 +7,7 @@ - [x] paf_score_graph plain cpp - [x] paf_score_graph torch binding - [x] munkres plain cpp -- [ ] munkres torch binding +- [x] munkres torch binding - [ ] connect parts plain cpp - [ ] connect parts torch binding diff --git a/trt_pose/plugins/munkres.cpp b/trt_pose/plugins/munkres.cpp index 65fa36b..6923956 100644 --- a/trt_pose/plugins/munkres.cpp +++ b/trt_pose/plugins/munkres.cpp @@ -224,12 +224,12 @@ void assignment_out(int *connections, // 2xM } } -void assignement_out_k(int *connections, // Kx2xM - const int *topology, // Kx4 - const float *score_graph, // KxMxM - const int *counts, // C - const int K, const int M, const float score_threshold, - void *workspace) { +void assignment_out_k(int *connections, // Kx2xM + const float *score_graph, // KxMxM + const int *topology, // Kx4 + const int *counts, // C + const int K, const int M, const float score_threshold, + void *workspace) { for (int k = 0; k < K; k++) { const int *tk = &topology[k * 4]; const int cmap_idx_a = tk[2]; @@ -241,15 +241,15 @@ void assignement_out_k(int *connections, // Kx2xM } } -void 
assignement_out_nk(int *connections, // NxKx2xM - const int *topology, // Kx4 - const float *score_graph, // NxKxMxM - const int *counts, // NxC - const int N, const int C, const int K, const int M, - const float score_threshold, void *workspace) { +void assignment_out_nk(int *connections, // NxKx2xM + const float *score_graph, // NxKxMxM + const int *topology, // Kx4 + const int *counts, // NxC + const int N, const int C, const int K, const int M, + const float score_threshold, void *workspace) { for (int n = 0; n < N; n++) { - assignement_out_k(&connections[n * K * 2 * M], topology, - &score_graph[n * K * M * M], &counts[n * C], K, M, - score_threshold, workspace); + assignment_out_k(&connections[n * K * 2 * M], &score_graph[n * K * M * M], + topology, &counts[n * C], K, M, score_threshold, + workspace); } } diff --git a/trt_pose/plugins/munkres.hpp b/trt_pose/plugins/munkres.hpp index 6fafc7f..3cfbbb4 100644 --- a/trt_pose/plugins/munkres.hpp +++ b/trt_pose/plugins/munkres.hpp @@ -9,16 +9,16 @@ void assignment_out(int *connections, // 2xM const int count_a, const int count_b, const int M, const float score_threshold, void *workspace); -void assignement_out_k(int *connections, // Kx2xM - const int *topology, // Kx4 +void assignment_out_k(int *connections, // Kx2xM const float *score_graph, // KxMxM + const int *topology, // Kx4 const int *counts, // C const int K, const int M, const float score_threshold, void *workspace); -void assignement_out_nk(int *connections, // NxKx2xM - const int *topology, // Kx4 +void assignment_out_nk(int *connections, // NxKx2xM const float *score_graph, // NxKxMxM + const int *topology, // Kx4 const int *counts, // NxC const int N, const int C, const int K, const int M, const float score_threshold, void *workspace); diff --git a/trt_pose/plugins/plugins.cpp b/trt_pose/plugins/plugins.cpp index 40cc7b5..e9a19bf 100644 --- a/trt_pose/plugins/plugins.cpp +++ b/trt_pose/plugins/plugins.cpp @@ -82,8 +82,10 @@ torch::Tensor refine_peaks_torch(torch::Tensor counts, torch::Tensor peaks, return refined_peaks; } -void paf_score_graph_out_torch(torch::Tensor score_graph, torch::Tensor paf, torch::Tensor topology, torch::Tensor counts, torch::Tensor peaks, const int num_integral_samples) -{ +void paf_score_graph_out_torch(torch::Tensor score_graph, torch::Tensor paf, + torch::Tensor topology, torch::Tensor counts, + torch::Tensor peaks, + const int num_integral_samples) { const int N = paf.size(0); const int K = topology.size(0); const int C = peaks.size(1); @@ -92,43 +94,77 @@ void paf_score_graph_out_torch(torch::Tensor score_graph, torch::Tensor paf, tor const int M = score_graph.size(3); paf_score_graph_out_nkhw( - (float*) score_graph.data_ptr(), - (const int*) topology.data_ptr(), - (const float*) paf.data_ptr(), - (const int*) counts.data_ptr(), - (const float *) peaks.data_ptr(), - N, K, C, H, W, M, - num_integral_samples - ); + (float *)score_graph.data_ptr(), (const int *)topology.data_ptr(), + (const float *)paf.data_ptr(), (const int *)counts.data_ptr(), + (const float *)peaks.data_ptr(), N, K, C, H, W, M, num_integral_samples); } -torch::Tensor paf_score_graph_torch(torch::Tensor paf, torch::Tensor topology, torch::Tensor counts, torch::Tensor peaks, const int num_integral_samples) -{ - auto options = torch::TensorOptions() - .dtype(torch::kFloat32) - .layout(torch::kStrided) - .device(torch::kCPU) - .requires_grad(false); - const int N = peaks.size(0); - const int K = topology.size(0); - const int M = peaks.size(2); - - torch::Tensor score_graph = 
torch::zeros({N, K, M, M}, options); - paf_score_graph_out_torch(score_graph, paf, topology, counts, peaks, num_integral_samples); - return score_graph; +torch::Tensor paf_score_graph_torch(torch::Tensor paf, torch::Tensor topology, + torch::Tensor counts, torch::Tensor peaks, + const int num_integral_samples) { + auto options = torch::TensorOptions() + .dtype(torch::kFloat32) + .layout(torch::kStrided) + .device(torch::kCPU) + .requires_grad(false); + const int N = peaks.size(0); + const int K = topology.size(0); + const int M = peaks.size(2); + + torch::Tensor score_graph = torch::zeros({N, K, M, M}, options); + paf_score_graph_out_torch(score_graph, paf, topology, counts, peaks, + num_integral_samples); + return score_graph; +} + +void assignment_out_torch(torch::Tensor connections, torch::Tensor score_graph, + torch::Tensor topology, torch::Tensor counts, + const float score_threshold) { + const int N = counts.size(0); + const int C = counts.size(1); + const int K = topology.size(0); + const int M = score_graph.size(2); + void *workspace = (void *)malloc(assignment_out_workspace(M)); + + assignment_out_nk( + (int *)connections.data_ptr(), (const float *)score_graph.data_ptr(), + (const int *)topology.data_ptr(), (const int *)counts.data_ptr(), N, C, K, M, + score_threshold, workspace); + + free(workspace); +} + +torch::Tensor assignment_torch(torch::Tensor score_graph, + torch::Tensor topology, torch::Tensor counts, + float score_threshold) { + auto options = torch::TensorOptions() + .dtype(torch::kInt32) + .layout(torch::kStrided) + .device(torch::kCPU) + .requires_grad(false); + + int N = counts.size(0); + int K = topology.size(0); + int M = score_graph.size(2); + + auto connections = torch::full({N, K, 2, M}, -1, options); + assignment_out_torch(connections, score_graph, topology, counts, + score_threshold); + return connections; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("find_peaks", &find_peaks_torch, "find_peaks"); m.def("find_peaks_out", &find_peaks_out_torch, "find_peaks_out"); m.def("paf_score_graph", &paf_score_graph_torch, "paf_score_graph"); - m.def("paf_score_graph_out", &paf_score_graph_out_torch, "paf_score_graph_out"); + m.def("paf_score_graph_out", &paf_score_graph_out_torch, + "paf_score_graph_out"); m.def("refine_peaks", &refine_peaks_torch, "refine_peaks"); m.def("refine_peaks_out", &refine_peaks_out_torch, "refine_peaks_out"); - //m.def("munkres", &munkres, "munkres"); + // m.def("munkres", &munkres, "munkres"); m.def("connect_parts", &connect_parts, "connect_parts"); - //m.def("assignment", &assignment, "assignment"); - //m.def("assignment_out", &assignment_out, "assignment_out"); + m.def("assignment", &assignment_torch, "assignment"); + m.def("assignment_out", &assignment_out_torch, "assignment_out"); m.def("generate_cmap", &generate_cmap, "generate_cmap"); m.def("generate_paf", &generate_paf, "generate_paf"); } From 18c008ff9d2b8ac356912a37e8c52a28d3d79f54 Mon Sep 17 00:00:00 2001 From: John Welsh Date: Sun, 1 Mar 2020 12:29:42 -0800 Subject: [PATCH 12/16] removed ipynb checkpt --- .ipynb_checkpoints/README-checkpoint.md | 72 ------------------------- 1 file changed, 72 deletions(-) delete mode 100644 .ipynb_checkpoints/README-checkpoint.md diff --git a/.ipynb_checkpoints/README-checkpoint.md b/.ipynb_checkpoints/README-checkpoint.md deleted file mode 100644 index 2b172e2..0000000 --- a/.ipynb_checkpoints/README-checkpoint.md +++ /dev/null @@ -1,72 +0,0 @@ -# TensorRT Pose Estimation - -This project features multi-instance pose estimation accelerated by 
NVIDIA TensorRT. It is ideal for applications where low latency is necessary. It includes - -- Training scripts to train on any keypoint task data in MSCOCO format - -- A collection of models that may be easily optimized with TensorRT using [torch2trt](https://github.com/NVIDIA-AI-IOT/torch2trt) - -This project can be used easily for the task of human pose estimation, or extended for something new. - -If you run into any issues please [let us know](../../issues). - -## Tasks - -### Human pose estimation - - - -This task involves detecting human body pose using models trained on the MSCOCO dataset. - -#### Models - -Below are models pre-trained on the MSCOCO dataset. The throughput in FPS is shown for each platform - -| Model | Jetson Nano | Jetson Xavier | Weights | -|-------|-------------|---------------|---------| -| resnet18_baseline_att_224x224_A | 22 | 251 | [download (81MB)](https://drive.google.com/open?id=1XYDdCUdiF2xxx4rznmLb62SdOUZuoNbd) | -| densenet121_baseline_att_256x256_B | 12 | 101 | [download (84MB)](https://drive.google.com/open?id=13FkJkx7evQ1WwP54UmdiDXWyFMY1OxDU) | - - -#### Live demo - -To run the live Jupyter Notebook demo on real-time camera input, follow these steps - -1. Place the downloaded weights in the [tasks/human_pose](tasks/human_pose) directory - -2. Open and follow the [live_demo.ipynb](tasks/human_pose/live_demo.ipynb) notebook - - > You may need to modify the notebook, depending on which model you use - -## Setup - -To install trt_pose, call this command - -> We assume you have already installed PyTorch, torchvision, and TensorRT - -```bash -sudo pip3 install tqdm cython pycocotools -sudo apt-get install python3-matplotlib -git clone https://github.com/NVIDIA-AI-IOT/trt_pose -cd trt_pose -sudo python3 setup.py install -``` - -## See also - -- [torch2trt](http://github.com/NVIDIA-AI-IOT/torch2trt) - An easy to use PyTorch to TensorRT converter - -- [JetBot](http://github.com/NVIDIA-AI-IOT/jetbot) - An educational AI robot based on NVIDIA Jetson Nano -- [JetRacer](http://github.com/NVIDIA-AI-IOT/jetracer) - An educational AI racecar using NVIDIA Jetson Nano -- [JetCam](http://github.com/NVIDIA-AI-IOT/jetcam) - An easy to use Python camera interface for NVIDIA Jetson - -## References - -Cao, Zhe, et al. "Realtime multi-person 2d pose estimation using part affinity fields." Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2017. - -Xiao, Bin, Haiping Wu, and Yichen Wei. "Simple baselines for human pose estimation and tracking." Proceedings of the European Conference on Computer Vision (ECCV). 2018. 
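For reference, a minimal sketch of how the batch-level `assignment_out_nk` entry point introduced in the munkres refactor above might be driven directly from plain C++. The buffer layouts follow the comments in `munkres.hpp` (connections NxKx2xM, score_graph NxKxMxM, topology Kx4, counts NxC); the concrete sizes, topology row, and score values below are illustrative placeholders, not taken from the patches:

```cpp
// Illustrative only: exercises assignment_out_nk on raw CPU buffers.
#include <cstdlib>
#include <vector>
#include "munkres.hpp"

int main() {
  const int N = 1, C = 2, K = 1, M = 4;   // one image, two part types, one link
  const float score_threshold = 0.3f;

  // topology row k holds 4 ints; the assignment stage only reads entries
  // 2 and 3 (the part-type indices a and b).
  std::vector<int> topology = {0, 1, 0, 1};

  std::vector<int> counts = {3, 3};        // detected peaks per part type (NxC)
  std::vector<float> score_graph(N * K * M * M, 0.0f);
  score_graph[0 * M + 1] = 3.0f;           // score for pairing type-0 peak 0 with type-1 peak 1

  std::vector<int> connections(N * K * 2 * M, -1);   // -1 = unmatched, as in the torch binding
  void *workspace = std::malloc(assignment_out_workspace(M));

  assignment_out_nk(connections.data(), score_graph.data(), topology.data(),
                    counts.data(), N, C, K, M, score_threshold, workspace);

  // connections[k][0][i] holds the index of the type-b peak matched to
  // type-a peak i (or -1), mirroring what the old torch version stored.
  std::free(workspace);
  return 0;
}
```
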
From d3e2404534f2aa5c26af47c29d89625aac7fe868 Mon Sep 17 00:00:00 2001 From: John Welsh Date: Sun, 1 Mar 2020 13:26:34 -0800 Subject: [PATCH 13/16] added connect_parts refactor --- trt_pose/plugins/README.md | 5 +- trt_pose/plugins/connect_parts.cpp | 193 +++++++++++++-------------- trt_pose/plugins/connect_parts.hpp | 23 +++- trt_pose/plugins/paf_score_graph.cpp | 4 +- trt_pose/plugins/plugins.cpp | 40 +++++- 5 files changed, 151 insertions(+), 114 deletions(-) diff --git a/trt_pose/plugins/README.md b/trt_pose/plugins/README.md index 840cc68..9fe6690 100644 --- a/trt_pose/plugins/README.md +++ b/trt_pose/plugins/README.md @@ -8,8 +8,9 @@ - [x] paf_score_graph torch binding - [x] munkres plain cpp - [x] munkres torch binding -- [ ] connect parts plain cpp -- [ ] connect parts torch binding +- [x] connect parts plain cpp +- [x] connect parts torch binding +- [ ] test full refactored pipeline ## Terminology diff --git a/trt_pose/plugins/connect_parts.cpp b/trt_pose/plugins/connect_parts.cpp index da2b0f9..4d8bda7 100644 --- a/trt_pose/plugins/connect_parts.cpp +++ b/trt_pose/plugins/connect_parts.cpp @@ -1,112 +1,99 @@ #include "connect_parts.hpp" +#include +std::size_t connect_parts_out_workspace(const int C, const int M) { + return sizeof(int) * C * M; +} + +void connect_parts_out(int *object_counts, // 1 + int *objects, // PxC + const int *connections, // Kx2xM + const int *topology, // Kx4 + const int *counts, // C + const int K, const int C, const int M, const int P, + void *workspace) { + + // initialize objects + for (int i = 0; i < C * M; i++) { + objects[i] = -1; + } + + // initialize visited + std::memset(workspace, 0, connect_parts_out_workspace(C, M)); + int *visited = (int *)workspace; + + int num_objects = 0; + + for (int c = 0; c < C; c++) { + if (num_objects >= P) { + break; + } + + const int count = counts[c]; + + for (int i = 0; i < count; i++) { + if (num_objects >= P) { + break; + } + + std::queue> q; + bool new_object = false; + q.push({c, i}); -void connect_parts_out(torch::Tensor object_counts, torch::Tensor objects, torch::Tensor connections, torch::Tensor topology, torch::Tensor counts, int max_count) -{ - auto options = torch::TensorOptions() - .dtype(torch::kInt32) - .layout(torch::kStrided) - .device(torch::kCPU) - .requires_grad(false); - int N = counts.size(0); - int K = topology.size(0); - int C = counts.size(1); - int M = connections.size(3); - - auto visited = torch::zeros({N, C, M}, options); - auto visited_a = visited.accessor(); - auto counts_a = counts.accessor(); - auto topology_a = topology.accessor(); - auto objects_a = objects.accessor(); - auto object_counts_a = object_counts.accessor(); - auto connections_a = connections.accessor(); - - for (int n = 0; n < N; n++) - { - int num_objects = 0; - for (int c = 0; c < C; c++) - { - if (num_objects >= max_count) { - break; + while (!q.empty()) { + auto node = q.front(); + q.pop(); + int c_n = node.first; + int i_n = node.second; + + if (visited[c_n * M + i_n]) { + continue; + } + + visited[c_n * M + i_n] = 1; + new_object = true; + objects[num_objects * C + c_n] = i_n; + + for (int k = 0; k < K; k++) { + const int *tk = &topology[k * 4]; + const int c_a = tk[2]; + const int c_b = tk[3]; + const int *ck = &connections[k * 2 * M]; + + if (c_a == c_n) { + int i_b = ck[i_n]; + if (i_b >= 0) { + q.push({c_b, i_b}); } - - int count = counts_a[n][c]; - - for (int i = 0; i < count; i++) - { - if (num_objects >= max_count) { - break; - } - - std::queue> q; - bool new_object = false; - q.push({c, i}); - - while 
(!q.empty()) - { - auto node = q.front(); - q.pop(); - int c_n = node.first; - int i_n = node.second; - - if (visited_a[n][c_n][i_n]) { - continue; - } - - visited_a[n][c_n][i_n] = 1; - new_object = true; - objects_a[n][num_objects][c_n] = i_n; - - for (int k = 0; k < K; k++) - { - int c_a = topology_a[k][2]; - int c_b = topology_a[k][3]; - - if (c_a == c_n) - { - int i_b = connections_a[n][k][0][i_n]; - if (i_b >= 0) { - q.push({c_b, i_b}); - } - } - - if (c_b == c_n) - { - int i_a = connections_a[n][k][1][i_n]; - if (i_a >= 0) { - q.push({c_a, i_a}); - } - } - } - } - - if (new_object) - { - num_objects++; - } + } + + if (c_b == c_n) { + int i_a = ck[M + i_n]; + if (i_a >= 0) { + q.push({c_a, i_a}); } + } } - - object_counts_a[n] = num_objects; + } + + if (new_object) { + num_objects++; + } } + } + *object_counts = num_objects; } - -std::vector connect_parts(torch::Tensor connections, torch::Tensor topology, torch::Tensor counts, int max_count) -{ - auto options = torch::TensorOptions() - .dtype(torch::kInt32) - .layout(torch::kStrided) - .device(torch::kCPU) - .requires_grad(false); - - int N = counts.size(0); - int K = topology.size(0); - int C = counts.size(1); - int M = connections.size(3); - - auto objects = torch::full({N, max_count, C}, -1, options); - auto object_counts = torch::zeros({N}, options); - connect_parts_out(object_counts, objects, connections, topology, counts, max_count); - return {object_counts, objects}; -} \ No newline at end of file +void connect_parts_out_batch(int *object_counts, // N + int *objects, // NxPxC + const int *connections, // NxKx2xM + const int *topology, // Kx4 + const int *counts, // NxC + const int N, const int K, const int C, const int M, + const int P, void *workspace) { + for (int n = 0; n < N; n++) { + connect_parts_out(&object_counts[n], &objects[n * P * C], + &connections[n * K * 2 * M], topology, &counts[n * C], K, + C, M, P, workspace); + } +} diff --git a/trt_pose/plugins/connect_parts.hpp b/trt_pose/plugins/connect_parts.hpp index be2bd15..6294a1a 100644 --- a/trt_pose/plugins/connect_parts.hpp +++ b/trt_pose/plugins/connect_parts.hpp @@ -1,7 +1,20 @@ -#include -#include -#include +#pragma once +#include +std::size_t connect_parts_out_workspace(const int C, const int M); -void connect_parts_out(torch::Tensor object_counts, torch::Tensor objects, torch::Tensor connections, torch::Tensor topology, torch::Tensor counts, int max_count); -std::vector connect_parts(torch::Tensor connections, torch::Tensor topology, torch::Tensor counts, int max_count); \ No newline at end of file +void connect_parts_out(int *object_counts, // 1 + int *objects, // PxC + const int *connections, // Kx2xM + const int *topology, // Kx4 + const int *counts, // C + const int K, const int C, const int M, const int P, + void *workspace); + +void connect_parts_out_batch(int *object_counts, // N + int *objects, // NxPxC + const int *connections, // NxKx2xM + const int *topology, // Kx4 + const int *counts, // NxC + const int N, const int K, const int C, const int M, + const int P, void *workspace); diff --git a/trt_pose/plugins/paf_score_graph.cpp b/trt_pose/plugins/paf_score_graph.cpp index 5e1ead4..4037d16 100644 --- a/trt_pose/plugins/paf_score_graph.cpp +++ b/trt_pose/plugins/paf_score_graph.cpp @@ -31,12 +31,11 @@ void paf_score_graph_out_hw(float *score_graph, // MxM float uab_j = pab_j / pab_norm; float integral = 0.; - float progress = 0.; float increment = 1.f / num_integral_samples; for (int t = 0; t < num_integral_samples; t++) { // compute integral point T 
- float progress = (float)t / (float)num_integral_samples; + float progress = (float)t / ((float)num_integral_samples - 1); float pt_i = pa_i + progress * pab_i; float pt_j = pa_j + progress * pab_j; @@ -64,7 +63,6 @@ void paf_score_graph_out_hw(float *score_graph, // MxM // point float dot = pt_paf_i * uab_i + pt_paf_j * uab_j; integral += dot; - progress += increment; } integral /= num_integral_samples; diff --git a/trt_pose/plugins/plugins.cpp b/trt_pose/plugins/plugins.cpp index e9a19bf..3265ed6 100644 --- a/trt_pose/plugins/plugins.cpp +++ b/trt_pose/plugins/plugins.cpp @@ -153,6 +153,43 @@ torch::Tensor assignment_torch(torch::Tensor score_graph, return connections; } +void connect_parts_out_torch(torch::Tensor object_counts, torch::Tensor objects, torch::Tensor connections, torch::Tensor topology, torch::Tensor counts, int max_count) +{ + const int N = object_counts.size(0); + const int K = topology.size(0); + const int C = counts.size(1); + const int M = connections.size(3); + const int P = max_count; + void *workspace = malloc(connect_parts_out_workspace(C, M)); + connect_parts_out_batch( + (int *) object_counts.data_ptr(), + (int *) objects.data_ptr(), + (const int *) connections.data_ptr(), + (const int *) topology.data_ptr(), + (const int *) counts.data_ptr(), + N, K, C, M, P, workspace); + free(workspace); +} + +std::vector connect_parts_torch(torch::Tensor connections, torch::Tensor topology, torch::Tensor counts, int max_count) +{ + auto options = torch::TensorOptions() + .dtype(torch::kInt32) + .layout(torch::kStrided) + .device(torch::kCPU) + .requires_grad(false); + + int N = counts.size(0); + int K = topology.size(0); + int C = counts.size(1); + int M = connections.size(3); + + auto objects = torch::full({N, max_count, C}, -1, options); + auto object_counts = torch::zeros({N}, options); + connect_parts_out_torch(object_counts, objects, connections, topology, counts, max_count); + return {object_counts, objects}; +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("find_peaks", &find_peaks_torch, "find_peaks"); m.def("find_peaks_out", &find_peaks_out_torch, "find_peaks_out"); @@ -162,7 +199,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("refine_peaks", &refine_peaks_torch, "refine_peaks"); m.def("refine_peaks_out", &refine_peaks_out_torch, "refine_peaks_out"); // m.def("munkres", &munkres, "munkres"); - m.def("connect_parts", &connect_parts, "connect_parts"); + m.def("connect_parts", &connect_parts_torch, "connect_parts"); + m.def("connect_parts_out", &connect_parts_out_torch, "connect_parts_out"); m.def("assignment", &assignment_torch, "assignment"); m.def("assignment_out", &assignment_out_torch, "assignment_out"); m.def("generate_cmap", &generate_cmap, "generate_cmap"); From dff0983c12be74642296e1f68c21cd86b7b6031e Mon Sep 17 00:00:00 2001 From: John Welsh Date: Sun, 1 Mar 2020 13:26:56 -0800 Subject: [PATCH 14/16] added cmake --- CMakeLists.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..464fad4 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 3.6) +project(trt_pose) + +add_library(trt_pose SHARED + trt_pose/plugins/find_peaks.cpp + trt_pose/plugins/refine_peaks.cpp + trt_pose/plugins/paf_score_graph.cpp + trt_pose/plugins/munkres.cpp + trt_pose/plugins/connect_parts.cpp +) + +add_executable(trt_pose_test_all + trt_pose/plugins/test_all.cpp +) +target_link_libraries(trt_pose_test_all trt_pose) From 
ad8d806b006c2e664d3c3a5a42f75b8bdd96c10b Mon Sep 17 00:00:00 2001 From: John Date: Mon, 2 Mar 2020 10:08:59 -0800 Subject: [PATCH 15/16] Update README.md --- trt_pose/plugins/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trt_pose/plugins/README.md b/trt_pose/plugins/README.md index 9fe6690..f62f328 100644 --- a/trt_pose/plugins/README.md +++ b/trt_pose/plugins/README.md @@ -10,7 +10,7 @@ - [x] munkres torch binding - [x] connect parts plain cpp - [x] connect parts torch binding -- [ ] test full refactored pipeline +- [x] test full refactored pipeline ## Terminology From 6a1cace63ae6bc40f9b42ed3e44734227794d4d3 Mon Sep 17 00:00:00 2001 From: John Welsh Date: Mon, 2 Mar 2020 10:10:41 -0800 Subject: [PATCH 16/16] incremented patch setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9c1da6e..926825c 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='trt_pose', - version='0.0.0', + version='0.0.1', description='Pose detection accelerated by NVIDIA TensorRT', packages=find_packages(), ext_package='trt_pose',
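
To round out the checked-off "test full refactored pipeline" item, a minimal sketch (sizes, peak counts, and connection values are made up, not taken from the patches) of driving the plain C++ connect_parts stage added in patch 13; `P` is the per-image object cap that the torch binding calls `max_count`:

```cpp
// Illustrative only: groups assignment results into objects via the
// plain C++ connect_parts entry point (buffer layouts per connect_parts.hpp).
#include <cstdlib>
#include <vector>
#include "connect_parts.hpp"

int main() {
  const int N = 1, K = 1, C = 2, M = 4, P = 8;

  std::vector<int> topology = {0, 1, 0, 1};      // one link between part types 0 and 1
  std::vector<int> counts = {1, 1};              // one detected peak per part type
  std::vector<int> connections(N * K * 2 * M, -1);
  connections[0 * M + 0] = 0;                    // type-0 peak 0 -> type-1 peak 0
  connections[1 * M + 0] = 0;                    // and the reverse direction

  std::vector<int> object_counts(N, 0);
  std::vector<int> objects(N * P * C, -1);
  void *workspace = std::malloc(connect_parts_out_workspace(C, M));

  connect_parts_out_batch(object_counts.data(), objects.data(),
                          connections.data(), topology.data(), counts.data(),
                          N, K, C, M, P, workspace);

  // Expected here: object_counts[0] == 1, with objects[0 * C + 0] and
  // objects[0 * C + 1] holding the peak indices that form that object.
  std::free(workspace);
  return 0;
}
```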