diff --git a/README.md b/README.md
index 65487d1..7c166b4 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Rcpp bindings for [hnswlib](https://github.com/nmslib/hnswlib).
 
 ## Status
 
-*February 4 2023* RcppHNSW 0.6.0 is released to CRAN, supporting
+*February 4 2024* RcppHNSW 0.6.0 is released to CRAN, supporting
 [hnswlib version 0.8.0](https://github.com/nmslib/hnswlib/releases/tag/v0.8.0).
 
 *September 19 2023* RcppHNSW 0.5.0 is released to CRAN, supporting
@@ -123,6 +123,11 @@ iris_nn <- hnsw_search(irism[101:150, ], ann, k = 5)
 
 ## Class Example
 
+As noted in the "Do not use named parameters" section below, you should avoid
+using named parameters when using class methods. But I do use them in a few
+places below to document the name of the parameters the positional arguments
+refer to.
+
 ```R
 library(RcppHNSW)
 data <- as.matrix(iris[, -5])
@@ -153,7 +158,7 @@ res <- ann$getNNsList(data[1, ], k = 4, include_distances = TRUE)
 ann2 <- new(HnswL2, dim, nitems, M, ef)
 ann2$addItems(data)
 # Retrieve the 4 nearest neighbors for every item in data
-res2 <- ann2$getAllNNsList(data, k = 4, include_distances = TRUE)
+res2 <- ann2$getAllNNsList(data, 4, TRUE)
 # labels of the data are in res$item, distances in res$distance
 
 # If you are able to store your data column-wise, then the overhead of copying
@@ -162,10 +167,18 @@ data_by_col <- t(data)
 ann3 <- new(HnswL2, dim, nitems, M, ef)
 ann3$addItemsCol(data_by_col)
 # Retrieve the 4 nearest neighbors for every item in data_by_col
-res3 <- ann3$getAllNNsListCol(data_by_col, k = 4, include_distances = TRUE)
+res3 <- ann3$getAllNNsListCol(data_by_col, 4, TRUE)
 # The returned neared neighbor data matrices are also returned column-wise
 all(res2$item == t(res3$item) & res2$distance == t(res3$distance))
 
+# Save the index
+ann$save("iris.hnsw")
+
+# load it back in: you do need to know the dimension of the original data
+ann4 <- new(HnswL2, dim, "iris.hnsw")
+# new index should behave like the original
+all(ann$getNNs(data[1, ], 4) == ann4$getNNs(data[1, ], 4))
+
 # other distance classes:
 # Cosine: HnswCosine
 # Inner Product: HnswIP
diff --git a/inst/include/RcppPerpendicular/RcppPerpendicular.h b/inst/include/RcppPerpendicular/RcppPerpendicular.h
deleted file mode 100644
index ef7515f..0000000
--- a/inst/include/RcppPerpendicular/RcppPerpendicular.h
+++ /dev/null
@@ -1,145 +0,0 @@
-// Taken from RcppParallel.h and then modified slightly to rename header guards
-// and namespaces to avoid any potential clashes. RcppParallel is licensed under
-// GPLv2 or later:
-
-// RcppPerpendicular.h a version of parallel for based on RcppParallel
-// Copyright (C) 2020 James Melville
-//
-// This program is free software; you can redistribute it and/or
-// modify it under the terms of the GNU General Public License
-// as published by the Free Software Foundation; either version 2
-// of the License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301,
-// USA.
-
-#ifndef RCPP_PERPENDICULAR
-#define RCPP_PERPENDICULAR
-
-#include <thread>
-#include <utility>
-#include <vector>
-
-namespace RcppPerpendicular {
-
-using IndexRange = std::pair<std::size_t, std::size_t>;
-
-template <typename Worker>
-auto worker_thread(Worker &worker, const IndexRange &range) -> void {
-  try {
-    worker(range.first, range.second);
-  } catch (...) {
-  }
-}
-
-template <typename Worker>
-auto worker_thread_id(Worker &worker, const IndexRange &range,
-                      std::size_t thread_id) -> void {
-  try {
-    worker(range.first, range.second, thread_id);
-  } catch (...) {
-  }
-}
-
-// Function to calculate the ranges for a given input
-inline auto split_input_range(const IndexRange &range, std::size_t n_threads,
-                              std::size_t grain_size)
-    -> std::vector<IndexRange> {
-
-  // determine max number of threads
-  if (n_threads == 0) {
-    n_threads = std::thread::hardware_concurrency();
-  }
-
-  // compute grain_size (including enforcing requested minimum)
-  std::size_t length = range.second - range.first;
-  if (n_threads == 1) {
-    grain_size = length;
-  } else if ((length % n_threads) == 0) { // perfect division
-    grain_size = (std::max)(length / n_threads, grain_size);
-  } else { // imperfect division, divide by threads - 1
-    grain_size = (std::max)(length / (n_threads - 1), grain_size);
-  }
-
-  // allocate ranges
-  std::vector<IndexRange> ranges;
-  std::size_t begin = range.first;
-  while (begin < range.second) {
-    std::size_t end = (std::min)(begin + grain_size, range.second);
-    ranges.emplace_back(std::make_pair(begin, end));
-    begin = end;
-  }
-
-  return ranges;
-}
-
-// Execute the Worker over the IndexRange in parallel
-template <typename Worker>
-inline void parallel_for(std::size_t begin, std::size_t end, Worker &worker,
-                         std::size_t n_threads, std::size_t grain_size = 1) {
-  if (n_threads > 0) {
-    // split the work
-    IndexRange input_range(begin, end);
-    std::vector<IndexRange> ranges =
-        split_input_range(input_range, n_threads, grain_size);
-
-    std::vector<std::thread> threads;
-    threads.reserve(ranges.size());
-    for (auto &range : ranges) {
-      threads.push_back(
-          std::thread(&worker_thread<Worker>, std::ref(worker), range));
-    }
-
-    for (auto &thread : threads) {
-      thread.join();
-    }
-  } else {
-    worker(begin, end);
-  }
-}
-
-template <typename Worker>
-inline void parallel_for(std::size_t end, Worker &worker, std::size_t n_threads,
-                         std::size_t grain_size = 1) {
-  parallel_for(0, end, worker, n_threads, grain_size);
-}
-
-template <typename Worker>
-inline void pfor(std::size_t begin, std::size_t end, Worker &worker,
-                 std::size_t n_threads, std::size_t grain_size = 1) {
-  if (n_threads > 0) {
-    IndexRange input_range(begin, end);
-    std::vector<IndexRange> ranges =
-        split_input_range(input_range, n_threads, grain_size);
-
-    std::vector<std::thread> threads;
-    for (std::size_t thread_id = 0; thread_id < ranges.size(); ++thread_id) {
-      auto &range = ranges[thread_id];
-      threads.push_back(std::thread(&worker_thread_id<Worker>, std::ref(worker),
-                                    range, thread_id));
-    }
-
-    for (auto &thread : threads) {
-      thread.join();
-    }
-  } else {
-    worker(begin, end, 0);
-  }
-}
-
-template <typename Worker>
-inline void pfor(std::size_t end, Worker &worker, std::size_t n_threads,
-                 std::size_t grain_size = 1) {
-  pfor(0, end, worker, n_threads, grain_size);
-}
-
-} // namespace RcppPerpendicular
-
-#endif // RCPP_PERPENDICULAR
diff --git a/inst/include/pforr/pforr.h b/inst/include/pforr/pforr.h
new file mode 100644
index 0000000..7f59568
--- /dev/null
+++ b/inst/include/pforr/pforr.h
@@ -0,0 +1,97 @@
+// Taken from RcppParallel.h and then modified slightly to rename header guards
+// and namespaces to avoid any potential clashes. RcppParallel is licensed under
+// GPLv2 or later:
+
+// pfor.h a version of parallel for based on RcppParallel
+// Copyright (C) 2020 James Melville
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301,
+// USA.
+
+#ifndef PFORR
+#define PFORR
+
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace pforr {
+
+using IndexRange = std::pair<std::size_t, std::size_t>;
+
+template <typename Worker>
+auto worker_thread(Worker &worker, const IndexRange &range) -> void {
+  try {
+    worker(range.first, range.second);
+  } catch (...) {
+  }
+}
+
+// Function to calculate the ranges for a given input
+inline auto split_input_range(const IndexRange &range, std::size_t n_threads,
+                              std::size_t grain_size)
+  -> std::vector<IndexRange> {
+
+    // compute grain_size (including enforcing requested minimum)
+    std::size_t length = range.second - range.first;
+    if (n_threads == 1)
+      grain_size = length;
+    else if ((length % n_threads) == 0) // perfect division
+      grain_size = (std::max)(length / n_threads, grain_size);
+    else // imperfect division, divide by threads - 1
+      grain_size = (std::max)(length / (n_threads - 1), grain_size);
+
+    // allocate ranges
+    std::vector<IndexRange> ranges;
+    std::size_t begin = range.first;
+    while (begin < range.second) {
+      std::size_t end = (std::min)(begin + grain_size, range.second);
+      ranges.emplace_back(begin, end);
+      begin = end;
+    }
+
+    return ranges;
+  }
+
+// Execute the Worker over the IndexRange in parallel
+template <typename Worker>
+inline void parallel_for(std::size_t begin, std::size_t end, Worker &worker,
+                         std::size_t n_threads, std::size_t grain_size = 1) {
+  if (n_threads == 0) {
+    worker(begin, end);
+    return;
+  }
+  // split the work
+  IndexRange input_range(begin, end);
+  std::vector<IndexRange> ranges =
+    split_input_range(input_range, n_threads, grain_size);
+
+  std::vector<std::thread> threads;
+  threads.reserve(ranges.size());
+  for (auto &range : ranges) {
+    threads.push_back(
+      std::thread(&worker_thread<Worker>, std::ref(worker), range));
+  }
+
+  for (auto &thread : threads) {
+    thread.join();
+  }
+
+  return;
+}
+
+} // namespace pforr
+
+#endif // PFORR
\ No newline at end of file
diff --git a/src/hnsw.cpp b/src/hnsw.cpp
index 019beb3..4013020 100644
--- a/src/hnsw.cpp
+++ b/src/hnsw.cpp
@@ -28,7 +28,7 @@
 
 #include "rcpphnsw.h"
 
-#include "RcppPerpendicular/RcppPerpendicular.h"
+#include "pforr/pforr.h"
 
 template <typename dist_t, bool DoNormalize = false> struct Normalizer {
   static void normalize(std::vector<dist_t> &vec) {}
@@ -130,7 +130,7 @@ class Hnsw {
         addItemImpl(item_copy, index_start + i);
       }
     };
-    RcppPerpendicular::parallel_for(nitems, worker, numThreads);
+    pforr::parallel_for(0, nitems, worker, numThreads);
     cur_l = size();
   }
 
@@ -158,7 +158,7 @@ class Hnsw {
       }
     };
 
-    RcppPerpendicular::parallel_for(nitems, worker, numThreads);
+    pforr::parallel_for(0, nitems, worker, numThreads);
     cur_l = size();
   }
 
@@ -295,7 +295,7 @@ class Hnsw {
       }
     };
 
-    RcppPerpendicular::parallel_for(nitems, worker, numThreads);
+    pforr::parallel_for(0, nitems, worker, numThreads);
 
     return found_all;
   }
@@ -428,7 +428,7 @@ class Hnsw {
       }
     };
 
-    RcppPerpendicular::parallel_for(nitems, worker, numThreads);
+    pforr::parallel_for(0, nitems, worker, numThreads);
 
     return found_all;
   }
@@ -446,7 +446,7 @@ class Hnsw {
       }
     };
 
-    RcppPerpendicular::parallel_for(nitems, worker, numThreads);
+    pforr::parallel_for(0, nitems, worker, numThreads);
 
     return data;
   }
diff --git a/tests/testthat/test_save_load.R b/tests/testthat/test_save_load.R
new file mode 100644
index 0000000..64c14bc
--- /dev/null
+++ b/tests/testthat/test_save_load.R
@@ -0,0 +1,47 @@
+library(RcppHNSW)
+context("Save/load index")
+
+num_elements <- nrow(uirism)
+dim <- ncol(uirism)
+
+M <- 16
+ef_construction <- 10
+p <- new(HnswL2, dim, num_elements, M, ef_construction)
+
+for (i in 1:num_elements) {
+  p$addItem(uirism[i, ])
+}
+
+nn4idx <- matrix(0L, nrow = num_elements, ncol = 4)
+nn4dist <- matrix(0.0, nrow = num_elements, ncol = 4)
+
+for (i in 1:num_elements) {
+  res <- p$getNNsList(uirism[i, ], k = 4, TRUE)
+  nn4idx[i, ] <- res$item
+  nn4dist[i, ] <- res$distance
+}
+
+temp_file <- tempfile()
+on.exit(unlink(temp_file), add = TRUE)
+p$save(temp_file)
+
+nn4idx_aftersave <- matrix(0L, nrow = num_elements, ncol = 4)
+nn4dist_aftersave <- matrix(0.0, nrow = num_elements, ncol = 4)
+for (i in 1:num_elements) {
+  res_aftersave <- p$getNNsList(uirism[i, ], k = 4, TRUE)
+  nn4idx_aftersave[i, ] <- res_aftersave$item
+  nn4dist_aftersave[i, ] <- res_aftersave$distance
+}
+expect_equal(nn4idx, nn4idx_aftersave)
+expect_equal(nn4dist, nn4dist_aftersave)
+
+pload <- new(HnswL2, dim, temp_file)
+nn4idx_afterload <- matrix(0L, nrow = num_elements, ncol = 4)
+nn4dist_afterload <- matrix(0.0, nrow = num_elements, ncol = 4)
+for (i in 1:num_elements) {
+  res_afterload <- pload$getNNsList(uirism[i, ], k = 4, TRUE)
+  nn4idx_afterload[i, ] <- res_afterload$item
+  nn4dist_afterload[i, ] <- res_afterload$distance
+}
+expect_equal(nn4idx, nn4idx_afterload)
+expect_equal(nn4dist, nn4dist_afterload)