diff --git a/.circleci/config.yml b/.circleci/config.yml
index 922c53db..be4da9f3 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -25,6 +25,9 @@ jobs:
           name: Run Pytorch scripts
           command: ./scripts/run_pytorch.sh
           no_output_timeout: 1h
+      - store_test_results:
+          path: test-results
+
 
 workflows:
   version: 2
diff --git a/scripts/install.sh b/scripts/install.sh
index 896f66bf..1e05d87f 100755
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -6,6 +6,8 @@ conda activate base
 
 conda install -y pytorch torchvision torchaudio -c pytorch-nightly
 
+conda install -y pytest
+
 # Dependencies required to load models
 conda install -y regex pillow tqdm boto3 requests numpy\
     h5py scipy matplotlib unidecode ipython pyyaml
diff --git a/scripts/run_pytorch.sh b/scripts/run_pytorch.sh
index 541df731..a4b9132a 100755
--- a/scripts/run_pytorch.sh
+++ b/scripts/run_pytorch.sh
@@ -2,38 +2,20 @@
 . ~/miniconda3/etc/profile.d/conda.sh
 conda activate base
 
-ALL_FILE=$(find *.md ! -name README.md)
-TEMP_PY="temp.py"
-CUDAS="nvidia"
+ALL_FILES=$(find *.md ! -name README.md)
+PYTHON_CODE_DIR="python_code"
 
-declare -i error_code=0
+mkdir $PYTHON_CODE_DIR
 
-for f in $ALL_FILE
+# Quick rundown: for each file we extract the python code that's within
+# the ``` markers and we put that code in a corresponding .py file in $PYTHON_CODE_DIR
+# Then we execute each of these python files with pytest in test_run_python_code.py
+for f in $ALL_FILES
 do
-  echo "Running pytorch example in $f"
-  # FIXME: NVIDIA models checkoints are on cuda
-  if [[ $f = $CUDAS* ]]; then
-    echo "...skipped due to cuda checkpoints."
-  elif [[ $f = "pytorch_fairseq_translation"* ]]; then
-    echo "...temporarily disabled"
-  # FIXME: torch.nn.modules.module.ModuleAttributeError: 'autoShape' object has no attribute 'fuse'
-  elif [[ $f = "ultralytics_yolov5"* ]]; then
-    echo "...temporarily disabled"
-  elif [[ $f = "huggingface_pytorch-transformers"* ]]; then
-    echo "...temporarily disabled"
-  # FIXME: TypeError: compose() got an unexpected keyword argument 'strict'
-  elif [[ $f = "pytorch_fairseq_roberta"* ]]; then
-    echo "...temporarily disabled"
-  # FIXME: rate limiting
-  else
-    sed -n '/^```python/,/^```/ p' < $f | sed '/^```/ d' > $TEMP_PY
-    python $TEMP_PY
-    error_code+=$?
-
-    if [ -f "$TEMP_PY" ]; then
-      rm $TEMP_PY
-    fi
-  fi
+  f_no_ext=${f%.md}  # remove .md extension
+  out_py=$PYTHON_CODE_DIR/$f_no_ext.py
+  echo "Extracting Python code from $f into $out_py"
+  sed -n '/^```python/,/^```/ p' < $f | sed '/^```/ d' > $out_py
 done
 
-exit $error_code
+pytest --junitxml=test-results/junit.xml test_run_python_code.py -vv
diff --git a/test_run_python_code.py b/test_run_python_code.py
new file mode 100644
index 00000000..f44a2856
--- /dev/null
+++ b/test_run_python_code.py
@@ -0,0 +1,41 @@
+from subprocess import check_output, STDOUT, CalledProcessError
+import sys
+import pytest
+import glob
+
+
+PYTHON_CODE_DIR = "python_code"
+ALL_FILES = glob.glob(PYTHON_CODE_DIR + "/*.py")
+
+
+@pytest.mark.parametrize('file_path', ALL_FILES)
+def test_run_file(file_path):
+    if 'nvidia' in file_path:
+        # FIXME: NVIDIA models checkpoints are on cuda
+        pytest.skip("temporarily disabled")
+    if 'pytorch_fairseq_translation' in file_path:
+        pytest.skip("temporarily disabled")
+    if 'ultralytics_yolov5' in file_path:
+        # FIXME torch.nn.modules.module.ModuleAttributeError: 'autoShape' object has no attribute 'fuse'
+        pytest.skip("temporarily disabled")
+    if 'huggingface_pytorch-transformers' in file_path:
+        # FIXME: actual failure not recorded here (the 'autoShape'/'fuse' error is the yolov5 one) -- TODO confirm
+        pytest.skip("temporarily disabled")
+    if 'pytorch_fairseq_roberta' in file_path:
+        pytest.skip("temporarily disabled")
+
+    # We just run the python files in a separate sub-process. We really want a
+    # subprocess here because otherwise we might run into package version
+    # issues: imagine script A that needs torchvision 0.9 and script B that
+    # needs torchvision 0.10. If script A is run prior to script B in the same
+    # process, script B will still be run with torchvision 0.9 because the only
+    # "import torchvision" statement that counts is the first one, and even
+    # torchhub sys.path shenanigans can do nothing about this. By creating
+    # subprocesses we're sure that all file executions are fully independent.
+    try:
+        # This is inspired by (and heavily simplified from)
+        # https://github.com/cloudpipe/cloudpickle/blob/343da119685f622da2d1658ef7b3e2516a01817f/tests/testutils.py#L177
+        out = check_output([sys.executable, file_path], stderr=STDOUT)
+        print(out.decode())
+    except CalledProcessError as e:
+        raise RuntimeError(f"Script {file_path} errored with output:\n{e.output.decode()}")